+
+
SQL Reference
+
SQL Reference explains statements, expressions, and data manipulation concepts for database users.
+
Statements
+
The reference lists supported statement syntax.
+
+
+
+""",
+ encoding="utf-8",
+ )
+ archived.write_text(source.read_text(encoding="utf-8"), encoding="utf-8")
+
+ markdown = parse_html_markdown(source, archived)
+
+ assert "Small. Fast. Reliable" not in markdown
+ assert "IE hack" not in markdown
+ assert "Previous topic" not in markdown
+ assert "### SQL Reference" in markdown
+ assert "#### Statements" in markdown
+ assert "## Summary\nSQL Reference explains statements" in markdown
+
+
+def test_parse_document_accepts_html_branch(tmp_path: Path) -> None:  # NOTE(review): the HTML literal written to `source` below appears to have lost its markup (the string spans a raw newline) - restore it from the original patch
+ source = tmp_path / "page.html"
+ archived = tmp_path / "2026-05-15_page.html"
+ source.write_text("Hello.
", encoding="utf-8")
+ archived.write_text(source.read_text(encoding="utf-8"), encoding="utf-8")  # archived copy mirrors the source file's text
+
+ document = parse_document(source, archived, _config())
+
+ assert document.title == "Web Page - HTML Page"  # presumably derived from the page's <title>/<h1>; confirm against the restored literal
+ assert document.metadata["extension"] == ".html"
+ assert "### HTML Page" in document.markdown_content
diff --git a/tests/test_image_parser.py b/tests/test_image_parser.py
new file mode 100644
index 0000000..c013499
--- /dev/null
+++ b/tests/test_image_parser.py
@@ -0,0 +1,95 @@
+from pathlib import Path
+
+from heta.config.schema import HetaConfig, InsertPlanningConfig, LLMConfig, MinerUConfig, VectorIndexConfig
+from heta.kb.image_parser import build_image_markdown
+from heta.kb.parser import parse_document
+from heta.kb.text import extract_title
+
+
+def _config() -> HetaConfig:
+    """Return the baseline qwen test config built from disabled MinerU, a non-enabled vector index and enabled insert planning."""
+    llm = LLMConfig(provider="qwen", api_key="sk-test")
+    mineru = MinerUConfig.disabled()
+    vector_index = VectorIndexConfig(enable=False)
+    planning = InsertPlanningConfig.enabled()
+    parts = dict(llm=llm, mineru=mineru, vector_index=vector_index, insert_planning=planning)
+    return HetaConfig(version=1, **parts)
+
+
+def _custom_without_multimodal_config() -> HetaConfig:
+    """Return a custom-provider config that passes only chat and embedding settings to LLMConfig."""
+    llm = LLMConfig(
+        provider="custom",
+        api_key="sk-test",
+        chat_api_key="sk-chat",
+        chat_model="chat-model",
+        chat_base_url="http://chat.local/v1",
+        embedding_api_key="sk-embedding",
+        embedding_model="embedding-model",
+        embedding_base_url="http://embedding.local/v1",
+    )
+    mineru, vectors = MinerUConfig.disabled(), VectorIndexConfig(enable=False)
+    return HetaConfig(
+        version=1, llm=llm, mineru=mineru, vector_index=vectors,
+        insert_planning=InsertPlanningConfig.enabled(),
+    )
+
+
+def test_build_image_markdown_uses_compact_retrieval_sections() -> None:
+    """The built image page keeps its title and contains every expected section heading."""
+    rendered = build_image_markdown(
+        title="Image - Architecture Diagram",
+        source_name="diagram.png",
+        image_path="../../raw/diagram.png",
+        summary="A system architecture diagram.",
+        visual_facts="Scene/type: diagram. Main subject: service pipeline.",
+        visible_text="API Gateway",
+        interpretation_keywords="Represents a backend data flow. keywords: API, pipeline.",
+    )
+
+    assert extract_title(rendered, "fallback") == "Image - Architecture Diagram"
+    assert "" in rendered  # NOTE(review): vacuous - "" is a substring of every string; the expected literal looks lost, restore it
+    level3 = ("### Visual Facts", "### Visible Text", "### Interpretation and Keywords")
+    level2 = ("## Related Pages", "## Source")
+    for heading in level3 + level2:
+        assert heading in rendered
+
+
+def test_parse_document_accepts_image_branch(monkeypatch, tmp_path: Path) -> None:
+    """With the image parser patched, parse_document on a .png yields the stub's title, source name and sections."""
+    source, archived = tmp_path / "diagram.png", tmp_path / "raw_diagram.png"
+    for path in (source, archived):
+        path.write_bytes(b"png")
+
+    def fake_parse_image_markdown(source_path, archived_path, config):
+        # Stand-in for heta.kb.parser.parse_image_markdown: returns fixed markdown built from the archived name.
+        return build_image_markdown(
+            title="Image - Diagram",
+            source_name=archived_path.name,
+            image_path="../../raw/raw_diagram.png",
+            summary="A diagram.",
+            visual_facts="A simple diagram.",
+            visible_text="None detected.",
+            interpretation_keywords="diagram, test",
+        )
+
+    monkeypatch.setattr("heta.kb.parser.parse_image_markdown", fake_parse_image_markdown)
+    document = parse_document(source, archived, _config())
+
+    assert document.title == "Image - Diagram"
+    assert document.source_name == "raw_diagram.png"
+    assert document.metadata["extension"] == ".png"
+    assert "### Visual Facts" in document.markdown_content
+
+
+def test_image_requires_multimodal_when_custom_skips_it(tmp_path: Path) -> None:
+    """parse_document on a .png must raise ValueError naming the multimodal requirement and `heta init`."""
+    import pytest  # local import: this module has no top-level pytest import
+
+    source = tmp_path / "diagram.png"
+    source.write_bytes(b"png")
+
+    with pytest.raises(ValueError, match="requires a multimodal model") as excinfo:
+        parse_document(source, source, _custom_without_multimodal_config())
+
+    assert "heta init" in str(excinfo.value)
diff --git a/tests/test_kb_insert.py b/tests/test_kb_insert.py
index bd9a8a0..7231bc6 100644
--- a/tests/test_kb_insert.py
+++ b/tests/test_kb_insert.py
@@ -2,25 +2,37 @@
import pytest
-from heta.config.schema import HetaConfig, LLMConfig, MinerUConfig, VectorIndexConfig
+from heta.config.schema import (
+ DynamicInsertConfig,
+ InsertPlanningConfig,
+ HetaConfig,
+ LLMConfig,
+ MinerUConfig,
+ VectorIndexConfig,
+)
from heta.kb.discovery import collect_insert_files
-from heta.kb.models import FileChange
-from heta.kb.insert import insert_paths
+from heta.kb.models import FileChange, ParsedDocument
+from heta.kb.insert import _ensure_code_raw_links, insert_paths
from heta.kb.text import frontmatter_page, slugify, summarize
-from heta.kb.wiki import normalize_wiki_pages
+from heta.kb.wiki import normalize_wiki_pages, repair_broken_wiki_links
-def _config(mineru: MinerUConfig | None = None) -> HetaConfig:
+def _config(mineru: MinerUConfig | None = None, *, dynamic_insert: bool = True) -> HetaConfig:  # dynamic_insert is keyword-only and defaults to True
return HetaConfig(
version=1,
llm=LLMConfig(provider="qwen", api_key="sk-test"),
mineru=mineru or MinerUConfig.disabled(),
vector_index=VectorIndexConfig(enable=False),
+ insert_planning=InsertPlanningConfig.enabled(),
+ dynamic_insert=DynamicInsertConfig(enable=dynamic_insert),  # the flag is passed straight through as DynamicInsertConfig.enable
)
-def _fake_agent(monkeypatch) -> None:
+def _fake_agent(monkeypatch, calls: list[list[str]] | None = None) -> None:
def run_merge_agent(*, task_id, documents, root_dir, config):
+ assert len(documents) == 1
+ if calls is not None:
+ calls.append([document.source_name for document in documents])
pages = root_dir / "pages"
pages.mkdir(parents=True, exist_ok=True)
added = []
@@ -96,6 +108,156 @@ def test_insert_same_title_updates_existing_page(monkeypatch, tmp_path: Path) ->
assert "## Imported Update" in page.read_text(encoding="utf-8")
+def test_insert_multiple_files_runs_agent_sequentially(monkeypatch, tmp_path: Path) -> None:
+    """Two inserted files reach the fake agent in order, produce numbered pages, and emit progress events."""
+    agent_calls: list[list[str]] = []
+    events = []
+    _fake_agent(monkeypatch, agent_calls)
+
+    alpha, beta = tmp_path / "alpha.md", tmp_path / "beta.md"
+    alpha.write_text("# Alpha\n\nFirst details.", encoding="utf-8")
+    beta.write_text("# Beta\n\nSecond details.", encoding="utf-8")
+
+    result = insert_paths([alpha, beta], _config(), base_dir=tmp_path, on_progress=events.append)
+
+    pages = tmp_path / "workspace" / "kb" / "wiki" / "pages"
+    assert agent_calls[0][0].endswith("_alpha.md")
+    assert agent_calls[1][0].endswith("_beta.md")
+    assert (pages / "1-alpha.md").exists()
+    assert (pages / "2-beta.md").exists()
+    added_paths = [change.path for change in result.added]
+    assert added_paths == ["pages/1-alpha.md", "pages/2-beta.md"]
+
+    # Progress is expected to open at 1%, report 50% and 99% during merge, and close at 100% in phase "done".
+    opening, closing = events[0], events[-1]
+    assert opening.percent == 1
+    merge_percents = [event.percent for event in events if event.phase == "merge"]
+    assert 50 in merge_percents
+    assert 99 in merge_percents
+    assert closing.percent == 100
+    assert closing.phase == "done"
+
+
+def test_insert_defaults_to_static_pages(monkeypatch, tmp_path: Path) -> None:
+    """With dynamic_insert=False the page, index entry and log line are written and the merge agent is never called."""
+    def fail_agent(**kwargs):
+        raise AssertionError("dynamic agent should not run in static insert mode")
+
+    def fake_summary(*, document, config):
+        return f"Summary for {document.title}."
+
+    monkeypatch.setattr("heta.kb.insert.run_merge_agent", fail_agent)
+    monkeypatch.setattr("heta.kb.static_insert.generate_summary", fake_summary)
+    source = tmp_path / "manual.md"
+    source.write_text("# Main Heading\n\n## Sub Heading\n\nBody text.", encoding="utf-8")
+
+    result = insert_paths([source], _config(dynamic_insert=False), base_dir=tmp_path)
+
+    wiki = tmp_path / "workspace" / "kb" / "wiki"
+    body = (wiki / "pages" / "1-main-heading.md").read_text(encoding="utf-8")
+    assert result.added[0].path == "pages/1-main-heading.md"
+    assert "Summary for Main Heading." in body
+    assert "### Main Heading" in body
+    assert "#### Sub Heading" in body
+    assert "## Related Pages\n\n- None yet" in body
+    assert f"- {result.raw_files[0].name}" in body
+    assert "[[Main Heading]]" in (wiki / "index.md").read_text(encoding="utf-8")
+    assert "Created static page: Main Heading" in (wiki / "log.md").read_text(encoding="utf-8")
+
+
+def test_insert_reports_vector_sync_error_without_rolling_back(monkeypatch, tmp_path: Path) -> None:
+    """A failing vector sync is reported on the result while the commit id and the written page remain."""
+    def fake_summary(*, document, config):
+        return f"Summary for {document.title}."
+
+    def failing_sync(**kwargs):
+        raise RuntimeError("embedding unavailable")
+
+    monkeypatch.setattr("heta.kb.static_insert.generate_summary", fake_summary)
+    monkeypatch.setattr("heta.kb.insert.sync_wiki_vector_index", failing_sync)
+    source = tmp_path / "manual.md"
+    source.write_text("# Main Heading\n\nBody text.", encoding="utf-8")
+    config = HetaConfig(
+        version=1,
+        llm=LLMConfig(provider="qwen", api_key="sk-test"),
+        mineru=MinerUConfig.disabled(),
+        vector_index=VectorIndexConfig(enable=True),
+        insert_planning=InsertPlanningConfig.enabled(),
+        dynamic_insert=DynamicInsertConfig.disabled(),
+    )
+
+    result = insert_paths([source], config, base_dir=tmp_path)
+
+    assert result.commit_id
+    assert (tmp_path / "workspace" / "kb" / "wiki" / "pages" / "1-main-heading.md").exists()
+    assert result.vector_index_error == "embedding unavailable"
+
+
+def test_insert_continues_when_agent_makes_no_wiki_changes(monkeypatch, tmp_path: Path) -> None:
+    """A document the agent leaves untouched is listed as skipped, and the files after it are still inserted."""
+    seen: list[str] = []
+
+    def run_merge_agent(*, task_id, documents, root_dir, config):
+        doc = documents[0]
+        seen.append(doc.source_name)
+        changes = {"added": [], "updated": [], "deleted": []}
+        if "beta" in doc.source_name:
+            return changes
+
+        pages_dir = root_dir / "pages"
+        pages_dir.mkdir(parents=True, exist_ok=True)
+        target = pages_dir / f"{slugify(doc.title)}.md"
+        rendered = frontmatter_page(
+            doc.title, doc.source_name, summarize(doc.markdown_content), doc.markdown_content
+        )
+        target.write_text(rendered, encoding="utf-8")
+        changes["added"].append(FileChange("added", doc.title, f"pages/{target.name}"))
+        return changes
+
+    monkeypatch.setattr("heta.kb.insert.run_merge_agent", run_merge_agent)
+    contents = {
+        "alpha.md": "# Alpha\n\nFirst details.",
+        "beta.md": "# Beta\n\nSecond details.",
+        "gamma.md": "# Gamma\n\nThird details.",
+    }
+    sources = [tmp_path / name for name in contents]
+    for path in sources:
+        path.write_text(contents[path.name], encoding="utf-8")
+
+    result = insert_paths(sources, _config(), base_dir=tmp_path)
+
+    wiki = tmp_path / "workspace" / "kb" / "wiki"
+    assert len(seen) == 3
+    assert [change.path for change in result.added] == ["pages/1-alpha.md", "pages/2-gamma.md"]
+    assert result.skipped_documents == [seen[1]]
+    assert (wiki / "pages" / "1-alpha.md").exists()
+    assert not (wiki / "pages" / "2-beta.md").exists()
+    assert (wiki / "pages" / "2-gamma.md").exists()
+    assert "Skipped no-op merge" not in (wiki / "log.md").read_text(encoding="utf-8")
+
+
+def test_ensure_code_raw_links_restores_agent_dropped_raw_link(tmp_path: Path) -> None:
+    """After _ensure_code_raw_links runs, the code page contains a raw-source link to the archived file."""
+    title, archived_name = "Code - demo.py", "2026-05-15_demo.py"
+    wiki = tmp_path / "wiki"
+    page = wiki / "pages" / "1-code-demo.md"
+    page.parent.mkdir(parents=True)
+    seeded = frontmatter_page(title, archived_name, "Summary.", "### File Overview\n- language: python")
+    page.write_text(seeded, encoding="utf-8")
+    document = ParsedDocument(
+        source_path=tmp_path / "demo.py",
+        archived_path=tmp_path / "raw" / archived_name,
+        title=title,
+        markdown_content="",
+        source_name=archived_name,
+        metadata={"extension": ".py"},
+    )
+    change = FileChange("added", title, "pages/1-code-demo.md")
+    _ensure_code_raw_links(wiki, document, [change])
+
+    assert "[Raw source](<../../raw/2026-05-15_demo.py>)" in page.read_text(encoding="utf-8")
+
+
def test_pdf_requires_mineru_when_disabled(tmp_path: Path) -> None:
source = tmp_path / "paper.pdf"
source.write_bytes(b"%PDF")
@@ -104,6 +266,60 @@ def test_pdf_requires_mineru_when_disabled(tmp_path: Path) -> None:
collect_insert_files([source], _config())
+def test_office_requires_mineru_when_disabled(tmp_path: Path) -> None:
+    """Collecting a .pptx under the default config raises ValueError matching "requires MinerU"."""
+    deck = tmp_path / "deck.pptx"
+    deck.write_bytes(b"pptx")
+    with pytest.raises(ValueError, match="requires MinerU"):
+        collect_insert_files([deck], _config())
+
+
+def test_collect_insert_files_accepts_office_when_mineru_enabled(tmp_path: Path) -> None:
+    """With MinerU enabled, scanning the directory returns all six office files."""
+    names = ("notes.doc", "notes.docx", "deck.ppt",
+             "deck.pptx", "sheet.xls", "sheet.xlsx")
+    expected = [tmp_path / name for name in names]
+    for path in expected:
+        path.write_bytes(b"office")
+
+    mineru = MinerUConfig(enable=True, provider="cloud", api_key="mineru-token", endpoint=None)
+    collected = collect_insert_files([tmp_path], _config(mineru))
+
+    # The result is compared against the created paths in sorted order.
+    assert collected == sorted(expected)
+
+
+def test_collect_insert_files_accepts_common_images(tmp_path: Path) -> None:
+    """A .png path passed directly is returned as the only collected file."""
+    png = tmp_path / "diagram.png"
+    png.write_bytes(b"png")
+
+    collected = collect_insert_files([png], _config())
+    assert collected == [png]
+
+
+def test_collect_insert_files_accepts_audio_and_video(tmp_path: Path) -> None:
+    """An .mp3 and an .mp4 passed directly come back in the order given."""
+    media = [tmp_path / "meeting.mp3", tmp_path / "demo.mp4"]
+    for path, payload in zip(media, (b"mp3", b"mp4")):
+        path.write_bytes(payload)
+
+    collected = collect_insert_files(list(media), _config())
+
+    assert collected == media
+
+
+def test_collect_insert_files_accepts_code_and_html(tmp_path: Path) -> None:
+ code = tmp_path / "module.py"
+ html = tmp_path / "index.html"
+ code.write_text("def run():\n pass\n", encoding="utf-8")
+ html.write_text("