From 9ca145f472fd0e0e87aa07f692c1aaa2fe18d013 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 08:04:00 +0800 Subject: [PATCH 01/26] feat: concept dedup, compile pipeline refactor, and bidirectional backlinks - Add concept dedup with briefs and _read_concept_briefs context - Add concepts plan and update prompt templates with create/update/related paths - Extract shared _compile_concepts from compile_short_doc and compile_long_doc - Add bidirectional backlinks between summaries and concepts - Code review fixes: security, robustness, tests, and CI hardening Co-authored-by: Ray --- docs/.DS_Store | Bin 0 -> 6148 bytes .../2026-04-09-concept-dedup-and-update.md | 888 +++++++++++++ .../plans/2026-04-09-retrieve-redesign.md | 1104 +++++++++++++++++ ...6-04-09-concept-dedup-and-update-design.md | 163 +++ .../specs/2026-04-09-retrieve-redesign.md | 262 ++++ openkb/__main__.py | 4 + openkb/agent/compiler.py | 886 +++++++++---- openkb/agent/linter.py | 4 +- tests/test_compiler.py | 887 +++++++++++-- 9 files changed, 3876 insertions(+), 322 deletions(-) create mode 100644 docs/.DS_Store create mode 100644 docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md create mode 100644 docs/superpowers/plans/2026-04-09-retrieve-redesign.md create mode 100644 docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md create mode 100644 docs/superpowers/specs/2026-04-09-retrieve-redesign.md create mode 100644 openkb/__main__.py diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..56fffc4488504f8e54d58ac8d5d950d7d4bc43d7 GIT binary patch literal 6148 zcmeHKF=_)r43rWV1~)EK?iccd#W*j>2aIEPh6_1N+N<)eJS{VlAebYN!fwn6w6m+# z?6OmwPG;tt^WoWSZDvb2(Y`p$jr;VOJygVjaGddB?>5+{9Wnc#k-M_d&R~-D%O4Np z+i5=xC#J}xfE17dQa}nwfh#Fcg>|{Q(s?>Y3P^$bQGnlv1}FByDKS1B7@`FLE)WjG zJbDRWV*uC-r$j_xo>X8`y;=-UI^wPJdf}9qbn|LHyl(dDP&{tO`7P4Td!j}uAO$WJ zc+BOR_5TI_NB@6G(n<$EObXmufgdgD7FqxR literal 0 HcmV?d00001 diff --git 
a/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md new file mode 100644 index 0000000..1a312a6 --- /dev/null +++ b/docs/superpowers/plans/2026-04-09-concept-dedup-and-update.md @@ -0,0 +1,888 @@ +# Concept Dedup & Existing Page Update — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Give the compiler enough context about existing concepts to make smart dedup/update decisions, and add the ability to rewrite existing concept pages with new information — all without breaking prompt caching. + +**Architecture:** Extend the deterministic pipeline in `compiler.py` with: (1) concept briefs read from disk before the concepts-plan LLM call, (2) a new JSON output format with create/update/related actions, (3) a new concurrent "update" path that sends existing page content to the LLM for rewriting, (4) a code-only "related" path for cross-ref links. Extract shared logic between `compile_short_doc` and `compile_long_doc` into `_compile_concepts`. 
+ +**Tech Stack:** Python, litellm, asyncio, pytest + +--- + +### Task 1: Add `_read_concept_briefs` and test + +**Files:** +- Modify: `openkb/agent/compiler.py:199-207` (File I/O helpers section) +- Modify: `tests/test_compiler.py:98-116` (TestReadWikiContext section) + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_compiler.py`: + +```python +from openkb.agent.compiler import _read_concept_briefs + +class TestReadConceptBriefs: + def test_empty_wiki(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_no_concepts_dir(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_reads_briefs_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nAttention allows models to focus on relevant input parts selectively.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention: Attention allows models" in result + + def test_reads_briefs_without_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "rnn.md").write_text( + "Recurrent neural networks process sequences step by step.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- rnn: Recurrent neural networks" in result + + def test_truncates_long_content(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "long.md").write_text("A" * 300, encoding="utf-8") + result = _read_concept_briefs(wiki) + brief_line = result.split("\n")[0] + # slug + ": " + 150 chars = well under 200 + assert len(brief_line) < 200 + + def test_sorted_alphabetically(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + 
concepts.mkdir(parents=True) + (concepts / "zebra.md").write_text("Zebra concept.", encoding="utf-8") + (concepts / "alpha.md").write_text("Alpha concept.", encoding="utf-8") + result = _read_concept_briefs(wiki) + lines = result.strip().split("\n") + assert lines[0].startswith("- alpha:") + assert lines[1].startswith("- zebra:") +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v` +Expected: FAIL with `ImportError: cannot import name '_read_concept_briefs'` + +- [ ] **Step 3: Implement `_read_concept_briefs`** + +Add to `openkb/agent/compiler.py` in the File I/O helpers section (after `_read_wiki_context`): + +```python +def _read_concept_briefs(wiki_dir: Path) -> str: + """Read existing concept pages and return compact briefs for the LLM. + + Returns a string like: + - attention: Attention allows models to focus on relevant input parts... + - transformer: The Transformer is a neural network architecture... + + Or "(none yet)" if no concept pages exist. 
+ """ + concepts_dir = wiki_dir / "concepts" + if not concepts_dir.exists(): + return "(none yet)" + briefs = [] + for p in sorted(concepts_dir.glob("*.md")): + text = p.read_text(encoding="utf-8") + # Skip YAML frontmatter + if text.startswith("---"): + parts = text.split("---", 2) + body = parts[2].strip() if len(parts) >= 3 else "" + else: + body = text.strip() + brief = body[:150].replace("\n", " ") + if brief: + briefs.append(f"- {p.stem}: {brief}") + return "\n".join(briefs) or "(none yet)" +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `pytest tests/test_compiler.py::TestReadConceptBriefs -v` +Expected: All 6 tests PASS + +- [ ] **Step 5: Update the import in test file** + +Add `_read_concept_briefs` to the existing import block at the top of `tests/test_compiler.py`: + +```python +from openkb.agent.compiler import ( + compile_long_doc, + compile_short_doc, + _parse_json, + _write_summary, + _write_concept, + _update_index, + _read_wiki_context, + _read_concept_briefs, +) +``` + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add _read_concept_briefs for concept dedup context" +``` + +--- + +### Task 2: Replace prompt template and update JSON parsing + +**Files:** +- Modify: `openkb/agent/compiler.py:53-70` (prompt templates section) +- Modify: `tests/test_compiler.py:21-31` (TestParseJson section) + +- [ ] **Step 1: Write the failing test for new JSON format** + +Add to `tests/test_compiler.py`: + +```python +class TestParseConceptsPlan: + def test_dict_format(self): + text = json.dumps({ + "create": [{"name": "foo", "title": "Foo"}], + "update": [{"name": "bar", "title": "Bar"}], + "related": ["baz"], + }) + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert len(parsed["create"]) == 1 + assert len(parsed["update"]) == 1 + assert parsed["related"] == ["baz"] + + def test_fallback_list_format(self): + """If LLM returns old flat array, _parse_json still 
works.""" + text = json.dumps([{"name": "foo", "title": "Foo"}]) + parsed = _parse_json(text) + assert isinstance(parsed, list) + + def test_fenced_dict(self): + text = '```json\n{"create": [], "update": [], "related": []}\n```' + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert parsed["create"] == [] +``` + +- [ ] **Step 2: Run test to verify it passes (these use existing `_parse_json`)** + +Run: `pytest tests/test_compiler.py::TestParseConceptsPlan -v` +Expected: All 3 PASS — `_parse_json` already handles dicts. This confirms compatibility. + +- [ ] **Step 3: Replace `_CONCEPTS_LIST_USER` with `_CONCEPTS_PLAN_USER`** + +In `openkb/agent/compiler.py`, replace the `_CONCEPTS_LIST_USER` template (lines 53-70) with: + +```python +_CONCEPTS_PLAN_USER = """\ +Based on the summary above, decide how to update the wiki's concept pages. + +Existing concept pages: +{concept_briefs} + +Return a JSON object with three keys: + +1. "create" — new concepts not covered by any existing page. Array of objects: + {{"name": "concept-slug", "title": "Human-Readable Title"}} + +2. "update" — existing concepts that have significant new information from \ +this document worth integrating. Array of objects: + {{"name": "existing-slug", "title": "Existing Title"}} + +3. "related" — existing concepts tangentially related to this document but \ +not needing content changes, just a cross-reference link. Array of slug strings. + +Rules: +- For the first few documents, create 2-3 foundational concepts at most. +- Do NOT create a concept that overlaps with an existing one — use "update". +- Do NOT create concepts that are just the document topic itself. +- "related" is for lightweight cross-linking only, no content rewrite needed. + +Return ONLY valid JSON, no fences, no explanation. 
+""" +``` + +- [ ] **Step 4: Add `_CONCEPT_UPDATE_USER` template** + +Add after `_CONCEPT_PAGE_USER` (after line 82): + +```python +_CONCEPT_UPDATE_USER = """\ +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be \ +integrated into this page. Rewrite the full page incorporating the new \ +information naturally — do not just append. Maintain existing \ +[[wikilinks]] and add new ones where appropriate. + +Return ONLY the Markdown content (no frontmatter, no code fences). +""" +``` + +- [ ] **Step 5: Run all existing tests to verify nothing breaks** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS (templates aren't tested directly, only via integration tests which we'll update later) + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add concepts plan and update prompt templates" +``` + +--- + +### Task 3: Add `_add_related_link` and test + +**Files:** +- Modify: `openkb/agent/compiler.py` (File I/O helpers section, after `_write_concept`) +- Modify: `tests/test_compiler.py` + +- [ ] **Step 1: Write the failing test** + +Add to `tests/test_compiler.py`: + +```python +from openkb.agent.compiler import _add_related_link + +class TestAddRelatedLink: + def test_adds_see_also_link(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper2.pdf") + text = (concepts / "attention.md").read_text() + assert "[[summaries/new-doc]]" in text + assert "paper2.pdf" in text + + def test_skips_if_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + 
"---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper1.pdf") + text = (concepts / "attention.md").read_text() + # Should not duplicate + assert text.count("[[summaries/new-doc]]") == 1 + + def test_skips_if_file_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + # Should not raise + _add_related_link(wiki, "nonexistent", "doc", "file.pdf") +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v` +Expected: FAIL with `ImportError: cannot import name '_add_related_link'` + +- [ ] **Step 3: Implement `_add_related_link`** + +Add to `openkb/agent/compiler.py` after `_write_concept`: + +```python +def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None: + """Add a cross-reference link to an existing concept page (no LLM call).""" + concepts_dir = wiki_dir / "concepts" + path = concepts_dir / f"{concept_slug}.md" + if not path.exists(): + return + + text = path.read_text(encoding="utf-8") + link = f"[[summaries/{doc_name}]]" + if link in text: + return + + # Update sources in frontmatter + if source_file not in text: + if text.startswith("---"): + end = text.index("---", 3) + fm = text[:end + 3] + body = text[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) + text = fm + body + else: + text = f"---\nsources: [{source_file}]\n---\n\n" + text + + text += f"\n\nSee also: {link}" + path.write_text(text, encoding="utf-8") +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `pytest tests/test_compiler.py::TestAddRelatedLink -v` +Expected: All 3 tests PASS + +- [ ] **Step 5: Update the import in test file** + +Add `_add_related_link` to the import block at top of `tests/test_compiler.py`. 
+ +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add _add_related_link for code-only cross-referencing" +``` + +--- + +### Task 4: Extract `_compile_concepts` and refactor both public functions + +**Files:** +- Modify: `openkb/agent/compiler.py:290-509` (Public API section — full rewrite) +- Modify: `tests/test_compiler.py:153-267` (integration tests) + +This is the core task. It extracts the shared Steps 2-4 into `_compile_concepts`, updates both public functions to call it, and switches to the new concepts plan format. + +- [ ] **Step 1: Write integration test for new create/update/related flow** + +Add to `tests/test_compiler.py`: + +```python +class TestCompileConceptsPlan: + """Integration tests for the new create/update/related flow.""" + + @pytest.mark.asyncio + async def test_create_and_update_flow(self, tmp_path): + """New doc creates one concept and updates an existing one.""" + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + concepts_dir = wiki / "concepts" + concepts_dir.mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + # Pre-existing concept + (concepts_dir / "attention.md").write_text( + "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOld content about attention.", + encoding="utf-8", + ) + + source_path = wiki / "sources" / "new-paper.md" + source_path.write_text("# New Paper\n\nContent about flash attention and transformers.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "new-paper.pdf").write_bytes(b"fake") + + summary_resp = "This paper introduces flash attention, improving on attention mechanisms." 
+ plan_resp = json.dumps({ + "create": [{"name": "flash-attention", "title": "Flash Attention"}], + "update": [{"name": "attention", "title": "Attention Mechanism"}], + "related": [], + }) + create_page_resp = "# Flash Attention\n\nAn efficient attention algorithm." + update_page_resp = "# Attention\n\nUpdated content with flash attention details." + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([create_page_resp, update_page_resp]) + ) + await compile_short_doc("new-paper", source_path, tmp_path, "gpt-4o-mini") + + # New concept created + flash_path = concepts_dir / "flash-attention.md" + assert flash_path.exists() + assert "sources: [new-paper.pdf]" in flash_path.read_text() + + # Existing concept rewritten (not appended) + attn_text = (concepts_dir / "attention.md").read_text() + assert "new-paper.pdf" in attn_text + assert "Updated content with flash attention details" in attn_text + + # Index updated for both + index_text = (wiki / "index.md").read_text() + assert "[[concepts/flash-attention]]" in index_text + + @pytest.mark.asyncio + async def test_related_adds_link_no_llm(self, tmp_path): + """Related concepts get cross-ref links without LLM calls.""" + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + concepts_dir = wiki / "concepts" + concepts_dir.mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + (concepts_dir / "transformer.md").write_text( + "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nArchitecture details.", + encoding="utf-8", + ) + + source_path = wiki / "sources" / "doc.md" + source_path.write_text("Content", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / 
"raw" / "doc.pdf").write_bytes(b"fake") + + summary_resp = "A short summary." + plan_resp = json.dumps({ + "create": [], + "update": [], + "related": ["transformer"], + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + # acompletion should NOT be called (no create/update) + mock_litellm.acompletion = AsyncMock(side_effect=AssertionError("should not be called")) + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + # Related concept should have cross-ref link + transformer_text = (concepts_dir / "transformer.md").read_text() + assert "[[summaries/doc]]" in transformer_text + + @pytest.mark.asyncio + async def test_fallback_list_format(self, tmp_path): + """If LLM returns old flat array, treat all as create.""" + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "doc.md" + source_path.write_text("Content", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "doc.pdf").write_bytes(b"fake") + + summary_resp = "Summary." + # Old format: flat array + plan_resp = json.dumps([{"name": "foo", "title": "Foo"}]) + page_resp = "# Foo\n\nContent." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([page_resp]) + ) + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + assert (wiki / "concepts" / "foo.md").exists() +``` + +- [ ] **Step 2: Run the new tests to verify they fail** + +Run: `pytest tests/test_compiler.py::TestCompileConceptsPlan -v` +Expected: FAIL — the current code uses old prompt format and doesn't handle dict responses + +- [ ] **Step 3: Implement `_compile_concepts` and refactor public functions** + +Replace the entire Public API section (from `DEFAULT_COMPILE_CONCURRENCY` to end of file) in `openkb/agent/compiler.py` with: + +```python +DEFAULT_COMPILE_CONCURRENCY = 5 + + +async def _compile_concepts( + wiki_dir: Path, + kb_dir: Path, + model: str, + system_msg: dict, + doc_msg: dict, + summary: str, + doc_name: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: + """Shared concept compilation logic: plan → create/update/related → index. + + This is the core of the compilation pipeline, shared by both + compile_short_doc and compile_long_doc. 
+ """ + source_file = _find_source_filename(doc_name, kb_dir) + concept_briefs = _read_concept_briefs(wiki_dir) + + # --- Concepts plan (A cached) --- + plan_raw = _llm_call(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPTS_PLAN_USER.format( + concept_briefs=concept_briefs, + )}, + ], "concepts-plan", max_tokens=1024) + + try: + parsed = _parse_json(plan_raw) + except (json.JSONDecodeError, ValueError) as exc: + logger.warning("Failed to parse concepts plan: %s", exc) + logger.debug("Raw: %s", plan_raw) + _update_index(wiki_dir, doc_name, []) + return + + # Fallback: if LLM returns flat array, treat all as create + if isinstance(parsed, list): + create_list, update_list, related_list = parsed, [], [] + else: + create_list = parsed.get("create", []) + update_list = parsed.get("update", []) + related_list = parsed.get("related", []) + + if not create_list and not update_list and not related_list: + _update_index(wiki_dir, doc_name, []) + return + + # --- Concurrent concept generation (A cached) --- + semaphore = asyncio.Semaphore(max_concurrency) + + async def _gen_create(concept: dict) -> tuple[str, str, bool]: + name = concept["name"] + title = concept.get("title", name) + async with semaphore: + page_content = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_PAGE_USER.format( + title=title, doc_name=doc_name, + update_instruction="", + )}, + ], f"create:{name}") + return name, page_content, False + + async def _gen_update(concept: dict) -> tuple[str, str, bool]: + name = concept["name"] + title = concept.get("title", name) + # Read existing page content for the LLM to integrate + concept_path = wiki_dir / "concepts" / f"{name}.md" + if concept_path.exists(): + raw_text = concept_path.read_text(encoding="utf-8") + # Strip frontmatter for the LLM + if raw_text.startswith("---"): + parts = 
raw_text.split("---", 2) + existing_content = parts[2].strip() if len(parts) >= 3 else raw_text + else: + existing_content = raw_text + else: + existing_content = "(page not found — create from scratch)" + async with semaphore: + page_content = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_UPDATE_USER.format( + title=title, doc_name=doc_name, + existing_content=existing_content, + )}, + ], f"update:{name}") + return name, page_content, True + + tasks = [] + tasks.extend(_gen_create(c) for c in create_list) + tasks.extend(_gen_update(c) for c in update_list) + + if tasks: + total = len(tasks) + sys.stdout.write(f" Generating {total} concept(s) (concurrency={max_concurrency})...\n") + sys.stdout.flush() + + results = await asyncio.gather(*tasks, return_exceptions=True) + else: + results = [] + + concept_names = [] + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update = r + _write_concept(wiki_dir, name, page_content, source_file, is_update) + concept_names.append(name) + + # --- Related: code-only cross-ref links --- + for slug in related_list: + _add_related_link(wiki_dir, slug, doc_name, source_file) + + # --- Update index --- + _update_index(wiki_dir, doc_name, concept_names) + + +async def compile_short_doc( + doc_name: str, + source_path: Path, + kb_dir: Path, + model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: + """Compile a short document into wiki pages. + + Step 1: Generate summary from full document text. + Step 2: Plan + generate/update concept pages (via _compile_concepts). 
+ """ + from openkb.config import load_config + + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + language: str = config.get("language", "en") + + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + source_file = _find_source_filename(doc_name, kb_dir) + content = source_path.read_text(encoding="utf-8") + + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _SUMMARY_USER.format( + doc_name=doc_name, content=content, + )} + + # Step 1: Generate summary + summary = _llm_call(model, [system_msg, doc_msg], "summary") + _write_summary(wiki_dir, doc_name, source_file, summary) + + # Step 2: Compile concepts + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, summary, + doc_name, max_concurrency, + ) + + +async def compile_long_doc( + doc_name: str, + summary_path: Path, + doc_id: str, + kb_dir: Path, + model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: + """Compile a long (PageIndex) document into wiki concept pages. + + The summary page is already written by the indexer. This function + generates an overview, then plans + generates/updates concept pages. 
+ """ + from openkb.config import load_config + + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + language: str = config.get("language", "en") + + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + summary_text = summary_path.read_text(encoding="utf-8") + + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format( + doc_name=doc_name, doc_id=doc_id, content=summary_text, + )} + + # Step 1: Generate overview + overview = _llm_call(model, [system_msg, doc_msg], "overview") + + # Step 2: Compile concepts + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, overview, + doc_name, max_concurrency, + ) +``` + +- [ ] **Step 4: Update existing integration tests** + +Update `TestCompileShortDoc.test_full_pipeline` — the concepts-list response now needs to be the new dict format: + +```python +class TestCompileShortDoc: + @pytest.mark.asyncio + async def test_full_pipeline(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_response = "# Summary\n\nThis document discusses transformers." + plan_response = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], + "related": [], + }) + concept_page_response = "# Transformer\n\nA neural network architecture." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_response, plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") + + summary_path = wiki / "summaries" / "test-doc.md" + assert summary_path.exists() + assert "sources: [test-doc.pdf]" in summary_path.read_text() + + concept_path = wiki / "concepts" / "transformer.md" + assert concept_path.exists() + assert "sources: [test-doc.pdf]" in concept_path.read_text() + + index_text = (wiki / "index.md").read_text() + assert "[[summaries/test-doc]]" in index_text + assert "[[concepts/transformer]]" in index_text +``` + +Update `TestCompileShortDoc.test_handles_bad_json` — no changes needed (bad JSON still triggers fallback). + +Update `TestCompileLongDoc.test_full_pipeline`: + +```python +class TestCompileLongDoc: + @pytest.mark.asyncio + async def test_full_pipeline(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + summary_path = wiki / "summaries" / "big-doc.md" + summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8") + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake") + + overview_response = "Overview of the big document." + plan_response = json.dumps({ + "create": [{"name": "deep-learning", "title": "Deep Learning"}], + "update": [], + "related": [], + }) + concept_page_response = "# Deep Learning\n\nA subfield of ML." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([overview_response, plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await compile_long_doc( + "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini" + ) + + concept_path = wiki / "concepts" / "deep-learning.md" + assert concept_path.exists() + assert "Deep Learning" in concept_path.read_text() + + index_text = (wiki / "index.md").read_text() + assert "[[summaries/big-doc]]" in index_text + assert "[[concepts/deep-learning]]" in index_text +``` + +- [ ] **Step 5: Run all tests** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS + +- [ ] **Step 6: Run the full test suite** + +Run: `pytest tests/ -v` +Expected: All 149+ tests PASS + +- [ ] **Step 7: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: concept dedup with briefs, update/related paths, extract _compile_concepts" +``` + +--- + +### Task 5: Clean up old references and update module docstring + +**Files:** +- Modify: `openkb/agent/compiler.py:1-9` (module docstring) + +- [ ] **Step 1: Update module docstring** + +Replace the docstring at the top of `openkb/agent/compiler.py`: + +```python +"""Wiki compilation pipeline for OpenKB. + +Pipeline leveraging LLM prompt caching: + Step 1: Build base context A (schema + document content). + Step 2: A → generate summary. + Step 3: A + summary → concepts plan (create/update/related). + Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts. + Step 5: Code adds cross-ref links to related concepts, updates index. 
+""" +``` + +- [ ] **Step 2: Verify `_CONCEPTS_LIST_USER` is fully removed** + +Search for any remaining references to `_CONCEPTS_LIST_USER` in the codebase: + +Run: `grep -r "_CONCEPTS_LIST_USER" openkb/ tests/` +Expected: No matches + +- [ ] **Step 3: Run full test suite one final time** + +Run: `pytest tests/ -q` +Expected: All tests pass + +- [ ] **Step 4: Commit** + +```bash +git add openkb/agent/compiler.py +git commit -m "chore: update compiler docstring for new pipeline" +``` diff --git a/docs/superpowers/plans/2026-04-09-retrieve-redesign.md b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md new file mode 100644 index 0000000..3c659bc --- /dev/null +++ b/docs/superpowers/plans/2026-04-09-retrieve-redesign.md @@ -0,0 +1,1104 @@ +# Retrieve Redesign Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Unify querying across long and short docs, add brief summaries to index.md and frontmatter, and store long-doc sources as JSON with per-page access. + +**Architecture:** (1) LLM prompts return a JSON object with `brief` and `content` keys — briefs flow into frontmatter and index.md. (2) The indexer stores long-doc pages as a JSON array. (3) A new `get_page_content` tool replaces `pageindex_retrieve`. (4) The query agent uses the same tools for all docs. 
+ +**Tech Stack:** Python, litellm, asyncio, pytest + +--- + +### Task 1: Add `get_page_content` tool and `parse_pages` helper + +**Files:** +- Modify: `openkb/agent/tools.py` +- Modify: `tests/test_agent_tools.py` + +- [ ] **Step 1: Write failing tests** + +Add to `tests/test_agent_tools.py`: + +```python +from openkb.agent.tools import get_page_content, parse_pages + +class TestParsePages: + def test_single_page(self): + assert parse_pages("3") == [3] + + def test_range(self): + assert parse_pages("3-5") == [3, 4, 5] + + def test_comma_separated(self): + assert parse_pages("1,3,5") == [1, 3, 5] + + def test_mixed(self): + assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12] + + def test_deduplication(self): + assert parse_pages("3,3,3") == [3] + + def test_sorted(self): + assert parse_pages("5,1,3") == [1, 3, 5] + + def test_ignores_zero_and_negative(self): + assert parse_pages("0,-1,3") == [3] + + +class TestGetPageContent: + def test_reads_pages_from_json(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + {"page": 3, "content": "Page three text."}, + ] + (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") + + result = get_page_content("paper", "1,3", wiki_root) + assert "[Page 1]" in result + assert "Page one text." in result + assert "[Page 3]" in result + assert "Page three text." 
in result + assert "Page two" not in result + + def test_returns_error_for_missing_file(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("nonexistent", "1", wiki_root) + assert "not found" in result.lower() + + def test_returns_error_for_no_matching_pages(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [{"page": 1, "content": "Only page."}] + (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") + + result = get_page_content("paper", "99", wiki_root) + assert "no content" in result.lower() or result.strip() == "" + + def test_includes_images_info(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [ + {"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]}, + ] + (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8") + + result = get_page_content("doc", "1", wiki_root) + assert "img.png" in result + + def test_path_escape_denied(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("../../etc/passwd", "1", wiki_root) + assert "denied" in result.lower() or "not found" in result.lower() +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_agent_tools.py::TestParsePages tests/test_agent_tools.py::TestGetPageContent -v` +Expected: FAIL with `ImportError` + +- [ ] **Step 3: Implement `parse_pages` and `get_page_content`** + +Add to `openkb/agent/tools.py`: + +```python +import json as _json + + +def parse_pages(pages: str) -> list[int]: + """Parse a page specification like '3-5,7,10-12' into a sorted list of ints.""" + result: set[int] = set() + for part in pages.split(","): + part = part.strip() + if "-" in part: + start_str, end_str = part.split("-", 1) + try: + start, end = int(start_str), int(end_str) + 
result.update(range(start, end + 1)) + except ValueError: + continue + else: + try: + result.add(int(part)) + except ValueError: + continue + return sorted(n for n in result if n >= 1) + + +def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: + """Get text content of specific pages from a long document. + + Reads from ``wiki/sources/{doc_name}.json`` which contains a JSON array + of ``{"page": int, "content": str, "images": [...]}`` objects. + + Args: + doc_name: Document name (stem, e.g. ``'attention-is-all-you-need'``). + pages: Page specification (e.g. ``'3-5,7,10-12'``). + wiki_root: Absolute path to the wiki root directory. + + Returns: + Formatted text of requested pages, or error message if not found. + """ + root = Path(wiki_root).resolve() + json_path = (root / "sources" / f"{doc_name}.json").resolve() + if not json_path.is_relative_to(root): + return "Access denied: path escapes wiki root." + if not json_path.exists(): + return f"Document not found: {doc_name}. No sources/{doc_name}.json file." 
+ + data = _json.loads(json_path.read_text(encoding="utf-8")) + page_nums = set(parse_pages(pages)) + matched = [p for p in data if p["page"] in page_nums] + + if not matched: + return f"No content found for pages: {pages}" + + parts: list[str] = [] + for p in matched: + header = f"[Page {p['page']}]" + text = p.get("content", "") + if "images" in p: + img_refs = ", ".join(img["path"] for img in p["images"]) + text += f"\n[Images: {img_refs}]" + parts.append(f"{header}\n{text}") + + return "\n\n".join(parts) +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `pytest tests/test_agent_tools.py -v` +Expected: All PASS + +- [ ] **Step 5: Commit** + +```bash +git add openkb/agent/tools.py tests/test_agent_tools.py +git commit -m "feat: add get_page_content tool and parse_pages helper" +``` + +--- + +### Task 2: Change LLM prompts to return `{"brief", "content"}` JSON + +**Files:** +- Modify: `openkb/agent/compiler.py` (prompt templates, lines 40-105) +- Modify: `tests/test_compiler.py` (TestParseConceptsPlan) + +- [ ] **Step 1: Write test for brief+content JSON parsing** + +Add to `tests/test_compiler.py`: + +```python +class TestParseBriefContent: + def test_dict_with_brief_and_content(self): + text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."}) + parsed = _parse_json(text) + assert parsed["brief"] == "A short desc" + assert "# Full page" in parsed["content"] + + def test_plain_text_fallback(self): + """If LLM returns plain text, _parse_json raises — caller handles fallback.""" + with pytest.raises((json.JSONDecodeError, ValueError)): + _parse_json("Just plain markdown text without JSON") +``` + +- [ ] **Step 2: Run test to verify it passes (existing _parse_json handles dicts)** + +Run: `pytest tests/test_compiler.py::TestParseBriefContent -v` +Expected: PASS — `_parse_json` already handles dicts + +- [ ] **Step 3: Update `_SUMMARY_USER` prompt** + +Replace in `openkb/agent/compiler.py`: + +```python +_SUMMARY_USER = """\ 
+New document: {doc_name} + +Full text: +{content} + +Write a summary page for this document in Markdown. + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) describing the document's main contribution +- "content": The full summary in Markdown. Include key concepts, findings, ideas, \ +and [[wikilinks]] to concepts that could become cross-document concept pages + +Return ONLY valid JSON, no fences. +""" +``` + +- [ ] **Step 4: Update `_CONCEPT_PAGE_USER` prompt** + +Replace in `openkb/agent/compiler.py`: + +```python +_CONCEPT_PAGE_USER = """\ +Write the concept page for: {title} + +This concept relates to the document "{doc_name}" summarized above. +{update_instruction} + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept +- "content": The full concept page in Markdown. Include clear explanation, \ +key details from the source document, and [[wikilinks]] to related concepts \ +and [[summaries/{doc_name}]] + +Return ONLY valid JSON, no fences. +""" +``` + +- [ ] **Step 5: Update `_CONCEPT_UPDATE_USER` prompt** + +Replace in `openkb/agent/compiler.py`: + +```python +_CONCEPT_UPDATE_USER = """\ +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be \ +integrated into this page. Rewrite the full page incorporating the new \ +information naturally — do not just append. Maintain existing \ +[[wikilinks]] and add new ones where appropriate. + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept (may differ from before) +- "content": The rewritten full concept page in Markdown + +Return ONLY valid JSON, no fences. 
+""" +``` + +- [ ] **Step 6: Run all tests (prompts aren't tested directly)** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS + +- [ ] **Step 7: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: update LLM prompts to return brief+content JSON" +``` + +--- + +### Task 3: Update `_write_summary` and `_write_concept` to store `brief` in frontmatter + +**Files:** +- Modify: `openkb/agent/compiler.py` (lines 274-320, `_write_summary` and `_write_concept`) +- Modify: `tests/test_compiler.py` + +- [ ] **Step 1: Write failing tests** + +Update existing and add new tests in `tests/test_compiler.py`: + +```python +class TestWriteSummary: + def test_writes_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.", brief="Introduces transformers") + path = wiki / "summaries" / "my-doc.md" + assert path.exists() + text = path.read_text() + assert "sources: [my-doc.pdf]" in text + assert "brief: Introduces transformers" in text + assert "# Summary" in text + + def test_writes_without_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "my-doc.pdf", "# Summary\n\nContent here.") + path = wiki / "summaries" / "my-doc.md" + text = path.read_text() + assert "sources: [my-doc.pdf]" in text + assert "brief:" not in text +``` + +Update `TestWriteConcept`: + +```python +class TestWriteConcept: + def test_new_concept_with_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus") + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "brief: Mechanism for selective focus" in text + assert "# Attention" in text + + def test_new_concept(self, tmp_path): + wiki = tmp_path / 
"wiki" + wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False) + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "# Attention" in text + + def test_update_concept_appends_source(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.", + encoding="utf-8", + ) + _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True, brief="Updated brief") + text = (concepts / "attention.md").read_text() + assert "paper2.pdf" in text + assert "paper1.pdf" in text + assert "brief: Updated brief" in text + assert "New info from paper2." in text +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v` +Expected: FAIL — `_write_summary` and `_write_concept` don't accept `brief` parameter + +- [ ] **Step 3: Update `_write_summary` to accept `brief`** + +```python +def _write_summary(wiki_dir: Path, doc_name: str, source_file: str, summary: str, brief: str = "") -> None: + """Write summary page with frontmatter.""" + summaries_dir = wiki_dir / "summaries" + summaries_dir.mkdir(parents=True, exist_ok=True) + fm_lines = [f"sources: [{source_file}]"] + if brief: + fm_lines.append(f"brief: {brief}") + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" + (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") +``` + +- [ ] **Step 4: Update `_write_concept` to accept `brief`** + +Add `brief: str = ""` parameter to `_write_concept`. 
In the new-concept branch: + +```python + else: + fm_lines = [f"sources: [{source_file}]"] + if brief: + fm_lines.append(f"brief: {brief}") + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" + path.write_text(frontmatter + content, encoding="utf-8") +``` + +In the update branch, after updating sources in frontmatter, also update brief: + +```python + if is_update and path.exists(): + existing = path.read_text(encoding="utf-8") + if source_file not in existing: + # ... existing frontmatter update logic ... + # Update brief in frontmatter if provided + if brief and existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + fm = existing[:end + 3] + body = existing[end + 3:] + if "brief:" in fm: + import re + fm = re.sub(r"brief:.*", f"brief: {brief}", fm) + else: + fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1) + existing = fm + body + path.write_text(existing, encoding="utf-8") +``` + +- [ ] **Step 5: Run tests to verify they pass** + +Run: `pytest tests/test_compiler.py::TestWriteSummary tests/test_compiler.py::TestWriteConcept -v` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: store brief in frontmatter of summary and concept pages" +``` + +--- + +### Task 4: Update `_update_index` to include briefs, and update `_read_concept_briefs` to read from frontmatter + +**Files:** +- Modify: `openkb/agent/compiler.py` (lines 233-261 and 408-430) +- Modify: `tests/test_compiler.py` + +- [ ] **Step 1: Write failing tests for `_update_index` with briefs** + +```python +class TestUpdateIndex: + def test_appends_entries_with_briefs(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention", "transformer"], + doc_brief="Introduces transformers", + concept_briefs={"attention": "Focus 
mechanism", "transformer": "NN architecture"}) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]] — Introduces transformers" in text + assert "[[concepts/attention]] — Focus mechanism" in text + assert "[[concepts/transformer]] — NN architecture" in text + + def test_no_duplicates(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", [], doc_brief="New brief") + text = (wiki / "index.md").read_text() + assert text.count("[[summaries/my-doc]]") == 1 + + def test_backwards_compat_no_briefs(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention"]) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]]" in text + assert "[[concepts/attention]]" in text +``` + +Write test for updated `_read_concept_briefs`: + +```python +class TestReadConceptBriefs: + # ... keep existing tests ... + + def test_reads_brief_from_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention: Selective focus mechanism" in result + + def test_falls_back_to_body_truncation(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "old.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- old: Old concept without brief field." 
in result +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_compiler.py::TestUpdateIndex tests/test_compiler.py::TestReadConceptBriefs -v` +Expected: FAIL — `_update_index` doesn't accept `doc_brief`/`concept_briefs` parameters + +- [ ] **Step 3: Update `_update_index`** + +```python +def _update_index( + wiki_dir: Path, doc_name: str, concept_names: list[str], + doc_brief: str = "", concept_briefs: dict[str, str] | None = None, +) -> None: + """Append document and concept entries to index.md with optional briefs.""" + index_path = wiki_dir / "index.md" + if not index_path.exists(): + index_path.write_text( + "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + + text = index_path.read_text(encoding="utf-8") + + doc_link = f"[[summaries/{doc_name}]]" + if doc_link not in text: + doc_entry = f"- {doc_link}" + if doc_brief: + doc_entry += f" — {doc_brief}" + if "## Documents" in text: + text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) + + if concept_briefs is None: + concept_briefs = {} + for name in concept_names: + concept_link = f"[[concepts/{name}]]" + if concept_link not in text: + concept_entry = f"- {concept_link}" + if name in concept_briefs: + concept_entry += f" — {concept_briefs[name]}" + if "## Concepts" in text: + text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) + + index_path.write_text(text, encoding="utf-8") +``` + +- [ ] **Step 4: Update `_read_concept_briefs` to read from frontmatter `brief:` field** + +```python +def _read_concept_briefs(wiki_dir: Path) -> str: + """Read existing concept pages and return compact one-line summaries. + + Reads ``brief:`` from YAML frontmatter if available, otherwise falls back + to the first 150 characters of the body text. 
+ """ + concepts_dir = wiki_dir / "concepts" + if not concepts_dir.exists(): + return "(none yet)" + + md_files = sorted(concepts_dir.glob("*.md")) + if not md_files: + return "(none yet)" + + lines: list[str] = [] + for path in md_files: + text = path.read_text(encoding="utf-8") + brief = "" + body = text + if text.startswith("---"): + end = text.find("---", 3) + if end != -1: + fm = text[:end + 3] + body = text[end + 3:] + # Try to extract brief from frontmatter + for line in fm.split("\n"): + if line.startswith("brief:"): + brief = line[len("brief:"):].strip() + break + if not brief: + brief = body.strip().replace("\n", " ")[:150] + if brief: + lines.append(f"- {path.stem}: {brief}") + + return "\n".join(lines) or "(none yet)" +``` + +- [ ] **Step 5: Run tests** + +Run: `pytest tests/test_compiler.py -v` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/compiler.py tests/test_compiler.py +git commit -m "feat: add briefs to index.md entries and read from frontmatter" +``` + +--- + +### Task 5: Wire briefs through `_compile_concepts` and public functions + +**Files:** +- Modify: `openkb/agent/compiler.py` (lines 438-611, `_compile_concepts`, `compile_short_doc`, `compile_long_doc`) +- Modify: `tests/test_compiler.py` + +This task connects the brief+content JSON parsing to the write functions and index update. 
+ +- [ ] **Step 1: Write integration test** + +```python +class TestBriefIntegration: + @pytest.mark.asyncio + async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_resp = json.dumps({ + "brief": "A paper about transformers", + "content": "# Summary\n\nThis paper discusses transformers.", + }) + plan_resp = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], + "related": [], + }) + concept_resp = json.dumps({ + "brief": "NN architecture using self-attention", + "content": "# Transformer\n\nA neural network architecture.", + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_resp]) + ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") + + # Check summary frontmatter has brief + summary_text = (wiki / "summaries" / "test-doc.md").read_text() + assert "brief: A paper about transformers" in summary_text + + # Check concept frontmatter has brief + concept_text = (wiki / "concepts" / "transformer.md").read_text() + assert "brief: NN architecture using self-attention" in concept_text + + # Check index has briefs + index_text = (wiki / "index.md").read_text() + assert "[[summaries/test-doc]] — A paper about transformers" in index_text + assert "[[concepts/transformer]] — NN 
architecture using self-attention" in index_text +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_compiler.py::TestBriefIntegration -v` +Expected: FAIL + +- [ ] **Step 3: Update `compile_short_doc` to parse brief+content from summary response** + +In `compile_short_doc`, replace: + +```python + # --- Step 1: Generate summary --- + summary = _llm_call(model, [system_msg, doc_msg], "summary") + _write_summary(wiki_dir, doc_name, source_file, summary) +``` + +With: + +```python + # --- Step 1: Generate summary --- + summary_raw = _llm_call(model, [system_msg, doc_msg], "summary") + try: + summary_parsed = _parse_json(summary_raw) + doc_brief = summary_parsed.get("brief", "") + summary = summary_parsed.get("content", summary_raw) + except (json.JSONDecodeError, ValueError): + doc_brief = "" + summary = summary_raw + _write_summary(wiki_dir, doc_name, source_file, summary, brief=doc_brief) +``` + +- [ ] **Step 4: Update `_compile_concepts` signature and wiring** + +Add `doc_brief: str = ""` parameter to `_compile_concepts`. + +In `_gen_create`, parse the response: + +```python + async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: + name = concept["name"] + title = concept.get("title", name) + async with semaphore: + raw = await _llm_call_async(model, [ + system_msg, doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_PAGE_USER.format( + title=title, doc_name=doc_name, update_instruction="", + )}, + ], f"create:{name}") + try: + parsed = _parse_json(raw) + brief = parsed.get("brief", "") + content = parsed.get("content", raw) + except (json.JSONDecodeError, ValueError): + brief, content = "", raw + return name, content, False, brief +``` + +Same for `_gen_update` — returns `tuple[str, str, bool, str]` (name, content, is_update, brief). 
+ +In the results processing loop: + +```python + concept_briefs_map: dict[str, str] = {} + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update, brief = r + _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief) + concept_names.append(name) + if brief: + concept_briefs_map[name] = brief +``` + +Pass briefs to `_update_index`: + +```python + _update_index(wiki_dir, doc_name, concept_names, + doc_brief=doc_brief, concept_briefs=concept_briefs_map) +``` + +- [ ] **Step 5: Update `compile_short_doc` to pass `doc_brief` to `_compile_concepts`** + +```python + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + summary, doc_name, max_concurrency, doc_brief=doc_brief, + ) +``` + +- [ ] **Step 6: Update `compile_long_doc` to pass `doc_brief` from `IndexResult.description`** + +`compile_long_doc` currently takes `doc_id` but not `description`. Add `doc_description: str = ""` parameter: + +```python +async def compile_long_doc( + doc_name: str, + summary_path: Path, + doc_id: str, + kb_dir: Path, + model: str, + doc_description: str = "", + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, +) -> None: +``` + +The `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain text, not JSON). 
Pass `doc_description` as `doc_brief`: + +```python + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + overview, doc_name, max_concurrency, doc_brief=doc_description, + ) +``` + +Also update the CLI call in `cli.py` line 135: + +```python +asyncio.run( + compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model, + doc_description=index_result.description) +) +``` + +- [ ] **Step 7: Update existing integration tests for new JSON response format** + +Update all mock LLM responses in `TestCompileShortDoc`, `TestCompileLongDoc`, and `TestCompileConceptsPlan` to return `{"brief": "...", "content": "..."}` JSON instead of plain text for summary and concept responses. + +- [ ] **Step 8: Run all tests** + +Run: `pytest tests/ -q` +Expected: All PASS + +- [ ] **Step 9: Commit** + +```bash +git add openkb/agent/compiler.py openkb/cli.py tests/test_compiler.py +git commit -m "feat: wire brief+content JSON through compile pipeline to index and frontmatter" +``` + +--- + +### Task 6: Indexer — long doc sources from markdown to JSON + +**Files:** +- Modify: `openkb/indexer.py` +- Modify: `openkb/tree_renderer.py` (remove `render_source_md`) +- Modify: `tests/test_indexer.py` + +- [ ] **Step 1: Write failing test** + +Update `tests/test_indexer.py`: + +```python + def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path): + """Long doc source should be written as JSON, not markdown.""" + import json as json_mod + doc_id = "abc-123" + fake_col = self._make_fake_collection(doc_id, sample_tree) + + fake_client = MagicMock() + fake_client.collection.return_value = fake_col + # Mock get_page_content to return page data + fake_col.get_page_content.return_value = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + ] + + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4 fake") + + with patch("openkb.indexer.PageIndexClient", return_value=fake_client): + 
index_long_document(pdf_path, kb_dir) + + # Should be JSON, not MD + json_file = kb_dir / "wiki" / "sources" / "sample.json" + assert json_file.exists() + assert not (kb_dir / "wiki" / "sources" / "sample.md").exists() + data = json_mod.loads(json_file.read_text()) + assert len(data) == 2 + assert data[0]["page"] == 1 +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `pytest tests/test_indexer.py::TestIndexLongDocument::test_source_page_written_as_json -v` +Expected: FAIL + +- [ ] **Step 3: Update `indexer.py` to write JSON sources** + +Replace the source writing block (lines 103-110) with: + +```python + # Write wiki/sources/ as JSON (per-page content from PageIndex) + sources_dir = kb_dir / "wiki" / "sources" + sources_dir.mkdir(parents=True, exist_ok=True) + dest_images_dir = sources_dir / "images" / pdf_path.stem + + # Get per-page content from PageIndex + all_pages = col.get_page_content(doc_id, f"1-{doc.get('page_count', 9999)}") + + # Relocate image paths + dest_images_dir.mkdir(parents=True, exist_ok=True) + for page in all_pages: + if "images" in page: + for img in page["images"]: + src_path = Path(img["path"]) + if src_path.exists(): + filename = src_path.name + dest = dest_images_dir / filename + if not dest.exists(): + shutil.copy2(src_path, dest) + img["path"] = f"images/{pdf_path.stem}/{filename}" + + import json as json_mod + (sources_dir / f"{pdf_path.stem}.json").write_text( + json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", + ) +``` + +Remove the `render_source_md` import and `_relocate_images` call. + +- [ ] **Step 4: Remove `render_source_md` from tree_renderer.py** + +Remove the `render_source_md` function and `_render_nodes_source` helper from `openkb/tree_renderer.py`. Keep `render_summary_md` and `_render_nodes_summary`. + +- [ ] **Step 5: Update existing test `test_source_page_written`** + +The old test checks for `.md` — update it to check for `.json` or remove it (replaced by the new test). 
+ +- [ ] **Step 6: Run all tests** + +Run: `pytest tests/ -q` +Expected: All PASS + +- [ ] **Step 7: Commit** + +```bash +git add openkb/indexer.py openkb/tree_renderer.py tests/test_indexer.py +git commit -m "feat: store long doc sources as per-page JSON, remove render_source_md" +``` + +--- + +### Task 7: Query agent — remove `pageindex_retrieve`, add `get_page_content`, update instructions + +**Files:** +- Modify: `openkb/agent/query.py` +- Modify: `openkb/schema.py` +- Modify: `tests/test_query.py` + +- [ ] **Step 1: Write failing tests** + +Update `tests/test_query.py`: + +```python +class TestBuildQueryAgent: + def test_agent_name(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert agent.name == "wiki-query" + + def test_agent_has_three_tools(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert len(agent.tools) == 3 + + def test_agent_tool_names(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + names = {t.name for t in agent.tools} + assert "list_files" in names + assert "read_file" in names + assert "get_page_content" in names + assert "pageindex_retrieve" not in names + + def test_instructions_mention_get_page_content(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert "get_page_content" in agent.instructions +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `pytest tests/test_query.py::TestBuildQueryAgent -v` +Expected: FAIL — old signature requires `openkb_dir` + +- [ ] **Step 3: Rewrite `query.py`** + +Remove `_pageindex_retrieve_impl` entirely (~110 lines). Remove `PageIndexClient` import. 
Update `build_query_agent`. Note that `test_agent_tool_names` expects the page tool to be exposed as `get_page_content`; `function_tool` names a tool after its Python function by default, so register it under that name (for example with `@function_tool(name_override="get_page_content")`) rather than the default `get_page_content_tool`: + +```python +def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent: + """Build and return the Q&A agent.""" + schema_md = get_agents_md(Path(wiki_root)) + instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) + instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." + + @function_tool + def list_files(directory: str) -> str: + """List all Markdown files in a wiki subdirectory.""" + return list_wiki_files(directory, wiki_root) + + @function_tool + def read_file(path: str) -> str: + """Read a Markdown file from the wiki.""" + return read_wiki_file(path, wiki_root) + + @function_tool + def get_page_content_tool(doc_name: str, pages: str) -> str: + """Get text content of specific pages from a long document. + + Args: + doc_name: Document name (e.g. 'attention-is-all-you-need'). + pages: Page specification (e.g. '3-5,7,10-12'). + """ + from openkb.agent.tools import get_page_content + return get_page_content(doc_name, pages, wiki_root) + + from agents.model_settings import ModelSettings + + return Agent( + name="wiki-query", + instructions=instructions, + tools=[list_files, read_file, get_page_content_tool], + model=f"litellm/{model}", + model_settings=ModelSettings(parallel_tool_calls=False), + ) +``` + +Update `_QUERY_INSTRUCTIONS_TEMPLATE`: + +```python +_QUERY_INSTRUCTIONS_TEMPLATE = """\ +You are a knowledge-base Q&A agent. You answer questions by searching the wiki. + +{schema_md} + +## Search strategy +1. Read index.md to understand what documents and concepts are available. + Each entry has a brief summary to help you judge relevance. +2. Read relevant summary pages (summaries/) for document overviews. +3. Read concept pages (concepts/) for cross-document synthesis. +4. For long documents, use get_page_content(doc_name, pages) to read + specific pages when you need detailed content.
The summary page + shows chapter structure with page ranges to help you decide which + pages to read. +5. Synthesise a clear, well-cited answer. + +Always ground your answer in the wiki content. If you cannot find relevant +information, say so clearly. +""" +``` + +Update `run_query` to match new `build_query_agent` signature (remove `openkb_dir` param): + +```python +async def run_query(question: str, kb_dir: Path, model: str, stream: bool = False) -> str: + from openkb.config import load_config + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + language: str = config.get("language", "en") + + wiki_root = str(kb_dir / "wiki") + agent = build_query_agent(wiki_root, model, language=language) + # ... rest unchanged ... +``` + +- [ ] **Step 4: Update `openkb/schema.py` AGENTS_MD** + +Add a note about `get_page_content` for long documents in the Schema: + +```python +## Page Types +- **Summary Page** (summaries/): Key content of a single source document. +- **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]]. +- **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses. +- **Source Page** (sources/): Full-text for short docs (.md) or per-page JSON for long docs (.json). +- **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained. +``` + +- [ ] **Step 5: Run all tests** + +Run: `pytest tests/ -q` +Expected: All PASS + +- [ ] **Step 6: Commit** + +```bash +git add openkb/agent/query.py openkb/schema.py tests/test_query.py +git commit -m "feat: replace pageindex_retrieve with get_page_content, unify query for all docs" +``` + +--- + +### Task 8: Final cleanup and full verification + +**Files:** +- Modify: `openkb/indexer.py` (remove unused imports) +- Verify all files + +- [ ] **Step 1: Remove unused imports** + +In `indexer.py`, remove `from openkb.tree_renderer import render_source_md` if still present (keep `render_summary_md`). 
+ +In `query.py`, verify `PageIndexClient` import is removed. + +- [ ] **Step 2: Run full test suite** + +Run: `pytest tests/ -v` +Expected: All PASS + +- [ ] **Step 3: Grep for dead references** + +Run: `grep -r "pageindex_retrieve\|render_source_md\|_relocate_images" openkb/ tests/` +Expected: No matches + +- [ ] **Step 4: Commit** + +```bash +git add -A +git commit -m "chore: remove dead imports and references" +``` diff --git a/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md new file mode 100644 index 0000000..2fcd853 --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-concept-dedup-and-update-design.md @@ -0,0 +1,163 @@ +# Concept Dedup & Existing Page Update + +**Date:** 2026-04-09 +**Status:** Approved +**Branch:** bugfix/compile + +## Problem + +The compiler pipeline generates concept pages per document, but: + +1. **No dedup** — LLM only sees concept slug names, not content. It can't reliably judge whether a new concept overlaps with an existing one. As the KB grows, concepts duplicate and diverge. +2. **No update of existing pages** — When a new document has information relevant to existing concepts, those pages are not updated. Knowledge doesn't compound across documents. + +The old agent-based approach solved this (the agent could read/write wiki files freely), but was too slow — 20-30 tool-call round-trips per document. + +## Design + +Extend the existing deterministic pipeline to give the LLM enough context for dedup/update decisions, without adding agent loops or breaking prompt caching. + +### Prompt Caching Invariant + +The cached prefix `[system_msg, doc_msg]` must remain identical across all LLM calls within a single document compilation. All new context (concept briefs, existing page content) goes into messages **after** the cached prefix. 
+ +### Pipeline Overview + +``` +Step 1: [system, doc] → summary (unchanged) +Step 2: [system, doc, summary, concepts_plan_prompt] → concepts plan JSON +Step 3a: [system, doc, summary, create_prompt] × N → new concept pages (concurrent) +Step 3b: [system, doc, summary, update_prompt] × M → rewritten concept pages (concurrent) +Step 3c: code-only × K → add cross-ref links to related concepts +Step 4: update index (unchanged) +``` + +Steps 3a and 3b share a single semaphore and run concurrently together. + +### Part 1: Concept Briefs + +New function `_read_concept_briefs(wiki_dir)` reads existing concept pages and returns a compact summary string: + +``` +- attention: Attention is a mechanism that allows models to focus on relevant parts... +- transformer-architecture: The Transformer is a neural network architecture... +``` + +For each concept file in `wiki/concepts/*.md`: +- Skip YAML frontmatter +- Take first 150 characters of body text +- Format as `- {slug}: {brief}` + +This replaces the current `", ".join(existing_concepts)` in the concepts-list prompt. Pure file I/O, no LLM call. + +### Part 2: Concepts Plan Prompt + +The `_CONCEPTS_LIST_USER` template is replaced with a new `_CONCEPTS_PLAN_USER` template that asks the LLM to return a JSON object with three action types: + +```json +{ + "create": [{"name": "flash-attention", "title": "Flash Attention"}], + "update": [{"name": "attention", "title": "Attention Mechanism"}], + "related": ["transformer-architecture"] +} +``` + +- **create** — New concept not covered by any existing page. +- **update** — Existing concept with significant new information worth integrating. +- **related** — Existing concept tangentially related; only needs a cross-reference link. + +The prompt includes rules: +- Don't create concepts that overlap with existing ones — use "update" instead. +- Don't create concepts that are just the document topic itself. +- For first few documents, create 2-3 foundational concepts at most. 
+- "related" is for lightweight cross-linking only. + +### Part 3: Three Execution Paths + +#### create (unchanged) + +Same as current: concurrent `_llm_call_async` with `_CONCEPT_PAGE_USER` template. Written via `_write_concept` with `is_update=False`. + +#### update (new) + +New template `_CONCEPT_UPDATE_USER`: + +``` +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be +integrated into this page. Rewrite the full page incorporating the new +information naturally. Maintain existing cross-references and add new ones +where appropriate. + +Return ONLY the Markdown content (no frontmatter, no code fences). +``` + +Call structure: `[system_msg, doc_msg, {assistant: summary}, update_user_msg]` + +The cached prefix `[system_msg, doc_msg]` is shared with create calls. The `existing_content` (typically 200-500 tokens) is in the final user message only. + +Written via `_write_concept` with `is_update=True`. The frontmatter `sources:` list is updated to include the new source file. + +#### related (code-only, no LLM) + +For each related slug: +1. Read the concept file +2. If `summaries/{doc_name}` is not already linked, append `\n\nSee also: [[summaries/{doc_name}]]` +3. Update frontmatter `sources:` list + +Pure file I/O, millisecond-level. + +### Part 4: Shared Logic Between Short and Long Doc + +Current `compile_short_doc` and `compile_long_doc` duplicate Steps 2-4. Extract shared logic into `_compile_concepts(wiki_dir, model, system_msg, doc_msg, summary, doc_name, kb_dir, max_concurrency)`. 
+ +Public functions become: +- `compile_short_doc`: builds context A from source text → calls `_compile_concepts` +- `compile_long_doc`: builds context A from PageIndex summary → calls `_compile_concepts` + +### Part 5: JSON Parsing Fallback + +If the LLM returns a flat JSON array instead of the expected dict, treat it as all "create" actions: + +```python +if isinstance(parsed, list): + create_list, update_list, related_list = parsed, [], [] +else: + create_list = parsed.get("create", []) + update_list = parsed.get("update", []) + related_list = parsed.get("related", []) +``` + +This ensures backward compatibility if the LLM doesn't follow the new format. + +## Token Cost Analysis + +Compared to current pipeline (per document with C existing concepts): + +| Step | Current | New | Delta | +|------|---------|-----|-------| +| concepts-list prompt | ~50 tokens (slug names) | ~50 + C×30 tokens (briefs) | +C×30 | +| update calls | 0 | M × ~500 tokens (existing content) | +M×500 | +| related | 0 | 0 (code-only) | 0 | + +At C=30 existing concepts: +900 tokens in concepts-list prompt. +At M=2 update calls: +1000 tokens total. + +Total overhead: ~2000 tokens per document. Negligible compared to document content (5K-20K tokens). 
+ +## Files Changed + +- `openkb/agent/compiler.py` — all changes + - New: `_read_concept_briefs()`, `_CONCEPTS_PLAN_USER`, `_CONCEPT_UPDATE_USER`, `_add_related_link()`, `_compile_concepts()` + - Modified: `compile_short_doc()`, `compile_long_doc()`, `_parse_json()` caller logic +- `tests/test_compiler.py` — update tests for new JSON format and update/related paths + +## Not In Scope + +- Concept briefs truncation/filtering for very large KBs (100+ concepts) — revisit when needed +- Interactive ingest (human-in-the-loop checkpoint) — separate feature +- Lint --fix auto-repair — separate feature diff --git a/docs/superpowers/specs/2026-04-09-retrieve-redesign.md b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md new file mode 100644 index 0000000..15224be --- /dev/null +++ b/docs/superpowers/specs/2026-04-09-retrieve-redesign.md @@ -0,0 +1,262 @@ +# Retrieve Redesign: Unified Query, Brief Summaries, and Local Page Content + +**Date:** 2026-04-09 +**Status:** Approved +**Branch:** bugfix/compile + +## Problems + +### 1. Long vs Short Doc Split in Query + +The query agent treats long documents (PageIndex-indexed) and short documents differently: + +- **Short docs**: agent reads `wiki/sources/{name}.md` via `read_file` +- **Long docs**: agent calls `pageindex_retrieve(doc_id, question)` — a black-box RAG call + +**Design Principle**: PageIndex is an indexer, not a retriever. Query-time retrieval should be done by the agent navigating the wiki, using the same tools for all documents. + +### 2. index.md Has No Brief Summaries + +Karpathy's gist says index.md should have "each page listed with a link, **a one-line summary**". Currently it only has wikilinks with no descriptions. The query agent must open every file to understand what's available. + +### 3. No Brief Summaries on Concepts Either + +Same problem: concept entries in index.md have no description. The agent can't judge relevance from the index alone. 
+ +## Design + +### Part 1: Structured LLM Output with Brief Summaries + +All LLM generation steps (summary, concept create, concept update) now return a JSON object with both a one-line brief and the full content. + +#### Summary Generation + +`_SUMMARY_USER` prompt changes to request JSON output: + +``` +Write a summary page for this document in Markdown. + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) describing the document's main contribution +- "content": The full summary in Markdown. Include key concepts, findings, and [[wikilinks]] + +Return ONLY valid JSON, no fences. +``` + +LLM returns: +```json +{ + "brief": "Introduces the Transformer architecture based entirely on self-attention", + "content": "# Attention Is All You Need\n\nThis paper proposes..." +} +``` + +The `brief` is: +- Written into summary frontmatter: `brief: Introduces the Transformer...` +- Passed to `_update_index` for the Documents section + +The `content` is written to `wiki/summaries/{name}.md` as before. + +#### Concept Generation (create) + +`_CONCEPT_PAGE_USER` prompt changes similarly: + +``` +Write the concept page for: {title} + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept +- "content": The full concept page in Markdown with [[wikilinks]] + +Return ONLY valid JSON, no fences. +``` + +The `brief` is: +- Written into concept frontmatter: `brief: Mechanism allowing each position to attend to all others` +- Passed to `_update_index` for the Concepts section +- Used by `_read_concept_briefs` (read from frontmatter instead of truncating body text) + +#### Concept Generation (update) + +`_CONCEPT_UPDATE_USER` also returns `{"brief": "...", "content": "..."}`. The brief may change as the concept evolves with new information. + +#### Long Doc Summary (overview) + +Long documents do NOT need the LLM to generate a brief. 
The brief comes directly from PageIndex's `doc_description` field (available via `IndexResult.description`), which is already a document-level summary generated during indexing. `_LONG_DOC_SUMMARY_USER` stays unchanged (returns plain markdown overview, not JSON) — the brief is passed through from the indexer. + +In `compile_long_doc`, the `doc_description` is passed to `_compile_concepts` which forwards it to `_update_index` as the doc brief. + +#### Parsing + +All LLM responses go through `_parse_json`. Callers extract `brief` and `content`: + +```python +parsed = _parse_json(raw) +brief = parsed.get("brief", "") +content = parsed.get("content", raw) # fallback: treat raw as content if not JSON +``` + +The fallback ensures backward compatibility if the LLM returns plain text instead of JSON. + +### Part 2: index.md with Brief Summaries + +`_update_index` signature changes: + +```python +def _update_index(wiki_dir, doc_name, concept_names, doc_brief="", concept_briefs=None): +``` + +Output format: + +```markdown +## Documents +- [[summaries/attention-is-all-you-need]] — Introduces the Transformer architecture based on self-attention +- [[summaries/flash-attention]] — Efficient attention algorithm reducing memory from quadratic to linear + +## Concepts +- [[concepts/self-attention]] — Mechanism allowing each position to attend to all others in a sequence +- [[concepts/transformer]] — Neural network architecture based entirely on attention mechanisms +``` + +When updating an existing entry (re-compile), the brief is updated in place. + +### Part 3: Frontmatter with Brief + +Summary and concept pages get a `brief` field in frontmatter: + +```markdown +--- +sources: [paper.pdf] +brief: Introduces the Transformer architecture based on self-attention +--- + +# Attention Is All You Need +... +``` + +`_read_concept_briefs` is updated to read from `brief:` frontmatter field instead of truncating body text. 
Fallback to body truncation if `brief:` is absent (backward compat with existing pages). + +### Part 4: Long Doc Sources from Markdown to JSON + +Store per-page content as JSON instead of a giant markdown file. + +**Current**: +``` +wiki/sources/paper.md ← rendered markdown, 10K-50K tokens +``` + +**New**: +``` +wiki/sources/paper.json ← per-page JSON array +``` + +**JSON format** (only the `pages` array from PageIndex, not the full doc object): +```json +[ + { + "page": 1, + "content": "Full text of page 1...", + "images": [{"path": "images/paper/p1_img1.png", "width": 400, "height": 300}] + }, + { + "page": 2, + "content": "Full text of page 2..." + } +] +``` + +`images` field is optional. Image paths are relative to `wiki/sources/`. Short documents are not affected — they stay as `.md`. + +#### Indexer Changes + +In `indexer.py`, replace `render_source_md` + `_relocate_images` with: +1. `col.get_page_content(doc_id, "1-9999")` to get all pages +2. Relocate image paths in each page's `images` array +3. Write as JSON to `wiki/sources/{name}.json` + +### Part 5: New Tool `get_page_content` + +Add to `openkb/agent/tools.py`: + +```python +def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: + """Get text content of specific pages from a long document. + + Args: + doc_name: Document name (e.g. 'attention-is-all-you-need'). + pages: Page specification (e.g. '3-5,7,10-12'). + wiki_root: Absolute path to the wiki root directory. + """ +``` + +Implementation: +1. Read `wiki/sources/{doc_name}.json` +2. Parse `pages` spec into a set of page numbers (comma-separated, ranges with `-`) +3. Filter pages, format as `[Page N]\n{content}\n\n` +4. Return concatenated text, or error if file not found + +### Part 6: Query Agent Changes + +**Remove**: `pageindex_retrieve` tool and `_pageindex_retrieve_impl` entirely. + +**Add**: `get_page_content` tool. + +**Update instructions**: +``` +## Search strategy +1. 
Read index.md to understand what documents and concepts are available. + Each entry has a brief summary to help you judge relevance. +2. Read relevant summary pages (summaries/) for document overviews. +3. Read concept pages (concepts/) for cross-document synthesis. +4. For long documents, use get_page_content(doc_name, pages) to read + specific pages. The summary page shows chapter structure with page + ranges to help you decide which pages to read. +5. Synthesise a clear, well-cited answer. +``` + +**Remove**: the `openkb_dir` parameter from `build_query_agent` (the `model` parameter stays; it is still needed to construct the agent). + +### What Gets Removed + +- `_pageindex_retrieve_impl` (~110 lines) +- `pageindex_retrieve` tool +- `render_source_md` from `tree_renderer.py` +- `_relocate_images` in current form (replaced by per-page relocation) +- PageIndex imports in `query.py` + +### What Stays + +- `render_summary_md` — summaries still markdown +- Short doc pipeline — unchanged +- Image files in `wiki/sources/images/` +- PageIndex in `indexer.py` — still used for tree building + +## Compile Pipeline Changes Summary + +The compile pipeline (`_compile_concepts`, `compile_short_doc`, `compile_long_doc`) changes: + +1. **Summary step**: parse JSON response, extract `brief` + `content` +2. **Concept create/update steps**: parse JSON response, extract `brief` + `content` +3. **`_write_summary`**: add `brief` to frontmatter +4. **`_write_concept`**: add/update `brief` in frontmatter +5. **`_update_index`**: write `— {brief}` after each wikilink +6. 
**`_read_concept_briefs`**: read from `brief:` frontmatter field (fallback to body truncation) + +## Files Changed + +- `openkb/agent/compiler.py` — prompt templates return JSON with brief+content, parse responses, pass briefs to index/frontmatter +- `openkb/indexer.py` — sources output from md to json, image relocation per-page +- `openkb/agent/tools.py` — add `get_page_content` +- `openkb/agent/query.py` — remove `pageindex_retrieve`, add `get_page_content`, update instructions +- `openkb/tree_renderer.py` — remove `render_source_md` +- `openkb/schema.py` — update AGENTS_MD +- `tests/test_compiler.py` — update for JSON LLM responses +- `tests/test_indexer.py` — update for JSON output +- `tests/test_query.py` — update for new tool set +- `tests/test_agent_tools.py` — add tests for `get_page_content` + +## Not In Scope + +- Cloud PageIndex query support (removed entirely) +- Changes to the lint pipeline +- Interactive ingest diff --git a/openkb/__main__.py b/openkb/__main__.py new file mode 100644 index 0000000..28f9e41 --- /dev/null +++ b/openkb/__main__.py @@ -0,0 +1,4 @@ +"""Allow running OpenKB as ``python -m openkb``.""" +from openkb.cli import cli + +cli() diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index d4e34e3..73b1a9c 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -1,202 +1,654 @@ -"""Wiki compilation agent for OpenKB. - -Provides an agent that reads converted documents, generates summaries, -updates concept pages, and maintains the wiki index. +"""Wiki compilation pipeline for OpenKB. + +Pipeline leveraging LLM prompt caching: + Step 1: Build base context A (schema + document content). + Step 2: A → generate summary. + Step 3: A + summary → concepts plan (create/update/related). + Step 4: Concurrent LLM calls (A cached) → generate new + rewrite updated concepts. + Step 5: Code adds cross-ref links to related concepts, updates index. 
""" from __future__ import annotations +import asyncio +import json +import logging +import re +import sys +import threading +import time from pathlib import Path -from agents import Agent, Runner, function_tool -import os +import litellm + +from openkb.schema import get_agents_md -from pageindex import PageIndexClient +logger = logging.getLogger(__name__) -from openkb.agent.tools import list_wiki_files, read_wiki_file, write_wiki_file -from openkb.schema import SCHEMA_MD, get_agents_md +# --------------------------------------------------------------------------- +# Prompt templates +# --------------------------------------------------------------------------- -_COMPILER_INSTRUCTIONS_TEMPLATE = """\ +_SYSTEM_TEMPLATE = """\ You are a wiki compilation agent for a personal knowledge base. {schema_md} -## Your job -When given a new document, you must: -1. Write a summary page to summaries/.md with: - - A YAML frontmatter block: `sources: [filename]` - - Key concepts, findings, and ideas from the document - - [[wikilinks]] to related concepts -2. Update or create concept pages in concepts/ for any significant cross-document themes. -3. Update index.md: - - Under ## Documents: add a one-liner entry for the new document - - Under ## Concepts: add/update entries for any concepts you touched - -Always use the provided tools to read existing wiki pages before writing, -so you can append or update without losing prior content. -Use [[wikilinks]] consistently to connect related pages. +Write all content in {language} language. +Use [[wikilinks]] to connect related pages (e.g. [[concepts/attention]]). """ -_LONG_DOC_INSTRUCTIONS_TEMPLATE = """\ -You are a wiki compilation agent for a personal knowledge base. +_SUMMARY_USER = """\ +New document: {doc_name} -{schema_md} +Full text: +{content} + +Write a summary page for this document in Markdown. 
+ +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) describing the document's main contribution +- "content": The full summary in Markdown. Include key concepts, findings, ideas, \ +and [[wikilinks]] to concepts that could become cross-document concept pages + +Return ONLY valid JSON, no fences. +""" + + +_CONCEPTS_PLAN_USER = """\ +Based on the summary above, decide how to update the wiki's concept pages. + +Existing concept pages: +{concept_briefs} + +Return a JSON object with three keys: + +1. "create" — new concepts not covered by any existing page. Array of objects: + {{"name": "concept-slug", "title": "Human-Readable Title"}} + +2. "update" — existing concepts that have significant new information from \ +this document worth integrating. Array of objects: + {{"name": "existing-slug", "title": "Existing Title"}} + +3. "related" — existing concepts tangentially related to this document but \ +not needing content changes, just a cross-reference link. Array of slug strings. + +Rules: +- For the first few documents, create 2-3 foundational concepts at most. +- Do NOT create a concept that overlaps with an existing one — use "update". +- Do NOT create concepts that are just the document topic itself. +- "related" is for lightweight cross-linking only, no content rewrite needed. + +Return ONLY valid JSON, no fences, no explanation. +""" + +_CONCEPT_PAGE_USER = """\ +Write the concept page for: {title} + +This concept relates to the document "{doc_name}" summarized above. +{update_instruction} + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept +- "content": The full concept page in Markdown. Include clear explanation, \ +key details from the source document, and [[wikilinks]] to related concepts \ +and [[summaries/{doc_name}]] + +Return ONLY valid JSON, no fences. 
+""" -## Your job for long documents (already summarised by PageIndex) -The summary and source pages are already written. Your tasks are: -1. Update or create concept pages in concepts/ for significant themes. -2. Update index.md: - - Under ## Documents: add a one-liner entry referencing the document - - Under ## Concepts: add/update entries for any concepts you touched -3. Do NOT regenerate or overwrite the existing summary page. - -Use get_page_content to fetch specific page ranges from long documents when -you need more detail before writing concept pages. -Always read existing wiki pages before writing to preserve prior content. -Use [[wikilinks]] consistently to connect related pages. +_CONCEPT_UPDATE_USER = """\ +Update the concept page for: {title} + +Current content of this page: +{existing_content} + +New information from document "{doc_name}" (summarized above) should be \ +integrated into this page. Rewrite the full page incorporating the new \ +information naturally — do not just append. Maintain existing \ +[[wikilinks]] and add new ones where appropriate. + +Return a JSON object with two keys: +- "brief": A single sentence (under 100 chars) defining this concept (may differ from before) +- "content": The rewritten full concept page in Markdown + +Return ONLY valid JSON, no fences. +""" + +_LONG_DOC_SUMMARY_USER = """\ +This is a PageIndex summary for long document "{doc_name}" (doc_id: {doc_id}): + +{content} + +Based on this structured summary, write a concise overview that captures \ +the key themes and findings. This will be used to generate concept pages. + +Return ONLY the Markdown content (no frontmatter, no code fences). """ -def build_compiler_agent(wiki_root: str, model: str, language: str = "en") -> Agent: - """Build and return the wiki-compiler agent. 
+# --------------------------------------------------------------------------- +# LLM helpers +# --------------------------------------------------------------------------- + +class _Spinner: + """Animated dots spinner that runs in a background thread.""" + + def __init__(self, label: str): + self._label = label + self._stop = threading.Event() + self._thread: threading.Thread | None = None + + def start(self) -> None: + sys.stdout.write(f" {self._label}") + sys.stdout.flush() + self._thread = threading.Thread(target=self._run, daemon=True) + self._thread.start() + + def _run(self) -> None: + while not self._stop.wait(timeout=1.0): + sys.stdout.write(".") + sys.stdout.flush() + + def stop(self, suffix: str = "") -> None: + self._stop.set() + if self._thread: + self._thread.join() + sys.stdout.write(f" {suffix}\n") + sys.stdout.flush() + + +def _format_usage(elapsed: float, usage) -> str: + """Format timing and token usage into a short summary string.""" + cached = getattr(usage, "prompt_tokens_details", None) + cache_info = "" + if cached and hasattr(cached, "cached_tokens") and cached.cached_tokens: + cache_info = f", cached={cached.cached_tokens}" + return f"{elapsed:.1f}s (in={usage.prompt_tokens}, out={usage.completion_tokens}{cache_info})" + + +def _fmt_messages(messages: list[dict], max_content: int = 200) -> str: + """Format messages for debug output, truncating long content.""" + parts = [] + for msg in messages: + role = msg["role"] + content = msg["content"] + if len(content) > max_content: + preview = content[:max_content] + f"... 
({len(content)} chars)" + else: + preview = content + parts.append(f" [{role}] {preview}") + return "\n".join(parts) + + +def _llm_call(model: str, messages: list[dict], step_name: str, **kwargs) -> str: + """Single LLM call with animated progress and debug logging.""" + logger.debug("LLM request [%s]:\n%s", step_name, _fmt_messages(messages)) + if kwargs: + logger.debug("LLM kwargs [%s]: %s", step_name, kwargs) + + spinner = _Spinner(step_name) + spinner.start() + t0 = time.time() + + response = litellm.completion(model=model, messages=messages, **kwargs) + content = response.choices[0].message.content or "" + + spinner.stop(_format_usage(time.time() - t0, response.usage)) + logger.debug("LLM response [%s]:\n%s", step_name, content[:500] + ("..." if len(content) > 500 else "")) + return content.strip() + - Creates @function_tool wrappers that bind *wiki_root* so the agent - doesn't need to supply it explicitly. +async def _llm_call_async(model: str, messages: list[dict], step_name: str) -> str: + """Async LLM call with timing output and debug logging.""" + logger.debug("LLM request [%s]:\n%s", step_name, _fmt_messages(messages)) - Args: - wiki_root: Absolute path to the wiki directory. - model: LLM model name to use for the agent. - language: Language code for wiki content (e.g. 'en', 'fr'). + t0 = time.time() - Returns: - Configured :class:`~agents.Agent` instance. + response = await litellm.acompletion(model=model, messages=messages) + content = response.choices[0].message.content or "" + + elapsed = time.time() - t0 + sys.stdout.write(f" {step_name}... {_format_usage(elapsed, response.usage)}\n") + sys.stdout.flush() + logger.debug("LLM response [%s]:\n%s", step_name, content[:500] + ("..." 
if len(content) > 500 else "")) + return content.strip() + + +def _parse_json(text: str) -> list | dict: + """Parse JSON from LLM response, stripping markdown fences if present.""" + cleaned = text.strip() + if cleaned.startswith("```"): + first_nl = cleaned.index("\n") + cleaned = cleaned[first_nl + 1:] + if cleaned.endswith("```"): + cleaned = cleaned[:-3] + return json.loads(cleaned.strip()) + + +# --------------------------------------------------------------------------- +# File I/O helpers +# --------------------------------------------------------------------------- + +def _read_wiki_context(wiki_dir: Path) -> tuple[str, list[str]]: + """Read current index.md content and list of existing concept slugs.""" + index_path = wiki_dir / "index.md" + index_content = index_path.read_text(encoding="utf-8") if index_path.exists() else "" + + concepts_dir = wiki_dir / "concepts" + existing = sorted(p.stem for p in concepts_dir.glob("*.md")) if concepts_dir.exists() else [] + + return index_content, existing + + +def _read_concept_briefs(wiki_dir: Path) -> str: + """Read existing concept pages and return compact one-line summaries. + + For each concept, reads the ``brief:`` field from YAML frontmatter if + present; otherwise falls back to truncating the first 150 chars of the body + (newlines collapsed to spaces). Formats each as ``- {slug}: {brief}``. + + Returns "(none yet)" if the concepts directory is missing or empty. """ - schema_md = get_agents_md(Path(wiki_root)) - instructions = _COMPILER_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) - instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." - - @function_tool - def list_files(directory: str) -> str: - """List all Markdown files in a wiki subdirectory. - - Args: - directory: Subdirectory path relative to wiki root (e.g. 'sources'). - """ - return list_wiki_files(directory, wiki_root) - - @function_tool - def read_file(path: str) -> str: - """Read a Markdown file from the wiki. 
- - Args: - path: File path relative to wiki root (e.g. 'sources/notes.md'). - """ - return read_wiki_file(path, wiki_root) - - @function_tool - def write_file(path: str, content: str) -> str: - """Write or overwrite a Markdown file in the wiki. - - Args: - path: File path relative to wiki root (e.g. 'concepts/attention.md'). - content: Markdown content to write. - """ - return write_wiki_file(path, content, wiki_root) - - from agents.model_settings import ModelSettings - - return Agent( - name="wiki-compiler", - instructions=instructions, - tools=[list_files, read_file, write_file], - model=f"litellm/{model}", - model_settings=ModelSettings(parallel_tool_calls=False), - ) + concepts_dir = wiki_dir / "concepts" + if not concepts_dir.exists(): + return "(none yet)" + + md_files = sorted(concepts_dir.glob("*.md")) + if not md_files: + return "(none yet)" + + lines: list[str] = [] + for path in md_files: + text = path.read_text(encoding="utf-8") + brief = "" + body = text + if text.startswith("---"): + end = text.find("---", 3) + if end != -1: + fm = text[:end + 3] + body = text[end + 3:] + for line in fm.split("\n"): + if line.startswith("brief:"): + brief = line[len("brief:"):].strip() + break + if not brief: + brief = body.strip().replace("\n", " ")[:150] + if brief: + lines.append(f"- {path.stem}: {brief}") + + return "\n".join(lines) or "(none yet)" + + + +def _write_summary(wiki_dir: Path, doc_name: str, summary: str, + doc_type: str = "short") -> None: + """Write summary page with frontmatter.""" + if summary.startswith("---"): + end = summary.find("---", 3) + if end != -1: + summary = summary[end + 3:].lstrip("\n") + summaries_dir = wiki_dir / "summaries" + summaries_dir.mkdir(parents=True, exist_ok=True) + ext = "md" if doc_type == "short" else "json" + fm_lines = [ + f"doc_type: {doc_type}", + f"full_text: sources/{doc_name}.{ext}", + ] + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" + (summaries_dir / f"{doc_name}.md").write_text(frontmatter + 
summary, encoding="utf-8") + + +_SAFE_NAME_RE = re.compile(r'[^a-zA-Z0-9_\-]') + + +def _sanitize_concept_name(name: str) -> str: + """Sanitize a concept name for safe use as a filename.""" + sanitized = _SAFE_NAME_RE.sub("-", name).strip("-") + return sanitized or "unnamed-concept" + + +def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool, brief: str = "") -> None: + """Write or update a concept page, managing the sources frontmatter.""" + concepts_dir = wiki_dir / "concepts" + concepts_dir.mkdir(parents=True, exist_ok=True) + safe_name = _sanitize_concept_name(name) + path = (concepts_dir / f"{safe_name}.md").resolve() + if not path.is_relative_to(concepts_dir.resolve()): + logger.warning("Concept name escapes concepts dir: %s", name) + return + + if is_update and path.exists(): + existing = path.read_text(encoding="utf-8") + if source_file not in existing: + if existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + fm = existing[:end + 3] + body = existing[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) + existing = fm + body + else: + existing = f"---\nsources: [{source_file}]\n---\n\n" + existing + # Strip frontmatter from LLM content to avoid duplicate blocks + clean = content + if clean.startswith("---"): + end = clean.find("---", 3) + if end != -1: + clean = clean[end + 3:].lstrip("\n") + existing += f"\n\n{clean}" + if brief and existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + fm = existing[:end + 3] + body = existing[end + 3:] + if "brief:" in fm: + fm = re.sub(r"brief:.*", f"brief: {brief}", fm) + else: + fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1) + existing = fm + body + path.write_text(existing, encoding="utf-8") + else: + if content.startswith("---"): + end = content.find("---", 3) + if end != -1: + content = content[end + 
3:].lstrip("\n") + fm_lines = [f"sources: [{source_file}]"] + if brief: + fm_lines.append(f"brief: {brief}") + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" + path.write_text(frontmatter + content, encoding="utf-8") + + +def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None: + """Add a cross-reference link to an existing concept page (no LLM call).""" + concepts_dir = wiki_dir / "concepts" + path = concepts_dir / f"{concept_slug}.md" + if not path.exists(): + return + + text = path.read_text(encoding="utf-8") + link = f"[[summaries/{doc_name}]]" + if link in text: + return + + # Update sources in frontmatter + if source_file not in text: + if text.startswith("---"): + end = text.find("---", 3) + if end != -1: + fm = text[:end + 3] + body = text[end + 3:] + if "sources:" in fm: + fm = fm.replace("sources: [", f"sources: [{source_file}, ") + else: + fm = fm.replace("---\n", f"---\nsources: [{source_file}]\n", 1) + text = fm + body + else: + text = f"---\nsources: [{source_file}]\n---\n\n" + text + + text += f"\n\nSee also: {link}" + path.write_text(text, encoding="utf-8") + + +def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: + """Append missing concept wikilinks to the summary page (no LLM call). + + After all concepts are generated, this ensures the summary page links + back to every related concept — closing the bidirectional link that + concept pages already have toward the summary. + + If a ``## Related Concepts`` section already exists, new links are + appended into it rather than creating a duplicate section. 
+ """ + summary_path = wiki_dir / "summaries" / f"{doc_name}.md" + if not summary_path.exists(): + return + + text = summary_path.read_text(encoding="utf-8") + missing = [slug for slug in concept_slugs if f"[[concepts/{slug}]]" not in text] + if not missing: + return + new_links = "\n".join(f"- [[concepts/{s}]]" for s in missing) + if "## Related Concepts" in text: + # Append into existing section + text = text.replace("## Related Concepts\n", f"## Related Concepts\n{new_links}\n", 1) + else: + text += f"\n\n## Related Concepts\n{new_links}\n" + summary_path.write_text(text, encoding="utf-8") -def build_long_doc_compiler_agent(wiki_root: str, kb_dir: str, model: str, language: str = "en") -> Agent: - """Build the wiki-compiler agent with an extra get_page_content tool. - Args: - wiki_root: Absolute path to the wiki directory. - kb_dir: Absolute path to the knowledge base root (contains .openkb/). - model: LLM model name to use for the agent. - language: Language code for wiki content (e.g. 'en', 'fr'). +def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: + """Append missing summary wikilink to each concept page (no LLM call). - Returns: - Configured :class:`~agents.Agent` instance. + Ensures every concept page links back to the source document's summary, + regardless of whether the LLM included the link in its output. + + If a ``## Related Documents`` section already exists, the link is + appended into it rather than creating a duplicate section. 
""" - from openkb.config import load_config + link = f"[[summaries/{doc_name}]]" + concepts_dir = wiki_dir / "concepts" + + for slug in concept_slugs: + path = concepts_dir / f"{slug}.md" + if not path.exists(): + continue + text = path.read_text(encoding="utf-8") + if link in text: + continue + if "## Related Documents" in text: + text = text.replace("## Related Documents\n", f"## Related Documents\n- {link}\n", 1) + else: + text += f"\n\n## Related Documents\n- {link}\n" + path.write_text(text, encoding="utf-8") + + +def _update_index( + wiki_dir: Path, doc_name: str, concept_names: list[str], + doc_brief: str = "", concept_briefs: dict[str, str] | None = None, + doc_type: str = "short", +) -> None: + """Append document and concept entries to index.md. - openkb_dir = Path(kb_dir) / ".openkb" - config = load_config(openkb_dir / "config.yaml") - _model = config.get("model", model) - pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "") - client = PageIndexClient( - api_key=pageindex_api_key or None, - model=_model, - storage_path=str(openkb_dir), - ) - col = client.collection() - - schema_md = get_agents_md(Path(wiki_root)) - instructions = _LONG_DOC_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) - instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." - - @function_tool - def list_files(directory: str) -> str: - """List all Markdown files in a wiki subdirectory. - - Args: - directory: Subdirectory path relative to wiki root (e.g. 'sources'). - """ - return list_wiki_files(directory, wiki_root) - - @function_tool - def read_file(path: str) -> str: - """Read a Markdown file from the wiki. - - Args: - path: File path relative to wiki root (e.g. 'sources/notes.md'). - """ - return read_wiki_file(path, wiki_root) - - @function_tool - def write_file(path: str, content: str) -> str: - """Write or overwrite a Markdown file in the wiki. - - Args: - path: File path relative to wiki root (e.g. 'concepts/attention.md'). 
- content: Markdown content to write. - """ - return write_wiki_file(path, content, wiki_root) - - @function_tool - def get_page_content(doc_id: str, pages: str) -> str: - """Retrieve text content for specific pages of a long document. - - Args: - doc_id: Document identifier from PageIndex. - pages: Page range string, e.g. '1-5' or '3,7,12'. - """ - results = col.get_page_content(doc_id, pages) - if not results: - return "No content found for the given pages." - parts = [] - for item in results: - page_num = item.get("page_index", "?") - text = item.get("text", "") - parts.append(f"[Page {page_num}]\n{text}") - return "\n\n".join(parts) - - from agents.model_settings import ModelSettings - - return Agent( - name="wiki-compiler", - instructions=instructions, - tools=[list_files, read_file, write_file, get_page_content], - model=f"litellm/{_model}", - model_settings=ModelSettings(parallel_tool_calls=False), - ) + When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries + are written as ``- [[link]] (type) — brief text``. Existing entries are + detected by the link part only, so updating a brief on a re-compile works. + ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the + query agent knows how to access detailed content. 
+ """ + if concept_briefs is None: + concept_briefs = {} + + index_path = wiki_dir / "index.md" + if not index_path.exists(): + index_path.write_text( + "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + + text = index_path.read_text(encoding="utf-8") + + doc_link = f"[[summaries/{doc_name}]]" + if doc_link not in text: + doc_entry = f"- {doc_link} ({doc_type})" + if doc_brief: + doc_entry += f" — {doc_brief}" + if "## Documents" in text: + text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) + + for name in concept_names: + concept_link = f"[[concepts/{name}]]" + if concept_link not in text: + concept_entry = f"- {concept_link}" + if name in concept_briefs: + concept_entry += f" — {concept_briefs[name]}" + if "## Concepts" in text: + text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) + + index_path.write_text(text, encoding="utf-8") + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +DEFAULT_COMPILE_CONCURRENCY = 5 + + +async def _compile_concepts( + wiki_dir: Path, + kb_dir: Path, + model: str, + system_msg: dict, + doc_msg: dict, + summary: str, + doc_name: str, + max_concurrency: int, + doc_brief: str = "", + doc_type: str = "short", +) -> None: + """Shared Steps 2-4: concepts plan → generate/update → index. + + Uses ``_CONCEPTS_PLAN_USER`` to get a plan with create/update/related + actions, then executes each action type accordingly. 
+ """ + source_file = f"summaries/{doc_name}.md" + + # --- Step 2: Get concepts plan (A cached) --- + concept_briefs = _read_concept_briefs(wiki_dir) + + plan_raw = _llm_call(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPTS_PLAN_USER.format( + concept_briefs=concept_briefs, + )}, + ], "concepts-plan", max_tokens=1024) + + try: + parsed = _parse_json(plan_raw) + except (json.JSONDecodeError, ValueError) as exc: + logger.warning("Failed to parse concepts plan: %s", exc) + logger.debug("Raw: %s", plan_raw) + _update_index(wiki_dir, doc_name, []) + return + + # Fallback: if LLM returns a flat list, treat all items as "create" + if isinstance(parsed, list): + plan = {"create": parsed, "update": [], "related": []} + else: + plan = { + "create": parsed.get("create", []), + "update": parsed.get("update", []), + "related": parsed.get("related", []), + } + + create_items = plan["create"] + update_items = plan["update"] + related_items = plan["related"] + + if not create_items and not update_items and not related_items: + _update_index(wiki_dir, doc_name, []) + return + + # --- Step 3: Generate/update concept pages concurrently (A cached) --- + semaphore = asyncio.Semaphore(max_concurrency) + + async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: + name = concept["name"] + title = concept.get("title", name) + async with semaphore: + raw = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_PAGE_USER.format( + title=title, doc_name=doc_name, + update_instruction="", + )}, + ], f"concept: {name}") + try: + parsed = _parse_json(raw) + brief = parsed.get("brief", "") + content = parsed.get("content", raw) + except (json.JSONDecodeError, ValueError): + brief, content = "", raw + return name, content, False, brief + + async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: + name = concept["name"] + 
title = concept.get("title", name) + concept_path = wiki_dir / "concepts" / f"{name}.md" + if concept_path.exists(): + raw_text = concept_path.read_text(encoding="utf-8") + if raw_text.startswith("---"): + parts = raw_text.split("---", 2) + existing_content = parts[2].strip() if len(parts) >= 3 else raw_text + else: + existing_content = raw_text + else: + existing_content = "(page not found — create from scratch)" + async with semaphore: + raw = await _llm_call_async(model, [ + system_msg, + doc_msg, + {"role": "assistant", "content": summary}, + {"role": "user", "content": _CONCEPT_UPDATE_USER.format( + title=title, doc_name=doc_name, + existing_content=existing_content, + )}, + ], f"update: {name}") + try: + parsed = _parse_json(raw) + brief = parsed.get("brief", "") + content = parsed.get("content", raw) + except (json.JSONDecodeError, ValueError): + brief, content = "", raw + return name, content, True, brief + + tasks = [] + tasks.extend(_gen_create(c) for c in create_items) + tasks.extend(_gen_update(c) for c in update_items) + + concept_names: list[str] = [] + concept_briefs_map: dict[str, str] = {} + + if tasks: + total = len(tasks) + sys.stdout.write(f" Generating {total} concept(s) (concurrency={max_concurrency})...\n") + sys.stdout.flush() + + results = await asyncio.gather(*tasks, return_exceptions=True) + + for r in results: + if isinstance(r, Exception): + logger.warning("Concept generation failed: %s", r) + continue + name, page_content, is_update, brief = r + _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief) + concept_names.append(name) + if brief: + concept_briefs_map[name] = brief + + # --- Step 3b: Process related items (code only, no LLM) --- + for slug in related_items: + _add_related_link(wiki_dir, slug, doc_name, source_file) + + # --- Step 3c: Backlink — summary ↔ concepts (code only) --- + all_concept_slugs = concept_names + [s for s in related_items] + if all_concept_slugs: + _backlink_summary(wiki_dir, 
doc_name, all_concept_slugs) + _backlink_concepts(wiki_dir, doc_name, all_concept_slugs) + + # --- Step 4: Update index (code only) --- + _update_index(wiki_dir, doc_name, concept_names, + doc_brief=doc_brief, concept_briefs=concept_briefs_map, + doc_type=doc_type) async def compile_short_doc( @@ -204,17 +656,12 @@ async def compile_short_doc( source_path: Path, kb_dir: Path, model: str, + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, ) -> None: - """Run the compiler agent for a short (non-PageIndex) document. - - Reads the converted source Markdown, then asks the agent to generate a - summary, update concept pages, and update the index. + """Compile a short document using a multi-step LLM pipeline with caching. - Args: - doc_name: Document stem name (no extension). - source_path: Path to the converted Markdown in wiki/sources/. - kb_dir: Root of the knowledge base (contains wiki/ and .openkb/). - model: LLM model name. + Step 1: Build base context A (schema + doc content), generate summary. + Steps 2-4: Delegated to ``_compile_concepts``. """ from openkb.config import load_config @@ -222,17 +669,35 @@ async def compile_short_doc( config = load_config(openkb_dir / "config.yaml") language: str = config.get("language", "en") - wiki_root = str(kb_dir / "wiki") - agent = build_compiler_agent(wiki_root, model, language=language) - + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) content = source_path.read_text(encoding="utf-8") - message = ( - f"New document: {doc_name}\n\n" - f"Full text:\n{content}\n\n" - "Generate summary, update concepts, update index." 
- ) - await Runner.run(agent, message) + # Base context A: system + document + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _SUMMARY_USER.format( + doc_name=doc_name, content=content, + )} + + # --- Step 1: Generate summary --- + summary_raw = _llm_call(model, [system_msg, doc_msg], "summary") + try: + summary_parsed = _parse_json(summary_raw) + doc_brief = summary_parsed.get("brief", "") + summary = summary_parsed.get("content", summary_raw) + except (json.JSONDecodeError, ValueError): + doc_brief = "" + summary = summary_raw + _write_summary(wiki_dir, doc_name, summary) + + # --- Steps 2-4: Concept plan → generate/update → index --- + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + summary, doc_name, max_concurrency, doc_brief=doc_brief, + doc_type="short", + ) async def compile_long_doc( @@ -241,18 +706,13 @@ async def compile_long_doc( doc_id: str, kb_dir: Path, model: str, + doc_description: str = "", + max_concurrency: int = DEFAULT_COMPILE_CONCURRENCY, ) -> None: - """Run the compiler agent for a long (PageIndex) document. - - The summary page is already written. The agent updates concept pages and - the index without regenerating the summary. + """Compile a long (PageIndex) document's concepts and index. - Args: - doc_name: Document stem name (no extension). - summary_path: Path to the existing summary Markdown in wiki/summaries/. - doc_id: PageIndex document identifier. - kb_dir: Root of the knowledge base. - model: LLM model name. + The summary page is already written by the indexer. This function + generates concept pages and updates the index. 
""" from openkb.config import load_config @@ -260,14 +720,24 @@ async def compile_long_doc( config = load_config(openkb_dir / "config.yaml") language: str = config.get("language", "en") - wiki_root = str(kb_dir / "wiki") - agent = build_long_doc_compiler_agent(wiki_root, str(kb_dir), model, language=language) - - content = summary_path.read_text(encoding="utf-8") - message = ( - f"New long document: {doc_name} (doc_id: {doc_id})\n" - f"Summary tree:\n{content}\n" - "Update concepts and index. Do NOT regenerate summary." + wiki_dir = kb_dir / "wiki" + schema_md = get_agents_md(wiki_dir) + summary_content = summary_path.read_text(encoding="utf-8") + + # Base context A + system_msg = {"role": "system", "content": _SYSTEM_TEMPLATE.format( + schema_md=schema_md, language=language, + )} + doc_msg = {"role": "user", "content": _LONG_DOC_SUMMARY_USER.format( + doc_name=doc_name, doc_id=doc_id, content=summary_content, + )} + + # --- Step 1: Generate overview --- + overview = _llm_call(model, [system_msg, doc_msg], "overview") + + # --- Steps 2-4: Concept plan → generate/update → index --- + await _compile_concepts( + wiki_dir, kb_dir, model, system_msg, doc_msg, + overview, doc_name, max_concurrency, doc_brief=doc_description, + doc_type="pageindex", ) - - await Runner.run(agent, message) diff --git a/openkb/agent/linter.py b/openkb/agent/linter.py index 5201949..fb81da7 100644 --- a/openkb/agent/linter.py +++ b/openkb/agent/linter.py @@ -6,6 +6,8 @@ from agents import Agent, Runner, function_tool from openkb.agent.tools import list_wiki_files, read_wiki_file + +MAX_TURNS = 50 from openkb.schema import SCHEMA_MD, get_agents_md _LINTER_INSTRUCTIONS_TEMPLATE = """\ @@ -102,5 +104,5 @@ async def run_knowledge_lint(kb_dir: Path, model: str) -> str: "Produce a structured Markdown report." ) - result = await Runner.run(agent, prompt) + result = await Runner.run(agent, prompt, max_turns=MAX_TURNS) return result.final_output or "Knowledge lint completed. No output produced." 
diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 0549bb9..a895b79 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1,158 +1,819 @@ -"""Tests for openkb.agent.compiler.""" +"""Tests for openkb.agent.compiler pipeline.""" from __future__ import annotations +import json from pathlib import Path -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import MagicMock, patch, AsyncMock import pytest from openkb.agent.compiler import ( - build_compiler_agent, compile_long_doc, compile_short_doc, + _compile_concepts, + _parse_json, + _write_summary, + _write_concept, + _update_index, + _read_wiki_context, + _read_concept_briefs, + _add_related_link, + _backlink_summary, + _backlink_concepts, ) -from openkb.schema import SCHEMA_MD -class TestBuildCompilerAgent: - def test_agent_name(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - assert agent.name == "wiki-compiler" - - def test_agent_tools_count(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - # list_files, read_file, write_file - assert len(agent.tools) == 3 - - def test_schema_in_instructions(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - assert SCHEMA_MD in agent.instructions - - def test_agent_model(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "my-custom-model") - assert agent.model == "litellm/my-custom-model" - - def test_tool_names(self, tmp_path): - agent = build_compiler_agent(str(tmp_path), "gpt-4o-mini") - tool_names = {t.name for t in agent.tools} - assert "list_files" in tool_names - assert "read_file" in tool_names - assert "write_file" in tool_names +class TestParseJson: + def test_plain_json(self): + assert _parse_json('[{"name": "foo"}]') == [{"name": "foo"}] + + def test_fenced_json(self): + text = '```json\n[{"name": "bar"}]\n```' + assert _parse_json(text) == [{"name": "bar"}] + + def test_invalid_json(self): + with 
pytest.raises((json.JSONDecodeError, ValueError)): + _parse_json("not json") + + +class TestParseConceptsPlan: + def test_dict_format(self): + text = json.dumps({ + "create": [{"name": "foo", "title": "Foo"}], + "update": [{"name": "bar", "title": "Bar"}], + "related": ["baz"], + }) + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert len(parsed["create"]) == 1 + assert len(parsed["update"]) == 1 + assert parsed["related"] == ["baz"] + + def test_fallback_list_format(self): + text = json.dumps([{"name": "foo", "title": "Foo"}]) + parsed = _parse_json(text) + assert isinstance(parsed, list) + + def test_fenced_dict(self): + text = '```json\n{"create": [], "update": [], "related": []}\n```' + parsed = _parse_json(text) + assert isinstance(parsed, dict) + assert parsed["create"] == [] + + +class TestParseBriefContent: + def test_dict_with_brief_and_content(self): + text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."}) + parsed = _parse_json(text) + assert parsed["brief"] == "A short desc" + assert "# Full page" in parsed["content"] + + def test_plain_text_fallback(self): + """If LLM returns plain text, _parse_json raises — caller handles fallback.""" + with pytest.raises((json.JSONDecodeError, ValueError)): + _parse_json("Just plain markdown text without JSON") + + +class TestWriteSummary: + def test_writes_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "# Summary\n\nContent here.") + path = wiki / "summaries" / "my-doc.md" + assert path.exists() + text = path.read_text() + assert "doc_type: short" in text + assert "full_text: sources/my-doc.md" in text + assert "# Summary" in text + + def test_writes_without_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_summary(wiki, "my-doc", "# Summary\n\nContent here.") + path = wiki / "summaries" / "my-doc.md" + text = path.read_text() + assert "doc_type: short" in text + assert "full_text: 
sources/my-doc.md" in text + + +class TestWriteConcept: + def test_new_concept_with_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False, brief="Mechanism for selective focus") + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "brief: Mechanism for selective focus" in text + assert "# Attention" in text + + def test_new_concept_without_brief(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + _write_concept(wiki, "attention", "# Attention\n\nDetails.", "paper.pdf", False) + path = wiki / "concepts" / "attention.md" + assert path.exists() + text = path.read_text() + assert "sources: [paper.pdf]" in text + assert "brief:" not in text + + def test_update_concept_updates_brief(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\nbrief: Old brief\n---\n\n# Attention\n\nOld content.", + encoding="utf-8", + ) + _write_concept(wiki, "attention", "New info.", "paper2.pdf", True, brief="Updated brief") + text = (concepts / "attention.md").read_text() + assert "paper2.pdf" in text + assert "paper1.pdf" in text + assert "brief: Updated brief" in text + assert "Old brief" not in text + + def test_update_concept_appends_source(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nOld content.", + encoding="utf-8", + ) + _write_concept(wiki, "attention", "New info from paper2.", "paper2.pdf", True) + text = (concepts / "attention.md").read_text() + assert "paper2.pdf" in text + assert "paper1.pdf" in text + assert "New info from paper2." 
in text + + +class TestUpdateIndex: + def test_appends_entries_with_briefs(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention", "transformer"], + doc_brief="Introduces transformers", + concept_briefs={"attention": "Focus mechanism", "transformer": "NN architecture"}) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]] (short) — Introduces transformers" in text + assert "[[concepts/attention]] — Focus mechanism" in text + assert "[[concepts/transformer]] — NN architecture" in text + + def test_no_duplicates(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n- [[summaries/my-doc]] — Old brief\n\n## Concepts\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", [], doc_brief="New brief") + text = (wiki / "index.md").read_text() + assert text.count("[[summaries/my-doc]]") == 1 + + def test_backwards_compat_no_briefs(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index(wiki, "my-doc", ["attention"]) + text = (wiki / "index.md").read_text() + assert "[[summaries/my-doc]]" in text + assert "[[concepts/attention]]" in text + + +class TestReadWikiContext: + def test_empty_wiki(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + index, concepts = _read_wiki_context(wiki) + assert index == "" + assert concepts == [] + + def test_with_content(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text("# Index\n", encoding="utf-8") + concepts_dir = wiki / "concepts" + concepts_dir.mkdir() + (concepts_dir / "attention.md").write_text("# Attention", encoding="utf-8") + (concepts_dir / "transformer.md").write_text("# Transformer", 
encoding="utf-8") + index, concepts = _read_wiki_context(wiki) + assert "# Index" in index + assert concepts == ["attention", "transformer"] + + +class TestReadConceptBriefs: + def test_empty_wiki(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "concepts").mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_no_concepts_dir(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + assert _read_concept_briefs(wiki) == "(none yet)" + + def test_reads_briefs_with_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nAttention is a mechanism that allows models to focus on relevant parts.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention:" in result + assert "Attention is a mechanism" in result + assert "sources" not in result + assert "---" not in result + + def test_reads_briefs_without_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "transformer.md").write_text( + "Transformer is a neural network architecture based on attention.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- transformer:" in result + assert "Transformer is a neural network" in result + + def test_truncates_long_content(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + long_body = "A" * 300 + (concepts / "longconcept.md").write_text(long_body, encoding="utf-8") + result = _read_concept_briefs(wiki) + # The brief part should be truncated at 150 chars + brief = result.split("- longconcept: ", 1)[1] + assert len(brief) == 150 + assert brief == "A" * 150 + + def test_sorted_alphabetically(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / 
"zebra.md").write_text("Zebra concept.", encoding="utf-8") + (concepts / "apple.md").write_text("Apple concept.", encoding="utf-8") + (concepts / "mango.md").write_text("Mango concept.", encoding="utf-8") + result = _read_concept_briefs(wiki) + lines = result.strip().splitlines() + slugs = [line.split(":")[0].lstrip("- ") for line in lines] + assert slugs == ["apple", "mango", "zebra"] + + def test_reads_brief_from_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\nbrief: Selective focus mechanism\n---\n\n# Attention\n\nLong content...", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- attention: Selective focus mechanism" in result + + def test_falls_back_to_body_truncation(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "old.md").write_text( + "---\nsources: [paper.pdf]\n---\n\nOld concept without brief field.", + encoding="utf-8", + ) + result = _read_concept_briefs(wiki) + assert "- old: Old concept without brief field." 
in result + + +class TestBacklinkSummary: + def test_adds_missing_concept_links(self, tmp_path): + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + (summaries / "paper.md").write_text( + "---\nsources: [paper.pdf]\n---\n\n# Summary\n\nContent about attention.", + encoding="utf-8", + ) + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + text = (summaries / "paper.md").read_text() + assert "[[concepts/attention]]" in text + assert "[[concepts/transformer]]" in text + + def test_skips_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + (summaries / "paper.md").write_text( + "---\nsources: [paper.pdf]\n---\n\n# Summary\n\nSee [[concepts/attention]].", + encoding="utf-8", + ) + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + text = (summaries / "paper.md").read_text() + # attention already linked, should not duplicate + assert text.count("[[concepts/attention]]") == 1 + # transformer should be added + assert "[[concepts/transformer]]" in text + + def test_no_op_when_all_linked(self, tmp_path): + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + original = "# Summary\n\n[[concepts/attention]] and [[concepts/transformer]]" + (summaries / "paper.md").write_text(original, encoding="utf-8") + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + assert (summaries / "paper.md").read_text() == original + + def test_skips_if_file_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + # Should not raise + _backlink_summary(wiki, "nonexistent", ["attention"]) + + def test_merges_into_existing_section(self, tmp_path): + """Second add should merge into existing ## Related Concepts, not duplicate.""" + wiki = tmp_path / "wiki" + summaries = wiki / "summaries" + summaries.mkdir(parents=True) + (summaries / "paper.md").write_text( + "# Summary\n\nContent.\n\n## Related 
Concepts\n- [[concepts/attention]]\n", + encoding="utf-8", + ) + _backlink_summary(wiki, "paper", ["attention", "transformer"]) + text = (summaries / "paper.md").read_text() + assert text.count("## Related Concepts") == 1 + assert "[[concepts/transformer]]" in text + assert text.count("[[concepts/attention]]") == 1 + + +class TestBacklinkConcepts: + def test_adds_summary_link_to_concept(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper.pdf]\n---\n\n# Attention\n\nContent.", + encoding="utf-8", + ) + _backlink_concepts(wiki, "paper", ["attention"]) + text = (concepts / "attention.md").read_text() + assert "[[summaries/paper]]" in text + assert "## Related Documents" in text + + def test_skips_if_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "# Attention\n\nBased on [[summaries/paper]].", + encoding="utf-8", + ) + _backlink_concepts(wiki, "paper", ["attention"]) + text = (concepts / "attention.md").read_text() + assert text.count("[[summaries/paper]]") == 1 + assert "## Related Documents" not in text + + def test_merges_into_existing_section(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "# Attention\n\n## Related Documents\n- [[summaries/old-paper]]\n", + encoding="utf-8", + ) + _backlink_concepts(wiki, "new-paper", ["attention"]) + text = (concepts / "attention.md").read_text() + assert text.count("## Related Documents") == 1 + assert "[[summaries/old-paper]]" in text + assert "[[summaries/new-paper]]" in text + + def test_skips_missing_concept_file(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "concepts").mkdir(parents=True) + # Should not raise + _backlink_concepts(wiki, "paper", ["nonexistent"]) + + +class 
TestAddRelatedLink: + def test_adds_see_also_link(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSome content.", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper2.pdf") + text = (concepts / "attention.md").read_text() + assert "[[summaries/new-doc]]" in text + assert "paper2.pdf" in text + + def test_skips_if_already_linked(self, tmp_path): + wiki = tmp_path / "wiki" + concepts = wiki / "concepts" + concepts.mkdir(parents=True) + (concepts / "attention.md").write_text( + "---\nsources: [paper1.pdf]\n---\n\n# Attention\n\nSee also: [[summaries/new-doc]]", + encoding="utf-8", + ) + _add_related_link(wiki, "attention", "new-doc", "paper1.pdf") + text = (concepts / "attention.md").read_text() + assert text.count("[[summaries/new-doc]]") == 1 + + def test_skips_if_file_missing(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + # Should not raise + _add_related_link(wiki, "nonexistent", "doc", "file.pdf") + + +def _mock_completion(responses: list[str]): + """Create a mock for litellm.completion that returns responses in order.""" + call_count = {"n": 0} + + def side_effect(*args, **kwargs): + idx = min(call_count["n"], len(responses) - 1) + call_count["n"] += 1 + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = responses[idx] + mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50) + mock_resp.usage.prompt_tokens_details = None + return mock_resp + + return side_effect + + +def _mock_acompletion(responses: list[str]): + """Create an async mock for litellm.acompletion.""" + call_count = {"n": 0} + + async def side_effect(*args, **kwargs): + idx = min(call_count["n"], len(responses) - 1) + call_count["n"] += 1 + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + mock_resp.choices[0].message.content = 
responses[idx] + mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50) + mock_resp.usage.prompt_tokens_details = None + return mock_resp + + return side_effect class TestCompileShortDoc: @pytest.mark.asyncio - async def test_calls_runner_run(self, tmp_path): - # Create a source file - wiki_dir = tmp_path / "wiki" - wiki_dir.mkdir() - source_path = wiki_dir / "sources" / "my_doc.md" - source_path.parent.mkdir(parents=True) - source_path.write_text("# My Doc\n\nSome content.", encoding="utf-8") - - # Create .openkb dir for agent build - openkb_dir = tmp_path / ".openkb" - openkb_dir.mkdir() - - mock_result = MagicMock() - mock_result.final_output = "Done" + async def test_full_pipeline(self, tmp_path): + # Setup KB structure + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nSome content about transformers.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_response = json.dumps({ + "brief": "Discusses transformers", + "content": "# Summary\n\nThis document discusses transformers.", + }) + concepts_list_response = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], + "related": [], + }) + concept_page_response = json.dumps({ + "brief": "NN architecture using self-attention", + "content": "# Transformer\n\nA neural network architecture.", + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_response, concepts_list_response]) + ) + mock_litellm.acompletion = AsyncMock( + 
side_effect=_mock_acompletion([concept_page_response]) + ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") - with patch("openkb.agent.compiler.Runner.run", new_callable=AsyncMock) as mock_run: - mock_run.return_value = mock_result - await compile_short_doc("my_doc", source_path, tmp_path, "gpt-4o-mini") + # Verify summary written + summary_path = wiki / "summaries" / "test-doc.md" + assert summary_path.exists() + assert "full_text: sources/test-doc.md" in summary_path.read_text() - mock_run.assert_called_once() - call_args = mock_run.call_args - agent_arg = call_args[0][0] - message_arg = call_args[0][1] + # Verify concept written + concept_path = wiki / "concepts" / "transformer.md" + assert concept_path.exists() + assert "sources: [summaries/test-doc.md]" in concept_path.read_text() - assert agent_arg.name == "wiki-compiler" - assert "my_doc" in message_arg - assert "Some content." in message_arg - assert "Generate summary" in message_arg + # Verify index updated + index_text = (wiki / "index.md").read_text() + assert "[[summaries/test-doc]]" in index_text + assert "[[concepts/transformer]]" in index_text @pytest.mark.asyncio - async def test_message_contains_doc_name_and_content(self, tmp_path): - wiki_dir = tmp_path / "wiki" - source_path = wiki_dir / "sources" / "test_paper.md" - source_path.parent.mkdir(parents=True) - source_path.write_text("# Test Paper\n\nKey findings here.", encoding="utf-8") - + async def test_handles_bad_json(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "doc.md" + source_path.write_text("Content", encoding="utf-8") (tmp_path / ".openkb").mkdir() - captured = {} - - async def fake_run(agent, message, **kwargs): - captured["message"] = message - return MagicMock(final_output="ok") - - with 
patch("openkb.agent.compiler.Runner.run", side_effect=fake_run): - await compile_short_doc("test_paper", source_path, tmp_path, "gpt-4o-mini") + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion(["Plain summary text", "not valid json"]) + ) + # Should not raise + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") - assert "test_paper" in captured["message"] - assert "Key findings here." in captured["message"] + # Summary should still be written + assert (wiki / "summaries" / "doc.md").exists() class TestCompileLongDoc: @pytest.mark.asyncio - async def test_calls_runner_run(self, tmp_path): - wiki_dir = tmp_path / "wiki" - summary_path = wiki_dir / "summaries" / "big_doc.md" - summary_path.parent.mkdir(parents=True) - summary_path.write_text("# Big Doc Summary\n\nSection tree.", encoding="utf-8") - + async def test_full_pipeline(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + summary_path = wiki / "summaries" / "big-doc.md" + summary_path.write_text("# Big Doc\n\nPageIndex summary tree.", encoding="utf-8") openkb_dir = tmp_path / ".openkb" openkb_dir.mkdir() - # Write minimal config (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "big-doc.pdf").write_bytes(b"fake") + + overview_response = "Overview of the big document." 
+ concepts_list_response = json.dumps({ + "create": [{"name": "deep-learning", "title": "Deep Learning"}], + "update": [], + "related": [], + }) + concept_page_response = json.dumps({ + "brief": "Subfield of ML using neural networks", + "content": "# Deep Learning\n\nA subfield of ML.", + }) + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([overview_response, concepts_list_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await compile_long_doc( + "big-doc", summary_path, "doc-123", tmp_path, "gpt-4o-mini" + ) - mock_result = MagicMock() - mock_result.final_output = "Done" + concept_path = wiki / "concepts" / "deep-learning.md" + assert concept_path.exists() + assert "Deep Learning" in concept_path.read_text() - with patch("openkb.agent.compiler.Runner.run", new_callable=AsyncMock) as mock_run, \ - patch("openkb.agent.compiler.PageIndexClient") as mock_client_cls: - mock_client = MagicMock() - mock_client_cls.return_value = mock_client - mock_run.return_value = mock_result + index_text = (wiki / "index.md").read_text() + assert "[[summaries/big-doc]]" in index_text + assert "[[concepts/deep-learning]]" in index_text - await compile_long_doc( - "big_doc", summary_path, "doc-abc123", tmp_path, "gpt-4o-mini" - ) - mock_run.assert_called_once() - call_args = mock_run.call_args - message_arg = call_args[0][1] +class TestCompileConceptsPlan: + """Integration tests for _compile_concepts with the new plan format.""" + + def _setup_wiki(self, tmp_path, existing_concepts=None): + """Helper to set up a wiki directory with optional existing concepts.""" + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n", + encoding="utf-8", + ) + (tmp_path / "raw").mkdir(exist_ok=True) + (tmp_path / "raw" 
/ "test-doc.pdf").write_bytes(b"fake") - assert "big_doc" in message_arg - assert "doc-abc123" in message_arg - assert "Do NOT regenerate summary" in message_arg + if existing_concepts: + for name, content in existing_concepts.items(): + (wiki / "concepts" / f"{name}.md").write_text( + content, encoding="utf-8", + ) + + return wiki @pytest.mark.asyncio - async def test_long_doc_agent_has_four_tools(self, tmp_path): - wiki_dir = tmp_path / "wiki" - summary_path = wiki_dir / "summaries" / "big.md" - summary_path.parent.mkdir(parents=True) - summary_path.write_text("Summary content", encoding="utf-8") + async def test_create_and_update_flow(self, tmp_path): + """Pre-existing 'attention' concept; plan creates 'flash-attention' and updates 'attention'.""" + wiki = self._setup_wiki(tmp_path, existing_concepts={ + "attention": "---\nsources: [old-paper.pdf]\n---\n\n# Attention\n\nOriginal content about attention.", + }) + + plan_response = json.dumps({ + "create": [{"name": "flash-attention", "title": "Flash Attention"}], + "update": [{"name": "attention", "title": "Attention"}], + "related": [], + }) + create_page_response = json.dumps({ + "brief": "Efficient attention algorithm", + "content": "# Flash Attention\n\nAn efficient attention algorithm.", + }) + update_page_response = json.dumps({ + "brief": "Updated attention mechanism", + "content": "# Attention\n\nUpdated content with new info.", + }) + + system_msg = {"role": "system", "content": "You are a wiki agent."} + doc_msg = {"role": "user", "content": "Document about attention mechanisms."} + summary = "Summary of the document." 
+ + call_order = {"n": 0} + + async def ordered_acompletion(*args, **kwargs): + idx = call_order["n"] + call_order["n"] += 1 + mock_resp = MagicMock() + mock_resp.choices = [MagicMock()] + # create tasks come first, then update tasks + if idx == 0: + mock_resp.choices[0].message.content = create_page_response + else: + mock_resp.choices[0].message.content = update_page_response + mock_resp.usage = MagicMock(prompt_tokens=100, completion_tokens=50) + mock_resp.usage.prompt_tokens_details = None + return mock_resp + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=ordered_acompletion + ) + await _compile_concepts( + wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg, + summary, "test-doc", 5, + ) - openkb_dir = tmp_path / ".openkb" - openkb_dir.mkdir() - (openkb_dir / "config.yaml").write_text("model: gpt-4o-mini\n") + # Verify flash-attention created + fa_path = wiki / "concepts" / "flash-attention.md" + assert fa_path.exists() + fa_text = fa_path.read_text() + assert "sources: [summaries/test-doc.md]" in fa_text + assert "Flash Attention" in fa_text + + # Verify attention updated (is_update=True path in _write_concept) + att_path = wiki / "concepts" / "attention.md" + assert att_path.exists() + att_text = att_path.read_text() + assert "summaries/test-doc.md" in att_text + assert "old-paper.pdf" in att_text + + # Verify index updated + index_text = (wiki / "index.md").read_text() + assert "[[concepts/flash-attention]]" in index_text + assert "[[concepts/attention]]" in index_text + + @pytest.mark.asyncio + async def test_related_adds_link_no_llm(self, tmp_path): + """Plan has only related items. 
No acompletion calls should be made.""" + wiki = self._setup_wiki(tmp_path, existing_concepts={ + "transformer": "---\nsources: [old.pdf]\n---\n\n# Transformer\n\nContent about transformers.", + }) + + plan_response = json.dumps({ + "create": [], + "update": [], + "related": ["transformer"], + }) + + system_msg = {"role": "system", "content": "You are a wiki agent."} + doc_msg = {"role": "user", "content": "Document content."} + summary = "Summary." + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([plan_response]) + ) + mock_litellm.acompletion = AsyncMock() + await _compile_concepts( + wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg, + summary, "test-doc", 5, + ) + # acompletion should never be called — related is code-only + mock_litellm.acompletion.assert_not_called() - captured_agent = {} + # Verify link added to transformer page + transformer_text = (wiki / "concepts" / "transformer.md").read_text() + assert "[[summaries/test-doc]]" in transformer_text + assert "summaries/test-doc.md" in transformer_text - async def fake_run(agent, message, **kwargs): - captured_agent["agent"] = agent - return MagicMock(final_output="ok") + @pytest.mark.asyncio + async def test_fallback_list_format(self, tmp_path): + """LLM returns a flat array instead of dict — treated as all create.""" + wiki = self._setup_wiki(tmp_path) + + plan_response = json.dumps([ + {"name": "attention", "title": "Attention"}, + ]) + concept_page_response = json.dumps({ + "brief": "A mechanism for focusing", + "content": "# Attention\n\nA mechanism for focusing.", + }) + + system_msg = {"role": "system", "content": "You are a wiki agent."} + doc_msg = {"role": "user", "content": "Document content."} + summary = "Summary." 
+ + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_page_response]) + ) + await _compile_concepts( + wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg, + summary, "test-doc", 5, + ) - with patch("openkb.agent.compiler.Runner.run", side_effect=fake_run), \ - patch("openkb.agent.compiler.PageIndexClient") as mock_client_cls: - mock_client_cls.return_value = MagicMock() + # Verify concept was created (not updated) + att_path = wiki / "concepts" / "attention.md" + assert att_path.exists() + att_text = att_path.read_text() + assert "sources: [summaries/test-doc.md]" in att_text + assert "Attention" in att_text - await compile_long_doc( - "big", summary_path, "doc-xyz", tmp_path, "gpt-4o-mini" + +class TestBriefIntegration: + @pytest.mark.asyncio + async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): + wiki = tmp_path / "wiki" + (wiki / "sources").mkdir(parents=True) + (wiki / "summaries").mkdir(parents=True) + (wiki / "concepts").mkdir(parents=True) + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + source_path = wiki / "sources" / "test-doc.md" + source_path.write_text("# Test Doc\n\nContent.", encoding="utf-8") + (tmp_path / ".openkb").mkdir() + (tmp_path / "raw").mkdir() + (tmp_path / "raw" / "test-doc.pdf").write_bytes(b"fake") + + summary_resp = json.dumps({ + "brief": "A paper about transformers", + "content": "# Summary\n\nThis paper discusses transformers.", + }) + plan_resp = json.dumps({ + "create": [{"name": "transformer", "title": "Transformer"}], + "update": [], + "related": [], + }) + concept_resp = json.dumps({ + "brief": "NN architecture using self-attention", + "content": "# Transformer\n\nA neural network architecture.", + }) + + with patch("openkb.agent.compiler.litellm") as 
mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_resp, plan_resp]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_acompletion([concept_resp]) ) + await compile_short_doc("test-doc", source_path, tmp_path, "gpt-4o-mini") + + # Summary frontmatter has doc_type and full_text + summary_text = (wiki / "summaries" / "test-doc.md").read_text() + assert "doc_type: short" in summary_text + assert "full_text: sources/test-doc.md" in summary_text + + # Concept frontmatter has brief + concept_text = (wiki / "concepts" / "transformer.md").read_text() + assert "brief: NN architecture using self-attention" in concept_text - agent = captured_agent["agent"] - assert len(agent.tools) == 4 - tool_names = {t.name for t in agent.tools} - assert "get_page_content" in tool_names + # Index has briefs + index_text = (wiki / "index.md").read_text() + assert "— A paper about transformers" in index_text + assert "— NN architecture using self-attention" in index_text From 901d8bc3b98ea61b6a02c22b774bce0faa67b7f1 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 08:04:10 +0800 Subject: [PATCH 02/26] feat: brief system, per-page JSON sources, and unified query agent - Add get_page_content tool and parse_pages helper for page-level access - Store long doc sources as per-page JSON extracted by pymupdf - Unify summary frontmatter to doc_type + full_text fields - Update schema and tree renderer for new frontmatter format - All image paths use sources/images/ prefix relative to wiki root Co-authored-by: Ray --- openkb/agent/tools.py | 117 ++++++++++++++++++++++++++++++++++ openkb/indexer.py | 52 +++++---------- openkb/schema.py | 5 +- openkb/tree_renderer.py | 39 +----------- tests/test_agent_tools.py | 87 ++++++++++++++++++++++++- tests/test_indexer.py | 55 +++++++++++----- tests/test_tree_renderer.py | 124 +----------------------------------- 7 files changed, 266 insertions(+), 213 deletions(-) diff --git 
a/openkb/agent/tools.py b/openkb/agent/tools.py index 7a5b1ca..2fe930b 100644 --- a/openkb/agent/tools.py +++ b/openkb/agent/tools.py @@ -6,6 +6,7 @@ """ from __future__ import annotations +import json as _json from pathlib import Path @@ -52,6 +53,121 @@ def read_wiki_file(path: str, wiki_root: str) -> str: return full_path.read_text(encoding="utf-8") +def parse_pages(pages: str) -> list[int]: + """Parse a page specification string into a sorted, deduplicated list of page numbers. + + Args: + pages: Page spec such as ``"3-5,7,10-12"``. + + Returns: + Sorted list of positive page numbers, e.g. ``[3, 4, 5, 7, 10, 11, 12]``. + """ + result: set[int] = set() + for part in pages.split(","): + part = part.strip() + if "-" in part: + # Handle ranges like "3-5"; also handle negative numbers by only + # splitting on the first "-" that follows a digit. + segments = part.split("-") + # Re-join to handle leading negatives: segments[0] may be empty + # if part starts with "-". We just try to parse start/end. + try: + if len(segments) == 2: + start, end = int(segments[0]), int(segments[1]) + result.update(range(start, end + 1)) + elif len(segments) == 3 and segments[0] == "": + # e.g. "-1" split gives ['', '1'] + result.add(-int(segments[1])) + # More complex cases (e.g. negative range) are ignored. + except ValueError: + pass + else: + try: + result.add(int(part)) + except ValueError: + pass + return sorted(n for n in result if n > 0) + + +def get_page_content(doc_name: str, pages: str, wiki_root: str) -> str: + """Return formatted content for specified pages of a document. + + Reads ``{wiki_root}/sources/{doc_name}.json`` which must be a JSON array of + objects with at least ``{"page": int, "content": str}`` fields and an + optional ``"images"`` list of ``{"path": str, ...}`` objects. + + Args: + doc_name: Document name without extension (e.g. ``"paper"``). + pages: Page specification string (e.g. ``"1-3,7"``). + wiki_root: Absolute path to the wiki root directory. 
+ + Returns: + Formatted page content, or an error message string. + """ + root = Path(wiki_root).resolve() + target = (root / "sources" / f"{doc_name}.json").resolve() + if not target.is_relative_to(root): + return "Access denied: path escapes wiki root." + if not target.exists(): + return f"File not found: sources/{doc_name}.json" + + data = _json.loads(target.read_text(encoding="utf-8")) + requested = set(parse_pages(pages)) + matches = [entry for entry in data if entry.get("page") in requested] + + if not matches: + return f"No content found for pages {pages} in {doc_name}." + + parts: list[str] = [] + for entry in matches: + page_num = entry["page"] + content = entry.get("content", "") + block = f"[Page {page_num}]\n{content}" + images = entry.get("images") + if images: + paths = ", ".join(img["path"] for img in images if "path" in img) + if paths: + block += f"\n[Images: {paths}]" + parts.append(block) + + return "\n\n".join(parts) + "\n\n" + + +_MIME_TYPES = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".gif": "image/gif", + ".webp": "image/webp", + ".bmp": "image/bmp", +} + + +def read_wiki_image(path: str, wiki_root: str) -> dict: + """Read an image file from the wiki and return as base64 data URL. + + Args: + path: Image path relative to *wiki_root* (e.g. ``"sources/images/doc/p1_img1.png"``). + wiki_root: Absolute path to the wiki root directory. + + Returns: + A dict with ``type``, ``image_url`` keys for ``ToolOutputImage``, + or a dict with ``type``, ``text`` keys on error. 
+ """ + import base64 + + root = Path(wiki_root).resolve() + full_path = (root / path).resolve() + if not full_path.is_relative_to(root): + return {"type": "text", "text": "Access denied: path escapes wiki root."} + if not full_path.exists(): + return {"type": "text", "text": f"Image not found: {path}"} + + mime = _MIME_TYPES.get(full_path.suffix.lower(), "image/png") + b64 = base64.b64encode(full_path.read_bytes()).decode() + return {"type": "image", "image_url": f"data:{mime};base64,{b64}"} + + def write_wiki_file(path: str, content: str, wiki_root: str) -> str: """Write or overwrite a Markdown file in the wiki. @@ -72,3 +188,4 @@ def write_wiki_file(path: str, content: str, wiki_root: str) -> str: full_path.parent.mkdir(parents=True, exist_ok=True) full_path.write_text(content, encoding="utf-8") return f"Written: {path}" + diff --git a/openkb/indexer.py b/openkb/indexer.py index 18aafc6..dd8ddaf 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -1,9 +1,9 @@ """PageIndex indexer for long documents.""" from __future__ import annotations +import json as json_mod import logging -import re -import shutil + from dataclasses import dataclass from pathlib import Path @@ -12,12 +12,10 @@ from pageindex import IndexConfig, PageIndexClient from openkb.config import load_config -from openkb.tree_renderer import render_source_md, render_summary_md +from openkb.tree_renderer import render_summary_md logger = logging.getLogger(__name__) -_IMG_REF_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)") - @dataclass class IndexResult: @@ -28,31 +26,6 @@ class IndexResult: tree: dict -def _relocate_images(markdown: str, doc_stem: str, dest_images_dir: Path) -> str: - """Copy images from PageIndex internal paths to wiki/sources/images/ and rewrite refs. - - PageIndex stores images internally (e.g. .openkb/files/{collection}/{doc_id}/images/). - We copy them to dest_images_dir and rewrite paths to be relative to the .md file - (i.e. images/{doc_stem}/filename). 
- """ - dest_images_dir.mkdir(parents=True, exist_ok=True) - - def _replace(match: re.Match) -> str: - alt = match.group(1) - src_path_str = match.group(2) - src_path = Path(src_path_str) - if not src_path.exists(): - logger.warning("Image not found: %s", src_path) - return match.group(0) - filename = src_path.name - dest = dest_images_dir / filename - if not dest.exists(): - shutil.copy2(src_path, dest) - return f"![{alt}](images/{doc_stem}/{filename})" - - return _IMG_REF_RE.sub(_replace, markdown) - - def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: """Index a long PDF document using PageIndex and write wiki pages.""" openkb_dir = kb_dir / ".openkb" @@ -94,20 +67,27 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: description: str = doc.get("doc_description", "") structure: list = doc.get("structure", []) + # Debug: print doc keys and page_count to diagnose get_page_content range + logger.info("Doc keys: %s", list(doc.keys())) + logger.info("page_count from doc: %s", doc.get("page_count", "NOT PRESENT")) + tree = { "doc_name": doc_name, "doc_description": description, "structure": structure, } - # Write wiki/sources/ — copy images from PageIndex internal location - # and rewrite paths to be relative to the .md file (images/{stem}/filename) + # Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex) sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) - dest_images_dir = sources_dir / "images" / pdf_path.stem - source_md = render_source_md(tree, doc_name, doc_id) - source_md = _relocate_images(source_md, pdf_path.stem, dest_images_dir) - (sources_dir / f"{pdf_path.stem}.md").write_text(source_md, encoding="utf-8") + images_dir = sources_dir / "images" / pdf_path.stem + + from openkb.images import convert_pdf_to_pages + all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir) + + (sources_dir / f"{pdf_path.stem}.json").write_text( + 
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", + ) # Write wiki/summaries/ (no images, just summaries) summaries_dir = kb_dir / "wiki" / "summaries" diff --git a/openkb/schema.py b/openkb/schema.py index d0fc602..b2c8cf0 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -6,7 +6,7 @@ # Wiki Schema ## Directory Structure -- sources/ — Full-text converted from raw documents. Do not modify directly. +- sources/ — Document content. Short docs as .md, long docs as .json (per-page). Do not modify directly. - sources/images/ — Extracted images from documents, referenced by sources. - summaries/ — One per source document. Summary of key content. - concepts/ — Cross-document topic synthesis. Created when a theme spans multiple documents. @@ -35,10 +35,9 @@ ## Format - Use [[wikilink]] to link other wiki pages (e.g., [[concepts/attention]]) -- Summary pages header: `sources: [paper.pdf]` -- Concept pages header: `sources: [paper1.pdf, paper2.pdf, ...]` - Standard Markdown heading hierarchy - Keep each page focused on a single topic +- Do not include YAML frontmatter (---) in generated content; it is managed by code """ # Backward compat alias diff --git a/openkb/tree_renderer.py b/openkb/tree_renderer.py index f745b48..efad980 100644 --- a/openkb/tree_renderer.py +++ b/openkb/tree_renderer.py @@ -6,32 +6,12 @@ def _yaml_frontmatter(source_name: str, doc_id: str) -> str: """Return a YAML frontmatter block for a PageIndex wiki page.""" return ( "---\n" - f"source: {source_name}\n" - "type: pageindex\n" - f"doc_id: {doc_id}\n" + "doc_type: pageindex\n" + f"full_text: sources/{source_name}.json\n" "---\n" ) -def _render_nodes_source(nodes: list[dict], depth: int) -> str: - """Recursively render nodes for the *source* view (text content).""" - lines: list[str] = [] - heading_prefix = "#" * min(depth, 6) - for node in nodes: - title = node.get("title", "") - start = node.get("start_index", "") - end = node.get("end_index", "") - text = 
node.get("text", "") - children = node.get("nodes", []) - - lines.append(f"{heading_prefix} {title} (pages {start}\u2013{end})\n") - if text: - lines.append(f"{text}\n") - if children: - lines.append(_render_nodes_source(children, depth + 1)) - - return "\n".join(lines) - def _render_nodes_summary(nodes: list[dict], depth: int) -> str: """Recursively render nodes for the *summary* view (summaries only).""" @@ -53,24 +33,11 @@ def _render_nodes_summary(nodes: list[dict], depth: int) -> str: return "\n".join(lines) -def render_source_md(tree: dict, source_name: str, doc_id: str) -> str: - """Render the full-text (source) Markdown page for a PageIndex tree. - - The page begins with YAML frontmatter, then recursively renders - every node as a heading with its ``(pages X–Y)`` range and full text. - Heading level equals tree depth (h1 at root), capped at h6. - """ - frontmatter = _yaml_frontmatter(source_name, doc_id) - structure = tree.get("structure", []) - body = _render_nodes_source(structure, depth=1) - return frontmatter + "\n" + body - def render_summary_md(tree: dict, source_name: str, doc_id: str) -> str: """Render the summary Markdown page for a PageIndex tree. - Identical structure to :func:`render_source_md` but replaces node text - with ``Summary: {summary}`` for each node. + Renders each node as a heading with page range and its summary text. 
""" frontmatter = _yaml_frontmatter(source_name, doc_id) structure = tree.get("structure", []) diff --git a/tests/test_agent_tools.py b/tests/test_agent_tools.py index bfffc2f..3d95a88 100644 --- a/tests/test_agent_tools.py +++ b/tests/test_agent_tools.py @@ -5,7 +5,7 @@ import pytest -from openkb.agent.tools import list_wiki_files, read_wiki_file, write_wiki_file +from openkb.agent.tools import get_page_content, list_wiki_files, parse_pages, read_wiki_file, write_wiki_file # --------------------------------------------------------------------------- @@ -128,3 +128,88 @@ def test_returns_written_path(self, tmp_path): result = write_wiki_file("reports/health.md", "All good.", wiki_root) assert result == "Written: reports/health.md" + + +# --------------------------------------------------------------------------- +# parse_pages +# --------------------------------------------------------------------------- + + +class TestParsePages: + def test_single_page(self): + assert parse_pages("3") == [3] + + def test_range(self): + assert parse_pages("3-5") == [3, 4, 5] + + def test_comma_separated(self): + assert parse_pages("1,3,5") == [1, 3, 5] + + def test_mixed(self): + assert parse_pages("1-3,7,10-12") == [1, 2, 3, 7, 10, 11, 12] + + def test_deduplication(self): + assert parse_pages("3,3,3") == [3] + + def test_sorted(self): + assert parse_pages("5,1,3") == [1, 3, 5] + + def test_ignores_zero_and_negative(self): + assert parse_pages("0,-1,3") == [3] + + +# --------------------------------------------------------------------------- +# get_page_content +# --------------------------------------------------------------------------- + + +class TestGetPageContent: + def test_reads_pages_from_json(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + {"page": 3, "content": "Page three text."}, + ] + (sources / 
"paper.json").write_text(json.dumps(pages), encoding="utf-8") + result = get_page_content("paper", "1,3", wiki_root) + assert "[Page 1]" in result + assert "Page one text." in result + assert "[Page 3]" in result + assert "Page three text." in result + assert "Page two" not in result + + def test_returns_error_for_missing_file(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("nonexistent", "1", wiki_root) + assert "not found" in result.lower() + + def test_returns_error_for_no_matching_pages(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [{"page": 1, "content": "Only page."}] + (sources / "paper.json").write_text(json.dumps(pages), encoding="utf-8") + result = get_page_content("paper", "99", wiki_root) + assert "no content" in result.lower() + + def test_includes_images_info(self, tmp_path): + import json + wiki_root = str(tmp_path) + sources = tmp_path / "sources" + sources.mkdir() + pages = [{"page": 1, "content": "Text.", "images": [{"path": "images/p/img.png", "width": 100, "height": 80}]}] + (sources / "doc.json").write_text(json.dumps(pages), encoding="utf-8") + result = get_page_content("doc", "1", wiki_root) + assert "img.png" in result + + def test_path_escape_denied(self, tmp_path): + wiki_root = str(tmp_path) + (tmp_path / "sources").mkdir() + result = get_page_content("../../etc/passwd", "1", wiki_root) + assert "denied" in result.lower() or "not found" in result.lower() diff --git a/tests/test_indexer.py b/tests/test_indexer.py index c9c7101..3dbb677 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -23,8 +23,17 @@ def _make_fake_collection(self, doc_id: str, sample_tree: dict): "doc_type": "pdf", "structure": sample_tree["structure"], } + + # get_page_content returns empty list by default (overridden per test as needed) + col.get_page_content.return_value = [] return col + def _fake_pages(self): + return [ + 
{"page": 1, "content": "Page one text.", "images": []}, + {"page": 2, "content": "Page two text.", "images": []}, + ] + def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): doc_id = "abc-123" fake_col = self._make_fake_collection(doc_id, sample_tree) @@ -35,7 +44,8 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with patch("openkb.indexer.PageIndexClient", return_value=fake_client): + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): result = index_long_document(pdf_path, kb_dir) assert isinstance(result, IndexResult) @@ -43,24 +53,34 @@ def test_returns_index_result(self, kb_dir, sample_tree, tmp_path): assert result.description == sample_tree["doc_description"] assert result.tree is not None - def test_source_page_written(self, kb_dir, sample_tree, tmp_path): + def test_source_page_written_as_json(self, kb_dir, sample_tree, tmp_path): + """Long doc source should be written as JSON, not markdown.""" + import json as json_mod doc_id = "abc-123" fake_col = self._make_fake_collection(doc_id, sample_tree) fake_client = MagicMock() fake_client.collection.return_value = fake_col + # Mock get_page_content to return page data + fake_col.get_page_content.return_value = [ + {"page": 1, "content": "Page one text."}, + {"page": 2, "content": "Page two text."}, + ] pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with patch("openkb.indexer.PageIndexClient", return_value=fake_client): + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): index_long_document(pdf_path, kb_dir) - source_file = kb_dir / "wiki" / "sources" / "sample.md" - assert source_file.exists() - content = source_file.read_text(encoding="utf-8") - assert "type: 
pageindex" in content - assert "Introduction" in content + json_file = kb_dir / "wiki" / "sources" / "sample.json" + assert json_file.exists() + assert not (kb_dir / "wiki" / "sources" / "sample.md").exists() + data = json_mod.loads(json_file.read_text()) + assert len(data) == 2 + assert data[0]["page"] == 1 + assert data[0]["content"] == "Page one text." def test_summary_page_written(self, kb_dir, sample_tree, tmp_path): doc_id = "abc-123" @@ -72,13 +92,14 @@ def test_summary_page_written(self, kb_dir, sample_tree, tmp_path): pdf_path = tmp_path / "sample.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with patch("openkb.indexer.PageIndexClient", return_value=fake_client): + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): index_long_document(pdf_path, kb_dir) summary_file = kb_dir / "wiki" / "summaries" / "sample.md" assert summary_file.exists() content = summary_file.read_text(encoding="utf-8") - assert "type: pageindex" in content + assert "doc_type: pageindex" in content assert "Summary:" in content def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_path): @@ -92,13 +113,15 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat pdf_path = tmp_path / "report.pdf" pdf_path.write_bytes(b"%PDF-1.4 fake") - with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls: + with patch("openkb.indexer.PageIndexClient", return_value=fake_client) as mock_cls, \ + patch("openkb.images.convert_pdf_to_pages", return_value=self._fake_pages()): index_long_document(pdf_path, kb_dir) - # Verify PageIndexClient was instantiated + # Verify PageIndexClient was instantiated with correct IndexConfig mock_cls.assert_called_once() - # Check that index_config with correct flags was passed _, kwargs = mock_cls.call_args - ic = kwargs.get("index_config") or mock_cls.call_args[0][0] if mock_cls.call_args[0] else 
None - # Either as positional or keyword — either way PageIndexClient was called - assert mock_cls.called + ic = kwargs.get("index_config") + assert ic is not None, "index_config must be passed to PageIndexClient" + assert ic.if_add_node_text is True + assert ic.if_add_node_summary is True + assert ic.if_add_doc_description is True diff --git a/tests/test_tree_renderer.py b/tests/test_tree_renderer.py index 1d81b3b..f20e174 100644 --- a/tests/test_tree_renderer.py +++ b/tests/test_tree_renderer.py @@ -3,124 +3,7 @@ import pytest -from openkb.tree_renderer import render_source_md, render_summary_md - - -# --------------------------------------------------------------------------- -# render_source_md -# --------------------------------------------------------------------------- - - -class TestRenderSourceMd: - def test_has_yaml_frontmatter(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert output.startswith("---\n") - assert "source: Sample Document" in output - assert "type: pageindex" in output - assert "doc_id: doc-abc" in output - assert "---\n" in output - - def test_top_level_nodes_are_h1(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "# Introduction" in output - assert "# Conclusion" in output - - def test_nested_nodes_are_h2(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "## Background" in output - assert "## Motivation" in output - - def test_page_range_included(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "(pages 0–120)" in output # Introduction - assert "(pages 0–60)" in output # Background - assert "(pages 61–120)" in output # Motivation - assert "(pages 121–200)" in output # Conclusion - - def test_node_text_included(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - assert "This document introduces the 
core concepts of the system." in output - assert "Background information on the subject." in output - - def test_no_summary_in_source(self, sample_tree): - output = render_source_md(sample_tree, "Sample Document", "doc-abc") - # Source pages show text, not summaries - assert "Summary:" not in output - - def test_heading_depth_capped_at_6(self): - """Deeply nested nodes must not exceed h6.""" - deep_tree = { - "doc_name": "Deep", - "doc_description": "A deeply nested doc.", - "structure": [ - { - "title": "L1", - "start_index": 0, - "end_index": 10, - "text": "L1 text", - "summary": "L1 summary", - "nodes": [ - { - "title": "L2", - "start_index": 0, - "end_index": 5, - "text": "L2 text", - "summary": "L2 summary", - "nodes": [ - { - "title": "L3", - "start_index": 0, - "end_index": 3, - "text": "L3 text", - "summary": "L3 summary", - "nodes": [ - { - "title": "L4", - "start_index": 0, - "end_index": 1, - "text": "L4 text", - "summary": "L4 summary", - "nodes": [ - { - "title": "L5", - "start_index": 0, - "end_index": 1, - "text": "L5 text", - "summary": "L5 summary", - "nodes": [ - { - "title": "L6", - "start_index": 0, - "end_index": 1, - "text": "L6 text", - "summary": "L6 summary", - "nodes": [ - { - "title": "L7", - "start_index": 0, - "end_index": 1, - "text": "L7 text", - "summary": "L7 summary", - "nodes": [], - } - ], - } - ], - } - ], - } - ], - } - ], - } - ], - } - ], - } - output = render_source_md(deep_tree, "Deep", "doc-deep") - # L7 is at depth 7 — must render as h6, not h7 - assert "#######" not in output - assert "L7 text" in output +from openkb.tree_renderer import render_summary_md # --------------------------------------------------------------------------- @@ -132,9 +15,8 @@ class TestRenderSummaryMd: def test_has_yaml_frontmatter(self, sample_tree): output = render_summary_md(sample_tree, "Sample Document", "doc-abc") assert output.startswith("---\n") - assert "source: Sample Document" in output - assert "type: pageindex" in output - assert 
"doc_id: doc-abc" in output + assert "doc_type: pageindex" in output + assert "full_text: sources/Sample Document.json" in output def test_top_level_nodes_are_h1(self, sample_tree): output = render_summary_md(sample_tree, "Sample Document", "doc-abc") From 0bf7084976eee46d725fc32692b65823602a5882 Mon Sep 17 00:00:00 2001 From: mountain Date: Fri, 10 Apr 2026 08:04:21 +0800 Subject: [PATCH 03/26] fix: default model, API key warning, config and CI improvements - Change default model to gpt-5.4-mini - Warn when no LLM API key found instead of failing silently - Fix CI publish workflow and test isolation Co-authored-by: Ray --- .github/workflows/publish.yml | 7 ++++--- openkb/config.py | 32 +++++++++++++++++++++++++++++++- openkb/watcher.py | 11 ++++++----- tests/test_config.py | 2 +- tests/test_converter.py | 11 ++++------- tests/test_list_status.py | 6 ++++-- 6 files changed, 50 insertions(+), 19 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 78fd0e0..17b26c2 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,12 +8,13 @@ on: jobs: publish: runs-on: ubuntu-latest + environment: pypi permissions: id-token: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.2.2 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: "3.12" @@ -24,4 +25,4 @@ jobs: run: python -m build - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa # release/v1.11.0 diff --git a/openkb/config.py b/openkb/config.py index fbd7bca..b83e134 100644 --- a/openkb/config.py +++ b/openkb/config.py @@ -6,11 +6,14 @@ import yaml DEFAULT_CONFIG: dict[str, Any] = { - "model": "gpt-5.4", + "model": "gpt-5.4-mini", "language": "en", "pageindex_threshold": 20, } +GLOBAL_CONFIG_DIR = Path.home() / 
".config" / "openkb" +GLOBAL_CONFIG_PATH = GLOBAL_CONFIG_DIR / "global.yaml" + def load_config(config_path: Path) -> dict[str, Any]: """Load YAML config from config_path, merged with DEFAULT_CONFIG. @@ -30,3 +33,30 @@ def save_config(config_path: Path, config: dict) -> None: config_path.parent.mkdir(parents=True, exist_ok=True) with config_path.open("w", encoding="utf-8") as fh: yaml.safe_dump(config, fh, allow_unicode=True, sort_keys=True) + + +def load_global_config() -> dict[str, Any]: + """Load the global config from ~/.config/openkb/global.yaml.""" + if GLOBAL_CONFIG_PATH.exists(): + with GLOBAL_CONFIG_PATH.open("r", encoding="utf-8") as fh: + return yaml.safe_load(fh) or {} + return {} + + +def save_global_config(config: dict[str, Any]) -> None: + """Save the global config to ~/.config/openkb/global.yaml.""" + GLOBAL_CONFIG_DIR.mkdir(parents=True, exist_ok=True) + with GLOBAL_CONFIG_PATH.open("w", encoding="utf-8") as fh: + yaml.safe_dump(config, fh, allow_unicode=True, sort_keys=True) + + +def register_kb(kb_path: Path) -> None: + """Register a KB path in the global config's known_kbs list.""" + gc = load_global_config() + known = gc.get("known_kbs", []) + resolved = str(kb_path.resolve()) + if resolved not in known: + known.append(resolved) + gc["known_kbs"] = known + gc["default_kb"] = resolved + save_global_config(gc) diff --git a/openkb/watcher.py b/openkb/watcher.py index 77fdf24..2a0fae9 100644 --- a/openkb/watcher.py +++ b/openkb/watcher.py @@ -37,11 +37,12 @@ def __init__(self, callback: Callable[[list[str]], None], debounce_seconds: floa def _schedule_flush(self) -> None: """Cancel any existing timer and start a fresh debounce timer.""" - if self._timer is not None: - self._timer.cancel() - self._timer = threading.Timer(self._debounce_seconds, self._flush) - self._timer.daemon = True - self._timer.start() + with self._lock: + if self._timer is not None: + self._timer.cancel() + self._timer = threading.Timer(self._debounce_seconds, self._flush) + 
self._timer.daemon = True + self._timer.start() def _flush(self) -> None: """Call the callback with all collected pending paths, then clear.""" diff --git a/tests/test_config.py b/tests/test_config.py index 31bd0ab..495e075 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -10,7 +10,7 @@ def test_default_config_keys(): def test_default_config_values(): - assert DEFAULT_CONFIG["model"] == "gpt-5.4" + assert DEFAULT_CONFIG["model"] == "gpt-5.4-mini" assert DEFAULT_CONFIG["language"] == "en" assert DEFAULT_CONFIG["pageindex_threshold"] == 20 diff --git a/tests/test_converter.py b/tests/test_converter.py index 5efb6eb..6c184fd 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -81,27 +81,24 @@ def test_md_raw_file_copied(self, kb_dir): class TestConvertDocumentPdfShort: - def test_short_pdf_converted_via_markitdown(self, kb_dir, tmp_path): - """PDF under threshold is converted with markitdown.""" + def test_short_pdf_converted_via_pymupdf(self, kb_dir, tmp_path): + """PDF under threshold is converted with pymupdf (convert_pdf_with_images).""" src = tmp_path / "short.pdf" src.write_bytes(b"%PDF-1.4 fake content") - fake_result = MagicMock() - fake_result.text_content = "# Short PDF\n\nConverted content." 
- with ( patch("openkb.converter.pymupdf.open") as mock_mu, - patch("openkb.converter.MarkItDown") as mock_mid_cls, + patch("openkb.converter.convert_pdf_with_images", return_value="# Short PDF\n\nConverted.") as mock_cpwi, ): fake_doc = MagicMock() fake_doc.page_count = 5 # below default threshold of 20 fake_doc.__enter__ = MagicMock(return_value=fake_doc) fake_doc.__exit__ = MagicMock(return_value=False) mock_mu.return_value = fake_doc - mock_mid_cls.return_value.convert.return_value = fake_result result = convert_document(src, kb_dir) + mock_cpwi.assert_called_once() assert result.skipped is False assert result.is_long_doc is False assert result.source_path is not None diff --git a/tests/test_list_status.py b/tests/test_list_status.py index 0ef9f56..21b8de4 100644 --- a/tests/test_list_status.py +++ b/tests/test_list_status.py @@ -32,7 +32,8 @@ def _setup_kb(tmp_path: Path) -> Path: class TestListCommand: def test_list_no_kb(self, tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli._find_kb_dir", return_value=None): result = runner.invoke(cli, ["list"]) assert "No knowledge base found" in result.output @@ -91,7 +92,8 @@ def test_list_no_concepts_section_when_empty(self, tmp_path): class TestStatusCommand: def test_status_no_kb(self, tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli._find_kb_dir", return_value=None): result = runner.invoke(cli, ["status"]) assert "No knowledge base found" in result.output From 5a75301e046623b084a2b3c3ca2719dc6a851d1b Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:04:40 +0800 Subject: [PATCH 04/26] fix: improve init prompts, warning suppression, and CLI polish - Move warning suppression after imports to avoid markitdown override - Improve init prompts with explicit defaults - Use American English 
throughout (initialized, normalized, Synthesize) - Replace unicode ellipsis with ASCII - Remove empty explorations/reports dirs from init - Fix test isolation for _find_kb_dir --- openkb/cli.py | 190 ++++++++++++++++++++++++++++++-------- openkb/lint.py | 2 +- tests/test_add_command.py | 8 +- tests/test_cli.py | 13 ++- 4 files changed, 165 insertions(+), 48 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index da664f5..3683371 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -3,28 +3,71 @@ import asyncio import json +import logging import time from pathlib import Path import os +from agents import set_tracing_disabled +set_tracing_disabled(True) +# Use local model cost map — skip fetching from GitHub on every invocation +os.environ.setdefault("LITELLM_LOCAL_MODEL_COST_MAP", "True") + import click import litellm +litellm.suppress_debug_info = True from dotenv import load_dotenv -from openkb.config import DEFAULT_CONFIG, load_config, save_config +from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb from openkb.converter import convert_document from openkb.log import append_log from openkb.schema import AGENTS_MD -load_dotenv() +# Suppress warnings after all imports — markitdown overrides filters at import time +import warnings +warnings.filterwarnings("ignore") + +load_dotenv() # load from cwd (covers running inside the KB dir) + + +def _setup_llm_key(kb_dir: Path | None = None) -> None: + """Set LiteLLM API key from LLM_API_KEY env var if present. + + Load order (override=False, so first one wins): + 1. System environment variables (already set) + 2. KB-local .env (kb_dir/.env) + 3. Global .env (~/.config/openkb/.env) + Also propagates to provider-specific env vars (OPENAI_API_KEY, etc.) + so that the Agents SDK litellm provider can pick them up. 
+ """ + if kb_dir is not None: + env_file = kb_dir / ".env" + if env_file.exists(): + load_dotenv(env_file, override=False) + + from openkb.config import GLOBAL_CONFIG_DIR + global_env = GLOBAL_CONFIG_DIR / ".env" + if global_env.exists(): + load_dotenv(global_env, override=False) -def _setup_llm_key() -> None: - """Set LiteLLM API key from LLM_API_KEY env var if present.""" api_key = os.environ.get("LLM_API_KEY", "") - if api_key: + if not api_key: + # Check if any provider key is already set + has_key = any(os.environ.get(k) for k in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY")) + if not has_key: + click.echo( + "Warning: No LLM API key found. Set one of:\n" + f" 1. {kb_dir / '.env' if kb_dir else '/.env'} — LLM_API_KEY=sk-...\n" + f" 2. {GLOBAL_CONFIG_DIR / '.env'} — LLM_API_KEY=sk-...\n" + " 3. Export LLM_API_KEY in your shell profile" + ) + else: litellm.api_key = api_key + for env_var in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"): + if not os.environ.get(env_var): + os.environ[env_var] = api_key # Supported document extensions for the `add` command SUPPORTED_EXTENSIONS = { @@ -53,11 +96,29 @@ def _display_type(raw_type: str) -> str: # Helpers # --------------------------------------------------------------------------- -def _find_kb_dir() -> Path | None: - """Return the knowledge-base root if .openkb/ exists in cwd, else None.""" - candidate = Path(".openkb") - if candidate.exists() and candidate.is_dir(): - return Path(".") +def _find_kb_dir(override: Path | None = None) -> Path | None: + """Find the KB root: explicit override → walk up from cwd → global default_kb.""" + # 0. Explicit override (--kb-dir or OPENKB_DIR) + if override is not None: + if (override / ".openkb").is_dir(): + return override + return None + # 1. Walk up from cwd + current = Path.cwd().resolve() + while True: + if (current / ".openkb").is_dir(): + return current + parent = current.parent + if parent == current: + break + current = parent + # 2. 
Fall back to global config default_kb + gc = load_global_config() + default = gc.get("default_kb") + if default: + p = Path(default) + if (p / ".openkb").is_dir(): + return p return None @@ -73,9 +134,10 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: from openkb.agent.compiler import compile_long_doc, compile_short_doc from openkb.state import HashRegistry + logger = logging.getLogger(__name__) openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") - _setup_llm_key() + _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) registry = HashRegistry(openkb_dir / "hashes.json") @@ -85,6 +147,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: result = convert_document(file_path, kb_dir) except Exception as exc: click.echo(f" [ERROR] Conversion failed: {exc}") + logger.debug("Conversion traceback:", exc_info=True) return if result.skipped: @@ -95,20 +158,22 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: # 3/4. 
Index and compile if result.is_long_doc: - click.echo(f" Long document detected — indexing with PageIndex…") + click.echo(f" Long document detected — indexing with PageIndex...") try: from openkb.indexer import index_long_document index_result = index_long_document(result.raw_path, kb_dir) except Exception as exc: click.echo(f" [ERROR] Indexing failed: {exc}") + logger.debug("Indexing traceback:", exc_info=True) return summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md" - click.echo(f" Compiling long doc (doc_id={index_result.doc_id})…") + click.echo(f" Compiling long doc (doc_id={index_result.doc_id})...") for attempt in range(2): try: asyncio.run( - compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model) + compile_long_doc(doc_name, summary_path, index_result.doc_id, kb_dir, model, + doc_description=index_result.description) ) break except Exception as exc: @@ -117,9 +182,10 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: time.sleep(2) else: click.echo(f" [ERROR] Compilation failed: {exc}") + logger.debug("Compilation traceback:", exc_info=True) return else: - click.echo(f" Compiling short doc…") + click.echo(f" Compiling short doc...") for attempt in range(2): try: asyncio.run(compile_short_doc(doc_name, result.source_path, kb_dir, model)) @@ -130,6 +196,7 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: time.sleep(2) else: click.echo(f" [ERROR] Compilation failed: {exc}") + logger.debug("Compilation traceback:", exc_info=True) return # Register hash only after successful compilation @@ -146,8 +213,38 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: # --------------------------------------------------------------------------- @click.group() -def cli(): +@click.option("-v", "--verbose", is_flag=True, default=False, help="Enable verbose logging.") +@click.option("--kb-dir", "kb_dir_override", default=None, type=click.Path(exists=True, file_okay=False, resolve_path=True), help="Path to a 
KB root directory (overrides auto-detection).") +@click.pass_context +def cli(ctx, verbose, kb_dir_override): """OpenKB — Karpathy's LLM Knowledge Base workflow, powered by PageIndex.""" + logging.basicConfig( + format="%(name)s %(levelname)s: %(message)s", + level=logging.WARNING, + ) + if verbose: + logging.getLogger("openkb").setLevel(logging.DEBUG) + ctx.ensure_object(dict) + if kb_dir_override: + ctx.obj["kb_dir_override"] = Path(kb_dir_override) + else: + env_kb = os.environ.get("OPENKB_DIR") + if env_kb: + ctx.obj["kb_dir_override"] = Path(env_kb).resolve() + else: + ctx.obj["kb_dir_override"] = None + + +@cli.command() +@click.argument("path", default=".") +def use(path): + """Set PATH as the default knowledge base.""" + target = Path(path).resolve() + if not (target / ".openkb").is_dir(): + click.echo(f"Not a knowledge base: {target}") + return + register_kb(target) + click.echo(f"Default KB set to: {target}") @cli.command() @@ -160,22 +257,26 @@ def init(): # Interactive prompts model = click.prompt( - "Model (e.g. gpt-5.4, anthropic/claude-sonnet-4-6, gemini/gemini-3.1-pro-preview)", + f"Model (e.g. 
gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]", default=DEFAULT_CONFIG["model"], + show_default=False, + ) + language = click.prompt( + f"Language [default: {DEFAULT_CONFIG['language']}]", + default=DEFAULT_CONFIG["language"], + show_default=False, ) - language = click.prompt("Language", default=DEFAULT_CONFIG["language"]) pageindex_threshold = click.prompt( - "PageIndex threshold (pages)", + f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]", default=DEFAULT_CONFIG["pageindex_threshold"], type=int, + show_default=False, ) # Create directory structure Path("raw").mkdir(exist_ok=True) Path("wiki/sources/images").mkdir(parents=True, exist_ok=True) Path("wiki/summaries").mkdir(parents=True, exist_ok=True) Path("wiki/concepts").mkdir(parents=True, exist_ok=True) - Path("wiki/explorations").mkdir(parents=True, exist_ok=True) - Path("wiki/reports").mkdir(parents=True, exist_ok=True) # Write wiki files Path("wiki/AGENTS.md").write_text(AGENTS_MD, encoding="utf-8") @@ -195,14 +296,18 @@ def init(): save_config(openkb_dir / "config.yaml", config) (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8") - click.echo("Knowledge base initialised.") + # Register this KB in the global config + register_kb(Path.cwd()) + + click.echo("Knowledge base initialized.") @cli.command() @click.argument("path") -def add(path): +@click.pass_context +def add(ctx, path): """Add a document or directory of documents at PATH to the knowledge base.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. 
Run `openkb init` first.") return @@ -238,9 +343,10 @@ def add(path): @cli.command() @click.argument("question") @click.option("--save", is_flag=True, default=False, help="Save the answer to wiki/explorations/.") -def query(question, save): +@click.pass_context +def query(ctx, question, save): """Query the knowledge base with QUESTION.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return @@ -249,7 +355,7 @@ def query(question, save): openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") - _setup_llm_key() + _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) try: @@ -273,9 +379,10 @@ def query(question, save): @cli.command() -def watch(): +@click.pass_context +def watch(ctx): """Watch the raw/ directory for new documents and process them automatically.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return @@ -301,10 +408,13 @@ def on_new_files(paths): @cli.command() -@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues.") # TODO: --fix not yet implemented -def lint(fix): +@click.option("--fix", is_flag=True, default=False, help="Automatically fix lint issues (not yet implemented).") +@click.pass_context +def lint(ctx, fix): """Lint the knowledge base for structural and semantic inconsistencies.""" - kb_dir = _find_kb_dir() + if fix: + click.echo("Warning: --fix is not yet implemented. Running lint in report-only mode.") + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. 
Run `openkb init` first.") return @@ -314,16 +424,16 @@ def lint(fix): openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") - _setup_llm_key() + _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) # Structural lint - click.echo("Running structural lint…") + click.echo("Running structural lint...") structural_report = run_structural_lint(kb_dir) click.echo(structural_report) # Knowledge lint (semantic) - click.echo("Running knowledge lint…") + click.echo("Running knowledge lint...") try: knowledge_report = asyncio.run(run_knowledge_lint(kb_dir, model)) except Exception as exc: @@ -343,9 +453,10 @@ def lint(fix): @cli.command(name="list") -def list_cmd(): +@click.pass_context +def list_cmd(ctx): """List all documents in the knowledge base.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return @@ -403,9 +514,10 @@ def list_cmd(): @cli.command() -def status(): +@click.pass_context +def status(ctx): """Show the current status of the knowledge base.""" - kb_dir = _find_kb_dir() + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return diff --git a/openkb/lint.py b/openkb/lint.py index c1c9105..78b22e5 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -29,7 +29,7 @@ def _read_md(path: Path) -> str: def _all_wiki_pages(wiki: Path) -> dict[str, Path]: """Return a mapping of stem/relative-path → absolute Path for all .md files. - Keys are normalised: 'concepts/attention', 'summaries/paper', 'index', etc. + Keys are normalized: 'concepts/attention', 'summaries/paper', 'index', etc. 
""" pages: dict[str, Path] = {} for md in wiki.rglob("*.md"): diff --git a/tests/test_add_command.py b/tests/test_add_command.py index 0ad9397..2ad22e7 100644 --- a/tests/test_add_command.py +++ b/tests/test_add_command.py @@ -37,8 +37,9 @@ def test_finds_openkb_dir(self, tmp_path, monkeypatch): def test_returns_none_if_no_openkb(self, tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) - result = _find_kb_dir() - assert result is None + with patch("openkb.cli.load_global_config", return_value={}): + result = _find_kb_dir() + assert result is None class TestAddCommand: @@ -57,7 +58,8 @@ def _setup_kb(self, tmp_path): def test_add_missing_init(self, tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli._find_kb_dir", return_value=None): result = runner.invoke(cli, ["add", "somefile.pdf"]) assert "No knowledge base found" in result.output diff --git a/tests/test_cli.py b/tests/test_cli.py index 1ad10b3..afb961d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,6 @@ import json +from unittest.mock import patch + import pytest from click.testing import CliRunner @@ -8,7 +10,8 @@ def test_init_creates_structure(tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli.register_kb"): result = runner.invoke(cli, ["init"]) assert result.exit_code == 0 @@ -20,13 +23,11 @@ def test_init_creates_structure(tmp_path): assert (cwd / "wiki" / "sources" / "images").is_dir() assert (cwd / "wiki" / "summaries").is_dir() assert (cwd / "wiki" / "concepts").is_dir() - assert (cwd / "wiki" / "reports").is_dir() assert (cwd / ".openkb").is_dir() # Files assert (cwd / "wiki" / "AGENTS.md").is_file() assert (cwd / "wiki" / "log.md").is_file() - assert (cwd / "wiki" / "explorations").is_dir() assert (cwd / "wiki" / "index.md").is_file() assert (cwd / ".openkb" / 
"config.yaml").is_file() assert (cwd / ".openkb" / "hashes.json").is_file() @@ -42,7 +43,8 @@ def test_init_creates_structure(tmp_path): def test_init_schema_content(tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli.register_kb"): result = runner.invoke(cli, ["init"]) assert result.exit_code == 0 @@ -53,7 +55,8 @@ def test_init_schema_content(tmp_path): def test_init_already_exists(tmp_path): runner = CliRunner() - with runner.isolated_filesystem(temp_dir=tmp_path): + with runner.isolated_filesystem(temp_dir=tmp_path), \ + patch("openkb.cli.register_kb"): # First run should succeed result = runner.invoke(cli, ["init"]) assert result.exit_code == 0 From 8e9edebfc3bb662b0e039eff411c01496945add1 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:04:53 +0800 Subject: [PATCH 05/26] feat: improve query agent with multimodal get_image tool - Add get_image tool for viewing images referenced in source documents - Use ToolOutputImage for proper image content in LLM context - Update prompt: use full_text field, restrict get_page_content to pageindex - Add self-talk before tool calls, enforce concise answers - Prevent duplicate frontmatter in LLM-generated content via schema update --- openkb/agent/query.py | 217 ++++++++++-------------------------------- tests/test_query.py | 103 +++----------------- 2 files changed, 62 insertions(+), 258 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 6a740fb..d252ee6 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -3,14 +3,13 @@ from pathlib import Path -import litellm from agents import Agent, Runner, function_tool -import os -from pageindex import PageIndexClient +from agents import ToolOutputImage, ToolOutputText +from openkb.agent.tools import read_wiki_file, read_wiki_image -from openkb.agent.tools import list_wiki_files, read_wiki_file -from openkb.schema import 
SCHEMA_MD, get_agents_md +MAX_TURNS = 50 +from openkb.schema import get_agents_md _QUERY_INSTRUCTIONS_TEMPLATE = """\ You are a knowledge-base Q&A agent. You answer questions by searching the wiki. @@ -18,185 +17,73 @@ {schema_md} ## Search strategy -1. Start by reading index.md to understand what documents and concepts are available. -2. Read relevant summary pages (summaries/) to get document overviews. +1. Read index.md to see all documents and concepts with brief summaries. + Each document is marked (short) or (pageindex) to indicate its type. +2. Read relevant summary pages (summaries/) for document overviews. + Note: summaries may omit details. 3. Read concept pages (concepts/) for cross-document synthesis. -4. For long documents indexed with PageIndex, call pageindex_retrieve with the - document ID and the user's question to get detailed page-level content. -5. Synthesise a clear, well-cited answer. - -Always ground your answer in the wiki content. If you cannot find relevant -information, say so clearly. +4. When you need detailed source document content, each summary page has a + `full_text` frontmatter field with the path to the original document content: + - Short documents (doc_type: short): read_file with that path. + - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages) + with tight page ranges. The summary shows document tree structure with page + ranges to help you target. Never fetch the whole document. +5. When source content references images (e.g. ![image](sources/images/doc/file.png)), + use get_image to view them. Always view images when the question asks about + a figure, chart, diagram, or visual content. +6. Synthesize a clear, concise, well-cited answer grounded in wiki content. + +Answer based only on wiki content. Be concise. +Before each tool call, output one short sentence explaining the reason. + +If you cannot find relevant information, say so clearly. 
""" -def _pageindex_retrieve_impl(doc_id: str, question: str, openkb_dir: str, model: str) -> str: - """Retrieve relevant content from a long document via PageIndex. - - For cloud-indexed docs: delegates to col.query() directly. - For local docs: uses structure-based page selection + get_page_content. - """ - pageindex_api_key = os.environ.get("PAGEINDEX_API_KEY", "") - # Determine if this doc was cloud-indexed (cloud doc_ids have "pi-" prefix) - is_cloud_doc = doc_id.startswith("pi-") - - if is_cloud_doc: - # Cloud doc: use PageIndex streaming query (avoids timeout, shows progress) - import sys - import asyncio - import threading - - client = PageIndexClient(api_key=pageindex_api_key or None, model=model) - col = client.collection() - try: - stream = col.query(question, doc_ids=[doc_id], stream=True) - collected: list[str] = [] - done = threading.Event() - - async def _consume(): - try: - async for event in stream: - if event.type == "answer_delta": - sys.stdout.write(event.data) - sys.stdout.flush() - collected.append(event.data) - elif event.type == "tool_call": - name = event.data.get("name", "") - args = event.data.get("args", "") - sys.stdout.write(f"\n [PageIndex] {name}({args})\n") - sys.stdout.flush() - sys.stdout.write("\n") - sys.stdout.flush() - finally: - done.set() - - # Run streaming in a separate thread with its own event loop - def _run(): - loop = asyncio.new_event_loop() - loop.run_until_complete(_consume()) - loop.close() - - t = threading.Thread(target=_run, daemon=True) - t.start() - t.join(timeout=120) - return "".join(collected) if collected else "No answer from PageIndex." 
- except Exception as exc: - return f"Error querying cloud PageIndex: {exc}" - - # Local doc: use local PageIndex with structure-based retrieval - client = PageIndexClient(model=model, storage_path=openkb_dir) - col = client.collection() - - try: - structure = col.get_document_structure(doc_id) - except Exception as exc: - return f"Error retrieving document structure: {exc}" - - if not structure: - return "No structure found for document." - sections = [] - for idx, node in enumerate(structure): - title = node.get("title", f"Section {idx + 1}") - node_id = node.get("node_id", str(idx)) - summary = node.get("summary", "") - start = node.get("start_index", idx) - end = node.get("end_index", idx) - sections.append( - f"node_id={node_id} title='{title}' pages={start}-{end} summary='{summary}'" - ) - - sections_text = "\n".join(sections) - prompt = ( - f"Given the following document sections:\n{sections_text}\n\n" - f"Which page ranges are most relevant to this question: '{question}'?\n" - "Reply with a comma-separated list of page numbers or ranges (e.g. '1-3,7,10-12'). " - "Return ONLY the page specification, nothing else." - ) - - # 2. Ask LLM which pages are relevant - try: - response = litellm.completion( - model=model, - messages=[{"role": "user", "content": prompt}], - ) - page_spec = response.choices[0].message.content.strip() - except Exception as exc: - return f"Error selecting relevant pages: {exc}" - - if not page_spec: - return "Could not determine relevant pages." - - # 3. 
Fetch those pages - try: - pages = col.get_page_content(doc_id, page_spec) - except Exception as exc: - return f"Error fetching page content: {exc}" - - if not pages: - return f"No content found for pages: {page_spec}" - - parts = [] - for item in pages: - page_num = item.get("page_index", "?") - text = item.get("text", "") - parts.append(f"[Page {page_num}]\n{text}") - - return "\n\n".join(parts) - - -def build_query_agent(wiki_root: str, openkb_dir: str, model: str, language: str = "en") -> Agent: - """Build and return the Q&A agent. - - Args: - wiki_root: Absolute path to the wiki directory. - openkb_dir: Path to the .openkb/ state directory. - model: LLM model name. - language: Language code for wiki content (e.g. 'en', 'fr'). - - Returns: - Configured :class:`~agents.Agent` instance. - """ +def build_query_agent(wiki_root: str, model: str, language: str = "en") -> Agent: + """Build and return the Q&A agent.""" schema_md = get_agents_md(Path(wiki_root)) instructions = _QUERY_INSTRUCTIONS_TEMPLATE.format(schema_md=schema_md) instructions += f"\n\nIMPORTANT: Write all wiki content in {language} language." - @function_tool - def list_files(directory: str) -> str: - """List all Markdown files in a wiki subdirectory. - - Args: - directory: Subdirectory path relative to wiki root (e.g. 'sources'). - """ - return list_wiki_files(directory, wiki_root) - @function_tool def read_file(path: str) -> str: """Read a Markdown file from the wiki. - Args: path: File path relative to wiki root (e.g. 'summaries/paper.md'). """ return read_wiki_file(path, wiki_root) @function_tool - def pageindex_retrieve(doc_id: str, question: str) -> str: - """Retrieve relevant content from a long document via PageIndex. - - Use this when you need detailed content from a document that was - indexed with PageIndex (long documents). + def get_page_content_tool(doc_name: str, pages: str) -> str: + """Get text content of specific pages from a PageIndex (long) document. 
+ Only use for documents with doc_type: pageindex. For short documents, + use read_file instead. + Args: + doc_name: Document name (e.g. 'attention-is-all-you-need'). + pages: Page specification (e.g. '3-5,7,10-12'). + """ + from openkb.agent.tools import get_page_content + return get_page_content(doc_name, pages, wiki_root) + @function_tool + def get_image(image_path: str) -> ToolOutputImage | ToolOutputText: + """View an image from the wiki. + Use when source content references images you need to see. Args: - doc_id: PageIndex document identifier (found in index.md). - question: The question you are trying to answer. + image_path: Image path relative to wiki root (e.g. 'sources/images/doc/p1_img1.png'). """ - return _pageindex_retrieve_impl(doc_id, question, openkb_dir, model) + result = read_wiki_image(image_path, wiki_root) + if result["type"] == "image": + return ToolOutputImage(image_url=result["image_url"]) + return ToolOutputText(text=result["text"]) from agents.model_settings import ModelSettings return Agent( name="wiki-query", instructions=instructions, - tools=[list_files, read_file, pageindex_retrieve], + tools=[read_file, get_page_content_tool, get_image], model=f"litellm/{model}", model_settings=ModelSettings(parallel_tool_calls=False), ) @@ -224,15 +111,14 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals language: str = config.get("language", "en") wiki_root = str(kb_dir / "wiki") - openkb_path = str(openkb_dir) - agent = build_query_agent(wiki_root, openkb_path, model, language=language) + agent = build_query_agent(wiki_root, model, language=language) if not stream: - result = await Runner.run(agent, question) + result = await Runner.run(agent, question, max_turns=MAX_TURNS) return result.final_output or "" - result = Runner.run_streamed(agent, question) + result = Runner.run_streamed(agent, question, max_turns=MAX_TURNS) collected = [] async for event in result.stream_events(): if isinstance(event, 
RawResponsesStreamEvent): @@ -247,13 +133,10 @@ async def run_query(question: str, kb_dir: Path, model: str, stream: bool = Fals if item.type == "tool_call_item": raw = item.raw_item args = getattr(raw, "arguments", "{}") - sys.stdout.write(f"\n[tool call] {raw.name}({args})\n") + sys.stdout.write(f"\n[tool call] {raw.name}({args})\n\n") sys.stdout.flush() elif item.type == "tool_call_output_item": - output = str(item.output) - preview = output[:200] + "..." if len(output) > 200 else output - sys.stdout.write(f"[tool output] {preview}\n\n") - sys.stdout.flush() + pass sys.stdout.write("\n") sys.stdout.flush() return "".join(collected) if collected else result.final_output or "" diff --git a/tests/test_query.py b/tests/test_query.py index 084fc9e..e00d2ea 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -6,119 +6,40 @@ import pytest -from openkb.agent.query import _pageindex_retrieve_impl, build_query_agent, run_query +from openkb.agent.query import build_query_agent, run_query from openkb.schema import SCHEMA_MD class TestBuildQueryAgent: def test_agent_name(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert agent.name == "wiki-query" def test_agent_has_three_tools(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert len(agent.tools) == 3 def test_agent_tool_names(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") names = {t.name for t in agent.tools} - assert "list_files" in names assert "read_file" in names - assert "pageindex_retrieve" in names + assert "get_page_content_tool" in names + assert "get_image" in names - def test_instructions_reference_registered_pageindex_tool(self, tmp_path): - agent = 
build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") - tool_names = {t.name for t in agent.tools} - assert "pageindex_retrieve" in agent.instructions - assert "pageindex_retrieve" in tool_names + def test_instructions_mention_get_page_content(self, tmp_path): + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") + assert "get_page_content" in agent.instructions + assert "pageindex_retrieve" not in agent.instructions def test_schema_in_instructions(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "gpt-4o-mini") + agent = build_query_agent(str(tmp_path), "gpt-4o-mini") assert SCHEMA_MD in agent.instructions def test_agent_model(self, tmp_path): - agent = build_query_agent(str(tmp_path), str(tmp_path / "pi"), "my-model") + agent = build_query_agent(str(tmp_path), "my-model") assert agent.model == "litellm/my-model" -class TestPageindexRetrieve: - def test_returns_page_content(self, tmp_path): - mock_structure = [ - { - "node_id": "n1", - "title": "Introduction", - "start_index": 1, - "end_index": 5, - "summary": "Overview section", - } - ] - mock_pages = [ - {"page_index": 1, "text": "Introduction text here."}, - {"page_index": 2, "text": "More intro content."}, - ] - - mock_col = MagicMock() - mock_col.get_document_structure.return_value = mock_structure - mock_col.get_page_content.return_value = mock_pages - - mock_client = MagicMock() - mock_client.collection.return_value = mock_col - - with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \ - patch("openkb.agent.query.litellm.completion") as mock_llm, \ - patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False): - mock_llm.return_value = MagicMock( - choices=[MagicMock(message=MagicMock(content="1-2"))] - ) - result = _pageindex_retrieve_impl("doc123", "What is the intro?", "/db", "gpt-4o-mini") - - assert "Introduction text here." in result - assert "More intro content." 
in result - - def test_cloud_doc_uses_streaming_query(self, tmp_path): - """Cloud doc (pi- prefix) delegates to col.query(stream=True).""" - from dataclasses import dataclass - from typing import Any - - @dataclass - class FakeEvent: - type: str - data: Any - - class FakeStream: - async def __aiter__(self): - yield FakeEvent(type="answer_delta", data="Cloud ") - yield FakeEvent(type="answer_delta", data="answer about MCP.") - - mock_stream = FakeStream() - - mock_col = MagicMock() - mock_col.query.return_value = mock_stream - - mock_client = MagicMock() - mock_client.collection.return_value = mock_col - - with patch("openkb.agent.query.PageIndexClient", return_value=mock_client): - result = _pageindex_retrieve_impl("pi-abc123", "What is MCP?", "/db", "gpt-4o-mini") - - assert "Cloud answer about MCP." in result - mock_col.query.assert_called_once_with("What is MCP?", doc_ids=["pi-abc123"], stream=True) - - def test_local_empty_structure_returns_error(self, tmp_path): - """Local doc with empty structure returns error.""" - mock_col = MagicMock() - mock_col.get_document_structure.return_value = [] - - mock_client = MagicMock() - mock_client.collection.return_value = mock_col - - with patch("openkb.agent.query.PageIndexClient", return_value=mock_client), \ - patch.dict("os.environ", {"PAGEINDEX_API_KEY": ""}, clear=False): - result = _pageindex_retrieve_impl("local-uuid-123", "What?", "/db", "gpt-4o-mini") - - assert "No structure found" in result - - class TestRunQuery: @pytest.mark.asyncio async def test_run_query_returns_final_output(self, tmp_path): From 44bf83e1878d684b7cca8dc6792e691d2d9d2d48 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:08:45 +0800 Subject: [PATCH 06/26] refactor: unify image paths and add pymupdf per-page extraction - Add convert_pdf_to_pages for per-page content+image extraction - All image paths use sources/images/ prefix relative to wiki root - Remove page marker comments from short doc source markdown --- docs/.DS_Store | 
Bin 6148 -> 0 bytes openkb/images.py | 74 ++++++++++++++++++++++++++++++++++++++----- tests/test_images.py | 14 ++++---- 3 files changed, 73 insertions(+), 15 deletions(-) delete mode 100644 docs/.DS_Store diff --git a/docs/.DS_Store b/docs/.DS_Store deleted file mode 100644 index 56fffc4488504f8e54d58ac8d5d950d7d4bc43d7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKF=_)r43rWV1~)EK?iccd#W*j>2aIEPh6_1N+N<)eJS{VlAebYN!fwn6w6m+# z?6OmwPG;tt^WoWSZDvb2(Y`p$jr;VOJygVjaGddB?>5+{9Wnc#k-M_d&R~-D%O4Np z+i5=xC#J}xfE17dQa}nwfh#Fcg>|{Q(s?>Y3P^$bQGnlv1}FByDKS1B7@`FLE)WjG zJbDRWV*uC-r$j_xo>X8`y;=-UI^wPJdf}9qbn|LHyl(dDP&{tO`7P4Td!j}uAO$WJ zc+BOR_5TI_NB@6G(n<$EObXmufgdgD7FqxR diff --git a/openkb/images.py b/openkb/images.py index 80ef37f..7628414 100644 --- a/openkb/images.py +++ b/openkb/images.py @@ -67,11 +67,66 @@ def extract_pdf_images(pdf_path: Path, doc_name: str, images_dir: Path) -> dict[ logger.warning("Failed to save image block on page %d", page_num) continue - rel_path = f"images/{doc_name}/{filename}" + rel_path = f"sources/images/{doc_name}/{filename}" page_images.setdefault(page_num, []).append(rel_path) return page_images +def convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict]: + """Convert a PDF to per-page dicts with text content and images. + + Each dict has ``{"page": int, "content": str, "images": [{"path": str}]}``. + Images are saved to *images_dir* and referenced with wiki-root-relative paths. 
+ """ + images_dir.mkdir(parents=True, exist_ok=True) + pages: list[dict] = [] + img_counter = 0 + + with pymupdf.open(str(pdf_path)) as doc: + for page_idx in range(len(doc)): + page = doc[page_idx] + page_num = page_idx + 1 + parts: list[str] = [] + page_images: list[dict] = [] + + for block in page.get_text("dict")["blocks"]: + if block["type"] == 0: # text block + lines = [] + for line in block["lines"]: + spans_text = "".join(span["text"] for span in line["spans"]) + lines.append(spans_text) + parts.append("\n".join(lines)) + + elif block["type"] == 1: # image block + width = block.get("width", 0) + height = block.get("height", 0) + if width < _MIN_IMAGE_DIM or height < _MIN_IMAGE_DIM: + continue + image_bytes = block.get("image") + if not image_bytes: + continue + try: + pix = pymupdf.Pixmap(image_bytes) + if pix.n > 4: + pix = pymupdf.Pixmap(pymupdf.csRGB, pix) + img_counter += 1 + filename = f"p{page_num}_img{img_counter}.png" + (images_dir / filename).write_bytes(pix.tobytes("png")) + pix = None + img_path = f"sources/images/{doc_name}/{filename}" + parts.append(f"\n![image]({img_path})\n") + page_images.append({"path": img_path}) + except Exception: + logger.warning("Failed to save image block on page %d", page_num) + + pages.append({ + "page": page_num, + "content": "\n".join(parts), + "images": page_images, + }) + return pages + + def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> str: """Convert a PDF to markdown with inline images using pymupdf dict-mode. 
@@ -89,7 +144,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> for page_idx in range(len(doc)): page = doc[page_idx] page_num = page_idx + 1 - parts.append(f"\n\n\n") + parts.append("\n\n") for block in page.get_text("dict")["blocks"]: if block["type"] == 0: # text block @@ -115,7 +170,7 @@ def convert_pdf_with_images(pdf_path: Path, doc_name: str, images_dir: Path) -> filename = f"p{page_num}_img{img_counter}.png" (images_dir / filename).write_bytes(pix.tobytes("png")) pix = None - parts.append(f"\n![image](images/{doc_name}/{filename})\n") + parts.append(f"\n![image](sources/images/{doc_name}/{filename})\n") except Exception: logger.warning("Failed to save image block on page %d", page_num) return "\n".join(parts) @@ -126,7 +181,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str For each ``![alt](data:image/ext;base64,DATA)`` match: - Decode base64 bytes → save to ``images_dir/img_NNN.ext`` - - Replace the link with ``![alt](images/{doc_name}/img_NNN.ext)`` + - Replace the link with ``![alt](sources/images/{doc_name}/img_NNN.ext)`` - On decode failure: log a warning and leave the original text unchanged. """ counter = 0 @@ -150,7 +205,7 @@ def extract_base64_images(markdown: str, doc_name: str, images_dir: Path) -> str images_dir.mkdir(parents=True, exist_ok=True) dest.write_bytes(image_bytes) - new_ref = f"![{alt}](images/{doc_name}/{filename})" + new_ref = f"![{alt}](sources/images/{doc_name}/{filename})" result = result.replace(match.group(0), new_ref, 1) return result @@ -164,14 +219,17 @@ def copy_relative_images( For each ``![alt](relative/path)`` match (skipping http/https and data URIs): - Resolve path relative to ``source_dir`` - Copy to ``images_dir/{filename}`` - - Replace link with ``![alt](images/{doc_name}/{filename})`` + - Replace link with ``![alt](sources/images/{doc_name}/{filename})`` - Missing source file: log a warning and leave the original text unchanged. 
""" result = markdown for match in _RELATIVE_RE.finditer(markdown): alt, rel_path = match.group(1), match.group(2) - src = source_dir / rel_path + src = (source_dir / rel_path).resolve() + if not src.is_relative_to(source_dir.resolve()): + logger.warning("Image path escapes source dir: %s; skipping.", rel_path) + continue if not src.exists(): logger.warning( "Relative image not found: %s; leaving original link.", src @@ -183,7 +241,7 @@ def copy_relative_images( images_dir.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dest) - new_ref = f"![{alt}](images/{doc_name}/{filename})" + new_ref = f"![{alt}](sources/images/{doc_name}/{filename})" result = result.replace(match.group(0), new_ref, 1) return result diff --git a/tests/test_images.py b/tests/test_images.py index 0b3be21..8bbc722 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -44,7 +44,7 @@ def test_single_base64_image_extracted(self, tmp_path): # Result should reference a saved file, not the raw base64 assert "data:image/png;base64," not in result - assert "![alt text](images/doc/img_001.png)" == result + assert "![alt text](sources/images/doc/img_001.png)" == result # File should exist on disk saved = images_dir / "img_001.png" @@ -62,8 +62,8 @@ def test_multiple_base64_images_numbered_sequentially(self, tmp_path): ) result = extract_base64_images(md, "doc", images_dir) - assert "![fig1](images/doc/img_001.png)" in result - assert "![fig2](images/doc/img_002.jpeg)" in result + assert "![fig1](sources/images/doc/img_001.png)" in result + assert "![fig2](sources/images/doc/img_002.jpeg)" in result assert (images_dir / "img_001.png").exists() assert (images_dir / "img_002.jpeg").exists() @@ -92,7 +92,7 @@ def test_mixed_valid_invalid_base64(self, tmp_path, caplog): import logging with caplog.at_level(logging.WARNING, logger="openkb.images"): result = extract_base64_images(md, "doc", images_dir) - assert "![good](images/doc/img_001.png)" in result + assert 
"![good](sources/images/doc/img_001.png)" in result assert f"data:image/png;base64,{bad}" in result @@ -114,7 +114,7 @@ def test_existing_relative_image_copied_and_rewritten(self, tmp_path): md = "![diagram](diagram.png)" result = copy_relative_images(md, source_dir, "doc", images_dir) - assert "![diagram](images/doc/diagram.png)" == result + assert "![diagram](sources/images/doc/diagram.png)" == result assert (images_dir / "diagram.png").read_bytes() == FAKE_PNG def test_missing_relative_image_leaves_original(self, tmp_path, caplog): @@ -163,7 +163,7 @@ def test_multiple_relative_images_all_copied(self, tmp_path): md = "![a](a.png)\n![b](b.jpg)" result = copy_relative_images(md, source_dir, "doc", images_dir) - assert "![a](images/doc/a.png)" in result - assert "![b](images/doc/b.jpg)" in result + assert "![a](sources/images/doc/a.png)" in result + assert "![b](sources/images/doc/b.jpg)" in result assert (images_dir / "a.png").exists() assert (images_dir / "b.jpg").exists() From 7ca95f9e82b4fcf087f7197396e4e4fa55b5acd3 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:33:41 +0800 Subject: [PATCH 07/26] fix: replace concept body on update instead of appending The _CONCEPT_UPDATE_USER prompt asks the LLM for a full rewrite, but _write_concept was appending the rewrite to the existing body, causing content duplication on every concept update. 
--- openkb/agent/compiler.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 73b1a9c..2e4de2b 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -338,7 +338,15 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is end = clean.find("---", 3) if end != -1: clean = clean[end + 3:].lstrip("\n") - existing += f"\n\n{clean}" + # Replace body with LLM rewrite (prompt asks for full rewrite, not delta) + if existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + existing = existing[:end + 3] + "\n\n" + clean + else: + existing = clean + else: + existing = clean if brief and existing.startswith("---"): end = existing.find("---", 3) if end != -1: From d41588a9c43a6ce783bb66f41ab3e83173dd1a45 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:44:11 +0800 Subject: [PATCH 08/26] fix: use json_repair for robust LLM JSON parsing Replace hand-rolled fence stripping with json_repair to handle malformed JSON, missing fences, and prose-wrapped responses from LLMs. Also fixes str.index() ValueError on fenced blocks without newlines. 
--- openkb/agent/compiler.py | 12 ++++++++---- pyproject.toml | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 2e4de2b..05bda6f 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -215,14 +215,18 @@ async def _llm_call_async(model: str, messages: list[dict], step_name: str) -> s def _parse_json(text: str) -> list | dict: - """Parse JSON from LLM response, stripping markdown fences if present.""" + """Parse JSON from LLM response, handling fences, prose, and malformed JSON.""" + from json_repair import repair_json cleaned = text.strip() if cleaned.startswith("```"): - first_nl = cleaned.index("\n") - cleaned = cleaned[first_nl + 1:] + first_nl = cleaned.find("\n") + cleaned = cleaned[first_nl + 1:] if first_nl != -1 else cleaned[3:] if cleaned.endswith("```"): cleaned = cleaned[:-3] - return json.loads(cleaned.strip()) + result = json.loads(repair_json(cleaned.strip())) + if not isinstance(result, (dict, list)): + raise ValueError(f"Expected JSON object or array, got {type(result).__name__}") + return result # --------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 393cbd0..f7c60ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "openai-agents", "pyyaml", "python-dotenv", + "json-repair", ] [project.urls] From 7dd70c648d182438c083746b8b94854e590a717f Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:51:27 +0800 Subject: [PATCH 09/26] fix: use pdf_path.stem for full_text frontmatter path --- openkb/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openkb/indexer.py b/openkb/indexer.py index dd8ddaf..8aed15c 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -92,7 +92,7 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: # Write wiki/summaries/ (no images, just summaries) summaries_dir = kb_dir / "wiki" 
/ "summaries" summaries_dir.mkdir(parents=True, exist_ok=True) - summary_md = render_summary_md(tree, doc_name, doc_id) + summary_md = render_summary_md(tree, pdf_path.stem, doc_id) (summaries_dir / f"{pdf_path.stem}.md").write_text(summary_md, encoding="utf-8") return IndexResult(doc_id=doc_id, description=description, tree=tree) From b90f0b4920b4c4ede8130893f650c79461d64f5e Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:51:53 +0800 Subject: [PATCH 10/26] fix: sanitize concept names before links and index --- openkb/agent/compiler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 05bda6f..8046a22 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -643,16 +643,18 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: continue name, page_content, is_update, brief = r _write_concept(wiki_dir, name, page_content, source_file, is_update, brief=brief) - concept_names.append(name) + safe_name = _sanitize_concept_name(name) + concept_names.append(safe_name) if brief: - concept_briefs_map[name] = brief + concept_briefs_map[safe_name] = brief # --- Step 3b: Process related items (code only, no LLM) --- - for slug in related_items: + sanitized_related = [_sanitize_concept_name(s) for s in related_items] + for slug in sanitized_related: _add_related_link(wiki_dir, slug, doc_name, source_file) # --- Step 3c: Backlink — summary ↔ concepts (code only) --- - all_concept_slugs = concept_names + [s for s in related_items] + all_concept_slugs = concept_names + sanitized_related if all_concept_slugs: _backlink_summary(wiki_dir, doc_name, all_concept_slugs) _backlink_concepts(wiki_dir, doc_name, all_concept_slugs) From 3dd84f3df4ec4686e590322e50ac420d2b0bb3b6 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 08:52:03 +0800 Subject: [PATCH 11/26] fix: pass doc_type and doc_brief in early return paths --- openkb/agent/compiler.py | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 8046a22..e22c4c8 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -547,7 +547,7 @@ async def _compile_concepts( except (json.JSONDecodeError, ValueError) as exc: logger.warning("Failed to parse concepts plan: %s", exc) logger.debug("Raw: %s", plan_raw) - _update_index(wiki_dir, doc_name, []) + _update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type) return # Fallback: if LLM returns a flat list, treat all items as "create" @@ -565,7 +565,7 @@ async def _compile_concepts( related_items = plan["related"] if not create_items and not update_items and not related_items: - _update_index(wiki_dir, doc_name, []) + _update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type) return # --- Step 3: Generate/update concept pages concurrently (A cached) --- From ef60f7d9a3f3474404dd9f30621a8f6437ea3655 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 09:14:47 +0800 Subject: [PATCH 12/26] fix: sanitize concept name in _gen_update and correct _update_index docstring --- openkb/agent/compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index e22c4c8..b8f8f98 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -468,7 +468,7 @@ def _update_index( When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries are written as ``- [[link]] (type) — brief text``. Existing entries are - detected by the link part only, so updating a brief on a re-compile works. + detected by the link part only and skipped to avoid duplicates. ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the query agent knows how to access detailed content. 
""" @@ -595,7 +595,7 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: name = concept["name"] title = concept.get("title", name) - concept_path = wiki_dir / "concepts" / f"{name}.md" + concept_path = wiki_dir / "concepts" / f"{_sanitize_concept_name(name)}.md" if concept_path.exists(): raw_text = concept_path.read_text(encoding="utf-8") if raw_text.startswith("---"): From 8818adaaee9c57b657b6df53430c479d043ff214 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 10:34:15 +0800 Subject: [PATCH 13/26] fix: update existing concept briefs in index.md instead of skipping --- openkb/agent/compiler.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index b8f8f98..d5d80cf 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -494,10 +494,18 @@ def _update_index( for name in concept_names: concept_link = f"[[concepts/{name}]]" - if concept_link not in text: - concept_entry = f"- {concept_link}" + concept_entry = f"- {concept_link}" + if name in concept_briefs: + concept_entry += f" — {concept_briefs[name]}" + if concept_link in text: if name in concept_briefs: - concept_entry += f" — {concept_briefs[name]}" + lines = text.split("\n") + for i, line in enumerate(lines): + if concept_link in line: + lines[i] = concept_entry + break + text = "\n".join(lines) + else: if "## Concepts" in text: text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) From aabcf5f3c5b2eec9859cde05a3a82dde12d5d969 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 10:35:16 +0800 Subject: [PATCH 14/26] fix: preserve non-ASCII characters in concept name slugs --- openkb/agent/compiler.py | 4 +++- tests/test_compiler.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 
d5d80cf..e59b9c5 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -16,6 +16,7 @@ import sys import threading import time +import unicodedata from pathlib import Path import litellm @@ -302,11 +303,12 @@ def _write_summary(wiki_dir: Path, doc_name: str, summary: str, (summaries_dir / f"{doc_name}.md").write_text(frontmatter + summary, encoding="utf-8") -_SAFE_NAME_RE = re.compile(r'[^a-zA-Z0-9_\-]') +_SAFE_NAME_RE = re.compile(r'[^\w\-]') def _sanitize_concept_name(name: str) -> str: """Sanitize a concept name for safe use as a filename.""" + name = unicodedata.normalize("NFKC", name) sanitized = _SAFE_NAME_RE.sub("-", name).strip("-") return sanitized or "unnamed-concept" diff --git a/tests/test_compiler.py b/tests/test_compiler.py index a895b79..645b5dc 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -12,6 +12,7 @@ compile_short_doc, _compile_concepts, _parse_json, + _sanitize_concept_name, _write_summary, _write_concept, _update_index, @@ -74,6 +75,39 @@ def test_plain_text_fallback(self): _parse_json("Just plain markdown text without JSON") +class TestSanitizeConceptName: + def test_ascii_passthrough(self): + assert _sanitize_concept_name("hello-world") == "hello-world" + + def test_spaces_replaced(self): + assert _sanitize_concept_name("hello world") == "hello-world" + + def test_chinese(self): + result = _sanitize_concept_name("注意力机制") + assert result == "注意力机制" + + def test_japanese(self): + result = _sanitize_concept_name("トランスフォーマー") + assert result == "トランスフォーマー" + + def test_french_accents(self): + result = _sanitize_concept_name("réseau neuronal") + assert "r" in result + assert result != "r-seau-neuronal" # accented chars preserved, not stripped + + def test_distinct_chinese_names_no_collision(self): + a = _sanitize_concept_name("注意力机制") + b = _sanitize_concept_name("变压器模型") + assert a != b + + def test_empty_fallback(self): + assert _sanitize_concept_name("!!!") == "unnamed-concept" + + def 
test_nfkc_normalization(self): + # U+FF21 (fullwidth A) should normalize to regular A + assert _sanitize_concept_name("\uff21\uff22") == "AB" + + class TestWriteSummary: def test_writes_with_frontmatter(self, tmp_path): wiki = tmp_path / "wiki" From 9df6e6c5df9aa5378d5568e12c2a68b2c92930a6 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 10:44:49 +0800 Subject: [PATCH 15/26] fix: always replace concept body on update, not only when source is new --- openkb/agent/compiler.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index e59b9c5..3702af0 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -338,21 +338,21 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is existing = fm + body else: existing = f"---\nsources: [{source_file}]\n---\n\n" + existing - # Strip frontmatter from LLM content to avoid duplicate blocks - clean = content - if clean.startswith("---"): - end = clean.find("---", 3) - if end != -1: - clean = clean[end + 3:].lstrip("\n") - # Replace body with LLM rewrite (prompt asks for full rewrite, not delta) - if existing.startswith("---"): - end = existing.find("---", 3) - if end != -1: - existing = existing[:end + 3] + "\n\n" + clean - else: - existing = clean + # Strip frontmatter from LLM content to avoid duplicate blocks + clean = content + if clean.startswith("---"): + end = clean.find("---", 3) + if end != -1: + clean = clean[end + 3:].lstrip("\n") + # Replace body with LLM rewrite (prompt asks for full rewrite, not delta) + if existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + existing = existing[:end + 3] + "\n\n" + clean else: existing = clean + else: + existing = clean if brief and existing.startswith("---"): end = existing.find("---", 3) if end != -1: From ef235d228b8f14aa6c07de34a3e0ac2b2e6e648e Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 
11:07:44 +0800 Subject: [PATCH 16/26] Fix concept index updates by section --- openkb/agent/compiler.py | 73 ++++++++++++++++++++++++++++++++-------- tests/test_compiler.py | 48 ++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 14 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 3702af0..f0bd0e0 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -284,6 +284,55 @@ def _read_concept_briefs(wiki_dir: Path) -> str: return "\n".join(lines) or "(none yet)" +def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None: + """Return the [start, end) bounds for a Markdown H2 section.""" + for i, line in enumerate(lines): + if line == heading: + start = i + 1 + end = len(lines) + for j in range(start, len(lines)): + if lines[j].startswith("## "): + end = j + break + return start, end + return None + + +def _section_contains_link(lines: list[str], heading: str, link: str) -> bool: + """Check whether a wikilink already exists inside the named section.""" + bounds = _get_section_bounds(lines, heading) + if bounds is None: + return False + + start, end = bounds + return any(link in line for line in lines[start:end]) + + +def _replace_section_entry(lines: list[str], heading: str, link: str, entry: str) -> bool: + """Replace the first matching entry within a specific section.""" + bounds = _get_section_bounds(lines, heading) + if bounds is None: + return False + + start, end = bounds + for i in range(start, end): + if link in lines[i]: + lines[i] = entry + return True + return False + + +def _insert_section_entry(lines: list[str], heading: str, entry: str) -> bool: + """Insert a new entry at the top of a specific section.""" + bounds = _get_section_bounds(lines, heading) + if bounds is None: + return False + + start, _ = bounds + lines.insert(start, entry) + return True + + def _write_summary(wiki_dir: Path, doc_name: str, summary: str, doc_type: str = "short") -> None: @@ -469,8 +518,9 @@ def 
_update_index( """Append document and concept entries to index.md. When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries - are written as ``- [[link]] (type) — brief text``. Existing entries are - detected by the link part only and skipped to avoid duplicates. + are written as ``- [[link]] (type) — brief text``. Existing entries are + detected within their own section by the link part only and skipped to + avoid duplicates. ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the query agent knows how to access detailed content. """ @@ -485,32 +535,27 @@ def _update_index( ) text = index_path.read_text(encoding="utf-8") + lines = text.split("\n") doc_link = f"[[summaries/{doc_name}]]" - if doc_link not in text: + if not _section_contains_link(lines, "## Documents", doc_link): doc_entry = f"- {doc_link} ({doc_type})" if doc_brief: doc_entry += f" — {doc_brief}" - if "## Documents" in text: - text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) + _insert_section_entry(lines, "## Documents", doc_entry) for name in concept_names: concept_link = f"[[concepts/{name}]]" concept_entry = f"- {concept_link}" if name in concept_briefs: concept_entry += f" — {concept_briefs[name]}" - if concept_link in text: + if _section_contains_link(lines, "## Concepts", concept_link): if name in concept_briefs: - lines = text.split("\n") - for i, line in enumerate(lines): - if concept_link in line: - lines[i] = concept_entry - break - text = "\n".join(lines) + _replace_section_entry(lines, "## Concepts", concept_link, concept_entry) else: - if "## Concepts" in text: - text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) + _insert_section_entry(lines, "## Concepts", concept_entry) + text = "\n".join(lines) index_path.write_text(text, encoding="utf-8") diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 645b5dc..c473155 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -221,6 
+221,54 @@ def test_backwards_compat_no_briefs(self, tmp_path): assert "[[summaries/my-doc]]" in text assert "[[concepts/attention]]" in text + def test_updates_concept_brief_only_inside_concepts_section(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n" + "## Documents\n" + "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here\n\n" + "## Concepts\n" + "- [[concepts/attention]] — Old brief\n\n" + "## Explorations\n", + encoding="utf-8", + ) + + _update_index( + wiki, + "my-doc", + ["attention"], + concept_briefs={"attention": "New brief"}, + ) + + text = (wiki / "index.md").read_text() + assert "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here" in text + assert "- [[concepts/attention]] — New brief" in text + assert "- [[concepts/attention]] — Old brief" not in text + + def test_adds_concept_entry_when_link_exists_outside_concepts_section(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n" + "## Documents\n" + "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here\n\n" + "## Concepts\n\n" + "## Explorations\n", + encoding="utf-8", + ) + + _update_index( + wiki, + "my-doc", + ["attention"], + concept_briefs={"attention": "New brief"}, + ) + + text = (wiki / "index.md").read_text() + assert "- [[summaries/my-doc]] (short) — Mentions [[concepts/attention]] here" in text + assert "- [[concepts/attention]] — New brief" in text + class TestReadWikiContext: def test_empty_wiki(self, tmp_path): From 3e3d56f86b141e32d0d804508e14196d13b30d73 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 11:24:18 +0800 Subject: [PATCH 17/26] Fix exact concept index row matching --- openkb/agent/compiler.py | 55 ++++++++++++++++++++++++++++++---------- tests/test_compiler.py | 20 +++++++++++++++ 2 files changed, 61 insertions(+), 14 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 
3702af0..5738c0f 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -461,6 +461,35 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) path.write_text(text, encoding="utf-8") +def _find_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None: + """Return [start, end) line indexes for a section headed by ``heading``.""" + for i, line in enumerate(lines): + if line != heading: + continue + start = i + 1 + end = len(lines) + for j in range(start, len(lines)): + if lines[j].startswith("## "): + end = j + break + return start, end + return None + + +def _find_index_entry_line(lines: list[str], heading: str, link: str) -> int | None: + """Find an index entry that starts with ``- {link}`` inside one section only.""" + bounds = _find_section_bounds(lines, heading) + if bounds is None: + return None + + start, end = bounds + entry_prefix = f"- {link}" + for i in range(start, end): + if lines[i].startswith(entry_prefix): + return i + return None + + def _update_index( wiki_dir: Path, doc_name: str, concept_names: list[str], doc_brief: str = "", concept_briefs: dict[str, str] | None = None, @@ -484,34 +513,32 @@ def _update_index( encoding="utf-8", ) - text = index_path.read_text(encoding="utf-8") + lines = index_path.read_text(encoding="utf-8").split("\n") doc_link = f"[[summaries/{doc_name}]]" - if doc_link not in text: + if _find_index_entry_line(lines, "## Documents", doc_link) is None: doc_entry = f"- {doc_link} ({doc_type})" if doc_brief: doc_entry += f" — {doc_brief}" - if "## Documents" in text: - text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) + doc_bounds = _find_section_bounds(lines, "## Documents") + if doc_bounds is not None: + lines.insert(doc_bounds[0], doc_entry) for name in concept_names: concept_link = f"[[concepts/{name}]]" concept_entry = f"- {concept_link}" if name in concept_briefs: concept_entry += f" — {concept_briefs[name]}" - if concept_link in text: + 
concept_line = _find_index_entry_line(lines, "## Concepts", concept_link) + if concept_line is not None: if name in concept_briefs: - lines = text.split("\n") - for i, line in enumerate(lines): - if concept_link in line: - lines[i] = concept_entry - break - text = "\n".join(lines) + lines[concept_line] = concept_entry else: - if "## Concepts" in text: - text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) + concept_bounds = _find_section_bounds(lines, "## Concepts") + if concept_bounds is not None: + lines.insert(concept_bounds[0], concept_entry) - index_path.write_text(text, encoding="utf-8") + index_path.write_text("\n".join(lines), encoding="utf-8") # --------------------------------------------------------------------------- diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 645b5dc..3c8ea5e 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -198,6 +198,26 @@ def test_appends_entries_with_briefs(self, tmp_path): assert "[[concepts/attention]] — Focus mechanism" in text assert "[[concepts/transformer]] — NN architecture" in text + def test_updates_only_exact_concept_row(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n" + "- [[concepts/transformer]] — Uses [[concepts/attention]] internally\n" + "- [[concepts/attention]] — Old brief\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index( + wiki, + "my-doc", + ["attention"], + concept_briefs={"attention": "New brief"}, + ) + text = (wiki / "index.md").read_text() + assert "- [[concepts/transformer]] — Uses [[concepts/attention]] internally" in text + assert "- [[concepts/attention]] — New brief" in text + assert text.count("[[concepts/attention]] — New brief") == 1 + def test_no_duplicates(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() From ed0d6baeaadb4b53f5a1fb38dfb9a8cf7d3896d9 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 11:24:18 +0800 
Subject: [PATCH 18/26] Fix exact concept index row matching --- openkb/agent/compiler.py | 17 ++++++++--------- tests/test_compiler.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index f0bd0e0..d94a558 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -299,13 +299,14 @@ def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | Non def _section_contains_link(lines: list[str], heading: str, link: str) -> bool: - """Check whether a wikilink already exists inside the named section.""" + """Check whether an index entry already exists inside the named section.""" bounds = _get_section_bounds(lines, heading) if bounds is None: return False start, end = bounds - return any(link in line for line in lines[start:end]) + entry_prefix = f"- {link}" + return any(line.startswith(entry_prefix) for line in lines[start:end]) def _replace_section_entry(lines: list[str], heading: str, link: str, entry: str) -> bool: @@ -315,8 +316,9 @@ def _replace_section_entry(lines: list[str], heading: str, link: str, entry: str return False start, end = bounds + entry_prefix = f"- {link}" for i in range(start, end): - if link in lines[i]: + if lines[i].startswith(entry_prefix): lines[i] = entry return True return False @@ -509,7 +511,6 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) text += f"\n\n## Related Documents\n- {link}\n" path.write_text(text, encoding="utf-8") - def _update_index( wiki_dir: Path, doc_name: str, concept_names: list[str], doc_brief: str = "", concept_briefs: dict[str, str] | None = None, @@ -519,7 +520,7 @@ def _update_index( When ``doc_brief`` or entries in ``concept_briefs`` are provided, entries are written as ``- [[link]] (type) — brief text``. 
Existing entries are - detected within their own section by the link part only and skipped to + detected within their own section by exact entry prefix and skipped to avoid duplicates. ``doc_type`` is ``"short"`` or ``"pageindex"`` — shown in the entry so the query agent knows how to access detailed content. @@ -534,8 +535,7 @@ def _update_index( encoding="utf-8", ) - text = index_path.read_text(encoding="utf-8") - lines = text.split("\n") + lines = index_path.read_text(encoding="utf-8").split("\n") doc_link = f"[[summaries/{doc_name}]]" if not _section_contains_link(lines, "## Documents", doc_link): @@ -555,8 +555,7 @@ def _update_index( else: _insert_section_entry(lines, "## Concepts", concept_entry) - text = "\n".join(lines) - index_path.write_text(text, encoding="utf-8") + index_path.write_text("\n".join(lines), encoding="utf-8") # --------------------------------------------------------------------------- diff --git a/tests/test_compiler.py b/tests/test_compiler.py index c473155..2a2e82d 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -198,6 +198,26 @@ def test_appends_entries_with_briefs(self, tmp_path): assert "[[concepts/attention]] — Focus mechanism" in text assert "[[concepts/transformer]] — NN architecture" in text + def test_updates_only_exact_concept_row(self, tmp_path): + wiki = tmp_path / "wiki" + wiki.mkdir() + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n" + "- [[concepts/transformer]] — Uses [[concepts/attention]] internally\n" + "- [[concepts/attention]] — Old brief\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index( + wiki, + "my-doc", + ["attention"], + concept_briefs={"attention": "New brief"}, + ) + text = (wiki / "index.md").read_text() + assert "- [[concepts/transformer]] — Uses [[concepts/attention]] internally" in text + assert "- [[concepts/attention]] — New brief" in text + assert text.count("[[concepts/attention]] — New brief") == 1 + def test_no_duplicates(self, tmp_path): 
wiki = tmp_path / "wiki" wiki.mkdir() From b6f6ba38810c9eecf7829f6e7aff8dc2aee2dcea Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 11:31:49 +0800 Subject: [PATCH 19/26] Revert "Fix exact concept index row matching" This reverts commit 3e3d56f86b141e32d0d804508e14196d13b30d73. --- openkb/agent/compiler.py | 55 ++++++++++------------------------------ tests/test_compiler.py | 20 --------------- 2 files changed, 14 insertions(+), 61 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 5738c0f..3702af0 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -461,35 +461,6 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) path.write_text(text, encoding="utf-8") -def _find_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | None: - """Return [start, end) line indexes for a section headed by ``heading``.""" - for i, line in enumerate(lines): - if line != heading: - continue - start = i + 1 - end = len(lines) - for j in range(start, len(lines)): - if lines[j].startswith("## "): - end = j - break - return start, end - return None - - -def _find_index_entry_line(lines: list[str], heading: str, link: str) -> int | None: - """Find an index entry that starts with ``- {link}`` inside one section only.""" - bounds = _find_section_bounds(lines, heading) - if bounds is None: - return None - - start, end = bounds - entry_prefix = f"- {link}" - for i in range(start, end): - if lines[i].startswith(entry_prefix): - return i - return None - - def _update_index( wiki_dir: Path, doc_name: str, concept_names: list[str], doc_brief: str = "", concept_briefs: dict[str, str] | None = None, @@ -513,32 +484,34 @@ def _update_index( encoding="utf-8", ) - lines = index_path.read_text(encoding="utf-8").split("\n") + text = index_path.read_text(encoding="utf-8") doc_link = f"[[summaries/{doc_name}]]" - if _find_index_entry_line(lines, "## Documents", doc_link) is None: + if doc_link not in 
text: doc_entry = f"- {doc_link} ({doc_type})" if doc_brief: doc_entry += f" — {doc_brief}" - doc_bounds = _find_section_bounds(lines, "## Documents") - if doc_bounds is not None: - lines.insert(doc_bounds[0], doc_entry) + if "## Documents" in text: + text = text.replace("## Documents\n", f"## Documents\n{doc_entry}\n", 1) for name in concept_names: concept_link = f"[[concepts/{name}]]" concept_entry = f"- {concept_link}" if name in concept_briefs: concept_entry += f" — {concept_briefs[name]}" - concept_line = _find_index_entry_line(lines, "## Concepts", concept_link) - if concept_line is not None: + if concept_link in text: if name in concept_briefs: - lines[concept_line] = concept_entry + lines = text.split("\n") + for i, line in enumerate(lines): + if concept_link in line: + lines[i] = concept_entry + break + text = "\n".join(lines) else: - concept_bounds = _find_section_bounds(lines, "## Concepts") - if concept_bounds is not None: - lines.insert(concept_bounds[0], concept_entry) + if "## Concepts" in text: + text = text.replace("## Concepts\n", f"## Concepts\n{concept_entry}\n", 1) - index_path.write_text("\n".join(lines), encoding="utf-8") + index_path.write_text(text, encoding="utf-8") # --------------------------------------------------------------------------- diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 3c8ea5e..645b5dc 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -198,26 +198,6 @@ def test_appends_entries_with_briefs(self, tmp_path): assert "[[concepts/attention]] — Focus mechanism" in text assert "[[concepts/transformer]] — NN architecture" in text - def test_updates_only_exact_concept_row(self, tmp_path): - wiki = tmp_path / "wiki" - wiki.mkdir() - (wiki / "index.md").write_text( - "# Index\n\n## Documents\n\n## Concepts\n" - "- [[concepts/transformer]] — Uses [[concepts/attention]] internally\n" - "- [[concepts/attention]] — Old brief\n\n## Explorations\n", - encoding="utf-8", - ) - _update_index( - wiki, - 
"my-doc", - ["attention"], - concept_briefs={"attention": "New brief"}, - ) - text = (wiki / "index.md").read_text() - assert "- [[concepts/transformer]] — Uses [[concepts/attention]] internally" in text - assert "- [[concepts/attention]] — New brief" in text - assert text.count("[[concepts/attention]] — New brief") == 1 - def test_no_duplicates(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() From dc356251f4170ca0186e75f4887a96268eaec3b9 Mon Sep 17 00:00:00 2001 From: Ray Date: Fri, 10 Apr 2026 20:51:24 +0800 Subject: [PATCH 20/26] Simplify init prompts and capture API key to .env Drop the language and pageindex_threshold prompts from `openkb init`; both fall back to config defaults and can be edited later in `.openkb/config.yaml`. In their place, add an interactive API key prompt that writes `LLM_API_KEY` to `./.env` (chmod 0600) when the user provides one, so first-time setup no longer requires a separate manual step. Also polish the model prompt with provider examples and a link to LiteLLM for others. --- openkb/cli.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 3683371..32bdaaa 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -256,22 +256,23 @@ def init(): return # Interactive prompts + click.echo("Pick an LLM in `provider/model` LiteLLM format:") + click.echo(" OpenAI: gpt-5.4-mini, gpt-5.4") + click.echo(" Anthropic: anthropic/claude-sonnet-4-6, anthropic/claude-opus-4-6") + click.echo(" Gemini: gemini/gemini-3.1-pro-preview, gemini/gemini-3-flash-preview") + click.echo(" Others: see https://docs.litellm.ai/docs/providers") + click.echo() model = click.prompt( - f"Model (e.g. 
gpt-5.4-mini, anthropic/claude-sonnet-4-6) [default: {DEFAULT_CONFIG['model']}]", + f"Model (enter for default {DEFAULT_CONFIG['model']})", default=DEFAULT_CONFIG["model"], show_default=False, ) - language = click.prompt( - f"Language [default: {DEFAULT_CONFIG['language']}]", - default=DEFAULT_CONFIG["language"], + api_key = click.prompt( + "LLM API Key (saved to .env, enter to skip)", + default="", + hide_input=True, show_default=False, - ) - pageindex_threshold = click.prompt( - f"PageIndex threshold (pages) [default: {DEFAULT_CONFIG['pageindex_threshold']}]", - default=DEFAULT_CONFIG["pageindex_threshold"], - type=int, - show_default=False, - ) + ).strip() # Create directory structure Path("raw").mkdir(exist_ok=True) Path("wiki/sources/images").mkdir(parents=True, exist_ok=True) @@ -290,12 +291,22 @@ def init(): openkb_dir.mkdir() config = { "model": model, - "language": language, - "pageindex_threshold": pageindex_threshold, + "language": DEFAULT_CONFIG["language"], + "pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"], } save_config(openkb_dir / "config.yaml", config) (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8") + # Write API key to KB-local .env (0600) if the user provided one + if api_key: + env_path = Path(".env") + if env_path.exists(): + click.echo(".env already exists, skipping write. Add LLM_API_KEY manually if needed.") + else: + env_path.write_text(f"LLM_API_KEY={api_key}\n", encoding="utf-8") + os.chmod(env_path, 0o600) + click.echo("Saved LLM API key to .env.") + # Register this KB in the global config register_kb(Path.cwd()) From 8c5bc2f687fcc76e7981c5c98888bdadd5509d0f Mon Sep 17 00:00:00 2001 From: Ray Date: Sat, 11 Apr 2026 01:24:55 +0800 Subject: [PATCH 21/26] Use cloud OCR for per-page content in cloud mode When PAGEINDEX_API_KEY is set, index_long_document now fetches per-page markdown via col.get_page_content() instead of running local pymupdf. 
Cloud OCR produces cleaner output (preserves tables, math, and section headers) than raw pymupdf text extraction. Falls back to local pymupdf if the cloud call raises or returns an empty result. --- openkb/indexer.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/openkb/indexer.py b/openkb/indexer.py index 8aed15c..6ea9d73 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -77,13 +77,28 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: "structure": structure, } - # Write wiki/sources/ — extract per-page content with pymupdf (not PageIndex) + # Write wiki/sources/ — per-page content sources_dir = kb_dir / "wiki" / "sources" sources_dir.mkdir(parents=True, exist_ok=True) images_dir = sources_dir / "images" / pdf_path.stem from openkb.images import convert_pdf_to_pages - all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir) + + all_pages: list = [] + if pageindex_api_key: + # Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content + # requires a page range, so pass "1-N". 
+ from openkb.converter import get_pdf_page_count + page_count = get_pdf_page_count(pdf_path) + try: + all_pages = col.get_page_content(doc_id, f"1-{page_count}") + except Exception as exc: + logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc) + + if not all_pages: + if pageindex_api_key: + logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name) + all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir) (sources_dir / f"{pdf_path.stem}.json").write_text( json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", From e0ab3f97537d7357714e6917e516b07cda0c9810 Mon Sep 17 00:00:00 2001 From: Ray Date: Sat, 11 Apr 2026 01:31:32 +0800 Subject: [PATCH 22/26] Bump pageindex to 0.3.0.dev1 Picks up the cloud add_document poll fix from VectifyAI/PageIndex#226, which switches the readiness signal from retrieval_ready to status == "completed". --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f7c60ac..264ab9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ ] keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "agents", "document"] dependencies = [ - "pageindex==0.3.0.dev0", + "pageindex==0.3.0.dev1", "markitdown[all]", "click>=8.0", "watchdog>=3.0", From b77e95d3b7a790887830baa01379013a92f87a1d Mon Sep 17 00:00:00 2001 From: Ray Date: Sat, 11 Apr 2026 01:31:32 +0800 Subject: [PATCH 23/26] Silence import-time warnings from pydub Move warnings.filterwarnings("ignore") to before the module imports so pydub's missing-ffmpeg RuntimeWarning, emitted when markitdown pulls it in, is suppressed. The existing post-import call is kept because markitdown clobbers the filter state during its own import. 
--- openkb/cli.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/openkb/cli.py b/openkb/cli.py index 32bdaaa..550ee5c 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -1,6 +1,12 @@ """OpenKB CLI — command-line interface for the knowledge base workflow.""" from __future__ import annotations +# Silence import-time warnings (e.g. pydub's missing-ffmpeg warning emitted +# when markitdown pulls it in). markitdown later clobbers the filters during +# its own import, so we re-apply after all imports below. +import warnings +warnings.filterwarnings("ignore") + import asyncio import json import logging From fde9b6dd17f27750b143600562c5d4148c50517b Mon Sep 17 00:00:00 2001 From: "ray.zhang" Date: Sat, 11 Apr 2026 12:49:24 +0800 Subject: [PATCH 24/26] feat: add SQLite-backed registry --- openkb/cli.py | 31 ++++---- openkb/config.py | 1 + openkb/converter.py | 7 +- openkb/state.py | 168 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 186 insertions(+), 21 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 550ee5c..40da049 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -138,14 +138,15 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: 4. Else: compile_short_doc. """ from openkb.agent.compiler import compile_long_doc, compile_short_doc - from openkb.state import HashRegistry + from openkb.state import get_registry logger = logging.getLogger(__name__) openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) - registry = HashRegistry(openkb_dir / "hashes.json") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) # 2. 
Convert document click.echo(f"Adding: {file_path.name}") @@ -299,9 +300,10 @@ def init(): "model": model, "language": DEFAULT_CONFIG["language"], "pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"], + "storage_backend": DEFAULT_CONFIG["storage_backend"], } save_config(openkb_dir / "config.yaml", config) - (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8") + # The SQLite DB is created automatically by get_registry() on first access; no need to pre-create it # Write API key to KB-local .env (0600) if the user provided one if api_key: @@ -478,13 +480,13 @@ def list_cmd(ctx): click.echo("No knowledge base found. Run `openkb init` first.") return - openkb_dir = kb_dir / ".openkb" - hashes_file = openkb_dir / "hashes.json" - if not hashes_file.exists(): - click.echo("No documents indexed yet.") - return + from openkb.state import get_registry - hashes = json.loads(hashes_file.read_text(encoding="utf-8")) + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) + hashes = registry.all_entries() if not hashes: click.echo("No documents indexed yet.") return @@ -561,11 +563,14 @@ def status(ctx): click.echo(f" {'raw':<20} {raw_count:<10}") # Hash registry summary + from openkb.state import get_registry + openkb_dir = kb_dir / ".openkb" - hashes_file = openkb_dir / "hashes.json" - if hashes_file.exists(): - hashes = json.loads(hashes_file.read_text(encoding="utf-8")) - click.echo(f"\n Total indexed: {len(hashes)} document(s)") + config = load_config(openkb_dir / "config.yaml") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) + hashes = registry.all_entries() + click.echo(f"\n Total indexed: {len(hashes)} document(s)") # Last compile time: newest file in wiki/summaries/ summaries_dir = wiki_dir / "summaries" diff --git a/openkb/config.py b/openkb/config.py index b83e134..4c2169a 100644 --- a/openkb/config.py
+++ b/openkb/config.py @@ -9,6 +9,7 @@ "model": "gpt-5.4-mini", "language": "en", "pageindex_threshold": 20, + "storage_backend": "sqlite", } GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb" diff --git a/openkb/converter.py b/openkb/converter.py index 3f5f529..51359a6 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -11,7 +11,7 @@ from openkb.config import load_config from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images -from openkb.state import HashRegistry +from openkb.state import get_registry logger = logging.getLogger(__name__) @@ -50,12 +50,13 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") threshold: int = config.get("pageindex_threshold", 20) - registry = HashRegistry(openkb_dir / "hashes.json") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) # ------------------------------------------------------------------ # 1. 
Hash check # ------------------------------------------------------------------ - file_hash = HashRegistry.hash_file(src) + file_hash = registry.hash_file(src) if registry.is_known(file_hash): logger.info("Skipping already-known file: %s", src.name) return ConvertResult(skipped=True) diff --git a/openkb/state.py b/openkb/state.py index 9381606..dc9cd6a 100644 --- a/openkb/state.py +++ b/openkb/state.py @@ -2,7 +2,19 @@ import hashlib import json +import sqlite3 +from contextlib import contextmanager from pathlib import Path +from typing import Iterator + + +def _hash_file(path: Path) -> str: + """Return the SHA-256 hex digest (64 chars) of the file at path.""" + h = hashlib.sha256() + with path.open("rb") as fh: + for chunk in iter(lambda: fh.read(65536), b""): + h.update(chunk) + return h.hexdigest() class HashRegistry: @@ -57,8 +69,154 @@ def _persist(self) -> None: @staticmethod def hash_file(path: Path) -> str: """Return the SHA-256 hex digest (64 chars) of the file at path.""" - h = hashlib.sha256() - with path.open("rb") as fh: - for chunk in iter(lambda: fh.read(65536), b""): - h.update(chunk) - return h.hexdigest() + return _hash_file(path) + + +class DbRegistry: + """SQLite-backed registry mapping file SHA-256 hashes to metadata dicts. + + Provides better scalability, concurrency support, and extensibility + compared to JSON-backed HashRegistry. + """ + + def __init__(self, path: Path, migrate_from: Path | None = None) -> None: + """Initialize DbRegistry. + + Args: + path: Path to SQLite database file. + migrate_from: Optional path to JSON file to migrate from. + Migration only happens if DB doesn't exist yet. 
+ """ + self._path = path + should_migrate = migrate_from is not None and not path.exists() + self._init_db() + if should_migrate: + self._migrate_from_json(migrate_from) + + def _migrate_from_json(self, json_path: Path) -> None: + """Migrate data from JSON file to SQLite database.""" + if not json_path.exists(): + return + + with json_path.open("r", encoding="utf-8") as fh: + data: dict[str, dict] = json.load(fh) + + with self._connect() as conn: + for file_hash, metadata in data.items(): + metadata_json = json.dumps(metadata, ensure_ascii=False) + conn.execute(""" + INSERT OR REPLACE INTO registry (file_hash, metadata_json) + VALUES (?, ?) + """, (file_hash, metadata_json)) + + def _init_db(self) -> None: + """Initialize database schema if not exists.""" + self._path.parent.mkdir(parents=True, exist_ok=True) + + with self._connect() as conn: + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA foreign_keys=ON") + conn.execute(""" + CREATE TABLE IF NOT EXISTS registry ( + file_hash TEXT PRIMARY KEY, + metadata_json TEXT NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT DEFAULT CURRENT_TIMESTAMP + ) + """) + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_created_at ON registry(created_at) + """) + + @contextmanager + def _connect(self) -> Iterator[sqlite3.Connection]: + """Context manager for database connections.""" + conn = sqlite3.connect(str(self._path)) + try: + yield conn + conn.commit() + finally: + conn.close() + + def is_known(self, file_hash: str) -> bool: + """Return True if file_hash is already registered.""" + with self._connect() as conn: + cursor = conn.execute( + "SELECT 1 FROM registry WHERE file_hash = ?", + (file_hash,) + ) + return cursor.fetchone() is not None + + def get(self, file_hash: str) -> dict | None: + """Return metadata for file_hash, or None if not found.""" + with self._connect() as conn: + cursor = conn.execute( + "SELECT metadata_json FROM registry WHERE file_hash = ?", + (file_hash,) + ) + 
row = cursor.fetchone() + if row is None: + return None + return json.loads(row[0]) + + def all_entries(self) -> dict[str, dict]: + """Return a shallow copy of all hash -> metadata entries.""" + with self._connect() as conn: + cursor = conn.execute( + "SELECT file_hash, metadata_json FROM registry" + ) + return { + row[0]: json.loads(row[1]) + for row in cursor.fetchall() + } + + def add(self, file_hash: str, metadata: dict) -> None: + """Register file_hash with metadata and persist to disk. + + If file_hash already exists, updates the metadata. + """ + metadata_json = json.dumps(metadata, ensure_ascii=False) + with self._connect() as conn: + conn.execute(""" + INSERT INTO registry (file_hash, metadata_json, updated_at) + VALUES (?, ?, CURRENT_TIMESTAMP) + ON CONFLICT(file_hash) DO UPDATE SET + metadata_json = excluded.metadata_json, + updated_at = CURRENT_TIMESTAMP + """, (file_hash, metadata_json)) + + @staticmethod + def hash_file(path: Path) -> str: + """Return the SHA-256 hex digest (64 chars) of the file at path.""" + return _hash_file(path) + + +def get_registry( + openkb_dir: Path, + backend: str = "sqlite", +) -> HashRegistry | DbRegistry: + """Factory function to get the appropriate registry implementation. + + Args: + openkb_dir: Path to .openkb directory. + backend: Storage backend - "sqlite" or "json". + + Returns: + HashRegistry for "json" backend, DbRegistry for "sqlite" backend. + + When switching from json to sqlite and a JSON file exists, + automatically migrates the data. 
+ """ + if backend not in ("sqlite", "json"): + raise ValueError(f"Unknown storage_backend: {backend!r}") + + if backend == "json": + return HashRegistry(openkb_dir / "hashes.json") + + db_path = openkb_dir / "hashes.db" + json_path = openkb_dir / "hashes.json" + + if json_path.exists() and not db_path.exists(): + return DbRegistry(db_path, migrate_from=json_path) + + return DbRegistry(db_path) From 6dad765298ce12234fc6487198d7117daf02a0f7 Mon Sep 17 00:00:00 2001 From: "ray.zhang" Date: Sat, 11 Apr 2026 12:49:24 +0800 Subject: [PATCH 25/26] feat: add SQLite backend and migration tests --- tests/test_cli.py | 9 +- tests/test_config_storage_backend.py | 37 ++++++ tests/test_converter.py | 5 +- tests/test_db_registry.py | 172 +++++++++++++++++++++++++++ tests/test_migration.py | 74 ++++++++++++ tests/test_state.py | 47 ++++++++ 6 files changed, 338 insertions(+), 6 deletions(-) create mode 100644 tests/test_config_storage_backend.py create mode 100644 tests/test_db_registry.py create mode 100644 tests/test_migration.py diff --git a/tests/test_cli.py b/tests/test_cli.py index afb961d..407896d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,7 @@ from unittest.mock import patch import pytest +import yaml from click.testing import CliRunner from openkb.cli import cli @@ -30,11 +31,11 @@ def test_init_creates_structure(tmp_path): assert (cwd / "wiki" / "log.md").is_file() assert (cwd / "wiki" / "index.md").is_file() assert (cwd / ".openkb" / "config.yaml").is_file() - assert (cwd / ".openkb" / "hashes.json").is_file() + # The SQLite DB is created lazily by get_registry() on first access + assert not (cwd / ".openkb" / "hashes.json").exists() - # hashes.json is empty object - hashes = json.loads((cwd / ".openkb" / "hashes.json").read_text()) - assert hashes == {} + config = yaml.safe_load((cwd / ".openkb" / "config.yaml").read_text()) + assert config["storage_backend"] == "sqlite" # index.md header index_content = (cwd / "wiki" / "index.md").read_text() diff --git
a/tests/test_config_storage_backend.py b/tests/test_config_storage_backend.py new file mode 100644 index 0000000..7a0e987 --- /dev/null +++ b/tests/test_config_storage_backend.py @@ -0,0 +1,37 @@ +"""Tests for storage_backend config option.""" +from __future__ import annotations + +from pathlib import Path + +from openkb.config import DEFAULT_CONFIG, load_config, save_config + + +def test_default_config_has_storage_backend(): + """DEFAULT_CONFIG should include storage_backend key.""" + assert "storage_backend" in DEFAULT_CONFIG + + +def test_default_storage_backend_is_sqlite(): + """Default storage_backend should be 'sqlite'.""" + assert DEFAULT_CONFIG["storage_backend"] == "sqlite" + + +def test_load_config_includes_storage_backend(tmp_path): + """load_config should return storage_backend from config file.""" + config_path = tmp_path / "config.yaml" + save_config(config_path, {"storage_backend": "json"}) + loaded = load_config(config_path) + assert loaded["storage_backend"] == "json" + + +def test_storage_backend_valid_values(tmp_path): + """storage_backend should accept 'sqlite' or 'json'.""" + config_path = tmp_path / "config.yaml" + + save_config(config_path, {"storage_backend": "sqlite"}) + loaded = load_config(config_path) + assert loaded["storage_backend"] == "sqlite" + + save_config(config_path, {"storage_backend": "json"}) + loaded = load_config(config_path) + assert loaded["storage_backend"] == "json" diff --git a/tests/test_converter.py b/tests/test_converter.py index 6c184fd..919819f 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -48,14 +48,15 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir): def test_md_duplicate_skipped(self, kb_dir): """Second call with same file returns skipped=True when hash is registered.""" - from openkb.state import HashRegistry + from openkb.state import get_registry src = kb_dir / "raw" / "notes.md" src.write_text("# Notes\n\nSome content here.", encoding="utf-8") result1 = convert_document(src, 
kb_dir) # first call # Simulate CLI registering the hash after successful compilation - registry = HashRegistry(kb_dir / ".openkb" / "hashes.json") + openkb_dir = kb_dir / ".openkb" + registry = get_registry(openkb_dir, backend="sqlite") registry.add(result1.file_hash, {"name": src.name, "type": "md"}) result2 = convert_document(src, kb_dir) # second call diff --git a/tests/test_db_registry.py b/tests/test_db_registry.py new file mode 100644 index 0000000..491343e --- /dev/null +++ b/tests/test_db_registry.py @@ -0,0 +1,172 @@ +"""Tests for DbRegistry SQLite-backed storage.""" +from __future__ import annotations + +import json +import sqlite3 +from pathlib import Path + +import pytest + +from openkb.state import DbRegistry + + +def test_db_registry_creates_database_file(tmp_path): + """DbRegistry should create a .db file on init.""" + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path) + assert db_path.exists() + + +def test_db_registry_creates_table(tmp_path): + """DbRegistry should create the registry table.""" + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path) + + conn = sqlite3.connect(str(db_path)) + cursor = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='registry'" + ) + result = cursor.fetchone() + conn.close() + assert result is not None + + +def test_db_empty_registry_is_known_false(tmp_path): + """Empty DbRegistry should return False for is_known.""" + registry = DbRegistry(tmp_path / "hashes.db") + assert registry.is_known("abc123") is False + + +def test_db_empty_registry_get_returns_none(tmp_path): + """Empty DbRegistry should return None for get.""" + registry = DbRegistry(tmp_path / "hashes.db") + assert registry.get("abc123") is None + + +def test_db_add_and_is_known(tmp_path): + """After add, is_known should return True.""" + registry = DbRegistry(tmp_path / "hashes.db") + registry.add("deadbeef", {"filename": "test.pdf"}) + assert registry.is_known("deadbeef") is True + + +def 
test_db_add_and_get(tmp_path): + """After add, get should return the metadata.""" + registry = DbRegistry(tmp_path / "hashes.db") + metadata = {"filename": "doc.pdf", "pages": 10} + registry.add("cafebabe", metadata) + assert registry.get("cafebabe") == metadata + + +def test_db_persistence_across_instances(tmp_path): + """Data should persist across DbRegistry instances.""" + db_path = tmp_path / "hashes.db" + r1 = DbRegistry(db_path) + r1.add("hash1", {"file": "a.pdf"}) + + r2 = DbRegistry(db_path) + assert r2.is_known("hash1") is True + assert r2.get("hash1") == {"file": "a.pdf"} + + +def test_db_all_entries_returns_all(tmp_path): + """all_entries should return all hash -> metadata mappings.""" + registry = DbRegistry(tmp_path / "hashes.db") + registry.add("h1", {"name": "one"}) + registry.add("h2", {"name": "two"}) + entries = registry.all_entries() + assert "h1" in entries + assert "h2" in entries + assert entries["h1"] == {"name": "one"} + assert entries["h2"] == {"name": "two"} + + +def test_db_all_entries_empty(tmp_path): + """all_entries on empty registry should return empty dict.""" + registry = DbRegistry(tmp_path / "hashes.db") + assert registry.all_entries() == {} + + +def test_db_hash_file_unchanged(tmp_path): + """DbRegistry.hash_file should work same as HashRegistry.""" + f = tmp_path / "sample.txt" + f.write_text("hello world") + digest = DbRegistry.hash_file(f) + assert len(digest) == 64 + assert all(c in "0123456789abcdef" for c in digest) + + +def test_db_update_existing_hash(tmp_path): + """Adding same hash twice should update metadata.""" + registry = DbRegistry(tmp_path / "hashes.db") + registry.add("hash1", {"version": 1}) + registry.add("hash1", {"version": 2}) + assert registry.get("hash1") == {"version": 2} + + +def test_db_metadata_with_nested_dict(tmp_path): + """Metadata can contain nested dictionaries.""" + registry = DbRegistry(tmp_path / "hashes.db") + metadata = { + "name": "doc.pdf", + "stats": {"pages": 10, "words": 5000}, + } + 
registry.add("hash1", metadata) + assert registry.get("hash1") == metadata + + +def test_db_wal_mode_enabled(tmp_path): + """Database should use WAL mode for concurrency.""" + db_path = tmp_path / "hashes.db" + DbRegistry(db_path) + + conn = sqlite3.connect(str(db_path)) + cursor = conn.execute("PRAGMA journal_mode") + result = cursor.fetchone() + conn.close() + assert result[0].lower() == "wal" + + +def test_migrate_from_json(tmp_path): + """DbRegistry should migrate existing JSON data on first access.""" + json_path = tmp_path / "hashes.json" + existing_data = { + "hash1": {"name": "doc1.pdf", "pages": 10}, + "hash2": {"name": "doc2.pdf", "pages": 20}, + } + json_path.write_text(json.dumps(existing_data), encoding="utf-8") + + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path, migrate_from=json_path) + + assert registry.is_known("hash1") + assert registry.is_known("hash2") + assert registry.get("hash1") == {"name": "doc1.pdf", "pages": 10} + assert registry.get("hash2") == {"name": "doc2.pdf", "pages": 20} + + +def test_migrate_only_once(tmp_path): + """Migration should only happen once, not on subsequent loads.""" + json_path = tmp_path / "hashes.json" + existing_data = {"hash1": {"name": "doc1.pdf"}} + json_path.write_text(json.dumps(existing_data), encoding="utf-8") + + db_path = tmp_path / "hashes.db" + + r1 = DbRegistry(db_path, migrate_from=json_path) + assert r1.is_known("hash1") + + existing_data["hash2"] = {"name": "doc2.pdf"} + json_path.write_text(json.dumps(existing_data), encoding="utf-8") + + r2 = DbRegistry(db_path, migrate_from=json_path) + assert r2.is_known("hash1") + assert not r2.is_known("hash2") + + +def test_migrate_optional(tmp_path): + """DbRegistry should work without migration.""" + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path) + registry.add("hash1", {"name": "doc.pdf"}) + assert registry.is_known("hash1") diff --git a/tests/test_migration.py b/tests/test_migration.py new file mode 100644 index 
0000000..67e8996 --- /dev/null +++ b/tests/test_migration.py @@ -0,0 +1,74 @@ +"""Integration tests for JSON to SQLite migration.""" +from __future__ import annotations + +import json +import threading +from pathlib import Path + +import pytest + +from openkb.state import get_registry, DbRegistry + + +def test_full_migration_workflow(tmp_path): + """Test complete migration from JSON to SQLite.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + # Step 1: Start with JSON backend + json_registry = get_registry(openkb_dir, backend="json") + json_registry.add("hash1", {"name": "doc1.pdf", "pages": 10}) + json_registry.add("hash2", {"name": "doc2.pdf", "pages": 20}) + + # Verify JSON file exists + json_path = openkb_dir / "hashes.json" + assert json_path.exists() + + # Step 2: Switch to SQLite backend (triggers migration) + sqlite_registry = get_registry(openkb_dir, backend="sqlite") + + # Verify data was migrated + assert sqlite_registry.is_known("hash1") + assert sqlite_registry.is_known("hash2") + assert sqlite_registry.get("hash1") == {"name": "doc1.pdf", "pages": 10} + assert sqlite_registry.get("hash2") == {"name": "doc2.pdf", "pages": 20} + + # Step 3: Add new data via SQLite + sqlite_registry.add("hash3", {"name": "doc3.pdf", "pages": 30}) + + # Step 4: Create new SQLite instance - should have all data + sqlite_registry2 = get_registry(openkb_dir, backend="sqlite") + assert sqlite_registry2.is_known("hash1") + assert sqlite_registry2.is_known("hash2") + assert sqlite_registry2.is_known("hash3") + + +def test_concurrent_sqlite_access(tmp_path): + """Test that SQLite handles concurrent access correctly.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir, backend="sqlite") + errors = [] + + def add_entries(start: int, count: int) -> None: + try: + for i in range(start, start + count): + registry.add(f"hash{i}", {"index": i}) + except Exception as e: + errors.append(e) + + threads = [ + 
threading.Thread(target=add_entries, args=(0, 50)), + threading.Thread(target=add_entries, args=(50, 50)), + threading.Thread(target=add_entries, args=(100, 50)), + ] + + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors + entries = registry.all_entries() + assert len(entries) == 150 diff --git a/tests/test_state.py b/tests/test_state.py index 1b4371f..cc9c5ce 100644 --- a/tests/test_state.py +++ b/tests/test_state.py @@ -82,3 +82,50 @@ def test_load_existing_json(tmp_path): registry = HashRegistry(path) assert registry.is_known("existinghash") is True assert registry.get("existinghash") == {"file": "pre.pdf"} + + +# --------------------------------------------------------------------------- +# Factory function tests +# --------------------------------------------------------------------------- + +from openkb.state import get_registry + + +def test_get_registry_returns_db_registry_by_default(tmp_path): + """get_registry should return DbRegistry by default.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir) + assert type(registry).__name__ == "DbRegistry" + + +def test_get_registry_returns_hash_registry_for_json_backend(tmp_path): + """get_registry should return HashRegistry when backend is 'json'.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir, backend="json") + assert type(registry).__name__ == "HashRegistry" + + +def test_get_registry_returns_db_registry_for_sqlite_backend(tmp_path): + """get_registry should return DbRegistry when backend is 'sqlite'.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir, backend="sqlite") + assert type(registry).__name__ == "DbRegistry" + + +def test_get_registry_migrates_json_to_sqlite(tmp_path): + """get_registry should migrate existing JSON when switching to sqlite.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + json_path = 
openkb_dir / "hashes.json" + json_path.write_text('{"hash1": {"name": "doc.pdf"}}', encoding="utf-8") + + registry = get_registry(openkb_dir, backend="sqlite") + assert registry.is_known("hash1") + assert registry.get("hash1") == {"name": "doc.pdf"} From 9436ad684cd29f571503175ddf7cbe513241f333 Mon Sep 17 00:00:00 2001 From: "ray.zhang" Date: Sat, 11 Apr 2026 12:49:24 +0800 Subject: [PATCH 26/26] docs: document storage backend and migration --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 1555c77..2f3497a 100644 --- a/README.md +++ b/README.md @@ -147,8 +147,20 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`: model: gpt-5.4 # LLM model (any LiteLLM-supported provider) language: en # Wiki output language pageindex_threshold: 20 # PDF pages threshold for PageIndex +storage_backend: sqlite # Storage backend: sqlite (default) or json ``` +### Storage Backend + +OpenKB supports two storage backends for the file hash registry: + +| Backend | Description | Use Case | +|---------|-------------|----------| +| `sqlite` | SQLite database (default) | Better concurrency and scalability; recommended for production | +| `json` | JSON file | Simple and human-readable; suited to small installations | + +Migration from JSON to SQLite happens automatically the first time you use the `sqlite` backend while a `hashes.json` file exists and no `hashes.db` has been created yet. The JSON file is preserved but no longer used. + Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix): | Provider | Model example |