diff --git a/README.md b/README.md index 791c13c..3a94e6e 100644 --- a/README.md +++ b/README.md @@ -163,8 +163,20 @@ Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`: model: gpt-5.4 # LLM model (any LiteLLM-supported provider) language: en # Wiki output language pageindex_threshold: 20 # PDF pages threshold for PageIndex +storage_backend: sqlite # Storage backend: sqlite (default) or json ``` +### Storage Backend + +OpenKB supports two storage backends for the file hash registry: + +| Backend | Description | Use Case | +|---------|-------------|----------| +| `sqlite` | SQLite database (default) | Better concurrency, scalability, recommended for production | +| `json` | JSON file | Simple, human-readable, for small installations | + +Migration from JSON to SQLite happens automatically the first time you use the `sqlite` backend while a `hashes.json` file exists and no `hashes.db` has been created yet; it runs only once. The JSON file is preserved but no longer used. + Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix): | Provider | Model example | diff --git a/openkb/cli.py b/openkb/cli.py index d91789f..82dd9d0 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -138,14 +138,15 @@ def _add_single_file(file_path: Path, kb_dir: Path) -> None: 4. Else: compile_short_doc. """ from openkb.agent.compiler import compile_long_doc, compile_short_doc - from openkb.state import HashRegistry + from openkb.state import get_registry logger = logging.getLogger(__name__) openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") _setup_llm_key(kb_dir) model: str = config.get("model", DEFAULT_CONFIG["model"]) - registry = HashRegistry(openkb_dir / "hashes.json") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) # 2.
Convert document click.echo(f"Adding: {file_path.name}") @@ -299,9 +300,10 @@ def init(): "model": model, "language": DEFAULT_CONFIG["language"], "pageindex_threshold": DEFAULT_CONFIG["pageindex_threshold"], + "storage_backend": DEFAULT_CONFIG["storage_backend"], } save_config(openkb_dir / "config.yaml", config) - (openkb_dir / "hashes.json").write_text(json.dumps({}), encoding="utf-8") + # The SQLite DB is created lazily by get_registry() on first access; no need to pre-create it # Write API key to KB-local .env (0600) if the user provided one if api_key: @@ -590,13 +592,13 @@ def list_cmd(ctx): click.echo("No knowledge base found. Run `openkb init` first.") return - openkb_dir = kb_dir / ".openkb" - hashes_file = openkb_dir / "hashes.json" - if not hashes_file.exists(): - click.echo("No documents indexed yet.") - return + from openkb.state import get_registry - hashes = json.loads(hashes_file.read_text(encoding="utf-8")) + openkb_dir = kb_dir / ".openkb" + config = load_config(openkb_dir / "config.yaml") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) + hashes = registry.all_entries() if not hashes: click.echo("No documents indexed yet.") return @@ -673,11 +675,14 @@ def status(ctx): click.echo(f" {'raw':<20} {raw_count:<10}") # Hash registry summary + from openkb.state import get_registry + openkb_dir = kb_dir / ".openkb" - hashes_file = openkb_dir / "hashes.json" - if hashes_file.exists(): - hashes = json.loads(hashes_file.read_text(encoding="utf-8")) - click.echo(f"\n Total indexed: {len(hashes)} document(s)") + config = load_config(openkb_dir / "config.yaml") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) + hashes = registry.all_entries() + click.echo(f"\n Total indexed: {len(hashes)} document(s)") # Last compile time: newest file in wiki/summaries/ summaries_dir = wiki_dir / "summaries" diff --git a/openkb/config.py b/openkb/config.py index b83e134..4c2169a 100644 --- a/openkb/config.py
+++ b/openkb/config.py @@ -9,6 +9,7 @@ "model": "gpt-5.4-mini", "language": "en", "pageindex_threshold": 20, + "storage_backend": "sqlite", } GLOBAL_CONFIG_DIR = Path.home() / ".config" / "openkb" diff --git a/openkb/converter.py b/openkb/converter.py index 3f5f529..51359a6 100644 --- a/openkb/converter.py +++ b/openkb/converter.py @@ -11,7 +11,7 @@ from openkb.config import load_config from openkb.images import copy_relative_images, extract_base64_images, convert_pdf_with_images -from openkb.state import HashRegistry +from openkb.state import get_registry logger = logging.getLogger(__name__) @@ -50,12 +50,13 @@ def convert_document(src: Path, kb_dir: Path) -> ConvertResult: openkb_dir = kb_dir / ".openkb" config = load_config(openkb_dir / "config.yaml") threshold: int = config.get("pageindex_threshold", 20) - registry = HashRegistry(openkb_dir / "hashes.json") + backend = config.get("storage_backend", "sqlite") + registry = get_registry(openkb_dir, backend=backend) # ------------------------------------------------------------------ # 1. 
Hash check # ------------------------------------------------------------------ - file_hash = HashRegistry.hash_file(src) + file_hash = registry.hash_file(src) if registry.is_known(file_hash): logger.info("Skipping already-known file: %s", src.name) return ConvertResult(skipped=True) diff --git a/openkb/state.py b/openkb/state.py index 9381606..dc9cd6a 100644 --- a/openkb/state.py +++ b/openkb/state.py @@ -2,7 +2,19 @@ import hashlib import json +import sqlite3 +from contextlib import contextmanager from pathlib import Path +from typing import Iterator + + +def _hash_file(path: Path) -> str: + """Return the SHA-256 hex digest (64 chars) of the file at path.""" + h = hashlib.sha256() + with path.open("rb") as fh: + for chunk in iter(lambda: fh.read(65536), b""): + h.update(chunk) + return h.hexdigest() class HashRegistry: @@ -57,8 +69,154 @@ def _persist(self) -> None: @staticmethod def hash_file(path: Path) -> str: """Return the SHA-256 hex digest (64 chars) of the file at path.""" - h = hashlib.sha256() - with path.open("rb") as fh: - for chunk in iter(lambda: fh.read(65536), b""): - h.update(chunk) - return h.hexdigest() + return _hash_file(path) + + +class DbRegistry: + """SQLite-backed registry mapping file SHA-256 hashes to metadata dicts. + + Provides better scalability, concurrency support, and extensibility + compared to JSON-backed HashRegistry. + """ + + def __init__(self, path: Path, migrate_from: Path | None = None) -> None: + """Initialize DbRegistry. + + Args: + path: Path to SQLite database file. + migrate_from: Optional path to JSON file to migrate from. + Migration only happens if DB doesn't exist yet. 
+ """ + self._path = path + should_migrate = migrate_from is not None and not path.exists() + self._init_db() + if should_migrate: + self._migrate_from_json(migrate_from) + + def _migrate_from_json(self, json_path: Path) -> None: + """Migrate data from JSON file to SQLite database.""" + if not json_path.exists(): + return + + with json_path.open("r", encoding="utf-8") as fh: + data: dict[str, dict] = json.load(fh) + + with self._connect() as conn: + for file_hash, metadata in data.items(): + metadata_json = json.dumps(metadata, ensure_ascii=False) + conn.execute(""" + INSERT OR REPLACE INTO registry (file_hash, metadata_json) + VALUES (?, ?) + """, (file_hash, metadata_json)) + + def _init_db(self) -> None: + """Initialize database schema if not exists.""" + self._path.parent.mkdir(parents=True, exist_ok=True) + + with self._connect() as conn: + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA foreign_keys=ON") + conn.execute(""" + CREATE TABLE IF NOT EXISTS registry ( + file_hash TEXT PRIMARY KEY, + metadata_json TEXT NOT NULL, + created_at TEXT DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT DEFAULT CURRENT_TIMESTAMP + ) + """) + conn.execute(""" + CREATE INDEX IF NOT EXISTS idx_created_at ON registry(created_at) + """) + + @contextmanager + def _connect(self) -> Iterator[sqlite3.Connection]: + """Context manager for database connections.""" + conn = sqlite3.connect(str(self._path)) + try: + yield conn + conn.commit() + finally: + conn.close() + + def is_known(self, file_hash: str) -> bool: + """Return True if file_hash is already registered.""" + with self._connect() as conn: + cursor = conn.execute( + "SELECT 1 FROM registry WHERE file_hash = ?", + (file_hash,) + ) + return cursor.fetchone() is not None + + def get(self, file_hash: str) -> dict | None: + """Return metadata for file_hash, or None if not found.""" + with self._connect() as conn: + cursor = conn.execute( + "SELECT metadata_json FROM registry WHERE file_hash = ?", + (file_hash,) + ) + 
row = cursor.fetchone() + if row is None: + return None + return json.loads(row[0]) + + def all_entries(self) -> dict[str, dict]: + """Return a shallow copy of all hash -> metadata entries.""" + with self._connect() as conn: + cursor = conn.execute( + "SELECT file_hash, metadata_json FROM registry" + ) + return { + row[0]: json.loads(row[1]) + for row in cursor.fetchall() + } + + def add(self, file_hash: str, metadata: dict) -> None: + """Register file_hash with metadata and persist to disk. + + If file_hash already exists, updates the metadata. + """ + metadata_json = json.dumps(metadata, ensure_ascii=False) + with self._connect() as conn: + conn.execute(""" + INSERT INTO registry (file_hash, metadata_json, updated_at) + VALUES (?, ?, CURRENT_TIMESTAMP) + ON CONFLICT(file_hash) DO UPDATE SET + metadata_json = excluded.metadata_json, + updated_at = CURRENT_TIMESTAMP + """, (file_hash, metadata_json)) + + @staticmethod + def hash_file(path: Path) -> str: + """Return the SHA-256 hex digest (64 chars) of the file at path.""" + return _hash_file(path) + + +def get_registry( + openkb_dir: Path, + backend: str = "sqlite", +) -> HashRegistry | DbRegistry: + """Factory function to get the appropriate registry implementation. + + Args: + openkb_dir: Path to .openkb directory. + backend: Storage backend - "sqlite" or "json". + + Returns: + HashRegistry for "json" backend, DbRegistry for "sqlite" backend. + + When switching from json to sqlite and a JSON file exists, + automatically migrates the data. 
+ """ + if backend not in ("sqlite", "json"): + raise ValueError(f"Unknown storage_backend: {backend!r}") + + if backend == "json": + return HashRegistry(openkb_dir / "hashes.json") + + db_path = openkb_dir / "hashes.db" + json_path = openkb_dir / "hashes.json" + + if json_path.exists() and not db_path.exists(): + return DbRegistry(db_path, migrate_from=json_path) + + return DbRegistry(db_path) diff --git a/tests/test_cli.py b/tests/test_cli.py index afb961d..407896d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,7 @@ from unittest.mock import patch import pytest +import yaml from click.testing import CliRunner from openkb.cli import cli @@ -30,11 +31,11 @@ def test_init_creates_structure(tmp_path): assert (cwd / "wiki" / "log.md").is_file() assert (cwd / "wiki" / "index.md").is_file() assert (cwd / ".openkb" / "config.yaml").is_file() - assert (cwd / ".openkb" / "hashes.json").is_file() + # SQLite DB 在首次访问时由 get_registry() 惰性创建 + assert not (cwd / ".openkb" / "hashes.json").exists() - # hashes.json is empty object - hashes = json.loads((cwd / ".openkb" / "hashes.json").read_text()) - assert hashes == {} + config = yaml.safe_load((cwd / ".openkb" / "config.yaml").read_text()) + assert config["storage_backend"] == "sqlite" # index.md header index_content = (cwd / "wiki" / "index.md").read_text() diff --git a/tests/test_config_storage_backend.py b/tests/test_config_storage_backend.py new file mode 100644 index 0000000..7a0e987 --- /dev/null +++ b/tests/test_config_storage_backend.py @@ -0,0 +1,37 @@ +"""Tests for storage_backend config option.""" +from __future__ import annotations + +from pathlib import Path + +from openkb.config import DEFAULT_CONFIG, load_config, save_config + + +def test_default_config_has_storage_backend(): + """DEFAULT_CONFIG should include storage_backend key.""" + assert "storage_backend" in DEFAULT_CONFIG + + +def test_default_storage_backend_is_sqlite(): + """Default storage_backend should be 'sqlite'.""" + assert 
DEFAULT_CONFIG["storage_backend"] == "sqlite" + + +def test_load_config_includes_storage_backend(tmp_path): + """load_config should return storage_backend from config file.""" + config_path = tmp_path / "config.yaml" + save_config(config_path, {"storage_backend": "json"}) + loaded = load_config(config_path) + assert loaded["storage_backend"] == "json" + + +def test_storage_backend_valid_values(tmp_path): + """storage_backend should accept 'sqlite' or 'json'.""" + config_path = tmp_path / "config.yaml" + + save_config(config_path, {"storage_backend": "sqlite"}) + loaded = load_config(config_path) + assert loaded["storage_backend"] == "sqlite" + + save_config(config_path, {"storage_backend": "json"}) + loaded = load_config(config_path) + assert loaded["storage_backend"] == "json" diff --git a/tests/test_converter.py b/tests/test_converter.py index 6c184fd..919819f 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -48,14 +48,15 @@ def test_md_file_copied_to_wiki_sources(self, kb_dir): def test_md_duplicate_skipped(self, kb_dir): """Second call with same file returns skipped=True when hash is registered.""" - from openkb.state import HashRegistry + from openkb.state import get_registry src = kb_dir / "raw" / "notes.md" src.write_text("# Notes\n\nSome content here.", encoding="utf-8") result1 = convert_document(src, kb_dir) # first call # Simulate CLI registering the hash after successful compilation - registry = HashRegistry(kb_dir / ".openkb" / "hashes.json") + openkb_dir = kb_dir / ".openkb" + registry = get_registry(openkb_dir, backend="sqlite") registry.add(result1.file_hash, {"name": src.name, "type": "md"}) result2 = convert_document(src, kb_dir) # second call diff --git a/tests/test_db_registry.py b/tests/test_db_registry.py new file mode 100644 index 0000000..491343e --- /dev/null +++ b/tests/test_db_registry.py @@ -0,0 +1,172 @@ +"""Tests for DbRegistry SQLite-backed storage.""" +from __future__ import annotations + +import json +import 
sqlite3 +from pathlib import Path + +import pytest + +from openkb.state import DbRegistry + + +def test_db_registry_creates_database_file(tmp_path): + """DbRegistry should create a .db file on init.""" + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path) + assert db_path.exists() + + +def test_db_registry_creates_table(tmp_path): + """DbRegistry should create the registry table.""" + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path) + + conn = sqlite3.connect(str(db_path)) + cursor = conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='registry'" + ) + result = cursor.fetchone() + conn.close() + assert result is not None + + +def test_db_empty_registry_is_known_false(tmp_path): + """Empty DbRegistry should return False for is_known.""" + registry = DbRegistry(tmp_path / "hashes.db") + assert registry.is_known("abc123") is False + + +def test_db_empty_registry_get_returns_none(tmp_path): + """Empty DbRegistry should return None for get.""" + registry = DbRegistry(tmp_path / "hashes.db") + assert registry.get("abc123") is None + + +def test_db_add_and_is_known(tmp_path): + """After add, is_known should return True.""" + registry = DbRegistry(tmp_path / "hashes.db") + registry.add("deadbeef", {"filename": "test.pdf"}) + assert registry.is_known("deadbeef") is True + + +def test_db_add_and_get(tmp_path): + """After add, get should return the metadata.""" + registry = DbRegistry(tmp_path / "hashes.db") + metadata = {"filename": "doc.pdf", "pages": 10} + registry.add("cafebabe", metadata) + assert registry.get("cafebabe") == metadata + + +def test_db_persistence_across_instances(tmp_path): + """Data should persist across DbRegistry instances.""" + db_path = tmp_path / "hashes.db" + r1 = DbRegistry(db_path) + r1.add("hash1", {"file": "a.pdf"}) + + r2 = DbRegistry(db_path) + assert r2.is_known("hash1") is True + assert r2.get("hash1") == {"file": "a.pdf"} + + +def test_db_all_entries_returns_all(tmp_path): + 
"""all_entries should return all hash -> metadata mappings.""" + registry = DbRegistry(tmp_path / "hashes.db") + registry.add("h1", {"name": "one"}) + registry.add("h2", {"name": "two"}) + entries = registry.all_entries() + assert "h1" in entries + assert "h2" in entries + assert entries["h1"] == {"name": "one"} + assert entries["h2"] == {"name": "two"} + + +def test_db_all_entries_empty(tmp_path): + """all_entries on empty registry should return empty dict.""" + registry = DbRegistry(tmp_path / "hashes.db") + assert registry.all_entries() == {} + + +def test_db_hash_file_unchanged(tmp_path): + """DbRegistry.hash_file should work same as HashRegistry.""" + f = tmp_path / "sample.txt" + f.write_text("hello world") + digest = DbRegistry.hash_file(f) + assert len(digest) == 64 + assert all(c in "0123456789abcdef" for c in digest) + + +def test_db_update_existing_hash(tmp_path): + """Adding same hash twice should update metadata.""" + registry = DbRegistry(tmp_path / "hashes.db") + registry.add("hash1", {"version": 1}) + registry.add("hash1", {"version": 2}) + assert registry.get("hash1") == {"version": 2} + + +def test_db_metadata_with_nested_dict(tmp_path): + """Metadata can contain nested dictionaries.""" + registry = DbRegistry(tmp_path / "hashes.db") + metadata = { + "name": "doc.pdf", + "stats": {"pages": 10, "words": 5000}, + } + registry.add("hash1", metadata) + assert registry.get("hash1") == metadata + + +def test_db_wal_mode_enabled(tmp_path): + """Database should use WAL mode for concurrency.""" + db_path = tmp_path / "hashes.db" + DbRegistry(db_path) + + conn = sqlite3.connect(str(db_path)) + cursor = conn.execute("PRAGMA journal_mode") + result = cursor.fetchone() + conn.close() + assert result[0].lower() == "wal" + + +def test_migrate_from_json(tmp_path): + """DbRegistry should migrate existing JSON data on first access.""" + json_path = tmp_path / "hashes.json" + existing_data = { + "hash1": {"name": "doc1.pdf", "pages": 10}, + "hash2": {"name": 
"doc2.pdf", "pages": 20}, + } + json_path.write_text(json.dumps(existing_data), encoding="utf-8") + + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path, migrate_from=json_path) + + assert registry.is_known("hash1") + assert registry.is_known("hash2") + assert registry.get("hash1") == {"name": "doc1.pdf", "pages": 10} + assert registry.get("hash2") == {"name": "doc2.pdf", "pages": 20} + + +def test_migrate_only_once(tmp_path): + """Migration should only happen once, not on subsequent loads.""" + json_path = tmp_path / "hashes.json" + existing_data = {"hash1": {"name": "doc1.pdf"}} + json_path.write_text(json.dumps(existing_data), encoding="utf-8") + + db_path = tmp_path / "hashes.db" + + r1 = DbRegistry(db_path, migrate_from=json_path) + assert r1.is_known("hash1") + + existing_data["hash2"] = {"name": "doc2.pdf"} + json_path.write_text(json.dumps(existing_data), encoding="utf-8") + + r2 = DbRegistry(db_path, migrate_from=json_path) + assert r2.is_known("hash1") + assert not r2.is_known("hash2") + + +def test_migrate_optional(tmp_path): + """DbRegistry should work without migration.""" + db_path = tmp_path / "hashes.db" + registry = DbRegistry(db_path) + registry.add("hash1", {"name": "doc.pdf"}) + assert registry.is_known("hash1") diff --git a/tests/test_migration.py b/tests/test_migration.py new file mode 100644 index 0000000..67e8996 --- /dev/null +++ b/tests/test_migration.py @@ -0,0 +1,74 @@ +"""Integration tests for JSON to SQLite migration.""" +from __future__ import annotations + +import json +import threading +from pathlib import Path + +import pytest + +from openkb.state import get_registry, DbRegistry + + +def test_full_migration_workflow(tmp_path): + """Test complete migration from JSON to SQLite.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + # Step 1: Start with JSON backend + json_registry = get_registry(openkb_dir, backend="json") + json_registry.add("hash1", {"name": "doc1.pdf", "pages": 10}) + 
json_registry.add("hash2", {"name": "doc2.pdf", "pages": 20}) + + # Verify JSON file exists + json_path = openkb_dir / "hashes.json" + assert json_path.exists() + + # Step 2: Switch to SQLite backend (triggers migration) + sqlite_registry = get_registry(openkb_dir, backend="sqlite") + + # Verify data was migrated + assert sqlite_registry.is_known("hash1") + assert sqlite_registry.is_known("hash2") + assert sqlite_registry.get("hash1") == {"name": "doc1.pdf", "pages": 10} + assert sqlite_registry.get("hash2") == {"name": "doc2.pdf", "pages": 20} + + # Step 3: Add new data via SQLite + sqlite_registry.add("hash3", {"name": "doc3.pdf", "pages": 30}) + + # Step 4: Create new SQLite instance - should have all data + sqlite_registry2 = get_registry(openkb_dir, backend="sqlite") + assert sqlite_registry2.is_known("hash1") + assert sqlite_registry2.is_known("hash2") + assert sqlite_registry2.is_known("hash3") + + +def test_concurrent_sqlite_access(tmp_path): + """Test that SQLite handles concurrent access correctly.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir, backend="sqlite") + errors = [] + + def add_entries(start: int, count: int) -> None: + try: + for i in range(start, start + count): + registry.add(f"hash{i}", {"index": i}) + except Exception as e: + errors.append(e) + + threads = [ + threading.Thread(target=add_entries, args=(0, 50)), + threading.Thread(target=add_entries, args=(50, 50)), + threading.Thread(target=add_entries, args=(100, 50)), + ] + + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors + entries = registry.all_entries() + assert len(entries) == 150 diff --git a/tests/test_state.py b/tests/test_state.py index 1b4371f..cc9c5ce 100644 --- a/tests/test_state.py +++ b/tests/test_state.py @@ -82,3 +82,50 @@ def test_load_existing_json(tmp_path): registry = HashRegistry(path) assert registry.is_known("existinghash") is True assert registry.get("existinghash") == {"file": 
"pre.pdf"} + + +# --------------------------------------------------------------------------- +# Factory function tests +# --------------------------------------------------------------------------- + +from openkb.state import get_registry + + +def test_get_registry_returns_db_registry_by_default(tmp_path): + """get_registry should return DbRegistry by default.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir) + assert type(registry).__name__ == "DbRegistry" + + +def test_get_registry_returns_hash_registry_for_json_backend(tmp_path): + """get_registry should return HashRegistry when backend is 'json'.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir, backend="json") + assert type(registry).__name__ == "HashRegistry" + + +def test_get_registry_returns_db_registry_for_sqlite_backend(tmp_path): + """get_registry should return DbRegistry when backend is 'sqlite'.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + registry = get_registry(openkb_dir, backend="sqlite") + assert type(registry).__name__ == "DbRegistry" + + +def test_get_registry_migrates_json_to_sqlite(tmp_path): + """get_registry should migrate existing JSON when switching to sqlite.""" + openkb_dir = tmp_path / ".openkb" + openkb_dir.mkdir() + + json_path = openkb_dir / "hashes.json" + json_path.write_text('{"hash1": {"name": "doc.pdf"}}', encoding="utf-8") + + registry = get_registry(openkb_dir, backend="sqlite") + assert registry.is_known("hash1") + assert registry.get("hash1") == {"name": "doc.pdf"}