From a3b78819e1889c8939475b432e8e41cacc94f963 Mon Sep 17 00:00:00 2001 From: saagpatel Date: Sun, 7 Jun 2026 05:50:39 -0700 Subject: [PATCH 1/2] feat(portfolio-truth): emit canonical cross-store project registry Adds project_registry.py: joins the four stores that key projects differently (auditor project_key, bridge-db project_name, Notion Local Portfolio titles, ~/.claude memory slugs) under one canonical key so events stop going unmatched. - publish_portfolio_truth now emits project-registry.json each run via the existing staged atomic-write set - resolve(): override table for hard normalization failures (notion_os-> Notion, jcc->JobCommandCenter) + a collision guard (screenshotselect -> ScreenshottoDataSelect, not ScreenshotAnnotate); no fuzzy matching - config/project-registry-overrides.json: operator-editable enrollment; adds personal-ops + SecondBrain as supplementary (absent from auditor) - notion_scoring_page_id backfilled from the Project Portfolio data source - every external source optional/degrades gracefully (CI-safe) - 10 tests; existing 43 publish tests still pass --- config/project-registry-overrides.json | 37 ++ src/portfolio_truth_publish.py | 49 +++ src/project_registry.py | 477 +++++++++++++++++++++++++ tests/test_project_registry.py | 169 +++++++++ 4 files changed, 732 insertions(+) create mode 100644 config/project-registry-overrides.json create mode 100644 src/project_registry.py create mode 100644 tests/test_project_registry.py diff --git a/config/project-registry-overrides.json b/config/project-registry-overrides.json new file mode 100644 index 0000000..fa3e629 --- /dev/null +++ b/config/project-registry-overrides.json @@ -0,0 +1,37 @@ +{ + "_comment": "Operator-editable enrollment for the canonical project registry. 'overrides' pins hard normalization failures (drifted identifier -> canonical project_key). 'supplementary' enrolls real operator-OS projects the auditor does not track as git repos. 'memory_meta' maps ~/.claude memory note-slugs to their parent project (or '' for pure meta-notes). Consumed by src/project_registry.py; falls back to built-in defaults if this file is absent.", + "overrides": { + "jcc": "JobCommandCenter", + "jsm_export": "JSMTicketAnalyticsExport", + "bhv": "BrowserHistoryVisualizer", + "netmapper": "NetworkMapper", + "notion_os": "Notion", + "screenshotselect": "ScreenshottoDataSelect", + "interruptionresume": "Interruption Resume Studio" + }, + "supplementary": [ + { + "canonical_key": "supp:personal-ops", + "display_name": "personal-ops", + "repo_full_name": null, + "group_key": "operator_infra", + "lifecycle_state": "active", + "note": "Local operator control plane (127.0.0.1:46210). Most active project in bridge-db yet absent from auditor portfolio-truth." + }, + { + "canonical_key": "supp:SecondBrain", + "display_name": "SecondBrain", + "repo_full_name": null, + "group_key": "operator_infra", + "lifecycle_state": "active", + "note": "4-layer knowledge vault at /Users/d/Documents/SecondBrain (engraph-indexed). Not a git repo; absent from auditor." + } + ], + "memory_meta": { + "personal_ops_codebase": "supp:personal-ops", + "personal_ops_vision": "supp:personal-ops", + "github_repo_auditor_future_arcs": "GithubRepoAuditor", + "skill_library_port_2026-05": "", + "skill_eval_harness_2026-05": "" + } +} diff --git a/src/portfolio_truth_publish.py b/src/portfolio_truth_publish.py index 83141e9..0a87a33 100644 --- a/src/portfolio_truth_publish.py +++ b/src/portfolio_truth_publish.py @@ -14,6 +14,7 @@ validate_registry_markdown, validate_truth_snapshot, ) +from src.project_registry import build_project_registry, load_source_paths @dataclass(frozen=True) @@ -25,6 +26,47 @@ class PortfolioTruthPublishResult: project_count: int registry_changed: bool report_changed: bool + project_registry_path: Path | None = None + + +_REPO_ROOT = Path(__file__).resolve().parents[1] +_CONFIG_DIR = _REPO_ROOT / "config" + + +def _build_project_registry_json(snapshot, *, include_notion: bool) -> str: + """Render the canonical cross-store project registry from a snapshot. + + External sources (bridge-db, Notion snapshot, memory) degrade gracefully + when absent, so this never fails the publish run. + """ + overrides_config_path = _CONFIG_DIR / "project-registry-overrides.json" + sources = load_source_paths(overrides_config_path) + + scoring_pageids: dict[str, str] = {} + if include_notion and sources.get("scoring_data_source_id"): + try: + from src.notion_client import get_notion_token + from src.project_registry import fetch_scoring_pageids + + token = get_notion_token() + if token: + scoring_pageids = fetch_scoring_pageids( + str(sources["scoring_data_source_id"]), token + ) + except Exception: + scoring_pageids = {} + + registry = build_project_registry( + snapshot.to_dict(), + bridge_db_path=sources["bridge_db"], + notion_snapshot_path=sources["notion_snapshot"], + notion_project_map_path=_CONFIG_DIR / "notion-project-map.json", + memory_dir=sources["memory_dir"], + scoring_pageids=scoring_pageids, + overrides_config_path=overrides_config_path, + generated_at=snapshot.generated_at, + ) + return json.dumps(registry, indent=2) + "\n" def publish_portfolio_truth( @@ -60,6 +102,10 @@ def publish_portfolio_truth( latest_path = truth_latest_path(output_dir) latest_name = latest_path.name snapshot_json = json.dumps(build_result.snapshot.to_dict(), indent=2) + "\n" + project_registry_path = output_dir / "project-registry.json" + project_registry_json = _build_project_registry_json( + build_result.snapshot, include_notion=include_notion + ) registry_markdown = render_registry_markdown(build_result.snapshot) report_markdown = render_portfolio_report_markdown(build_result.snapshot, latest_name) @@ -79,12 +125,14 @@ def publish_portfolio_truth( latest_path: snapshot_json, registry_output: registry_markdown, portfolio_report_output: report_markdown, + project_registry_path: project_registry_json, } changed: dict[Path, bool] = { registry_output: _content_changed(registry_output, registry_markdown), portfolio_report_output: _content_changed(portfolio_report_output, report_markdown), snapshot_path: True, latest_path: True, + project_registry_path: True, } temp_files = {path: _stage_text(path, content) for path, content in targets.items()} originals = {path: (path.read_text() if path.exists() else None) for path in targets} @@ -119,6 +167,7 @@ def publish_portfolio_truth( project_count=len(build_result.snapshot.projects), registry_changed=changed[registry_output], report_changed=changed[portfolio_report_output], + project_registry_path=project_registry_path, ) diff --git a/src/project_registry.py b/src/project_registry.py new file mode 100644 index 0000000..2e4edef --- /dev/null +++ b/src/project_registry.py @@ -0,0 +1,477 @@ +"""Canonical cross-store project-identity registry. + +Joins the four stores that key projects differently — this auditor +(``identity.project_key``), bridge-db (``project_name``), Notion's Local +Portfolio Projects (row title), and ``~/.claude`` memory (``project_*.md`` +slug) — under one canonical key, so events stop going unmatched. + +The auditor is the system of record for *what exists*, so its +``project_key`` is the canonical key, with ``repo_full_name`` as the stable +secondary natural key. A normalization function bridges the majority of +spelling differences; a small curated override table (see +``config/project-registry-overrides.json``) handles the cases where even +normalization diverges, plus supplementary entries for operator-OS projects +the auditor does not track (e.g. ``personal-ops``). + +Every external source is optional: a missing bridge-db / Notion snapshot / +memory dir degrades to reduced coverage rather than failing the run. +""" + +from __future__ import annotations + +import json +import re +import sqlite3 +from datetime import datetime, timezone +from pathlib import Path + +SCHEMA_VERSION = "1.0" + +# Built-in fallbacks, mirrored by config/project-registry-overrides.json. +# Hard normalization failures: drifted identifier -> canonical project_key. +DEFAULT_OVERRIDES: dict[str, str] = { + "jcc": "JobCommandCenter", + "jsm_export": "JSMTicketAnalyticsExport", + "bhv": "BrowserHistoryVisualizer", + "netmapper": "NetworkMapper", + "notion_os": "Notion", + "screenshotselect": "ScreenshottoDataSelect", + "interruptionresume": "Interruption Resume Studio", +} + +# Real operator-OS projects absent from the auditor's repo registry. +DEFAULT_SUPPLEMENTARY: list[dict] = [ + { + "canonical_key": "supp:personal-ops", + "display_name": "personal-ops", + "repo_full_name": None, + "group_key": "operator_infra", + "lifecycle_state": "active", + "note": ( + "Local operator control plane (127.0.0.1:46210). Most active " + "project in bridge-db yet absent from auditor portfolio-truth." + ), + }, + { + "canonical_key": "supp:SecondBrain", + "display_name": "SecondBrain", + "repo_full_name": None, + "group_key": "operator_infra", + "lifecycle_state": "active", + "note": ( + "4-layer knowledge vault at /Users/d/Documents/SecondBrain " + "(engraph-indexed). Not a git repo; absent from auditor." + ), + }, +] + +# Memory slugs that are notes about a project, not their own project. +# slug -> parent canonical_key (empty string = pure meta, attach to nothing). +DEFAULT_MEMORY_META: dict[str, str] = { + "personal_ops_codebase": "supp:personal-ops", + "personal_ops_vision": "supp:personal-ops", + "github_repo_auditor_future_arcs": "GithubRepoAuditor", + "skill_library_port_2026-05": "", + "skill_eval_harness_2026-05": "", +} + +# Operator-machine source locations (overridable via the "sources" block of +# config/project-registry-overrides.json). Every source is optional. +DEFAULT_SOURCES: dict[str, str] = { + "bridge_db": "~/.local/share/bridge-db/bridge.db", + "notion_snapshot": "~/.local/share/notion-os/project-snapshot.json", + "memory_dir": "~/.claude/projects/-Users-d/memory", + "scoring_data_source_id": "35e04e4d-bcd8-45c0-b783-238edef210f7", +} + +_NON_ALNUM = re.compile(r"[^a-z0-9]") + + +def normalize(value: str | None) -> str: + """Lowercase, drop any taxonomy path prefix, strip non-alphanumerics.""" + if not value: + return "" + text = str(value) + if "/" in text: + text = text.rsplit("/", 1)[-1] + return _NON_ALNUM.sub("", text.lower()) + + +def _repo_base(repo_full_name: str | None) -> str: + return repo_full_name.rsplit("/", 1)[-1] if repo_full_name else "" + + +def _strip_alias_prefix(alias: str) -> str: + return alias.split(":", 1)[1] if ":" in alias else alias + + +def load_overrides_config( + config_path: Path | None, +) -> tuple[dict[str, str], list[dict], dict[str, str]]: + """Load overrides + supplementary + memory-meta, falling back to defaults.""" + if config_path is None or not config_path.exists(): + return ( + dict(DEFAULT_OVERRIDES), + [dict(s) for s in DEFAULT_SUPPLEMENTARY], + dict(DEFAULT_MEMORY_META), + ) + data = json.loads(config_path.read_text()) + overrides = data.get("overrides", DEFAULT_OVERRIDES) + supplementary = data.get("supplementary", DEFAULT_SUPPLEMENTARY) + memory_meta = data.get("memory_meta", DEFAULT_MEMORY_META) + return dict(overrides), [dict(s) for s in supplementary], dict(memory_meta) + + +def load_source_paths(config_path: Path | None) -> dict[str, object]: + """Resolve external-source locations, merging config over built-in defaults. + + Returns a dict with ``bridge_db``/``notion_snapshot``/``memory_dir`` as + expanded ``Path`` objects and ``scoring_data_source_id`` as a string. + """ + sources = dict(DEFAULT_SOURCES) + if config_path is not None and config_path.exists(): + try: + configured = json.loads(config_path.read_text()).get("sources", {}) + sources.update({k: v for k, v in configured.items() if v}) + except (json.JSONDecodeError, OSError): + pass + return { + "bridge_db": Path(sources["bridge_db"]).expanduser(), + "notion_snapshot": Path(sources["notion_snapshot"]).expanduser(), + "memory_dir": Path(sources["memory_dir"]).expanduser(), + "scoring_data_source_id": sources.get("scoring_data_source_id"), + } + + +def _read_bridge_names(bridge_db_path: Path | None) -> list[str]: + if bridge_db_path is None or not bridge_db_path.exists(): + return [] + try: + uri = f"file:{bridge_db_path}?mode=ro" + with sqlite3.connect(uri, uri=True) as conn: + rows = conn.execute( + "SELECT DISTINCT project_name FROM activity_log " + "UNION SELECT DISTINCT project_name FROM pending_handoffs" + ).fetchall() + return [r[0] for r in rows if r[0]] + except sqlite3.Error: + return [] + + +def _read_notion_titles(notion_snapshot_path: Path | None) -> list[str]: + if notion_snapshot_path is None or not notion_snapshot_path.exists(): + return [] + try: + data = json.loads(notion_snapshot_path.read_text()) + return [p["title"] for p in data.get("projects", []) if p.get("title")] + except (json.JSONDecodeError, OSError, KeyError): + return [] + + +def _read_notion_pageids(notion_project_map_path: Path | None) -> dict[str, str]: + if notion_project_map_path is None or not notion_project_map_path.exists(): + return {} + try: + data = json.loads(notion_project_map_path.read_text()) + return { + name: entry["localProjectId"] + for name, entry in data.items() + if isinstance(entry, dict) and entry.get("localProjectId") + } + except (json.JSONDecodeError, OSError): + return {} + + +def _read_memory_slugs(memory_dir: Path | None) -> list[str]: + if memory_dir is None or not memory_dir.exists(): + return [] + return sorted(p.name[len("project_") : -len(".md")] for p in memory_dir.glob("project_*.md")) + + +class _Entry: + """Mutable accumulator for one canonical project during the join.""" + + __slots__ = ( + "canonical_key", + "display_name", + "repo_full_name", + "group_key", + "lifecycle_state", + "source", + "note", + "matchset", + "bridge_names", + "notion_local_title", + "notion_local_page_id", + "notion_scoring_page_id", + "memory_slug", + "memory_meta", + "aliases", + ) + + def __init__(self, identity: dict, lifecycle_state: str | None, source: str, note: str | None): + self.canonical_key = identity["project_key"] + self.display_name = identity.get("display_name") or self.canonical_key + self.repo_full_name = identity.get("repo_full_name") or None + self.group_key = identity.get("group_key") + self.lifecycle_state = lifecycle_state + self.source = source + self.note = note + self.matchset = { + f + for f in ( + normalize(self.display_name), + normalize(self.canonical_key), + normalize(_repo_base(self.repo_full_name)), + ) + if f + } + self.bridge_names: list[str] = [] + self.notion_local_title: str | None = None + self.notion_local_page_id: str | None = None + self.notion_scoring_page_id: str | None = None + self.memory_slug: str | None = None + self.memory_meta: list[str] = [] + self.aliases: set[str] = set() + + def add_alias(self, prefixed: str) -> None: + if _strip_alias_prefix(prefixed) != self.display_name: + self.aliases.add(prefixed) + + def to_dict(self) -> dict: + out = { + "canonical_key": self.canonical_key, + "display_name": self.display_name, + "repo_full_name": self.repo_full_name, + "group_key": self.group_key, + "lifecycle_state": self.lifecycle_state, + "source": self.source, + "bridge_project_names": self.bridge_names, + "notion_local_title": self.notion_local_title, + "notion_local_page_id": self.notion_local_page_id, + "notion_scoring_page_id": self.notion_scoring_page_id, + "memory_slug": self.memory_slug, + "memory_meta_notes": self.memory_meta, + "aliases": sorted(self.aliases), + "coverage": { + "auditor": self.source == "auditor", + "bridge": bool(self.bridge_names), + "notion_local": bool(self.notion_local_title), + "memory": bool(self.memory_slug), + }, + } + if self.note: + out["note"] = self.note + return out + + +def build_project_registry( + snapshot: dict, + *, + bridge_db_path: Path | None = None, + notion_snapshot_path: Path | None = None, + notion_project_map_path: Path | None = None, + memory_dir: Path | None = None, + scoring_pageids: dict[str, str] | None = None, + overrides_config_path: Path | None = None, + generated_at: datetime | None = None, +) -> dict: + """Build the canonical registry from a portfolio-truth snapshot dict. + + ``snapshot`` is the serialized portfolio-truth (``snapshot.to_dict()``). + All other sources are optional and degrade gracefully. + """ + overrides, supplementary, memory_meta = load_overrides_config(overrides_config_path) + generated_at = generated_at or datetime.now(timezone.utc) + + entries: list[_Entry] = [ + _Entry(p["identity"], (p.get("declared") or {}).get("lifecycle_state"), "auditor", None) + for p in snapshot.get("projects", []) + ] + for supp in supplementary: + entries.append( + _Entry( + { + "project_key": supp["canonical_key"], + "display_name": supp.get("display_name"), + "repo_full_name": supp.get("repo_full_name"), + "group_key": supp.get("group_key"), + }, + supp.get("lifecycle_state"), + "supplementary", + supp.get("note"), + ) + ) + + by_key = {e.canonical_key: e for e in entries} + index: dict[str, _Entry] = {} + for entry in entries: + for form in entry.matchset: + index.setdefault(form, entry) + override_norm = {normalize(raw): key for raw, key in overrides.items()} + + def resolve_entry(raw: str) -> _Entry | None: + norm = normalize(raw) + if not norm: + return None + if norm in override_norm: + target = by_key.get(override_norm[norm]) + if target is not None: + return target + return index.get(norm) + + notion_orphans: list[str] = [] + for title in _read_notion_titles(notion_snapshot_path): + entry = resolve_entry(title) + if entry is not None: + entry.notion_local_title = title + entry.add_alias(f"notion:{title}") + else: + notion_orphans.append(title) + + pageid_unmatched: list[str] = [] + for name, page_id in _read_notion_pageids(notion_project_map_path).items(): + entry = resolve_entry(name) + if entry is not None: + entry.notion_local_page_id = page_id + entry.add_alias(f"notionmap:{name}") + else: + pageid_unmatched.append(name) + + for project_name, page_id in (scoring_pageids or {}).items(): + entry = resolve_entry(project_name) + if entry is not None: + entry.notion_scoring_page_id = page_id + + memory_orphans: list[dict] = [] + for slug in _read_memory_slugs(memory_dir): + if slug in memory_meta: + parent = memory_meta[slug] + if parent and parent in by_key: + by_key[parent].memory_meta.append(f"project_{slug}") + continue + if not parent: + memory_orphans.append({"slug": slug, "kind": "meta-epic-note"}) + continue + entry = resolve_entry(slug) + if entry is not None: + if entry.memory_slug is None: + entry.memory_slug = f"project_{slug}" + else: + entry.memory_meta.append(f"project_{slug}") + entry.add_alias(f"memory:{slug}") + else: + memory_orphans.append({"slug": slug, "kind": "unmatched"}) + + bridge_orphans: list[str] = [] + for name in _read_bridge_names(bridge_db_path): + entry = resolve_entry(name) + if entry is not None: + if name not in entry.bridge_names: + entry.bridge_names.append(name) + entry.add_alias(f"bridge:{name}") + else: + bridge_orphans.append(name) + + return { + "schema_version": SCHEMA_VERSION, + "generated_at": generated_at.strftime("%Y-%m-%dT%H:%M:%SZ"), + "description": ( + "Canonical cross-store project-identity registry for the operator " + "OS. Joins GithubRepoAuditor, bridge-db, Notion (Local Portfolio " + "Projects), and ~/.claude memory under one canonical key." + ), + "canonical_key": { + "primary": "GithubRepoAuditor identity.project_key (taxonomy-path-qualified)", + "secondary": "repo_full_name (saagpatel/)", + "supplementary": "supp: for operator-OS projects the auditor does not track", + }, + "entry_count": len(entries), + "resolution_overrides": overrides, + "entries": [e.to_dict() for e in entries], + "unmatched": { + "bridge": sorted(bridge_orphans), + "memory": memory_orphans, + "notion_local": sorted(notion_orphans), + "notion_pageid_map": sorted(pageid_unmatched), + }, + } + + +def build_index(registry: dict) -> dict: + """Precompute lookup structures from a built registry for resolve().""" + norm2entry: dict[str, dict] = {} + for entry in registry["entries"]: + forms = {normalize(entry["display_name"])} + if entry.get("repo_full_name"): + forms.add(normalize(_repo_base(entry["repo_full_name"]))) + if "/" in (entry.get("canonical_key") or ""): + forms.add(normalize(entry["canonical_key"])) + for alias in entry.get("aliases", []): + forms.add(normalize(_strip_alias_prefix(alias))) + for form in forms: + if form: + norm2entry.setdefault(form, entry) + override_norm = { + normalize(raw): key for raw, key in registry.get("resolution_overrides", {}).items() + } + by_key = {e["canonical_key"]: e for e in registry["entries"]} + return {"norm2entry": norm2entry, "override_norm": override_norm, "by_key": by_key} + + +def resolve(name: str, index: dict) -> dict | None: + """Map a free-form project name to its canonical entry, or None.""" + norm = normalize(name) + if not norm: + return None + if norm in index["override_norm"]: + entry = index["by_key"].get(index["override_norm"][norm]) + if entry is not None: + return { + "canonical_key": entry["canonical_key"], + "display_name": entry["display_name"], + "matched_via": "override", + } + entry = index["norm2entry"].get(norm) + if entry is not None: + return { + "canonical_key": entry["canonical_key"], + "display_name": entry["display_name"], + "matched_via": "normalized", + } + return None + + +def fetch_scoring_pageids(data_source_id: str, token: str) -> dict[str, str]: + """Read Project Name -> page_id from the Notion Project Portfolio DB. + + Uses the auditor's Notion client; paginates the data source. Returns an + empty dict on any failure so registry generation never hard-depends on it. + """ + from src.notion_client import query_notion_collection + + result: dict[str, str] = {} + cursor: str | None = None + try: + while True: + body = {"page_size": 100} + if cursor: + body["start_cursor"] = cursor + response = query_notion_collection(data_source_id, token, body=body) + if response is None or response.status_code != 200: + break + payload = response.json() + for page in payload.get("results", []): + title_prop = page.get("properties", {}).get("Project Name", {}) + segments = title_prop.get("title", []) if isinstance(title_prop, dict) else [] + name = "".join(seg.get("plain_text", "") for seg in segments).strip() + if name and page.get("id"): + result[name] = page["id"] + if not payload.get("has_more"): + break + cursor = payload.get("next_cursor") + if not cursor: + break + except Exception: + return result + return result diff --git a/tests/test_project_registry.py b/tests/test_project_registry.py new file mode 100644 index 0000000..2cfdd43 --- /dev/null +++ b/tests/test_project_registry.py @@ -0,0 +1,169 @@ +"""Tests for the canonical cross-store project-identity registry.""" + +from __future__ import annotations + +import json +import sqlite3 +from pathlib import Path + +from src.project_registry import ( + build_index, + build_project_registry, + normalize, + resolve, +) + + +def _snapshot(*identities: dict) -> dict: + return { + "projects": [ + {"identity": ident, "declared": {"lifecycle_state": "active"}} for ident in identities + ] + } + + +def _ident(project_key: str, display_name: str, repo: str | None = None) -> dict: + return { + "project_key": project_key, + "display_name": display_name, + "repo_full_name": repo, + "group_key": "test", + } + + +# A snapshot covering the tricky cases: a space-vs-camel name, the screenshot +# collision pair, a notion-os-style repo, and a Notion-orphan project. +SNAPSHOT = _snapshot( + _ident("MCPAudit", "MCPAudit", "saagpatel/MCPAudit"), + _ident("ScreenshottoDataSelect", "ScreenshottoDataSelect", "saagpatel/ScreenshottoDataSelect"), + _ident( + "ITPRJsViaClaude/ScreenshotAnnotate", "ScreenshotAnnotate", "saagpatel/ScreenshotAnnotate" + ), + _ident("JobCommandCenter", "JobCommandCenter", "saagpatel/JobCommandCenter"), + _ident( + "BrowserHistoryVisualizer", "BrowserHistoryVisualizer", "saagpatel/BrowserHistoryVisualizer" + ), + _ident("Notion", "Notion", "saagpatel/notion-operating-system"), + _ident("PortfolioCommandCenter", "PortfolioCommandCenter", "saagpatel/PortfolioCommandCenter"), +) + + +def _bridge_db(tmp_path: Path, names: list[str]) -> Path: + db_path = tmp_path / "bridge.db" + conn = sqlite3.connect(db_path) + conn.execute("CREATE TABLE activity_log (project_name TEXT)") + conn.execute("CREATE TABLE pending_handoffs (project_name TEXT)") + conn.executemany("INSERT INTO activity_log VALUES (?)", [(n,) for n in names]) + conn.commit() + conn.close() + return db_path + + +def test_normalize_strips_case_separators_and_taxonomy_path(): + assert normalize("MCP Audit") == "mcpaudit" + assert normalize("MCPAudit") == "mcpaudit" + assert normalize("ITPRJsViaClaude/SlackIncidentBot") == "slackincidentbot" + assert normalize("Devil's Advocate") == "devilsadvocate" + assert normalize(None) == "" + + +def test_build_includes_supplementary_projects_from_defaults(): + registry = build_project_registry(SNAPSHOT, overrides_config_path=None) + keys = {e["canonical_key"] for e in registry["entries"]} + assert "supp:personal-ops" in keys + assert "supp:SecondBrain" in keys + assert registry["entry_count"] == len(SNAPSHOT["projects"]) + 2 + + +def test_resolve_joins_spelling_variants(): + registry = build_project_registry(SNAPSHOT, overrides_config_path=None) + index = build_index(registry) + for spelling in ("MCPAudit", "MCP Audit", "mcpaudit", "mcp_audit"): + result = resolve(spelling, index) + assert result is not None, spelling + assert result["canonical_key"] == "MCPAudit", spelling + + +def test_resolve_hard_normalization_failures_via_override(): + registry = build_project_registry(SNAPSHOT, overrides_config_path=None) + index = build_index(registry) + assert resolve("notion_os", index)["canonical_key"] == "Notion" + assert resolve("jcc", index)["canonical_key"] == "JobCommandCenter" + assert resolve("bhv", index)["canonical_key"] == "BrowserHistoryVisualizer" + + +def test_resolve_collision_guard_screenshotselect(): + registry = build_project_registry(SNAPSHOT, overrides_config_path=None) + index = build_index(registry) + result = resolve("screenshotselect", index) + assert result["canonical_key"] == "ScreenshottoDataSelect" + assert result["canonical_key"] != "ITPRJsViaClaude/ScreenshotAnnotate" + + +def test_resolve_supplementary_from_each_spelling(): + registry = build_project_registry(SNAPSHOT, overrides_config_path=None) + index = build_index(registry) + assert resolve("personal-ops", index)["canonical_key"] == "supp:personal-ops" + assert resolve("personal_ops", index)["canonical_key"] == "supp:personal-ops" + assert resolve("Personal Ops", index)["canonical_key"] == "supp:personal-ops" + assert resolve("SecondBrain", index)["canonical_key"] == "supp:SecondBrain" + + +def test_resolve_returns_none_for_non_projects(): + registry = build_project_registry(SNAPSHOT, overrides_config_path=None) + index = build_index(registry) + for junk in ("weekly-review", "Phase 18 audit task", "app", "totally-unknown"): + assert resolve(junk, index) is None, junk + + +def test_build_degrades_gracefully_without_external_sources(): + registry = build_project_registry( + SNAPSHOT, + bridge_db_path=None, + notion_snapshot_path=None, + notion_project_map_path=None, + memory_dir=None, + overrides_config_path=None, + ) + assert registry["entry_count"] == len(SNAPSHOT["projects"]) + 2 + for entry in registry["entries"]: + assert entry["coverage"]["bridge"] is False + assert entry["coverage"]["notion_local"] is False + + +def test_build_attaches_external_sources(tmp_path: Path): + bridge = _bridge_db(tmp_path, ["MCPAudit", "PortfolioCommandCenter", "weekly-review"]) + snap = tmp_path / "snapshot.json" + snap.write_text(json.dumps({"projects": [{"title": "MCP Audit"}, {"title": "app"}]})) + memdir = tmp_path / "memory" + memdir.mkdir() + (memdir / "project_mcpaudit.md").write_text("x") + + registry = build_project_registry( + SNAPSHOT, + bridge_db_path=bridge, + notion_snapshot_path=snap, + memory_dir=memdir, + overrides_config_path=None, + ) + by_key = {e["canonical_key"]: e for e in registry["entries"]} + mcp = by_key["MCPAudit"] + assert mcp["bridge_project_names"] == ["MCPAudit"] + assert mcp["notion_local_title"] == "MCP Audit" + assert mcp["memory_slug"] == "project_mcpaudit" + # bridge noise + notion junk land in unmatched, not on an entry + assert "weekly-review" in registry["unmatched"]["bridge"] + assert "app" in registry["unmatched"]["notion_local"] + # PortfolioCommandCenter resolves from bridge but has no Notion/memory row + assert by_key["PortfolioCommandCenter"]["bridge_project_names"] == ["PortfolioCommandCenter"] + assert by_key["PortfolioCommandCenter"]["notion_local_title"] is None + + +def test_scoring_pageids_attach_to_matching_entries(): + registry = build_project_registry( + SNAPSHOT, + scoring_pageids={"MCPAudit": "page-123", "Unknown Idea": "page-999"}, + overrides_config_path=None, + ) + by_key = {e["canonical_key"]: e for e in registry["entries"]} + assert by_key["MCPAudit"]["notion_scoring_page_id"] == "page-123" From e68c9e9b26fc41cfec9715d719ab5ea9ed58cd71 Mon Sep 17 00:00:00 2001 From: saagpatel Date: Sun, 7 Jun 2026 05:55:27 -0700 Subject: [PATCH 2/2] fix(project-registry): surface normalized-key collisions instead of silent first-wins Code-review hardening: the resolver index keeps the first entry on a normalized-form clash, so a second project normalizing to the same form would silently mis-resolve. Detect clashes during index build and emit them under warnings.normalized_key_collisions (live registry has none today). Adds 2 tests. --- src/project_registry.py | 17 ++++++++++++++++- tests/test_project_registry.py | 16 ++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/project_registry.py b/src/project_registry.py index 2e4edef..2c38bc4 100644 --- a/src/project_registry.py +++ b/src/project_registry.py @@ -305,9 +305,23 @@ def build_project_registry( by_key = {e.canonical_key: e for e in entries} index: dict[str, _Entry] = {} + collisions: list[dict] = [] for entry in entries: for form in entry.matchset: - index.setdefault(form, entry) + existing = index.get(form) + if existing is None: + index[form] = entry + elif existing is not entry: + # Two distinct projects normalize to the same form: the index + # keeps the first (stable), so the second would mis-resolve. + # Surface it as a warning rather than failing silently. + collisions.append( + { + "normalized_form": form, + "kept": existing.canonical_key, + "shadowed": entry.canonical_key, + } + ) override_norm = {normalize(raw): key for raw, key in overrides.items()} def resolve_entry(raw: str) -> _Entry | None: @@ -395,6 +409,7 @@ def resolve_entry(raw: str) -> _Entry | None: "notion_local": sorted(notion_orphans), "notion_pageid_map": sorted(pageid_unmatched), }, + "warnings": {"normalized_key_collisions": collisions}, } diff --git a/tests/test_project_registry.py b/tests/test_project_registry.py index 2cfdd43..4a0b49e 100644 --- a/tests/test_project_registry.py +++ b/tests/test_project_registry.py @@ -159,6 +159,22 @@ def test_build_attaches_external_sources(tmp_path: Path): assert by_key["PortfolioCommandCenter"]["notion_local_title"] is None +def test_normalized_key_collision_is_surfaced_not_silent(): + # Two distinct projects whose display names normalize to the same form. + colliding = _snapshot( + _ident("NetMapper", "Net Mapper", "saagpatel/NetMapper"), + _ident("NetworkMapperAlt", "NetMapper", "saagpatel/NetworkMapperAlt"), + ) + registry = build_project_registry(colliding, overrides_config_path=None) + collisions = registry["warnings"]["normalized_key_collisions"] + assert any(c["normalized_form"] == "netmapper" for c in collisions) + + +def test_real_snapshot_shape_has_no_collisions_block_when_clean(): + registry = build_project_registry(SNAPSHOT, overrides_config_path=None) + assert registry["warnings"]["normalized_key_collisions"] == [] + + def test_scoring_pageids_attach_to_matching_entries(): registry = build_project_registry( SNAPSHOT,