-
Notifications
You must be signed in to change notification settings - Fork 45
feat(experts): kb.experts — rank entities by evidence density on a topic (closes #315) #347
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
e11734937-beep
wants to merge
2
commits into
vouchdev:main
Choose a base branch
from
e11734937-beep:feat/vouch-experts
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+330
−0
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| """kb.experts - rank entities by evidence density on a topic (issue #315). | ||
|
|
||
| Read-only aggregation over approved, live claims. Given a free-text topic, | ||
| return the entities carrying the most matched evidence, ranked by one of three | ||
| weightings (count / recency / citation). It never proposes, writes, or mutates | ||
| anything, makes no network or LLM call, and reads only claims already past the | ||
| review gate - so the review gate is untouched by construction. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from datetime import datetime | ||
| from typing import Any | ||
|
|
||
| from . import index_db | ||
| from .models import Claim, ClaimStatus, utcnow | ||
| from .salience import _substring_entity_ids | ||
| from .storage import KBStore | ||
|
|
||
# Claims in any of these statuses are skipped before scoring: superseded,
# archived or redacted claims are not live evidence and must never inflate an
# entity ranking (consistent with issue #78).
_EXCLUDED_STATUSES = frozenset(
    (ClaimStatus.SUPERSEDED, ClaimStatus.ARCHIVED, ClaimStatus.REDACTED)
)
# Names of the weightings the ranking accepts.
_VALID_WEIGHTS = frozenset(("count", "recency", "citation"))
# Half-life, in days, of the exponential decay used by the "recency" weighting.
_RECENCY_HALF_LIFE_DAYS = 30.0
|
|
||
|
|
||
def _claim_weight(
    claim: Claim,
    weight: str,
    now: datetime,
    *,
    half_life_days: float | None = None,
) -> float:
    """Return one claim's contribution to an entity score.

    Args:
        claim: Claim being scored. ``recency`` reads ``last_confirmed_at`` and
            falls back to ``updated_at``; ``citation`` reads ``evidence`` and
            ``confidence``.
        weight: ``"recency"`` or ``"citation"``; any other value is scored as
            ``count``.
        now: Reference instant used to compute the claim's age.
        half_life_days: Half-life of the recency decay. ``None`` or a
            non-positive value selects ``_RECENCY_HALF_LIFE_DAYS``.

    Returns:
        ``recency``: ``2 ** (-age_days / half_life)``, where a timestamp in the
        future counts as age zero and a claim with no timestamp scores ``0.0``.
        ``citation``: number of distinct evidence ids times the confidence.
        Otherwise ``1.0``.
    """
    if weight == "recency":
        # Function-scope import: the module itself only imports ``datetime``.
        from datetime import timezone

        ts = claim.last_confirmed_at or claim.updated_at
        if ts is None:
            # No timestamp means no recency signal; subtracting None from
            # ``now`` would otherwise abort the whole ranking with TypeError.
            return 0.0
        # Subtracting a naive datetime from an aware one raises TypeError, so
        # bring both sides to aware values first.
        # NOTE(review): naive values are assumed to be UTC - confirm against
        # how the store serialises timestamps.
        if ts.utcoffset() is None:
            ts = ts.replace(tzinfo=timezone.utc)
        if now.utcoffset() is None:
            now = now.replace(tzinfo=timezone.utc)
        if half_life_days is None or half_life_days <= 0:
            half_life_days = _RECENCY_HALF_LIFE_DAYS
        age_days = max(0.0, (now - ts).total_seconds() / 86400.0)
        return 2.0 ** (-age_days / half_life_days)
    if weight == "citation":
        # set() so a source cited twice by the same claim counts once.
        return float(len(set(claim.evidence))) * float(claim.confidence)
    return 1.0  # count
|
|
||
|
|
||
def rank_experts(
    store: KBStore,
    topic: str,
    *,
    limit: int = 10,
    min_claims: int = 1,
    weight: str = "count",
) -> list[dict[str, Any]]:
    """Return entities ranked by evidence density on ``topic``.

    A claim is a candidate when it is a full-text hit for ``topic`` or when it
    references an entity that ``_substring_entity_ids`` selects for the topic.
    Claims whose status is in ``_EXCLUDED_STATUSES`` are ignored. Each known
    entity referenced by a candidate claim is credited once per claim, even if
    the claim lists that entity more than once.

    Args:
        store: Knowledge base to read entities and claims from.
        topic: Free-text topic to match.
        limit: Maximum number of rows returned; a negative value is treated
            as zero.
        min_claims: Entities credited with fewer matched claims are dropped.
        weight: One of ``count`` | ``recency`` | ``citation``. An unknown
            value falls back to ``count`` (never raises), matching the
            defensive-config style used elsewhere.

    Returns:
        One dict per entity with the keys ``entity_id``, ``name``, ``type``,
        ``claim_count``, ``citation_count``, ``score`` and ``top_claim_ids``
        (at most three claim ids, strongest contribution first). Rows are
        ordered by descending score, then descending claim count, with a
        stable tie-break on ``entity_id``.
    """
    if weight not in _VALID_WEIGHTS:
        weight = "count"
    # A negative limit would make the final slice drop rows from the tail
    # instead of capping the result.
    limit = max(limit, 0)

    entities = store.list_entities()
    by_id = {ent.id: ent for ent in entities}
    topic_entity_ids = set(_substring_entity_ids(entities, topic))

    # Candidate claims: FTS hits on the topic, plus every claim that references
    # an entity whose name/alias matches the topic.
    fetch = max(limit * 5, 50)
    fts_claim_ids = {
        cid
        for kind, cid, _snip, _score in index_db.search(store.kb_dir, topic, limit=fetch)
        if kind == "claim"
    }

    now = utcnow()
    counts: dict[str, int] = {}
    citations: dict[str, set[str]] = {}
    scores: dict[str, float] = {}
    top_claims: dict[str, list[tuple[float, str]]] = {}

    for claim in store.list_claims():
        if claim.status in _EXCLUDED_STATUSES:
            continue
        # De-duplicate while keeping the claim's own entity order, so a
        # repeated reference cannot credit the same claim twice.
        claim_entities = dict.fromkeys(claim.entities)
        if claim.id not in fts_claim_ids and topic_entity_ids.isdisjoint(
            claim_entities
        ):
            continue
        contrib = _claim_weight(claim, weight, now)
        for eid in claim_entities:
            if eid not in by_id:
                continue  # dangling reference - skip (graph gate should prevent)
            counts[eid] = counts.get(eid, 0) + 1
            citations.setdefault(eid, set()).update(claim.evidence)
            scores[eid] = scores.get(eid, 0.0) + contrib
            top_claims.setdefault(eid, []).append((contrib, claim.id))

    rows: list[dict[str, Any]] = []
    for eid, count in counts.items():
        if count < min_claims:
            continue
        ent = by_id[eid]
        # Strongest contribution first; claim id breaks ties deterministically.
        ranked = sorted(top_claims[eid], key=lambda item: (-item[0], item[1]))
        rows.append(
            {
                "entity_id": eid,
                "name": ent.name,
                "type": str(ent.type),
                "claim_count": count,
                "citation_count": len(citations[eid]),
                "score": round(scores[eid], 6),
                "top_claim_ids": [cid for _w, cid in ranked[:3]],
            }
        )

    rows.sort(key=lambda row: (-row["score"], -row["claim_count"], row["entity_id"]))
    return rows[:limit]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,151 @@ | ||
| """kb.experts - rank entities by evidence density on a topic (issue #315). | ||
|
|
||
| Read-only: aggregates approved, live claims and returns a ranking. These tests | ||
| seed entities + claims directly and match on the topic via the entity | ||
| name/alias substring pass, so they exercise the ranking without depending on | ||
| the FTS index being populated. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from pathlib import Path | ||
|
|
||
| import pytest | ||
|
|
||
| from vouch.experts import rank_experts | ||
| from vouch.jsonl_server import handle_request | ||
| from vouch.models import Claim, ClaimStatus, Entity, EntityType | ||
| from vouch.storage import KBStore | ||
|
|
||
|
|
||
@pytest.fixture
def store(tmp_path: Path) -> KBStore:
    """Provide a KB store initialised under pytest's ``tmp_path`` directory."""
    kb = KBStore.init(tmp_path)
    return kb
|
|
||
|
|
||
def _seed(store: KBStore) -> None:
    """Populate ``store`` with three entities and three JWT claims.

    ``alice`` appears on two claims (``c2`` cites two distinct sources) and
    ``bob`` on one; all three claims also reference the ``jwt`` entity.
    """
    first_src = store.put_source(b"evidence-bytes")
    second_src = store.put_source(b"other-evidence")

    entity_specs = (
        ("jwt", "JWT", EntityType.CONCEPT),
        ("alice", "alice", EntityType.PERSON),
        ("bob", "bob", EntityType.PERSON),
    )
    for ent_id, ent_name, ent_type in entity_specs:
        store.put_entity(Entity(id=ent_id, name=ent_name, type=ent_type))

    claim_specs = (
        ("c1", "jwt auth by alice", [first_src.id], ["jwt", "alice"]),
        ("c2", "jwt rotation by alice", [first_src.id, second_src.id], ["jwt", "alice"]),
        ("c3", "jwt review by bob", [first_src.id], ["jwt", "bob"]),
    )
    for claim_id, text, evidence, entity_ids in claim_specs:
        store.put_claim(
            Claim(id=claim_id, text=text, evidence=evidence, entities=entity_ids)
        )
|
|
||
|
|
||
def test_ranks_by_claim_count(store: KBStore) -> None:
    """Under ``count`` weighting, more matched claims means a higher rank."""
    _seed(store)
    ranking = rank_experts(store, "JWT", weight="count")
    ordered_names = [entry["name"] for entry in ranking]

    # The JWT entity is on all three seeded claims, so it leads.
    assert ordered_names[0] == "JWT"

    alice_pos = ordered_names.index("alice")
    bob_pos = ordered_names.index("bob")
    assert alice_pos < bob_pos  # 2 claims vs 1

    alice_row = ranking[alice_pos]
    assert alice_row["claim_count"] == 2
|
|
||
|
|
||
def test_min_claims_and_limit(store: KBStore) -> None:
    """``min_claims`` drops thin entities and ``limit`` caps the result."""
    _seed(store)

    frequent = rank_experts(store, "JWT", min_claims=2)
    frequent_names = {entry["name"] for entry in frequent}
    assert "bob" not in frequent_names  # bob has only 1 claim

    top_only = rank_experts(store, "JWT", limit=1)
    assert top_only[0]["name"] == "JWT"
|
|
||
|
|
||
def test_citation_weight_rewards_source_breadth(store: KBStore) -> None:
    """``citation`` weighting scores an entity by the sources its claims cite."""
    _seed(store)
    rows = rank_experts(store, "JWT", weight="citation")
    by_name = {r["name"]: r for r in rows}
    alice = by_name["alice"]
    bob = by_name["bob"]
    assert alice["citation_count"] == 2  # c2 cites two distinct sources

    # citation_count is independent of the weighting, so also pin the score:
    # alice's claims cite 1 + 2 sources and bob's single claim cites 1. No
    # seeded claim sets a confidence, so all share the model default and the
    # scores must be in a 3:1 ratio (under ``count`` it would be 2:1).
    # NOTE(review): this is vacuous if the default confidence is 0 - confirm.
    assert alice["score"] == pytest.approx(3 * bob["score"])
|
|
||
|
|
||
def test_excludes_superseded_archived_redacted(store: KBStore) -> None:
    """Only the live claim is counted; dead-status claims are ignored."""
    source = store.put_source(b"x")
    store.put_entity(Entity(id="e", name="ghost", type=EntityType.CONCEPT))

    live_claim = Claim(
        id="live",
        text="ghost live",
        evidence=[source.id],
        entities=["e"],
        status=ClaimStatus.STABLE,
    )
    store.put_claim(live_claim)

    dead_statuses = (
        ClaimStatus.SUPERSEDED,
        ClaimStatus.ARCHIVED,
        ClaimStatus.REDACTED,
    )
    for idx, status in enumerate(dead_statuses):
        dead_claim = Claim(
            id=f"dead{idx}",
            text="ghost dead",
            evidence=[source.id],
            entities=["e"],
            status=status,
        )
        store.put_claim(dead_claim)

    ranking = rank_experts(store, "ghost")
    ghost = next(entry for entry in ranking if entry["name"] == "ghost")
    assert ghost["claim_count"] == 1  # only the live claim scored
|
|
||
|
|
||
def test_unknown_weight_falls_back_to_count(store: KBStore) -> None:
    """An unrecognised ``weight`` yields the same order as ``count``."""
    _seed(store)

    def ranked_ids(weight_name: str) -> list[str]:
        ranking = rank_experts(store, "JWT", weight=weight_name)
        return [entry["entity_id"] for entry in ranking]

    assert ranked_ids("nonsense") == ranked_ids("count")
|
|
||
|
|
||
def test_empty_kb_and_no_match(store: KBStore) -> None:
    """An empty KB and an unmatched topic both produce an empty ranking."""
    before_seeding = rank_experts(store, "anything")
    assert before_seeding == []

    _seed(store)
    unmatched = rank_experts(store, "no-such-topic-xyz")
    assert unmatched == []
|
|
||
|
|
||
def test_deterministic_tie_break_on_entity_id(store: KBStore) -> None:
    """Entities with equal score and claim count order by ascending id."""
    src = store.put_source(b"y")
    store.put_entity(Entity(id="t", name="topic-x", type=EntityType.CONCEPT))
    store.put_entity(Entity(id="a2", name="a2", type=EntityType.PERSON))
    store.put_entity(Entity(id="a1", name="a1", type=EntityType.PERSON))
    # The first claim (by insertion and by id) references a2, so a2 is met
    # before a1 during aggregation. A stable sort without the entity_id
    # tie-break would then return ["a2", "a1"], which lets this test detect
    # the tie-break being removed.
    store.put_claim(
        Claim(id="k1", text="topic-x one", evidence=[src.id], entities=["t", "a2"])
    )
    store.put_claim(
        Claim(id="k2", text="topic-x two", evidence=[src.id], entities=["t", "a1"])
    )
    ranked = rank_experts(store, "topic-x")
    tied = [r["entity_id"] for r in ranked if r["entity_id"] in {"a1", "a2"}]
    assert tied == ["a1", "a2"]  # equal score -> ascending entity_id
|
|
||
|
|
||
def test_jsonl_experts_envelope_success(store: KBStore, monkeypatch) -> None:
    """A well-formed ``kb.experts`` request returns the success envelope.

    The envelope is ``{id, ok, result}`` and the ranking sits under
    ``result["experts"]``.
    """
    _seed(store)
    monkeypatch.chdir(store.root)

    request = {"id": "e1", "method": "kb.experts", "params": {"topic": "JWT"}}
    response = handle_request(request)

    assert response["id"] == "e1"
    assert response["ok"] is True
    expert_names = [entry["name"] for entry in response["result"]["experts"]]
    assert "alice" in expert_names
|
|
||
|
|
||
def test_jsonl_experts_envelope_missing_topic_errors(store: KBStore, monkeypatch) -> None:
    """Omitting ``topic`` yields the failure envelope instead of raising.

    The envelope is ``{id, ok: false, error}`` with error code
    ``missing_param``.
    """
    monkeypatch.chdir(store.root)

    request = {"id": "e2", "method": "kb.experts", "params": {}}
    response = handle_request(request)

    assert response["id"] == "e2"
    assert response["ok"] is False
    error = response["error"]
    assert error["code"] == "missing_param"
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.