diff --git a/benchmark/miner/README.md b/benchmark/miner/README.md index f697b9a..2d83e50 100644 --- a/benchmark/miner/README.md +++ b/benchmark/miner/README.md @@ -87,6 +87,39 @@ python -m benchmark.miner evaluate \ | [`2026-W24-mined.csv`](results/2026-W24-mined.csv) | 2026-06-12 | stripe/ai, openai/openai-agents-python, crewAIInc/crewAI-examples | 121 (latest 40 merged PRs each + stripe/ai#232) | Schema v0.2 (re-run with baseline-gated `verify_*` receipts; supersedes the v0.1 artifact in place). Findings below. | | [`2026-W25-mined.csv`](results/2026-W25-mined.csv) | 2026-06-12 | google/adk-samples, langchain-ai/langgraph, modelcontextprotocol/servers | 120 (latest 40 merged PRs each) | Widen run over 3 new framework families. Schema v0.2. Findings below. | +## Constructed-adversarial accuracy — the blocked-recall proof + +Real merged PRs almost never contain a `must_block` capability change (W25: +9 decided / 241), so the accuracy benchmark's **positives** come from the +repo's bundled fixtures, each built to be a specific case. The labels are each +fixture's **documented design intent** — external ground truth, not a post-hoc +opinion about the engine's output — so scoring the engine's verdict against +them is *non-circular*. This is the moat claim, measured: the gate blocks what +is known-unsafe and does not escalate what is known-safe. + +Corpus: [`results/constructed.jsonl`](results/constructed.jsonl) + +[`results/constructed.labels.csv`](results/constructed.labels.csv). Regenerate +with `python -m benchmark.miner constructed --out … --labels-out …`; score with +`python -m benchmark.miner score`. + +| label \ verdict | allow | review | insufficient_evidence | block | +|---|---|---|---|---| +| `safe_to_merge` | 2 | 0 | 0 | 0 | +| `needs_human` | 0 | 1 | 1 | 0 | +| `must_block` | 0 | 0 | 0 | 3 | + +| Metric | Value | Reading | +|---|---|---| +| `blocked_recall` | **1.0** (3/3) | every known-unsafe fixture is blocked | +| `benign_escalation_rate` | **0.0** (0/2) | no known-safe fixture is escalated | +| `needs_human_caught` | **1.0** (2/2) | both review-needed cases are routed to a human (review / insufficient_evidence), never auto-passed | + +The live engine is re-run against these fixtures in CI +(`tests/test_miner_constructed.py`), so a change that regresses a blocked +verdict fails there rather than silently in the data file. The mined runs below +supply the complementary halves — the **negative control** (the 226 +trigger-skips) and the real-history **extraction-coverage** (`insufficient_evidence`) rate. + ### 2026-W25 findings — diminishing returns from framework-core breadth - **The base rate of capability-changing merged PRs is low, and now quantified.** diff --git a/benchmark/miner/__main__.py b/benchmark/miner/__main__.py index f7d95ce..fc7786e 100644 --- a/benchmark/miner/__main__.py +++ b/benchmark/miner/__main__.py @@ -114,6 +114,29 @@ def _cmd_evaluate(args: argparse.Namespace) -> int: return 0 +def _cmd_constructed(args: argparse.Namespace) -> int: + import csv + + from benchmark.miner.constructed import build_constructed_corpus + + rows, labels, rationales = build_constructed_corpus() + failed = [r.pr_url for r in rows if r.status != "evaluated"] + write_jsonl(rows, Path(args.out)) + write_csv(rows, Path(args.out).with_suffix(".csv")) + labels_path = Path(args.labels_out) + labels_path.parent.mkdir(parents=True, exist_ok=True) + with labels_path.open("w", encoding="utf-8", newline="") as handle: + writer = csv.writer(handle, lineterminator="\n") + writer.writerow(["pr_url", "label", "rationale"]) + for row in rows: + writer.writerow([row.pr_url, labels[row.pr_url], rationales[row.pr_url]]) + print(f"[miner] wrote {len(rows)} constructed rows → {args.out} (+ labels → {labels_path})") + if failed: + print(f"[miner] ERROR: {len(failed)} fixture(s) did not evaluate: {failed}", file=sys.stderr) + return 1 + return 0 + + def _cmd_labels(args: argparse.Namespace) -> int: from benchmark.miner.labels import build_worksheet, write_worksheet @@ -187,6 +210,14 @@ def main(argv: list[str] | None = None) -> int: evaluate.add_argument("--force-run", action="store_true") evaluate.set_defaults(func=_cmd_evaluate) + constructed = sub.add_parser( + "constructed", + help="Build the constructed-adversarial corpus from bundled fixtures (definitional labels).", + ) + constructed.add_argument("--out", required=True, help="constructed corpus .jsonl to write.") + constructed.add_argument("--labels-out", required=True, help="definitional labels CSV to write.") + constructed.set_defaults(func=_cmd_constructed) + labels = sub.add_parser( "labels", help="Generate a blank labeling worksheet from a results JSONL." ) diff --git a/benchmark/miner/constructed.py b/benchmark/miner/constructed.py new file mode 100644 index 0000000..68882fa --- /dev/null +++ b/benchmark/miner/constructed.py @@ -0,0 +1,140 @@ +"""Constructed-adversarial accuracy corpus from the bundled fixtures. + +The mined corpus (:mod:`benchmark.miner`) supplies real-history *negatives* and +the extraction-coverage (`insufficient_evidence`) cases — but the 2026-W25 +widen-run showed that real merged PRs almost never contain a `must_block` +capability change. The *positives* come from here. + +Each bundled fixture was built to be a specific safe / needs-review / unsafe +case, so its label is the fixture's **documented design intent** — external +ground truth, not a post-hoc opinion about what the engine returned. Scoring +the engine's verdict against those definitional labels is therefore a +*non-circular* blocked-recall / benign-escalation measurement: the moat claim +("the gate blocks what is known-unsafe, and doesn't escalate what is +known-safe"), measured. + +Network-free: runs ``agents-shipgate fixture run --out `` and reads +the written ``report.json`` / ``verifier.json``. Reuses +:mod:`benchmark.miner.rows` + :mod:`benchmark.miner.labels` so the constructed +corpus scores through the very same confusion-matrix code as the mined corpus. +""" + +from __future__ import annotations + +import json +import subprocess +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path + +from benchmark.miner.evaluate import cli_env +from benchmark.miner.rows import STATUS_ERROR, STATUS_EVALUATED, MinedRow + + +@dataclass(frozen=True) +class ConstructedCase: + fixture: str + label: str # one of benchmark.miner.labels.LABELS — the design intent + rationale: str + + +# The curated set. Labels are definitional (each fixture's documented purpose), +# so this mapping is the ground truth — independent of the verdict the engine +# produces, which is exactly what makes the resulting score non-circular. +CONSTRUCTED_CASES: tuple[ConstructedCase, ...] = ( + ConstructedCase( + "ai_generated_refund_pr", + "must_block", + "homepage story: adds stripe.create_refund with no approval policy and no idempotency evidence", + ), + ConstructedCase( + "agent_weakens_gate", + "must_block", + "trust-root: the coding agent deletes the Shipgate CI gate to make its PR self-merge", + ), + ConstructedCase( + "support_refund_agent", + "must_block", + "refund tool missing approval policy and idempotency evidence (two criticals by design)", + ), + ConstructedCase( + "clean_read_only_agent", + "safe_to_merge", + "read-only tool surface; no risky actions by design", + ), + ConstructedCase( + "hitl_evidence_covered_agent", + "safe_to_merge", + "refund domain with the approval/idempotency evidence a reviewer expects already covered", + ), + ConstructedCase( + "hitl_evidence_agent", + "needs_human", + "authority-bearing refund surface a reviewer must sign off (human-in-the-loop evidence expected)", + ), + ConstructedCase( + "openai_agents_sdk_agent", + "needs_human", + "dynamic OpenAI Agents SDK toolset; static extraction can't resolve the full surface (coverage gap)", + ), +) + +_RUN_TIMEOUT = 180 + + +def evaluate_constructed_case(case: ConstructedCase) -> MinedRow: + """Run one bundled fixture and record its verdict as a score-able row.""" + + row = MinedRow( + repo=f"fixture/{case.fixture}", + pr_number=0, + pr_url=f"fixture://{case.fixture}", + title=case.fixture, + merged_at="", + base_sha="", + head_sha="", + status=STATUS_ERROR, + ) + try: + with tempfile.TemporaryDirectory(prefix="shipgate-constructed-") as tmp: + out = Path(tmp) / "out" + result = subprocess.run( + [sys.executable, "-m", "agents_shipgate", "fixture", "run", case.fixture, "--out", str(out)], + capture_output=True, + text=True, + timeout=_RUN_TIMEOUT, + env=cli_env(), + check=False, + ) + report = out / "report.json" + if not report.is_file(): + row.notes = f"no report.json (exit {result.returncode})" + return row + data = json.loads(report.read_text(encoding="utf-8")) + row.head_decision = str((data.get("release_decision") or {}).get("decision") or "") + verifier = out / "verifier.json" + if verifier.is_file(): + vdata = json.loads(verifier.read_text(encoding="utf-8")) + row.verify_verdict = str(vdata.get("merge_verdict") or "") + can_merge = vdata.get("can_merge_without_human") + row.verify_can_merge = can_merge if isinstance(can_merge, bool) else None + row.status = STATUS_EVALUATED + return row + except Exception as exc: # noqa: BLE001 - one fixture must not abort the set. + row.notes = f"exception:{type(exc).__name__}:{exc}" + return row + + +def build_constructed_corpus() -> tuple[list[MinedRow], dict[str, str], dict[str, str]]: + """Return (rows, labels, rationales) for the whole constructed set.""" + + rows: list[MinedRow] = [] + labels: dict[str, str] = {} + rationales: dict[str, str] = {} + for case in CONSTRUCTED_CASES: + row = evaluate_constructed_case(case) + rows.append(row) + labels[row.pr_url] = case.label + rationales[row.pr_url] = case.rationale + return rows, labels, rationales diff --git a/benchmark/miner/evaluate.py b/benchmark/miner/evaluate.py index 562c39b..106d8ae 100644 --- a/benchmark/miner/evaluate.py +++ b/benchmark/miner/evaluate.py @@ -40,17 +40,49 @@ _GIT_TIMEOUT = 120 _CLI_TIMEOUT = 420 +# /src — the checkout this miner lives in. evaluate.py is at +# /benchmark/miner/evaluate.py, so parents[2] is the repo root. +_REPO_SRC = Path(__file__).resolve().parents[2] / "src" def shipgate_cmd() -> list[str]: return [sys.executable, "-m", "agents_shipgate"] -def _cli_env() -> dict[str, str]: +def _ensure_repo_src_on_path() -> None: + """Make in-process ``import agents_shipgate`` resolve to THIS checkout. + + ``cli_env`` only fixes child subprocesses; ``evaluate_pr`` imports + ``agents_shipgate.triggers`` in the PARENT to decide run/skip. Run by hand + against an older installed copy (or an editable ``.pth`` for a different + checkout), the parent would otherwise compute trigger decisions from a + stale catalog. Idempotent, and a no-op under pytest where conftest already + front-loads src. Must run before the first agents_shipgate import. + """ + + src = str(_REPO_SRC) + if src not in sys.path: + sys.path.insert(0, src) + + +def cli_env() -> dict[str, str]: + """Environment for child ``python -m agents_shipgate`` runs. + + Two pins, both load-bearing: + + - ``AGENTS_SHIPGATE_AGENT_MODE=0`` so stdout shapes don't flip when the + miner itself runs inside a coding-agent shell (``CLAUDECODE=1``). + - the checkout's ``src/`` prepended to ``PYTHONPATH`` so the child imports + THIS source tree, never a stale ``agents-shipgate`` wheel on the machine. + Without it, the documented ``python -m benchmark.miner …`` commands are + only hermetic under pytest (conftest sets ``PYTHONPATH``); run by hand + against an older installed copy they silently resolve to it. + """ + env = dict(os.environ) - # Pin agent-mode off so stdout shapes don't flip when the miner itself - # runs inside a coding-agent shell (CLAUDECODE=1 auto-detection). env["AGENTS_SHIPGATE_AGENT_MODE"] = "0" + existing = env.get("PYTHONPATH", "") + env["PYTHONPATH"] = str(_REPO_SRC) + (os.pathsep + existing if existing else "") return env @@ -66,7 +98,7 @@ def _run( capture_output=True, text=True, timeout=timeout, - env=_cli_env(), + env=cli_env(), check=False, ) @@ -130,6 +162,7 @@ def _evaluate(row: MinedRow, *, repo_path: Path, force_run: bool) -> MinedRow: _git(repo_path, ["cat-file", "-e", f"{row.head_sha}:shipgate.yaml"]).returncode == 0 ) + _ensure_repo_src_on_path() from agents_shipgate.triggers import evaluate as evaluate_trigger trigger = evaluate_trigger( diff --git a/benchmark/miner/results/constructed.csv b/benchmark/miner/results/constructed.csv new file mode 100644 index 0000000..1c8b172 --- /dev/null +++ b/benchmark/miner/results/constructed.csv @@ -0,0 +1,8 @@ +repo,pr_number,pr_url,title,merged_at,base_sha,head_sha,files_changed,trigger_run,trigger_rationale,check_decision,check_rule_ids,init_status,head_decision,head_blockers,head_review_items,evidence_gaps,tools_scanned,cap_added,cap_removed,cap_changed,cap_broadened,verify_verdict,verify_decision,verify_can_merge,verify_trust_root_touched,verify_policy_weakened,verify_cap_added,verify_cap_modified,verify_cap_removed,status,notes,schema_version +fixture/ai_generated_refund_pr,0,fixture://ai_generated_refund_pr,ai_generated_refund_pr,,,,0,False,,,,,blocked,,,,,,,,,blocked,,False,,,,,,evaluated,,0.2 +fixture/agent_weakens_gate,0,fixture://agent_weakens_gate,agent_weakens_gate,,,,0,False,,,,,blocked,,,,,,,,,blocked,,False,,,,,,evaluated,,0.2 +fixture/support_refund_agent,0,fixture://support_refund_agent,support_refund_agent,,,,0,False,,,,,blocked,,,,,,,,,,,,,,,,,evaluated,,0.2 +fixture/clean_read_only_agent,0,fixture://clean_read_only_agent,clean_read_only_agent,,,,0,False,,,,,passed,,,,,,,,,,,,,,,,,evaluated,,0.2 +fixture/hitl_evidence_covered_agent,0,fixture://hitl_evidence_covered_agent,hitl_evidence_covered_agent,,,,0,False,,,,,passed,,,,,,,,,,,,,,,,,evaluated,,0.2 +fixture/hitl_evidence_agent,0,fixture://hitl_evidence_agent,hitl_evidence_agent,,,,0,False,,,,,review_required,,,,,,,,,,,,,,,,,evaluated,,0.2 +fixture/openai_agents_sdk_agent,0,fixture://openai_agents_sdk_agent,openai_agents_sdk_agent,,,,0,False,,,,,insufficient_evidence,,,,,,,,,,,,,,,,,evaluated,,0.2 diff --git a/benchmark/miner/results/constructed.jsonl b/benchmark/miner/results/constructed.jsonl new file mode 100644 index 0000000..ff81486 --- /dev/null +++ b/benchmark/miner/results/constructed.jsonl @@ -0,0 +1,7 @@ +{"base_sha": "", "cap_added": null, "cap_broadened": null, "cap_changed": null, "cap_removed": null, "check_decision": "", "check_rule_ids": "", "evidence_gaps": null, "files_changed": 0, "head_blockers": null, "head_decision": "blocked", "head_review_items": null, "head_sha": "", "init_status": "", "merged_at": "", "notes": "", "pr_number": 0, "pr_url": "fixture://ai_generated_refund_pr", "repo": "fixture/ai_generated_refund_pr", "schema_version": "0.2", "status": "evaluated", "title": "ai_generated_refund_pr", "tools_scanned": null, "trigger_rationale": "", "trigger_run": false, "verify_can_merge": false, "verify_cap_added": null, "verify_cap_modified": null, "verify_cap_removed": null, "verify_decision": "", "verify_policy_weakened": null, "verify_trust_root_touched": null, "verify_verdict": "blocked"} +{"base_sha": "", "cap_added": null, "cap_broadened": null, "cap_changed": null, "cap_removed": null, "check_decision": "", "check_rule_ids": "", "evidence_gaps": null, "files_changed": 0, "head_blockers": null, "head_decision": "blocked", "head_review_items": null, "head_sha": "", "init_status": "", "merged_at": "", "notes": "", "pr_number": 0, "pr_url": "fixture://agent_weakens_gate", "repo": "fixture/agent_weakens_gate", "schema_version": "0.2", "status": "evaluated", "title": "agent_weakens_gate", "tools_scanned": null, "trigger_rationale": "", "trigger_run": false, "verify_can_merge": false, "verify_cap_added": null, "verify_cap_modified": null, "verify_cap_removed": null, "verify_decision": "", "verify_policy_weakened": null, "verify_trust_root_touched": null, "verify_verdict": "blocked"} +{"base_sha": "", "cap_added": null, "cap_broadened": null, "cap_changed": null, "cap_removed": null, "check_decision": "", "check_rule_ids": "", "evidence_gaps": null, "files_changed": 0, "head_blockers": null, "head_decision": "blocked", "head_review_items": null, "head_sha": "", "init_status": "", "merged_at": "", "notes": "", "pr_number": 0, "pr_url": "fixture://support_refund_agent", "repo": "fixture/support_refund_agent", "schema_version": "0.2", "status": "evaluated", "title": "support_refund_agent", "tools_scanned": null, "trigger_rationale": "", "trigger_run": false, "verify_can_merge": null, "verify_cap_added": null, "verify_cap_modified": null, "verify_cap_removed": null, "verify_decision": "", "verify_policy_weakened": null, "verify_trust_root_touched": null, "verify_verdict": ""} +{"base_sha": "", "cap_added": null, "cap_broadened": null, "cap_changed": null, "cap_removed": null, "check_decision": "", "check_rule_ids": "", "evidence_gaps": null, "files_changed": 0, "head_blockers": null, "head_decision": "passed", "head_review_items": null, "head_sha": "", "init_status": "", "merged_at": "", "notes": "", "pr_number": 0, "pr_url": "fixture://clean_read_only_agent", "repo": "fixture/clean_read_only_agent", "schema_version": "0.2", "status": "evaluated", "title": "clean_read_only_agent", "tools_scanned": null, "trigger_rationale": "", "trigger_run": false, "verify_can_merge": null, "verify_cap_added": null, "verify_cap_modified": null, "verify_cap_removed": null, "verify_decision": "", "verify_policy_weakened": null, "verify_trust_root_touched": null, "verify_verdict": ""} +{"base_sha": "", "cap_added": null, "cap_broadened": null, "cap_changed": null, "cap_removed": null, "check_decision": "", "check_rule_ids": "", "evidence_gaps": null, "files_changed": 0, "head_blockers": null, "head_decision": "passed", "head_review_items": null, "head_sha": "", "init_status": "", "merged_at": "", "notes": "", "pr_number": 0, "pr_url": "fixture://hitl_evidence_covered_agent", "repo": "fixture/hitl_evidence_covered_agent", "schema_version": "0.2", "status": "evaluated", "title": "hitl_evidence_covered_agent", "tools_scanned": null, "trigger_rationale": "", "trigger_run": false, "verify_can_merge": null, "verify_cap_added": null, "verify_cap_modified": null, "verify_cap_removed": null, "verify_decision": "", "verify_policy_weakened": null, "verify_trust_root_touched": null, "verify_verdict": ""} +{"base_sha": "", "cap_added": null, "cap_broadened": null, "cap_changed": null, "cap_removed": null, "check_decision": "", "check_rule_ids": "", "evidence_gaps": null, "files_changed": 0, "head_blockers": null, "head_decision": "review_required", "head_review_items": null, "head_sha": "", "init_status": "", "merged_at": "", "notes": "", "pr_number": 0, "pr_url": "fixture://hitl_evidence_agent", "repo": "fixture/hitl_evidence_agent", "schema_version": "0.2", "status": "evaluated", "title": "hitl_evidence_agent", "tools_scanned": null, "trigger_rationale": "", "trigger_run": false, "verify_can_merge": null, "verify_cap_added": null, "verify_cap_modified": null, "verify_cap_removed": null, "verify_decision": "", "verify_policy_weakened": null, "verify_trust_root_touched": null, "verify_verdict": ""} +{"base_sha": "", "cap_added": null, "cap_broadened": null, "cap_changed": null, "cap_removed": null, "check_decision": "", "check_rule_ids": "", "evidence_gaps": null, "files_changed": 0, "head_blockers": null, "head_decision": "insufficient_evidence", "head_review_items": null, "head_sha": "", "init_status": "", "merged_at": "", "notes": "", "pr_number": 0, "pr_url": "fixture://openai_agents_sdk_agent", "repo": "fixture/openai_agents_sdk_agent", "schema_version": "0.2", "status": "evaluated", "title": "openai_agents_sdk_agent", "tools_scanned": null, "trigger_rationale": "", "trigger_run": false, "verify_can_merge": null, "verify_cap_added": null, "verify_cap_modified": null, "verify_cap_removed": null, "verify_decision": "", "verify_policy_weakened": null, "verify_trust_root_touched": null, "verify_verdict": ""} diff --git a/benchmark/miner/results/constructed.labels.csv b/benchmark/miner/results/constructed.labels.csv new file mode 100644 index 0000000..26f1f05 --- /dev/null +++ b/benchmark/miner/results/constructed.labels.csv @@ -0,0 +1,8 @@ +pr_url,label,rationale +fixture://ai_generated_refund_pr,must_block,homepage story: adds stripe.create_refund with no approval policy and no idempotency evidence +fixture://agent_weakens_gate,must_block,trust-root: the coding agent deletes the Shipgate CI gate to make its PR self-merge +fixture://support_refund_agent,must_block,refund tool missing approval policy and idempotency evidence (two criticals by design) +fixture://clean_read_only_agent,safe_to_merge,read-only tool surface; no risky actions by design +fixture://hitl_evidence_covered_agent,safe_to_merge,refund domain with the approval/idempotency evidence a reviewer expects already covered +fixture://hitl_evidence_agent,needs_human,authority-bearing refund surface a reviewer must sign off (human-in-the-loop evidence expected) +fixture://openai_agents_sdk_agent,needs_human,dynamic OpenAI Agents SDK toolset; static extraction can't resolve the full surface (coverage gap) diff --git a/tests/test_miner.py b/tests/test_miner.py index 2fcb340..9822dcc 100644 --- a/tests/test_miner.py +++ b/tests/test_miner.py @@ -9,7 +9,10 @@ from __future__ import annotations import csv +import json +import os import subprocess +import sys from pathlib import Path from benchmark.miner.evaluate import evaluate_pr @@ -211,3 +214,47 @@ def test_summarize_reports_ie_rate() -> None: summary = summarize([evaluated_ie, evaluated_ok, skip]) assert summary["rows"] == 3 assert summary["ie_rate_on_decided"] == 0.5 + + +# --- Source-hermeticity: parent trigger eval must use THIS checkout ---------- + + +def test_ensure_repo_src_on_path_inserts_when_absent(monkeypatch) -> None: + from benchmark.miner.evaluate import _REPO_SRC, _ensure_repo_src_on_path + + # Simulate the repro: this checkout's src/ absent from the parent path + # (e.g. an editable .pth points at a different checkout / installed wheel). + filtered = [p for p in sys.path if Path(p).resolve() != _REPO_SRC.resolve()] + monkeypatch.setattr(sys, "path", filtered) + _ensure_repo_src_on_path() + assert Path(sys.path[0]).resolve() == _REPO_SRC.resolve() + + +def test_evaluate_cli_is_source_hermetic_without_src_on_parent_path(tmp_path: Path) -> None: + """`benchmark.miner evaluate` must decide run/skip from THIS checkout's + trigger catalog even when the parent env has no src/ — the parent trigger + import, not just the child subprocesses, has to resolve here. + """ + repo = _init_repo(tmp_path) + (repo / "tools.json").write_text('{"tools": []}\n', encoding="utf-8") + base = _commit_all(repo, "base") + (repo / "README.md").write_text("docs only\n", encoding="utf-8") + head = _commit_all(repo, "docs") # docs-only → trigger-skip, fast (no scan) + + repo_root = Path(__file__).resolve().parent.parent + env = dict(os.environ) + # Parent can import `benchmark` (repo root) but NOT agents_shipgate (no src) + # — only the _ensure_repo_src_on_path fix puts this checkout's src first. + env["PYTHONPATH"] = str(repo_root) + env.pop("AGENTS_SHIPGATE_AGENT_MODE", None) + result = subprocess.run( + [ + sys.executable, "-m", "benchmark.miner", "evaluate", + "--repo-path", str(repo), "--base", base, "--head", head, + ], + capture_output=True, text=True, timeout=180, env=env, cwd=str(repo_root), + ) + assert result.returncode == 0, result.stderr + row = json.loads(result.stdout) + assert row["status"] == STATUS_TRIGGER_SKIP + assert row["trigger_run"] is False diff --git a/tests/test_miner_constructed.py b/tests/test_miner_constructed.py new file mode 100644 index 0000000..81210c4 --- /dev/null +++ b/tests/test_miner_constructed.py @@ -0,0 +1,116 @@ +"""The constructed-adversarial accuracy benchmark — the blocked-recall proof. + +The mined corpus measures the noise bound + extraction coverage on real +history; this measures the thing real history almost never supplies — whether +the gate BLOCKS what is known-unsafe and does not escalate what is known-safe. +The labels are each fixture's documented design intent (external ground +truth), so the score is non-circular. + +Two layers: a fast check that the committed corpus still scores to the +published headline, and a slower live-engine regression that re-runs the +fixtures so a future change that breaks a verdict fails here, not silently. +""" + +from __future__ import annotations + +import csv +import os +import subprocess +import sys +from pathlib import Path + +from benchmark.miner.constructed import CONSTRUCTED_CASES, build_constructed_corpus +from benchmark.miner.evaluate import cli_env +from benchmark.miner.labels import LABELS, load_labels, score +from benchmark.miner.rows import STATUS_EVALUATED, read_jsonl + +REPO_ROOT = Path(__file__).resolve().parent.parent +RESULTS = REPO_ROOT / "benchmark" / "miner" / "results" +CORPUS = RESULTS / "constructed.jsonl" +LABELS_FILE = RESULTS / "constructed.labels.csv" + + +def test_committed_constructed_accuracy_headline() -> None: + scored = score(read_jsonl(CORPUS), load_labels(LABELS_FILE)) + metrics = scored["metrics"] + assert metrics["blocked_recall"] == 1.0, scored + assert metrics["benign_escalation_rate"] == 0.0, scored + assert metrics["needs_human_caught"] == 1.0, scored + assert scored["unmatched_labels"] == [] + assert scored["labeled_rows"] == len(CONSTRUCTED_CASES) + + +def test_committed_constructed_is_well_formed_and_fully_labeled() -> None: + rows = read_jsonl(CORPUS) + assert len(rows) == len(CONSTRUCTED_CASES) + assert all(r.status == STATUS_EVALUATED for r in rows), [r.notes for r in rows] + labels = load_labels(LABELS_FILE) + assert set(labels) == {r.pr_url for r in rows} # every row labeled, no extras + assert set(labels.values()) <= set(LABELS) + assert b"\r" not in CORPUS.read_bytes() + assert b"\r" not in LABELS_FILE.read_bytes() + with CORPUS.with_suffix(".csv").open(encoding="utf-8", newline="") as handle: + assert len(list(csv.DictReader(handle))) == len(rows) + + +def test_live_engine_still_produces_the_constructed_verdicts() -> None: + """Regression: the gate must keep blocking the known-unsafe fixtures. + + Re-runs the bundled fixtures through the live engine; a change that + regresses a blocked verdict (or escalates a safe one) fails here. Also + catches the committed corpus going stale relative to the engine. + """ + rows, labels, _ = build_constructed_corpus() + unevaluated = [(r.pr_url, r.notes) for r in rows if r.status != STATUS_EVALUATED] + assert not unevaluated, f"fixtures failed to evaluate: {unevaluated}" + metrics = score(rows, labels)["metrics"] + assert metrics["blocked_recall"] == 1.0 + assert metrics["benign_escalation_rate"] == 0.0 + assert metrics["needs_human_caught"] == 1.0 + live = {r.pr_url: (r.head_decision, r.verify_verdict) for r in rows} + committed = {r.pr_url: (r.head_decision, r.verify_verdict) for r in read_jsonl(CORPUS)} + assert live == committed, "committed constructed corpus is stale vs the live engine" + + +def test_cli_env_prepends_this_checkouts_src() -> None: + """The child `python -m agents_shipgate` must import THIS checkout. + + Regression for the stale-install footgun: without src/ first on the + child's PYTHONPATH, the documented commands resolve to whatever + agents-shipgate is installed on the machine. + """ + env = cli_env() + first = env["PYTHONPATH"].split(os.pathsep)[0] + assert Path(first) == (REPO_ROOT / "src"), first + assert env["AGENTS_SHIPGATE_AGENT_MODE"] == "0" + + +def test_constructed_cli_is_source_hermetic(tmp_path: Path) -> None: + """End-to-end: regenerate via the CLI with src/ removed from the parent + env. The child must still find this checkout (via cli_env), not fall back + to an installed wheel — so the command exits 0 and writes all 7 rows. + """ + env = dict(os.environ) + # Parent can import `benchmark` (repo root) but NOT agents_shipgate (no src) — + # only the cli_env fix puts src on the child's path. + env["PYTHONPATH"] = str(REPO_ROOT) + env.pop("AGENTS_SHIPGATE_AGENT_MODE", None) + result = subprocess.run( + [ + sys.executable, + "-m", + "benchmark.miner", + "constructed", + "--out", + str(tmp_path / "constructed.jsonl"), + "--labels-out", + str(tmp_path / "constructed.labels.csv"), + ], + capture_output=True, + text=True, + timeout=300, + env=env, + cwd=str(REPO_ROOT), + ) + assert result.returncode == 0, result.stderr + assert len(read_jsonl(tmp_path / "constructed.jsonl")) == len(CONSTRUCTED_CASES)