From 4f685f0f0bea8a339105fad1ef6800072b821585 Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Tue, 16 Jun 2026 13:17:15 -0700 Subject: [PATCH 1/3] Phase 2c: a named high concern routes to review, not insufficient_evidence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An *active* (non-baseline-accepted) high/critical review finding now elevates the release decision to `review_required` even when the low-confidence extraction gate would otherwise have produced `insufficient_evidence`. Why: the 2026-06-01 Stripe pilot and the W24/W25 mining showed the dominant real-world failure was a silent slide into `insufficient_evidence` — the gate HAD something concrete (an unbounded toolkit) but reported only "we couldn't see enough." Phase 2a named that concern (SHIP-SCOPE-TOOLKIT-UNBOUNDED, high); this routes it to a human instead of burying it under the evidence gate. Safety: both verdicts are equally non-auto-mergeable (can_merge_without_human=False), so this loses no gate strength. `blocked` still outranks everything; IE still fires when the only signal is thin evidence with no named high concern; baseline-accepted (matched) high debt does NOT elevate (acknowledged debt, not an active concern). The low-confidence detail is preserved in evidence_coverage.evidence_gaps, and _decision_reason already emits the honest combined wording ("N findings need review and evidence coverage is incomplete"). No schema change (verdict-value change only); generate_schemas --check clean. - release_decision.py: track has_active_high_review; insert the elevation branch above the IE branch. - test_release_decision.py: high-active elevates; medium-active and high-accepted do not (the contrast cases). - test_toolkit_bounds_check.py: the stripe-head e2e now asserts release_decision.decision == "review_required" (ties 2a+2c together). - CHANGELOG: Unreleased note. Co-Authored-By: Claude Fable 5 --- CHANGELOG.md | 14 ++++++++ src/agents_shipgate/ci/release_decision.py | 20 +++++++++++ tests/test_release_decision.py | 39 ++++++++++++++++++++++ tests/test_toolkit_bounds_check.py | 4 +++ 4 files changed, 77 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e216ad01..6e8eea8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## Unreleased + +- **A named high concern now routes to review, not `insufficient_evidence`.** + When a scan turns up an *active* (not baseline-accepted) high/critical review + finding, the release decision is now `review_required` even if low-confidence + extraction would otherwise have produced `insufficient_evidence`. Both + verdicts are equally non-auto-mergeable, but `review_required` points the + human at a specific, actionable finding (e.g. the new + `SHIP-SCOPE-TOOLKIT-UNBOUNDED`) instead of the vaguer "we couldn't see + enough." `blocked` still outranks everything; IE still fires when the only + signal is thin extraction. The 2026-06-01 Stripe pilot's silent/IE case now + surfaces as a routed review. `evidence_gaps` are preserved on the report + either way, so the extraction-coverage signal is not lost. + ## 0.13.0 - 2026-06-12 - **Accepted-debt exception workflow (baseline schema 0.6).** `baseline save` diff --git a/src/agents_shipgate/ci/release_decision.py b/src/agents_shipgate/ci/release_decision.py index 0bb7e5e5..3eab652c 100644 --- a/src/agents_shipgate/ci/release_decision.py +++ b/src/agents_shipgate/ci/release_decision.py @@ -55,6 +55,11 @@ def build_release_decision( review_items: list[ReleaseDecisionItem] = [] contribution_rules: list[ContributionRule] = [] blocker_severities: set[Severity] = {"critical", *fail_on_resolved} + # v0.27 (Phase 2c): an active (non-accepted) high/critical finding routed to + # review is a *named* concern — the gate has decided "a human must look", + # which is more specific than "insufficient_evidence". Tracked here so the + # decision can prefer review_required over IE when one exists (see below). + has_active_high_review = False # v0.17: iterate the FULL findings list (not just `active`) so the # audit row set is exhaustive over report.findings. The branching @@ -121,6 +126,11 @@ def build_release_decision( or finding.requires_human_review is True ): review_items.append(_to_item(finding)) + if ( + finding.severity in {"critical", "high"} + and finding.baseline_status != "matched" + ): + has_active_high_review = True contribution_rules.append( _rule( finding, @@ -183,6 +193,16 @@ def build_release_decision( decision: ReleaseDecisionStatus if blockers: decision = "blocked" + elif has_active_high_review: + # Phase 2c: a named, active high/critical concern (e.g. an unbounded + # toolkit mounted on the agent) is not "insufficient evidence" — the + # gate HAS something concrete for a human to review. Prefer the + # actionable review_required over the vaguer insufficient_evidence. + # Both are equally non-auto-mergeable (can_merge_without_human=False), + # so this loses no safety; the low-confidence detail is still carried + # in evidence_coverage.evidence_gaps. IE stays the verdict when the + # only signal is weak evidence with no named high concern. + decision = "review_required" elif ( evidence.low_confidence_tool_count >= low_confidence_threshold or evidence.source_warning_count > _MAX_TOLERATED_SOURCE_WARNINGS diff --git a/tests/test_release_decision.py b/tests/test_release_decision.py index d73da81c..7bac5a2e 100644 --- a/tests/test_release_decision.py +++ b/tests/test_release_decision.py @@ -303,6 +303,45 @@ def test_insufficient_evidence_outranks_review_required(): assert len(decision.review_items) == 1 +def test_active_high_review_finding_outranks_insufficient_evidence(): + """Phase 2c: an active (non-accepted) HIGH finding is a *named* concern — + review_required, not the vaguer insufficient_evidence — even when the + low-confidence-tool gate would otherwise trip. Contrast with the medium + case above, which stays insufficient_evidence. Both verdicts are equally + non-auto-mergeable, so this loses no safety; the evidence detail is kept.""" + tools = [_tool(name="a", confidence="low"), _tool(name="b", confidence="low")] + report = _report( + findings=[ + _finding( + check_id="SHIP-SCOPE-TOOLKIT-UNBOUNDED", + severity="high", + baseline_status="new", + ) + ], + tools=tools, + human_review_recommended=True, + ) + decision = _build(report, ci_mode="advisory", tools=tools) + assert decision.decision == "review_required" + assert len(decision.review_items) == 1 + # The insufficient-evidence detail is preserved for the reviewer. + assert decision.evidence_coverage.low_confidence_tool_count == 2 + assert decision.evidence_coverage.evidence_gaps + + +def test_accepted_high_finding_does_not_outrank_insufficient_evidence(): + """A baseline-MATCHED (accepted) high finding is acknowledged debt — it + must NOT elevate out of insufficient_evidence; only active high concerns do.""" + tools = [_tool(name="a", confidence="low"), _tool(name="b", confidence="low")] + report = _report( + findings=[_finding(severity="high", baseline_status="matched")], + tools=tools, + baseline=BaselineSummary(path=".agents-shipgate/baseline.json", matched_count=1), + ) + decision = _build(report, ci_mode="advisory", tools=tools, new_findings_only=True) + assert decision.decision == "insufficient_evidence" + + def test_insufficient_evidence_reason_lists_both_counts(): """When both gates trip, the reason names both counts.""" tools = [_tool(name="a", confidence="low"), _tool(name="b", confidence="low")] diff --git a/tests/test_toolkit_bounds_check.py b/tests/test_toolkit_bounds_check.py index 58830dba..42845b27 100644 --- a/tests/test_toolkit_bounds_check.py +++ b/tests/test_toolkit_bounds_check.py @@ -106,3 +106,7 @@ def test_scan_of_stripe_head_fixture_names_the_unbounded_toolkit(tmp_path: Path) # The finding must point at the constructor, not shipgate.yaml. source = matches[0].get("source") or {} assert (source.get("path") or "").endswith("support_agent.py"), source + # Phase 2c: the named high concern elevates the verdict out of the vague + # insufficient_evidence into an actionable review_required (the pilot's + # silent/IE case is now a routed human review). + assert report["release_decision"]["decision"] == "review_required" From b2c2f9fb75632591bd8cb31e3033e0ac065cad2b Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Tue, 16 Jun 2026 13:35:31 -0700 Subject: [PATCH 2/3] Phase 2c review: keep degraded-evidence cases human-routed + fix contract docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two review findings on PR #222. [P2] Degraded-evidence cases could route to coding_agent. The new precedence elevates a below-threshold-evidence case to `review_required` when an active high/critical finding exists, but `fix_task` keyed its degraded-evidence authority escalation on `merge_verdict == "insufficient_evidence"`. A mechanically-fixable high finding + 2/2 low-confidence tools therefore landed on `review_required` and routed to coding_agent / safe_to_attempt=True — an auto-fix path on evidence too weak to gate. Fix: extract the IE-threshold predicate into a single public helper `evidence_below_ie_threshold(evidence, tool_count=...)` in release_decision.py (now the one source of truth, used by build_release_decision too), and OR it into fix_task's `authority_escalation`. A degraded-evidence case is now always human-routed regardless of which non-mergeable verdict it carries. The human also gets the concrete "make the surface enumerable" remedies in that case, not only on the bare IE verdict. Regression tests: degraded-evidence + mechanically-fixable high → human (was the bug); high finding + full evidence → still coding_agent (escalation fires on degraded evidence, not severity). [P2] Contract docs described the old precedence. Updated STABILITY.md, docs/agent-contract-current.md, and docs/report-reading-for-agents.md to document the active-high/critical exception, the full precedence order, that the evidence gap is preserved in `evidence_coverage` (read the counts, not the label, to detect degraded evidence), and that verify keeps both cases human-routed. Regenerated llms-full.txt. Also scrubbed an inaccurate "v0.27" marker (code comments + docs): these contract docs use `v0.X` for report_schema_version (currently 0.26), and this change does not bump the schema — replaced with the "Phase 2c" milestone name. Full suite green; ruff + schema-drift clean; no schema bump. Co-Authored-By: Claude Fable 5 --- STABILITY.md | 36 ++++++++-- docs/agent-contract-current.md | 2 +- docs/report-reading-for-agents.md | 8 ++- llms-full.txt | 2 +- src/agents_shipgate/ci/release_decision.py | 34 +++++++-- src/agents_shipgate/cli/verify/fix_task.py | 28 +++++++- tests/test_fix_task_contract.py | 81 ++++++++++++++++++++-- 7 files changed, 167 insertions(+), 24 deletions(-) diff --git a/STABILITY.md b/STABILITY.md index a8845d24..02f8321b 100644 --- a/STABILITY.md +++ b/STABILITY.md @@ -331,14 +331,36 @@ the scan also has low-confidence tools or source warnings. When there are no blockers, `insufficient_evidence` means the static inputs are not strong enough for Shipgate to gate release confidently. It does **not** -prove the agent is unsafe. The decision fires when low-confidence tools are at -least `max(1, ceil(tool_count × 0.5))`, or when source-loader warnings exceed -`3`. One to three source warnings without blockers route to `review_required` -so a human still sees the degraded source coverage. - -The intended recovery is to provide clearer local evidence — for example an MCP +prove the agent is unsafe. The evidence is considered below threshold when +low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or when +source-loader warnings exceed `3`. One to three source warnings without +blockers route to `review_required` so a human still sees the degraded source +coverage. + +**Active high/critical findings take precedence over the IE label.** When the +evidence is below threshold *and* there is an active (non-baseline-accepted) +high- or critical-severity review finding, the decision is `review_required` +rather than `insufficient_evidence`. Both verdicts are +equally non-mergeable (`can_merge_without_human` is false either way), but +`review_required` points the reviewer at a specific, named finding instead of +the vaguer "we couldn't see enough." The evidence gap is **not** lost: the +underlying counts remain in `release_decision.evidence_coverage` +(`low_confidence_tool_count`, `source_warning_count`, `evidence_gaps[]`), so a +consumer must read those fields rather than the verdict label alone to know +whether evidence was degraded. `insufficient_evidence` still fires when the only +signal is weak evidence with no named high/critical concern; `blocked` still +takes precedence over both. The precedence is therefore: +`blocked` → `review_required` (active high/critical) → `insufficient_evidence` +→ `review_required` (other) → `passed`. + +The intended recovery for a degraded-evidence case — whichever of the two +verdicts it lands on — is to provide clearer local evidence — for example an MCP export, OpenAPI spec, explicit local tool inventory, broader OpenAI Agents SDK -source path, or validation trace — and rerun the scan. +source path, or validation trace — and rerun the scan. When the decision is +`review_required` because of an active high/critical finding, also resolve that +finding. `agents-shipgate verify` keeps both cases human-routed +(`fix_task.actor = "human"`): a degraded-evidence case never opens an automated +coding-agent fix path, regardless of which verdict it carries. ### Check IDs diff --git a/docs/agent-contract-current.md b/docs/agent-contract-current.md index 2f3df8cb..86a305b5 100644 --- a/docs/agent-contract-current.md +++ b/docs/agent-contract-current.md @@ -60,7 +60,7 @@ hashes. It is not a second gate; the release gate remains In `agents-shipgate-reports/report.json`: -- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Blockers take precedence. If there are no blockers, `insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. One to three source warnings without blockers route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values. +- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Precedence is `blocked` → `review_required` (active high/critical) → `insufficient_evidence` → `review_required` (other) → `passed`. Blockers take precedence over everything. The evidence is degraded past threshold when low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. With no blockers, degraded evidence is `insufficient_evidence` (added v0.14) **unless** there is an active (non-baseline-accepted) high/critical review finding, in which case it is `review_required` so the reviewer is pointed at a named concern rather than the vaguer label. The evidence gap is preserved either way in `release_decision.evidence_coverage` — read those counts, not the verdict label, to detect degraded evidence. One to three source warnings without blockers also route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values. - `release_decision.blockers[]` — items that block release on this run. - `release_decision.review_items[]` — items the human reviewer should look at; includes baseline-matched accepted debt. - `release_decision.{blockers,review_items}[].capability_refs` (v0.24+) — stable capability IDs copied from the originating finding when a policy or policy-pack rule matched a `CapabilityFactV1`. Empty for findings that are not capability-policy matches. This is audit metadata only; `release_decision.decision` remains the gate. diff --git a/docs/report-reading-for-agents.md b/docs/report-reading-for-agents.md index bc0d1b35..6b3bdb06 100644 --- a/docs/report-reading-for-agents.md +++ b/docs/report-reading-for-agents.md @@ -28,15 +28,19 @@ The CLI's stable contract names this signal explicitly: run `agents-shipgate con Branch on the four values (treat unknown future values as `review_required` per the [STABILITY.md additivity contract](../STABILITY.md#what-may-change-additively-in-any-minor-release)): +Precedence (highest first): `blocked` → `review_required` (active high/critical) → `insufficient_evidence` → `review_required` (other) → `passed`. + | `decision` | Meaning | Agent behavior | |---|---|---| | `"blocked"` | Active, unaccepted blockers exist. CI will fail in strict mode. | Surface blockers; do not auto-merge; do not assert evidence categories — see [`agent-autofix-boundary.md`](agent-autofix-boundary.md). | -| `"insufficient_evidence"` (v0.14+) | With no active blockers, evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. One to three source warnings route to `review_required`. The scan can't gate release reliably, but this does not prove the agent is unsafe. | Surface the `release_decision.reason` verbatim; recommend gathering deeper sources (MCP exports, OpenAPI specs, explicit inventories, broader SDK source paths, eval traces) and re-running. Do not auto-merge. | -| `"review_required"` | Review items exist (often baseline-matched accepted debt, capability/intent misalignments, or sub-threshold evidence gaps). | Surface review items as a human handoff; safe mechanical patches may still apply via `apply-patches --confidence high`. | +| `"insufficient_evidence"` (v0.14+) | With no active blockers **and no active high/critical review finding**, evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. The scan can't gate release reliably, but this does not prove the agent is unsafe. | Surface the `release_decision.reason` verbatim; recommend gathering deeper sources (MCP exports, OpenAPI specs, explicit inventories, broader SDK source paths, eval traces) and re-running. Do not auto-merge. | +| `"review_required"` | Review items exist (often baseline-matched accepted debt, capability/intent misalignments, or sub-threshold evidence gaps). This **also** covers a degraded-evidence case that carries an active (non-baseline-accepted) high/critical finding — the verdict names the concern instead of the vaguer `insufficient_evidence`, but the evidence gap is still present in `evidence_coverage`. One to three source warnings without blockers also land here. | Surface review items as a human handoff. Safe mechanical patches may still apply via `apply-patches --confidence high` — **unless** evidence is degraded (check `evidence_coverage.low_confidence_tool_count` / `source_warning_count`), in which case treat it like `insufficient_evidence` and gather deeper sources first. `verify`'s `fix_task` already routes degraded-evidence cases to a human. | | `"passed"` | No active blockers, no review items, evidence coverage clean. | Mechanical patches (if any) may apply; otherwise nothing to do. | The decision is **baseline-aware**: a baseline-matched critical surfaces in `release_decision.review_items` (accepted debt), not in `release_decision.blockers`. Compare with the legacy `summary.status` field, which is *baseline-blind* — see Anti-patterns below. +> **Don't switch on the verdict label to detect degraded evidence.** A degraded-evidence case can surface as either `insufficient_evidence` or `review_required` (the latter when an active high/critical finding names the concern). Read `release_decision.evidence_coverage.{low_confidence_tool_count, source_warning_count, evidence_gaps[]}` to know whether evidence was thin, regardless of the label. + ### Step 2 · `release_decision.{reason, blockers, review_items, fail_policy.would_fail_ci}` Once you have the decision, read the supporting fields: diff --git a/llms-full.txt b/llms-full.txt index f91f9b10..9b0184da 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -1009,7 +1009,7 @@ hashes. It is not a second gate; the release gate remains In `agents-shipgate-reports/report.json`: -- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Blockers take precedence. If there are no blockers, `insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. One to three source warnings without blockers route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values. +- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Precedence is `blocked` → `review_required` (active high/critical) → `insufficient_evidence` → `review_required` (other) → `passed`. Blockers take precedence over everything. The evidence is degraded past threshold when low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. With no blockers, degraded evidence is `insufficient_evidence` (added v0.14) **unless** there is an active (non-baseline-accepted) high/critical review finding, in which case it is `review_required` so the reviewer is pointed at a named concern rather than the vaguer label. The evidence gap is preserved either way in `release_decision.evidence_coverage` — read those counts, not the verdict label, to detect degraded evidence. One to three source warnings without blockers also route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values. - `release_decision.blockers[]` — items that block release on this run. - `release_decision.review_items[]` — items the human reviewer should look at; includes baseline-matched accepted debt. - `release_decision.{blockers,review_items}[].capability_refs` (v0.24+) — stable capability IDs copied from the originating finding when a policy or policy-pack rule matched a `CapabilityFactV1`. Empty for findings that are not capability-policy matches. This is audit metadata only; `release_decision.decision` remains the gate. diff --git a/src/agents_shipgate/ci/release_decision.py b/src/agents_shipgate/ci/release_decision.py index 3eab652c..920cc520 100644 --- a/src/agents_shipgate/ci/release_decision.py +++ b/src/agents_shipgate/ci/release_decision.py @@ -34,6 +34,26 @@ def _low_confidence_tool_threshold(tool_count: int) -> int: return max(1, math.ceil(tool_count * _LOW_CONFIDENCE_TOOL_RATIO)) +def evidence_below_ie_threshold( + evidence: EvidenceCoverageDecision, *, tool_count: int +) -> bool: + """True when extraction evidence is too weak to gate release on its own. + + This is the exact predicate `build_release_decision` uses to raise the + `insufficient_evidence` verdict, exposed so downstream projections can + reason about it directly. An active high/critical review finding *elevates* + such a case to `review_required` (a more actionable verdict), so the verdict + label alone no longer tells a consumer whether evidence was degraded — the + verify fix_task authority routing needs this predicate to keep + degraded-evidence cases human-routed regardless of which of the two + non-mergeable verdicts they landed on. + """ + return ( + evidence.low_confidence_tool_count >= _low_confidence_tool_threshold(tool_count) + or evidence.source_warning_count > _MAX_TOLERATED_SOURCE_WARNINGS + ) + + def build_release_decision( *, report: ReadinessReport, @@ -55,7 +75,7 @@ def build_release_decision( review_items: list[ReleaseDecisionItem] = [] contribution_rules: list[ContributionRule] = [] blocker_severities: set[Severity] = {"critical", *fail_on_resolved} - # v0.27 (Phase 2c): an active (non-accepted) high/critical finding routed to + # Phase 2c: an active (non-accepted) high/critical finding routed to # review is a *named* concern — the gate has decided "a human must look", # which is more specific than "insufficient_evidence". Tracked here so the # decision can prefer review_required over IE when one exists (see below). @@ -188,7 +208,7 @@ def build_release_decision( exit_code=exit_code, ) - low_confidence_threshold = _low_confidence_tool_threshold(len(tools)) + evidence_is_degraded = evidence_below_ie_threshold(evidence, tool_count=len(tools)) decision: ReleaseDecisionStatus if blockers: @@ -201,12 +221,12 @@ def build_release_decision( # Both are equally non-auto-mergeable (can_merge_without_human=False), # so this loses no safety; the low-confidence detail is still carried # in evidence_coverage.evidence_gaps. IE stays the verdict when the - # only signal is weak evidence with no named high concern. + # only signal is weak evidence with no named high concern. NOTE: when + # evidence is *also* degraded here, the verify fix_task still routes to + # a human via evidence_below_ie_threshold (the same predicate), so the + # elevation never opens an auto-fix path on weak evidence. decision = "review_required" - elif ( - evidence.low_confidence_tool_count >= low_confidence_threshold - or evidence.source_warning_count > _MAX_TOLERATED_SOURCE_WARNINGS - ): + elif evidence_is_degraded: decision = "insufficient_evidence" elif ( review_items diff --git a/src/agents_shipgate/cli/verify/fix_task.py b/src/agents_shipgate/cli/verify/fix_task.py index 15298e4e..7c4ac9a0 100644 --- a/src/agents_shipgate/cli/verify/fix_task.py +++ b/src/agents_shipgate/cli/verify/fix_task.py @@ -14,6 +14,7 @@ import shlex +from agents_shipgate.ci.release_decision import evidence_below_ie_threshold from agents_shipgate.core.agent_controls import FORBIDDEN_SHORTCUTS from agents_shipgate.schemas.report import Finding, ReadinessReport from agents_shipgate.schemas.verifier import ( @@ -109,6 +110,19 @@ def build_fix_task( gating = _gating_findings(report) + # Degraded static evidence (below the IE threshold) is an authority gap + # regardless of which verdict it produced. An active high/critical finding + # elevates a degraded-evidence case from `insufficient_evidence` to + # `review_required` (a more actionable verdict), + # so keying the escalation on `merge_verdict == "insufficient_evidence"` + # alone would let a mechanically-fixable high finding open a coding-agent + # auto-fix path on weak evidence. Compute the threshold directly from the + # same predicate the release decision uses so the two never drift. + evidence_degraded = evidence_below_ie_threshold( + report.release_decision.evidence_coverage, + tool_count=len(report.tool_inventory), + ) + # The coding-agent route is the only non-human outcome and it MUST fail # closed: every gating finding has to be explicitly mechanical # (``autofix_safe is True`` AND ``requires_human_review is False``). A @@ -123,6 +137,7 @@ def build_fix_task( capability_review.policy_weakened or capability_review.trust_root_touched or merge_verdict in {"insufficient_evidence", "unknown"} + or evidence_degraded ) if mechanical and not authority_escalation: return VerifierFixTask( @@ -143,7 +158,11 @@ def build_fix_task( actor="human", safe_to_attempt=False, instructions=_human_instructions( - report, capability_review, gating, merge_verdict=merge_verdict + report, + capability_review, + gating, + merge_verdict=merge_verdict, + evidence_degraded=evidence_degraded, ), allowed_repairs=_human_repairs( report, @@ -181,11 +200,16 @@ def _human_instructions( gating: list[Finding], *, merge_verdict: MergeVerdict = "human_review_required", + evidence_degraded: bool = False, ) -> list[str]: decision = report.release_decision assert decision is not None out: list[str] = [decision.reason] - if merge_verdict == "insufficient_evidence": + # Surface the concrete "make the hidden authority enumerable" remedies + # whenever evidence is degraded — not only on the bare IE verdict. A + # high-finding case elevated to review_required carries the same + # evidence gap and the human needs the same remedy. + if merge_verdict == "insufficient_evidence" or evidence_degraded: out.extend(_insufficient_evidence_remedies(report)) if capability_review.policy_weakened: out.append( diff --git a/tests/test_fix_task_contract.py b/tests/test_fix_task_contract.py index 99d73de1..41c1c7e1 100644 --- a/tests/test_fix_task_contract.py +++ b/tests/test_fix_task_contract.py @@ -67,8 +67,17 @@ def _item(finding: Finding) -> ReleaseDecisionItem: ) -def _report(*, decision: str, findings, blockers=(), review_items=()) -> ReadinessReport: - return ReadinessReport( +def _report( + *, + decision: str, + findings, + blockers=(), + review_items=(), + low_confidence_tool_count: int = 0, + source_warning_count: int = 0, + tool_inventory=None, +) -> ReadinessReport: + report = ReadinessReport( run_id="r", project={"name": "p"}, agent={"name": "a"}, @@ -82,8 +91,8 @@ def _report(*, decision: str, findings, blockers=(), review_items=()) -> Readine evidence_coverage=EvidenceCoverageDecision( level="static", human_review_recommended=False, - source_warning_count=0, - low_confidence_tool_count=0, + source_warning_count=source_warning_count, + low_confidence_tool_count=low_confidence_tool_count, ), baseline_delta=BaselineDelta(enabled=False), fail_policy=FailPolicy( @@ -97,6 +106,9 @@ def _report(*, decision: str, findings, blockers=(), review_items=()) -> Readine tool_surface=ToolSurfaceSummary(total_tools=0, high_risk_tools=0), findings=list(findings), ) + if tool_inventory is not None: + report.tool_inventory = list(tool_inventory) + return report def _review(*, policy_weakened=False, trust_root_touched=False) -> VerifierCapabilityReview: @@ -198,6 +210,67 @@ def test_insufficient_evidence_forces_human() -> None: assert task.safe_to_attempt is False +def test_degraded_evidence_under_review_required_forces_human() -> None: + # v0.27 regression guard: an active high finding elevates a degraded-evidence + # case from insufficient_evidence to review_required. The finding here is + # mechanically fixable (autofix_safe, no human review), so without an + # evidence-aware escalation it would route to coding_agent / safe_to_attempt + # — opening an auto-fix path on evidence too weak to gate. The fix_task must + # still fail closed to a human because the evidence is below the IE + # threshold (2 low-confidence tools of 2 → threshold 1). + f = _finding( + "F1", + requires_human_review=False, + autofix_safe=True, + severity="high", + recommendation="Add the missing scope bound.", + ) + report = _report( + decision="review_required", + findings=[f], + review_items=[f], + low_confidence_tool_count=2, + tool_inventory=[ + {"name": "a", "source_type": "langchain", "source_ref": "t.py", "confidence": "low"}, + {"name": "b", "source_type": "langchain", "source_ref": "t.py", "confidence": "low"}, + ], + ) + task = _fix_task(report) + assert task is not None + assert task.actor == "human" + assert task.safe_to_attempt is False + # The human still gets the concrete "make the surface enumerable" remedy, + # even though the verdict is review_required rather than the bare IE verdict. + joined = " ".join(task.instructions) + assert "explicit local tool inventory" in joined + + +def test_review_required_with_full_evidence_stays_mechanical() -> None: + # Counterpart guard: a mechanically-fixable high finding with HIGH-confidence + # evidence (no gap) must still route to the coding agent — the escalation + # fires on degraded evidence, not on severity. + f = _finding( + "F1", + requires_human_review=False, + autofix_safe=True, + severity="high", + recommendation="Add an owner field from CODEOWNERS.", + ) + report = _report( + decision="review_required", + findings=[f], + review_items=[f], + low_confidence_tool_count=0, + tool_inventory=[ + {"name": "a", "source_type": "mcp", "source_ref": "m.json", "confidence": "high"}, + ], + ) + task = _fix_task(report) + assert task is not None + assert task.actor == "coding_agent" + assert task.safe_to_attempt is True + + # --- Instructions / guardrails / verification ------------------------------- From b48d85e55bce6cb03344c190f663c7c4622ef24a Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Tue, 16 Jun 2026 13:53:08 -0700 Subject: [PATCH 3/3] Phase 2c review: agent_summary must not recommend auto-apply on degraded evidence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the third review finding on PR #222, the report-only counterpart to the verify fix_task fix. [P2] `agent_summary.first_recommended_action` still recommended auto-apply on degraded evidence. The Phase 2c elevation means a below-IE-threshold scan now surfaces as `review_required` when an active high/critical finding exists. agent_summary's action picker prioritized `auto_appliable > 0` before evidence remediation for ALL review_required cases — on the prior assumption that review_required implied trustworthy-enough evidence (true before Phase 2c). So an active high auto_apply finding + 2/2 low-confidence tools emitted a runnable `apply-patches ... --apply` command, telling a report-only consumer to "fix" a scan the gate already said it can't trust. Fix: thread the total tool count into build_agent_summary (report_builder has `tools`) so it can evaluate the shared `evidence_below_ie_threshold` predicate, and add a review_required + below-threshold branch to the action picker that emits the evidence/human info action before auto-apply — exactly like the insufficient_evidence path. Surgical: only the genuinely below-threshold case flips; sub-threshold review_required (1-3 source warnings, or low-confidence tools below the ratio) still recommends apply-patches with an evidence note. Regression tests: active high auto_apply + 2/2 low tools (tool_count=2) → info/no-command (was the bug); same finding with 2/10 low tools (sub-threshold) → still the apply-patches command (escalation keys on the ratio, not severity). Verified the only two machine-actionable, verdict-conditional apply-patches recommenders are fix_task (fixed last round) and agent_summary (this round); the other apply-patches mentions are static CLI help / rendered instructions. Full suite green; ruff + schema-drift clean; no schema bump; no golden ripple. Co-Authored-By: Claude Fable 5 --- .../core/findings/agent_summary.py | 65 +++++++++++++-- .../core/findings/report_builder.py | 5 ++ tests/test_agent_action_summary.py | 82 +++++++++++++++++++ 3 files changed, 147 insertions(+), 5 deletions(-) diff --git a/src/agents_shipgate/core/findings/agent_summary.py b/src/agents_shipgate/core/findings/agent_summary.py index 68ee491c..71434f31 100644 --- a/src/agents_shipgate/core/findings/agent_summary.py +++ b/src/agents_shipgate/core/findings/agent_summary.py @@ -2,6 +2,7 @@ import shlex +from agents_shipgate.ci.release_decision import evidence_below_ie_threshold from agents_shipgate.schemas.report import ( AgentSummary, AgentSummaryAction, @@ -17,6 +18,7 @@ def build_agent_summary( findings: list[Finding], release_decision: ReleaseDecision | None, json_report_path: str | None = None, + tool_count: int = 0, ) -> AgentSummary: """Construct the top-level ``agent_summary`` block. @@ -41,6 +43,7 @@ def build_agent_summary( review_item_count = 0 reason = "No release decision computed." evidence_recommended = False + evidence_below_threshold = False else: verdict = release_decision.decision blocker_count = len(release_decision.blockers) @@ -69,6 +72,22 @@ def build_agent_summary( or release_decision.evidence_coverage.source_warning_count > 0 ) ) + # `evidence_recommended` is the BROAD signal (any review-worthy + # evidence gap, including 1-3 sub-threshold source warnings). + # `evidence_below_threshold` is the NARROW one: evidence weak + # enough that, absent an active high/critical finding, the verdict + # would have been `insufficient_evidence`. Since that finding now + # *elevates* such a case to `review_required` (Phase 2c), the + # narrow signal is what tells the action picker to put evidence + # remediation ahead of auto-apply — applying patches never makes a + # below-threshold scan trustworthy. Uses the same predicate + # `build_release_decision` does, so the two never disagree. + evidence_below_threshold = bool( + release_decision.evidence_coverage + and evidence_below_ie_threshold( + release_decision.evidence_coverage, tool_count=tool_count + ) + ) active_findings = [f for f in findings if not f.suppressed] auto_appliable = sum( @@ -215,6 +234,7 @@ def build_agent_summary( active_findings=active_findings, json_report_path=json_report_path, evidence_recommended=evidence_recommended, + evidence_below_threshold=evidence_below_threshold, evidence_reason=( reason if (evidence_recommended or verdict == "insufficient_evidence") @@ -242,6 +262,7 @@ def _build_first_recommended_action( active_findings: list[Finding], json_report_path: str | None, evidence_recommended: bool = False, + evidence_below_threshold: bool = False, evidence_reason: str = "", ) -> AgentSummaryAction | None: """Deterministic next-step picker for ``agent_summary``. @@ -255,6 +276,13 @@ def _build_first_recommended_action( release, and running apply-patches first would contradict the headline. Tell the agent to fix the trust problem before cleaning up findings. + 1b. Verdict is review_required BUT evidence is below the IE + threshold (an active high/critical finding elevated it out of + insufficient_evidence — Phase 2c) → same as (1): evidence + remediation + human review of the named concern outrank + auto-apply. Without this, a report-only consumer following + agent_summary would be told to run ``apply-patches --apply`` on a + scan the gate has already said it can't trust. 2. Auto-applicable patches available → propose ``apply-patches``, but only as a ``command`` action when we know the actual JSON report path (so the command never points at the wrong file). @@ -281,17 +309,44 @@ def _build_first_recommended_action( ), ) + if verdict == "review_required" and evidence_below_threshold: + # An active high/critical finding elevated a below-IE-threshold + # scan to review_required (Phase 2c). Evidence is still too weak to + # gate, so — exactly like insufficient_evidence — gathering better + # evidence and routing the named concern to a human outrank any + # auto-apply patch. Emitting a runnable apply-patches command here + # would tell a report-only consumer to "fix" a scan the gate has + # already said it cannot trust. + base = ( + evidence_reason + or "Evidence coverage is below threshold; scan results are not " + "trustworthy enough to gate release." + ) + return AgentSummaryAction( + kind="info", + command=None, + why=( + f"{base} A human must review the active high/critical " + "finding(s), and you should gather deeper evidence (e.g. " + "MCP/OpenAPI inputs, eval traces, additional source files) " + "before re-running the scan; applying patches does not clear " + "the evidence gap." + ), + ) + if auto_appliable > 0: why = ( f"{auto_appliable} finding(s) carry high-confidence patches " "safe to apply without human review." ) if verdict == "review_required" and evidence_recommended: - # The patches are still worth applying (the scan IS - # trustworthy enough to gate at review_required, unlike - # the insufficient_evidence path that outranks auto-apply - # entirely). But the action's why must call out the - # evidence gap so the agent doesn't treat apply-patches + # Reaching here means evidence is recommended but only + # *sub-threshold* (1-3 source warnings, or human-review- + # recommended without enough low-confidence tools) — the + # below-threshold case was already intercepted above and + # routed to evidence remediation. So the patches ARE worth + # applying here, but the why must still call out the + # sub-threshold gap so the agent doesn't treat apply-patches # as the *only* next step — the human still needs to # review the source warnings / low-confidence tools. evidence_note = evidence_reason or ( diff --git a/src/agents_shipgate/core/findings/report_builder.py b/src/agents_shipgate/core/findings/report_builder.py index dfffb37a..4c1ad818 100644 --- a/src/agents_shipgate/core/findings/report_builder.py +++ b/src/agents_shipgate/core/findings/report_builder.py @@ -115,6 +115,11 @@ def build_report( findings=findings, release_decision=report.release_decision, json_report_path=generated_reports.get("json"), + # Threaded so agent_summary can detect below-IE-threshold evidence + # (the ratio needs the total tool count) and put evidence + # remediation ahead of auto-apply even when an active high/critical + # finding elevated the verdict to review_required (Phase 2c). + tool_count=len(tools), ) # v0.20 NOTE: ``report.reviewer_summary`` is NOT built here. It # depends on ``report.misalignments`` which ``apply_capability_diff`` diff --git a/tests/test_agent_action_summary.py b/tests/test_agent_action_summary.py index 5bb5f821..750bfb12 100644 --- a/tests/test_agent_action_summary.py +++ b/tests/test_agent_action_summary.py @@ -870,6 +870,88 @@ def test_insufficient_evidence_outranks_auto_apply_in_first_action(): assert "apply-patches" not in why or "does not clear" in why +def test_review_required_below_threshold_outranks_auto_apply(): + """Phase 2c regression: an active high finding elevates a + below-IE-threshold scan from insufficient_evidence to review_required. + The finding is auto-applicable, so the OLD code took the auto-apply + branch and emitted a runnable `apply-patches --apply` command — telling + a report-only consumer to "fix" a scan the gate already said it can't + trust. first_recommended_action must instead be the evidence/human + info action, exactly like the insufficient_evidence path.""" + finding = _make_finding( + check_id="SHIP-SCOPE-TOOLKIT-UNBOUNDED", + severity="high", + agent_action="auto_apply", + ) + reason = ( + "1 finding need review and evidence coverage is incomplete." + ) + decision = _make_release_decision( + decision="review_required", + review_items=["SHIP-SCOPE-TOOLKIT-UNBOUNDED"], + reason=reason, + evidence_human_review_recommended=True, + ) + # 2 of 2 low-confidence tools → below the IE threshold (ceil(2*0.5)=1). + decision.evidence_coverage = EvidenceCoverageDecision( + level=decision.evidence_coverage.level, + human_review_recommended=True, + source_warning_count=0, + low_confidence_tool_count=2, + ) + summary = build_agent_summary( + findings=[finding], + release_decision=decision, + json_report_path="/abs/agents-shipgate-reports/report.json", + tool_count=2, + ) + assert summary.verdict == "review_required" + assert summary.auto_appliable_patches == 1 + assert summary.first_recommended_action is not None + # Must NOT propose a runnable apply-patches command. + assert summary.first_recommended_action.kind == "info" + assert summary.first_recommended_action.command is None + why = summary.first_recommended_action.why + assert "gather deeper evidence" in why + assert "apply-patches" not in why or "does not clear" in why + assert "human" in why.lower() + + +def test_review_required_sub_threshold_evidence_keeps_auto_apply(): + """Counterpart guard: the SAME auto-applicable high finding with only + sub-threshold evidence (2 low-confidence tools out of 10 → below the + ceil(10*0.5)=5 threshold) is NOT degraded. apply-patches stays the + first action (the scan is trustworthy enough to gate at + review_required); the escalation keys on the ratio, not severity.""" + finding = _make_finding( + check_id="SHIP-SCOPE-TOOLKIT-UNBOUNDED", + severity="high", + agent_action="auto_apply", + ) + decision = _make_release_decision( + decision="review_required", + review_items=["SHIP-SCOPE-TOOLKIT-UNBOUNDED"], + reason="1 finding requires review.", + evidence_human_review_recommended=True, + ) + decision.evidence_coverage = EvidenceCoverageDecision( + level=decision.evidence_coverage.level, + human_review_recommended=True, + source_warning_count=0, + low_confidence_tool_count=2, + ) + summary = build_agent_summary( + findings=[finding], + release_decision=decision, + json_report_path="/abs/agents-shipgate-reports/report.json", + tool_count=10, + ) + assert summary.verdict == "review_required" + assert summary.first_recommended_action is not None + assert summary.first_recommended_action.kind == "command" + assert "apply-patches" in (summary.first_recommended_action.command or "") + + def test_build_agent_summary_insufficient_evidence_uses_reason_as_headline(): """A `insufficient_evidence` verdict must NOT fall through to the "Release ready" else-branch. The headline should surface the