From 4f685f0f0bea8a339105fad1ef6800072b821585 Mon Sep 17 00:00:00 2001
From: Pengfei Hu <pengfei@threemoonslab.com>
Date: Tue, 16 Jun 2026 13:17:15 -0700
Subject: [PATCH 1/3] Phase 2c: a named high concern routes to review, not
 insufficient_evidence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An *active* (non-baseline-accepted) high/critical review finding now elevates
the release decision to `review_required` even when the low-confidence
extraction gate would otherwise have produced `insufficient_evidence`.

Why: the 2026-06-01 Stripe pilot and the W24/W25 mining showed the dominant
real-world failure was a silent slide into `insufficient_evidence` — the gate
HAD something concrete (an unbounded toolkit) but reported only "we couldn't
see enough." Phase 2a named that concern (SHIP-SCOPE-TOOLKIT-UNBOUNDED, high);
this routes it to a human instead of burying it under the evidence gate.

Safety: both verdicts are equally non-auto-mergeable
(can_merge_without_human=False), so this loses no gate strength. `blocked`
still outranks everything; IE still fires when the only signal is thin
evidence with no named high concern; baseline-accepted (matched) high debt
does NOT elevate (acknowledged debt, not an active concern). The
low-confidence detail is preserved in evidence_coverage.evidence_gaps, and
_decision_reason already emits the honest combined wording ("N findings need
review and evidence coverage is incomplete").

No schema change (verdict-value change only); generate_schemas --check clean.

- release_decision.py: track has_active_high_review; insert the elevation
  branch above the IE branch.
- test_release_decision.py: high-active elevates; medium-active and
  high-accepted do not (the contrast cases).
- test_toolkit_bounds_check.py: the stripe-head e2e now asserts
  release_decision.decision == "review_required" (ties 2a+2c together).
- CHANGELOG: Unreleased note.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 CHANGELOG.md                               | 14 ++++++++
 src/agents_shipgate/ci/release_decision.py | 20 +++++++++++
 tests/test_release_decision.py             | 39 ++++++++++++++++++++++
 tests/test_toolkit_bounds_check.py         |  4 +++
 4 files changed, 77 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e216ad01..6e8eea8c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # Changelog
 
+## Unreleased
+
+- **A named high concern now routes to review, not `insufficient_evidence`.**
+  When a scan turns up an *active* (not baseline-accepted) high/critical review
+  finding, the release decision is now `review_required` even if low-confidence
+  extraction would otherwise have produced `insufficient_evidence`. Both
+  verdicts are equally non-auto-mergeable, but `review_required` points the
+  human at a specific, actionable finding (e.g. the new
+  `SHIP-SCOPE-TOOLKIT-UNBOUNDED`) instead of the vaguer "we couldn't see
+  enough." `blocked` still outranks everything; IE still fires when the only
+  signal is thin extraction. The 2026-06-01 Stripe pilot's silent/IE case now
+  surfaces as a routed review. `evidence_gaps` are preserved on the report
+  either way, so the extraction-coverage signal is not lost.
+
 ## 0.13.0 - 2026-06-12
 
 - **Accepted-debt exception workflow (baseline schema 0.6).** `baseline save`
diff --git a/src/agents_shipgate/ci/release_decision.py b/src/agents_shipgate/ci/release_decision.py
index 0bb7e5e5..3eab652c 100644
--- a/src/agents_shipgate/ci/release_decision.py
+++ b/src/agents_shipgate/ci/release_decision.py
@@ -55,6 +55,11 @@ def build_release_decision(
     review_items: list[ReleaseDecisionItem] = []
     contribution_rules: list[ContributionRule] = []
     blocker_severities: set[Severity] = {"critical", *fail_on_resolved}
+    # v0.27 (Phase 2c): an active (non-accepted) high/critical finding routed to
+    # review is a *named* concern — the gate has decided "a human must look",
+    # which is more specific than "insufficient_evidence". Tracked here so the
+    # decision can prefer review_required over IE when one exists (see below).
+    has_active_high_review = False
 
     # v0.17: iterate the FULL findings list (not just `active`) so the
     # audit row set is exhaustive over report.findings. The branching
@@ -121,6 +126,11 @@ def build_release_decision(
             or finding.requires_human_review is True
         ):
             review_items.append(_to_item(finding))
+            if (
+                finding.severity in {"critical", "high"}
+                and finding.baseline_status != "matched"
+            ):
+                has_active_high_review = True
             contribution_rules.append(
                 _rule(
                     finding,
@@ -183,6 +193,16 @@ def build_release_decision(
     decision: ReleaseDecisionStatus
     if blockers:
         decision = "blocked"
+    elif has_active_high_review:
+        # Phase 2c: a named, active high/critical concern (e.g. an unbounded
+        # toolkit mounted on the agent) is not "insufficient evidence" — the
+        # gate HAS something concrete for a human to review. Prefer the
+        # actionable review_required over the vaguer insufficient_evidence.
+        # Both are equally non-auto-mergeable (can_merge_without_human=False),
+        # so this loses no safety; the low-confidence detail is still carried
+        # in evidence_coverage.evidence_gaps. IE stays the verdict when the
+        # only signal is weak evidence with no named high concern.
+        decision = "review_required"
     elif (
         evidence.low_confidence_tool_count >= low_confidence_threshold
         or evidence.source_warning_count > _MAX_TOLERATED_SOURCE_WARNINGS
diff --git a/tests/test_release_decision.py b/tests/test_release_decision.py
index d73da81c..7bac5a2e 100644
--- a/tests/test_release_decision.py
+++ b/tests/test_release_decision.py
@@ -303,6 +303,45 @@ def test_insufficient_evidence_outranks_review_required():
     assert len(decision.review_items) == 1
 
 
+def test_active_high_review_finding_outranks_insufficient_evidence():
+    """Phase 2c: an active (non-accepted) HIGH finding is a *named* concern —
+    review_required, not the vaguer insufficient_evidence — even when the
+    low-confidence-tool gate would otherwise trip. Contrast with the medium
+    case above, which stays insufficient_evidence. Both verdicts are equally
+    non-auto-mergeable, so this loses no safety; the evidence detail is kept."""
+    tools = [_tool(name="a", confidence="low"), _tool(name="b", confidence="low")]
+    report = _report(
+        findings=[
+            _finding(
+                check_id="SHIP-SCOPE-TOOLKIT-UNBOUNDED",
+                severity="high",
+                baseline_status="new",
+            )
+        ],
+        tools=tools,
+        human_review_recommended=True,
+    )
+    decision = _build(report, ci_mode="advisory", tools=tools)
+    assert decision.decision == "review_required"
+    assert len(decision.review_items) == 1
+    # The insufficient-evidence detail is preserved for the reviewer.
+    assert decision.evidence_coverage.low_confidence_tool_count == 2
+    assert decision.evidence_coverage.evidence_gaps
+
+
+def test_accepted_high_finding_does_not_outrank_insufficient_evidence():
+    """A baseline-MATCHED (accepted) high finding is acknowledged debt — it
+    must NOT elevate out of insufficient_evidence; only active high concerns do."""
+    tools = [_tool(name="a", confidence="low"), _tool(name="b", confidence="low")]
+    report = _report(
+        findings=[_finding(severity="high", baseline_status="matched")],
+        tools=tools,
+        baseline=BaselineSummary(path=".agents-shipgate/baseline.json", matched_count=1),
+    )
+    decision = _build(report, ci_mode="advisory", tools=tools, new_findings_only=True)
+    assert decision.decision == "insufficient_evidence"
+
+
 def test_insufficient_evidence_reason_lists_both_counts():
     """When both gates trip, the reason names both counts."""
     tools = [_tool(name="a", confidence="low"), _tool(name="b", confidence="low")]
diff --git a/tests/test_toolkit_bounds_check.py b/tests/test_toolkit_bounds_check.py
index 58830dba..42845b27 100644
--- a/tests/test_toolkit_bounds_check.py
+++ b/tests/test_toolkit_bounds_check.py
@@ -106,3 +106,7 @@ def test_scan_of_stripe_head_fixture_names_the_unbounded_toolkit(tmp_path: Path)
     # The finding must point at the constructor, not shipgate.yaml.
     source = matches[0].get("source") or {}
     assert (source.get("path") or "").endswith("support_agent.py"), source
+    # Phase 2c: the named high concern elevates the verdict out of the vague
+    # insufficient_evidence into an actionable review_required (the pilot's
+    # silent/IE case is now a routed human review).
+    assert report["release_decision"]["decision"] == "review_required"

From b2c2f9fb75632591bd8cb31e3033e0ac065cad2b Mon Sep 17 00:00:00 2001
From: Pengfei Hu <pengfei@threemoonslab.com>
Date: Tue, 16 Jun 2026 13:35:31 -0700
Subject: [PATCH 2/3] Phase 2c review: keep degraded-evidence cases
 human-routed + fix contract docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses two review findings on PR #222.

[P2] Degraded-evidence cases could route to coding_agent. The new precedence
elevates a below-threshold-evidence case to `review_required` when an active
high/critical finding exists, but `fix_task` keyed its degraded-evidence
authority escalation on `merge_verdict == "insufficient_evidence"`. A
mechanically-fixable high finding + 2/2 low-confidence tools therefore landed
on `review_required` and routed to coding_agent / safe_to_attempt=True — an
auto-fix path on evidence too weak to gate.

Fix: extract the IE-threshold predicate into a single public helper
`evidence_below_ie_threshold(evidence, tool_count=...)` in release_decision.py
(now the one source of truth, used by build_release_decision too), and OR it
into fix_task's `authority_escalation`. A degraded-evidence case is now always
human-routed regardless of which non-mergeable verdict it carries. The human
also gets the concrete "make the surface enumerable" remedies in that case, not
only on the bare IE verdict. Regression tests: degraded-evidence +
mechanically-fixable high → human (was the bug); high finding + full evidence →
still coding_agent (escalation fires on degraded evidence, not severity).

[P2] Contract docs described the old precedence. Updated STABILITY.md,
docs/agent-contract-current.md, and docs/report-reading-for-agents.md to
document the active-high/critical exception, the full precedence order, that
the evidence gap is preserved in `evidence_coverage` (read the counts, not the
label, to detect degraded evidence), and that verify keeps both cases
human-routed. Regenerated llms-full.txt.

Also scrubbed an inaccurate "v0.27" marker (code comments + docs): these
contract docs use `v0.X` for report_schema_version (currently 0.26), and this
change does not bump the schema — replaced with the "Phase 2c" milestone name.

Full suite green; ruff + schema-drift clean; no schema bump.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 STABILITY.md                               | 36 ++++++++--
 docs/agent-contract-current.md             |  2 +-
 docs/report-reading-for-agents.md          |  8 ++-
 llms-full.txt                              |  2 +-
 src/agents_shipgate/ci/release_decision.py | 34 +++++++--
 src/agents_shipgate/cli/verify/fix_task.py | 28 +++++++-
 tests/test_fix_task_contract.py            | 81 ++++++++++++++++++++--
 7 files changed, 167 insertions(+), 24 deletions(-)

diff --git a/STABILITY.md b/STABILITY.md
index a8845d24..02f8321b 100644
--- a/STABILITY.md
+++ b/STABILITY.md
@@ -331,14 +331,36 @@ the scan also has low-confidence tools or source warnings.
 
 When there are no blockers, `insufficient_evidence` means the static inputs are
 not strong enough for Shipgate to gate release confidently. It does **not**
-prove the agent is unsafe. The decision fires when low-confidence tools are at
-least `max(1, ceil(tool_count × 0.5))`, or when source-loader warnings exceed
-`3`. One to three source warnings without blockers route to `review_required`
-so a human still sees the degraded source coverage.
-
-The intended recovery is to provide clearer local evidence — for example an MCP
+prove the agent is unsafe. The evidence is considered below threshold when
+low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or when
+source-loader warnings exceed `3`. One to three source warnings without
+blockers route to `review_required` so a human still sees the degraded source
+coverage.
+
+**Active high/critical findings take precedence over the IE label.** When the
+evidence is below threshold *and* there is an active (non-baseline-accepted)
+high- or critical-severity review finding, the decision is `review_required`
+rather than `insufficient_evidence`. Both verdicts are
+equally non-mergeable (`can_merge_without_human` is false either way), but
+`review_required` points the reviewer at a specific, named finding instead of
+the vaguer "we couldn't see enough." The evidence gap is **not** lost: the
+underlying counts remain in `release_decision.evidence_coverage`
+(`low_confidence_tool_count`, `source_warning_count`, `evidence_gaps[]`), so a
+consumer must read those fields rather than the verdict label alone to know
+whether evidence was degraded. `insufficient_evidence` still fires when the only
+signal is weak evidence with no named high/critical concern; `blocked` still
+takes precedence over both. The precedence is therefore:
+`blocked` → `review_required` (active high/critical) → `insufficient_evidence`
+→ `review_required` (other) → `passed`.
+
+The intended recovery for a degraded-evidence case — whichever of the two
+verdicts it lands on — is to provide clearer local evidence — for example an MCP
 export, OpenAPI spec, explicit local tool inventory, broader OpenAI Agents SDK
-source path, or validation trace — and rerun the scan.
+source path, or validation trace — and rerun the scan. When the decision is
+`review_required` because of an active high/critical finding, also resolve that
+finding. `agents-shipgate verify` keeps both cases human-routed
+(`fix_task.actor = "human"`): a degraded-evidence case never opens an automated
+coding-agent fix path, regardless of which verdict it carries.
 
 ### Check IDs
 
diff --git a/docs/agent-contract-current.md b/docs/agent-contract-current.md
index 2f3df8cb..86a305b5 100644
--- a/docs/agent-contract-current.md
+++ b/docs/agent-contract-current.md
@@ -60,7 +60,7 @@ hashes. It is not a second gate; the release gate remains
 
 In `agents-shipgate-reports/report.json`:
 
-- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Blockers take precedence. If there are no blockers, `insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. One to three source warnings without blockers route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values.
+- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Precedence is `blocked` → `review_required` (active high/critical) → `insufficient_evidence` → `review_required` (other) → `passed`. Blockers take precedence over everything. The evidence is degraded past threshold when low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. With no blockers, degraded evidence is `insufficient_evidence` (added v0.14) **unless** there is an active (non-baseline-accepted) high/critical review finding, in which case it is `review_required` so the reviewer is pointed at a named concern rather than the vaguer label. The evidence gap is preserved either way in `release_decision.evidence_coverage` — read those counts, not the verdict label, to detect degraded evidence. One to three source warnings without blockers also route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values.
 - `release_decision.blockers[]` — items that block release on this run.
 - `release_decision.review_items[]` — items the human reviewer should look at; includes baseline-matched accepted debt.
 - `release_decision.{blockers,review_items}[].capability_refs` (v0.24+) — stable capability IDs copied from the originating finding when a policy or policy-pack rule matched a `CapabilityFactV1`. Empty for findings that are not capability-policy matches. This is audit metadata only; `release_decision.decision` remains the gate.
diff --git a/docs/report-reading-for-agents.md b/docs/report-reading-for-agents.md
index bc0d1b35..6b3bdb06 100644
--- a/docs/report-reading-for-agents.md
+++ b/docs/report-reading-for-agents.md
@@ -28,15 +28,19 @@ The CLI's stable contract names this signal explicitly: run `agents-shipgate con
 
 Branch on the four values (treat unknown future values as `review_required` per the [STABILITY.md additivity contract](../STABILITY.md#what-may-change-additively-in-any-minor-release)):
 
+Precedence (highest first): `blocked` → `review_required` (active high/critical) → `insufficient_evidence` → `review_required` (other) → `passed`.
+
 | `decision` | Meaning | Agent behavior |
 |---|---|---|
 | `"blocked"` | Active, unaccepted blockers exist. CI will fail in strict mode. | Surface blockers; do not auto-merge; do not assert evidence categories — see [`agent-autofix-boundary.md`](agent-autofix-boundary.md). |
-| `"insufficient_evidence"` (v0.14+) | With no active blockers, evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. One to three source warnings route to `review_required`. The scan can't gate release reliably, but this does not prove the agent is unsafe. | Surface the `release_decision.reason` verbatim; recommend gathering deeper sources (MCP exports, OpenAPI specs, explicit inventories, broader SDK source paths, eval traces) and re-running. Do not auto-merge. |
-| `"review_required"` | Review items exist (often baseline-matched accepted debt, capability/intent misalignments, or sub-threshold evidence gaps). | Surface review items as a human handoff; safe mechanical patches may still apply via `apply-patches --confidence high`. |
+| `"insufficient_evidence"` (v0.14+) | With no active blockers **and no active high/critical review finding**, evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. The scan can't gate release reliably, but this does not prove the agent is unsafe. | Surface the `release_decision.reason` verbatim; recommend gathering deeper sources (MCP exports, OpenAPI specs, explicit inventories, broader SDK source paths, eval traces) and re-running. Do not auto-merge. |
+| `"review_required"` | Review items exist (often baseline-matched accepted debt, capability/intent misalignments, or sub-threshold evidence gaps). This **also** covers a degraded-evidence case that carries an active (non-baseline-accepted) high/critical finding — the verdict names the concern instead of the vaguer `insufficient_evidence`, but the evidence gap is still present in `evidence_coverage`. One to three source warnings without blockers also land here. | Surface review items as a human handoff. Safe mechanical patches may still apply via `apply-patches --confidence high` — **unless** evidence is degraded (check `evidence_coverage.low_confidence_tool_count` / `source_warning_count`), in which case treat it like `insufficient_evidence` and gather deeper sources first. `verify`'s `fix_task` already routes degraded-evidence cases to a human. |
 | `"passed"` | No active blockers, no review items, evidence coverage clean. | Mechanical patches (if any) may apply; otherwise nothing to do. |
 
 The decision is **baseline-aware**: a baseline-matched critical surfaces in `release_decision.review_items` (accepted debt), not in `release_decision.blockers`. Compare with the legacy `summary.status` field, which is *baseline-blind* — see Anti-patterns below.
 
+> **Don't switch on the verdict label to detect degraded evidence.** A degraded-evidence case can surface as either `insufficient_evidence` or `review_required` (the latter when an active high/critical finding names the concern). Read `release_decision.evidence_coverage.{low_confidence_tool_count, source_warning_count, evidence_gaps[]}` to know whether evidence was thin, regardless of the label.
+
 ### Step 2 · `release_decision.{reason, blockers, review_items, fail_policy.would_fail_ci}`
 
 Once you have the decision, read the supporting fields:
diff --git a/llms-full.txt b/llms-full.txt
index f91f9b10..9b0184da 100644
--- a/llms-full.txt
+++ b/llms-full.txt
@@ -1009,7 +1009,7 @@ hashes. It is not a second gate; the release gate remains
 
 In `agents-shipgate-reports/report.json`:
 
-- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Blockers take precedence. If there are no blockers, `insufficient_evidence` (added v0.14) fires when evidence coverage is degraded past threshold: low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. One to three source warnings without blockers route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values.
+- `release_decision.decision` — `"blocked"` / `"review_required"` / `"insufficient_evidence"` / `"passed"`. Baseline-aware. **This is the gating signal.** Precedence is `blocked` → `review_required` (active high/critical) → `insufficient_evidence` → `review_required` (other) → `passed`. Blockers take precedence over everything. The evidence is degraded past threshold when low-confidence tools are at least `max(1, ceil(tool_count × 0.5))`, or source-loader warnings exceed `3`. With no blockers, degraded evidence is `insufficient_evidence` (added v0.14) **unless** there is an active (non-baseline-accepted) high/critical review finding, in which case it is `review_required` so the reviewer is pointed at a named concern rather than the vaguer label. The evidence gap is preserved either way in `release_decision.evidence_coverage` — read those counts, not the verdict label, to detect degraded evidence. One to three source warnings without blockers also route to `review_required`. `insufficient_evidence` means the scan cannot confidently gate release from the available static evidence; it does not prove the agent is unsafe. Switch on the enum with a `review_required` fallback for unknown future values.
 - `release_decision.blockers[]` — items that block release on this run.
 - `release_decision.review_items[]` — items the human reviewer should look at; includes baseline-matched accepted debt.
 - `release_decision.{blockers,review_items}[].capability_refs` (v0.24+) — stable capability IDs copied from the originating finding when a policy or policy-pack rule matched a `CapabilityFactV1`. Empty for findings that are not capability-policy matches. This is audit metadata only; `release_decision.decision` remains the gate.
diff --git a/src/agents_shipgate/ci/release_decision.py b/src/agents_shipgate/ci/release_decision.py
index 3eab652c..920cc520 100644
--- a/src/agents_shipgate/ci/release_decision.py
+++ b/src/agents_shipgate/ci/release_decision.py
@@ -34,6 +34,26 @@ def _low_confidence_tool_threshold(tool_count: int) -> int:
     return max(1, math.ceil(tool_count * _LOW_CONFIDENCE_TOOL_RATIO))
 
 
+def evidence_below_ie_threshold(
+    evidence: EvidenceCoverageDecision, *, tool_count: int
+) -> bool:
+    """True when extraction evidence is too weak to gate release on its own.
+
+    This is the exact predicate `build_release_decision` uses to raise the
+    `insufficient_evidence` verdict, exposed so downstream projections can
+    reason about it directly. An active high/critical review finding *elevates*
+    such a case to `review_required` (a more actionable verdict), so the verdict
+    label alone no longer tells a consumer whether evidence was degraded — the
+    verify fix_task authority routing needs this predicate to keep
+    degraded-evidence cases human-routed regardless of which of the two
+    non-mergeable verdicts they landed on.
+    """
+    return (
+        evidence.low_confidence_tool_count >= _low_confidence_tool_threshold(tool_count)
+        or evidence.source_warning_count > _MAX_TOLERATED_SOURCE_WARNINGS
+    )
+
+
 def build_release_decision(
     *,
     report: ReadinessReport,
@@ -55,7 +75,7 @@ def build_release_decision(
     review_items: list[ReleaseDecisionItem] = []
     contribution_rules: list[ContributionRule] = []
     blocker_severities: set[Severity] = {"critical", *fail_on_resolved}
-    # v0.27 (Phase 2c): an active (non-accepted) high/critical finding routed to
+    # Phase 2c: an active (non-accepted) high/critical finding routed to
     # review is a *named* concern — the gate has decided "a human must look",
     # which is more specific than "insufficient_evidence". Tracked here so the
     # decision can prefer review_required over IE when one exists (see below).
@@ -188,7 +208,7 @@ def build_release_decision(
         exit_code=exit_code,
     )
 
-    low_confidence_threshold = _low_confidence_tool_threshold(len(tools))
+    evidence_is_degraded = evidence_below_ie_threshold(evidence, tool_count=len(tools))
 
     decision: ReleaseDecisionStatus
     if blockers:
@@ -201,12 +221,12 @@ def build_release_decision(
         # Both are equally non-auto-mergeable (can_merge_without_human=False),
         # so this loses no safety; the low-confidence detail is still carried
         # in evidence_coverage.evidence_gaps. IE stays the verdict when the
-        # only signal is weak evidence with no named high concern.
+        # only signal is weak evidence with no named high concern. NOTE: when
+        # evidence is *also* degraded here, the verify fix_task still routes to
+        # a human via evidence_below_ie_threshold (the same predicate), so the
+        # elevation never opens an auto-fix path on weak evidence.
         decision = "review_required"
-    elif (
-        evidence.low_confidence_tool_count >= low_confidence_threshold
-        or evidence.source_warning_count > _MAX_TOLERATED_SOURCE_WARNINGS
-    ):
+    elif evidence_is_degraded:
         decision = "insufficient_evidence"
     elif (
         review_items
diff --git a/src/agents_shipgate/cli/verify/fix_task.py b/src/agents_shipgate/cli/verify/fix_task.py
index 15298e4e..7c4ac9a0 100644
--- a/src/agents_shipgate/cli/verify/fix_task.py
+++ b/src/agents_shipgate/cli/verify/fix_task.py
@@ -14,6 +14,7 @@
 
 import shlex
 
+from agents_shipgate.ci.release_decision import evidence_below_ie_threshold
 from agents_shipgate.core.agent_controls import FORBIDDEN_SHORTCUTS
 from agents_shipgate.schemas.report import Finding, ReadinessReport
 from agents_shipgate.schemas.verifier import (
@@ -109,6 +110,19 @@ def build_fix_task(
 
     gating = _gating_findings(report)
 
+    # Degraded static evidence (below the IE threshold) is an authority gap
+    # regardless of which verdict it produced. An active high/critical finding
+    # elevates a degraded-evidence case from `insufficient_evidence` to
+    # `review_required` (a more actionable verdict),
+    # so keying the escalation on `merge_verdict == "insufficient_evidence"`
+    # alone would let a mechanically-fixable high finding open a coding-agent
+    # auto-fix path on weak evidence. Compute the threshold directly from the
+    # same predicate the release decision uses so the two never drift.
+    evidence_degraded = evidence_below_ie_threshold(
+        report.release_decision.evidence_coverage,
+        tool_count=len(report.tool_inventory),
+    )
+
     # The coding-agent route is the only non-human outcome and it MUST fail
     # closed: every gating finding has to be explicitly mechanical
     # (``autofix_safe is True`` AND ``requires_human_review is False``). A
@@ -123,6 +137,7 @@ def build_fix_task(
         capability_review.policy_weakened
         or capability_review.trust_root_touched
         or merge_verdict in {"insufficient_evidence", "unknown"}
+        or evidence_degraded
     )
     if mechanical and not authority_escalation:
         return VerifierFixTask(
@@ -143,7 +158,11 @@ def build_fix_task(
         actor="human",
         safe_to_attempt=False,
         instructions=_human_instructions(
-            report, capability_review, gating, merge_verdict=merge_verdict
+            report,
+            capability_review,
+            gating,
+            merge_verdict=merge_verdict,
+            evidence_degraded=evidence_degraded,
         ),
         allowed_repairs=_human_repairs(
             report,
@@ -181,11 +200,16 @@ def _human_instructions(
     gating: list[Finding],
     *,
     merge_verdict: MergeVerdict = "human_review_required",
+    evidence_degraded: bool = False,
 ) -> list[str]:
     decision = report.release_decision
     assert decision is not None
     out: list[str] = [decision.reason]
-    if merge_verdict == "insufficient_evidence":
+    # Surface the concrete "make the hidden authority enumerable" remedies
+    # whenever evidence is degraded — not only on the bare IE verdict. A
+    # high-finding case elevated to review_required carries the same
+    # evidence gap and the human needs the same remedy.
+    if merge_verdict == "insufficient_evidence" or evidence_degraded:
         out.extend(_insufficient_evidence_remedies(report))
     if capability_review.policy_weakened:
         out.append(
diff --git a/tests/test_fix_task_contract.py b/tests/test_fix_task_contract.py
index 99d73de1..41c1c7e1 100644
--- a/tests/test_fix_task_contract.py
+++ b/tests/test_fix_task_contract.py
@@ -67,8 +67,17 @@ def _item(finding: Finding) -> ReleaseDecisionItem:
     )
 
 
-def _report(*, decision: str, findings, blockers=(), review_items=()) -> ReadinessReport:
-    return ReadinessReport(
+def _report(
+    *,
+    decision: str,
+    findings,
+    blockers=(),
+    review_items=(),
+    low_confidence_tool_count: int = 0,
+    source_warning_count: int = 0,
+    tool_inventory=None,
+) -> ReadinessReport:
+    report = ReadinessReport(
         run_id="r",
         project={"name": "p"},
         agent={"name": "a"},
@@ -82,8 +91,8 @@ def _report(*, decision: str, findings, blockers=(), review_items=()) -> Readine
             evidence_coverage=EvidenceCoverageDecision(
                 level="static",
                 human_review_recommended=False,
-                source_warning_count=0,
-                low_confidence_tool_count=0,
+                source_warning_count=source_warning_count,
+                low_confidence_tool_count=low_confidence_tool_count,
             ),
             baseline_delta=BaselineDelta(enabled=False),
             fail_policy=FailPolicy(
@@ -97,6 +106,9 @@ def _report(*, decision: str, findings, blockers=(), review_items=()) -> Readine
         tool_surface=ToolSurfaceSummary(total_tools=0, high_risk_tools=0),
         findings=list(findings),
     )
+    if tool_inventory is not None:
+        report.tool_inventory = list(tool_inventory)
+    return report
 
 
 def _review(*, policy_weakened=False, trust_root_touched=False) -> VerifierCapabilityReview:
@@ -198,6 +210,67 @@ def test_insufficient_evidence_forces_human() -> None:
     assert task.safe_to_attempt is False
 
 
+def test_degraded_evidence_under_review_required_forces_human() -> None:
+    # v0.27 regression guard: an active high finding elevates a degraded-evidence
+    # case from insufficient_evidence to review_required. The finding here is
+    # mechanically fixable (autofix_safe, no human review), so without an
+    # evidence-aware escalation it would route to coding_agent / safe_to_attempt
+    # — opening an auto-fix path on evidence too weak to gate. The fix_task must
+    # still fail closed to a human because the evidence is below the IE
+    # threshold (2 low-confidence tools of 2 → threshold 1).
+    f = _finding(
+        "F1",
+        requires_human_review=False,
+        autofix_safe=True,
+        severity="high",
+        recommendation="Add the missing scope bound.",
+    )
+    report = _report(
+        decision="review_required",
+        findings=[f],
+        review_items=[f],
+        low_confidence_tool_count=2,
+        tool_inventory=[
+            {"name": "a", "source_type": "langchain", "source_ref": "t.py", "confidence": "low"},
+            {"name": "b", "source_type": "langchain", "source_ref": "t.py", "confidence": "low"},
+        ],
+    )
+    task = _fix_task(report)
+    assert task is not None
+    assert task.actor == "human"
+    assert task.safe_to_attempt is False
+    # The human still gets the concrete "make the surface enumerable" remedy,
+    # even though the verdict is review_required rather than the bare IE verdict.
+    joined = " ".join(task.instructions)
+    assert "explicit local tool inventory" in joined
+
+
+def test_review_required_with_full_evidence_stays_mechanical() -> None:
+    # Counterpart guard: a mechanically-fixable high finding with HIGH-confidence
+    # evidence (no gap) must still route to the coding agent — the escalation
+    # fires on degraded evidence, not on severity.
+    f = _finding(
+        "F1",
+        requires_human_review=False,
+        autofix_safe=True,
+        severity="high",
+        recommendation="Add an owner field from CODEOWNERS.",
+    )
+    report = _report(
+        decision="review_required",
+        findings=[f],
+        review_items=[f],
+        low_confidence_tool_count=0,
+        tool_inventory=[
+            {"name": "a", "source_type": "mcp", "source_ref": "m.json", "confidence": "high"},
+        ],
+    )
+    task = _fix_task(report)
+    assert task is not None
+    assert task.actor == "coding_agent"
+    assert task.safe_to_attempt is True
+
+
 # --- Instructions / guardrails / verification -------------------------------
 
 

From b48d85e55bce6cb03344c190f663c7c4622ef24a Mon Sep 17 00:00:00 2001
From: Pengfei Hu <pengfei@threemoonslab.com>
Date: Tue, 16 Jun 2026 13:53:08 -0700
Subject: [PATCH 3/3] Phase 2c review: agent_summary must not recommend
 auto-apply on degraded evidence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses the third review finding on PR #222, the report-only counterpart to
the verify fix_task fix.

[P2] `agent_summary.first_recommended_action` still recommended auto-apply on
degraded evidence. The Phase 2c elevation means a below-IE-threshold scan now
surfaces as `review_required` when an active high/critical finding exists.
agent_summary's action picker prioritized `auto_appliable > 0` before evidence
remediation for ALL review_required cases — on the prior assumption that
review_required implied trustworthy-enough evidence (true before Phase 2c). So
an active high auto_apply finding + 2/2 low-confidence tools emitted a runnable
`apply-patches ... --apply` command, telling a report-only consumer to "fix" a
scan the gate already said it can't trust.

Fix: thread the total tool count into build_agent_summary (report_builder has
`tools`) so it can evaluate the shared `evidence_below_ie_threshold` predicate,
and add a review_required + below-threshold branch to the action picker that
emits the evidence/human info action before auto-apply — exactly like the
insufficient_evidence path. Surgical: only the genuinely below-threshold case
flips; sub-threshold review_required (1-3 source warnings, or low-confidence
tools below the ratio) still recommends apply-patches with an evidence note.

Regression tests: active high auto_apply + 2/2 low tools (tool_count=2) →
info/no-command (was the bug); same finding with 2/10 low tools (sub-threshold)
→ still the apply-patches command (escalation keys on the ratio, not severity).

Verified the only two machine-actionable, verdict-conditional apply-patches
recommenders are fix_task (fixed last round) and agent_summary (this round);
the other apply-patches mentions are static CLI help / rendered instructions.

Full suite green; ruff + schema-drift clean; no schema bump; no golden ripple.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../core/findings/agent_summary.py            | 65 +++++++++++++--
 .../core/findings/report_builder.py           |  5 ++
 tests/test_agent_action_summary.py            | 82 +++++++++++++++++++
 3 files changed, 147 insertions(+), 5 deletions(-)

diff --git a/src/agents_shipgate/core/findings/agent_summary.py b/src/agents_shipgate/core/findings/agent_summary.py
index 68ee491c..71434f31 100644
--- a/src/agents_shipgate/core/findings/agent_summary.py
+++ b/src/agents_shipgate/core/findings/agent_summary.py
@@ -2,6 +2,7 @@
 
 import shlex
 
+from agents_shipgate.ci.release_decision import evidence_below_ie_threshold
 from agents_shipgate.schemas.report import (
     AgentSummary,
     AgentSummaryAction,
@@ -17,6 +18,7 @@ def build_agent_summary(
     findings: list[Finding],
     release_decision: ReleaseDecision | None,
     json_report_path: str | None = None,
+    tool_count: int = 0,
 ) -> AgentSummary:
     """Construct the top-level ``agent_summary`` block.
 
@@ -41,6 +43,7 @@ def build_agent_summary(
         review_item_count = 0
         reason = "No release decision computed."
         evidence_recommended = False
+        evidence_below_threshold = False
     else:
         verdict = release_decision.decision
         blocker_count = len(release_decision.blockers)
@@ -69,6 +72,22 @@ def build_agent_summary(
                 or release_decision.evidence_coverage.source_warning_count > 0
             )
         )
+        # `evidence_recommended` is the BROAD signal (any review-worthy
+        # evidence gap, including 1-3 sub-threshold source warnings).
+        # `evidence_below_threshold` is the NARROW one: evidence weak
+        # enough that, absent an active high/critical finding, the verdict
+        # would have been `insufficient_evidence`. Since that finding now
+        # *elevates* such a case to `review_required` (Phase 2c), the
+        # narrow signal is what tells the action picker to put evidence
+        # remediation ahead of auto-apply — applying patches never makes a
+        # below-threshold scan trustworthy. Uses the same predicate
+        # `build_release_decision` does, so the two never disagree.
+        evidence_below_threshold = bool(
+            release_decision.evidence_coverage
+            and evidence_below_ie_threshold(
+                release_decision.evidence_coverage, tool_count=tool_count
+            )
+        )
 
     active_findings = [f for f in findings if not f.suppressed]
     auto_appliable = sum(
@@ -215,6 +234,7 @@ def build_agent_summary(
         active_findings=active_findings,
         json_report_path=json_report_path,
         evidence_recommended=evidence_recommended,
+        evidence_below_threshold=evidence_below_threshold,
         evidence_reason=(
             reason
             if (evidence_recommended or verdict == "insufficient_evidence")
@@ -242,6 +262,7 @@ def _build_first_recommended_action(
     active_findings: list[Finding],
     json_report_path: str | None,
     evidence_recommended: bool = False,
+    evidence_below_threshold: bool = False,
     evidence_reason: str = "",
 ) -> AgentSummaryAction | None:
     """Deterministic next-step picker for ``agent_summary``.
@@ -255,6 +276,13 @@ def _build_first_recommended_action(
        release, and running apply-patches first would contradict the
        headline. Tell the agent to fix the trust problem before
        cleaning up findings.
+    1b. Verdict is review_required BUT evidence is below the IE
+       threshold (an active high/critical finding elevated it out of
+       insufficient_evidence — Phase 2c) → same as (1): evidence
+       remediation + human review of the named concern outrank
+       auto-apply. Without this, a report-only consumer following
+       agent_summary would be told to run ``apply-patches --apply`` on a
+       scan the gate has already said it can't trust.
     2. Auto-applicable patches available → propose ``apply-patches``,
        but only as a ``command`` action when we know the actual JSON
        report path (so the command never points at the wrong file).
@@ -281,17 +309,44 @@ def _build_first_recommended_action(
             ),
         )
 
+    if verdict == "review_required" and evidence_below_threshold:
+        # An active high/critical finding elevated a below-IE-threshold
+        # scan to review_required (Phase 2c). Evidence is still too weak to
+        # gate, so — exactly like insufficient_evidence — gathering better
+        # evidence and routing the named concern to a human outrank any
+        # auto-apply patch. Emitting a runnable apply-patches command here
+        # would tell a report-only consumer to "fix" a scan the gate has
+        # already said it cannot trust.
+        base = (
+            evidence_reason
+            or "Evidence coverage is below threshold; scan results are not "
+            "trustworthy enough to gate release."
+        )
+        return AgentSummaryAction(
+            kind="info",
+            command=None,
+            why=(
+                f"{base} A human must review the active high/critical "
+                "finding(s), and you should gather deeper evidence (e.g. "
+                "MCP/OpenAPI inputs, eval traces, additional source files) "
+                "before re-running the scan; applying patches does not clear "
+                "the evidence gap."
+            ),
+        )
+
     if auto_appliable > 0:
         why = (
             f"{auto_appliable} finding(s) carry high-confidence patches "
             "safe to apply without human review."
         )
         if verdict == "review_required" and evidence_recommended:
-            # The patches are still worth applying (the scan IS
-            # trustworthy enough to gate at review_required, unlike
-            # the insufficient_evidence path that outranks auto-apply
-            # entirely). But the action's why must call out the
-            # evidence gap so the agent doesn't treat apply-patches
+            # Reaching here means evidence is recommended but only
+            # *sub-threshold* (1-3 source warnings, or human-review-
+            # recommended without enough low-confidence tools) — the
+            # below-threshold case was already intercepted above and
+            # routed to evidence remediation. So the patches ARE worth
+            # applying here, but the why must still call out the
+            # sub-threshold gap so the agent doesn't treat apply-patches
             # as the *only* next step — the human still needs to
             # review the source warnings / low-confidence tools.
             evidence_note = evidence_reason or (
diff --git a/src/agents_shipgate/core/findings/report_builder.py b/src/agents_shipgate/core/findings/report_builder.py
index dfffb37a..4c1ad818 100644
--- a/src/agents_shipgate/core/findings/report_builder.py
+++ b/src/agents_shipgate/core/findings/report_builder.py
@@ -115,6 +115,11 @@ def build_report(
         findings=findings,
         release_decision=report.release_decision,
         json_report_path=generated_reports.get("json"),
+        # Threaded so agent_summary can detect below-IE-threshold evidence
+        # (the ratio needs the total tool count) and put evidence
+        # remediation ahead of auto-apply even when an active high/critical
+        # finding elevated the verdict to review_required (Phase 2c).
+        tool_count=len(tools),
     )
     # v0.20 NOTE: ``report.reviewer_summary`` is NOT built here. It
     # depends on ``report.misalignments`` which ``apply_capability_diff``
diff --git a/tests/test_agent_action_summary.py b/tests/test_agent_action_summary.py
index 5bb5f821..750bfb12 100644
--- a/tests/test_agent_action_summary.py
+++ b/tests/test_agent_action_summary.py
@@ -870,6 +870,88 @@ def test_insufficient_evidence_outranks_auto_apply_in_first_action():
     assert "apply-patches" not in why or "does not clear" in why
 
 
+def test_review_required_below_threshold_outranks_auto_apply():
+    """Phase 2c regression: an active high finding elevates a
+    below-IE-threshold scan from insufficient_evidence to review_required.
+    The finding is auto-applicable, so the OLD code took the auto-apply
+    branch and emitted a runnable `apply-patches --apply` command — telling
+    a report-only consumer to "fix" a scan the gate already said it can't
+    trust. first_recommended_action must instead be the evidence/human
+    info action, exactly like the insufficient_evidence path."""
+    finding = _make_finding(
+        check_id="SHIP-SCOPE-TOOLKIT-UNBOUNDED",
+        severity="high",
+        agent_action="auto_apply",
+    )
+    reason = (
+        "1 finding need review and evidence coverage is incomplete."
+    )
+    decision = _make_release_decision(
+        decision="review_required",
+        review_items=["SHIP-SCOPE-TOOLKIT-UNBOUNDED"],
+        reason=reason,
+        evidence_human_review_recommended=True,
+    )
+    # 2 of 2 low-confidence tools → below the IE threshold (ceil(2*0.5)=1).
+    decision.evidence_coverage = EvidenceCoverageDecision(
+        level=decision.evidence_coverage.level,
+        human_review_recommended=True,
+        source_warning_count=0,
+        low_confidence_tool_count=2,
+    )
+    summary = build_agent_summary(
+        findings=[finding],
+        release_decision=decision,
+        json_report_path="/abs/agents-shipgate-reports/report.json",
+        tool_count=2,
+    )
+    assert summary.verdict == "review_required"
+    assert summary.auto_appliable_patches == 1
+    assert summary.first_recommended_action is not None
+    # Must NOT propose a runnable apply-patches command.
+    assert summary.first_recommended_action.kind == "info"
+    assert summary.first_recommended_action.command is None
+    why = summary.first_recommended_action.why
+    assert "gather deeper evidence" in why
+    assert "apply-patches" not in why or "does not clear" in why
+    assert "human" in why.lower()
+
+
+def test_review_required_sub_threshold_evidence_keeps_auto_apply():
+    """Counterpart guard: the SAME auto-applicable high finding with only
+    sub-threshold evidence (2 low-confidence tools out of 10 → below the
+    ceil(10*0.5)=5 threshold) is NOT degraded. apply-patches stays the
+    first action (the scan is trustworthy enough to gate at
+    review_required); the escalation keys on the ratio, not severity."""
+    finding = _make_finding(
+        check_id="SHIP-SCOPE-TOOLKIT-UNBOUNDED",
+        severity="high",
+        agent_action="auto_apply",
+    )
+    decision = _make_release_decision(
+        decision="review_required",
+        review_items=["SHIP-SCOPE-TOOLKIT-UNBOUNDED"],
+        reason="1 finding requires review.",
+        evidence_human_review_recommended=True,
+    )
+    decision.evidence_coverage = EvidenceCoverageDecision(
+        level=decision.evidence_coverage.level,
+        human_review_recommended=True,
+        source_warning_count=0,
+        low_confidence_tool_count=2,
+    )
+    summary = build_agent_summary(
+        findings=[finding],
+        release_decision=decision,
+        json_report_path="/abs/agents-shipgate-reports/report.json",
+        tool_count=10,
+    )
+    assert summary.verdict == "review_required"
+    assert summary.first_recommended_action is not None
+    assert summary.first_recommended_action.kind == "command"
+    assert "apply-patches" in (summary.first_recommended_action.command or "")
+
+
 def test_build_agent_summary_insufficient_evidence_uses_reason_as_headline():
     """A `insufficient_evidence` verdict must NOT fall through to the
     "Release ready" else-branch. The headline should surface the