From 2688f3733c599e3c035a87ded35936b77d4f3a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Thu, 18 Jun 2026 01:35:17 +0200 Subject: [PATCH 1/4] fix: harden phase completion diagnostics --- tests/test_event_recording.py | 6 +- tests/test_findings_quality.py | 76 +++++++++++ tests/test_gate_check.py | 15 +++ tests/test_phase_artifacts_cli.py | 82 ++++++++++++ tests/test_phase_failure_state_reset.py | 77 +++++++++++ tests/test_phases_completion.py | 127 +++++++++++++++++- tools/codecome/harness.py | 76 ++++++++++- tools/codecome/transcript.py | 6 +- tools/findings/quality.py | 165 ++++++++++++++++++++++++ tools/phases/artifact_checks.py | 77 ++++++++++- tools/phases/completion.py | 68 +++++++++- tools/phases/gates.py | 6 +- 12 files changed, 762 insertions(+), 19 deletions(-) create mode 100644 tests/test_findings_quality.py create mode 100644 tools/findings/quality.py diff --git a/tests/test_event_recording.py b/tests/test_event_recording.py index a4f85251..26177c33 100644 --- a/tests/test_event_recording.py +++ b/tests/test_event_recording.py @@ -1,4 +1,5 @@ import sys +import re from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "tools")) @@ -77,4 +78,7 @@ def test_phase_transcript_does_not_truncate_existing_file(tmp_path, monkeypatch) assert existing.read_text(encoding="utf-8") == "keep me\n" assert transcript.path != existing - assert transcript.path.name.startswith("last-phase-1c-no-finding-attempt-1-") + assert re.fullmatch( + r"last-phase-1c-no-finding-attempt-1-\d{8}-\d{6}-pid\d+\.jsonl", + transcript.path.name, + ) diff --git a/tests/test_findings_quality.py b/tests/test_findings_quality.py new file mode 100644 index 00000000..ef00277c --- /dev/null +++ b/tests/test_findings_quality.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "tools")) + + +def _write_phase2_finding(path: Path, *, title: str, category: str, target_area: str, summary: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + "---\n" + "id: \"CC-0099\"\n" + f"title: \"{title}\"\n" + "status: \"PENDING\"\n" + "severity: \"MEDIUM\"\n" + "cvss_v4:\n vector: \"\"\n score: 0.0\n justification: \"\"\n" + "confidence: \"MEDIUM\"\n" + f"category: \"{category}\"\n" + "cwe: [\"CWE-287\"]\n" + "language: \"c\"\n" + f"target_area: \"{target_area}\"\n" + "files: [\"scheduler/ipp.c\"]\n" + "symbols: [\"ippReadIO\"]\n" + "entry_points: [\"IPP request parser\"]\n" + "sources: [\"network IPP request\"]\n" + "sinks: [\"authorization decision\"]\n" + "trust_boundary: \"remote client to scheduler\"\n" + "assets_at_risk: [\"scheduler authorization state\"]\n" + "validation:\n status: \"NOT_STARTED\"\n methods: []\n evidence_dir: \"itemdb/evidence/CC-0099\"\n summary: \"\"\n" + "exploitation:\n status: \"NOT_STARTED\"\n impact_demonstrated: \"\"\n exploit_type: \"\"\n severity_before: \"\"\n severity_after: \"\"\n artifacts_dir: \"itemdb/evidence/CC-0099/exploits\"\n summary: \"\"\n" + "created_at: \"2026-06-18\"\n" + "updated_at: \"2026-06-18\"\n" + "---\n\n" + "# Summary\n\n" + f"{summary}\n\n" + "# Target context\n\n" + "The CUPS scheduler accepts IPP requests from remote clients and maps request metadata into authorization state.\n\n" + "# Affected code\n\n" + "The affected path is `scheduler/ipp.c` in the IPP request parser and authorization handoff.\n\n" + "# Vulnerability hypothesis\n\n" + "A remote client may control the user identity attribute that reaches an authorization decision without canonicalization.\n\n" + "# Source-to-sink reasoning\n\n" + "The source is a network IPP request. The parser copies the identity attribute into scheduler request state. The sink is an authorization check that trusts that state.\n\n" + "# Attackability / trigger conditions\n\n" + "An unauthenticated remote client can send a crafted IPP request before authorization is evaluated.\n\n" + "# Impact\n\n" + "Successful exploitation could bypass authorization checks and perform scheduler operations as a more privileged user.\n\n" + "# Validation plan\n\n" + "Send crafted IPP requests with controlled identity attributes and compare the scheduler authorization outcome against a baseline request.\n\n" + "# Counter-analysis\n\n" + "Review parser normalization, authentication layers, and later authorization checks to determine whether attacker control is removed before the sink.\n\n" + "# Validation result\n\n" + "Pending.\n\n" + "# Evidence\n\n" + "Pending.\n", + encoding="utf-8", + ) + + +def test_phase2_quality_rejects_test_template_artifact(tmp_path: Path) -> None: + from findings.quality import validate_phase2_finding_quality + + finding = tmp_path / "itemdb" / "findings" / "PENDING" / "CC-0099-test-finding.md" + _write_phase2_finding( + finding, + title="Test finding to see template", + category="Test", + target_area="Testing", + summary="This is a test finding created to verify the template system. It does not represent an actual vulnerability.", + ) + + errors = validate_phase2_finding_quality(finding) + + assert any("test/template artifact" in error for error in errors), errors + assert any("not an actual target vulnerability" in error for error in errors), errors diff --git a/tests/test_gate_check.py b/tests/test_gate_check.py index 4ab99826..b4e06f1c 100644 --- a/tests/test_gate_check.py +++ b/tests/test_gate_check.py @@ -127,3 +127,18 @@ def test_gate_phase_4_rejects_wrong_status(tmp_path, monkeypatch): assert exit_code == 1 finally: gates_module.ROOT = original_root + + +def test_gate_phase_3_no_pending_is_noop_success(tmp_path, capsys): + original_root = gates_module.ROOT + gates_module.ROOT = tmp_path + (tmp_path / "itemdb" / "findings" / "PENDING").mkdir(parents=True) + + try: + exit_code = gates_module.gate_phase_3() + finally: + gates_module.ROOT = original_root + + out = capsys.readouterr().out + assert exit_code == 0 + assert "nothing to review" in out diff --git a/tests/test_phase_artifacts_cli.py b/tests/test_phase_artifacts_cli.py index b92bf7aa..bdba1bee 100644 --- a/tests/test_phase_artifacts_cli.py +++ b/tests/test_phase_artifacts_cli.py @@ -216,3 +216,85 @@ def test_has_valid_threat_model_returns_false_when_missing(tmp_path: Path) -> No with patch("phases.artifact_checks.ROOT", tmp_path): assert not has_valid_threat_model() + + +def test_phase2_artifacts_accept_explicit_no_findings_summary(tmp_path: Path) -> None: + from phases.artifact_checks import check_phase_2_artifacts + + runs = tmp_path / "runs" + runs.mkdir(parents=True) + (runs / "phase-2-summary-2026-06-16-120000.md").write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n", + encoding="utf-8", + ) + + with patch("phases.artifact_checks.ROOT", tmp_path): + assert check_phase_2_artifacts() == [] + + +def test_phase2_artifacts_reject_stub_finding(tmp_path: Path) -> None: + from phases.artifact_checks import check_phase_2_artifacts + + runs = tmp_path / "runs" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + runs.mkdir(parents=True) + pending.mkdir(parents=True) + (runs / "phase-2-summary-2026-06-16-120000.md").write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| CC-0001 | Stub | itemdb/findings/PENDING/CC-0001-stub.md |\n", + encoding="utf-8", + ) + (pending / "CC-0001-stub.md").write_text( + "---\n" + "id: \"CC-0001\"\n" + "title: \"Stub\"\n" + "status: \"PENDING\"\n" + "severity: \"MEDIUM\"\n" + "cvss_v4:\n vector: \"\"\n score: 0.0\n justification: \"\"\n" + "confidence: \"LOW\"\ncategory: \"Unclassified\"\ncwe: []\nlanguage: \"unknown\"\ntarget_area: \"unknown\"\n" + "files: []\nsymbols: []\nentry_points: []\nsources: []\nsinks: []\ntrust_boundary: \"unknown\"\nassets_at_risk: []\n" + "validation:\n status: \"NOT_STARTED\"\n methods: []\n evidence_dir: \"itemdb/evidence/CC-0001\"\n summary: \"\"\n" + "exploitation:\n status: \"NOT_STARTED\"\n impact_demonstrated: \"\"\n exploit_type: \"\"\n severity_before: \"\"\n severity_after: \"\"\n artifacts_dir: \"itemdb/evidence/CC-0001/exploits\"\n summary: \"\"\n" + "created_at: \"2026-06-16\"\nupdated_at: \"2026-06-16\"\n---\n\n# Summary\n\nPending.\n", + encoding="utf-8", + ) + + with patch("phases.artifact_checks.ROOT", tmp_path): + errors = check_phase_2_artifacts() + + assert any("not a complete Phase 2 finding" in error for error in errors), errors + + +def test_phase2_artifacts_report_all_quality_errors(tmp_path: Path, monkeypatch) -> None: + from phases.artifact_checks import check_phase_2_artifacts + from findings import quality as quality_mod + + runs = tmp_path / "runs" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + runs.mkdir(parents=True) + pending.mkdir(parents=True) + (runs / "phase-2-summary-2026-06-18-120000.md").write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| CC-0099 | Many | itemdb/findings/PENDING/CC-0099-many-errors.md |\n", + encoding="utf-8", + ) + (pending / "CC-0099-many-errors.md").write_text("placeholder", encoding="utf-8") + monkeypatch.setattr( + quality_mod, + "validate_phase2_finding_quality", + lambda _path: [f"artifact-error-{i}" for i in range(7)], + ) + + with patch("phases.artifact_checks.ROOT", tmp_path): + errors = check_phase_2_artifacts() + + joined = "\n".join(errors) + for i in range(7): + assert f"artifact-error-{i}" in joined diff --git a/tests/test_phase_failure_state_reset.py b/tests/test_phase_failure_state_reset.py index 24425772..e70ad83a 100644 --- a/tests/test_phase_failure_state_reset.py +++ b/tests/test_phase_failure_state_reset.py @@ -117,6 +117,83 @@ def fake_resume_prompt(*_args, failure_details=None, **_kw): assert captured == [["stale failure from previous attempt"], None] +def test_phase_mode_terminal_stop_missing_artifacts_auto_resumes(monkeypatch): + from codecome import harness as harness_mod + from codecome import runner as runner_mod + + transcript = harness_mod.ROOT / "tmp" / "fake.jsonl" + attempts = iter([ + (0, "ses_test", _terminal_result(), transcript), + (0, "ses_test", _terminal_result(), transcript), + ]) + completion_results = iter([ + (False, ["runs/phase-2-summary*.md was not updated during this run"]), + (True, []), + ]) + captured: list[list[str] | None] = [] + prompts: list[str] = [] + + def fake_run_single_attempt(_args, _console, prompt, *_a, **_kw): + prompts.append(prompt) + return next(attempts) + + def fake_resume_prompt(*_args, failure_details=None, **_kw): + captured.append(failure_details) + return "resume prompt" + + monkeypatch.setattr(harness_mod, "ServerRunner", lambda: _FakeServerRunner()) + monkeypatch.setenv("CODECOME_MAX_ITERATION_RETRIES", "1") + monkeypatch.setattr(harness_mod, "load_prompt", lambda *_a, **_kw: "initial prompt") + monkeypatch.setattr(harness_mod, "resolve_runtime_config", lambda _agent: _FakeRuntimeConfig()) + monkeypatch.setattr(harness_mod, "configure_rendering", lambda *_a, **_kw: None) + monkeypatch.setattr(runner_mod, "_run_single_attempt", fake_run_single_attempt) + monkeypatch.setattr( + harness_mod, + "check_phase_graceful_completion", + lambda *_a, **_kw: next(completion_results), + ) + monkeypatch.setattr(harness_mod, "build_phase_resume_prompt", fake_resume_prompt) + + from findings import checks_entry + monkeypatch.setattr(checks_entry, "run_frontmatter_validation", lambda: (0, "")) + + rc = harness_mod.run_phase_mode(_args()) + + assert rc == 0 + assert prompts == ["initial prompt", "resume prompt"] + assert captured == [["runs/phase-2-summary*.md was not updated during this run"]] + + +def test_phase_mode_final_failure_prints_gate_failures(monkeypatch, capsys): + from codecome import harness as harness_mod + from codecome import runner as runner_mod + + transcript = harness_mod.ROOT / "tmp" / "fake.jsonl" + + monkeypatch.setattr(harness_mod, "ServerRunner", lambda: _FakeServerRunner()) + monkeypatch.setenv("CODECOME_MAX_ITERATION_RETRIES", "0") + monkeypatch.setattr(harness_mod, "load_prompt", lambda *_a, **_kw: "initial prompt") + monkeypatch.setattr(harness_mod, "resolve_runtime_config", lambda _agent: _FakeRuntimeConfig()) + monkeypatch.setattr(harness_mod, "configure_rendering", lambda *_a, **_kw: None) + monkeypatch.setattr( + runner_mod, + "_run_single_attempt", + lambda *_a, **_kw: (0, "ses_test", _terminal_result(), transcript), + ) + monkeypatch.setattr( + harness_mod, + "check_phase_graceful_completion", + lambda *_a, **_kw: (False, ["Invalid: CC-0015 still contains template guidance"]), + ) + + rc = harness_mod.run_phase_mode(_args()) + output = capsys.readouterr().out + + assert rc == 2 + assert "remaining gate failures" in output + assert "CC-0015 still contains template guidance" in output + + def test_phase1_subphase_does_not_reuse_previous_attempt_failures(monkeypatch): from codecome import phase_1 as p1 diff --git a/tests/test_phases_completion.py b/tests/test_phases_completion.py index 22d650a9..219540ee 100644 --- a/tests/test_phases_completion.py +++ b/tests/test_phases_completion.py @@ -213,8 +213,8 @@ def test_phase1_path_diagnostics_handle_notes_root_outside_root(self, tmp_path): completion_mod.SANDBOX_PLAN_PATH = orig_sandbox_plan completion_mod.ROOT = orig_root - def test_phase2_accepts_summary_with_no_new_findings(self, tmp_path): - """Phase 2 should pass when only the run summary is fresh (no new findings).""" + def test_phase2_accepts_explicit_no_findings_summary(self, tmp_path): + """Phase 2 may pass with zero findings when the summary says so.""" import os import phases.completion as completion_mod @@ -224,7 +224,13 @@ def test_phase2_accepts_summary_with_no_new_findings(self, tmp_path): completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" (tmp_path / "runs").mkdir(parents=True, exist_ok=True) summary = tmp_path / "runs" / "phase-2-summary-2026-06-05-143022.md" - summary.write_text("", encoding="utf-8") + summary.write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n", + encoding="utf-8", + ) run_start = time.time() - 60 os.utime(summary, (run_start + 60, run_start + 60)) @@ -236,6 +242,70 @@ def test_phase2_accepts_summary_with_no_new_findings(self, tmp_path): completion_mod.ROOT = orig_root completion_mod.FINDINGS_ROOT = orig_findings_root + def test_phase2_rejects_vague_no_finding_summary(self, tmp_path): + """A fresh summary without explicit no-finding wording is not enough.""" + import os + import phases.completion as completion_mod + + orig_root = completion_mod.ROOT + orig_findings_root = completion_mod.FINDINGS_ROOT + completion_mod.ROOT = tmp_path + completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" + (tmp_path / "runs").mkdir(parents=True, exist_ok=True) + summary = tmp_path / "runs" / "phase-2-summary-2026-06-05-143022.md" + summary.write_text("# Goal\n\nLooked around.\n", encoding="utf-8") + run_start = time.time() - 60 + os.utime(summary, (run_start + 60, run_start + 60)) + + try: + ok, failures = completion_mod.check_phase_graceful_completion("2", None, run_start) + assert ok is False + assert any("does not explicitly state" in f for f in failures), failures + finally: + completion_mod.ROOT = orig_root + completion_mod.FINDINGS_ROOT = orig_findings_root + + def test_phase2_rejects_fresh_stub_finding(self, tmp_path): + """A fresh template-like finding should not satisfy Phase 2 completion.""" + import os + import phases.completion as completion_mod + + orig_root = completion_mod.ROOT + orig_findings_root = completion_mod.FINDINGS_ROOT + completion_mod.ROOT = tmp_path + completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + pending.mkdir(parents=True, exist_ok=True) + (tmp_path / "runs").mkdir(parents=True, exist_ok=True) + summary = tmp_path / "runs" / "phase-2-summary-2026-06-05-143022.md" + summary.write_text("# Findings created\n\n| ID | Title | Path |\n|---|---|---|\n| CC-0001 | Stub | itemdb/findings/PENDING/CC-0001-stub.md |\n", encoding="utf-8") + finding = pending / "CC-0001-stub.md" + finding.write_text( + "---\n" + "id: \"CC-0001\"\n" + "title: \"Stub\"\n" + "status: \"PENDING\"\n" + "severity: \"MEDIUM\"\n" + "cvss_v4:\n vector: \"\"\n score: 0.0\n justification: \"\"\n" + "confidence: \"LOW\"\ncategory: \"Unclassified\"\ncwe: []\nlanguage: \"unknown\"\ntarget_area: \"unknown\"\n" + "files: []\nsymbols: []\nentry_points: []\nsources: []\nsinks: []\ntrust_boundary: \"unknown\"\nassets_at_risk: []\n" + "validation:\n status: \"NOT_STARTED\"\n methods: []\n evidence_dir: \"itemdb/evidence/CC-0001\"\n summary: \"\"\n" + "exploitation:\n status: \"NOT_STARTED\"\n impact_demonstrated: \"\"\n exploit_type: \"\"\n severity_before: \"\"\n severity_after: \"\"\n artifacts_dir: \"itemdb/evidence/CC-0001/exploits\"\n summary: \"\"\n" + "created_at: \"2026-06-16\"\nupdated_at: \"2026-06-16\"\n---\n\n# Summary\n\nPending.\n", + encoding="utf-8", + ) + run_start = time.time() - 60 + for path in (summary, finding): + os.utime(path, (run_start + 60, run_start + 60)) + + try: + ok, failures = completion_mod.check_phase_graceful_completion("2", None, run_start) + assert ok is False + assert any("Phase 2 finding is incomplete" in f for f in failures), failures + finally: + completion_mod.ROOT = orig_root + completion_mod.FINDINGS_ROOT = orig_findings_root + def test_phase2_failure_details_mention_run_summary(self, tmp_path): """Phase 2 should report missing run summary when nothing is freshened.""" import phases.completion as completion_mod @@ -340,7 +410,45 @@ def test_resume_prompt_with_failure_details_lists_missing_artifacts(self): ) assert "Missing required artifacts:" in prompt assert "runs/phase-2-summary*.md" in prompt - assert "Fix only these missing items." in prompt + assert "Fix every listed item." in prompt + assert "authoritative completion-gate output" in prompt + assert "Do not claim validation passed" in prompt + + def test_phase2_reports_all_quality_errors_without_truncation(self, tmp_path, monkeypatch): + import os + import phases.completion as completion_mod + from findings import quality as quality_mod + + orig_root = completion_mod.ROOT + orig_findings_root = completion_mod.FINDINGS_ROOT + completion_mod.ROOT = tmp_path + completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + pending.mkdir(parents=True, exist_ok=True) + (tmp_path / "runs").mkdir(parents=True, exist_ok=True) + summary = tmp_path / "runs" / "phase-2-summary-2026-06-18-120000.md" + finding = pending / "CC-0099-many-errors.md" + summary.write_text("# Findings created\n\n| ID | Title | Path |\n|---|---|---|\n| CC-0099 | Many | itemdb/findings/PENDING/CC-0099-many-errors.md |\n", encoding="utf-8") + finding.write_text("placeholder", encoding="utf-8") + run_start = time.time() - 60 + for path in (summary, finding): + os.utime(path, (run_start + 60, run_start + 60)) + monkeypatch.setattr( + quality_mod, + "validate_phase2_finding_quality", + lambda _path: [f"error-{i}" for i in range(7)], + ) + + try: + ok, failures = completion_mod.check_phase_graceful_completion("2", None, run_start) + finally: + completion_mod.ROOT = orig_root + completion_mod.FINDINGS_ROOT = orig_findings_root + + assert ok is False + joined = "\n".join(failures) + for i in range(7): + assert f"error-{i}" in joined def test_resume_prompt_without_failure_details_uses_generic_wording(self): """Resume prompt should use the generic reassess wording when no failure details given.""" @@ -792,7 +900,13 @@ def test_sweep_accepts_phase2_summary_sweep(self, tmp_path): completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" (tmp_path / "runs").mkdir(parents=True, exist_ok=True) summary = tmp_path / "runs" / "phase-2-summary-sweep-src-foo-php-2026-06-12-143022.md" - summary.write_text("", encoding="utf-8") + summary.write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n", + encoding="utf-8", + ) run_start = time.time() - 60 os.utime(summary, (run_start + 60, run_start + 60)) @@ -849,7 +963,8 @@ def test_sweep_resume_prompt_uses_phase2_gate(self): ], ) assert "runs/phase-2-summary*.md" in prompt - assert "Fix only these missing items." in prompt + assert "Fix every listed item." in prompt + assert "authoritative completion-gate output" in prompt class TestPhase3ChecklistMentionsRunSummary: diff --git a/tools/codecome/harness.py b/tools/codecome/harness.py index 28fc300c..2bc6c911 100644 --- a/tools/codecome/harness.py +++ b/tools/codecome/harness.py @@ -16,6 +16,7 @@ import os import signal import time +from datetime import datetime from pathlib import Path from typing import Any, Optional @@ -33,6 +34,65 @@ ) +def _pending_finding_count() -> int: + pending_dir = ROOT / "itemdb" / "findings" / "PENDING" + if not pending_dir.exists(): + return 0 + return len([p for p in pending_dir.glob("CC-*.md") if p.is_file()]) + + +def _write_phase3_noop_summary() -> Path: + runs_dir = ROOT / "runs" + runs_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y-%m-%d-%H%M%S") + path = runs_dir / f"phase-3-summary-{timestamp}.md" + path.write_text( + "# CodeCome Run Summary\n\n" + f"Date: {datetime.now().date().isoformat()} \n" + "Phase: counter_analysis \n" + "Agent: reviewer \n" + "Target path: `./src`\n\n" + "# Goal\n\n" + "Review pending findings.\n\n" + "# Prompt\n\n" + "Phase 3 was requested, but there were no PENDING findings to review.\n\n" + "# Files read\n\n" + "- `itemdb/findings/PENDING/`\n\n" + "# Files created\n\n" + f"- `runs/{path.name}`\n\n" + "# Files modified\n\n" + "None.\n\n" + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n\n" + "# Findings moved\n\n" + "| ID | From | To | Reason |\n" + "|---|---|---|---|\n" + "| - | - | - | None. |\n\n" + "# Findings updated\n\n" + "| ID | Update summary |\n" + "|---|---|\n" + "| - | None. |\n\n" + "# Evidence created\n\n" + "None.\n\n" + "# Important observations\n\n" + "No PENDING findings were available, so Phase 3 had nothing to review.\n\n" + "# Assumptions\n\n" + "None.\n\n" + "# Open questions for the user\n\n" + "None.\n\n" + "# Re-run prompt hints\n\n" + "None.\n\n" + "# Limitations\n\n" + "No counter-analysis was performed because there were no PENDING findings.\n\n" + "# Recommended next step\n\n" + "Run Phase 2 if additional vulnerability hypotheses are needed.\n", + encoding="utf-8", + ) + return path + + def run_phase_mode(args: argparse.Namespace) -> int: """Run a single phase with auto-retry/resume. @@ -125,6 +185,14 @@ def _p1_forward_signal(signum: int, _frame: Any) -> None: if console is None: out.warn("rich is not installed; using plain structured output fallback") + if str(args.phase) == "3" and _pending_finding_count() == 0: + summary_path = _write_phase3_noop_summary() + out.separator(tone=T.SUCCESS) + out.success("Phase 3 completed successfully", symbol=True) + out.detail(" no PENDING findings; nothing to review") + out.detail(f" summary: {summary_path.relative_to(ROOT)}") + return 0 + attempt_number = 0 last_session_id: str = "" last_finish_reason: Optional[str] = None @@ -169,6 +237,7 @@ def _forward_signal(signum: int, _frame: Any) -> None: attempt_number += 1 phase_failures = [] phase_ok = False + finish_warning = None # Clear per-session dedup state so retries don't suppress updates. _reset_subagent_state() returncode, session_id, run_result, transcript_path = _run_single_attempt( @@ -315,7 +384,8 @@ def _forward_signal(signum: int, _frame: Any) -> None: out.error(msg) print(validation_output) break - break + if returncode == 0: + break if returncode == 2 and ( last_finish_reason in _FINISH_MID_TURN @@ -367,6 +437,10 @@ def _forward_signal(signum: int, _frame: Any) -> None: ) if finish_warning: out.error(f" reason: {finish_warning}", strong=False) + if phase_failures: + out.error(" remaining gate failures:", strong=False) + for failure in phase_failures: + out.detail(f" - {failure}") out.detail(f" transcript: {transcript_path.relative_to(ROOT)}") out.warn( " hint: the run is likely partial; rerun the phase or " diff --git a/tools/codecome/transcript.py b/tools/codecome/transcript.py index a3316e58..dd9ce14c 100644 --- a/tools/codecome/transcript.py +++ b/tools/codecome/transcript.py @@ -70,7 +70,11 @@ def for_phase(cls, phase: str, finding: str | None) -> Transcript: counter = _ATTEMPT_COUNTER.get(key, 1) _ATTEMPT_COUNTER[key] = counter + 1 - path = _unique_transcript_path(_transcript_dir() / f"last-phase-{phase}-{finding_tag}-attempt-{counter}.jsonl") + stamp = time.strftime("%Y%m%d-%H%M%S") + path = _unique_transcript_path( + _transcript_dir() + / f"last-phase-{phase}-{finding_tag}-attempt-{counter}-{stamp}-pid{os.getpid()}.jsonl" + ) return cls(path, path.open("x", encoding="utf-8", buffering=1)) @classmethod diff --git a/tools/findings/quality.py b/tools/findings/quality.py new file mode 100644 index 00000000..4b0d5c6f --- /dev/null +++ b/tools/findings/quality.py @@ -0,0 +1,165 @@ +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from findings.checks import load_sections, validate_finding +from findings.frontmatter import load_frontmatter_strict + + +PHASE2_CORE_SECTIONS = [ + "Summary", + "Target context", + "Affected code", + "Vulnerability hypothesis", + "Source-to-sink reasoning", + "Attackability / trigger conditions", + "Impact", + "Validation plan", +] + +PHASE2_REQUIRED_SECTIONS = PHASE2_CORE_SECTIONS + [ + "Counter-analysis", + "Validation result", + "Evidence", +] + +_PLACEHOLDER_TEXT = { + "", + "pending.", + "todo.", + "tbd.", + "not applicable.", +} + +_TEMPLATE_MARKERS = [ + "Briefly describe the suspected vulnerability.", + "Describe the relevant target type", + "List the relevant files", + "Explain the suspected vulnerability.", + "This section must clearly distinguish what is known from what is assumed.", + "Describe the path from attacker-controlled", + "Explain how an attacker", + "Explain the realistic security impact.", + "Describe exactly how to prove or disprove the finding.", + "Try to disprove the finding.", +] + +_NON_FINDING_MARKERS = [ + "does not represent an actual vulnerability", + "does not describe a real vulnerability", + "does not contain a vulnerability", + "no actual vulnerability", + "this is a test finding", + "test finding created to verify", + "template system itself does not contain a vulnerability", +] + + +def _is_placeholder(value: Any) -> bool: + if not isinstance(value, str): + return False + return value.strip().lower() in _PLACEHOLDER_TEXT + + +def _contains_template_marker(value: str) -> bool: + return any(marker in value for marker in _TEMPLATE_MARKERS) + + +def _contains_non_finding_marker(value: str) -> bool: + lowered = value.lower() + return any(marker in lowered for marker in _NON_FINDING_MARKERS) + + +def _list_is_populated(data: dict[str, Any], key: str) -> bool: + value = data.get(key) + return isinstance(value, list) and any(str(item).strip() for item in value) + + +def _scalar_is_populated(data: dict[str, Any], key: str) -> bool: + value = data.get(key) + if not isinstance(value, str): + return False + return value.strip().lower() not in _PLACEHOLDER_TEXT | {"unknown", "unclassified"} + + +def validate_phase2_finding_quality(path: Path) -> list[str]: + """Return Phase 2 quality errors for a candidate finding. + + This is intentionally stricter than frontmatter validation. A PENDING + finding can be syntactically valid while still being unusable as a Phase 2 + durable artifact if it is mostly an untouched template. + """ + errors: list[str] = [] + + errors.extend(validate_finding(path)) + try: + data = load_frontmatter_strict(path) + except Exception as exc: + return errors or [str(exc)] + + if data.get("status") != "PENDING": + errors.append("Phase 2 findings must remain in PENDING status") + + title = str(data.get("title", "")).strip().lower() + if title.startswith("test finding") or title.startswith("template test"): + errors.append("Phase 2 findings must describe a target vulnerability, not a test/template artifact") + + for key in ("category", "target_area"): + value = str(data.get(key, "")).strip().lower() + if value in {"test", "testing", "template"}: + errors.append(f"Phase 2 frontmatter field {key} describes a test/template artifact") + + for key in ("category", "language", "target_area", "trust_boundary"): + if not _scalar_is_populated(data, key): + errors.append(f"Phase 2 requires populated frontmatter field: {key}") + + for key in ("files", "sources", "sinks", "assets_at_risk"): + if not _list_is_populated(data, key): + errors.append(f"Phase 2 requires non-empty frontmatter list: {key}") + + if not (_list_is_populated(data, "symbols") or _list_is_populated(data, "entry_points")): + errors.append("Phase 2 requires at least one symbol or entry point") + + sections = load_sections(path) + combined_sections = "\n".join(str(value) for value in sections.values()) + if _contains_non_finding_marker(combined_sections): + errors.append("Phase 2 finding text says it is not an actual target vulnerability") + + for section in PHASE2_REQUIRED_SECTIONS: + body = sections.get(section) + if body is None: + errors.append(f"Phase 2 requires #{section} section") + continue + if section in PHASE2_CORE_SECTIONS: + if _is_placeholder(body): + errors.append(f"Phase 2 requires populated #{section} section") + elif _contains_template_marker(body): + errors.append(f"Phase 2 #{section} still contains template guidance") + + counter = sections.get("Counter-analysis", "") + if counter and not _is_placeholder(counter) and _contains_template_marker(counter): + errors.append("Phase 2 #Counter-analysis still contains template guidance") + + return errors + + +def phase2_summary_declares_no_findings(path: Path) -> bool: + try: + content = path.read_text(encoding="utf-8") + except OSError: + return False + lowered = content.lower() + if "# findings created" not in lowered: + return False + no_finding_markers = [ + "| - | none.", + "| - | none |", + "no findings created", + "no new findings", + "zero findings", + ] + return any(marker in lowered for marker in no_finding_markers) diff --git a/tools/phases/artifact_checks.py b/tools/phases/artifact_checks.py index 2b57a743..13b5a367 100644 --- a/tools/phases/artifact_checks.py +++ b/tools/phases/artifact_checks.py @@ -283,6 +283,71 @@ def _validate_risk_index() -> list[str]: return validate_file_risk_index() +# -- Phase 2 ------------------------------------------------------------------- + + +def _phase2_summaries() -> list[Path]: + runs = ROOT / "runs" + if not runs.exists(): + return [] + return sorted( + [path for path in runs.glob("phase-2-summary*.md") if path.is_file()], + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + + +def _phase2_pending_findings() -> list[Path]: + pending = ROOT / "itemdb" / "findings" / "PENDING" + if not pending.exists(): + return [] + return sorted(path for path in pending.glob("CC-*.md") if path.is_file()) + + +def check_phase_2_artifacts(allow_missing_generated: bool = False) -> list[str]: + """Validate Phase 2 artifacts. + + A correct Phase 2 run may create zero findings, but that must be explicit in + the run summary. If findings exist, they must be reviewable Phase 2 artifacts + rather than untouched templates. + """ + from findings.quality import ( + phase2_summary_declares_no_findings, + validate_phase2_finding_quality, + ) + + errors: list[str] = [] + summaries = _phase2_summaries() + if not summaries: + if not allow_missing_generated: + errors.append("Missing artifact: runs/phase-2-summary*.md") + return errors + + if phase2_summary_declares_no_findings(summaries[0]): + return errors + + findings = _phase2_pending_findings() + if not findings: + errors.append( + "Phase 2 summary does not explicitly state that no findings were found, " + "and no PENDING findings exist" + ) + return errors + + for finding in findings: + quality_errors = validate_phase2_finding_quality(finding) + if quality_errors: + try: + rel = finding.relative_to(ROOT) + except ValueError: + rel = finding + errors.append( + f"{rel} is not a complete Phase 2 finding: " + + "; ".join(quality_errors) + ) + return errors + + # -- Dispatch ------------------------------------------------------------------ @@ -308,10 +373,10 @@ def check_phase_artifacts( phases_to_check: list[str] if phase == "all": - phases_to_check = ["1a", "1b", "1c"] + phases_to_check = ["1a", "1b", "1c", "2"] elif phase == "1": phases_to_check = ["1a", "1b", "1c"] - elif phase in ("2", "3", "4", "5", "6"): + elif phase in ("3", "4", "5", "6"): print(C.info(f"Phase {phase} artifact checks not yet implemented; nothing to do.")) return 0 else: @@ -364,8 +429,16 @@ def _check_phase_1c_with_ct( return check_phase_1c_artifacts(allow_missing_generated=allow_missing_generated) +def _check_phase_2_with_ct( + allow_missing_generated: bool = False, + phase_1a_start_time: float | None = None, +) -> list[str]: + return check_phase_2_artifacts(allow_missing_generated=allow_missing_generated) + + _CHECKERS = { "1a": _check_phase_1a_with_ct, "1b": _check_phase_1b_with_ct, "1c": _check_phase_1c_with_ct, + "2": _check_phase_2_with_ct, } diff --git a/tools/phases/completion.py b/tools/phases/completion.py index 11bc7f25..b2970929 100644 --- a/tools/phases/completion.py +++ b/tools/phases/completion.py @@ -91,6 +91,16 @@ def _run_summary_is_fresh(phase_id: str, run_start_time: float) -> bool: return any(Path(p).stat().st_mtime >= run_start_time for p in matches) +def _fresh_run_summaries(phase_id: str, run_start_time: float) -> list[Path]: + import glob as _glob + matches = [Path(p) for p in _glob.glob(str(ROOT / "runs" / f"phase-{phase_id}-summary*.md"))] + return sorted( + [p for p in matches if p.is_file() and p.stat().st_mtime >= run_start_time], + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + + def _append_run_summary_check( failures: list[str], phase_id: str, run_start_time: float ) -> None: @@ -111,6 +121,29 @@ def _iter_files(root: Path) -> Iterator[Path]: yield path +def _fresh_pending_findings(run_start_time: float) -> list[Path]: + pending_dir = FINDINGS_ROOT / "PENDING" + if not pending_dir.exists(): + return [] + return sorted( + [ + path for path in pending_dir.glob("CC-*.md") + if path.is_file() and path.stat().st_mtime >= run_start_time + ], + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + + +def _latest_summary_declares_no_phase2_findings(run_start_time: float) -> bool: + from findings.quality import phase2_summary_declares_no_findings + + summaries = _fresh_run_summaries("2", run_start_time) + if not summaries: + return False + return phase2_summary_declares_no_findings(summaries[0]) + + def _find_finding_file(status_dir: Path, finding_id: str, run_start_time: float) -> Path | None: """Find a finding file matching the ID in the status directory. @@ -256,14 +289,34 @@ def check_phase_graceful_completion(phase: str, finding: str | None, run_start_t ) return (len(failures) == 0, failures) elif phase_key in ("2", "sweep"): - import glob as _glob - run_summaries = _glob.glob(str(ROOT / "runs" / "phase-2-summary*.md")) - summary_fresh = any(Path(p).stat().st_mtime >= run_start_time for p in run_summaries) - if not summary_fresh: + from findings.quality import validate_phase2_finding_quality + + fresh_summaries = _fresh_run_summaries("2", run_start_time) + if not fresh_summaries: failures.append( "Missing: runs/phase-2-summary*.md — run summary was not " "created or updated" ) + + fresh_findings = _fresh_pending_findings(run_start_time) + if fresh_findings: + for finding_path in fresh_findings: + quality_errors = validate_phase2_finding_quality(finding_path) + if quality_errors: + try: + rel = finding_path.relative_to(ROOT) + except ValueError: + rel = finding_path + failures.append( + f"Invalid: {rel} — Phase 2 finding is incomplete: " + + "; ".join(quality_errors) + ) + elif fresh_summaries and not _latest_summary_declares_no_phase2_findings(run_start_time): + failures.append( + "Missing: itemdb/findings/PENDING/ — no finding was created or updated during this run, " + "and the fresh Phase 2 summary does not explicitly state that no findings were found" + ) + return (len(failures) == 0, failures) elif phase_key == "3": import glob as _glob @@ -497,7 +550,8 @@ def build_phase_resume_prompt( lines.append(f"- {detail}") lines.append("") lines.append( - "Fix only these missing items. Do not redo completed work." + "Fix every listed item. Do not redo completed work. The list above is CodeCome's " + "authoritative completion-gate output; do not ignore any item." ) else: lines.append("") @@ -516,6 +570,10 @@ def build_phase_resume_prompt( "Before ending, verify that the required durable artifacts for " "this phase exist, are updated, and are internally consistent." ) + lines.append( + "Do not claim validation passed unless the validation tool call actually completed successfully. " + "If a validation or shell tool call fails, fix the invocation or report that validation could not be run." + ) return "\n".join(lines) diff --git a/tools/phases/gates.py b/tools/phases/gates.py index e92c6845..8e886827 100644 --- a/tools/phases/gates.py +++ b/tools/phases/gates.py @@ -177,10 +177,10 @@ def gate_phase_3() -> int: nv_count = count_findings("PENDING") if nv_count == 0: - print(fail("No findings in PENDING.")) + print(warn("No findings in PENDING.")) print() - print(info("Run Phase 2 first: make phase-2")) - return 1 + print(info("Phase 3 has nothing to review and will complete as a no-op.")) + return 0 print(ok(f"{nv_count} finding(s) in PENDING.")) print() From e2f35c3c112a2b125dc9aacde0985bdbc05c9ed0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Fri, 19 Jun 2026 11:45:21 +0200 Subject: [PATCH 2/4] fix: address phase completion diagnostics review feedback - Use generator counting in _pending_finding_count() - Capture datetime.now() once in _write_phase3_noop_summary() - Remove hardcoded Target path from Phase 3 noop summary - Make _contains_template_marker() case-insensitive - Avoid redundant _fresh_run_summaries call in completion gate - Add test for lowercase template guidance detection --- tests/test_findings_quality.py | 23 +++++++++++++++++++++++ tools/codecome/harness.py | 10 +++++----- tools/findings/quality.py | 3 ++- tools/phases/completion.py | 9 ++++----- 4 files changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/test_findings_quality.py b/tests/test_findings_quality.py index ef00277c..b156fac2 100644 --- a/tests/test_findings_quality.py +++ b/tests/test_findings_quality.py @@ -74,3 +74,26 @@ def test_phase2_quality_rejects_test_template_artifact(tmp_path: Path) -> None: assert any("test/template artifact" in error for error in errors), errors assert any("not an actual target vulnerability" in error for error in errors), errors + + +def test_phase2_quality_rejects_case_insensitive_template_markers(tmp_path: Path) -> None: + from findings.quality import validate_phase2_finding_quality + + finding = tmp_path / "itemdb" / "findings" / "PENDING" / "CC-0099-case-variant.md" + _write_phase2_finding( + finding, + title="IPP identity bypass in request parser", + category="Auth", + target_area="Scheduler", + summary="A remote client may inject identity attributes.", + ) + content = finding.read_text(encoding="utf-8") + content = content.replace( + "A remote client may control the user identity attribute", + "briefly describe the suspected vulnerability. the parser accepts unvalidated identity attributes.", + ) + finding.write_text(content, encoding="utf-8") + + errors = validate_phase2_finding_quality(finding) + + assert any("contains template guidance" in error for error in errors), errors diff --git a/tools/codecome/harness.py b/tools/codecome/harness.py index 2bc6c911..98249e41 100644 --- a/tools/codecome/harness.py +++ b/tools/codecome/harness.py @@ -38,20 +38,20 @@ def _pending_finding_count() -> int: pending_dir = ROOT / "itemdb" / "findings" / "PENDING" if not pending_dir.exists(): return 0 - return len([p for p in pending_dir.glob("CC-*.md") if p.is_file()]) + return sum(1 for p in pending_dir.glob("CC-*.md") if p.is_file()) def _write_phase3_noop_summary() -> Path: runs_dir = ROOT / "runs" runs_dir.mkdir(parents=True, exist_ok=True) - timestamp = datetime.now().strftime("%Y-%m-%d-%H%M%S") + now = datetime.now() + timestamp = now.strftime("%Y-%m-%d-%H%M%S") path = runs_dir / f"phase-3-summary-{timestamp}.md" path.write_text( "# CodeCome Run Summary\n\n" - f"Date: {datetime.now().date().isoformat()} \n" + f"Date: {now.date().isoformat()} \n" "Phase: counter_analysis \n" - "Agent: reviewer \n" - "Target path: `./src`\n\n" + "Agent: reviewer \n\n" "# Goal\n\n" "Review pending findings.\n\n" "# Prompt\n\n" diff --git a/tools/findings/quality.py b/tools/findings/quality.py index 4b0d5c6f..ef1c490d 100644 --- a/tools/findings/quality.py +++ b/tools/findings/quality.py @@ -66,7 +66,8 @@ def _is_placeholder(value: Any) -> bool: def _contains_template_marker(value: str) -> bool: - return any(marker in value for marker in _TEMPLATE_MARKERS) + lowered = value.lower() + return any(marker.lower() in lowered for marker in _TEMPLATE_MARKERS) def _contains_non_finding_marker(value: str) -> bool: diff --git a/tools/phases/completion.py b/tools/phases/completion.py index b2970929..be71f93d 100644 --- a/tools/phases/completion.py +++ b/tools/phases/completion.py @@ -135,13 +135,12 @@ def _fresh_pending_findings(run_start_time: float) -> list[Path]: ) -def _latest_summary_declares_no_phase2_findings(run_start_time: float) -> bool: +def _latest_summary_declares_no_phase2_findings(fresh_summaries: list[Path]) -> bool: from findings.quality import phase2_summary_declares_no_findings - summaries = _fresh_run_summaries("2", run_start_time) - if not summaries: + if not fresh_summaries: return False - return phase2_summary_declares_no_findings(summaries[0]) + return phase2_summary_declares_no_findings(fresh_summaries[0]) def _find_finding_file(status_dir: Path, finding_id: str, run_start_time: float) -> Path | None: @@ -311,7 +310,7 @@ def check_phase_graceful_completion(phase: str, finding: str | None, run_start_t f"Invalid: {rel} — Phase 2 finding is incomplete: " + "; ".join(quality_errors) ) - elif fresh_summaries and not _latest_summary_declares_no_phase2_findings(run_start_time): + elif fresh_summaries and not _latest_summary_declares_no_phase2_findings(fresh_summaries): failures.append( "Missing: itemdb/findings/PENDING/ — no finding was created or updated during this run, " "and the fresh Phase 2 summary does not explicitly state that no findings were found" From bea6ace67c57601c61765d23592f2ca7e1ee6d60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Fri, 19 Jun 2026 16:09:23 +0200 Subject: [PATCH 3/4] fix: auto-resume on output-budget exhaustion (length, max_tokens) Split _FINISH_FAILURE into _FINISH_BUDGET (length, max_tokens) and _FINISH_HARD_FAILURE (content-filter, content_filter, error). Budget-exhaustion reasons now trigger check_phase_graceful_completion and auto-resume, matching the terminal-stop + missing-artifacts path. Added regression test for length auto-resume. Also updated phase_1._run_subphase to mirror the same behaviour. --- tests/test_phase_failure_state_reset.py | 55 +++++++++++++++++++++++++ tools/codecome/harness.py | 38 +++++++++++------ tools/codecome/phase_1.py | 41 ++++++++++++------ tools/phases/completion.py | 6 +++ tools/rendering/events/__init__.py | 4 ++ tools/rendering/events/base.py | 13 +++++- tools/rendering/events/step_finish.py | 6 +-- 7 files changed, 135 insertions(+), 28 deletions(-) diff --git a/tests/test_phase_failure_state_reset.py b/tests/test_phase_failure_state_reset.py index e70ad83a..3c153219 100644 --- a/tests/test_phase_failure_state_reset.py +++ b/tests/test_phase_failure_state_reset.py @@ -194,6 +194,61 @@ def test_phase_mode_final_failure_prints_gate_failures(monkeypatch, capsys): assert "CC-0015 still contains template guidance" in output +def _budget_result() -> RunResult: + return RunResult( + last_finish_reason="length", + any_step_finish_seen=True, + step_finish_count=1, + ) + + +def test_phase_mode_budget_exhaustion_auto_resumes(monkeypatch): + from codecome import harness as harness_mod + from codecome import runner as runner_mod + + transcript = harness_mod.ROOT / "tmp" / "fake.jsonl" + attempts = iter([ + (0, "ses_test", _budget_result(), transcript), + (0, "ses_test", _budget_result(), transcript), + ]) + completion_results = iter([ + (False, ["runs/phase-2-summary*.md was not updated during this run"]), + (True, []), + ]) + captured: list[list[str] | None] = [] + prompts: list[str] = [] + + def fake_run_single_attempt(_args, _console, prompt, *_a, **_kw): + prompts.append(prompt) + return next(attempts) + + def fake_resume_prompt(*_args, failure_details=None, **_kw): + captured.append(failure_details) + return "resume prompt" + + monkeypatch.setattr(harness_mod, "ServerRunner", lambda: _FakeServerRunner()) + monkeypatch.setenv("CODECOME_MAX_ITERATION_RETRIES", "1") + monkeypatch.setattr(harness_mod, "load_prompt", lambda *_a, **_kw: "initial prompt") + monkeypatch.setattr(harness_mod, "resolve_runtime_config", lambda _agent: _FakeRuntimeConfig()) + monkeypatch.setattr(harness_mod, "configure_rendering", lambda *_a, **_kw: None) + monkeypatch.setattr(runner_mod, "_run_single_attempt", fake_run_single_attempt) + monkeypatch.setattr( + harness_mod, + "check_phase_graceful_completion", + lambda *_a, **_kw: next(completion_results), + ) + monkeypatch.setattr(harness_mod, "build_phase_resume_prompt", fake_resume_prompt) + + from findings import checks_entry + monkeypatch.setattr(checks_entry, "run_frontmatter_validation", lambda: (0, "")) + + rc = harness_mod.run_phase_mode(_args()) + + assert rc == 0 + assert prompts == ["initial prompt", "resume prompt"] + assert captured == [["runs/phase-2-summary*.md was not updated during this run"]] + + def test_phase1_subphase_does_not_reuse_previous_attempt_failures(monkeypatch): from codecome import phase_1 as p1 diff --git a/tools/codecome/harness.py b/tools/codecome/harness.py index 98249e41..fcaf0895 100644 --- a/tools/codecome/harness.py +++ b/tools/codecome/harness.py @@ -25,7 +25,7 @@ from codecome.console import build_console, _emit_fatal_error from rendering.dispatch import _get_rendering_ctx, configure_rendering, render_event from rendering.output import get_output, T -from rendering.events import _FINISH_TERMINAL_OK, _FINISH_MID_TURN, _FINISH_FAILURE +from rendering.events import _FINISH_TERMINAL_OK, _FINISH_MID_TURN, _FINISH_BUDGET, _FINISH_FAILURE from codecome.config import ROOT, resolve_color_mode, load_prompt, resolve_runtime_config from phases.completion import ( check_phase_graceful_completion, @@ -295,6 +295,12 @@ def _forward_signal(signum: int, _frame: Any) -> None: "CodeCome observed a step_finish event without a finish reason, so the model/provider completion " "state is ambiguous. Treating the run as incomplete." ) + elif last_finish_reason in _FINISH_BUDGET: + finish_warning = ( + f"CodeCome observed output budget exhaustion (finish reason '{last_finish_reason}'). " + f"The model/provider stopped after {step_finish_count} completed loops before producing all " + "required artifacts. Treating the run as incomplete." + ) elif last_finish_reason in _FINISH_FAILURE: finish_warning = ( f"CodeCome observed finish reason '{last_finish_reason}', which means the model/provider stopped " @@ -320,22 +326,29 @@ def _forward_signal(signum: int, _frame: Any) -> None: if finish_warning is not None: if ( - last_finish_reason in _FINISH_MID_TURN + (last_finish_reason in _FINISH_MID_TURN or last_finish_reason in _FINISH_BUDGET) and last_permission_error is None ): phase_ok, phase_failures = check_phase_graceful_completion( args.phase, args.finding, RUN_START_TIME) - if phase_ok: - msg = ( - f"CodeCome observed a mid-turn model/provider cutoff for Phase {args.phase} after {step_finish_count} " - "completed loops, but expected durable artifacts were written during " - "the run. Treating the phase as complete enough to run validation and auto-repair." - ) - out.success(msg) - finish_warning = None - last_finish_reason = "graceful_forgiveness" + if last_finish_reason in _FINISH_MID_TURN: + if phase_ok: + msg = ( + f"CodeCome observed a mid-turn model/provider cutoff for Phase {args.phase} after {step_finish_count} " + "completed loops, but expected durable artifacts were written during " + "the run. Treating the phase as complete enough to run validation and auto-repair." + ) + out.success(msg) + finish_warning = None + last_finish_reason = "graceful_forgiveness" + else: + returncode = 2 else: - returncode = 2 + # _FINISH_BUDGET: fail only if artifacts are incomplete + if not phase_ok: + returncode = 2 + else: + finish_warning = None else: returncode = 2 @@ -389,6 +402,7 @@ def _forward_signal(signum: int, _frame: Any) -> None: if returncode == 2 and ( last_finish_reason in _FINISH_MID_TURN + or last_finish_reason in _FINISH_BUDGET or (last_finish_reason in _FINISH_TERMINAL_OK and not phase_ok) ): max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "3")) diff --git a/tools/codecome/phase_1.py b/tools/codecome/phase_1.py index 703e4fc5..de054d6f 100644 --- a/tools/codecome/phase_1.py +++ b/tools/codecome/phase_1.py @@ -33,6 +33,7 @@ from rendering.events import ( _FINISH_TERMINAL_OK, _FINISH_MID_TURN, + _FINISH_BUDGET, _FINISH_FAILURE, _reset_subagent_state, ) @@ -453,6 +454,12 @@ def _run_subphase( "CodeCome observed a step_finish event without a finish reason, so the model/provider completion " "state is ambiguous. Treating the run as incomplete." ) + elif last_finish_reason in _FINISH_BUDGET: + finish_warning = ( + f"CodeCome observed output budget exhaustion (finish reason '{last_finish_reason}'). " + f"The model/provider stopped after {step_finish_count} completed loops before producing all " + "required artifacts. Treating the subphase as incomplete." + ) elif last_finish_reason in _FINISH_FAILURE: finish_warning = ( f"CodeCome observed finish reason '{last_finish_reason}', which means the model/provider stopped " @@ -478,22 +485,29 @@ def _run_subphase( if finish_warning is not None: if ( - (not any_step_finish_seen or last_finish_reason in _FINISH_MID_TURN) + (last_finish_reason in _FINISH_MID_TURN or last_finish_reason in _FINISH_BUDGET) and last_permission_error is None ): phase_ok, phase_failures = check_phase_graceful_completion( phase_id, finding, subphase_start_time) - if phase_ok: - msg = ( - f"CodeCome observed an incomplete model/provider completion signal for Phase {phase_id} after " - f"{step_finish_count} completed loops, but expected durable artifacts were written during " - "the run. Treating the subphase as complete enough to run validation and auto-repair." - ) - out.success(msg) - finish_warning = None - last_finish_reason = "graceful_forgiveness" + if last_finish_reason in _FINISH_MID_TURN: + if phase_ok: + msg = ( + f"CodeCome observed an incomplete model/provider completion signal for Phase {phase_id} after " + f"{step_finish_count} completed loops, but expected durable artifacts were written during " + "the run. Treating the subphase as complete enough to run validation and auto-repair." + ) + out.success(msg) + finish_warning = None + last_finish_reason = "graceful_forgiveness" + else: + returncode = 2 else: - returncode = 2 + # _FINISH_BUDGET: fail only if artifacts are incomplete + if not phase_ok: + returncode = 2 + else: + finish_warning = None else: returncode = 2 @@ -573,7 +587,10 @@ def _run_subphase( break - if returncode == 2 and last_finish_reason in _FINISH_MID_TURN: + if returncode == 2 and ( + last_finish_reason in _FINISH_MID_TURN + or last_finish_reason in _FINISH_BUDGET + ): import os max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "1")) if iteration_retry_count < max_iteration_retries: diff --git a/tools/phases/completion.py b/tools/phases/completion.py index be71f93d..5245bbc9 100644 --- a/tools/phases/completion.py +++ b/tools/phases/completion.py @@ -496,6 +496,7 @@ def _resume_opener_for_reason(reason: str) -> str: the gate still failed, so required artifacts are missing. """ from rendering.events import ( + _FINISH_BUDGET, _FINISH_FAILURE, _FINISH_MID_TURN, _FINISH_TERMINAL_OK, @@ -508,6 +509,11 @@ def _resume_opener_for_reason(reason: str) -> str: f"Your previous run was cut off mid-turn (finish reason '{reason}') " "before completing all required artifacts." ) + if reason in _FINISH_BUDGET: + return ( + f"Your previous run exhausted its output budget (finish reason '{reason}') " + "before completing all required artifacts." + ) if reason in _FINISH_FAILURE: return ( f"Your previous run stopped with a failure finish reason '{reason}' " diff --git a/tools/rendering/events/__init__.py b/tools/rendering/events/__init__.py index 63bec010..706c883d 100644 --- a/tools/rendering/events/__init__.py +++ b/tools/rendering/events/__init__.py @@ -15,6 +15,8 @@ EventRenderer, _FINISH_TERMINAL_OK, _FINISH_MID_TURN, + _FINISH_BUDGET, + _FINISH_HARD_FAILURE, _FINISH_FAILURE, _SUBAGENT_LAST_STATE, _reset_subagent_state, @@ -40,6 +42,8 @@ "EventRenderer", "_FINISH_TERMINAL_OK", "_FINISH_MID_TURN", + "_FINISH_BUDGET", + "_FINISH_HARD_FAILURE", "_FINISH_FAILURE", "_SUBAGENT_LAST_STATE", "_reset_subagent_state", diff --git a/tools/rendering/events/base.py b/tools/rendering/events/base.py index c5d6e9c2..d810a218 100644 --- a/tools/rendering/events/base.py +++ b/tools/rendering/events/base.py @@ -27,7 +27,18 @@ # incomplete signal that warrants retry/resume handling. _FINISH_MID_TURN = {"tool-calls", "tool_use", "unknown"} -_FINISH_FAILURE = {"content-filter", "content_filter", "length", "max_tokens", "error"} +# Budget-exhaustion — the model/provider stopped because the output budget was +# consumed (token limit reached). The session is still viable; auto-resume can +# let the model continue from where it was cut off. +_FINISH_BUDGET = {"length", "max_tokens"} + +# Hard failure — the model/provider stopped with a non-recoverable error +# (safety filter, internal error). Retrying the same session is unlikely to +# succeed. +_FINISH_HARD_FAILURE = {"content-filter", "content_filter", "error"} + +# Legacy alias; kept for backward compatibility. +_FINISH_FAILURE = _FINISH_BUDGET | _FINISH_HARD_FAILURE # Per-session dedup state for subagent update events. _SUBAGENT_LAST_STATE: dict[str, tuple[dict[str, Any], float]] = {} diff --git a/tools/rendering/events/step_finish.py b/tools/rendering/events/step_finish.py index 4050733a..867d620b 100644 --- a/tools/rendering/events/step_finish.py +++ b/tools/rendering/events/step_finish.py @@ -7,7 +7,7 @@ from typing import Any -from rendering.events.base import EventRenderer, _FINISH_FAILURE, _clear_hidden_reasoning_state +from rendering.events.base import EventRenderer, _FINISH_FAILURE, _FINISH_BUDGET, _clear_hidden_reasoning_state import _colors as C @@ -21,13 +21,13 @@ def render(self, event: dict[str, Any]) -> bool: tokens = self._format_tokens(part.get("tokens", {})) suffix = f" ({tokens})" if tokens else "" style = "dim" - if reason in _FINISH_FAILURE: + if reason in _FINISH_FAILURE or reason in _FINISH_BUDGET: style = "bold red" if self.rich: from rich.text import Text self.sink.write(Text(f"step finished: {reason}{suffix}", style=style)) elif self.plain: - if reason in _FINISH_FAILURE: + if reason in _FINISH_FAILURE or reason in _FINISH_BUDGET: self.sink.write_text(C.fail(f"step finished: {reason}{suffix}")) else: self.sink.write_text(f"step finished: {reason}{suffix}") From bbd49a22b22b5fbb9ce49ad332ecb3666337c586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pablo=20Ruiz=20Garc=C3=ADa?= Date: Fri, 19 Jun 2026 19:12:11 +0200 Subject: [PATCH 4/4] fix: differentiate budget-exhaustion from mid-turn in auto-resume messages - Auto-resume [budget] now says output budget exhaustion instead of mid-turn - No-session-ID fallback messages use generic wording for both cases - Docstring updated: length moved from _FINISH_MID_TURN to _FINISH_BUDGET - step_finish.py: removed redundant _FINISH_BUDGET check (_FINISH_FAILURE already covers it) --- tools/codecome/harness.py | 21 +++++++++++++++------ tools/codecome/phase_1.py | 19 +++++++++++++------ tools/phases/completion.py | 9 ++++++--- tools/rendering/events/step_finish.py | 6 +++--- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/tools/codecome/harness.py b/tools/codecome/harness.py index fcaf0895..7e420e1e 100644 --- a/tools/codecome/harness.py +++ b/tools/codecome/harness.py @@ -408,10 +408,18 @@ def _forward_signal(signum: int, _frame: Any) -> None: max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "3")) if iteration_retry_count < max_iteration_retries: iteration_retry_count += 1 - msg = ( - "\n[Auto-Resume] CodeCome observed an incomplete run and will resume the same " - f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." - ) + if last_finish_reason in _FINISH_BUDGET: + msg = ( + "\n[Auto-Resume] CodeCome observed output budget exhaustion and will resume the same " + f"session once to let the model finish the interrupted work " + f"(retry {iteration_retry_count}/{max_iteration_retries})." + ) + else: + msg = ( + "\n[Auto-Resume] CodeCome observed an incomplete run and will resume the same " + f"session once to let the model finish the interrupted work " + f"(retry {iteration_retry_count}/{max_iteration_retries})." + ) out.warn(msg) if last_session_id and last_session_id != "id": prompt = build_phase_resume_prompt( @@ -421,8 +429,9 @@ def _forward_signal(signum: int, _frame: Any) -> None: continue else: finish_warning = ( - "CodeCome correctly detected that the model/provider stopped mid-turn, but it could not determine " - "a session ID for automatic continuation. Treating the phase as incomplete." + "CodeCome correctly detected that the model/provider stopped before completing " + "the phase, but it could not determine a session ID for automatic continuation. " + "Treating the phase as incomplete." ) out.error("Could not determine session ID to resume.", strong=False) break diff --git a/tools/codecome/phase_1.py b/tools/codecome/phase_1.py index de054d6f..bb953231 100644 --- a/tools/codecome/phase_1.py +++ b/tools/codecome/phase_1.py @@ -595,10 +595,16 @@ def _run_subphase( max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "1")) if iteration_retry_count < max_iteration_retries: iteration_retry_count += 1 - msg = ( - "\n[Auto-Resume] CodeCome observed a mid-turn model/provider cutoff and will resume the same " - f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." - ) + if last_finish_reason in _FINISH_BUDGET: + msg = ( + "\n[Auto-Resume] CodeCome observed output budget exhaustion and will resume the same " + f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." + ) + else: + msg = ( + "\n[Auto-Resume] CodeCome observed a mid-turn model/provider cutoff and will resume the same " + f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." + ) out.warn(msg) if last_session_id and last_session_id != "id": prompt = build_phase_resume_prompt( @@ -608,8 +614,9 @@ def _run_subphase( continue else: finish_warning = ( - "CodeCome correctly detected that the model/provider stopped mid-turn, but it could not determine " - "a session ID for automatic continuation. Treating the subphase as incomplete." + "CodeCome correctly detected that the model/provider stopped before completing " + "the phase, but it could not determine a session ID for automatic continuation. " + "Treating the subphase as incomplete." ) out.error("Could not determine session ID to resume.", strong=False) break diff --git a/tools/phases/completion.py b/tools/phases/completion.py index 5245bbc9..b0c4b4b9 100644 --- a/tools/phases/completion.py +++ b/tools/phases/completion.py @@ -486,9 +486,12 @@ def _resume_opener_for_reason(reason: str) -> str: - ``"infrastructure_error"`` — harness fatal-retry path. - A finish reason from ``rendering.events._FINISH_MID_TURN`` (e.g. - ``"length"``, ``"tool_use"``) — the model/provider cut off mid-turn. - - A finish reason from ``rendering.events._FINISH_FAILURE`` — the - model/provider reported a failure finish reason. + ``"tool_use"``, ``"unknown"``) — the model/provider cut off mid-turn. + - A finish reason from ``rendering.events._FINISH_BUDGET`` (e.g. + ``"length"``, ``"max_tokens"``) — the model/provider exhausted its + output budget before completing all required artifacts. + - A finish reason from ``rendering.events._FINISH_HARD_FAILURE`` — the + model/provider reported a non-recoverable failure finish reason. - ``"graceful_forgiveness"`` — synthesized by the harness when the mid-turn cutoff happened but partial artifacts were written. - A finish reason from ``rendering.events._FINISH_TERMINAL_OK`` (e.g. diff --git a/tools/rendering/events/step_finish.py b/tools/rendering/events/step_finish.py index 867d620b..4050733a 100644 --- a/tools/rendering/events/step_finish.py +++ b/tools/rendering/events/step_finish.py @@ -7,7 +7,7 @@ from typing import Any -from rendering.events.base import EventRenderer, _FINISH_FAILURE, _FINISH_BUDGET, _clear_hidden_reasoning_state +from rendering.events.base import EventRenderer, _FINISH_FAILURE, _clear_hidden_reasoning_state import _colors as C @@ -21,13 +21,13 @@ def render(self, event: dict[str, Any]) -> bool: tokens = self._format_tokens(part.get("tokens", {})) suffix = f" ({tokens})" if tokens else "" style = "dim" - if reason in _FINISH_FAILURE or reason in _FINISH_BUDGET: + if reason in _FINISH_FAILURE: style = "bold red" if self.rich: from rich.text import Text self.sink.write(Text(f"step finished: {reason}{suffix}", style=style)) elif self.plain: - if reason in _FINISH_FAILURE or reason in _FINISH_BUDGET: + if reason in _FINISH_FAILURE: self.sink.write_text(C.fail(f"step finished: {reason}{suffix}")) else: self.sink.write_text(f"step finished: {reason}{suffix}")