diff --git a/src/forge_loop/audit_probes/__init__.py b/src/forge_loop/audit_probes/__init__.py new file mode 100644 index 0000000..51d018a --- /dev/null +++ b/src/forge_loop/audit_probes/__init__.py @@ -0,0 +1,14 @@ +"""Codebase-audit probes (issue #156). + +Each probe is a small module exposing one class that satisfies the +:class:`forge_loop.codebase_audit.Probe` Protocol. Keep probes: + +* Pure (filesystem in, ``Violation`` objects out). +* Independently importable — the framework can disable a broken probe + without bringing the rest down. +* Named with a stable identifier (the probe label uses it). +""" + +from forge_loop.audit_probes.file_size import FileSizeProbe + +__all__ = ["FileSizeProbe"] diff --git a/src/forge_loop/audit_probes/file_size.py b/src/forge_loop/audit_probes/file_size.py new file mode 100644 index 0000000..022aac7 --- /dev/null +++ b/src/forge_loop/audit_probes/file_size.py @@ -0,0 +1,138 @@ +"""File-size probe — first state-rule probe (issue #156). + +The quality manifesto carries soft caps for module size per language. +This probe walks the repository, measures LOC per source file +(non-blank, non-comment-only lines), and yields a :class:`Violation` +for every file that exceeds its language's threshold. + +Why this is the first probe +=========================== + +The motivating example is ``src/forge_loop/cli.py`` at >1700 LOC, 3.4× +the Python soft-cap of 500. The per-PR critic never flagged it because +cli.py grew 50-100 LOC per PR — each diff was reasonable, the cumulative +state-violation was invisible. This probe makes the cumulative +violation visible AS a violation, on the same cadence as the +maintenance daemon. + +Thresholds +========== + +Defaults match the manifesto: + +* Python: 500 LOC +* TypeScript / TSX / JS: 400 LOC +* Java: 600 LOC + +Operators override via ``FileSizeProbe(thresholds=...)``. The +``hard_threshold_multiplier`` knob escalates severity once a file +crosses N× its soft cap — the cli.py case fires P1, not P2. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Iterable + +from forge_loop.codebase_audit import Violation, walk_source_files + + +DEFAULT_THRESHOLDS: dict[str, int] = { + ".py": 500, + ".ts": 400, + ".tsx": 400, + ".js": 400, + ".java": 600, +} + + +def _count_significant_lines(path: Path) -> int: + """Count non-blank, non-comment-only lines. + + Pragmatic — no AST. A line that's entirely whitespace or starts + (after lstrip) with ``#`` / ``//`` / ``/*`` / ``*`` is dropped. Good + enough to distinguish a 1700-LOC business-logic file from a 1700- + LOC docstring (none of which exist in the codebase today). + + Decode failures fall back to 0 — a binary file shouldn't trip the + probe just because its suffix matched. + """ + try: + text = path.read_text(encoding="utf-8", errors="strict") + except (OSError, UnicodeDecodeError): + return 0 + n = 0 + for raw in text.splitlines(): + line = raw.strip() + if not line: + continue + if line.startswith(("#", "//", "/*", "*")): + continue + n += 1 + return n + + +@dataclass +class FileSizeProbe: + """Yield one :class:`Violation` per oversized file. + + Construction parameters are kept simple — production wires defaults, + tests override per-case. + """ + + name: str = "file-size" + thresholds: dict[str, int] = field(default_factory=lambda: dict(DEFAULT_THRESHOLDS)) + hard_threshold_multiplier: float = 2.0 + + def scan(self, repo: Path) -> Iterable[Violation]: + suffixes = tuple(self.thresholds.keys()) + for path in walk_source_files(repo, suffixes=suffixes): + soft = self.thresholds.get(path.suffix) + if soft is None: + continue + loc = _count_significant_lines(path) + if loc <= soft: + continue + rel = path.relative_to(repo.resolve()) if path.is_absolute() else path + rel_str = str(rel).replace("\\", "/") + multiplier = loc / soft if soft else float("inf") + severity = 1 if multiplier >= self.hard_threshold_multiplier else 2 + ratio_str = f"{multiplier:.1f}×" + title = ( + f"refactor({rel_str}): split oversized module " + f"({loc} LOC > {soft} cap, {ratio_str})" + ) + rationale = ( + f"`{rel_str}` is {loc} significant LOC, which exceeds the " + f"quality-manifesto soft cap of {soft} for `{path.suffix}` " + f"files ({ratio_str}).\n\n" + f"The per-PR critic does not catch this class of accumulation " + f"violation — modules grow 50-100 LOC at a time and each " + f"individual diff looks reasonable. The audit probe (issue " + f"#156) is the state-based gate that the manifesto rule " + f"\"state-based rules need a state-based gate\" calls for.\n\n" + f"Why it matters: oversized modules are correlated with the " + f"boiling-frog failures the manifesto cites (#147 stringly-" + f"typed boundaries, #128 silent fallthroughs) — once a " + f"module is hard to read end-to-end, single-character bugs " + f"survive review." + ) + acceptance = [ + f"`{rel_str}` is split into ≥2 focused modules, none exceeding {soft} LOC.", + "Public import surface preserved (no downstream breakage).", + "Tests for the split modules remain green; no test logic moved.", + "Manifesto rationale is honoured: each new module has a single, nameable responsibility.", + ] + yield Violation( + probe=self.name, + target=rel_str, + severity=severity, + title=title, + rationale=rationale, + acceptance=acceptance, + metrics={"loc": loc, "soft_cap": soft, "ratio": round(multiplier, 2)}, + ) + + +__all__ = ["FileSizeProbe", "DEFAULT_THRESHOLDS"] diff --git a/src/forge_loop/cli.py b/src/forge_loop/cli.py index df8e279..9356add 100644 --- a/src/forge_loop/cli.py +++ b/src/forge_loop/cli.py @@ -890,6 +890,132 @@ def _render_ticket_body(ticket: ProposedTicket, parent: int | None) -> str: return 0 +def _cmd_audit(args: SimpleNamespace) -> int: + """`forge-loop audit` — codebase-state audit (issue #156). + + Contract: + * Default (no flags): walk the repo, run every default probe, + print a human-readable summary, exit 0. NO GitHub calls. + * --apply: file one ticket per violation (idempotent — existing + open tickets for the same probe+target are skipped). + * --json: emit the report as JSON for scripting / dashboards. + + Errors: + * No probe failure can cause exit != 0 on the dry-run path — + ``audit.errors`` is surfaced as a warning but isn't a gate. + * ``--apply`` exits 1 if ANY ticket-create call raised. + """ + import json as _json + + from forge_loop.codebase_audit import audit, file_violations + + repo_path = Path.cwd() + owner = "" + repo_name = "" + extra_labels: tuple[str, ...] = () + try: + cfg = load() + repo_path = Path(cfg.repo).resolve() if getattr(cfg, "repo", None) else repo_path + gh_repo = getattr(cfg, "github_repo", "") or "" + if "/" in gh_repo: + owner, repo_name = gh_repo.split("/", 1) + except Exception: # noqa: BLE001 — audit must work even without a config + pass + + report = audit(repo_path) + + if getattr(args, "json", False): + payload = { + "probes_run": report.probes_run, + "violations": [ + { + "probe": v.probe, + "target": v.target, + "severity": v.severity, + "title": v.title, + "metrics": v.metrics, + } + for v in report.violations + ], + "errors": report.errors, + } + typer.echo(_json.dumps(payload, indent=2, sort_keys=True)) + else: + typer.echo( + f"audit: probes_run={report.probes_run} " + f"violations={len(report.violations)} errors={list(report.errors)}" + ) + for v in report.violations: + typer.echo(f" [P{v.severity}] {v.probe}: {v.target}") + typer.echo(f" {v.title}") + for probe_name, err in report.errors.items(): + typer.echo(f" ! probe {probe_name} crashed: {err}", err=True) + + if not getattr(args, "apply", False): + return 0 + + if not owner or not repo_name: + typer.echo( + "audit: --apply requires github_repo configured (owner/repo)", + err=True, + ) + return 2 + + if report.is_clean: + typer.echo("audit: clean — nothing to file.") + return 0 + + try: + gh_client = _gh_client_factory() + except Exception as exc: # noqa: BLE001 + typer.echo(f"audit: gh client init failed: {exc}", err=True) + return 1 + + # Wire events file from the resolved config (best-effort). + events_file: Path | None = None + try: + events_file = cfg.events_file # type: ignore[name-defined] + except Exception: # noqa: BLE001 + events_file = None + + def _emit_filed(v, number): # noqa: ANN001 — internal + if events_file is None: + return + try: + from forge_loop.events import AuditViolationFiledEvent, emit + + emit( + events_file, + AuditViolationFiledEvent( + probe=v.probe, + target=v.target, + severity=v.severity, + issue_number=number, + title=v.title, + ), + ) + except Exception: # noqa: BLE001 — never let event emit kill --apply + pass + + outcome = file_violations( + report, + gh_client, + owner=owner, + repo=repo_name, + extra_labels=extra_labels, + emit_filed=_emit_filed, + ) + for v, number in outcome.filed: + typer.echo(f"audit: filed #{number}: {v.title}") + for v in outcome.skipped: + typer.echo(f"audit: skipped (already filed): {v.probe}:{v.target}") + if outcome.errors: + for key, err in outcome.errors.items(): + typer.echo(f"audit: ERROR filing {key}: {err}", err=True) + return 1 + return 0 + + def _cmd_record_session(args: SimpleNamespace) -> int: from forge_loop._testing.recorder import SessionRecorder from forge_loop.worker import make_brief @@ -1597,6 +1723,21 @@ def cmd_brainstorm( _exit(_cmd_brainstorm(SimpleNamespace(apply=apply))) +@app.command( + "audit", + help="Codebase-state audit (issue #156). Dry-run by default; --apply files tickets.", +) +def cmd_audit( + apply: bool = typer.Option( + False, "--apply", help="File one ticket per violation (idempotent)." + ), + json_: bool = typer.Option( + False, "--json", help="Emit the report as JSON (for scripts/dashboards)." + ), +) -> None: + _exit(_cmd_audit(SimpleNamespace(apply=apply, json=json_))) + + @app.command("record-session", help="Record a real SDK session to a JSONL fixture.") def cmd_record_session( issue: int | None = typer.Option(None, "--issue"), diff --git a/src/forge_loop/codebase_audit.py b/src/forge_loop/codebase_audit.py new file mode 100644 index 0000000..d805d73 --- /dev/null +++ b/src/forge_loop/codebase_audit.py @@ -0,0 +1,409 @@ +"""Codebase-state auditor (issue #156). + +Per-PR critic catches **deltas**; this auditor catches **accumulation**. + +Background — boiling-frog failure +================================= + +``src/forge_loop/cli.py`` is >1700 LOC. The quality manifesto's soft cap +for Python modules is 500. The per-PR critic never flagged it because +cli.py grew 50-100 LOC at a time across many PRs — each increment was a +reasonable diff, but the cumulative state-violation slipped past every +per-PR review. + +This module solves that gap by walking the repository, running a set of +**state-based probes** against it, and reporting (and optionally filing +tickets for) accumulation violations the per-PR critic structurally +cannot see. + +Design +====== + +* :class:`Probe` is a typed ``Protocol`` — every concrete probe lives + under :mod:`forge_loop.audit_probes` and emits zero or more + :class:`Violation` objects. +* :class:`AuditReport` is the aggregate result of one audit pass. +* :func:`audit` walks a repository with the configured probes. The + result is **dry-run by default**: the caller decides whether to file + tickets. +* :func:`file_violations` does the gh side: one ticket per violation, + axis-labeled ``modernization-gated``, with acceptance criteria + scaffolded into the body. Idempotency is handled by listing the + ``audit:probe-`` label on each existing ticket and skipping + duplicates (same probe + same target). +* Typed events: :class:`AuditViolationFiledEvent` / :class:`AuditCleanEvent` + live in :mod:`forge_loop.events`. + +Why a separate module (not absorbed into ``maintenance.py``) +============================================================ + +``maintenance.py`` is an LLM-as-PM grooming agent — qualitative +judgement calls (is this issue stale? which dupe is canonical?). +The auditor is the opposite shape: deterministic, regex/AST-driven, +zero LLM involvement. Keeping them apart means the audit pass runs in +<1s with no model spend and is easy to reason about in isolation. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Iterable, Protocol + + +# --------------------------------------------------------------------------- +# Public types +# --------------------------------------------------------------------------- + + +# Axis label for tickets the auditor files. Same shape as other +# loop-managed axis labels (``loop:ready`` etc.) — operators / dispatch +# can filter by it. +AUDIT_AXIS_LABEL = "modernization-gated" + +# Per-probe label prefix used for idempotency. ``audit:probe-file-size`` +# on an issue means "this ticket was filed by the file-size probe". +PROBE_LABEL_PREFIX = "audit:probe-" + + +@dataclass(frozen=True) +class Violation: + """One state-rule violation surfaced by a probe. + + ``probe`` is the probe name (matches :attr:`Probe.name`); ``target`` + is a probe-specific identifier (file path, function FQN, module). + The pair ``(probe, target)`` is the dedup key for ticket filing. + + ``severity`` is a 1-5 scale (1 = highest), purely informational. + + ``rationale`` should explain WHY this violation matters in + manifesto terms (citing the rule, citing downstream axis work + blocked) so the resulting ticket reads cleanly. + + ``acceptance`` is a markdown bullet list scaffolded into the ticket + body so the worker that picks it up has unambiguous acceptance + criteria. + """ + + probe: str + target: str + severity: int + title: str + rationale: str + acceptance: list[str] = field(default_factory=list) + metrics: dict[str, int | float | str] = field(default_factory=dict) + + @property + def probe_label(self) -> str: + return f"{PROBE_LABEL_PREFIX}{self.probe}" + + def dedup_key(self) -> tuple[str, str]: + return (self.probe, self.target) + + +class Probe(Protocol): + """Each probe inspects the repository and yields violations. + + Implementations live under :mod:`forge_loop.audit_probes`. They MUST: + + * Be pure with respect to the filesystem — no network, no state + writes — so the audit pass is cheap and reproducible. + * Yield zero violations on a clean repository. + * Surface a stable ``name`` (used as the probe-label suffix; do not + change without a migration). + """ + + name: str + + def scan(self, repo: Path) -> Iterable[Violation]: ... + + +@dataclass +class AuditReport: + """Aggregate of one audit pass. + + ``violations`` is the flat list across every probe; ``by_probe`` is + the same data grouped for convenient summarisation. + ``probes_run`` lets the report distinguish "no violations because + we didn't probe" from "no violations because the repo is clean". + ``errors`` keys a probe name to the exception string if that probe + crashed — one probe blowing up MUST NOT take down the rest. + """ + + violations: list[Violation] = field(default_factory=list) + probes_run: list[str] = field(default_factory=list) + errors: dict[str, str] = field(default_factory=dict) + + @property + def is_clean(self) -> bool: + return not self.violations + + def by_probe(self) -> dict[str, list[Violation]]: + out: dict[str, list[Violation]] = {} + for v in self.violations: + out.setdefault(v.probe, []).append(v) + return out + + +# --------------------------------------------------------------------------- +# Repo walk — single source of truth for "which files do we audit?" +# --------------------------------------------------------------------------- + + +_DEFAULT_IGNORE_DIR_NAMES: frozenset[str] = frozenset({ + ".git", + ".venv", + "venv", + "node_modules", + "__pycache__", + ".mypy_cache", + ".pytest_cache", + ".ruff_cache", + "dist", + "build", + "target", + ".tox", + ".nox", + "site-packages", +}) + + +def walk_source_files( + repo: Path, + *, + suffixes: tuple[str, ...] = (".py", ".ts", ".tsx", ".js", ".java"), + ignore_dirs: frozenset[str] = _DEFAULT_IGNORE_DIR_NAMES, +) -> Iterable[Path]: + """Yield every source file under ``repo`` whose suffix matches. + + Skips common vendor / cache / build directories so probes don't burn + cycles on generated code. Probes that need a different filter should + call this with their own ``suffixes``. + + The walk is intentionally simple — no .gitignore parsing — because + the ignore set above already covers everything that matters in our + layout, and adding pygit2 / pathspec for one feature is overkill. + """ + repo = repo.resolve() + suffix_set = {s if s.startswith(".") else f".{s}" for s in suffixes} + stack: list[Path] = [repo] + while stack: + cur = stack.pop() + try: + entries = list(cur.iterdir()) + except (FileNotFoundError, PermissionError): + continue + for entry in entries: + name = entry.name + if entry.is_dir(): + if name in ignore_dirs or name.startswith("."): + # Skip hidden + ignored dirs but keep ``.`` itself + # (the repo root) which never recurses here. + if name not in {".github"}: # keep workflows visible + continue + stack.append(entry) + elif entry.is_file(): + if entry.suffix in suffix_set: + yield entry + + +# --------------------------------------------------------------------------- +# audit() — the entrypoint +# --------------------------------------------------------------------------- + + +def default_probes() -> list[Probe]: + """Built-in probes shipped with forge-loop. + + A function (not a module-level list) so importing this module + doesn't drag in every probe — and so tests can monkey-patch. + """ + from forge_loop.audit_probes.file_size import FileSizeProbe + + return [FileSizeProbe()] + + +def audit(repo: Path, probes: list[Probe] | None = None) -> AuditReport: + """Run every probe against ``repo`` and aggregate the results. + + A probe that raises is recorded in ``report.errors`` and skipped — + one bad probe MUST NOT prevent the others from running. This is the + same belt-and-braces shape used by the stuck-sweep tick guard. + + The function is pure: no event emission, no gh calls, no ticket + filing. Callers (CLI, tick loop) decide what to do with the report. + """ + probes_list = probes if probes is not None else default_probes() + report = AuditReport() + for probe in probes_list: + report.probes_run.append(probe.name) + try: + found = list(probe.scan(repo)) + except Exception as ex: # noqa: BLE001 — one bad probe must not kill the pass + report.errors[probe.name] = f"{type(ex).__name__}: {ex}"[:300] + continue + report.violations.extend(found) + return report + + +# --------------------------------------------------------------------------- +# Ticket filing — gh side. Idempotent across runs. +# --------------------------------------------------------------------------- + + +# Loose protocol of what we need from the gh client. Matches both +# ``GithubkitClient`` and ``MockGhClient`` structurally without forcing +# callers to import the heavy module. +class _GhLike(Protocol): + def issues_by_label(self, owner: str, repo: str, label: str, limit: int) -> list: ... + def create_issue( + self, owner: str, repo: str, title: str, body: str, labels: list[str] + ): ... + + +def render_ticket_body(v: Violation) -> str: + """Render the markdown body of an audit-filed ticket. + + The body restates the rationale, lists acceptance criteria, and + closes with a footer identifying the probe + target so a human + reading the ticket can map back to the audit run that filed it. + """ + lines = [ + "## Why", + "", + v.rationale.strip(), + "", + "## Acceptance", + ] + if v.acceptance: + for crit in v.acceptance: + lines.append(f"- {crit}") + else: + lines.append("- (probe did not scaffold acceptance criteria — fill in before working)") + if v.metrics: + lines.extend(["", "## Metrics", ""]) + for k, val in v.metrics.items(): + lines.append(f"- `{k}`: {val}") + lines.extend([ + "", + "---", + f"_Filed by `forge-loop audit` — probe `{v.probe}`, target `{v.target}`._", + ]) + return "\n".join(lines) + + +@dataclass +class FilingOutcome: + """Result of one :func:`file_violations` call. + + ``filed`` lists the violations that produced new tickets; + ``skipped`` lists those that matched an existing open ticket + (idempotent path); ``errors`` keys violation dedup-keys to the + exception string when create_issue blew up. + """ + + filed: list[tuple[Violation, int]] = field(default_factory=list) + skipped: list[Violation] = field(default_factory=list) + errors: dict[str, str] = field(default_factory=dict) + + +def _existing_targets_for_probe( + gh: _GhLike, owner: str, repo: str, probe_name: str +) -> set[str]: + """Pull existing open issues for a probe and parse their target footers. + + Matches the ``probe ``, target ``.`` line we inject in + :func:`render_ticket_body`. A missing footer (older ticket, hand- + edited) is gracefully ignored — the worst case is one duplicate + ticket, not a crash. + """ + label = f"{PROBE_LABEL_PREFIX}{probe_name}" + try: + issues = gh.issues_by_label(owner, repo, label, 100) + except Exception: # noqa: BLE001 — best effort; on failure we may dup + return set() + pattern = re.compile( + rf"probe `{re.escape(probe_name)}`, target `([^`]+)`" + ) + out: set[str] = set() + for issue in issues: + body = getattr(issue, "body", "") or "" + m = pattern.search(body) + if m: + out.add(m.group(1)) + return out + + +def file_violations( + report: AuditReport, + gh: _GhLike, + *, + owner: str, + repo: str, + extra_labels: tuple[str, ...] = (), + emit_filed: Callable[[Violation, int], None] | None = None, + emit_clean: Callable[[list[str]], None] | None = None, +) -> FilingOutcome: + """File one GitHub issue per violation, deduping against existing. + + * ``extra_labels`` is appended to every created ticket — operators + use this to inject org-specific labels (``team:platform`` etc.). + * The ``audit:probe-`` label is always added so the next + audit pass can find the existing ticket and dedup. + * ``AUDIT_AXIS_LABEL`` (``modernization-gated``) is always added. + * If the report has zero violations, ``emit_clean`` is invoked with + the probes-run list so the operator can see the auditor actually + ran (and didn't just have no probes loaded). + + Idempotency: we list existing open tickets carrying the probe label + and parse the target footer. A violation whose target is already + present is recorded in ``outcome.skipped`` and NOT re-filed. + """ + outcome = FilingOutcome() + if report.is_clean: + if emit_clean is not None: + emit_clean(list(report.probes_run)) + return outcome + + # Group by probe so we issue one issues_by_label call per probe, + # not one per violation. + by_probe: dict[str, list[Violation]] = {} + for v in report.violations: + by_probe.setdefault(v.probe, []).append(v) + + for probe_name, violations in by_probe.items(): + existing = _existing_targets_for_probe(gh, owner, repo, probe_name) + for v in violations: + if v.target in existing: + outcome.skipped.append(v) + continue + labels = [AUDIT_AXIS_LABEL, v.probe_label, *extra_labels] + body = render_ticket_body(v) + try: + created = gh.create_issue(owner, repo, v.title, body, list(labels)) + except Exception as ex: # noqa: BLE001 + outcome.errors[f"{v.probe}:{v.target}"] = ( + f"{type(ex).__name__}: {ex}"[:200] + ) + continue + number = int(getattr(created, "number", 0) or 0) + outcome.filed.append((v, number)) + if emit_filed is not None: + emit_filed(v, number) + return outcome + + +__all__ = [ + "AUDIT_AXIS_LABEL", + "PROBE_LABEL_PREFIX", + "AuditReport", + "FilingOutcome", + "Probe", + "Violation", + "audit", + "default_probes", + "file_violations", + "render_ticket_body", + "walk_source_files", +] diff --git a/src/forge_loop/events.py b/src/forge_loop/events.py index 8705403..366df81 100644 --- a/src/forge_loop/events.py +++ b/src/forge_loop/events.py @@ -208,6 +208,37 @@ class WorktreeReapedEvent(EventBase): status: str = "" +@register_event +class AuditViolationFiledEvent(EventBase): + """One :func:`forge_loop.codebase_audit.file_violations` filing. + + Emitted per ticket created — operators can see the audit pass + actually translated a manifesto-state violation into a downstream + work item without grepping the gh API. + """ + + KIND: ClassVar[str] = "audit_violation_filed" + probe: str = "" + target: str = "" + severity: int = Field(ge=1, le=5, default=2) + issue_number: int = Field(ge=0, default=0) + title: str = "" + + +@register_event +class AuditCleanEvent(EventBase): + """The audit pass found zero violations. + + Emitted at the end of a clean pass — the explicit "clean" event is + important because absence-of-violations could otherwise look like + the auditor never ran. ``probes_run`` lets the operator confirm the + probe set that actually fired. + """ + + KIND: ClassVar[str] = "audit_clean" + probes_run: list[str] = Field(default_factory=list) + + @register_event class StuckSweepDemotedEvent(EventBase): """A per-tick stuck-sweep decision (issue #129). @@ -305,6 +336,8 @@ def append_event_with_registry_check(events_path: Path, kind: str, **fields: Any __all__ = [ "EVENT_REGISTRY", + "AuditCleanEvent", + "AuditViolationFiledEvent", "EventBase", "LoopStartEvent", "LoopStopEvent", diff --git a/src/forge_loop/runner/tick.py b/src/forge_loop/runner/tick.py index 54a0b35..38f25a4 100644 --- a/src/forge_loop/runner/tick.py +++ b/src/forge_loop/runner/tick.py @@ -454,10 +454,71 @@ def _run_stuck_sweep(cfg: Config, tick: int) -> SweepReport | None: return report +def _run_codebase_audit(cfg: Config, tick: int) -> None: + """Per-cadence codebase-state audit (issue #156). + + Runs on the maintenance cadence so the deterministic state-rule + check (regex / LOC counting) lands alongside the LLM grooming pass. + Dry-run on the runner: we emit ``audit_violation_filed`` / + ``audit_clean`` events so the operator dashboard surfaces drift, but + we do NOT auto-file tickets here. The ``forge-loop audit --apply`` + CLI command is the explicit, operator-driven side. + + Best-effort: any exception is swallowed with a single event. The + tick itself must never crash because of an audit pass. + """ + try: + from forge_loop.codebase_audit import audit as _audit + from forge_loop.events import AuditCleanEvent, emit + + report = _audit(cfg.repo) + except Exception as ex: # noqa: BLE001 + append_event( + cfg.events_file, + "audit_skipped", + tick=tick, + reason=f"{type(ex).__name__}: {ex}"[:200], + ) + return + if report.is_clean: + try: + emit(cfg.events_file, AuditCleanEvent(probes_run=list(report.probes_run))) + except Exception: # noqa: BLE001 + pass + return + # Non-empty: emit one summary event per violation (probe + target). + # No gh side-effect — the operator runs `forge-loop audit --apply`. + for v in report.violations: + append_event( + cfg.events_file, + "audit_violation_observed", + tick=tick, + probe=v.probe, + target=v.target, + severity=v.severity, + title=v.title, + metrics=v.metrics, + ) + for probe_name, err in report.errors.items(): + append_event( + cfg.events_file, + "audit_probe_crashed", + tick=tick, + probe=probe_name, + err=err, + ) + + def _tick(cfg: Config, tick: int) -> None: # Imported lazily to avoid an import cycle (boot.py imports tick.py). from forge_loop.runner.boot import _short_sleep + # Codebase-state audit (issue #156) — same cadence as maintenance. + # Runs *before* the maintenance branch so it fires even when the + # maintenance subagent is the body of the tick. + if cfg.maintenance_every_n_ticks > 0 and tick % cfg.maintenance_every_n_ticks == 0: + _run_codebase_audit(cfg, tick) + # Maintenance ticks: every Nth tick, run the AI-as-PM subagent instead # of dispatching workers. The maintenance agent grooms + triages the # backlog so subsequent ticks have a clean queue. diff --git a/tests/test_codebase_audit.py b/tests/test_codebase_audit.py new file mode 100644 index 0000000..7e9a8c9 --- /dev/null +++ b/tests/test_codebase_audit.py @@ -0,0 +1,441 @@ +"""Tests for the codebase-state auditor (issue #156). + +Covers the framework + the first probe (file-size) + ticket-filing +idempotency. Hits both the manifesto T1 (state-machine edges) and T2 +(adversarial / sad path) rules: + +* Happy paths: clean repo emits an ``audit_clean`` filing; a planted + oversized file produces a violation that becomes a ticket. +* Adversarial: probe crashes are isolated; ``--apply`` against an empty + report is a no-op; second-run dedup skips already-filed targets; + empty / binary / unreadable files don't crash the LOC counter. +* Plus a CLI integration that drives ``cmd_audit`` against a planted + worktree. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path +from types import SimpleNamespace +from typing import Iterable + +import pytest + +from forge_loop.audit_probes.file_size import ( + DEFAULT_THRESHOLDS, + FileSizeProbe, + _count_significant_lines, +) +from forge_loop.codebase_audit import ( + AUDIT_AXIS_LABEL, + PROBE_LABEL_PREFIX, + AuditReport, + FilingOutcome, + Violation, + audit, + file_violations, + render_ticket_body, + walk_source_files, +) +from forge_loop.events import AuditCleanEvent, AuditViolationFiledEvent +from forge_loop.gh_client import Issue, MockGhClient + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _plant_py_file(path: Path, n_significant: int) -> None: + """Write a .py file with exactly ``n_significant`` significant lines.""" + path.parent.mkdir(parents=True, exist_ok=True) + lines = [f"x_{i} = {i}" for i in range(n_significant)] + path.write_text("\n".join(lines) + "\n") + + +@dataclass +class StubProbe: + name: str = "stub" + output: list[Violation] = None # type: ignore[assignment] + + def __post_init__(self) -> None: + if self.output is None: + self.output = [] + + def scan(self, repo: Path) -> Iterable[Violation]: # noqa: ARG002 + return list(self.output) + + +@dataclass +class BoomProbe: + name: str = "boom" + + def scan(self, repo: Path) -> Iterable[Violation]: # noqa: ARG002 + raise RuntimeError("probe blew up") + + +def _violation(probe: str = "file-size", target: str = "src/x.py") -> Violation: + return Violation( + probe=probe, + target=target, + severity=2, + title=f"refactor({target}): split", + rationale="manifesto cap exceeded", + acceptance=["split it"], + metrics={"loc": 1000, "soft_cap": 500}, + ) + + +# --------------------------------------------------------------------------- +# walk_source_files +# --------------------------------------------------------------------------- + + +def test_walk_source_files_skips_vendor_dirs(tmp_path: Path) -> None: + _plant_py_file(tmp_path / "src" / "ok.py", 5) + _plant_py_file(tmp_path / ".venv" / "lib" / "skip.py", 5) + _plant_py_file(tmp_path / "node_modules" / "skip.py", 5) + _plant_py_file(tmp_path / "__pycache__" / "skip.py", 5) + + found = {p.name for p in walk_source_files(tmp_path)} + assert found == {"ok.py"} + + +def test_walk_source_files_suffix_filter(tmp_path: Path) -> None: + _plant_py_file(tmp_path / "a.py", 1) + (tmp_path / "b.md").write_text("# not source") + found = {p.suffix for p in walk_source_files(tmp_path)} + assert ".md" not in found + assert ".py" in found + + +# --------------------------------------------------------------------------- +# _count_significant_lines — T5 (small property surface) + adversarial +# --------------------------------------------------------------------------- + + +def test_count_significant_lines_skips_blanks_and_comments(tmp_path: Path) -> None: + f = tmp_path / "x.py" + f.write_text( + "\n".join( + [ + "# header comment", + "", + "x = 1", + " ", + "// not python but still a comment-shape", + "y = 2", + " # indented comment", + ] + ) + ) + assert _count_significant_lines(f) == 2 + + +def test_count_significant_lines_handles_missing_file(tmp_path: Path) -> None: + # No crash on a path that doesn't exist; just returns 0. + assert _count_significant_lines(tmp_path / "nope.py") == 0 + + +def test_count_significant_lines_handles_binary(tmp_path: Path) -> None: + f = tmp_path / "binary.py" + f.write_bytes(b"\xff\xfe\x00\x01garbage\x80\x90") + # Either 0 (decode failure) or a positive count — the contract is + # "doesn't raise". Assert no exception and a non-negative result. + assert _count_significant_lines(f) >= 0 + + +# --------------------------------------------------------------------------- +# FileSizeProbe — happy + threshold edges +# --------------------------------------------------------------------------- + + +def test_file_size_probe_clean_repo_yields_nothing(tmp_path: Path) -> None: + _plant_py_file(tmp_path / "src" / "small.py", 10) + probe = FileSizeProbe(thresholds={".py": 500}) + assert list(probe.scan(tmp_path)) == [] + + +def test_file_size_probe_flags_oversized_file(tmp_path: Path) -> None: + _plant_py_file(tmp_path / "src" / "huge.py", 1700) + probe = FileSizeProbe(thresholds={".py": 500}) + violations = list(probe.scan(tmp_path)) + assert len(violations) == 1 + v = violations[0] + assert v.probe == "file-size" + assert v.target == "src/huge.py" + assert v.metrics["loc"] == 1700 + assert v.metrics["soft_cap"] == 500 + # 1700/500 = 3.4 ≥ default hard multiplier (2.0) → severity 1. + assert v.severity == 1 + + +def test_file_size_probe_at_threshold_is_not_flagged(tmp_path: Path) -> None: + _plant_py_file(tmp_path / "edge.py", 500) + probe = FileSizeProbe(thresholds={".py": 500}) + assert list(probe.scan(tmp_path)) == [] + + +def test_file_size_probe_severity_2_below_hard_multiplier(tmp_path: Path) -> None: + _plant_py_file(tmp_path / "med.py", 700) + probe = FileSizeProbe(thresholds={".py": 500}, hard_threshold_multiplier=2.0) + [v] = list(probe.scan(tmp_path)) + assert v.severity == 2 + + +def test_file_size_probe_per_language_thresholds(tmp_path: Path) -> None: + # Java soft cap is 600, TS is 400. + (tmp_path / "Big.java").write_text( + "\n".join(f"int v{i} = {i};" for i in range(601)) + ) + (tmp_path / "big.ts").write_text( + "\n".join(f"const v{i} = {i};" for i in range(401)) + ) + (tmp_path / "small.ts").write_text("const x = 1;\n") + probe = FileSizeProbe(thresholds=DEFAULT_THRESHOLDS) + targets = {v.target for v in probe.scan(tmp_path)} + assert "Big.java" in targets + assert "big.ts" in targets + assert "small.ts" not in targets + + +# --------------------------------------------------------------------------- +# audit() — orchestration + error isolation +# --------------------------------------------------------------------------- + + +def test_audit_runs_every_probe_and_aggregates(tmp_path: Path) -> None: + v1 = _violation(probe="a", target="x") + v2 = _violation(probe="b", target="y") + report = audit(tmp_path, probes=[StubProbe(name="a", output=[v1]), StubProbe(name="b", output=[v2])]) + assert report.probes_run == ["a", "b"] + assert report.violations == [v1, v2] + assert report.errors == {} + assert not report.is_clean + + +def test_audit_isolates_probe_crash(tmp_path: Path) -> None: + # Adversarial — one probe blows up, the other still runs. + good = _violation(probe="good", target="z") + report = audit(tmp_path, probes=[BoomProbe(), StubProbe(name="good", output=[good])]) + assert "boom" in report.errors + assert "RuntimeError" in report.errors["boom"] + assert report.violations == [good] + assert report.probes_run == ["boom", "good"] + + +def test_audit_clean_repo_with_real_probe(tmp_path: Path) -> None: + _plant_py_file(tmp_path / "small.py", 5) + report = audit(tmp_path, probes=[FileSizeProbe(thresholds={".py": 500})]) + assert report.is_clean + assert report.probes_run == ["file-size"] + + +# --------------------------------------------------------------------------- +# Ticket filing — happy + idempotent dedup + adversarial create failure +# --------------------------------------------------------------------------- + + +def test_file_violations_files_one_ticket_per_violation() -> None: + report = AuditReport( + violations=[_violation(target="a.py"), _violation(target="b.py")], + probes_run=["file-size"], + ) + gh = MockGhClient() + filed: list[tuple[Violation, int]] = [] + + outcome = file_violations( + report, gh, owner="o", repo="r", + emit_filed=lambda v, n: filed.append((v, n)), + ) + + assert len(outcome.filed) == 2 + assert outcome.skipped == [] + assert outcome.errors == {} + assert len(filed) == 2 + # Labels must include axis + probe-label. + create_calls = [kw for (m, kw) in gh.calls if m == "create_issue"] + assert len(create_calls) == 2 + for kw in create_calls: + assert AUDIT_AXIS_LABEL in kw["labels"] + assert f"{PROBE_LABEL_PREFIX}file-size" in kw["labels"] + + +def test_file_violations_clean_report_emits_clean_event_only() -> None: + report = AuditReport(violations=[], probes_run=["file-size"]) + gh = MockGhClient() + cleans: list[list[str]] = [] + + outcome = file_violations( + report, gh, owner="o", repo="r", + emit_clean=cleans.append, + ) + + # No tickets created. + assert not [m for (m, _) in gh.calls if m == "create_issue"] + assert outcome.filed == [] and outcome.skipped == [] and outcome.errors == {} + assert cleans == [["file-size"]] + + +def test_file_violations_is_idempotent_via_dedup() -> None: + """A second filing run with the same target must not re-create the ticket.""" + v = _violation(target="src/huge.py") + body = render_ticket_body(v) + # Seed the mock with an existing open ticket carrying both labels + + # the footer the dedup logic parses. + existing = Issue( + number=42, + title=v.title, + body=body, + state="open", + labels=[AUDIT_AXIS_LABEL, v.probe_label], + ) + gh = MockGhClient( + issues={("o", "r", 42): existing}, + issues_by_label_response=[existing], + ) + + report = AuditReport(violations=[v], probes_run=["file-size"]) + outcome = file_violations(report, gh, owner="o", repo="r") + + assert outcome.filed == [] + assert outcome.skipped == [v] + assert not [m for (m, _) in gh.calls if m == "create_issue"] + + +def test_file_violations_surfaces_create_failure() -> None: + from forge_loop.gh_client import GhError + + v = _violation(target="src/x.py") + gh = MockGhClient( + raise_on_create_titles={v.title: GhError("create_issue", 422, "boom")}, + ) + report = AuditReport(violations=[v], probes_run=["file-size"]) + + outcome = file_violations(report, gh, owner="o", repo="r") + + assert outcome.filed == [] + assert outcome.skipped == [] + assert any("file-size:src/x.py" in k for k in outcome.errors) + + +def test_render_ticket_body_contains_footer_and_acceptance() -> None: + v = _violation(target="src/huge.py") + body = render_ticket_body(v) + # Footer is the dedup contract — assert exact shape. + assert "probe `file-size`, target `src/huge.py`" in body + # Acceptance bullets surface in the body. + assert "## Acceptance" in body + assert "- split it" in body + + +# --------------------------------------------------------------------------- +# Typed events — schema validation +# --------------------------------------------------------------------------- + + +def test_audit_events_are_typed_and_validated() -> None: + e = AuditViolationFiledEvent( + probe="file-size", target="x.py", severity=2, + issue_number=99, title="t", + ) + rec = e.to_record() + assert rec["kind"] == "audit_violation_filed" + assert rec["issue_number"] == 99 + + # Out-of-range severity is rejected by pydantic. + with pytest.raises(Exception): + AuditViolationFiledEvent(severity=99) # ge=1, le=5 + + clean = AuditCleanEvent(probes_run=["a", "b"]) + assert clean.to_record()["probes_run"] == ["a", "b"] + + +# --------------------------------------------------------------------------- +# CLI integration — drives `_cmd_audit` end-to-end with a planted repo +# --------------------------------------------------------------------------- + + +def test_cli_audit_dry_run_no_gh_calls(tmp_path: Path, monkeypatch) -> None: + """`forge-loop audit` default path must not hit gh — even with violations.""" + from forge_loop import cli + + _plant_py_file(tmp_path / "src" / "huge.py", 600) + + monkeypatch.chdir(tmp_path) + # Make load() fail so we exercise the "no config" branch — audit must + # still succeed on the dry-run path. + monkeypatch.setattr(cli, "load", lambda: (_ for _ in ()).throw(RuntimeError("nope"))) + + gh_calls: list = [] + + def _no_gh(): + gh_calls.append("called") + raise AssertionError("gh client must not be constructed on dry-run") + + monkeypatch.setattr(cli, "_gh_client_factory", _no_gh) + + rc = cli._cmd_audit(SimpleNamespace(apply=False, json=False)) + assert rc == 0 + assert gh_calls == [] + + +def test_cli_audit_json_emits_violations(tmp_path: Path, monkeypatch, capsys) -> None: + from forge_loop import cli + + _plant_py_file(tmp_path / "src" / "huge.py", 800) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(cli, "load", lambda: (_ for _ in ()).throw(RuntimeError("nope"))) + + rc = cli._cmd_audit(SimpleNamespace(apply=False, json=True)) + assert rc == 0 + out = capsys.readouterr().out + payload = json.loads(out) + assert payload["probes_run"] == ["file-size"] + assert any( + v["target"] == "src/huge.py" for v in payload["violations"] + ) + + +def test_cli_audit_apply_requires_github_repo(tmp_path: Path, monkeypatch) -> None: + from forge_loop import cli + + _plant_py_file(tmp_path / "src" / "huge.py", 800) + monkeypatch.chdir(tmp_path) + monkeypatch.setattr(cli, "load", lambda: (_ for _ in ()).throw(RuntimeError("nope"))) + + rc = cli._cmd_audit(SimpleNamespace(apply=True, json=False)) + # No owner/repo configured -> exit 2 with a clear message. + assert rc == 2 + + +def test_cli_audit_apply_files_tickets_via_mock_gh( + tmp_path: Path, monkeypatch +) -> None: + """End-to-end --apply path: plant a violation, drive CLI, see one create_issue call.""" + from forge_loop import cli + + _plant_py_file(tmp_path / "src" / "huge.py", 800) + monkeypatch.chdir(tmp_path) + + cfg = SimpleNamespace( + repo=tmp_path, + github_repo="o/r", + events_file=tmp_path / "events.jsonl", + ) + monkeypatch.setattr(cli, "load", lambda: cfg) + + gh = MockGhClient() + monkeypatch.setattr(cli, "_gh_client_factory", lambda: gh) + + rc = cli._cmd_audit(SimpleNamespace(apply=True, json=False)) + assert rc == 0 + create_calls = [kw for (m, kw) in gh.calls if m == "create_issue"] + assert len(create_calls) == 1 + assert AUDIT_AXIS_LABEL in create_calls[0]["labels"] + # Event was emitted to the configured file. + assert cfg.events_file.exists() + body = cfg.events_file.read_text() + assert "audit_violation_filed" in body