Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/forge_loop/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,13 @@ class Config:

worker_max_iterations: int = 3

# Stuck-issue sweep (issue #129). Minimum consecutive
# ``worker_iterations_exhausted`` events before the per-tick sweep
# demotes the issue from ``loop:ready`` to ``loop:needs-human``.
# Sourced from settings.maintenance.stuck_threshold_attempts.
stuck_threshold_attempts: int = 2
stuck_tail_events: int = 100

@property
def state_dir(self) -> Path:
return self.repo / "docs" / "ops"
Expand Down Expand Up @@ -245,6 +252,8 @@ def _from_settings(s: Settings) -> Config:
),
lumen=LumenConfig(top_k=s.lumen.top_k),
worker_max_iterations=s.iteration.max_iterations,
stuck_threshold_attempts=s.maintenance.stuck_threshold_attempts,
stuck_tail_events=s.maintenance.stuck_tail_events,
)


Expand Down
23 changes: 23 additions & 0 deletions src/forge_loop/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,28 @@ class WorktreeReapedEvent(EventBase):
status: str = ""


@register_event
class StuckSweepDemotedEvent(EventBase):
    """Record of one issue the per-tick stuck sweep acted on (issue #129).

    ``forge_loop.stuck_sweep.sweep`` emits one of these for every issue it
    touches. ``ok=True`` marks a demotion that went through; ``ok=False``
    marks a gh API failure, with ``reason`` carrying the detail so the
    operator can see what blew up without grepping structlog.

    Idempotency skips (the issue had already lost ``loop:ready``) are
    deliberately NOT emitted — they carry no operational signal.
    """

    KIND: ClassVar[str] = "stuck_sweep_demoted"

    # Required. Issue number the sweep acted on; validated as >= 1.
    issue: int = Field(..., ge=1)
    # Required. Attempt count recorded for the issue; validated as >= 1.
    # NOTE(review): presumably the number of ``worker_iterations_exhausted``
    # events counted by the sweep — confirm against stuck_sweep.
    attempts: int = Field(..., ge=1)
    # Free-form state string; empty when not supplied.
    # NOTE(review): exact vocabulary is defined by the sweep — confirm.
    last_state: str = Field(default="")
    # Optional pull-request URL associated with the issue, if any.
    pr_url: str | None = Field(default=None)
    # True for a successful demotion, False for a gh API failure.
    ok: bool = Field(default=True)
    # Failure detail accompanying ``ok=False``; empty otherwise.
    reason: str = Field(default="")


# ---------------------------------------------------------------------------
# Emit + back-compat shim. ``emit`` is the typed path; ``append_event_with_
# registry_check`` is the back-compat wrapper called by state.append_event.
Expand Down Expand Up @@ -287,6 +309,7 @@ def append_event_with_registry_check(events_path: Path, kind: str, **fields: Any
"LoopStartEvent",
"LoopStopEvent",
"RedeployEvent",
"StuckSweepDemotedEvent",
"TickStartEvent",
"WorkerSessionRecoveredEvent",
"WorkerSessionTransitionEvent",
Expand Down
66 changes: 66 additions & 0 deletions src/forge_loop/runner/tick.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
_maybe_deploy_drift_halt,
)
from forge_loop.state import append_event, consolidate_sprint, write_state
from forge_loop.stuck_sweep import SweepReport, sweep as _stuck_sweep
from forge_loop.worker import WorkerOutcome


Expand Down Expand Up @@ -324,6 +325,66 @@ def _remove_ready_label(
)


def _run_stuck_sweep(cfg: Config, tick: int) -> SweepReport | None:
    """Run the best-effort stuck-issue sweep for one tick (issue #129).

    The sweep is meant to run after the iteration loop — which may have
    recorded ``worker_iterations_exhausted`` for issues whose escalation
    never landed — and before the next dispatch batch, so such an issue is
    demoted here instead of being re-picked by ``top_issues``.

    Nothing in here is allowed to fail the tick: a gh client that cannot be
    constructed, or a sweep that raises, is reduced to a single event and a
    ``None`` return.

    Args:
        cfg: Loop configuration. Reads ``github_repo``, ``events_file``,
            ``stuck_threshold_attempts``, ``stuck_tail_events`` and
            ``labels.ready``.
        tick: Current tick number, attached to every event written here.

    Returns:
        The sweep's report, or ``None`` when the sweep was skipped (no
        usable ``owner/repo`` slug, client init failure) or crashed.
    """
    slug = cfg.github_repo
    if slug is None or "/" not in slug:
        return None
    # Only the first slash separates the owner from the repository name.
    owner, _, repo = slug.partition("/")

    try:
        # Lazy import — keeps the tick startup fast and lets tests stub
        # the constructor via the env-token path without importing
        # githubkit when not needed.
        from forge_loop.gh_client import GithubkitClient

        client = GithubkitClient()
    except Exception as init_err:  # noqa: BLE001
        append_event(
            cfg.events_file,
            "stuck_sweep_skipped",
            tick=tick,
            reason=f"gh_client_init: {init_err}"[:200],
        )
        return None

    try:
        report = _stuck_sweep(
            cfg.events_file,
            client,
            owner=owner,
            repo=repo,
            threshold=cfg.stuck_threshold_attempts,
            ready_label=cfg.labels.ready,
            tail=cfg.stuck_tail_events,
        )
    except Exception as sweep_err:  # noqa: BLE001 — sweep promises not to raise, belt-and-braces
        append_event(
            cfg.events_file,
            "stuck_sweep_crashed",
            tick=tick,
            err=str(sweep_err)[:200],
        )
        return None

    # A sweep that touched nothing leaves no summary event behind.
    if not report.demotions:
        return report

    summary = {
        "tick": tick,
        # Only demotions that actually succeeded are listed as demoted.
        "demoted": [entry.issue for entry in report.demotions if entry.ok],
        "failed": list(report.errors),
        "scanned": report.scanned,
    }
    append_event(cfg.events_file, "stuck_sweep_done", **summary)
    return report


def _tick(cfg: Config, tick: int) -> None:
# Imported lazily to avoid an import cycle (boot.py imports tick.py).
from forge_loop.runner.boot import _short_sleep
Expand Down Expand Up @@ -371,6 +432,11 @@ def _tick(cfg: Config, tick: int) -> None:
def _bus_emit(kind: str, payload: dict[str, Any]) -> None:
append_event(cfg.events_file, kind, **payload)

# Stuck-issue sweep (issue #129) — fires before the next dispatch
# so any issue the iteration loop gave up on but failed to demote
# gets caught here, not re-picked by top_issues below.
_run_stuck_sweep(cfg, tick)

repairs = _blocking_pr_repairs(cfg)
if repairs:
master_log_path = cfg.logs_dir / "master.log"
Expand Down
18 changes: 18 additions & 0 deletions src/forge_loop/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,22 @@ class IterationSettings(BaseSettings):
max_critic_iterations: int = 3


class MaintenanceSettings(BaseSettings):
    """Settings group for the maintenance-tier sweeps (issue #129).

    These sweeps run alongside the LLM groomer.

    ``stuck_threshold_attempts``
        Gate for the stuck-issue sweep: an issue must accumulate at least
        this many ``worker_iterations_exhausted`` events (with no recovery
        in between) before it is demoted from ``loop:ready`` to
        ``loop:needs-human``. Default 2 — one bad run is forgivable, two
        is a pattern.

    ``stuck_tail_events``
        Default 100. NOTE(review): presumably the number of most-recent
        events the stuck sweep inspects — confirm against the sweep
        implementation.
    """

    # Unknown keys in the settings source are ignored rather than rejected.
    model_config = SettingsConfigDict(extra="ignore")

    stuck_threshold_attempts: int = Field(default=2)
    stuck_tail_events: int = Field(default=100)


class MiscSettings(BaseSettings):
"""Misc knobs that don't fit a logical group cleanly."""

Expand Down Expand Up @@ -393,6 +409,7 @@ class Settings(BaseSettings):
operator: OperatorSettings = Field(default_factory=OperatorSettings)
dashboard: DashboardSettings = Field(default_factory=DashboardSettings)
iteration: IterationSettings = Field(default_factory=IterationSettings)
maintenance: MaintenanceSettings = Field(default_factory=MaintenanceSettings)
misc: MiscSettings = Field(default_factory=MiscSettings)

# The repo path itself is resolved at load time (git toplevel or env
Expand Down Expand Up @@ -429,6 +446,7 @@ def load(cls) -> Settings:
"operator": {**(y.get("operator") or {})},
"dashboard": {**(y.get("dashboard") or {})},
"iteration": {**(y.get("iteration") or {})},
"maintenance": {**(y.get("maintenance") or {})},
"misc": {**(y.get("misc") or {})},
}

Expand Down
Loading
Loading