diff --git a/containers/Containerfile b/containers/Containerfile index 7ac1f211..23e87998 100644 --- a/containers/Containerfile +++ b/containers/Containerfile @@ -30,8 +30,9 @@ RUN pip install --no-cache-dir \ # Create workspace directory (will be mounted at runtime) RUN mkdir -p /workspace && chmod 777 /workspace -# Copy entrypoint script +# Copy entrypoint and review module COPY entrypoint.py /usr/local/bin/forge-entrypoint.py +COPY review.py /usr/local/bin/review.py RUN chmod +x /usr/local/bin/forge-entrypoint.py # Set working directory diff --git a/containers/entrypoint.py b/containers/entrypoint.py index ae737e8e..1ab6da97 100644 --- a/containers/entrypoint.py +++ b/containers/entrypoint.py @@ -19,9 +19,20 @@ import os import subprocess import sys +import time +from datetime import UTC, datetime from pathlib import Path from typing import Any +from review import ( + ReviewCycleData, + Verdict, + detect_review_md, + parse_review_config, + parse_verdict, + write_cycle_file, +) + # Configure logging log_level = os.environ.get("LOG_LEVEL", "INFO").upper() logging.basicConfig( @@ -31,10 +42,52 @@ ) logger = logging.getLogger(__name__) +# TTY detection for terminal progress display +_IS_TTY = sys.stdout.isatty() + + +def _print_review_progress( + cycle: int, + max_cycles: int, + verdict: str, + feedback: str, +) -> None: + """Print review cycle progress to terminal (TTY only). + + Displays progress in format: "Review cycle N/M: VERDICT - [truncated feedback]" + Only prints when stdout is a TTY (local/terminal mode). + + Args: + cycle: Current cycle number (1-indexed). + max_cycles: Maximum cycles allowed. + verdict: Review verdict (e.g., "approved", "rejected"). + feedback: Reviewer feedback text to truncate. + """ + if not _IS_TTY: + return + + # Build progress message + verdict_upper = verdict.upper() + message = f"Review cycle {cycle}/{max_cycles}: {verdict_upper}" + + # Append truncated feedback for rejections + if feedback: + max_feedback_len = 200 + if len(feedback) > max_feedback_len: + truncated = feedback[:max_feedback_len] + "..." + else: + truncated = feedback + message = f"{message} - {truncated}" + + # Print immediately (flush ensures display within 1 second of call) + print(message, flush=True) + + # Enable LangChain debug/verbose mode if requested if os.environ.get("LANGCHAIN_VERBOSE", "").lower() in ("true", "1", "yes"): try: from langchain_core.globals import set_debug, set_verbose + set_verbose(True) set_debug(True) logger.info("LangChain verbose/debug mode enabled") @@ -344,7 +397,9 @@ async def run_agent_task( trace_context: Workflow fields forwarded to Langfuse only. """ # Support both new (LLM_MODEL) and legacy (CLAUDE_MODEL) env var names - model_name = os.environ.get("LLM_MODEL") or os.environ.get("CLAUDE_MODEL", "claude-sonnet-4-5@20250929") + model_name = os.environ.get("LLM_MODEL") or os.environ.get( + "CLAUDE_MODEL", "claude-sonnet-4-5@20250929" + ) logger.info(f"Implementing task: {task_summary}") logger.info(f"Model: {model_name}") @@ -487,9 +542,7 @@ async def run_agent_task( # Run the agent (with Langfuse session context if enabled) initial_message = { - "messages": [ - {"role": "user", "content": f"Implement this task:\n\n{task_description}"} - ] + "messages": [{"role": "user", "content": f"Implement this task:\n\n{task_description}"}] } if langfuse_enabled: @@ -550,6 +603,305 @@ async def run_agent_task( return False +async def run_reviewer_agent( + workspace: Path, + review_instructions: str, + task_key: str, +) -> str: + """Run reviewer agent with review.md instructions. + + Args: + workspace: Path to the workspace directory. + review_instructions: Instructions from review.md body. + task_key: Jira task key for tracing. + + Returns: + The reviewer agent's output text. + """ + # Support both new (LLM_MODEL) and legacy (CLAUDE_MODEL) env var names + model_name = os.environ.get("LLM_MODEL") or os.environ.get( + "CLAUDE_MODEL", "claude-sonnet-4-5@20250929" + ) + logger.info(f"Running reviewer agent for {task_key}") + logger.info(f"Model: {model_name}") + + from deepagents import create_deep_agent + from deepagents.backends import LocalShellBackend + + # Check for API credentials + api_key = os.environ.get("ANTHROPIC_API_KEY") + vertex_project = os.environ.get("ANTHROPIC_VERTEX_PROJECT_ID") + + if not api_key and not vertex_project: + raise RuntimeError( + "No API credentials found (ANTHROPIC_API_KEY or ANTHROPIC_VERTEX_PROJECT_ID)" + ) + + # Create the agent with local shell backend (read-only access for review) + backend = LocalShellBackend( + root_dir=str(workspace), + inherit_env=True, + virtual_mode=False, + timeout=300, # 5 minutes for review + ) + + # Build reviewer system prompt + system_prompt = f"""You are a code reviewer agent. Your job is to review the implementation and provide a verdict. + +## Review Instructions +{review_instructions} + +## Verdict Format +After reviewing the code, you MUST output your verdict as either: +- APPROVED - if the implementation meets all requirements +- REJECTED - followed by your feedback if the implementation needs changes + +Example outputs: +- "The implementation looks good. APPROVED" +- "REJECTED: The function is missing error handling. Please add try/except blocks." + +Be specific in your feedback if rejecting.""" + + # Determine model type (Gemini vs Claude) + is_gemini = model_name.lower().startswith(("gemini", "models/gemini")) + + # Get max tokens from env (default 8192 for review) + max_tokens = int(os.environ.get("LLM_MAX_TOKENS", "8192")) + + if vertex_project: + if is_gemini: + from langchain_google_genai import ChatGoogleGenerativeAI + + model = ChatGoogleGenerativeAI( + model=model_name, + project=vertex_project, + location=os.environ.get("ANTHROPIC_VERTEX_REGION", "us-east5"), + vertexai=True, + max_output_tokens=max_tokens, + ) + else: + from langchain_google_vertexai.model_garden import ChatAnthropicVertex + + model = ChatAnthropicVertex( + model_name=model_name, + project=vertex_project, + location=os.environ.get("ANTHROPIC_VERTEX_REGION", "us-east5"), + max_tokens=max_tokens, + ) + else: + if is_gemini: + raise RuntimeError(f"Gemini model '{model_name}' requires Vertex AI credentials") + + from langchain_anthropic import ChatAnthropic + + model = ChatAnthropic( + model=model_name, + api_key=api_key, + max_tokens=max_tokens, + ) + + # Create reviewer agent (no skills needed for review) + agent = create_deep_agent( + model=model, + backend=backend, + system_prompt=system_prompt, + ) + + # Run the reviewer + initial_message = { + "messages": [ + { + "role": "user", + "content": "Please review the implementation in the workspace and provide your verdict.", + } + ] + } + + result = await agent.ainvoke(initial_message) + + # Extract output text from result + messages = result.get("messages", []) + if messages: + last_message = messages[-1] + content = getattr(last_message, "content", str(last_message)) + return content if isinstance(content, str) else str(content) + return "" + + +def load_conversation_history(workspace: Path, task_key: str) -> list[dict] | None: + """Load conversation history from .forge/history/{task_key}.json. + + Args: + workspace: Path to the workspace directory. + task_key: Jira task key to load history for. + + Returns: + List of message dicts or None if file doesn't exist. + """ + history_file = workspace / ".forge" / "history" / f"{task_key}.json" + if not history_file.exists(): + logger.warning(f"Conversation history not found: {history_file}") + return None + + try: + history_data = json.loads(history_file.read_text()) + return history_data.get("messages", []) + except Exception as e: + logger.warning(f"Failed to load conversation history: {e}") + return None + + +async def run_worker_with_feedback( + workspace: Path, + task_key: str, + task_summary: str, + task_description: str, + guardrails: str, + feedback: str, + previous_task_keys: list[str] | None = None, +) -> bool: + """Re-run worker agent with reviewer feedback injected. + + Args: + workspace: Path to the workspace directory. + task_key: Jira task key being implemented. + task_summary: Short task summary. + task_description: Detailed task description. + guardrails: Repository guidelines. + feedback: Reviewer feedback to inject. + previous_task_keys: List of previously implemented task keys. + + Returns: + True if agent completed successfully. + """ + # Inject reviewer feedback section into task description + feedback_section = f"\n\n## Reviewer Feedback\n\n{feedback}\n" + enhanced_description = task_description + feedback_section + + logger.info(f"Re-running worker with feedback for {task_key}") + + # Load conversation history for context + history = load_conversation_history(workspace, task_key) + if history: + logger.info(f"Loaded {len(history)} messages from conversation history") + + return await run_agent_task( + workspace=workspace, + task_key=task_key, + task_summary=task_summary, + task_description=enhanced_description, + guardrails=guardrails, + previous_task_keys=previous_task_keys, + ) + + +async def run_review_loop( + workspace: Path, + task_key: str, + task_summary: str, + task_description: str, + guardrails: str, + skill_name: str, + review_md_path: Path, + previous_task_keys: list[str] | None = None, +) -> bool: + """Run the review loop after initial skill execution. + + Args: + workspace: Path to the workspace directory. + task_key: Jira task key being implemented. + task_summary: Short task summary. + task_description: Detailed task description. + guardrails: Repository guidelines. + skill_name: Name of the skill being reviewed. + review_md_path: Path to the review.md file. + previous_task_keys: List of previously implemented task keys. + + Returns: + True if review loop completed successfully (approved or max retries). + """ + # Parse review configuration (SC-004, SC-005) + config = parse_review_config(review_md_path) + max_retries = config.max_retries + instructions = config.instructions + + logger.info(f"Starting review loop for {skill_name} (max_retries={max_retries})") + logger.info( + f"Review instructions: {instructions[:200]}..." + if len(instructions) > 200 + else f"Review instructions: {instructions}" + ) + + cycle = 0 + while cycle < max_retries: + cycle += 1 + cycle_start = time.perf_counter() + logger.info(f"Review cycle {cycle}/{max_retries}") + + # Run reviewer agent (SC-001) + try: + reviewer_output = await run_reviewer_agent( + workspace=workspace, + review_instructions=instructions, + task_key=task_key, + ) + except Exception as e: + logger.error(f"Reviewer agent failed: {e}") + # Treat reviewer failure as rejection + reviewer_output = f"REJECTED: Reviewer agent error: {e}" + + # Parse verdict (SC-002) + verdict, feedback = parse_verdict(reviewer_output) + elapsed = time.perf_counter() - cycle_start + timestamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Display terminal progress immediately (SC-011) + _print_review_progress(cycle, max_retries, verdict, feedback) + + logger.info(f"Review verdict: {verdict}, elapsed: {elapsed:.2f}s") + if feedback: + logger.info( + f"Feedback: {feedback[:200]}..." if len(feedback) > 200 else f"Feedback: {feedback}" + ) + + # Write cycle file (SC-007) + cycle_data = ReviewCycleData( + cycle=cycle, + max_cycles=max_retries, + verdict=verdict, + feedback=feedback, + skill=skill_name, + elapsed_seconds=elapsed, + timestamp=timestamp, + ) + write_cycle_file(workspace, skill_name, cycle_data) + + # On APPROVED: Exit loop successfully (SC-002) + if verdict == Verdict.APPROVED: + logger.info(f"Review approved on cycle {cycle}") + return True + + # On REJECTED with retries remaining (SC-003) + if cycle < max_retries: + logger.info(f"Retrying with feedback (attempt {cycle + 1}/{max_retries})") + worker_success = await run_worker_with_feedback( + workspace=workspace, + task_key=task_key, + task_summary=task_summary, + task_description=task_description, + guardrails=guardrails, + feedback=feedback, + previous_task_keys=previous_task_keys, + ) + if not worker_success: + logger.error(f"Worker retry failed on cycle {cycle}") + # Continue to next cycle anyway, let reviewer decide + + # Max retries exhausted - exit with success (BR-005) + logger.warning(f"Review loop exhausted max_retries ({max_retries}) for {skill_name}") + return True + + def main(): parser = argparse.ArgumentParser( description="Forge container entrypoint for AI code implementation" @@ -597,9 +949,11 @@ def main(): previous_task_keys = task_data.get("previous_task_keys", []) raw_trace_context = task_data.get("trace_context", {}) trace_context = raw_trace_context if isinstance(raw_trace_context, dict) else {} + skill_name = task_data.get("skill_name", "") elif args.task_summary and args.task_description: task_summary = args.task_summary task_description = args.task_description + skill_name = "" else: logger.error( "Task details required: use --task-file or --task-summary + --task-description" @@ -611,6 +965,10 @@ def main(): logger.error(f"Workspace not found: {workspace}") sys.exit(EXIT_CONFIG_ERROR) + # Get skill_name from env var if not in task data + if not skill_name: + skill_name = os.environ.get("FORGE_SKILL_NAME", "") + # Configure git for commits configure_git() @@ -646,16 +1004,49 @@ def main(): logger.error("Task implementation failed") sys.exit(EXIT_TASK_FAILED) + # Check for review.md and run review loop if it exists (SC-001, SC-010) + if skill_name: + # Skills are mounted at /skills/skill_0/, /skills/skill_1/, etc. + # Check all skill paths for the review.md + skills_base = Path("/skills") + review_md_path = detect_review_md(skill_name, task_key, skills_base) + + if review_md_path: + logger.info(f"Found review.md at {review_md_path}, starting review loop") + if not asyncio.run( + run_review_loop( + workspace=workspace, + task_key=task_key, + task_summary=task_summary, + task_description=task_description, + guardrails=guardrails, + skill_name=skill_name, + review_md_path=review_md_path, + previous_task_keys=previous_task_keys, + ) + ): + logger.error("Review loop failed") + sys.exit(EXIT_TASK_FAILED) + else: + logger.info(f"No review.md found for skill {skill_name}, skipping review loop") + else: + logger.info("No skill_name provided, skipping review loop") + # Ensure changes are committed (agent should have done this, but as fallback). # Skip if workspace is not a git repo — analysis tasks (RCA, reflection) write # artifacts to .forge/ without needing a commit. - is_git_repo = subprocess.run( - ["git", "rev-parse", "--is-inside-work-tree"], - cwd=workspace, - capture_output=True, - ).returncode == 0 + is_git_repo = ( + subprocess.run( + ["git", "rev-parse", "--is-inside-work-tree"], + cwd=workspace, + capture_output=True, + ).returncode + == 0 + ) if is_git_repo: - fallback_message = f"[{task_key}] {task_summary}\n\nAuto-committed by Forge container fallback." + fallback_message = ( + f"[{task_key}] {task_summary}\n\nAuto-committed by Forge container fallback." + ) if not git_commit(workspace, fallback_message): logger.error("Failed to commit changes") sys.exit(EXIT_TASK_FAILED) diff --git a/containers/review.py b/containers/review.py new file mode 100644 index 00000000..720b6946 --- /dev/null +++ b/containers/review.py @@ -0,0 +1,309 @@ +"""Review loop data structures for the container review engine. + +This module provides core data structures used by the review loop engine: + +- ReviewConfig: Configuration parsed from review.md frontmatter +- ReviewCycleData: Data captured for each review cycle iteration +- Verdict: Enum for review outcomes (APPROVED/REJECTED) +- parse_review_config: Parser for review.md YAML frontmatter +- detect_review_md: Locates review.md with project override precedence +- parse_verdict: Extracts verdict and feedback from review output text +- write_cycle_file: Writes review cycle data to JSON file +""" + +import json +import logging +import os +import re +from dataclasses import asdict, dataclass +from enum import StrEnum +from pathlib import Path + +import yaml + +logger = logging.getLogger(__name__) + +# Default max retries for review loops +DEFAULT_MAX_RETRIES = 3 + +# Environment variable for max retries override +ENV_MAX_RETRIES = "AUTO_REVIEW_MAX_RETRIES" + + +class Verdict(StrEnum): + """Review verdict indicating approval or rejection.""" + + APPROVED = "approved" + REJECTED = "rejected" + + +@dataclass +class ReviewConfig: + """Configuration for review loops parsed from review.md frontmatter. + + Attributes: + max_retries: Maximum number of retry attempts (default: 3). + instructions: Review instructions from the markdown body. + """ + + max_retries: int = DEFAULT_MAX_RETRIES + instructions: str = "" + + +@dataclass +class ReviewCycleData: + """Data captured for a single review cycle iteration. + + Attributes: + cycle: Current cycle number (1-indexed). + max_cycles: Maximum cycles allowed. + verdict: Review outcome ("approved" or "rejected"). + feedback: Reviewer feedback text. + skill: Name of the skill that performed the review. + elapsed_seconds: Time taken for this review cycle. + timestamp: ISO 8601 UTC timestamp of cycle completion. + """ + + cycle: int + max_cycles: int + verdict: str + feedback: str + skill: str + elapsed_seconds: float + timestamp: str + + +# Regex pattern for YAML frontmatter: --- at start, optional content, --- delimiter +# The frontmatter content between delimiters may be empty (just newlines) +_FRONTMATTER_PATTERN = re.compile( + r"\A---\s*\n(.*?)---\s*\n?(.*)", + re.DOTALL, +) + + +def parse_review_config(review_md_path: Path) -> ReviewConfig: + """Parse ReviewConfig from a review.md file with YAML frontmatter. + + The file format is: + ``` + --- + max_retries: 5 + --- + Review instructions here... + ``` + + Priority for max_retries: + 1. YAML frontmatter value + 2. AUTO_REVIEW_MAX_RETRIES environment variable + 3. Default value (3) + + If the file does not exist or YAML parsing fails, returns defaults + with a warning log (per BR-006). + + Args: + review_md_path: Path to the review.md file. + + Returns: + ReviewConfig with parsed values or defaults. + """ + # Get env var fallback for max_retries + env_max_retries: int | None = None + env_value = os.environ.get(ENV_MAX_RETRIES) + if env_value is not None: + try: + env_max_retries = int(env_value) + except ValueError: + logger.warning( + "Invalid %s value %r, ignoring", + ENV_MAX_RETRIES, + env_value, + ) + + # Default config to return on errors + default_max_retries = env_max_retries if env_max_retries is not None else DEFAULT_MAX_RETRIES + + if not review_md_path.exists(): + logger.warning("Review config not found at %s, using defaults", review_md_path) + return ReviewConfig(max_retries=default_max_retries, instructions="") + + try: + content = review_md_path.read_text(encoding="utf-8") + except OSError as e: + logger.warning("Failed to read %s: %s, using defaults", review_md_path, e) + return ReviewConfig(max_retries=default_max_retries, instructions="") + + # Try to parse frontmatter + match = _FRONTMATTER_PATTERN.match(content) + if not match: + # No frontmatter found - treat entire content as instructions + return ReviewConfig(max_retries=default_max_retries, instructions=content.strip()) + + frontmatter_raw = match.group(1) + instructions = match.group(2).strip() + + # Parse YAML frontmatter + try: + frontmatter = yaml.safe_load(frontmatter_raw) + except yaml.YAMLError as e: + logger.warning( + "Malformed YAML frontmatter in %s: %s, using defaults", + review_md_path, + e, + ) + return ReviewConfig(max_retries=default_max_retries, instructions=instructions) + + # Handle empty frontmatter + if frontmatter is None: + frontmatter = {} + + # Extract max_retries with fallback chain + max_retries = default_max_retries + if isinstance(frontmatter, dict) and "max_retries" in frontmatter: + fm_value = frontmatter["max_retries"] + if isinstance(fm_value, int): + max_retries = fm_value + else: + logger.warning( + "Invalid max_retries value %r in %s, using default", + fm_value, + review_md_path, + ) + + return ReviewConfig(max_retries=max_retries, instructions=instructions) + + +def detect_review_md(skill_name: str, ticket_key: str, skills_base: Path) -> Path | None: + """Detect the review.md file for a skill with project override precedence. + + Mirrors the precedence logic from src/forge/skills/resolver.py: + project-specific override wins over default. + + Search order: + 1. skills/{project}/{skill_name}/review.md (project override) + 2. skills/default/{skill_name}/review.md (default) + + Args: + skill_name: Name of the skill (e.g., "local-code-review"). + ticket_key: Jira ticket key (e.g., "AISOS-123") to extract project. + skills_base: Base path to the skills directory. + + Returns: + Path to review.md if found, None otherwise (SC-010). + """ + # Extract project key from ticket_key (e.g., "AISOS-123" -> "aisos") + project: str | None = None + if "-" in ticket_key: + project = ticket_key.split("-")[0].lower() + + # Check project override first (if project can be extracted) + if project: + project_path = skills_base / project / skill_name / "review.md" + if project_path.is_file(): + logger.debug( + "Found project override review.md: %s", + project_path, + ) + return project_path + + # Fall back to default + default_path = skills_base / "default" / skill_name / "review.md" + if default_path.is_file(): + logger.debug("Found default review.md: %s", default_path) + return default_path + + # No review.md found in either location (SC-010) + logger.debug( + "No review.md found for skill %r (ticket %s)", + skill_name, + ticket_key, + ) + return None + + +def parse_verdict(output_text: str) -> tuple[Verdict, str]: + """Extract verdict and feedback from review output text. + + Performs case-insensitive search for "APPROVED" and "REJECTED" markers. + When both markers are present, the first occurrence wins (checks APPROVED first). + + Args: + output_text: The raw output text from the review process. + + Returns: + Tuple of (Verdict, feedback_string): + - (Verdict.APPROVED, "") when APPROVED marker is found first + - (Verdict.REJECTED, feedback) when REJECTED marker is found first, + where feedback is the text following the marker + - (Verdict.REJECTED, "Verdict could not be parsed") when neither marker found + """ + # Normalize for case-insensitive matching + text_upper = output_text.upper() + + # Find positions of both markers (case-insensitive) + approved_pos = text_upper.find("APPROVED") + rejected_pos = text_upper.find("REJECTED") + + # Neither marker found + if approved_pos == -1 and rejected_pos == -1: + return (Verdict.REJECTED, "Verdict could not be parsed") + + # Determine which marker comes first + # If APPROVED found and (REJECTED not found OR APPROVED comes first) + if approved_pos != -1 and (rejected_pos == -1 or approved_pos < rejected_pos): + return (Verdict.APPROVED, "") + + # REJECTED marker found first (or only REJECTED found) + # Extract feedback from text following the REJECTED marker + # Skip past "REJECTED" keyword (8 characters) + feedback_start = rejected_pos + len("REJECTED") + feedback = output_text[feedback_start:].strip() + + return (Verdict.REJECTED, feedback) + + +def write_cycle_file(workspace: Path, step_name: str, cycle_data: ReviewCycleData) -> None: + """Write review cycle data to a JSON file. + + Creates the directory .forge/{step_name}/ if it doesn't exist, + and writes review_cycle_N.json where N is the cycle number. + + The JSON file contains all ReviewCycleData fields with proper formatting: + - cycle: Current cycle number (1-indexed) + - max_cycles: Maximum cycles allowed + - verdict: Lowercase string ("approved" or "rejected") + - feedback: Reviewer feedback text + - skill: Name of the skill that performed the review + - elapsed_seconds: Time taken for this review cycle + - timestamp: ISO 8601 UTC format timestamp + + Args: + workspace: Path to the workspace root directory. + step_name: Name of the step (used as subdirectory name). + cycle_data: ReviewCycleData instance to serialize. + """ + # Create .forge/{step_name}/ directory if it doesn't exist + output_dir = workspace / ".forge" / step_name + output_dir.mkdir(parents=True, exist_ok=True) + + # Build output path: .forge/{step_name}/review_cycle_N.json + output_file = output_dir / f"review_cycle_{cycle_data.cycle}.json" + + # Convert dataclass to dict + data = asdict(cycle_data) + + # Ensure verdict is lowercase string (handle Verdict enum or string) + verdict_value = data["verdict"] + if hasattr(verdict_value, "value"): + # It's a Verdict enum + data["verdict"] = verdict_value.value + else: + # It's already a string, ensure lowercase + data["verdict"] = str(verdict_value).lower() + + # Write JSON with proper formatting (2-space indent for readability) + output_file.write_text( + json.dumps(data, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + logger.debug("Wrote review cycle file: %s", output_file) diff --git a/docs/guide/auto-review.md b/docs/guide/auto-review.md new file mode 100644 index 00000000..eb8b8df0 --- /dev/null +++ b/docs/guide/auto-review.md @@ -0,0 +1,419 @@ +# Auto-Review + +Auto-review is a quality gate that runs after skill execution. It spawns a reviewer agent to evaluate the skill's output and triggers retries with feedback when the result doesn't meet quality criteria. + +## Overview + +When a skill has a `review.md` file in its directory, Forge automatically invokes a reviewer agent after the skill completes. The reviewer evaluates the output according to instructions in `review.md` and issues a verdict: + +- **APPROVED** — The output meets quality criteria; workflow continues +- **REJECTED** — The output has issues; skill re-runs with feedback injected + +```mermaid +flowchart LR + A[Skill Executes] --> B{review.md exists?} + B -->|No| C[Continue Workflow] + B -->|Yes| D[Spawn Reviewer Agent] + D --> E{Verdict?} + E -->|APPROVED| C + E -->|REJECTED| F{Retries left?} + F -->|Yes| G[Re-run Skill with Feedback] + G --> D + F -->|No| C +``` + +### When to Use Auto-Review + +Auto-review is useful when: + +- A skill produces artifacts that need quality validation before proceeding +- You want to catch common mistakes (missing tests, debug code, documentation gaps) without human intervention +- You need consistent quality standards across all executions of a skill + +Auto-review is **not** a replacement for human code review — it catches machine-detectable issues early in the pipeline. + +## Quick Start + +Create a `review.md` file in your skill directory: + +``` +skills/ +└── default/ + └── implement-task/ + ├── SKILL.md + └── review.md ← Reviewer configuration +``` + +Minimal `review.md`: + +```markdown +--- +max_retries: 3 +--- + +# Quality Check + +Verify the implementation: + +1. All new public functions have tests +2. No debug print statements in committed code +3. Error handling uses specific exception types + +Output APPROVED if all criteria pass, or REJECTED with specific feedback. +``` + +That's it. The next time `implement-task` runs, the reviewer will evaluate its output. + +## Configuration + +### `max_retries` in Frontmatter + +The `max_retries` field in YAML frontmatter controls how many times a skill can retry after a REJECTED verdict: + +```markdown +--- +max_retries: 5 +--- + +Review instructions here... +``` + +### `AUTO_REVIEW_MAX_RETRIES` Environment Variable + +Set a global default for all skills that don't specify `max_retries` in their `review.md`: + +```bash +# In .env or environment +AUTO_REVIEW_MAX_RETRIES=5 +``` + +### Priority Chain + +When determining retry limit, Forge checks in order: + +| Priority | Source | Example | +|----------|--------|---------| +| 1 (highest) | Frontmatter `max_retries` | `max_retries: 5` in review.md | +| 2 | Environment variable | `AUTO_REVIEW_MAX_RETRIES=5` | +| 3 (lowest) | Built-in default | `3` | + +### Default Value + +If neither frontmatter nor environment variable is set, the default `max_retries` is **3**. + +## Writing Review Instructions + +The prose body of `review.md` (everything after the `---` closing delimiter) becomes the reviewer agent's instructions. Write clear, actionable criteria. + +### Effective Review Instructions + +**Do:** + +- Define explicit pass/fail criteria for each quality dimension +- Include examples of acceptable vs. flagged patterns +- Specify the expected output format (APPROVED/REJECTED markers) +- Reference specific files or artifacts the reviewer should examine + +**Don't:** + +- Write vague instructions ("make sure the code is good") +- Include implementation details unrelated to review +- Duplicate instructions from the skill's SKILL.md + +### Example: Comprehensive Review + +```markdown +--- +max_retries: 3 +--- + +# Implementation Review + +Review code changes via `git diff origin/main...HEAD`. + +## Criteria + +### Test Coverage +- Public functions must have tests +- Flag: new public function without corresponding test + +### Error Handling +- No bare `except:` or empty `catch {}` +- Flag: generic error swallowing + +### Documentation +- Public APIs need docstrings +- Flag: public function without documentation + +## Verdict + +Output exactly one: +- `APPROVED` — all criteria pass +- `REJECTED` — followed by specific, actionable feedback +``` + +See [skills/default/implement-task/review.md](https://github.com/your-org/forge/blob/main/skills/default/implement-task/review.md) for a complete example. + +## Verdict Protocol + +The reviewer must output exactly one verdict marker. Forge scans the output text for these markers. + +### APPROVED + +Use when all criteria pass: + +``` +APPROVED +``` + +Or with optional notes: + +``` +APPROVED + +Notes: +- Consider adding edge case tests (non-blocking) +``` + +### REJECTED + +Use when any criterion fails. Always include specific, actionable feedback: + +``` +REJECTED + +Issues: +- [Test Coverage] src/handler.py:42 — `process_data()` is public but has no test +- [Error Handling] pkg/db.go:87 — error from `Query()` discarded with `_` + +Required changes: +1. Add test for `process_data()` in `test_handler.py` +2. Handle or propagate error from `Query()` at line 87 +``` + +### Verdict Detection + +Forge performs case-insensitive substring matching for `APPROVED` and `REJECTED`: + +- First marker found wins (if both appear, the one occurring first determines verdict) +- Marker can appear anywhere in the output (start, middle, end) +- Surrounding text is captured as feedback for REJECTED verdicts + +### No Verdict Found + +If neither marker appears, Forge treats the output as **REJECTED** with the feedback: + +``` +Verdict could not be parsed +``` + +This prevents silent failures from malformed reviewer output. + +## Per-Project Overrides + +Review configuration follows the same precedence rules as skill files. See [Authoring Skills](../skills/authoring.md) for the general pattern. + +### Resolution Order + +For a ticket `MYPROJECT-123` running skill `implement-task`: + +| Priority | Path | Description | +|----------|------|-------------| +| 1 (highest) | `skills/myproject/implement-task/review.md` | Project override | +| 2 | `skills/default/implement-task/review.md` | Default configuration | + +### Override Behavior + +- If the project override exists, it **completely replaces** the default (no merging) +- If neither exists, the skill runs **without a reviewer** +- Project key is extracted from ticket key and lowercased (`MYPROJECT-123` → `myproject`) + +### Example: Project-Specific Review + +``` +skills/ +├── default/ +│ └── implement-task/ +│ └── review.md ← Generic criteria for all projects +└── myproject/ + └── implement-task/ + └── review.md ← Stricter criteria for myproject +``` + +The `myproject` override might add additional criteria: + +```markdown +--- +max_retries: 5 +--- + +# MyProject Code Review + +All default criteria plus: + +## MyProject-Specific + +### Performance +- No N+1 database queries +- Bulk operations must use batching + +### Security +- All user input must be validated +- No raw SQL queries (use ORM) +``` + +## Observability + +### Review Cycle Files + +Each review cycle writes a JSON file to the workspace: + +``` +.forge/{step-name}/review_cycle_N.json +``` + +Where: +- `{step-name}` is the workflow step (e.g., `implement_task`, `local_review`) +- `N` is the cycle number (1-indexed) + +**Example path:** `.forge/implement_task/review_cycle_1.json` + +### File Contents + +```json +{ + "cycle": 1, + "max_cycles": 3, + "verdict": "rejected", + "feedback": "Missing tests for process_data() function", + "skill": "implement-task", + "elapsed_seconds": 12.5, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `cycle` | int | Current cycle number (1-indexed) | +| `max_cycles` | int | Maximum allowed cycles | +| `verdict` | string | `"approved"` or `"rejected"` | +| `feedback` | string | Reviewer feedback text (empty for approved) | +| `skill` | string | Name of the skill being reviewed | +| `elapsed_seconds` | float | Time spent on this review cycle | +| `timestamp` | string | ISO 8601 UTC timestamp | + +### Prometheus Metrics + +Forge exposes these metrics at `/metrics`: + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `forge_review_cycles_total` | Counter | `skill`, `step` | Total review cycles detected | +| `forge_review_verdicts_total` | Counter | `skill`, `step`, `verdict` | Verdicts by outcome | +| `forge_review_duration_seconds` | Histogram | `skill`, `step` | Review cycle duration | + +### Terminal Progress + +In local/terminal mode, Forge prints progress during review loops: + +``` +Review cycle 1/3: REJECTED - Missing tests for process_data() function... +Review cycle 2/3: REJECTED - Tests added but error handling still missing... +Review cycle 3/3: APPROVED +``` + +Feedback is truncated to 200 characters in terminal output. + +## Troubleshooting + +### Reviewer Times Out + +**Symptom:** Review cycle takes > 5 minutes and fails. + +**Causes:** +- Reviewer instructions are too complex or ambiguous +- Large codebase with many files to examine +- Reviewer agent loops or gets stuck + +**Solutions:** +- Simplify review criteria to focus on specific, checkable items +- Use `git diff` instead of reading entire files +- Add explicit instructions to limit scope + +### Verdict Parsing Failures + +**Symptom:** Every cycle shows "Verdict could not be parsed" even though reviewer output looks correct. + +**Causes:** +- Marker is misspelled (`APROVED`, `REJECT`) +- Marker appears inside a code block (not detected) +- Non-ASCII characters in marker + +**Solutions:** +- Ensure reviewer outputs exactly `APPROVED` or `REJECTED` +- Add explicit instructions: "Output the word APPROVED or REJECTED on its own line" +- Check reviewer output in cycle JSON files + +### Endless Loops (Max Retries Hit) + +**Symptom:** Skill always exhausts `max_retries` without passing. + +**Causes:** +- Review criteria are impossible to satisfy automatically +- Feedback is not actionable by the skill +- Skill doesn't understand the feedback format + +**Solutions:** +- Review the feedback in cycle JSON files — is it specific enough? +- Simplify criteria to things the skill can actually fix +- Increase `max_retries` if issues are intermittent +- Consider whether the criteria should cause rejection at all + +### review.md Not Detected + +**Symptom:** Skill runs without review even though `review.md` exists. + +**Causes:** +- File is in wrong directory or has wrong name +- File is named `Review.md` (wrong case) +- Project override path is incorrect + +**Solutions:** +- Verify exact path: `skills/{project}/{skill-name}/review.md` +- Check file is named `review.md` (lowercase) +- Verify project key extraction (`PROJ-123` → `proj`) + +### YAML Frontmatter Errors + +**Symptom:** `max_retries` setting is ignored. + +**Causes:** +- Frontmatter is malformed YAML +- `max_retries` value is not an integer +- Missing `---` delimiters + +**Solutions:** +- Validate YAML syntax +- Use integer value: `max_retries: 3` (not `"3"` or `three`) +- Ensure file starts with `---` on first line +- Check worker logs for "Malformed YAML" warnings + +### Feedback Not Injected on Retry + +**Symptom:** Skill re-runs but doesn't see previous feedback. + +**Causes:** +- Conversation history file missing or corrupted +- Feedback format not recognized by skill + +**Solutions:** +- Check `.forge/history/{task_key}.json` exists +- Verify "## Reviewer Feedback" section appears in retry prompt +- Review skill's handling of feedback sections + +## Related Documentation + +- [Authoring Skills](../skills/authoring.md) — General skill authoring guide +- [review.md Schema](../reference/review-md-schema.md) — Complete file format specification +- [Feature Workflow](feature-workflow.md) — How auto-review fits in the feature pipeline +- [Bug Workflow](bug-workflow.md) — How auto-review fits in the bug pipeline diff --git a/docs/reference/config.md b/docs/reference/config.md index 72f94b5d..34dd861a 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -93,6 +93,16 @@ Use these to skip the Jira project property requirement during local development | `CONTAINER_MEMORY_LIMIT` | Memory limit for task containers (default: `4g`) | | `CONTAINER_CPU_LIMIT` | CPU limit for task containers (default: `2`) | +## Auto-Review + +Settings for the automatic review loop that runs after skill execution. See the [Auto-Review Guide](../guide/auto-review.md) for details. + +| Variable | Default | Description | +|----------|---------|-------------| +| `AUTO_REVIEW_MAX_RETRIES` | `3` | Default maximum retry attempts when a skill's `review.md` doesn't specify `max_retries` | +| `AUTO_REVIEW_POLL_INTERVAL` | `5.0` | Polling interval in seconds for detecting review cycle files during container execution | +| `AUTO_REVIEW_RECORD_POLLED_FILES` | (none) | Recording mode for polled review cycle files: `log` logs cycle data at INFO level, `copy` copies files to recording directory | + ## Observability ### Langfuse Tracing diff --git a/docs/reference/review-md-schema.md b/docs/reference/review-md-schema.md new file mode 100644 index 00000000..49b6afd8 --- /dev/null +++ b/docs/reference/review-md-schema.md @@ -0,0 +1,228 @@ +# review.md Schema + +This document specifies the file format and validation rules for `review.md` files used to configure skill-specific review loops. + +## Overview + +A `review.md` file configures how an AI reviewer evaluates the output of a skill before proceeding. It specifies retry limits and provides review instructions that guide the reviewer's analysis. + +## File Location + +### Naming Convention + +The file must be named exactly `review.md` (lowercase) and placed alongside `SKILL.md` in a skill directory: + +``` +skills/ +├── default/ +│ └── local-code-review/ +│ ├── SKILL.md # Skill definition +│ └── review.md # Review configuration +└── myproject/ + └── local-code-review/ + ├── SKILL.md # Project-specific skill override + └── review.md # Project-specific review override +``` + +### Override Precedence + +Review configuration follows the same precedence rules as skills: + +| Priority | Path | Description | +|----------|------|-------------| +| 1 (highest) | `skills/{project}/{skill-name}/review.md` | Project-specific override | +| 2 | `skills/default/{skill-name}/review.md` | Default configuration | + +The project key is extracted from the Jira ticket key and lowercased. For ticket `MYPROJECT-123`, Forge checks `skills/myproject/` first. + +**Examples:** + +- Ticket `AISOS-456` with skill `local-code-review`: + 1. Check `skills/aisos/local-code-review/review.md` + 2. Fall back to `skills/default/local-code-review/review.md` + +- If neither path exists, the skill runs **without a reviewer**. + +## File Format + +The `review.md` file uses YAML frontmatter followed by Markdown prose, consistent with `SKILL.md` files: + +```markdown +--- +max_retries: 5 +--- + +# Review Instructions + +Verify that the implementation meets all acceptance criteria... +``` + +### YAML Frontmatter Schema + +The frontmatter section is delimited by `---` markers and contains YAML key-value pairs: + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| `max_retries` | `int` | No | `3` | Maximum retry attempts after a REJECTED verdict | + +**Frontmatter rules:** + +- The opening `---` must be the first line of the file (no leading whitespace or blank lines) +- The closing `---` must appear on its own line +- Unknown fields are silently ignored (forward compatibility) +- Field values must match the expected type; invalid types trigger a warning and use defaults + +**Priority for `max_retries`:** + +1. Frontmatter value (if valid integer) +2. `AUTO_REVIEW_MAX_RETRIES` environment variable (if set and valid) +3. Built-in default: `3` + +### Prose Body + +Everything after the closing `---` delimiter is treated as reviewer instructions. This Markdown content is passed to the reviewer agent as its system prompt. + +**Best practices for instructions:** + +- Define what constitutes approval vs. rejection +- Specify quality criteria and acceptance thresholds +- Reference any artifacts the reviewer should examine (e.g., `.forge/` files, test output) +- Keep instructions focused on review criteria, not implementation details + +**Example:** + +```markdown +--- +max_retries: 3 +--- + +# Code Review Checklist + +Evaluate the implementation against these criteria: + +## Required (reject if missing) +- [ ] All acceptance criteria from the task are addressed +- [ ] No obvious security vulnerabilities introduced +- [ ] Tests cover the new functionality + +## Recommended (note but don't reject) +- [ ] Code follows existing patterns +- [ ] Documentation updated if needed + +## Output Format + +End your review with exactly one of: +- `APPROVED` — all required criteria pass +- `REJECTED` — followed by specific feedback for the implementer +``` + +## Edge Cases + +### Empty File + +A `review.md` file with zero bytes or only whitespace: + +- Reviewer spawns with **no instructions** (empty prompt) +- Uses default `max_retries: 3` (or env var if set) + +### Frontmatter-Only File + +A file with frontmatter but no prose body: + +```markdown +--- +max_retries: 5 +--- +``` + +- Reviewer spawns with **no instructions** (empty prompt) +- Uses the specified `max_retries` value + +### Missing Frontmatter + +A file without `---` delimiters: + +```markdown +Review the code for bugs and style issues. +``` + +- Entire file content becomes reviewer instructions +- Uses default `max_retries: 3` (or env var if set) + +### Malformed YAML + +If the frontmatter contains invalid YAML: + +```markdown +--- +max_retries: "not a number" +--- + +Instructions here... +``` + +- A warning is logged +- Default `max_retries` is used +- Prose body is still extracted and used as instructions + +### No review.md Found + +If no `review.md` exists in either the project or default skill directory: + +- **No reviewer is spawned** — the skill output is accepted without review +- This is the expected behavior for skills that don't require review + +## Review Cycle Output + +Each review cycle writes its results to: + +``` +.forge/{step-name}/review_cycle_N.json +``` + +Where: +- `{step-name}` is the workflow step (e.g., `implement_task`, `local_code_review`) +- `N` is the 1-indexed cycle number + +**Example:** `.forge/implement_task/review_cycle_1.json` + +**JSON schema:** + +```json +{ + "cycle": 1, + "max_cycles": 3, + "verdict": "rejected", + "feedback": "Missing test coverage for edge case...", + "skill": "local-code-review", + "elapsed_seconds": 45.2, + "timestamp": "2024-01-15T10:30:00Z" +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `cycle` | `int` | Current cycle number (1-indexed) | +| `max_cycles` | `int` | Maximum cycles allowed (from `max_retries + 1`) | +| `verdict` | `string` | `"approved"` or `"rejected"` | +| `feedback` | `string` | Reviewer feedback (empty string if approved) | +| `skill` | `string` | Name of the skill that performed the review | +| `elapsed_seconds` | `float` | Time taken for this review cycle | +| `timestamp` | `string` | ISO 8601 UTC timestamp of cycle completion | + +## Validation Summary + +| Condition | Behavior | +|-----------|----------| +| File missing | No reviewer spawned | +| File empty | Reviewer spawns with no instructions, default retries | +| No frontmatter | Entire content used as instructions, default retries | +| Empty frontmatter | Prose used as instructions, default retries | +| Malformed YAML | Warning logged, prose used, default retries | +| Invalid `max_retries` type | Warning logged, default used | +| Unknown frontmatter fields | Silently ignored | + +## See Also + +- [Authoring Skills](../skills/authoring.md) — how to create and customize skills +- [Configuration](config.md) — environment variables including `AUTO_REVIEW_MAX_RETRIES` diff --git a/docs/skills/authoring.md b/docs/skills/authoring.md index bf08bd6c..6dcb86e7 100644 --- a/docs/skills/authoring.md +++ b/docs/skills/authoring.md @@ -102,6 +102,22 @@ Write `.forge/fix-plan.md` with: 3. Proposed fix approach ``` +## Auto-Review + +Skills can include a `review.md` file alongside `SKILL.md` to enable automatic quality gates. When present, Forge spawns a reviewer agent after the skill completes, evaluating the output against criteria you define. + +``` +skills/ +└── myteam/ + └── implement-task/ + ├── SKILL.md + └── review.md ← reviewer instructions +``` + +The reviewer issues APPROVED or REJECTED verdicts. On rejection, the skill re-runs with feedback injected — up to `max_retries` attempts (default: 3). + +See the [Auto-Review Guide](../guide/auto-review.md) for configuration options and writing effective review instructions. + ## Using your skills with Forge See [Customize Forge for your project](../dev/contributing.md#customize-forge-for-your-project) for how to point a Jira project at your skills repo using `forge project-setup` and the skill installer. diff --git a/docs/skills/defaults.md b/docs/skills/defaults.md index b2a9f83f..edeb380b 100644 --- a/docs/skills/defaults.md +++ b/docs/skills/defaults.md @@ -50,6 +50,8 @@ Drives the code implementation agent running inside an ephemeral container. **Scope:** Given a task file at `.forge/task.json`, implement the code, run tests, and commit. No external network access. +**Auto-review:** Includes `review.md` that checks test coverage, error handling, documentation, and debug code before PR creation. See [Auto-Review Guide](../guide/auto-review.md). + --- ### `local-code-review` @@ -58,6 +60,8 @@ Reviews the implementation diff against `main` before PR creation. **Focus:** Breaking changes, test failures, security issues, and spec mismatches. Up to 2 fix passes. +**Auto-review:** Includes `review.md` that validates test suite execution and checks for breaking changes, security issues, and spec alignment. See [Auto-Review Guide](../guide/auto-review.md). + --- ### `analyze-ci` diff --git a/docs/skills/index.md b/docs/skills/index.md index 875c8c06..b3f55b90 100644 --- a/docs/skills/index.md +++ b/docs/skills/index.md @@ -40,7 +40,8 @@ skills/ │ └── review-code/ └── {project}/ # Per-project overrides (Jira key, lowercase) └── {skill-name}/ - └── SKILL.md + ├── SKILL.md + └── review.md # Optional auto-review config ``` ## Available Skills @@ -59,6 +60,12 @@ skills/ | `review-code` | AI PR review against spec | | `analyze-bug` | Bug RCA generation | +### Optional Files + +| File | Purpose | +|------|---------| +| `review.md` | Auto-review configuration — when present, Forge spawns a reviewer agent after skill execution. See [Auto-Review Guide](../guide/auto-review.md). | + ## The Primary Way to Contribute Writing a skill set for your team's stack is the fastest way to make Forge work better for your project — and to share that knowledge with others using similar tooling. diff --git a/pyproject.toml b/pyproject.toml index 2c0875a8..dcf7d8af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "opentelemetry-exporter-otlp>=1.27.0", "md-to-adf>=1.1.0", "pyyaml>=6.0.0", + "aiofiles>=24.1.0", ] [project.optional-dependencies] diff --git a/skills/aisos/implement-task/review.md b/skills/aisos/implement-task/review.md new file mode 100644 index 00000000..65d9ec9d --- /dev/null +++ b/skills/aisos/implement-task/review.md @@ -0,0 +1,126 @@ +--- +max_retries: 2 +--- + +# Implementation Review — Forge/AISOS Override + +This is a project-specific override for the Forge codebase. It inherits from the default review criteria in `skills/default/implement-task/review.md` and adds Forge-specific requirements. + +> **For skill authors**: This file demonstrates per-project override patterns. Copy this structure to create overrides for your own project by placing it in `skills/{project-key-lowercase}/implement-task/review.md`. + +## Base Criteria (Inherited) + +Apply all criteria from the default `review.md`: + +1. **Test Coverage** — Public functions must have tests +2. **Error Handling** — Explicit exception handling, no bare `except:` +3. **Documentation** — Public APIs need docstrings +4. **No Debug Code** — No stray `print()`, TODOs, or commented-out code + +## Forge-Specific Criteria + +These project-specific criteria extend the defaults for the Forge/AISOS codebase. + +### 5. Testing with pytest + +All tests must use pytest, not unittest or generic "run tests" approaches. + +- Test files must be in `tests/` and match `test_*.py` naming +- Use pytest fixtures for setup, not `setUp()` methods +- Use `pytest.raises()` for exception testing, not try/except in tests +- Parameterized tests use `@pytest.mark.parametrize`, not loops +- Async tests use `pytest-asyncio` with `@pytest.mark.asyncio` + +**Flag if**: `unittest.TestCase` subclasses, `self.assertEqual()` patterns, or missing pytest markers for async tests. + +### 6. Type Hints on Public Functions + +All public functions and methods must have type annotations. + +- Return types are required (use `-> None` for procedures) +- Use `X | None` instead of `Optional[X]` (PEP 604) +- Use `list[T]` and `dict[K, V]` instead of `List` and `Dict` from typing +- Complex types should use `TypeAlias` or `TypedDict` for clarity + +**Flag if**: Public function missing parameter types or return type annotation. + +**Acceptable**: Private functions (prefixed with `_`) without full annotations. + +### 7. Logging via logging Module + +Use the `logging` module for all diagnostic output. Never use `print()` for logging. + +- Import pattern: `logger = logging.getLogger(__name__)` +- Use appropriate log levels: `debug`, `info`, `warning`, `error`, `exception` +- Exception logging must use `logger.exception()` or pass `exc_info=True` +- Log messages should be lazy-formatted: `logger.info("Processing %s", item)` not f-strings + +**Flag if**: `print()` calls for diagnostic output, f-string log messages, exceptions logged without stack traces. + +**Acceptable**: `print()` in CLI entrypoints for user-facing output, test assertions. + +### 8. Async/Await Patterns + +Code using asyncio must follow proper async patterns. + +- Async functions must be awaited at call sites +- Use `asyncio.gather()` for concurrent operations, not sequential awaits in loops +- Context managers for async resources use `async with` +- No blocking I/O (`time.sleep()`, sync HTTP calls) inside async functions +- Use `contextlib.suppress()` instead of try/except/pass patterns + +**Flag if**: +- `await` inside a for-loop when operations are independent (should use `gather()`) +- Sync `sleep()` or blocking calls in async context +- Missing `await` on coroutine calls +- `asyncio.run()` called from within an async context + +**Acceptable**: Sequential awaits when operations have dependencies. + +## Verdict + +After reviewing all criteria (base + Forge-specific), output exactly one marker: + +``` +APPROVED +``` + +Use when all criteria pass or only minor issues exist that don't block merge. + +``` +REJECTED +``` + +Use when any criterion fails with blocking issues. + +## Feedback Format + +On rejection, include the criterion number and name for Forge-specific violations: + +``` +REJECTED + +Issues: +- [Criterion: Test Coverage] src/forge/api/handler.py:42 — `process_webhook()` is public but has no test +- [Criterion: Type Hints] src/forge/models/task.py:15 — `parse_task()` missing return type annotation +- [Criterion: Logging] src/forge/worker.py:87 — uses `print()` for error output instead of logger +- [Criterion: Async/Await] src/forge/queue/consumer.py:32 — sequential awaits in loop should use `asyncio.gather()` + +Required changes: +1. Add test for `process_webhook()` in `tests/unit/test_handler.py` +2. Add `-> Task` return type to `parse_task()` function signature +3. Replace `print(f"Error: {e}")` with `logger.error("Error: %s", e)` +4. Refactor loop to: `await asyncio.gather(*[process(item) for item in items])` +``` + +## Override Configuration + +This file demonstrates per-project customization: + +| Setting | Default | This Override | Rationale | +|---------|---------|---------------|-----------| +| `max_retries` | 3 | 2 | Forge CI is fast; 2 retries is sufficient | +| Test framework | generic | pytest | Forge uses pytest exclusively | +| Type hints | optional | required | Forge enforces mypy strict mode | +| Logging | generic | logging module | Forge uses structured logging | +| Async patterns | n/a | strict | Forge uses asyncio throughout | diff --git a/skills/default/implement-task/review.md b/skills/default/implement-task/review.md new file mode 100644 index 00000000..04a537e6 --- /dev/null +++ b/skills/default/implement-task/review.md @@ -0,0 +1,96 @@ +--- +max_retries: 3 +--- + +# Implementation Review + +Review the code changes for quality, completeness, and adherence to project standards. Run `git diff origin/main...HEAD` to see all changes on this branch. + +## Review Criteria + +Evaluate the diff against each criterion. Flag violations; do not flag items that pass. + +### 1. Test Coverage + +New public functions, methods, or exported symbols must have corresponding tests. + +- Public function added without test? Flag it. +- Existing test file updated to cover new behavior? Passes. +- Internal/private helpers without direct tests are acceptable if exercised by public API tests. + +### 2. Error Handling + +Errors must be handled explicitly with specific exception types or error values. + +- Bare `except:` (Python), empty `catch {}` (JS/TS), or `_ =>` discarding errors (Rust)? Flag it. +- Generic catches that log and re-raise, or catch-all with explicit handling? Acceptable. +- Go: Errors must be checked, not discarded with `_`. + +### 3. Documentation + +Public APIs must have documentation. + +- Public function/method without docstring, JSDoc, GoDoc, or rustdoc? Flag it. +- Internal/private symbols without docs? Acceptable. +- Complex logic without inline comments explaining the "why"? Consider flagging. + +### 4. No Debug Code + +Committed code must not contain debug artifacts. + +- `print()`, `console.log()`, `fmt.Println()` for debugging (not intentional logging)? Flag it. +- `TODO`, `FIXME`, `XXX`, `HACK` comments in new code? Flag it. +- Commented-out code blocks? Flag it. +- Intentional logging via proper logger frameworks? Acceptable. + +## Verdict + +After reviewing all criteria, output exactly one of these markers: + +``` +APPROVED +``` + +Use when all criteria pass or only minor issues exist that don't block merge. + +``` +REJECTED +``` + +Use when any criterion fails with blocking issues. + +## Feedback Format + +On rejection, provide structured feedback for each violation: + +``` +REJECTED + +Issues: +- [Criterion: Test Coverage] file.py:42 — `process_data()` is public but has no test +- [Criterion: Error Handling] handler.go:87 — error from `db.Query()` discarded with `_` +- [Criterion: No Debug Code] utils.ts:15 — contains `console.log()` debug statement +- [Criterion: Documentation] api.rs:23 — public function `calculate_score` lacks rustdoc + +Required changes: +1. Add test for `process_data()` in `test_file.py` +2. Handle or propagate error from `db.Query()` at line 87 +3. Remove `console.log()` or replace with proper logger +4. Add documentation comment for `calculate_score` +``` + +On approval with optional suggestions: + +``` +APPROVED + +Notes: +- Consider adding edge case tests for `validate_input()` (non-blocking) +``` + +## Important + +- Be specific: include file path and line number for every issue +- Be actionable: state exactly what needs to change +- Be fair: don't flag acceptable patterns listed above +- Stay language-agnostic: apply criteria to any stack (Python, Go, Node.js, Rust, etc.) diff --git a/skills/default/local-code-review/review.md b/skills/default/local-code-review/review.md new file mode 100644 index 00000000..3e7c1bbf --- /dev/null +++ b/skills/default/local-code-review/review.md @@ -0,0 +1,131 @@ +--- +max_retries: 2 +--- + +# Local Code Review Quality Gate + +This review runs after the local-code-review skill completes. It validates the changes are ready for PR creation by checking for breaking changes, test failures, security issues, and spec alignment. + +## Before Reviewing + +Run the project's test suite to verify all tests pass locally. + +### Detecting the Test Command + +Check for a test command in the following order: + +1. **pyproject.toml** — look for `[tool.pytest]` or `[project.scripts]` with test entry + - Default: `uv run pytest` or `pytest` +2. **package.json** — look for `scripts.test` + - Default: `npm test` +3. **Makefile** — look for `test:` target + - Default: `make test` +4. **Cargo.toml** — Rust project + - Default: `cargo test` +5. **go.mod** — Go project + - Default: `go test ./...` + +Run the detected test command. If tests fail, this review must be `REJECTED`. + +## Review Criteria + +Evaluate the diff (`git diff origin/main...HEAD`) against each criterion. Flag violations; do not flag items that pass. + +### 1. Breaking Changes + +API or interface changes must not break existing consumers without a migration path. + +- Public function signature changed (parameters added/removed, types changed) without deprecation or migration? Flag it. +- Public class/struct fields removed or renamed without migration? Flag it. +- Exported constant or enum value removed? Flag it. +- Internal/private changes that don't affect the public API? Acceptable. +- Changes with documented migration in commit message or changelog? Acceptable. + +### 2. Test Failures + +All tests must pass locally before PR creation. + +- Test suite executed and any test failed? Flag it with the failing test name(s). +- Test suite not executed (skipped or couldn't run)? Flag it — tests must run. +- All tests pass? Acceptable. + +### 3. Security Issues + +Code must not introduce security vulnerabilities. + +- **Hardcoded secrets**: API keys, passwords, tokens, or credentials in source code? Flag it. +- **SQL injection**: String concatenation or f-strings in SQL queries instead of parameterized queries? Flag it. +- **Path traversal**: User input used directly in file paths without validation or sanitization? Flag it. +- **Shell injection**: User input passed to shell commands without proper escaping? Flag it. +- Environment variables, config files, or secret managers for credentials? Acceptable. +- Parameterized queries or ORM methods? Acceptable. +- Path validation with allowlists or canonicalization? Acceptable. + +### 4. Spec Alignment + +Changes must match the task description and acceptance criteria. + +- Core requirement from the task description clearly not implemented? Flag it. +- Acceptance criterion explicitly inverted (does the opposite of what's specified)? Flag it. +- Implementation approach differs but satisfies the requirement? Acceptable. +- Additional functionality beyond spec (if not breaking anything)? Acceptable. + +## Verdict + +After reviewing all criteria, output exactly one of these markers: + +``` +APPROVED +``` + +Use when all criteria pass: +- No breaking changes without migration +- All tests pass locally +- No security vulnerabilities detected +- Implementation aligns with task description + +``` +REJECTED +``` + +Use when any criterion fails. + +## Feedback Format + +On rejection, provide structured feedback for each violation: + +``` +REJECTED + +Issues: +- [Criterion: Test Failures] `test_user_auth` failed with AssertionError +- [Criterion: Breaking Changes] api.py:42 — `get_user()` parameter `user_id` changed to `id` without deprecation +- [Criterion: Security] db.py:87 — SQL query uses f-string: `f"SELECT * FROM users WHERE id = {user_id}"` +- [Criterion: Spec Alignment] handler.py:15 — task requires validation of email format but no validation implemented + +Required changes: +1. Fix failing test `test_user_auth` or update test to match new behavior +2. Add `user_id` as deprecated alias for `id` parameter or document breaking change +3. Use parameterized query: `cursor.execute("SELECT * FROM users WHERE id = ?", (user_id,))` +4. Add email format validation using regex or email-validator library +``` + +On approval: + +``` +APPROVED + +All criteria pass: +- No breaking API changes detected +- All 47 tests pass +- No security issues found +- Implementation matches task requirements +``` + +## Important + +- Run tests first — this is a hard requirement, not optional +- Be specific: include file path and line number for every issue +- Be actionable: state exactly what needs to change +- Security issues are always blocking — never approve code with hardcoded secrets or injection vulnerabilities +- This is a pre-PR gate — catch issues before they reach CI or human reviewers diff --git a/src/forge/api/routes/metrics.py b/src/forge/api/routes/metrics.py index 42c49656..9776b2be 100644 --- a/src/forge/api/routes/metrics.py +++ b/src/forge/api/routes/metrics.py @@ -122,6 +122,26 @@ ["service", "operation", "error_type"], ) +# Review cycle metrics +REVIEW_CYCLES = Counter( + "forge_review_cycles_total", + "Total review cycles detected", + ["skill", "step"], +) + +REVIEW_VERDICTS = Counter( + "forge_review_verdicts_total", + "Review verdicts by outcome", + ["skill", "step", "verdict"], # verdict: approved, rejected +) + +REVIEW_DURATION = Histogram( + "forge_review_duration_seconds", + "Review cycle duration", + ["skill", "step"], + buckets=[1, 5, 10, 30, 60, 120, 300, 600], # Same as AGENT_DURATION +) + @router.get("/metrics") async def metrics() -> Response: @@ -232,3 +252,35 @@ def record_external_api_error(service: str, operation: str, error_type: str) -> error_type: Type of error (timeout, rate_limit, auth, etc.). """ EXTERNAL_API_ERRORS.labels(service=service, operation=operation, error_type=error_type).inc() + + +def record_review_cycle(skill: str, step: str) -> None: + """Record a review cycle detected. + + Args: + skill: Skill name (e.g., implement-task, fix-ci). + step: Workflow step name. + """ + REVIEW_CYCLES.labels(skill=skill, step=step).inc() + + +def record_review_verdict(skill: str, step: str, verdict: str) -> None: + """Record a review verdict. + + Args: + skill: Skill name (e.g., implement-task, fix-ci). + step: Workflow step name. + verdict: Verdict outcome (approved, rejected). + """ + REVIEW_VERDICTS.labels(skill=skill, step=step, verdict=verdict).inc() + + +def observe_review_duration(skill: str, step: str, duration: float) -> None: + """Record review cycle duration. + + Args: + skill: Skill name (e.g., implement-task, fix-ci). + step: Workflow step name. + duration: Duration in seconds. + """ + REVIEW_DURATION.labels(skill=skill, step=step).observe(duration) diff --git a/src/forge/config.py b/src/forge/config.py index e1bc7db9..79c64f27 100644 --- a/src/forge/config.py +++ b/src/forge/config.py @@ -330,6 +330,21 @@ def ignored_ci_checks(self) -> list[str]: description="Container CPU limit", ) + # Auto Review Configuration + auto_review_poll_interval: float = Field( + default=5.0, + description="Polling interval in seconds for detecting review cycle files during container execution", + ) + auto_review_record_polled_files: Literal["log", "copy"] | None = Field( + default=None, + description=( + "Recording mode for polled review cycle files. " + "'log' logs cycle data at INFO level. " + "'copy' copies files to {recording_dir}/{step-name}/review_cycle_*.json. " + "None disables recording." + ), + ) + # Queue Consumer Configuration queue_max_concurrent_tasks: int = Field( default=20, diff --git a/src/forge/observability/__init__.py b/src/forge/observability/__init__.py index b6fedc0d..b7c1dff2 100644 --- a/src/forge/observability/__init__.py +++ b/src/forge/observability/__init__.py @@ -10,6 +10,10 @@ get_correlation_id, set_correlation_id, ) +from forge.observability.review_notifier import NotifyResult, ReviewJiraNotifier +from forge.observability.review_poller import ReviewCycleData, ReviewCyclePoller +from forge.observability.review_recorder import ReviewCycleData as RecorderReviewCycleData +from forge.observability.review_recorder import ReviewCycleRecorder __all__ = [ "configure_tracing", @@ -18,4 +22,10 @@ "CorrelationContext", "get_correlation_id", "set_correlation_id", + "ReviewCycleData", + "ReviewCyclePoller", + "RecorderReviewCycleData", + "ReviewCycleRecorder", + "ReviewJiraNotifier", + "NotifyResult", ] diff --git a/src/forge/observability/review_notifier.py b/src/forge/observability/review_notifier.py new file mode 100644 index 00000000..f790304d --- /dev/null +++ b/src/forge/observability/review_notifier.py @@ -0,0 +1,191 @@ +"""Jira notifier for review cycle events. + +This module provides the ReviewJiraNotifier class for posting Jira comments +when review cycles are detected during container execution. + +The notifier includes rate limiting to prevent comment spam and handles +Jira API errors gracefully by logging and continuing without failing. + +Usage: + notifier = ReviewJiraNotifier( + jira_client=jira_client, + ticket_key="PROJ-123", + ) + + # Post a comment for a review cycle + await notifier.notify(cycle_data) + + # With custom rate limit interval + notifier = ReviewJiraNotifier( + jira_client=jira_client, + ticket_key="PROJ-123", + rate_limit_seconds=60.0, + ) +""" + +import logging +import time +from dataclasses import dataclass + +from forge.integrations.jira import JiraClient +from forge.observability.review_poller import ReviewCycleData + +logger = logging.getLogger(__name__) + +# Default rate limit interval between comments (seconds) +DEFAULT_RATE_LIMIT_SECONDS = 30.0 + +# Maximum length for feedback in comments before truncation +MAX_FEEDBACK_LENGTH = 500 + + +@dataclass +class NotifyResult: + """Result of a notify attempt. + + Attributes: + posted: Whether the comment was successfully posted. + rate_limited: Whether the attempt was skipped due to rate limiting. + error: Error message if posting failed, None otherwise. + """ + + posted: bool + rate_limited: bool + error: str | None = None + + +class ReviewJiraNotifier: + """Posts Jira comments for review cycle events with rate limiting. + + This class handles posting review cycle information as Jira comments, + including cycle number, verdict, feedback summary, skill name, and duration. + + Rate limiting prevents comment spam by enforcing a configurable interval + between comments. Jira API errors are logged but don't cause failures. + + Attributes: + jira_client: The JiraClient instance for API calls. + ticket_key: The Jira ticket key to post comments to. + rate_limit_seconds: Minimum seconds between comments. + """ + + def __init__( + self, + jira_client: JiraClient, + ticket_key: str, + rate_limit_seconds: float = DEFAULT_RATE_LIMIT_SECONDS, + ): + """Initialize the review notifier. + + Args: + jira_client: JiraClient instance for posting comments. + ticket_key: The Jira ticket key (e.g., "PROJ-123") for comments. + rate_limit_seconds: Minimum seconds between comments. Default 30s. + """ + self.jira_client = jira_client + self.ticket_key = ticket_key + self.rate_limit_seconds = rate_limit_seconds + self._last_notify_time: float | None = None + + def _is_rate_limited(self) -> bool: + """Check if we should skip due to rate limiting. + + Returns: + True if rate limited (should skip), False if can proceed. + """ + if self._last_notify_time is None: + return False + + elapsed = time.monotonic() - self._last_notify_time + return elapsed < self.rate_limit_seconds + + def _format_comment(self, cycle_data: ReviewCycleData) -> str: + """Format review cycle data as a readable Jira comment. + + Args: + cycle_data: The review cycle data to format. + + Returns: + Formatted comment string. + """ + # Build the verdict display + verdict_upper = cycle_data.verdict.upper() + verdict_icon = "✅" if verdict_upper == "APPROVED" else "❌" + + # Format duration + elapsed = cycle_data.elapsed_seconds + duration_str = f"{elapsed / 60:.1f}m" if elapsed >= 60 else f"{elapsed:.1f}s" + + # Build header + lines = [ + f"*Review Cycle {cycle_data.cycle}/{cycle_data.max_cycles}* — {verdict_icon} *{verdict_upper}*", + "", + ] + + # Add skill and duration + if cycle_data.skill: + lines.append(f"*Skill:* {cycle_data.skill}") + lines.append(f"*Duration:* {duration_str}") + + # Add feedback summary (truncated if too long) + feedback = cycle_data.feedback.strip() + if feedback: + lines.append("") + lines.append("*Feedback:*") + + # Truncate if too long + if len(feedback) > MAX_FEEDBACK_LENGTH: + truncated = feedback[:MAX_FEEDBACK_LENGTH].rsplit(" ", 1)[0] + if len(truncated) < MAX_FEEDBACK_LENGTH // 2: + # No good word boundary, just cut + truncated = feedback[:MAX_FEEDBACK_LENGTH] + lines.append(f"{truncated}...") + else: + lines.append(feedback) + + return "\n".join(lines) + + async def notify(self, cycle_data: ReviewCycleData) -> NotifyResult: + """Post a Jira comment for a review cycle. + + Handles rate limiting and API errors gracefully. If rate limited, + returns without posting. If API call fails, logs the error and + returns without raising. + + Args: + cycle_data: The review cycle data to post. + + Returns: + NotifyResult indicating success, rate limiting, or error. + """ + # Check rate limiting + if self._is_rate_limited(): + remaining = self.rate_limit_seconds - (time.monotonic() - (self._last_notify_time or 0)) + logger.debug( + f"Rate limited: skipping Jira comment for {self.ticket_key} (wait {remaining:.1f}s)" + ) + return NotifyResult(posted=False, rate_limited=True) + + # Format the comment + comment_body = self._format_comment(cycle_data) + + # Post to Jira with error handling + try: + await self.jira_client.add_comment(self.ticket_key, comment_body) + self._last_notify_time = time.monotonic() + logger.info( + f"Posted review cycle {cycle_data.cycle}/{cycle_data.max_cycles} " + f"comment to {self.ticket_key}" + ) + return NotifyResult(posted=True, rate_limited=False) + except Exception as e: + # Log and continue - don't fail the container + logger.warning(f"Failed to post review comment to {self.ticket_key}: {e}") + return NotifyResult(posted=False, rate_limited=False, error=str(e)) + + def reset_rate_limit(self) -> None: + """Reset the rate limit timer. + + Call this to allow immediate posting after a long pause. + """ + self._last_notify_time = None diff --git a/src/forge/observability/review_poller.py b/src/forge/observability/review_poller.py new file mode 100644 index 00000000..3ed01a44 --- /dev/null +++ b/src/forge/observability/review_poller.py @@ -0,0 +1,293 @@ +"""Review cycle poller for observability during container execution. + +This module provides the ReviewCyclePoller class that implements async polling +for review cycle files written by container agents during task execution. + +The poller detects files at the step-specific path: + .forge/{step-name}/review_cycle_*.json + +where step-name (e.g., "implement_task", "generate_prd") is passed when +creating the poller instance. +""" + +import asyncio +import json +import logging +from dataclasses import dataclass +from pathlib import Path + +import aiofiles + +from forge.config import Settings, get_settings + +logger = logging.getLogger(__name__) + +# Maximum retries for JSON parsing (race condition with container writes) +MAX_JSON_PARSE_RETRIES = 3 +# Delay between JSON parse retries in seconds +JSON_PARSE_RETRY_DELAY = 0.5 + + +@dataclass +class ReviewCycleData: + """Data captured for a single review cycle iteration. + + This mirrors the ReviewCycleData from containers.review for use in + the orchestrator-side polling. + + Attributes: + cycle: Current cycle number (1-indexed). + max_cycles: Maximum cycles allowed. + verdict: Review outcome ("approved" or "rejected"). + feedback: Reviewer feedback text. + skill: Name of the skill that performed the review. + elapsed_seconds: Time taken for this review cycle. + timestamp: ISO 8601 UTC timestamp of cycle completion. + file_path: Path to the source JSON file (not serialized). + """ + + cycle: int + max_cycles: int + verdict: str + feedback: str + skill: str + elapsed_seconds: float + timestamp: str + file_path: str = "" + + @classmethod + def from_dict(cls, data: dict, file_path: str = "") -> "ReviewCycleData": + """Create ReviewCycleData from a dictionary. + + Args: + data: Dictionary containing review cycle fields. + file_path: Path to the source file for tracking. + + Returns: + ReviewCycleData instance. + """ + return cls( + cycle=data["cycle"], + max_cycles=data["max_cycles"], + verdict=data["verdict"], + feedback=data.get("feedback", ""), + skill=data.get("skill", ""), + elapsed_seconds=data.get("elapsed_seconds", 0.0), + timestamp=data.get("timestamp", ""), + file_path=file_path, + ) + + +class ReviewCyclePoller: + """Async poller for review cycle files during container execution. + + This class polls for review_cycle_*.json files in the step-specific + directory and returns newly detected ReviewCycleData objects. + + Usage: + poller = ReviewCyclePoller( + workspace_path=Path("/workspace"), + step_name="implement_task", + ) + + # Start polling in background + async for new_cycles in poller.poll(): + for cycle in new_cycles: + print(f"Review cycle {cycle.cycle}: {cycle.verdict}") + + # Or poll once + new_cycles = await poller.poll_once() + """ + + def __init__( + self, + workspace_path: Path, + step_name: str, + settings: Settings | None = None, + ): + """Initialize the review cycle poller. + + Args: + workspace_path: Path to the workspace root (where .forge/ is located). + step_name: Name of the step (e.g., "implement_task") for path detection. + settings: Application settings. Uses default if not provided. + """ + self.workspace_path = Path(workspace_path) + self.step_name = step_name + self._settings = settings + self._processed_files: set[str] = set() + self._running = False + + @property + def settings(self) -> Settings: + """Get settings, lazily loading if not provided.""" + if self._settings is None: + self._settings = get_settings() + return self._settings + + @property + def poll_interval(self) -> float: + """Get the polling interval from settings.""" + return self.settings.auto_review_poll_interval + + @property + def review_cycle_dir(self) -> Path: + """Get the directory path for review cycle files.""" + return self.workspace_path / ".forge" / self.step_name + + def _get_review_cycle_files(self) -> list[Path]: + """Get list of review_cycle_*.json files in the step directory. + + Returns: + List of Path objects for matching files. + """ + cycle_dir = self.review_cycle_dir + if not cycle_dir.exists(): + return [] + + return sorted(cycle_dir.glob("review_cycle_*.json")) + + async def _parse_json_with_retry(self, file_path: Path) -> dict | None: + """Parse JSON file with retry for incomplete reads. + + This handles race conditions where the container may still be + writing the file when we try to read it. + + Args: + file_path: Path to the JSON file. + + Returns: + Parsed JSON dict, or None if parsing fails after retries. + """ + for attempt in range(MAX_JSON_PARSE_RETRIES): + try: + async with aiofiles.open(file_path, encoding="utf-8") as f: + content = await f.read() + + if not content.strip(): + # Empty file, likely still being written + if attempt < MAX_JSON_PARSE_RETRIES - 1: + logger.debug( + "Empty file %s, retrying (%d/%d)", + file_path, + attempt + 1, + MAX_JSON_PARSE_RETRIES, + ) + await asyncio.sleep(JSON_PARSE_RETRY_DELAY) + continue + return None + + return json.loads(content) + + except json.JSONDecodeError as e: + if attempt < MAX_JSON_PARSE_RETRIES - 1: + logger.debug( + "JSON parse error for %s: %s, retrying (%d/%d)", + file_path, + e, + attempt + 1, + MAX_JSON_PARSE_RETRIES, + ) + await asyncio.sleep(JSON_PARSE_RETRY_DELAY) + else: + logger.warning( + "Failed to parse %s after %d attempts: %s", + file_path, + MAX_JSON_PARSE_RETRIES, + e, + ) + return None + + except OSError as e: + logger.warning("Error reading %s: %s", file_path, e) + return None + + return None + + async def poll_once(self) -> list[ReviewCycleData]: + """Poll for new review cycle files once. + + Returns: + List of newly detected ReviewCycleData objects. + """ + new_cycles: list[ReviewCycleData] = [] + files = self._get_review_cycle_files() + + for file_path in files: + file_key = str(file_path) + + if file_key in self._processed_files: + continue + + data = await self._parse_json_with_retry(file_path) + if data is None: + continue + + try: + cycle_data = ReviewCycleData.from_dict(data, file_path=file_key) + new_cycles.append(cycle_data) + self._processed_files.add(file_key) + logger.debug( + "Detected review cycle %d for step %s: %s", + cycle_data.cycle, + self.step_name, + cycle_data.verdict, + ) + except (KeyError, TypeError) as e: + logger.warning("Invalid review cycle data in %s: %s", file_path, e) + + return new_cycles + + def poll(self) -> "ReviewCyclePoller": + """Start the polling loop as an async iterator. + + Yields: + List of newly detected ReviewCycleData objects on each poll. + + Usage: + async for new_cycles in poller.poll(): + for cycle in new_cycles: + process(cycle) + """ + self._running = True + return self + + def stop(self) -> None: + """Stop the polling loop.""" + self._running = False + + def __aiter__(self) -> "ReviewCyclePoller": + """Return self as async iterator.""" + return self + + async def __anext__(self) -> list[ReviewCycleData]: + """Get next batch of new review cycles. + + Returns: + List of newly detected ReviewCycleData objects. + + Raises: + StopAsyncIteration: When polling is stopped. + """ + if not self._running: + raise StopAsyncIteration + + # Wait for the poll interval before checking + await asyncio.sleep(self.poll_interval) + + if not self._running: + raise StopAsyncIteration + + return await self.poll_once() + + def reset(self) -> None: + """Reset the set of processed files. + + This allows re-detecting previously processed files. + """ + self._processed_files.clear() + + @property + def processed_count(self) -> int: + """Get the number of processed files.""" + return len(self._processed_files) diff --git a/src/forge/observability/review_recorder.py b/src/forge/observability/review_recorder.py new file mode 100644 index 00000000..961b628f --- /dev/null +++ b/src/forge/observability/review_recorder.py @@ -0,0 +1,206 @@ +"""Review cycle recorder for observability of review cycle data. + +This module provides the ReviewCycleRecorder class that records review cycle +data either by logging or by copying files to a recording directory. + +Modes: + - mode="log": Log cycle data at INFO level via logging + - mode="copy": Copy files to {recording_dir}/{step-name}/review_cycle_*.json + - mode=None: No recording (disabled) + +The step name is passed to the recorder constructor and used to organize +recorded files into step-specific subdirectories. +""" + +import json +import logging +import shutil +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import Literal + +logger = logging.getLogger(__name__) + +# Type alias for recording modes +RecordingMode = Literal["log", "copy"] | None + + +@dataclass +class ReviewCycleData: + """Data captured for a single review cycle iteration. + + This dataclass stores all fields specified in the Epic spec for + review cycle recording and observability. + + Attributes: + cycle: Current cycle number (1-indexed). + max_cycles: Maximum cycles allowed. + verdict: Review outcome ("approved" or "rejected"). + feedback: Reviewer feedback text. + skill: Name of the skill that performed the review. + elapsed_seconds: Time taken for this review cycle. + timestamp: Datetime of cycle completion. + """ + + cycle: int + max_cycles: int + verdict: str + feedback: str + skill: str + elapsed_seconds: float + timestamp: datetime + + def to_dict(self) -> dict: + """Convert to dictionary with ISO 8601 formatted timestamp. + + Returns: + Dictionary representation suitable for JSON serialization. + """ + data = asdict(self) + # Convert datetime to ISO 8601 string + data["timestamp"] = self.timestamp.isoformat() + return data + + def to_json(self) -> str: + """Convert to JSON string with pretty printing. + + Returns: + JSON string representation. + """ + return json.dumps(self.to_dict(), indent=2, ensure_ascii=False) + + +class ReviewCycleRecorder: + """Records review cycle data based on configured mode. + + This class handles recording of review cycle data either by logging + to stdout/file or by copying files to a recording directory. + + Usage: + recorder = ReviewCycleRecorder( + step_name="implement_task", + mode="log", + ) + recorder.record(cycle_data) + + # Or for copy mode + recorder = ReviewCycleRecorder( + step_name="implement_task", + mode="copy", + recording_dir=Path("/recordings"), + ) + recorder.record_file(Path("/workspace/.forge/step/review_cycle_1.json")) + """ + + def __init__( + self, + step_name: str, + mode: RecordingMode = None, + recording_dir: Path | None = None, + ): + """Initialize the review cycle recorder. + + Args: + step_name: Name of the step (e.g., "implement_task") for file organization. + mode: Recording mode - "log", "copy", or None (disabled). + recording_dir: Base directory for copying files (required for copy mode). + + Raises: + ValueError: If mode is "copy" but recording_dir is not provided. + """ + self.step_name = step_name + self.mode = mode + self.recording_dir = Path(recording_dir) if recording_dir else None + + if mode == "copy" and not self.recording_dir: + raise ValueError("recording_dir is required when mode='copy'") + + @property + def step_dir(self) -> Path | None: + """Get the step-specific directory for recorded files. + + Returns: + Path to {recording_dir}/{step-name}/ or None if no recording_dir. + """ + if self.recording_dir is None: + return None + return self.recording_dir / self.step_name + + def record(self, cycle_data: ReviewCycleData) -> None: + """Record review cycle data based on configured mode. + + For log mode, logs the data at INFO level with structured format. + For copy mode, this method does nothing (use record_file for files). + For disabled mode (None), this method does nothing. + + Args: + cycle_data: The review cycle data to record. + """ + if self.mode is None: + return + + if self.mode == "log": + self._log_cycle_data(cycle_data) + # Copy mode is handled by record_file + + def record_file(self, source_file: Path) -> Path | None: + """Record a review cycle file by copying it to the recording directory. + + Only works in copy mode. Creates the step-specific subdirectory if needed. + + Args: + source_file: Path to the source review_cycle_*.json file. + + Returns: + Path to the copied file, or None if mode is not "copy" or copy failed. + """ + if self.mode != "copy": + return None + + if not self.recording_dir: + return None + + if not source_file.exists(): + logger.warning("Source file does not exist: %s", source_file) + return None + + # Create step-specific subdirectory + step_dir = self.step_dir + if step_dir is None: + return None + + try: + step_dir.mkdir(parents=True, exist_ok=True) + except OSError as e: + logger.error("Failed to create directory %s: %s", step_dir, e) + return None + + # Copy file preserving the filename + dest_file = step_dir / source_file.name + try: + shutil.copy2(source_file, dest_file) + logger.debug("Copied review cycle file to %s", dest_file) + return dest_file + except OSError as e: + logger.error("Failed to copy file to %s: %s", dest_file, e) + return None + + def _log_cycle_data(self, cycle_data: ReviewCycleData) -> None: + """Log review cycle data at INFO level with structured format. + + Args: + cycle_data: The review cycle data to log. + """ + logger.info( + "Review cycle %d/%d for %s: verdict=%s skill=%s elapsed=%.2fs feedback=%r", + cycle_data.cycle, + cycle_data.max_cycles, + self.step_name, + cycle_data.verdict, + cycle_data.skill, + cycle_data.elapsed_seconds, + cycle_data.feedback[:100] + "..." + if len(cycle_data.feedback) > 100 + else cycle_data.feedback, + ) diff --git a/src/forge/sandbox/runner.py b/src/forge/sandbox/runner.py index a422ef91..50c06c42 100644 --- a/src/forge/sandbox/runner.py +++ b/src/forge/sandbox/runner.py @@ -13,20 +13,64 @@ """ import asyncio +import contextlib import json import logging import os import shutil from dataclasses import dataclass, field +from datetime import datetime from pathlib import Path from typing import Any +from forge.api.routes.metrics import ( + observe_review_duration, + record_review_cycle, + record_review_verdict, +) from forge.config import Settings, get_settings +from forge.observability import ( + RecorderReviewCycleData, + ReviewCycleData, + ReviewCyclePoller, + ReviewCycleRecorder, +) from forge.prompts import load_prompt from forge.skills.resolver import resolve_skill_paths logger = logging.getLogger(__name__) + +def _poller_to_recorder_cycle( + poller_cycle: ReviewCycleData, _step_name: str +) -> RecorderReviewCycleData: + """Convert poller's ReviewCycleData to recorder's format. + + Args: + poller_cycle: ReviewCycleData from the poller (has string timestamp). + step_name: The workflow step name for context. + + Returns: + RecorderReviewCycleData suitable for the recorder (has datetime timestamp). + """ + # Parse ISO 8601 timestamp string to datetime + try: + timestamp = datetime.fromisoformat(poller_cycle.timestamp.replace("Z", "+00:00")) + except (ValueError, AttributeError): + # Fallback to now if parsing fails + timestamp = datetime.now() + + return RecorderReviewCycleData( + cycle=poller_cycle.cycle, + max_cycles=poller_cycle.max_cycles, + verdict=poller_cycle.verdict, + feedback=poller_cycle.feedback, + skill=poller_cycle.skill, + elapsed_seconds=poller_cycle.elapsed_seconds, + timestamp=timestamp, + ) + + # Default container image (can be overridden via CONTAINER_IMAGE env var) # Use localhost/ prefix to avoid podman short-name resolution prompts DEFAULT_IMAGE = "localhost/forge-dev:latest" @@ -48,6 +92,7 @@ class ContainerResult: stderr: str tests_passed: bool | None = None # None if tests were skipped error_message: str | None = None + review_cycles: list[ReviewCycleData] = field(default_factory=list) @property def tests_failed(self) -> bool: @@ -366,6 +411,136 @@ async def _stop_timed_out_container( process.kill() await process.wait() + def _sweep_review_cycles( + self, + workspace_path: Path, + step_name: str, + processed_files: set[str], + collected_cycles: list[ReviewCycleData], + recorder: ReviewCycleRecorder, + ) -> None: + """Synchronous post-execution sweep for missed review cycle files. + + This method scans for any review_cycle_*.json files that may have been + missed during async polling, especially if the container exits quickly + after writing. + + Args: + workspace_path: Path to the workspace root. + step_name: Name of the step for path organization. + processed_files: Set of file paths already processed by the poller. + collected_cycles: List to append newly found cycles to. + recorder: Recorder for logging/copying detected cycles. + """ + cycle_dir = workspace_path / ".forge" / step_name + if not cycle_dir.exists(): + return + + # Find all review cycle files + all_files = sorted(cycle_dir.glob("review_cycle_*.json")) + + missed_count = 0 + for file_path in all_files: + file_key = str(file_path) + + # Skip files already processed by the async poller + if file_key in processed_files: + continue + + # This file was missed during polling - parse and collect it + try: + content = file_path.read_text(encoding="utf-8") + if not content.strip(): + logger.warning("Empty review cycle file during sweep: %s", file_path) + continue + + data = json.loads(content) + cycle_data = ReviewCycleData.from_dict(data, file_path=file_key) + collected_cycles.append(cycle_data) + missed_count += 1 + + # Record via recorder + recorder.record(_poller_to_recorder_cycle(cycle_data, step_name)) + recorder.record_file(file_path) + + # Emit Prometheus metrics + record_review_cycle(cycle_data.skill, step_name) + record_review_verdict(cycle_data.skill, step_name, cycle_data.verdict) + observe_review_duration(cycle_data.skill, step_name, cycle_data.elapsed_seconds) + + logger.debug( + "Sweep caught review cycle %d/%d for %s: %s", + cycle_data.cycle, + cycle_data.max_cycles, + step_name, + cycle_data.verdict, + ) + + except json.JSONDecodeError as e: + logger.warning("Failed to parse review cycle file %s: %s", file_path, e) + except (KeyError, TypeError) as e: + logger.warning("Invalid review cycle data in %s: %s", file_path, e) + except OSError as e: + logger.warning("Error reading review cycle file %s: %s", file_path, e) + + if missed_count > 0: + logger.warning( + "Sweep caught %d review cycle file(s) missed during async polling for step %s", + missed_count, + step_name, + ) + + async def _poll_review_cycles( + self, + poller: ReviewCyclePoller, + recorder: ReviewCycleRecorder, + collected_cycles: list[ReviewCycleData], + ) -> None: + """Background task to poll for review cycle files during container execution. + + This task polls the workspace for review_cycle_*.json files and: + - Collects detected ReviewCycleData into the provided list + - Records cycles via the recorder (log or copy mode) + - Emits Prometheus metrics for observability + + Args: + poller: The ReviewCyclePoller instance to use for polling. + recorder: The ReviewCycleRecorder for recording cycles. + collected_cycles: List to aggregate detected cycles into. + """ + try: + async for new_cycles in poller.poll(): + for cycle in new_cycles: + collected_cycles.append(cycle) + + # Record cycle via recorder (handles log/copy modes) + recorder.record( + # Convert poller's ReviewCycleData to recorder's format + # (using to_dict style conversion) + _poller_to_recorder_cycle(cycle, poller.step_name) + ) + + # Copy file if in copy mode + if cycle.file_path: + recorder.record_file(Path(cycle.file_path)) + + # Emit Prometheus metrics + record_review_cycle(cycle.skill, poller.step_name) + record_review_verdict(cycle.skill, poller.step_name, cycle.verdict) + observe_review_duration(cycle.skill, poller.step_name, cycle.elapsed_seconds) + + logger.debug( + "Review cycle %d/%d for %s: %s", + cycle.cycle, + cycle.max_cycles, + poller.step_name, + cycle.verdict, + ) + except asyncio.CancelledError: + # Clean exit on cancellation + logger.debug("Review polling task cancelled") + raise + async def run( self, workspace_path: Path, @@ -377,6 +552,7 @@ async def run( repo_name: str | None = None, previous_task_keys: list[str] | None = None, trace_context: dict[str, Any] | None = None, + step_name: str | None = None, ) -> ContainerResult: """Run a task in a container sandbox. @@ -390,9 +566,12 @@ async def run( repo_name: Repository name (e.g., "owner/repo") for container naming. previous_task_keys: List of previously implemented task keys for handoff context. trace_context: Workflow fields forwarded to Langfuse only. + step_name: Workflow step name (e.g., "implement_task", "local_review") + for organizing review cycle files under .forge/{step-name}/. + If not provided, review polling is disabled. Returns: - ContainerResult with execution status and logs. + ContainerResult with execution status, logs, and review_cycles. """ config = config or self._default_config() @@ -409,6 +588,10 @@ async def run( } task_file.write_text(json.dumps(task_data, indent=2)) + # List to collect review cycles detected during execution + collected_cycles: list[ReviewCycleData] = [] + polling_task: asyncio.Task | None = None + try: # Build container name and command container_name = self._build_container_name(ticket_key, repo_name) @@ -426,6 +609,27 @@ async def run( stderr=asyncio.subprocess.PIPE, ) + # Start review polling background task if step_name is provided + if step_name: + poller = ReviewCyclePoller( + workspace_path=workspace_path, + step_name=step_name, + settings=self.settings, + ) + # Note: recording_dir is not currently configurable. + # For copy mode, the recorder needs a recording_dir, so we use + # log mode only unless copy mode is explicitly needed in the future. + record_mode = self.settings.auto_review_record_polled_files + recorder = ReviewCycleRecorder( + step_name=step_name, + mode=record_mode if record_mode != "copy" else "log", + recording_dir=None, + ) + polling_task = asyncio.create_task( + self._poll_review_cycles(poller, recorder, collected_cycles) + ) + logger.debug(f"Started review polling for step: {step_name}") + try: stdout, stderr = await asyncio.wait_for( process.communicate(), @@ -440,11 +644,46 @@ async def run( stdout="", stderr="Container execution timed out", error_message="Timeout exceeded", + review_cycles=collected_cycles, ) except asyncio.CancelledError: logger.warning(f"Container execution cancelled, stopping {container_name}") await self._stop_timed_out_container(container_name, process) raise # Re-raise CancelledError + finally: + # Stop and clean up polling task + if polling_task: + # Stop the poller + poller.stop() + # Cancel the polling task + polling_task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await polling_task + logger.debug("Review polling task stopped") + + # Do one final async poll to catch any remaining files + final_cycles = await poller.poll_once() + for cycle in final_cycles: + collected_cycles.append(cycle) + recorder.record(_poller_to_recorder_cycle(cycle, poller.step_name)) + if cycle.file_path: + recorder.record_file(Path(cycle.file_path)) + record_review_cycle(cycle.skill, poller.step_name) + record_review_verdict(cycle.skill, poller.step_name, cycle.verdict) + observe_review_duration( + cycle.skill, poller.step_name, cycle.elapsed_seconds + ) + + # Synchronous sweep for any files missed during async polling + # This catches files written just before container exit that may + # not have been detected by the async poller + self._sweep_review_cycles( + workspace_path=workspace_path, + step_name=step_name, + processed_files=poller._processed_files, + collected_cycles=collected_cycles, + recorder=recorder, + ) exit_code = process.returncode or 0 stdout_str = stdout.decode("utf-8", errors="replace") @@ -480,6 +719,7 @@ async def run( stdout=stdout_str, stderr=stderr_str, tests_passed=True, + review_cycles=collected_cycles, ) elif exit_code == EXIT_TESTS_FAILED: return ContainerResult( @@ -489,6 +729,7 @@ async def run( stderr=stderr_str, tests_passed=False, error_message="Tests failed after max retries", + review_cycles=collected_cycles, ) else: return ContainerResult( @@ -497,6 +738,7 @@ async def run( stdout=stdout_str, stderr=stderr_str, error_message=f"Task failed with exit code {exit_code}", + review_cycles=collected_cycles, ) finally: diff --git a/src/forge/workflow/nodes/ci_evaluator.py b/src/forge/workflow/nodes/ci_evaluator.py index d94d3161..a8a379e8 100644 --- a/src/forge/workflow/nodes/ci_evaluator.py +++ b/src/forge/workflow/nodes/ci_evaluator.py @@ -324,6 +324,7 @@ async def attempt_ci_fix(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=f"{ticket_key}-ci-analyze", repo_name=state.get("current_repo", ""), + step_name="analyze_ci", ) if not fix_plan_file.exists(): @@ -352,6 +353,7 @@ async def attempt_ci_fix(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=f"{ticket_key}-ci-fix", repo_name=state.get("current_repo", ""), + step_name="fix_ci", ) workspace = Workspace( diff --git a/src/forge/workflow/nodes/code_review.py b/src/forge/workflow/nodes/code_review.py index 95fb7692..ac73aee0 100644 --- a/src/forge/workflow/nodes/code_review.py +++ b/src/forge/workflow/nodes/code_review.py @@ -66,6 +66,7 @@ async def run_post_change_review( ticket_key=ticket_key, task_key=f"{ticket_key}-review-{label}", repo_name=current_repo, + step_name="code_review", ) git = GitOperations( diff --git a/src/forge/workflow/nodes/docs_updater.py b/src/forge/workflow/nodes/docs_updater.py index b454c3f2..7c32fa36 100644 --- a/src/forge/workflow/nodes/docs_updater.py +++ b/src/forge/workflow/nodes/docs_updater.py @@ -59,6 +59,7 @@ async def update_documentation(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=f"{ticket_key}-docs", repo_name=current_repo, + step_name="update_docs", ) git = GitOperations( diff --git a/src/forge/workflow/nodes/implement_review.py b/src/forge/workflow/nodes/implement_review.py index fcf66e90..27b519f0 100644 --- a/src/forge/workflow/nodes/implement_review.py +++ b/src/forge/workflow/nodes/implement_review.py @@ -178,6 +178,7 @@ async def implement_review(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=f"{ticket_key}-review-analyze", repo_name=current_repo, + step_name="implement_review_analyze", ) # ── Check for objections ────────────────────────────────────────────── @@ -220,6 +221,7 @@ async def implement_review(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=f"{ticket_key}-review-fix", repo_name=current_repo, + step_name="implement_review_fix", ) # Commit any uncommitted changes the container left diff --git a/src/forge/workflow/nodes/implementation.py b/src/forge/workflow/nodes/implementation.py index 596e06ab..a57b1ce6 100644 --- a/src/forge/workflow/nodes/implementation.py +++ b/src/forge/workflow/nodes/implementation.py @@ -145,6 +145,7 @@ async def implement_task(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=current_task, repo_name=current_repo, + step_name="implement_task", previous_task_keys=implemented_tasks, trace_context=_build_implementation_trace_context( state, diff --git a/src/forge/workflow/nodes/local_reviewer.py b/src/forge/workflow/nodes/local_reviewer.py index ffeef0cb..31dca022 100644 --- a/src/forge/workflow/nodes/local_reviewer.py +++ b/src/forge/workflow/nodes/local_reviewer.py @@ -148,6 +148,7 @@ async def _run_bug_review(state: WorkflowState) -> WorkflowState: result = await runner.run( workspace_path=Path(workspace_path), task_summary="Qualitative bug review — root cause and test coverage", + step_name="local_review", task_description=task_description, ticket_key=ticket_key, task_key=f"{ticket_key}-qualreview", @@ -312,6 +313,7 @@ async def _run_feature_review(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=f"{ticket_key}-review", repo_name=current_repo, + step_name="local_review", ) git = GitOperations( diff --git a/src/forge/workflow/nodes/plan_bug_fix.py b/src/forge/workflow/nodes/plan_bug_fix.py index e59ad448..fff9c5e5 100644 --- a/src/forge/workflow/nodes/plan_bug_fix.py +++ b/src/forge/workflow/nodes/plan_bug_fix.py @@ -138,6 +138,7 @@ async def _run_plan_container( task_description=task_description, ticket_key=ticket_key, task_key=f"{ticket_key}-plan", + step_name="plan_bug_fix", ) if not result.success: diff --git a/src/forge/workflow/nodes/rca_analysis.py b/src/forge/workflow/nodes/rca_analysis.py index a95b6f96..b59efa32 100644 --- a/src/forge/workflow/nodes/rca_analysis.py +++ b/src/forge/workflow/nodes/rca_analysis.py @@ -96,6 +96,7 @@ async def analyze_bug(state: BugState) -> BugState: task_description=task_description, ticket_key=ticket_key, task_key=f"{ticket_key}-analysis", + step_name="analyze_bug", ) if not result.success: @@ -242,6 +243,7 @@ async def reflect_rca(state: BugState) -> BugState: task_description=task_description, ticket_key=ticket_key, task_key=task_key, + step_name="reflect_rca", ) if not result.success: diff --git a/src/forge/workflow/nodes/rebase.py b/src/forge/workflow/nodes/rebase.py index a4b0071a..d7010204 100644 --- a/src/forge/workflow/nodes/rebase.py +++ b/src/forge/workflow/nodes/rebase.py @@ -155,6 +155,7 @@ async def rebase_pr(state: WorkflowState) -> WorkflowState: ticket_key=ticket_key, task_key=f"{ticket_key}-rebase", repo_name=current_repo, + step_name="rebase", ) if result.exit_code != 0: diff --git a/tests/unit/api/routes/test_metrics.py b/tests/unit/api/routes/test_metrics.py index 21a962f1..8aa8814f 100644 --- a/tests/unit/api/routes/test_metrics.py +++ b/tests/unit/api/routes/test_metrics.py @@ -88,3 +88,138 @@ def test_increment_webhook_counter(self): # Should not raise WEBHOOKS_RECEIVED.labels(source="github", event_type="check_run").inc() + + +class TestReviewCycleMetrics: + """Tests for review cycle metrics registration and recording.""" + + def test_review_cycles_counter_exists(self): + """Review cycles counter is registered.""" + from forge.api.routes.metrics import REVIEW_CYCLES + + assert REVIEW_CYCLES is not None + + def test_review_cycles_counter_has_labels(self): + """Review cycles counter has skill and step labels.""" + from forge.api.routes.metrics import REVIEW_CYCLES + + labeled = REVIEW_CYCLES.labels(skill="implement-task", step="implementation") + assert labeled is not None + + def test_review_verdicts_counter_exists(self): + """Review verdicts counter is registered.""" + from forge.api.routes.metrics import REVIEW_VERDICTS + + assert REVIEW_VERDICTS is not None + + def test_review_verdicts_counter_has_labels(self): + """Review verdicts counter has skill, step, and verdict labels.""" + from forge.api.routes.metrics import REVIEW_VERDICTS + + labeled = REVIEW_VERDICTS.labels( + skill="implement-task", step="implementation", verdict="approved" + ) + assert labeled is not None + + def test_review_duration_histogram_exists(self): + """Review duration histogram is registered.""" + from forge.api.routes.metrics import REVIEW_DURATION + + assert REVIEW_DURATION is not None + + def test_review_duration_histogram_has_labels(self): + """Review duration histogram has skill and step labels.""" + from forge.api.routes.metrics import REVIEW_DURATION + + labeled = REVIEW_DURATION.labels(skill="fix-ci", step="ci_fix") + assert labeled is not None + + def test_record_review_cycle_helper(self): + """record_review_cycle helper increments counter.""" + from forge.api.routes.metrics import REVIEW_CYCLES, record_review_cycle + + # Get initial value + initial = REVIEW_CYCLES.labels(skill="test-skill", step="test-step")._value.get() + + record_review_cycle(skill="test-skill", step="test-step") + + # Counter should be incremented + final = REVIEW_CYCLES.labels(skill="test-skill", step="test-step")._value.get() + assert final == initial + 1 + + def test_record_review_verdict_helper(self): + """record_review_verdict helper increments counter.""" + from forge.api.routes.metrics import REVIEW_VERDICTS, record_review_verdict + + # Get initial value + initial = REVIEW_VERDICTS.labels( + skill="test-skill", step="test-step", verdict="approved" + )._value.get() + + record_review_verdict(skill="test-skill", step="test-step", verdict="approved") + + # Counter should be incremented + final = REVIEW_VERDICTS.labels( + skill="test-skill", step="test-step", verdict="approved" + )._value.get() + assert final == initial + 1 + + def test_record_review_verdict_rejected(self): + """record_review_verdict helper handles rejected verdict.""" + from forge.api.routes.metrics import REVIEW_VERDICTS, record_review_verdict + + # Get initial value + initial = REVIEW_VERDICTS.labels( + skill="task-skill", step="task-step", verdict="rejected" + )._value.get() + + record_review_verdict(skill="task-skill", step="task-step", verdict="rejected") + + # Counter should be incremented + final = REVIEW_VERDICTS.labels( + skill="task-skill", step="task-step", verdict="rejected" + )._value.get() + assert final == initial + 1 + + def test_observe_review_duration_helper(self): + """observe_review_duration helper observes histogram.""" + from forge.api.routes.metrics import REVIEW_DURATION, observe_review_duration + + # Get initial sum + initial_sum = REVIEW_DURATION.labels( + skill="duration-skill", step="duration-step" + )._sum.get() + + observe_review_duration(skill="duration-skill", step="duration-step", duration=45.5) + + # Sum should increase by observed value + final_sum = REVIEW_DURATION.labels( + skill="duration-skill", step="duration-step" + )._sum.get() + # Just check that something was observed (sum increased) + assert final_sum >= initial_sum + 45.5 + + @pytest.mark.asyncio + async def test_review_metrics_in_endpoint_output(self): + """Review metrics appear in /metrics endpoint output.""" + from forge.api.routes.metrics import ( + observe_review_duration, + record_review_cycle, + record_review_verdict, + ) + + # Record some metrics first + record_review_cycle(skill="endpoint-skill", step="endpoint-step") + record_review_verdict(skill="endpoint-skill", step="endpoint-step", verdict="approved") + observe_review_duration(skill="endpoint-skill", step="endpoint-step", duration=30.0) + + async with AsyncClient( + transport=ASGITransport(app=app), + base_url="http://test" + ) as client: + response = await client.get("/metrics") + + body = response.text + assert "forge_review_cycles_total" in body + assert "forge_review_verdicts_total" in body + assert "forge_review_duration_seconds" in body diff --git a/tests/unit/containers/__init__.py b/tests/unit/containers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/containers/conftest.py b/tests/unit/containers/conftest.py new file mode 100644 index 00000000..74931205 --- /dev/null +++ b/tests/unit/containers/conftest.py @@ -0,0 +1,8 @@ +"""Container tests conftest - adds containers/ to sys.path.""" + +import sys +from pathlib import Path + +_containers_dir = str(Path(__file__).parents[3] / "containers") +if _containers_dir not in sys.path: + sys.path.insert(0, _containers_dir) diff --git a/tests/unit/containers/test_entrypoint_review.py b/tests/unit/containers/test_entrypoint_review.py new file mode 100644 index 00000000..67018acf --- /dev/null +++ b/tests/unit/containers/test_entrypoint_review.py @@ -0,0 +1,789 @@ +"""Unit tests for review loop integration in entrypoint.py.""" + +import json +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +# Add containers to path +sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + +# --------------------------------------------------------------------------- +# Test load_conversation_history +# --------------------------------------------------------------------------- + + +class TestLoadConversationHistory: + def test_loads_existing_history(self, tmp_path: Path): + """Test loading conversation history from existing file (SC-003).""" + from entrypoint import load_conversation_history + + # Create history file + history_dir = tmp_path / ".forge" / "history" + history_dir.mkdir(parents=True) + history_file = history_dir / "TEST-123.json" + history_data = { + "task_key": "TEST-123", + "messages": [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there"}, + ], + } + history_file.write_text(json.dumps(history_data)) + + result = load_conversation_history(tmp_path, "TEST-123") + + assert result is not None + assert len(result) == 2 + assert result[0]["role"] == "user" + assert result[1]["role"] == "assistant" + + def test_returns_none_when_file_missing(self, tmp_path: Path): + """Test returns None when history file doesn't exist.""" + from entrypoint import load_conversation_history + + result = load_conversation_history(tmp_path, "MISSING-123") + + assert result is None + + def test_returns_none_on_json_error(self, tmp_path: Path): + """Test returns None when JSON parsing fails.""" + from entrypoint import load_conversation_history + + # Create malformed history file + history_dir = tmp_path / ".forge" / "history" + history_dir.mkdir(parents=True) + history_file = history_dir / "BAD-123.json" + history_file.write_text("not valid json") + + result = load_conversation_history(tmp_path, "BAD-123") + + assert result is None + + +# --------------------------------------------------------------------------- +# Test run_worker_with_feedback +# --------------------------------------------------------------------------- + + +class TestRunWorkerWithFeedback: + @pytest.mark.asyncio + async def test_injects_feedback_section(self, tmp_path: Path): + """Test that feedback is injected into task description (SC-003).""" + with patch("entrypoint.run_agent_task") as mock_run_agent: + mock_run_agent.return_value = True + + from entrypoint import run_worker_with_feedback + + result = await run_worker_with_feedback( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test task", + task_description="Original description", + guardrails="Some guardrails", + feedback="Please fix the error handling", + previous_task_keys=["TEST-122"], + ) + + assert result is True + mock_run_agent.assert_called_once() + + # Check that feedback was injected + call_kwargs = mock_run_agent.call_args + task_description = call_kwargs[1]["task_description"] + assert "## Reviewer Feedback" in task_description + assert "Please fix the error handling" in task_description + assert "Original description" in task_description + + @pytest.mark.asyncio + async def test_loads_conversation_history(self, tmp_path: Path): + """Test that conversation history is loaded (SC-003).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create history file + history_dir = tmp_path / ".forge" / "history" + history_dir.mkdir(parents=True) + history_file = history_dir / "TEST-123.json" + history_data = {"task_key": "TEST-123", "messages": [{"role": "user", "content": "Test"}]} + history_file.write_text(json.dumps(history_data)) + + with patch("entrypoint.run_agent_task") as mock_run_agent: + mock_run_agent.return_value = True + + from entrypoint import run_worker_with_feedback + + result = await run_worker_with_feedback( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test task", + task_description="Original description", + guardrails="", + feedback="Feedback text", + ) + + assert result is True + + +# --------------------------------------------------------------------------- +# Test run_review_loop +# --------------------------------------------------------------------------- + + +class TestRunReviewLoop: + @pytest.mark.asyncio + async def test_approved_verdict_exits_successfully(self, tmp_path: Path): + """Test that APPROVED verdict terminates loop successfully (SC-002).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create review.md + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 3\n---\nReview instructions here") + + with patch("entrypoint.run_reviewer_agent") as mock_reviewer: + mock_reviewer.return_value = "The code looks great. APPROVED" + + from entrypoint import run_review_loop + + result = await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test task", + task_description="Description", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + assert result is True + mock_reviewer.assert_called_once() + + # Check cycle file was written + cycle_file = tmp_path / ".forge" / "test-skill" / "review_cycle_1.json" + assert cycle_file.exists() + cycle_data = json.loads(cycle_file.read_text()) + assert cycle_data["verdict"] == "approved" + assert cycle_data["cycle"] == 1 + + @pytest.mark.asyncio + async def test_rejected_triggers_retry(self, tmp_path: Path): + """Test that REJECTED verdict triggers retry with feedback (SC-003).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create review.md with 2 retries + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 2\n---\nReview instructions") + + call_count = 0 + + async def mock_reviewer_side_effect(*_args, **_kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + return "REJECTED: Missing error handling" + return "APPROVED" + + with ( + patch("entrypoint.run_reviewer_agent") as mock_reviewer, + patch("entrypoint.run_worker_with_feedback") as mock_worker, + ): + mock_reviewer.side_effect = mock_reviewer_side_effect + mock_worker.return_value = True + + from entrypoint import run_review_loop + + result = await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test task", + task_description="Description", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + assert result is True + assert mock_reviewer.call_count == 2 + mock_worker.assert_called_once() + + # Check feedback was passed to worker + worker_call = mock_worker.call_args + assert "Missing error handling" in worker_call[1]["feedback"] + + @pytest.mark.asyncio + async def test_max_retries_exhausted_exits_success(self, tmp_path: Path): + """Test that max retries exhausted exits with success (BR-005).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create review.md with 2 retries + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 2\n---\nReview instructions") + + with ( + patch("entrypoint.run_reviewer_agent") as mock_reviewer, + patch("entrypoint.run_worker_with_feedback") as mock_worker, + ): + # Always reject + mock_reviewer.return_value = "REJECTED: Still not good" + mock_worker.return_value = True + + from entrypoint import run_review_loop + + result = await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test task", + task_description="Description", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + # Should exit with success even after max retries + assert result is True + assert mock_reviewer.call_count == 2 + + @pytest.mark.asyncio + async def test_uses_frontmatter_max_retries(self, tmp_path: Path): + """Test that max_retries from frontmatter is enforced (SC-004).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create review.md with 5 retries + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 5\n---\nReview instructions") + + with ( + patch("entrypoint.run_reviewer_agent") as mock_reviewer, + patch("entrypoint.run_worker_with_feedback") as mock_worker, + ): + mock_reviewer.return_value = "REJECTED" + mock_worker.return_value = True + + from entrypoint import run_review_loop + + result = await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test task", + task_description="Description", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + # Should have run 5 review cycles + assert mock_reviewer.call_count == 5 + assert result is True + + @pytest.mark.asyncio + async def test_uses_env_var_fallback(self, tmp_path: Path, monkeypatch): + """Test that AUTO_REVIEW_MAX_RETRIES env var is used as fallback (SC-005).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + monkeypatch.setenv("AUTO_REVIEW_MAX_RETRIES", "4") + + # Create review.md without frontmatter + review_md = tmp_path / "review.md" + review_md.write_text("Review instructions only, no frontmatter") + + with ( + patch("entrypoint.run_reviewer_agent") as mock_reviewer, + patch("entrypoint.run_worker_with_feedback") as mock_worker, + ): + mock_reviewer.return_value = "REJECTED" + mock_worker.return_value = True + + from entrypoint import run_review_loop + + result = await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test task", + task_description="Description", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + # Should have run 4 review cycles (from env var) + assert mock_reviewer.call_count == 4 + assert result is True + + @pytest.mark.asyncio + async def test_cycle_file_written_to_correct_path(self, tmp_path: Path): + """Test that cycle file is written to .forge/{step_name}/review_cycle_N.json (SC-007).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 1\n---\nReview") + + with patch("entrypoint.run_reviewer_agent") as mock_reviewer: + mock_reviewer.return_value = "APPROVED" + + from entrypoint import run_review_loop + + await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test", + task_description="Desc", + guardrails="", + skill_name="my-custom-skill", + review_md_path=review_md, + ) + + # Check file path matches spec + cycle_file = tmp_path / ".forge" / "my-custom-skill" / "review_cycle_1.json" + assert cycle_file.exists() + + @pytest.mark.asyncio + async def test_reviewer_agent_receives_instructions(self, tmp_path: Path): + """Test that reviewer agent receives review.md instructions (SC-001).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 1\n---\nCheck for security issues and code quality") + + with patch("entrypoint.run_reviewer_agent") as mock_reviewer: + mock_reviewer.return_value = "APPROVED" + + from entrypoint import run_review_loop + + await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test", + task_description="Desc", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + # Check reviewer was called with instructions + call_kwargs = mock_reviewer.call_args[1] + assert "Check for security issues" in call_kwargs["review_instructions"] + + @pytest.mark.asyncio + async def test_cycle_timing_tracked(self, tmp_path: Path): + """Test that cycle timing is tracked with time.perf_counter().""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 1\n---\nReview") + + with patch("entrypoint.run_reviewer_agent") as mock_reviewer: + mock_reviewer.return_value = "APPROVED" + + from entrypoint import run_review_loop + + await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test", + task_description="Desc", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + cycle_file = tmp_path / ".forge" / "test-skill" / "review_cycle_1.json" + cycle_data = json.loads(cycle_file.read_text()) + + # elapsed_seconds should be a positive float + assert isinstance(cycle_data["elapsed_seconds"], float) + assert cycle_data["elapsed_seconds"] >= 0 + + @pytest.mark.asyncio + async def test_reviewer_error_treated_as_rejection(self, tmp_path: Path): + """Test that reviewer agent errors are treated as rejections.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 1\n---\nReview") + + with patch("entrypoint.run_reviewer_agent") as mock_reviewer: + mock_reviewer.side_effect = RuntimeError("API Error") + + from entrypoint import run_review_loop + + result = await run_review_loop( + workspace=tmp_path, + task_key="TEST-123", + task_summary="Test", + task_description="Desc", + guardrails="", + skill_name="test-skill", + review_md_path=review_md, + ) + + # Should still exit successfully after max retries + assert result is True + + # Check that rejection was recorded + cycle_file = tmp_path / ".forge" / "test-skill" / "review_cycle_1.json" + cycle_data = json.loads(cycle_file.read_text()) + assert cycle_data["verdict"] == "rejected" + + +# --------------------------------------------------------------------------- +# Test main function review loop integration +# --------------------------------------------------------------------------- + + +class TestMainReviewLoopIntegration: + def test_skips_review_when_no_skill_name(self, tmp_path: Path, monkeypatch): + """Test that review loop is skipped when no skill_name provided (SC-010).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create task file without skill_name + task_file = tmp_path / "task.json" + task_file.write_text( + json.dumps( + { + "task_key": "TEST-123", + "summary": "Test task", + "description": "Test description", + } + ) + ) + + # Create workspace + workspace = tmp_path / "workspace" + workspace.mkdir() + + monkeypatch.setenv("FORGE_SYSTEM_PROMPT_TEMPLATE", "Test prompt {task_key}") + monkeypatch.chdir(workspace) + + with ( + patch("entrypoint.asyncio.run") as mock_asyncio_run, + patch("entrypoint.configure_git"), + patch("entrypoint.subprocess.run") as mock_subprocess, + ): + mock_asyncio_run.return_value = True + mock_subprocess.return_value = MagicMock(returncode=0) + + from entrypoint import main + + with pytest.raises(SystemExit) as exc_info: + sys.argv = [ + "entrypoint.py", + "--task-file", + str(task_file), + "--workspace", + str(workspace), + ] + main() + + # Should exit successfully + assert exc_info.value.code == 0 + + def test_skips_review_when_no_review_md(self, tmp_path: Path, monkeypatch): + """Test that review loop is skipped when review.md doesn't exist (SC-010).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create task file with skill_name + task_file = tmp_path / "task.json" + task_file.write_text( + json.dumps( + { + "task_key": "TEST-123", + "summary": "Test task", + "description": "Test description", + "skill_name": "nonexistent-skill", + } + ) + ) + + # Create workspace + workspace = tmp_path / "workspace" + workspace.mkdir() + + monkeypatch.setenv("FORGE_SYSTEM_PROMPT_TEMPLATE", "Test prompt {task_key}") + monkeypatch.chdir(workspace) + + with ( + patch("entrypoint.asyncio.run") as mock_asyncio_run, + patch("entrypoint.configure_git"), + patch("entrypoint.subprocess.run") as mock_subprocess, + patch("entrypoint.detect_review_md") as mock_detect, + ): + mock_asyncio_run.return_value = True + mock_subprocess.return_value = MagicMock(returncode=0) + mock_detect.return_value = None # No review.md found + + from entrypoint import main + + with pytest.raises(SystemExit) as exc_info: + sys.argv = [ + "entrypoint.py", + "--task-file", + str(task_file), + "--workspace", + str(workspace), + ] + main() + + # Should exit successfully without running review loop + assert exc_info.value.code == 0 + # run_review_loop should not have been called + # (only run_agent_task via asyncio.run) + assert mock_asyncio_run.call_count == 1 + + def test_runs_review_when_review_md_exists(self, tmp_path: Path, monkeypatch): + """Test that review loop runs when review.md exists (SC-001).""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + # Create task file with skill_name + task_file = tmp_path / "task.json" + task_file.write_text( + json.dumps( + { + "task_key": "TEST-123", + "summary": "Test task", + "description": "Test description", + "skill_name": "test-skill", + } + ) + ) + + # Create workspace + workspace = tmp_path / "workspace" + workspace.mkdir() + + # Create review.md + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 1\n---\nReview instructions") + + monkeypatch.setenv("FORGE_SYSTEM_PROMPT_TEMPLATE", "Test prompt {task_key}") + monkeypatch.chdir(workspace) + + asyncio_call_count = 0 + + def mock_asyncio_side_effect(_coro): + nonlocal asyncio_call_count + asyncio_call_count += 1 + return True + + with ( + patch("entrypoint.asyncio.run") as mock_asyncio_run, + patch("entrypoint.configure_git"), + patch("entrypoint.subprocess.run") as mock_subprocess, + patch("entrypoint.detect_review_md") as mock_detect, + ): + mock_asyncio_run.side_effect = mock_asyncio_side_effect + mock_subprocess.return_value = MagicMock(returncode=0) + mock_detect.return_value = review_md + + from entrypoint import main + + with pytest.raises(SystemExit) as exc_info: + sys.argv = [ + "entrypoint.py", + "--task-file", + str(task_file), + "--workspace", + str(workspace), + ] + main() + + assert exc_info.value.code == 0 + # Should have called asyncio.run twice: once for run_agent_task, once for run_review_loop + assert asyncio_call_count == 2 + + +# --------------------------------------------------------------------------- +# Test _print_review_progress (SC-011) +# --------------------------------------------------------------------------- + + +class TestPrintReviewProgress: + """Tests for terminal progress display in local mode (SC-011).""" + + def test_prints_progress_when_tty(self, capsys): + """Test progress is printed when stdout is a TTY.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + with patch("entrypoint._IS_TTY", True): + # Need to reload the function to pick up the mocked _IS_TTY + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + # Re-patch after reload + with patch.object(entrypoint, "_IS_TTY", True): + entrypoint._print_review_progress(1, 3, "rejected", "Missing tests") + + captured = capsys.readouterr() + assert "Review cycle 1/3: REJECTED - Missing tests" in captured.out + + def test_no_output_when_not_tty(self, capsys): + """Test no output when stdout is not a TTY.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + with patch.object(entrypoint, "_IS_TTY", False): + entrypoint._print_review_progress(1, 3, "rejected", "Missing tests") + + captured = capsys.readouterr() + assert captured.out == "" + + def test_feedback_truncated_at_200_chars(self, capsys): + """Test that feedback is truncated to 200 characters with ellipsis.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + long_feedback = "x" * 300 # 300 characters + + with patch.object(entrypoint, "_IS_TTY", True): + entrypoint._print_review_progress(2, 5, "rejected", long_feedback) + + captured = capsys.readouterr() + # Should have 200 chars + "..." + assert "x" * 200 + "..." in captured.out + # Should NOT have 201 x's + assert "x" * 201 not in captured.out + + def test_feedback_not_truncated_when_under_200(self, capsys): + """Test that feedback under 200 chars is not truncated.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + short_feedback = "Fix the error handling in function xyz" + + with patch.object(entrypoint, "_IS_TTY", True): + entrypoint._print_review_progress(1, 3, "rejected", short_feedback) + + captured = capsys.readouterr() + assert short_feedback in captured.out + assert "..." not in captured.out + + def test_verdict_uppercased(self, capsys): + """Test that verdict is uppercased in output.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + with patch.object(entrypoint, "_IS_TTY", True): + entrypoint._print_review_progress(1, 3, "approved", "") + + captured = capsys.readouterr() + assert "APPROVED" in captured.out + assert "approved" not in captured.out + + def test_no_feedback_shows_just_verdict(self, capsys): + """Test that empty feedback shows just the verdict without dash.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + with patch.object(entrypoint, "_IS_TTY", True): + entrypoint._print_review_progress(3, 3, "approved", "") + + captured = capsys.readouterr() + assert "Review cycle 3/3: APPROVED" in captured.out + assert " - " not in captured.out + + def test_format_matches_spec(self, capsys): + """Test output format matches spec: 'Review cycle N/M: VERDICT - feedback'.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + with patch.object(entrypoint, "_IS_TTY", True): + entrypoint._print_review_progress(1, 3, "rejected", "Please add error handling") + + captured = capsys.readouterr() + expected = "Review cycle 1/3: REJECTED - Please add error handling\n" + assert captured.out == expected + + def test_feedback_exactly_200_chars_not_truncated(self, capsys): + """Test that feedback exactly 200 chars is not truncated.""" + import sys + + sys.path.insert(0, str(Path(__file__).parents[3] / "containers")) + + import importlib + + import entrypoint + + importlib.reload(entrypoint) + + exact_feedback = "y" * 200 + + with patch.object(entrypoint, "_IS_TTY", True): + entrypoint._print_review_progress(1, 2, "rejected", exact_feedback) + + captured = capsys.readouterr() + assert exact_feedback in captured.out + assert "..." not in captured.out diff --git a/tests/unit/containers/test_review.py b/tests/unit/containers/test_review.py new file mode 100644 index 00000000..5107b1c9 --- /dev/null +++ b/tests/unit/containers/test_review.py @@ -0,0 +1,1133 @@ +"""Unit tests for containers.review module.""" + +import json +from pathlib import Path + +import pytest +from review import ( + DEFAULT_MAX_RETRIES, + ENV_MAX_RETRIES, + ReviewConfig, + ReviewCycleData, + Verdict, + detect_review_md, + parse_review_config, + parse_verdict, + write_cycle_file, +) + +# --------------------------------------------------------------------------- +# Verdict enum tests +# --------------------------------------------------------------------------- + + +class TestVerdict: + def test_approved_value(self): + assert Verdict.APPROVED == "approved" + assert Verdict.APPROVED.value == "approved" + + def test_rejected_value(self): + assert Verdict.REJECTED == "rejected" + assert Verdict.REJECTED.value == "rejected" + + def test_is_str_enum(self): + # StrEnum values can be used directly as strings + assert f"Verdict: {Verdict.APPROVED}" == "Verdict: approved" + + +# --------------------------------------------------------------------------- +# ReviewConfig dataclass tests +# --------------------------------------------------------------------------- + + +class TestReviewConfig: + def test_default_values(self): + config = ReviewConfig() + assert config.max_retries == DEFAULT_MAX_RETRIES + assert config.instructions == "" + + def test_custom_values(self): + config = ReviewConfig(max_retries=5, instructions="Check for bugs") + assert config.max_retries == 5 + assert config.instructions == "Check for bugs" + + def test_default_max_retries_is_3(self): + assert DEFAULT_MAX_RETRIES == 3 + + +# --------------------------------------------------------------------------- +# ReviewCycleData dataclass tests +# --------------------------------------------------------------------------- + + +class TestReviewCycleData: + def test_all_fields_required(self): + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Looks good!", + skill="code-review", + elapsed_seconds=12.5, + timestamp="2024-01-15T10:30:00Z", + ) + assert data.cycle == 1 + assert data.max_cycles == 3 + assert data.verdict == "approved" + assert data.feedback == "Looks good!" + assert data.skill == "code-review" + assert data.elapsed_seconds == 12.5 + assert data.timestamp == "2024-01-15T10:30:00Z" + + def test_verdict_can_be_approved_or_rejected(self): + approved = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict=Verdict.APPROVED, + feedback="", + skill="test", + elapsed_seconds=1.0, + timestamp="2024-01-01T00:00:00Z", + ) + assert approved.verdict == "approved" + + rejected = ReviewCycleData( + cycle=2, + max_cycles=3, + verdict=Verdict.REJECTED, + feedback="Needs work", + skill="test", + elapsed_seconds=2.0, + timestamp="2024-01-01T00:00:00Z", + ) + assert rejected.verdict == "rejected" + + +# --------------------------------------------------------------------------- +# parse_review_config tests +# --------------------------------------------------------------------------- + + +class TestParseReviewConfig: + """Tests for parse_review_config function.""" + + def test_parse_valid_frontmatter(self, tmp_path: Path): + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +max_retries: 5 +--- +Review the code for security issues. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == 5 + assert config.instructions == "Review the code for security issues." + + def test_parse_only_instructions_no_frontmatter(self, tmp_path: Path): + review_md = tmp_path / "review.md" + review_md.write_text("Just some instructions, no frontmatter.") + + config = parse_review_config(review_md) + assert config.max_retries == DEFAULT_MAX_RETRIES + assert config.instructions == "Just some instructions, no frontmatter." + + def test_parse_empty_frontmatter(self, tmp_path: Path): + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +--- +Instructions after empty frontmatter. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == DEFAULT_MAX_RETRIES + assert config.instructions == "Instructions after empty frontmatter." + + def test_file_not_found_returns_defaults(self, tmp_path: Path): + review_md = tmp_path / "nonexistent.md" + config = parse_review_config(review_md) + assert config.max_retries == DEFAULT_MAX_RETRIES + assert config.instructions == "" + + def test_malformed_yaml_logs_warning_and_returns_defaults( + self, tmp_path: Path, caplog: pytest.LogCaptureFixture + ): + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +max_retries: [invalid yaml +--- +Some instructions. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == DEFAULT_MAX_RETRIES + assert config.instructions == "Some instructions." + assert "Malformed YAML" in caplog.text + + def test_env_var_fallback_when_no_frontmatter_value( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + monkeypatch.setenv(ENV_MAX_RETRIES, "7") + + review_md = tmp_path / "review.md" + review_md.write_text("No frontmatter, just instructions.") + + config = parse_review_config(review_md) + assert config.max_retries == 7 + assert config.instructions == "No frontmatter, just instructions." + + def test_frontmatter_overrides_env_var(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv(ENV_MAX_RETRIES, "7") + + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +max_retries: 2 +--- +Instructions here. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == 2 + + def test_invalid_env_var_is_ignored( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture + ): + monkeypatch.setenv(ENV_MAX_RETRIES, "not-a-number") + + review_md = tmp_path / "review.md" + review_md.write_text("Instructions only.") + + config = parse_review_config(review_md) + assert config.max_retries == DEFAULT_MAX_RETRIES + assert "Invalid AUTO_REVIEW_MAX_RETRIES" in caplog.text + + def test_non_integer_max_retries_in_frontmatter_uses_default( + self, tmp_path: Path, caplog: pytest.LogCaptureFixture + ): + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +max_retries: "five" +--- +Instructions. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == DEFAULT_MAX_RETRIES + assert "Invalid max_retries" in caplog.text + + def test_multiline_instructions(self, tmp_path: Path): + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +max_retries: 4 +--- +Line one. +Line two. +Line three. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == 4 + assert "Line one." in config.instructions + assert "Line two." in config.instructions + assert "Line three." in config.instructions + + def test_frontmatter_with_extra_fields_ignored(self, tmp_path: Path): + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +max_retries: 6 +author: test +some_other_field: value +--- +Instructions. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == 6 + assert config.instructions == "Instructions." + + def test_env_var_fallback_for_file_not_found( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + monkeypatch.setenv(ENV_MAX_RETRIES, "10") + + review_md = tmp_path / "nonexistent.md" + config = parse_review_config(review_md) + assert config.max_retries == 10 + assert config.instructions == "" + + def test_env_var_fallback_for_malformed_yaml( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ): + monkeypatch.setenv(ENV_MAX_RETRIES, "8") + + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +: invalid: yaml: +--- +Some instructions. +""" + ) + config = parse_review_config(review_md) + assert config.max_retries == 8 + assert config.instructions == "Some instructions." + + def test_no_trailing_newline_after_frontmatter(self, tmp_path: Path): + review_md = tmp_path / "review.md" + review_md.write_text("---\nmax_retries: 3\n---\nInstructions.") + config = parse_review_config(review_md) + assert config.max_retries == 3 + assert config.instructions == "Instructions." + + def test_whitespace_handling_in_instructions(self, tmp_path: Path): + review_md = tmp_path / "review.md" + review_md.write_text( + """\ +--- +max_retries: 1 +--- + + Indented instructions with leading/trailing whitespace. + +""" + ) + config = parse_review_config(review_md) + # Instructions should be stripped + assert config.instructions == "Indented instructions with leading/trailing whitespace." + + +# --------------------------------------------------------------------------- +# detect_review_md tests +# --------------------------------------------------------------------------- + + +class TestDetectReviewMd: + """Tests for detect_review_md function with project override precedence.""" + + def test_returns_project_override_when_both_exist(self, tmp_path: Path): + """SC-006: Project override wins when both default and project exist.""" + skills_base = tmp_path / "skills" + + # Create default skill review.md + default_dir = skills_base / "default" / "local-code-review" + default_dir.mkdir(parents=True) + default_review = default_dir / "review.md" + default_review.write_text("Default instructions") + + # Create project override review.md + project_dir = skills_base / "aisos" / "local-code-review" + project_dir.mkdir(parents=True) + project_review = project_dir / "review.md" + project_review.write_text("Project override instructions") + + result = detect_review_md("local-code-review", "AISOS-123", skills_base) + + assert result == project_review + assert result.read_text() == "Project override instructions" + + def test_returns_default_when_only_default_exists(self, tmp_path: Path): + """Returns default path when no project override exists.""" + skills_base = tmp_path / "skills" + + # Create only default skill review.md + default_dir = skills_base / "default" / "code-review" + default_dir.mkdir(parents=True) + default_review = default_dir / "review.md" + default_review.write_text("Default instructions") + + result = detect_review_md("code-review", "PROJ-456", skills_base) + + assert result == default_review + assert result.read_text() == "Default instructions" + + def test_returns_none_when_no_review_md_exists(self, tmp_path: Path): + """SC-010: Returns None when review.md doesn't exist in either location.""" + skills_base = tmp_path / "skills" + + # Create skill directories without review.md + default_dir = skills_base / "default" / "test-skill" + default_dir.mkdir(parents=True) + + project_dir = skills_base / "myproj" / "test-skill" + project_dir.mkdir(parents=True) + + result = detect_review_md("test-skill", "MYPROJ-789", skills_base) + + assert result is None + + def test_returns_none_when_skills_base_empty(self, tmp_path: Path): + """Returns None when skills_base is empty.""" + skills_base = tmp_path / "skills" + skills_base.mkdir(parents=True) + + result = detect_review_md("any-skill", "PROJ-1", skills_base) + + assert result is None + + def test_project_key_extracted_and_lowercased(self, tmp_path: Path): + """Project key is correctly extracted and lowercased from ticket_key.""" + skills_base = tmp_path / "skills" + + # Create project skill with lowercase directory + project_dir = skills_base / "myproject" / "review-skill" + project_dir.mkdir(parents=True) + project_review = project_dir / "review.md" + project_review.write_text("Project review") + + # Ticket key has uppercase project + result = detect_review_md("review-skill", "MYPROJECT-999", skills_base) + + assert result == project_review + + def test_handles_ticket_key_without_hyphen(self, tmp_path: Path): + """Falls back to default when ticket_key has no hyphen (no project).""" + skills_base = tmp_path / "skills" + + # Create default skill review.md + default_dir = skills_base / "default" / "test-skill" + default_dir.mkdir(parents=True) + default_review = default_dir / "review.md" + default_review.write_text("Default instructions") + + result = detect_review_md("test-skill", "INVALID", skills_base) + + assert result == default_review + + def test_returns_none_for_ticket_without_hyphen_and_no_default(self, tmp_path: Path): + """Returns None when ticket has no hyphen and no default exists.""" + skills_base = tmp_path / "skills" + skills_base.mkdir(parents=True) + + result = detect_review_md("test-skill", "NOHYPHEN", skills_base) + + assert result is None + + def test_ignores_project_directory_without_review_md(self, tmp_path: Path): + """Falls back to default when project dir exists but has no review.md.""" + skills_base = tmp_path / "skills" + + # Create project directory without review.md + project_dir = skills_base / "proj" / "skill-name" + project_dir.mkdir(parents=True) + (project_dir / "other-file.txt").write_text("not review.md") + + # Create default with review.md + default_dir = skills_base / "default" / "skill-name" + default_dir.mkdir(parents=True) + default_review = default_dir / "review.md" + default_review.write_text("Default review") + + result = detect_review_md("skill-name", "PROJ-1", skills_base) + + assert result == default_review + + def test_returns_default_when_project_dir_does_not_exist(self, tmp_path: Path): + """Returns default when project directory doesn't exist at all.""" + skills_base = tmp_path / "skills" + + # Only create default + default_dir = skills_base / "default" / "my-skill" + default_dir.mkdir(parents=True) + default_review = default_dir / "review.md" + default_review.write_text("Default instructions") + + # No project directory created + result = detect_review_md("my-skill", "PROJ-100", skills_base) + + assert result == default_review + + def test_does_not_match_directory_named_review_md(self, tmp_path: Path): + """Ensures we check for file, not directory named review.md.""" + skills_base = tmp_path / "skills" + + # Create a directory named review.md (edge case) + default_dir = skills_base / "default" / "edge-skill" + default_dir.mkdir(parents=True) + (default_dir / "review.md").mkdir() # This is a directory, not a file + + result = detect_review_md("edge-skill", "TEST-1", skills_base) + + assert result is None + + def test_multi_hyphen_ticket_key(self, tmp_path: Path): + """Correctly extracts project from ticket keys with multiple hyphens.""" + skills_base = tmp_path / "skills" + + # Create project skill + project_dir = skills_base / "proj" / "multi-hyphen-skill" + project_dir.mkdir(parents=True) + project_review = project_dir / "review.md" + project_review.write_text("Project review") + + # Skill name also has hyphens, ticket key is "PROJ-123" + result = detect_review_md("multi-hyphen-skill", "PROJ-123", skills_base) + + assert result == project_review + + +# --------------------------------------------------------------------------- +# parse_verdict tests +# --------------------------------------------------------------------------- + + +class TestParseVerdict: + """Tests for parse_verdict function (SC-002).""" + + # ----- APPROVED marker tests ----- + + def test_approved_uppercase(self): + """APPROVED marker in uppercase returns (APPROVED, '').""" + result = parse_verdict("The code looks good. APPROVED") + assert result == (Verdict.APPROVED, "") + + def test_approved_lowercase(self): + """Case-insensitive: 'approved' returns (APPROVED, '').""" + result = parse_verdict("Code review complete. approved") + assert result == (Verdict.APPROVED, "") + + def test_approved_mixed_case(self): + """Case-insensitive: 'Approved' returns (APPROVED, '').""" + result = parse_verdict("All tests pass. Approved") + assert result == (Verdict.APPROVED, "") + + def test_approved_with_text_before_and_after(self): + """APPROVED marker in middle of text still returns (APPROVED, '').""" + result = parse_verdict("Summary: Code is clean. APPROVED. No further changes needed.") + assert result == (Verdict.APPROVED, "") + + def test_approved_at_start(self): + """APPROVED at start of text.""" + result = parse_verdict("APPROVED - code meets all requirements") + assert result == (Verdict.APPROVED, "") + + # ----- REJECTED marker tests ----- + + def test_rejected_uppercase(self): + """REJECTED marker returns (REJECTED, feedback).""" + result = parse_verdict("REJECTED: Code has security issues.") + assert result[0] == Verdict.REJECTED + assert result[1] == ": Code has security issues." + + def test_rejected_lowercase(self): + """Case-insensitive: 'rejected' returns (REJECTED, feedback).""" + result = parse_verdict("Code review result: rejected due to missing tests.") + assert result[0] == Verdict.REJECTED + assert result[1] == "due to missing tests." + + def test_rejected_mixed_case(self): + """Case-insensitive: 'Rejected' returns (REJECTED, feedback).""" + result = parse_verdict("Rejected - needs refactoring.") + assert result[0] == Verdict.REJECTED + assert result[1] == "- needs refactoring." + + def test_rejected_extracts_feedback_after_marker(self): + """Feedback is all text after REJECTED marker.""" + result = parse_verdict( + "Review: REJECTED\n\nPlease fix the following:\n1. Bug in line 42\n2. Missing docstring" + ) + assert result[0] == Verdict.REJECTED + assert "Please fix the following:" in result[1] + assert "Bug in line 42" in result[1] + assert "Missing docstring" in result[1] + + def test_rejected_feedback_is_stripped(self): + """Feedback text is stripped of leading/trailing whitespace.""" + result = parse_verdict("REJECTED \n\n Needs work. \n\n") + assert result[0] == Verdict.REJECTED + assert result[1] == "Needs work." + + def test_rejected_with_empty_feedback(self): + """SC-003: Empty feedback after REJECTED marker is handled correctly.""" + result = parse_verdict("REJECTED") + assert result[0] == Verdict.REJECTED + assert result[1] == "" + + def test_rejected_with_only_whitespace_feedback(self): + """Whitespace-only feedback after REJECTED is stripped to empty string.""" + result = parse_verdict("REJECTED \n\n \t \n") + assert result[0] == Verdict.REJECTED + assert result[1] == "" + + # ----- Neither marker present ----- + + def test_neither_marker_returns_rejected_with_error_message(self): + """Neither marker present returns (REJECTED, 'Verdict could not be parsed').""" + result = parse_verdict("This review output has no clear verdict.") + assert result == (Verdict.REJECTED, "Verdict could not be parsed") + + def test_empty_string_returns_rejected_with_error_message(self): + """Empty string returns (REJECTED, 'Verdict could not be parsed').""" + result = parse_verdict("") + assert result == (Verdict.REJECTED, "Verdict could not be parsed") + + def test_whitespace_only_returns_rejected_with_error_message(self): + """Whitespace-only string returns (REJECTED, 'Verdict could not be parsed').""" + result = parse_verdict(" \n\t\n ") + assert result == (Verdict.REJECTED, "Verdict could not be parsed") + + # ----- Both markers present ----- + + def test_both_markers_approved_first_wins(self): + """When both markers present, APPROVED first wins.""" + result = parse_verdict("Code is APPROVED, not REJECTED because tests pass.") + assert result == (Verdict.APPROVED, "") + + def test_both_markers_rejected_first_wins(self): + """When both markers present, REJECTED first wins if it comes first.""" + result = parse_verdict( + "This code is REJECTED. It would have been APPROVED if tests passed." + ) + assert result[0] == Verdict.REJECTED + assert "It would have been APPROVED if tests passed." in result[1] + + def test_both_markers_same_position_edge_case(self): + """Edge case: if somehow both start at same position, check behavior. + + In practice this can't happen since they're different strings, + but we verify APPROVED is checked first per spec. + """ + # This tests the logic: APPROVED found, REJECTED found, but APPROVED position is smaller + text = "APPROVED followed by REJECTED" + result = parse_verdict(text) + assert result == (Verdict.APPROVED, "") + + # ----- Partial matches should not be detected ----- + + def test_approved_as_word_in_middle(self): + """APPROVED as substring in longer word is still detected (per current impl).""" + # NOTE: The spec doesn't mention word boundaries, so "APPROVED" in "UNAPPROVED" + # would still match. This documents current behavior. + result = parse_verdict("This is PREAPPROVED for the next phase") + assert result == (Verdict.APPROVED, "") + + def test_rejected_as_word_in_middle(self): + """REJECTED as substring is still detected (per current impl).""" + result = parse_verdict("Previously rejected items were fixed") + # "rejected" is found in the middle + assert result[0] == Verdict.REJECTED + + # ----- Complex real-world scenarios ----- + + def test_real_world_approved_review(self): + """Real-world style APPROVED review.""" + review = """ +## Code Review Summary + +The implementation looks good and follows the coding standards. +All tests pass and documentation is adequate. + +**Verdict: APPROVED** + +No further changes required. +""" + result = parse_verdict(review) + assert result == (Verdict.APPROVED, "") + + def test_real_world_rejected_review(self): + """Real-world style REJECTED review with detailed feedback.""" + review = """ +## Code Review Summary + +The implementation has several issues that need to be addressed. + +**Verdict: REJECTED** + +### Issues Found: +1. Missing error handling in `parse_data()` function +2. No unit tests for edge cases +3. Documentation needs to be updated + +### Recommendations: +- Add try/except blocks for file operations +- Add tests for empty input and malformed data +""" + result = parse_verdict(review) + assert result[0] == Verdict.REJECTED + assert "Missing error handling" in result[1] + assert "No unit tests for edge cases" in result[1] + assert "Recommendations:" in result[1] + + def test_multiline_approved(self): + """APPROVED marker on its own line.""" + review = """ +Review complete. + +APPROVED + +Ship it! +""" + result = parse_verdict(review) + assert result == (Verdict.APPROVED, "") + + +# --------------------------------------------------------------------------- +# write_cycle_file tests +# --------------------------------------------------------------------------- + + +class TestWriteCycleFile: + """Tests for write_cycle_file function (SC-007).""" + + def test_creates_step_directory(self, tmp_path: Path): + """Creates .forge/{step_name}/ directory if it doesn't exist.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="local-code-review", + elapsed_seconds=15.5, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "implement", cycle_data) + + step_dir = workspace / ".forge" / "implement" + assert step_dir.exists() + assert step_dir.is_dir() + + def test_creates_nested_forge_and_step_directories(self, tmp_path: Path): + """Creates both .forge/ and step subdirectory when neither exists.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="code-review", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "my-step", cycle_data) + + assert (workspace / ".forge").exists() + assert (workspace / ".forge" / "my-step").exists() + + def test_writes_review_cycle_n_json(self, tmp_path: Path): + """File written to .forge/{step_name}/review_cycle_N.json.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=2, + max_cycles=5, + verdict="rejected", + feedback="Needs work", + skill="test-skill", + elapsed_seconds=20.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step-name", cycle_data) + + output_file = workspace / ".forge" / "step-name" / "review_cycle_2.json" + assert output_file.exists() + assert output_file.is_file() + + def test_json_contains_all_required_fields(self, tmp_path: Path): + """SC-007: JSON contains all ReviewCycleData fields.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Looks good!", + skill="local-code-review", + elapsed_seconds=12.5, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "implement", cycle_data) + + output_file = workspace / ".forge" / "implement" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["cycle"] == 1 + assert data["max_cycles"] == 3 + assert data["verdict"] == "approved" + assert data["feedback"] == "Looks good!" + assert data["skill"] == "local-code-review" + assert data["elapsed_seconds"] == 12.5 + assert data["timestamp"] == "2024-01-15T10:30:00Z" + + def test_timestamp_iso_8601_utc_format(self, tmp_path: Path): + """Timestamp is ISO 8601 UTC format.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + # ISO 8601 UTC timestamp + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-06-20T14:30:45Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + # Verify timestamp format preserved + assert data["timestamp"] == "2024-06-20T14:30:45Z" + + def test_verdict_lowercase_approved(self, tmp_path: Path): + """Verdict is lowercase string 'approved'.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["verdict"] == "approved" + assert data["verdict"].islower() + + def test_verdict_lowercase_rejected(self, tmp_path: Path): + """Verdict is lowercase string 'rejected'.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="Fix the bugs", + skill="test", + elapsed_seconds=8.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["verdict"] == "rejected" + assert data["verdict"].islower() + + def test_verdict_enum_converted_to_lowercase(self, tmp_path: Path): + """Verdict.APPROVED enum is converted to lowercase string.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict=Verdict.APPROVED, + feedback="", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["verdict"] == "approved" + + def test_verdict_enum_rejected_converted_to_lowercase(self, tmp_path: Path): + """Verdict.REJECTED enum is converted to lowercase string.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict=Verdict.REJECTED, + feedback="Needs work", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["verdict"] == "rejected" + + def test_multiple_cycles_written_separately(self, tmp_path: Path): + """Multiple cycles are written to separate files.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle1 = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="First attempt needs work", + skill="code-review", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + cycle2 = ReviewCycleData( + cycle=2, + max_cycles=3, + verdict="rejected", + feedback="Still has issues", + skill="code-review", + elapsed_seconds=12.0, + timestamp="2024-01-15T10:35:00Z", + ) + cycle3 = ReviewCycleData( + cycle=3, + max_cycles=3, + verdict="approved", + feedback="", + skill="code-review", + elapsed_seconds=8.0, + timestamp="2024-01-15T10:40:00Z", + ) + + write_cycle_file(workspace, "impl", cycle1) + write_cycle_file(workspace, "impl", cycle2) + write_cycle_file(workspace, "impl", cycle3) + + # All three files should exist + assert (workspace / ".forge" / "impl" / "review_cycle_1.json").exists() + assert (workspace / ".forge" / "impl" / "review_cycle_2.json").exists() + assert (workspace / ".forge" / "impl" / "review_cycle_3.json").exists() + + # Verify content of each + data1 = json.loads((workspace / ".forge" / "impl" / "review_cycle_1.json").read_text()) + data2 = json.loads((workspace / ".forge" / "impl" / "review_cycle_2.json").read_text()) + data3 = json.loads((workspace / ".forge" / "impl" / "review_cycle_3.json").read_text()) + + assert data1["verdict"] == "rejected" + assert data2["verdict"] == "rejected" + assert data3["verdict"] == "approved" + + def test_json_format_is_pretty_printed(self, tmp_path: Path): + """JSON is formatted with indentation for readability.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + content = output_file.read_text(encoding="utf-8") + + # Should have newlines (pretty printed) + assert "\n" in content + # Should have indentation + assert " " in content + + def test_overwrites_existing_file(self, tmp_path: Path): + """Overwrites existing cycle file if run again with same cycle number.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + # Write first version + cycle_data1 = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="First feedback", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + write_cycle_file(workspace, "step", cycle_data1) + + # Overwrite with second version + cycle_data2 = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Updated feedback", + skill="test", + elapsed_seconds=8.0, + timestamp="2024-01-15T10:35:00Z", + ) + write_cycle_file(workspace, "step", cycle_data2) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + # Should have the updated values + assert data["verdict"] == "approved" + assert data["feedback"] == "Updated feedback" + assert data["elapsed_seconds"] == 8.0 + + def test_empty_feedback(self, tmp_path: Path): + """Empty feedback string is preserved.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["feedback"] == "" + + def test_multiline_feedback(self, tmp_path: Path): + """Multiline feedback is preserved correctly.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + multiline_feedback = """Fix the following issues: +1. Missing error handling +2. No unit tests +3. Documentation incomplete""" + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback=multiline_feedback, + skill="code-review", + elapsed_seconds=15.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["feedback"] == multiline_feedback + assert "1. Missing error handling" in data["feedback"] + assert "2. No unit tests" in data["feedback"] + + def test_special_characters_in_feedback(self, tmp_path: Path): + """Special characters in feedback are handled correctly.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback='Fix "quotes", , and unicode: émoji 🎉', + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["feedback"] == 'Fix "quotes", , and unicode: émoji 🎉' + + def test_elapsed_seconds_float_precision(self, tmp_path: Path): + """Float precision for elapsed_seconds is preserved.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=123.456789, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "step", cycle_data) + + output_file = workspace / ".forge" / "step" / "review_cycle_1.json" + data = json.loads(output_file.read_text(encoding="utf-8")) + + assert data["elapsed_seconds"] == 123.456789 + + def test_step_name_with_hyphens(self, tmp_path: Path): + """Step name with hyphens is handled correctly.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "local-code-review", cycle_data) + + assert (workspace / ".forge" / "local-code-review" / "review_cycle_1.json").exists() + + def test_step_name_with_underscores(self, tmp_path: Path): + """Step name with underscores is handled correctly.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + + cycle_data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + write_cycle_file(workspace, "my_step_name", cycle_data) + + assert (workspace / ".forge" / "my_step_name" / "review_cycle_1.json").exists() diff --git a/tests/unit/observability/__init__.py b/tests/unit/observability/__init__.py new file mode 100644 index 00000000..da6a2955 --- /dev/null +++ b/tests/unit/observability/__init__.py @@ -0,0 +1 @@ +"""Unit tests for the observability module.""" diff --git a/tests/unit/observability/test_review_notifier.py b/tests/unit/observability/test_review_notifier.py new file mode 100644 index 00000000..75cc75e6 --- /dev/null +++ b/tests/unit/observability/test_review_notifier.py @@ -0,0 +1,565 @@ +"""Unit tests for the ReviewJiraNotifier class.""" + +import logging +import time +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from forge.observability.review_notifier import ( + DEFAULT_RATE_LIMIT_SECONDS, + MAX_FEEDBACK_LENGTH, + NotifyResult, + ReviewJiraNotifier, +) +from forge.observability.review_poller import ReviewCycleData + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mock_jira_client(): + """Create a mock JiraClient.""" + client = MagicMock() + client.add_comment = AsyncMock() + return client + + +@pytest.fixture +def review_cycle_data(): + """Create sample ReviewCycleData.""" + return ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="The code looks good. All tests pass.", + skill="local-code-review", + elapsed_seconds=45.5, + timestamp="2024-01-15T10:30:00Z", + ) + + +@pytest.fixture +def notifier(mock_jira_client): + """Create a ReviewJiraNotifier with default settings.""" + return ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="TEST-123", + ) + + +# --------------------------------------------------------------------------- +# NotifyResult tests +# --------------------------------------------------------------------------- + + +class TestNotifyResult: + """Tests for NotifyResult dataclass.""" + + def test_posted_success(self): + """Test successful post result.""" + result = NotifyResult(posted=True, rate_limited=False) + assert result.posted is True + assert result.rate_limited is False + assert result.error is None + + def test_rate_limited(self): + """Test rate limited result.""" + result = NotifyResult(posted=False, rate_limited=True) + assert result.posted is False + assert result.rate_limited is True + assert result.error is None + + def test_error_result(self): + """Test error result.""" + result = NotifyResult(posted=False, rate_limited=False, error="API Error") + assert result.posted is False + assert result.rate_limited is False + assert result.error == "API Error" + + +# --------------------------------------------------------------------------- +# ReviewJiraNotifier initialization tests +# --------------------------------------------------------------------------- + + +class TestReviewJiraNotifierInit: + """Tests for ReviewJiraNotifier initialization.""" + + def test_init_basic(self, mock_jira_client): + """Test basic initialization with defaults.""" + notifier = ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="PROJ-456", + ) + assert notifier.jira_client is mock_jira_client + assert notifier.ticket_key == "PROJ-456" + assert notifier.rate_limit_seconds == DEFAULT_RATE_LIMIT_SECONDS + assert notifier._last_notify_time is None + + def test_init_custom_rate_limit(self, mock_jira_client): + """Test initialization with custom rate limit.""" + notifier = ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="TEST-123", + rate_limit_seconds=60.0, + ) + assert notifier.rate_limit_seconds == 60.0 + + def test_default_rate_limit_is_30_seconds(self): + """Test that the default rate limit is 30 seconds.""" + assert DEFAULT_RATE_LIMIT_SECONDS == 30.0 + + +# --------------------------------------------------------------------------- +# Rate limiting tests +# --------------------------------------------------------------------------- + + +class TestRateLimiting: + """Tests for rate limiting functionality.""" + + def test_not_rate_limited_initially(self, notifier): + """Test that first call is not rate limited.""" + assert notifier._is_rate_limited() is False + + def test_rate_limited_after_notify(self, notifier): + """Test that calls are rate limited after a notify.""" + # Simulate a recent notify by setting the timestamp + notifier._last_notify_time = time.monotonic() + assert notifier._is_rate_limited() is True + + def test_not_rate_limited_after_interval(self, mock_jira_client): + """Test that calls are not rate limited after interval passes.""" + notifier = ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="TEST-123", + rate_limit_seconds=0.1, # Very short for testing + ) + + # Set a timestamp in the past + notifier._last_notify_time = time.monotonic() - 0.2 + assert notifier._is_rate_limited() is False + + def test_reset_rate_limit(self, notifier): + """Test that reset_rate_limit clears the timer.""" + notifier._last_notify_time = time.monotonic() + assert notifier._is_rate_limited() is True + + notifier.reset_rate_limit() + assert notifier._is_rate_limited() is False + + @pytest.mark.asyncio + async def test_notify_returns_rate_limited_result(self, mock_jira_client, review_cycle_data): + """Test that notify returns rate_limited=True when rate limited.""" + notifier = ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="TEST-123", + rate_limit_seconds=60.0, + ) + + # First call should succeed + result1 = await notifier.notify(review_cycle_data) + assert result1.posted is True + assert result1.rate_limited is False + + # Second immediate call should be rate limited + result2 = await notifier.notify(review_cycle_data) + assert result2.posted is False + assert result2.rate_limited is True + + # Jira should only have been called once + assert mock_jira_client.add_comment.call_count == 1 + + +# --------------------------------------------------------------------------- +# Comment formatting tests +# --------------------------------------------------------------------------- + + +class TestCommentFormatting: + """Tests for comment formatting.""" + + def test_format_approved_verdict(self, notifier): + """Test formatting for approved verdict.""" + data = ReviewCycleData( + cycle=2, + max_cycles=5, + verdict="approved", + feedback="All good!", + skill="review-skill", + elapsed_seconds=30.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + assert "Review Cycle 2/5" in comment + assert "✅" in comment + assert "APPROVED" in comment + assert "review-skill" in comment + assert "30.0s" in comment + assert "All good!" in comment + + def test_format_rejected_verdict(self, notifier): + """Test formatting for rejected verdict.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="Needs fixes", + skill="code-review", + elapsed_seconds=25.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + assert "Review Cycle 1/3" in comment + assert "❌" in comment + assert "REJECTED" in comment + assert "Needs fixes" in comment + + def test_format_duration_minutes(self, notifier): + """Test that duration >= 60s is formatted in minutes.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=125.0, # 2.08 minutes + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + assert "2.1m" in comment + + def test_format_duration_seconds(self, notifier): + """Test that duration < 60s is formatted in seconds.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=45.3, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + assert "45.3s" in comment + + def test_format_empty_feedback(self, notifier): + """Test formatting with empty feedback.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="test", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + # Should not include "Feedback:" section + assert "Feedback:" not in comment + + def test_format_whitespace_feedback(self, notifier): + """Test formatting with whitespace-only feedback.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback=" \n\t ", + skill="test", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + # Should not include "Feedback:" section + assert "Feedback:" not in comment + + def test_format_empty_skill(self, notifier): + """Test formatting with empty skill name.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Good", + skill="", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + # Should not include "Skill:" section + assert "Skill:" not in comment + + def test_format_truncates_long_feedback(self, notifier): + """Test that long feedback is truncated.""" + long_feedback = "A" * (MAX_FEEDBACK_LENGTH + 100) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback=long_feedback, + skill="test", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + # Should be truncated with ellipsis + assert "..." in comment + # Should not contain the full feedback + assert long_feedback not in comment + + def test_format_truncates_at_word_boundary(self, notifier): + """Test that truncation prefers word boundaries.""" + # Create feedback with words + words = ["word"] * 150 # More than MAX_FEEDBACK_LENGTH + long_feedback = " ".join(words) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback=long_feedback, + skill="test", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + # Should end with "..." after a complete word + assert "..." in comment + # The truncated content should not cut a word in half + feedback_part = comment.split("*Feedback:*")[1].strip() + assert feedback_part.endswith("word...") + + def test_format_max_feedback_length_constant(self): + """Test that MAX_FEEDBACK_LENGTH is 500.""" + assert MAX_FEEDBACK_LENGTH == 500 + + def test_format_case_insensitive_verdict(self, notifier): + """Test that verdict is uppercased in output.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="Approved", # Mixed case + feedback="", + skill="test", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + comment = notifier._format_comment(data) + + assert "APPROVED" in comment + + +# --------------------------------------------------------------------------- +# Notify tests +# --------------------------------------------------------------------------- + + +class TestNotify: + """Tests for notify method.""" + + @pytest.mark.asyncio + async def test_notify_posts_comment(self, mock_jira_client, notifier, review_cycle_data): + """Test that notify posts a comment to Jira.""" + result = await notifier.notify(review_cycle_data) + + assert result.posted is True + assert result.rate_limited is False + assert result.error is None + mock_jira_client.add_comment.assert_called_once() + + # Check the call args + call_args = mock_jira_client.add_comment.call_args + assert call_args[0][0] == "TEST-123" # ticket_key + assert "Review Cycle 1/3" in call_args[0][1] # comment body + + @pytest.mark.asyncio + async def test_notify_updates_last_notify_time(self, notifier, review_cycle_data): + """Test that notify updates the last notify time.""" + assert notifier._last_notify_time is None + + await notifier.notify(review_cycle_data) + + assert notifier._last_notify_time is not None + + @pytest.mark.asyncio + async def test_notify_handles_api_error( + self, mock_jira_client, notifier, review_cycle_data, caplog + ): + """Test that notify handles Jira API errors gracefully.""" + mock_jira_client.add_comment.side_effect = Exception("API Error") + + with caplog.at_level(logging.WARNING): + result = await notifier.notify(review_cycle_data) + + assert result.posted is False + assert result.rate_limited is False + assert result.error == "API Error" + + # Check warning was logged + assert "Failed to post review comment" in caplog.text + assert "TEST-123" in caplog.text + + @pytest.mark.asyncio + async def test_notify_logs_success(self, notifier, review_cycle_data, caplog): + """Test that notify logs success at INFO level.""" + with caplog.at_level(logging.INFO): + await notifier.notify(review_cycle_data) + + assert "Posted review cycle 1/3 comment to TEST-123" in caplog.text + + @pytest.mark.asyncio + async def test_notify_does_not_update_time_on_error( + self, mock_jira_client, notifier, review_cycle_data + ): + """Test that last_notify_time is not updated on error.""" + mock_jira_client.add_comment.side_effect = Exception("API Error") + + await notifier.notify(review_cycle_data) + + # Time should not be updated on failure + assert notifier._last_notify_time is None + + @pytest.mark.asyncio + async def test_notify_rate_limited_logs_debug( + self, mock_jira_client, review_cycle_data, caplog + ): + """Test that rate limited skips log at DEBUG level.""" + notifier = ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="TEST-123", + rate_limit_seconds=60.0, + ) + + # First call + await notifier.notify(review_cycle_data) + + # Second call should be rate limited + with caplog.at_level(logging.DEBUG): + result = await notifier.notify(review_cycle_data) + + assert result.rate_limited is True + assert "Rate limited" in caplog.text + + +# --------------------------------------------------------------------------- +# Integration-style tests +# --------------------------------------------------------------------------- + + +class TestIntegration: + """Integration-style tests combining multiple features.""" + + @pytest.mark.asyncio + async def test_multiple_cycles_with_rate_limiting(self, mock_jira_client): + """Test posting multiple cycles respects rate limiting.""" + notifier = ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="TEST-123", + rate_limit_seconds=0.1, # Short for testing + ) + + cycle1 = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="First review", + skill="review", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + cycle2 = ReviewCycleData( + cycle=2, + max_cycles=3, + verdict="approved", + feedback="Second review", + skill="review", + elapsed_seconds=15.0, + timestamp="2024-01-15T10:31:00Z", + ) + + # First cycle posts + result1 = await notifier.notify(cycle1) + assert result1.posted is True + + # Immediate second cycle is rate limited + result2 = await notifier.notify(cycle2) + assert result2.rate_limited is True + + # Wait for rate limit to expire + import asyncio + + await asyncio.sleep(0.15) + + # Third attempt should succeed + result3 = await notifier.notify(cycle2) + assert result3.posted is True + + # Verify two comments were posted + assert mock_jira_client.add_comment.call_count == 2 + + @pytest.mark.asyncio + async def test_error_does_not_block_subsequent_posts(self, mock_jira_client): + """Test that an error on one call doesn't block future calls.""" + notifier = ReviewJiraNotifier( + jira_client=mock_jira_client, + ticket_key="TEST-123", + rate_limit_seconds=0.0, # No rate limiting + ) + + cycle = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Test", + skill="review", + elapsed_seconds=10.0, + timestamp="2024-01-15T10:30:00Z", + ) + + # First call fails + mock_jira_client.add_comment.side_effect = Exception("API Error") + result1 = await notifier.notify(cycle) + assert result1.posted is False + assert result1.error == "API Error" + + # Second call succeeds + mock_jira_client.add_comment.side_effect = None + result2 = await notifier.notify(cycle) + assert result2.posted is True + assert result2.error is None + + @pytest.mark.asyncio + async def test_comment_contains_all_cycle_info(self, mock_jira_client, notifier): + """Test that posted comment contains all relevant info.""" + data = ReviewCycleData( + cycle=2, + max_cycles=5, + verdict="rejected", + feedback="Please fix the typo on line 42", + skill="code-review", + elapsed_seconds=90.5, + timestamp="2024-01-15T10:30:00Z", + ) + + await notifier.notify(data) + + call_args = mock_jira_client.add_comment.call_args + comment_body = call_args[0][1] + + # Verify all key information is present + assert "2/5" in comment_body # cycle/max_cycles + assert "REJECTED" in comment_body # verdict + assert "❌" in comment_body # verdict icon + assert "code-review" in comment_body # skill + assert "1.5m" in comment_body # duration (90.5s = 1.5m) + assert "typo on line 42" in comment_body # feedback diff --git a/tests/unit/observability/test_review_poller.py b/tests/unit/observability/test_review_poller.py new file mode 100644 index 00000000..7d222875 --- /dev/null +++ b/tests/unit/observability/test_review_poller.py @@ -0,0 +1,726 @@ +"""Unit tests for the ReviewCyclePoller class.""" + +import json +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from forge.config import Settings +from forge.observability.review_poller import ( + MAX_JSON_PARSE_RETRIES, + ReviewCycleData, + ReviewCyclePoller, +) + +# --------------------------------------------------------------------------- +# ReviewCycleData tests +# --------------------------------------------------------------------------- + + +class TestReviewCycleData: + """Tests for ReviewCycleData dataclass.""" + + def test_all_fields(self): + """Test that all fields are correctly stored.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Looks good!", + skill="code-review", + elapsed_seconds=12.5, + timestamp="2024-01-15T10:30:00Z", + file_path="/path/to/file.json", + ) + assert data.cycle == 1 + assert data.max_cycles == 3 + assert data.verdict == "approved" + assert data.feedback == "Looks good!" + assert data.skill == "code-review" + assert data.elapsed_seconds == 12.5 + assert data.timestamp == "2024-01-15T10:30:00Z" + assert data.file_path == "/path/to/file.json" + + def test_default_file_path(self): + """Test that file_path defaults to empty string.""" + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="Needs work", + skill="review", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + assert data.file_path == "" + + def test_from_dict_all_fields(self): + """Test creating from dict with all fields.""" + input_dict = { + "cycle": 2, + "max_cycles": 5, + "verdict": "rejected", + "feedback": "Fix bugs", + "skill": "local-code-review", + "elapsed_seconds": 8.3, + "timestamp": "2024-01-16T14:20:00Z", + } + data = ReviewCycleData.from_dict(input_dict, file_path="/test/path.json") + + assert data.cycle == 2 + assert data.max_cycles == 5 + assert data.verdict == "rejected" + assert data.feedback == "Fix bugs" + assert data.skill == "local-code-review" + assert data.elapsed_seconds == 8.3 + assert data.timestamp == "2024-01-16T14:20:00Z" + assert data.file_path == "/test/path.json" + + def test_from_dict_minimal_fields(self): + """Test creating from dict with only required fields.""" + input_dict = { + "cycle": 1, + "max_cycles": 3, + "verdict": "approved", + } + data = ReviewCycleData.from_dict(input_dict) + + assert data.cycle == 1 + assert data.max_cycles == 3 + assert data.verdict == "approved" + assert data.feedback == "" + assert data.skill == "" + assert data.elapsed_seconds == 0.0 + assert data.timestamp == "" + assert data.file_path == "" + + def test_from_dict_missing_required_raises(self): + """Test that missing required fields raise KeyError.""" + with pytest.raises(KeyError): + ReviewCycleData.from_dict({"cycle": 1, "max_cycles": 3}) # missing verdict + + with pytest.raises(KeyError): + ReviewCycleData.from_dict({"cycle": 1, "verdict": "approved"}) # missing max_cycles + + +# --------------------------------------------------------------------------- +# ReviewCyclePoller initialization tests +# --------------------------------------------------------------------------- + + +class TestReviewCyclePollerInit: + """Tests for ReviewCyclePoller initialization.""" + + def test_init_basic(self): + """Test basic initialization.""" + poller = ReviewCyclePoller( + workspace_path=Path("/workspace"), + step_name="implement_task", + ) + assert poller.workspace_path == Path("/workspace") + assert poller.step_name == "implement_task" + assert poller._settings is None + assert poller._processed_files == set() + assert poller._running is False + + def test_init_with_settings(self, mock_settings): + """Test initialization with explicit settings.""" + poller = ReviewCyclePoller( + workspace_path=Path("/workspace"), + step_name="generate_prd", + settings=mock_settings, + ) + assert poller._settings is mock_settings + + def test_review_cycle_dir(self): + """Test review_cycle_dir property.""" + poller = ReviewCyclePoller( + workspace_path=Path("/workspace"), + step_name="implement_task", + ) + assert poller.review_cycle_dir == Path("/workspace/.forge/implement_task") + + def test_poll_interval_from_settings(self, mock_settings): + """Test poll_interval reads from settings.""" + mock_settings.auto_review_poll_interval = 10.0 + poller = ReviewCyclePoller( + workspace_path=Path("/workspace"), + step_name="test_step", + settings=mock_settings, + ) + assert poller.poll_interval == 10.0 + + +# --------------------------------------------------------------------------- +# ReviewCyclePoller._get_review_cycle_files tests +# --------------------------------------------------------------------------- + + +class TestGetReviewCycleFiles: + """Tests for _get_review_cycle_files method.""" + + def test_no_directory_returns_empty(self, tmp_path): + """Test that non-existent directory returns empty list.""" + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="nonexistent_step", + ) + assert poller._get_review_cycle_files() == [] + + def test_empty_directory_returns_empty(self, tmp_path): + """Test that empty directory returns empty list.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + assert poller._get_review_cycle_files() == [] + + def test_finds_review_cycle_files(self, tmp_path): + """Test finding review_cycle_*.json files.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + # Create matching files + (step_dir / "review_cycle_1.json").write_text("{}") + (step_dir / "review_cycle_2.json").write_text("{}") + (step_dir / "review_cycle_10.json").write_text("{}") + + # Create non-matching files + (step_dir / "other_file.json").write_text("{}") + (step_dir / "review_cycle.json").write_text("{}") # Missing underscore + (step_dir / "review_cycle_1.txt").write_text("{}") # Wrong extension + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + files = poller._get_review_cycle_files() + + assert len(files) == 3 + assert all(f.name.startswith("review_cycle_") for f in files) + assert all(f.suffix == ".json" for f in files) + + def test_files_are_sorted(self, tmp_path): + """Test that files are returned in sorted order.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + # Create files in non-sorted order + (step_dir / "review_cycle_3.json").write_text("{}") + (step_dir / "review_cycle_1.json").write_text("{}") + (step_dir / "review_cycle_2.json").write_text("{}") + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + files = poller._get_review_cycle_files() + + assert [f.name for f in files] == [ + "review_cycle_1.json", + "review_cycle_2.json", + "review_cycle_3.json", + ] + + +# --------------------------------------------------------------------------- +# ReviewCyclePoller._parse_json_with_retry tests +# --------------------------------------------------------------------------- + + +class TestParseJsonWithRetry: + """Tests for _parse_json_with_retry method.""" + + @pytest.mark.asyncio + async def test_parse_valid_json(self, tmp_path): + """Test parsing a valid JSON file.""" + json_file = tmp_path / "test.json" + json_file.write_text('{"cycle": 1, "max_cycles": 3, "verdict": "approved"}') + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + ) + result = await poller._parse_json_with_retry(json_file) + + assert result == {"cycle": 1, "max_cycles": 3, "verdict": "approved"} + + @pytest.mark.asyncio + async def test_parse_empty_file_returns_none(self, tmp_path): + """Test that empty file returns None after retries.""" + json_file = tmp_path / "empty.json" + json_file.write_text("") + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + ) + + with patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep: + result = await poller._parse_json_with_retry(json_file) + + assert result is None + # Should have retried MAX_JSON_PARSE_RETRIES - 1 times + assert mock_sleep.await_count == MAX_JSON_PARSE_RETRIES - 1 + + @pytest.mark.asyncio + async def test_parse_invalid_json_retries(self, tmp_path): + """Test that invalid JSON triggers retries.""" + json_file = tmp_path / "invalid.json" + json_file.write_text('{"incomplete": ') + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + ) + + with patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep: + result = await poller._parse_json_with_retry(json_file) + + assert result is None + assert mock_sleep.await_count == MAX_JSON_PARSE_RETRIES - 1 + + @pytest.mark.asyncio + async def test_parse_file_not_found_returns_none(self, tmp_path): + """Test that missing file returns None without retries.""" + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + ) + result = await poller._parse_json_with_retry(tmp_path / "nonexistent.json") + + assert result is None + + @pytest.mark.asyncio + async def test_parse_succeeds_on_retry(self, tmp_path): + """Test that parsing can succeed after initial failures.""" + json_file = tmp_path / "eventually_valid.json" + json_file.write_text('{"incomplete": ') + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + ) + + call_count = 0 + + async def mock_read(): + nonlocal call_count + call_count += 1 + if call_count < 3: + return '{"incomplete": ' + return '{"cycle": 1, "max_cycles": 3, "verdict": "approved"}' + + with patch("aiofiles.open") as mock_open: + mock_file = AsyncMock() + mock_file.read = mock_read + mock_file.__aenter__ = AsyncMock(return_value=mock_file) + mock_file.__aexit__ = AsyncMock(return_value=None) + mock_open.return_value = mock_file + + with patch("asyncio.sleep", new_callable=AsyncMock): + result = await poller._parse_json_with_retry(json_file) + + assert result == {"cycle": 1, "max_cycles": 3, "verdict": "approved"} + + +# --------------------------------------------------------------------------- +# ReviewCyclePoller.poll_once tests +# --------------------------------------------------------------------------- + + +class TestPollOnce: + """Tests for poll_once method.""" + + @pytest.mark.asyncio + async def test_poll_once_no_files(self, tmp_path): + """Test poll_once with no files.""" + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + result = await poller.poll_once() + assert result == [] + + @pytest.mark.asyncio + async def test_poll_once_new_files(self, tmp_path): + """Test poll_once detects new files.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "approved", + "feedback": "LGTM", + "skill": "code-review", + "elapsed_seconds": 5.5, + "timestamp": "2024-01-15T10:00:00Z", + } + (step_dir / "review_cycle_1.json").write_text(json.dumps(cycle_data)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + result = await poller.poll_once() + + assert len(result) == 1 + assert result[0].cycle == 1 + assert result[0].verdict == "approved" + assert result[0].feedback == "LGTM" + assert result[0].skill == "code-review" + + @pytest.mark.asyncio + async def test_poll_once_skips_already_processed(self, tmp_path): + """Test that poll_once skips already processed files.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "approved", + "feedback": "", + "skill": "", + "elapsed_seconds": 0, + "timestamp": "", + } + file_path = step_dir / "review_cycle_1.json" + file_path.write_text(json.dumps(cycle_data)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + + # First poll should detect the file + result1 = await poller.poll_once() + assert len(result1) == 1 + + # Second poll should skip it + result2 = await poller.poll_once() + assert len(result2) == 0 + + @pytest.mark.asyncio + async def test_poll_once_detects_new_after_first(self, tmp_path): + """Test that poll_once detects new files on subsequent polls.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + cycle1 = {"cycle": 1, "max_cycles": 3, "verdict": "rejected", "feedback": "Fix it"} + (step_dir / "review_cycle_1.json").write_text(json.dumps(cycle1)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + + result1 = await poller.poll_once() + assert len(result1) == 1 + assert result1[0].verdict == "rejected" + + # Add new file + cycle2 = {"cycle": 2, "max_cycles": 3, "verdict": "approved", "feedback": ""} + (step_dir / "review_cycle_2.json").write_text(json.dumps(cycle2)) + + result2 = await poller.poll_once() + assert len(result2) == 1 + assert result2[0].cycle == 2 + assert result2[0].verdict == "approved" + + @pytest.mark.asyncio + async def test_poll_once_skips_invalid_json(self, tmp_path): + """Test that invalid JSON files are skipped.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + # Valid file + valid_data = {"cycle": 1, "max_cycles": 3, "verdict": "approved"} + (step_dir / "review_cycle_1.json").write_text(json.dumps(valid_data)) + + # Invalid JSON file + (step_dir / "review_cycle_2.json").write_text("not valid json") + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + + with patch("asyncio.sleep", new_callable=AsyncMock): + result = await poller.poll_once() + + # Only valid file should be returned + assert len(result) == 1 + assert result[0].cycle == 1 + + @pytest.mark.asyncio + async def test_poll_once_skips_missing_required_fields(self, tmp_path): + """Test that files with missing required fields are skipped.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + # Missing 'verdict' + invalid_data = {"cycle": 1, "max_cycles": 3} + (step_dir / "review_cycle_1.json").write_text(json.dumps(invalid_data)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + result = await poller.poll_once() + + assert len(result) == 0 + + +# --------------------------------------------------------------------------- +# ReviewCyclePoller async iteration tests +# --------------------------------------------------------------------------- + + +class TestAsyncIteration: + """Tests for async iteration interface.""" + + @pytest.mark.asyncio + async def test_poll_starts_iteration(self, tmp_path, mock_settings): + """Test that poll() returns an iterator.""" + mock_settings.auto_review_poll_interval = 0.01 + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + settings=mock_settings, + ) + + iterator = poller.poll() + assert iterator is poller + assert poller._running is True + + @pytest.mark.asyncio + async def test_stop_ends_iteration(self, tmp_path, mock_settings): + """Test that stop() ends iteration.""" + mock_settings.auto_review_poll_interval = 0.01 + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + settings=mock_settings, + ) + + poller.poll() + poller.stop() + assert poller._running is False + + with pytest.raises(StopAsyncIteration): + await poller.__anext__() + + @pytest.mark.asyncio + async def test_iteration_polls_at_interval(self, tmp_path, mock_settings): + """Test that iteration waits for poll interval.""" + mock_settings.auto_review_poll_interval = 0.5 + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test", + settings=mock_settings, + ) + + poller.poll() + + with patch("asyncio.sleep", new_callable=AsyncMock) as mock_sleep: + result = await poller.__anext__() + + mock_sleep.assert_awaited_once_with(0.5) + assert result == [] + + poller.stop() + + @pytest.mark.asyncio + async def test_async_for_loop(self, tmp_path, mock_settings): + """Test using poller in async for loop.""" + mock_settings.auto_review_poll_interval = 0.01 + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + cycle_data = {"cycle": 1, "max_cycles": 3, "verdict": "approved"} + (step_dir / "review_cycle_1.json").write_text(json.dumps(cycle_data)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + settings=mock_settings, + ) + + results = [] + iteration_count = 0 + + async for new_cycles in poller.poll(): + iteration_count += 1 + results.extend(new_cycles) + if iteration_count >= 2: # Stop after 2 iterations + poller.stop() + + assert len(results) == 1 + assert results[0].verdict == "approved" + + +# --------------------------------------------------------------------------- +# ReviewCyclePoller reset and processed_count tests +# --------------------------------------------------------------------------- + + +class TestResetAndProcessedCount: + """Tests for reset and processed_count methods.""" + + @pytest.mark.asyncio + async def test_processed_count(self, tmp_path): + """Test processed_count tracks processed files.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + for i in range(3): + data = {"cycle": i + 1, "max_cycles": 5, "verdict": "approved"} + (step_dir / f"review_cycle_{i + 1}.json").write_text(json.dumps(data)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + + assert poller.processed_count == 0 + + await poller.poll_once() + assert poller.processed_count == 3 + + @pytest.mark.asyncio + async def test_reset_clears_processed(self, tmp_path): + """Test that reset clears processed files.""" + step_dir = tmp_path / ".forge" / "test_step" + step_dir.mkdir(parents=True) + + data = {"cycle": 1, "max_cycles": 3, "verdict": "approved"} + (step_dir / "review_cycle_1.json").write_text(json.dumps(data)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="test_step", + ) + + # First poll + result1 = await poller.poll_once() + assert len(result1) == 1 + assert poller.processed_count == 1 + + # Reset and poll again + poller.reset() + assert poller.processed_count == 0 + + result2 = await poller.poll_once() + assert len(result2) == 1 + + +# --------------------------------------------------------------------------- +# Integration-style tests +# --------------------------------------------------------------------------- + + +class TestPollerIntegration: + """Integration-style tests for the poller.""" + + @pytest.mark.asyncio + async def test_detects_files_within_time_limit(self, tmp_path, mock_settings): + """Test that new files are detected within acceptable time.""" + mock_settings.auto_review_poll_interval = 0.1 # Fast polling for test + step_dir = tmp_path / ".forge" / "implement_task" + step_dir.mkdir(parents=True) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="implement_task", + settings=mock_settings, + ) + + # Start polling + poller.poll() + + # Create file after starting poll + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "rejected", + "feedback": "Tests failing", + "skill": "local-code-review", + "elapsed_seconds": 15.2, + "timestamp": "2024-01-15T12:00:00Z", + } + (step_dir / "review_cycle_1.json").write_text(json.dumps(cycle_data)) + + # Poll and check detection + detected = [] + for _ in range(5): # Max 5 iterations + result = await poller.__anext__() + detected.extend(result) + if detected: + break + + poller.stop() + + assert len(detected) == 1 + assert detected[0].cycle == 1 + assert detected[0].verdict == "rejected" + assert detected[0].feedback == "Tests failing" + + @pytest.mark.asyncio + async def test_multiple_cycle_files_in_order(self, tmp_path): + """Test processing multiple review cycles in order.""" + step_dir = tmp_path / ".forge" / "implement_task" + step_dir.mkdir(parents=True) + + # Create multiple cycle files + for i in range(1, 4): + verdict = "approved" if i == 3 else "rejected" + data = { + "cycle": i, + "max_cycles": 3, + "verdict": verdict, + "feedback": f"Feedback for cycle {i}" if verdict == "rejected" else "", + "skill": "local-code-review", + "elapsed_seconds": i * 5.0, + "timestamp": f"2024-01-15T10:{i:02d}:00Z", + } + (step_dir / f"review_cycle_{i}.json").write_text(json.dumps(data)) + + poller = ReviewCyclePoller( + workspace_path=tmp_path, + step_name="implement_task", + ) + + result = await poller.poll_once() + + assert len(result) == 3 + assert result[0].cycle == 1 + assert result[0].verdict == "rejected" + assert result[1].cycle == 2 + assert result[1].verdict == "rejected" + assert result[2].cycle == 3 + assert result[2].verdict == "approved" + + +# --------------------------------------------------------------------------- +# Fixture for mock_settings +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mock_settings(): + """Create mock settings for tests.""" + return Settings( + redis_url="redis://localhost:6379/0", + jira_base_url="https://test.atlassian.net", + jira_api_token="test-token", + jira_user_email="test@example.com", + github_token="test-github-token", + anthropic_api_key="test-anthropic-key", + auto_review_poll_interval=5.0, + ) diff --git a/tests/unit/observability/test_review_recorder.py b/tests/unit/observability/test_review_recorder.py new file mode 100644 index 00000000..f536c0b7 --- /dev/null +++ b/tests/unit/observability/test_review_recorder.py @@ -0,0 +1,628 @@ +"""Unit tests for the ReviewCycleRecorder class.""" + +import json +import logging +from datetime import datetime, timezone +from pathlib import Path +from unittest.mock import patch + +import pytest + +from forge.observability.review_recorder import ( + ReviewCycleData, + ReviewCycleRecorder, +) + + +# --------------------------------------------------------------------------- +# ReviewCycleData tests +# --------------------------------------------------------------------------- + + +class TestReviewCycleData: + """Tests for ReviewCycleData dataclass.""" + + def test_all_fields(self): + """Test that all fields are correctly stored.""" + ts = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Looks good!", + skill="code-review", + elapsed_seconds=12.5, + timestamp=ts, + ) + assert data.cycle == 1 + assert data.max_cycles == 3 + assert data.verdict == "approved" + assert data.feedback == "Looks good!" + assert data.skill == "code-review" + assert data.elapsed_seconds == 12.5 + assert data.timestamp == ts + + def test_timestamp_is_datetime(self): + """Test that timestamp field is datetime type.""" + ts = datetime.now(timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="review", + elapsed_seconds=1.0, + timestamp=ts, + ) + assert isinstance(data.timestamp, datetime) + + def test_to_dict_returns_dict(self): + """Test to_dict returns a dictionary.""" + ts = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + data = ReviewCycleData( + cycle=2, + max_cycles=5, + verdict="rejected", + feedback="Needs fixes", + skill="local-code-review", + elapsed_seconds=8.3, + timestamp=ts, + ) + result = data.to_dict() + + assert isinstance(result, dict) + assert result["cycle"] == 2 + assert result["max_cycles"] == 5 + assert result["verdict"] == "rejected" + assert result["feedback"] == "Needs fixes" + assert result["skill"] == "local-code-review" + assert result["elapsed_seconds"] == 8.3 + + def test_to_dict_timestamp_is_iso_string(self): + """Test that to_dict converts timestamp to ISO 8601 string.""" + ts = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="", + elapsed_seconds=0.0, + timestamp=ts, + ) + result = data.to_dict() + + assert isinstance(result["timestamp"], str) + assert "2024-01-15" in result["timestamp"] + assert "10:30:00" in result["timestamp"] + + def test_to_json_returns_string(self): + """Test that to_json returns a JSON string.""" + ts = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Good work", + skill="review", + elapsed_seconds=5.0, + timestamp=ts, + ) + result = data.to_json() + + assert isinstance(result, str) + # Should be valid JSON + parsed = json.loads(result) + assert parsed["cycle"] == 1 + assert parsed["verdict"] == "approved" + + def test_to_json_pretty_printed(self): + """Test that to_json output is pretty-printed with indentation.""" + ts = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="", + elapsed_seconds=0.0, + timestamp=ts, + ) + result = data.to_json() + + # Should contain newlines (pretty printed) + assert "\n" in result + # Should contain indentation + assert " " in result + + +# --------------------------------------------------------------------------- +# ReviewCycleRecorder initialization tests +# --------------------------------------------------------------------------- + + +class TestReviewCycleRecorderInit: + """Tests for ReviewCycleRecorder initialization.""" + + def test_init_with_step_name_only(self): + """Test initialization with just step_name.""" + recorder = ReviewCycleRecorder(step_name="implement_task") + + assert recorder.step_name == "implement_task" + assert recorder.mode is None + assert recorder.recording_dir is None + + def test_init_with_log_mode(self): + """Test initialization with log mode.""" + recorder = ReviewCycleRecorder( + step_name="generate_prd", + mode="log", + ) + + assert recorder.step_name == "generate_prd" + assert recorder.mode == "log" + + def test_init_with_copy_mode_and_recording_dir(self): + """Test initialization with copy mode and recording directory.""" + recorder = ReviewCycleRecorder( + step_name="implement_task", + mode="copy", + recording_dir=Path("/recordings"), + ) + + assert recorder.step_name == "implement_task" + assert recorder.mode == "copy" + assert recorder.recording_dir == Path("/recordings") + + def test_init_copy_mode_requires_recording_dir(self): + """Test that copy mode raises ValueError without recording_dir.""" + with pytest.raises(ValueError, match="recording_dir is required"): + ReviewCycleRecorder( + step_name="test_step", + mode="copy", + ) + + def test_init_log_mode_does_not_require_recording_dir(self): + """Test that log mode works without recording_dir.""" + recorder = ReviewCycleRecorder( + step_name="test_step", + mode="log", + ) + assert recorder.recording_dir is None + + def test_step_dir_property(self): + """Test step_dir property returns correct path.""" + recorder = ReviewCycleRecorder( + step_name="implement_task", + mode="copy", + recording_dir=Path("/recordings"), + ) + + assert recorder.step_dir == Path("/recordings/implement_task") + + def test_step_dir_none_when_no_recording_dir(self): + """Test step_dir returns None when no recording_dir.""" + recorder = ReviewCycleRecorder( + step_name="test_step", + mode="log", + ) + assert recorder.step_dir is None + + +# --------------------------------------------------------------------------- +# ReviewCycleRecorder.record tests (log mode) +# --------------------------------------------------------------------------- + + +class TestReviewCycleRecorderRecord: + """Tests for ReviewCycleRecorder.record method.""" + + def test_record_disabled_mode_does_nothing(self, caplog): + """Test that record does nothing when mode is None.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode=None) + ts = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Good", + skill="review", + elapsed_seconds=1.0, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + # No log messages should be recorded + assert len(caplog.records) == 0 + + def test_record_log_mode_logs_at_info_level(self, caplog): + """Test that log mode records at INFO level.""" + recorder = ReviewCycleRecorder(step_name="implement_task", mode="log") + ts = datetime(2024, 1, 15, 10, 30, 0, tzinfo=timezone.utc) + data = ReviewCycleData( + cycle=2, + max_cycles=5, + verdict="rejected", + feedback="Fix the tests", + skill="code-review", + elapsed_seconds=12.5, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + assert len(caplog.records) == 1 + assert caplog.records[0].levelno == logging.INFO + + def test_record_log_mode_includes_cycle_info(self, caplog): + """Test that log output includes cycle number and max cycles.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode="log") + ts = datetime.now(timezone.utc) + data = ReviewCycleData( + cycle=3, + max_cycles=5, + verdict="approved", + feedback="", + skill="review", + elapsed_seconds=1.0, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + log_message = caplog.records[0].message + assert "3/5" in log_message or ("3" in log_message and "5" in log_message) + + def test_record_log_mode_includes_step_name(self, caplog): + """Test that log output includes step name.""" + recorder = ReviewCycleRecorder(step_name="implement_task", mode="log") + ts = datetime.now(timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="review", + elapsed_seconds=1.0, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + log_message = caplog.records[0].message + assert "implement_task" in log_message + + def test_record_log_mode_includes_verdict(self, caplog): + """Test that log output includes verdict.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode="log") + ts = datetime.now(timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="", + skill="review", + elapsed_seconds=1.0, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + log_message = caplog.records[0].message + assert "rejected" in log_message + + def test_record_log_mode_includes_skill(self, caplog): + """Test that log output includes skill name.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode="log") + ts = datetime.now(timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="local-code-review", + elapsed_seconds=1.0, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + log_message = caplog.records[0].message + assert "local-code-review" in log_message + + def test_record_log_mode_includes_elapsed_seconds(self, caplog): + """Test that log output includes elapsed seconds.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode="log") + ts = datetime.now(timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="review", + elapsed_seconds=15.75, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + log_message = caplog.records[0].message + assert "15.75" in log_message + + def test_record_log_mode_includes_feedback(self, caplog): + """Test that log output includes feedback.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode="log") + ts = datetime.now(timezone.utc) + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="Fix the bug in line 42", + skill="review", + elapsed_seconds=1.0, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + log_message = caplog.records[0].message + assert "Fix the bug in line 42" in log_message + + def test_record_log_mode_truncates_long_feedback(self, caplog): + """Test that log output truncates feedback longer than 100 chars.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode="log") + ts = datetime.now(timezone.utc) + long_feedback = "x" * 150 + data = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback=long_feedback, + skill="review", + elapsed_seconds=1.0, + timestamp=ts, + ) + + with caplog.at_level(logging.INFO): + recorder.record(data) + + log_message = caplog.records[0].message + # Should truncate and add ellipsis + assert "..." in log_message + # Should not contain the full feedback + assert long_feedback not in log_message + + +# --------------------------------------------------------------------------- +# ReviewCycleRecorder.record_file tests (copy mode) +# --------------------------------------------------------------------------- + + +class TestReviewCycleRecorderRecordFile: + """Tests for ReviewCycleRecorder.record_file method.""" + + def test_record_file_disabled_mode_returns_none(self, tmp_path): + """Test that record_file returns None when mode is None.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode=None) + + # Create a source file + source_file = tmp_path / "review_cycle_1.json" + source_file.write_text('{"cycle": 1}') + + result = recorder.record_file(source_file) + assert result is None + + def test_record_file_log_mode_returns_none(self, tmp_path): + """Test that record_file returns None when mode is log.""" + recorder = ReviewCycleRecorder(step_name="test_step", mode="log") + + # Create a source file + source_file = tmp_path / "review_cycle_1.json" + source_file.write_text('{"cycle": 1}') + + result = recorder.record_file(source_file) + assert result is None + + def test_record_file_copy_mode_creates_step_directory(self, tmp_path): + """Test that copy mode creates the step-specific subdirectory.""" + recording_dir = tmp_path / "recordings" + recorder = ReviewCycleRecorder( + step_name="implement_task", + mode="copy", + recording_dir=recording_dir, + ) + + # Create a source file + source_dir = tmp_path / "source" + source_dir.mkdir() + source_file = source_dir / "review_cycle_1.json" + source_file.write_text('{"cycle": 1}') + + recorder.record_file(source_file) + + step_dir = recording_dir / "implement_task" + assert step_dir.exists() + assert step_dir.is_dir() + + def test_record_file_copy_mode_copies_file(self, tmp_path): + """Test that copy mode copies the file to recording directory.""" + recording_dir = tmp_path / "recordings" + recorder = ReviewCycleRecorder( + step_name="implement_task", + mode="copy", + recording_dir=recording_dir, + ) + + # Create a source file with content + source_dir = tmp_path / "source" + source_dir.mkdir() + source_file = source_dir / "review_cycle_1.json" + source_file.write_text('{"cycle": 1, "verdict": "approved"}') + + result = recorder.record_file(source_file) + + expected_dest = recording_dir / "implement_task" / "review_cycle_1.json" + assert result == expected_dest + assert expected_dest.exists() + assert expected_dest.read_text() == '{"cycle": 1, "verdict": "approved"}' + + def test_record_file_copy_mode_preserves_filename(self, tmp_path): + """Test that copy mode preserves the original filename.""" + recording_dir = tmp_path / "recordings" + recorder = ReviewCycleRecorder( + step_name="test_step", + mode="copy", + recording_dir=recording_dir, + ) + + source_dir = tmp_path / "source" + source_dir.mkdir() + source_file = source_dir / "review_cycle_5.json" + source_file.write_text('{"cycle": 5}') + + result = recorder.record_file(source_file) + + assert result is not None + assert result.name == "review_cycle_5.json" + + def test_record_file_copy_mode_handles_multiple_files(self, tmp_path): + """Test that multiple files can be copied.""" + recording_dir = tmp_path / "recordings" + recorder = ReviewCycleRecorder( + step_name="test_step", + mode="copy", + recording_dir=recording_dir, + ) + + source_dir = tmp_path / "source" + source_dir.mkdir() + + # Copy multiple files + for i in range(1, 4): + source_file = source_dir / f"review_cycle_{i}.json" + source_file.write_text(f'{{"cycle": {i}}}') + recorder.record_file(source_file) + + step_dir = recording_dir / "test_step" + assert (step_dir / "review_cycle_1.json").exists() + assert (step_dir / "review_cycle_2.json").exists() + assert (step_dir / "review_cycle_3.json").exists() + + def test_record_file_nonexistent_source_returns_none(self, tmp_path, caplog): + """Test that nonexistent source file returns None and logs warning.""" + recording_dir = tmp_path / "recordings" + recorder = ReviewCycleRecorder( + step_name="test_step", + mode="copy", + recording_dir=recording_dir, + ) + + nonexistent_file = tmp_path / "missing.json" + + with caplog.at_level(logging.WARNING): + result = recorder.record_file(nonexistent_file) + + assert result is None + assert any("does not exist" in record.message for record in caplog.records) + + def test_record_file_returns_path_on_success(self, tmp_path): + """Test that successful copy returns the destination path.""" + recording_dir = tmp_path / "recordings" + recorder = ReviewCycleRecorder( + step_name="my_step", + mode="copy", + recording_dir=recording_dir, + ) + + source_dir = tmp_path / "source" + source_dir.mkdir() + source_file = source_dir / "review_cycle_1.json" + source_file.write_text('{"cycle": 1}') + + result = recorder.record_file(source_file) + + assert result is not None + assert result == recording_dir / "my_step" / "review_cycle_1.json" + + def test_record_file_directory_creation_error_returns_none(self, tmp_path, caplog): + """Test that directory creation error returns None.""" + # Create a file where the directory should be + recording_dir = tmp_path / "recordings" + recording_dir.mkdir() + blocking_file = recording_dir / "test_step" + blocking_file.write_text("blocking file") + + recorder = ReviewCycleRecorder( + step_name="test_step", + mode="copy", + recording_dir=recording_dir, + ) + + source_dir = tmp_path / "source" + source_dir.mkdir() + source_file = source_dir / "review_cycle_1.json" + source_file.write_text('{"cycle": 1}') + + with caplog.at_level(logging.ERROR): + result = recorder.record_file(source_file) + + assert result is None + assert any("Failed to create directory" in record.message for record in caplog.records) + + +# --------------------------------------------------------------------------- +# Settings configuration tests +# --------------------------------------------------------------------------- + + +class TestSettingsConfiguration: + """Tests for AUTO_REVIEW_RECORD_POLLED_FILES configuration.""" + + def test_setting_default_is_none(self): + """Test that the default value is None.""" + from forge.config import Settings + + with patch.dict("os.environ", {}, clear=True): + # Create settings with minimal required fields + settings = Settings( + jira_base_url="https://example.atlassian.net", + jira_api_token="token", + jira_user_email="user@example.com", + github_token="ghtoken", + ) + assert settings.auto_review_record_polled_files is None + + def test_setting_accepts_log_value(self): + """Test that 'log' value is accepted.""" + from forge.config import Settings + + with patch.dict("os.environ", {"AUTO_REVIEW_RECORD_POLLED_FILES": "log"}, clear=False): + settings = Settings( + jira_base_url="https://example.atlassian.net", + jira_api_token="token", + jira_user_email="user@example.com", + github_token="ghtoken", + ) + assert settings.auto_review_record_polled_files == "log" + + def test_setting_accepts_copy_value(self): + """Test that 'copy' value is accepted.""" + from forge.config import Settings + + with patch.dict("os.environ", {"AUTO_REVIEW_RECORD_POLLED_FILES": "copy"}, clear=False): + settings = Settings( + jira_base_url="https://example.atlassian.net", + jira_api_token="token", + jira_user_email="user@example.com", + github_token="ghtoken", + ) + assert settings.auto_review_record_polled_files == "copy" diff --git a/tests/unit/sandbox/test_runner_review_polling.py b/tests/unit/sandbox/test_runner_review_polling.py new file mode 100644 index 00000000..ed5f4ec3 --- /dev/null +++ b/tests/unit/sandbox/test_runner_review_polling.py @@ -0,0 +1,1203 @@ +"""Unit tests for review polling integration in ContainerRunner.""" + +import asyncio +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from forge.observability import ReviewCycleData +from forge.sandbox.runner import ( + ContainerResult, + ContainerRunner, + _poller_to_recorder_cycle, +) + +# --------------------------------------------------------------------------- +# Helper to create a runner instance without __init__ side effects +# --------------------------------------------------------------------------- + + +def _runner_without_init() -> ContainerRunner: + """Create a ContainerRunner instance without running __init__.""" + return object.__new__(ContainerRunner) + + +# --------------------------------------------------------------------------- +# ContainerResult tests +# --------------------------------------------------------------------------- + + +class TestContainerResultReviewCycles: + """Tests for ContainerResult review_cycles field.""" + + def test_default_review_cycles_empty(self): + """Test that review_cycles defaults to empty list.""" + result = ContainerResult( + success=True, + exit_code=0, + stdout="", + stderr="", + ) + assert result.review_cycles == [] + + def test_review_cycles_with_data(self): + """Test that review_cycles can store ReviewCycleData.""" + cycles = [ + ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="Fix the bug", + skill="local-code-review", + elapsed_seconds=5.5, + timestamp="2024-01-15T10:30:00Z", + ), + ReviewCycleData( + cycle=2, + max_cycles=3, + verdict="approved", + feedback="", + skill="local-code-review", + elapsed_seconds=3.2, + timestamp="2024-01-15T10:35:00Z", + ), + ] + result = ContainerResult( + success=True, + exit_code=0, + stdout="output", + stderr="", + review_cycles=cycles, + ) + assert len(result.review_cycles) == 2 + assert result.review_cycles[0].verdict == "rejected" + assert result.review_cycles[1].verdict == "approved" + + +# --------------------------------------------------------------------------- +# _poller_to_recorder_cycle conversion tests +# --------------------------------------------------------------------------- + + +class TestPollerToRecorderCycle: + """Tests for the _poller_to_recorder_cycle helper function.""" + + def test_converts_all_fields(self): + """Test that all fields are converted correctly.""" + poller_cycle = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="Looks good", + skill="implement-task", + elapsed_seconds=12.5, + timestamp="2024-01-15T10:30:00Z", + ) + recorder_cycle = _poller_to_recorder_cycle(poller_cycle, "implement_task") + + assert recorder_cycle.cycle == 1 + assert recorder_cycle.max_cycles == 3 + assert recorder_cycle.verdict == "approved" + assert recorder_cycle.feedback == "Looks good" + assert recorder_cycle.skill == "implement-task" + assert recorder_cycle.elapsed_seconds == 12.5 + # Timestamp is converted to datetime + assert recorder_cycle.timestamp.year == 2024 + assert recorder_cycle.timestamp.month == 1 + assert recorder_cycle.timestamp.day == 15 + + def test_handles_invalid_timestamp(self): + """Test that invalid timestamp falls back to now.""" + poller_cycle = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="", + skill="skill", + elapsed_seconds=1.0, + timestamp="not-a-valid-timestamp", + ) + recorder_cycle = _poller_to_recorder_cycle(poller_cycle, "step") + + # Should have a valid datetime (fallback to now) + assert recorder_cycle.timestamp is not None + + def test_handles_empty_timestamp(self): + """Test that empty timestamp falls back to now.""" + poller_cycle = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="skill", + elapsed_seconds=1.0, + timestamp="", + ) + recorder_cycle = _poller_to_recorder_cycle(poller_cycle, "step") + + # Should have a valid datetime (fallback to now) + assert recorder_cycle.timestamp is not None + + +# --------------------------------------------------------------------------- +# ContainerRunner.run() with step_name tests +# --------------------------------------------------------------------------- + + +class TestRunWithStepName: + """Tests for ContainerRunner.run() with step_name parameter.""" + + @pytest.mark.asyncio + async def test_run_accepts_step_name_parameter(self, tmp_path: Path): + """Test that run() accepts the step_name parameter.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 1.0 + runner.settings.auto_review_record_polled_files = None + + # Create a mock process that completes immediately + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + ): + result = await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="implement_task", + ) + + assert result.success is True + assert result.review_cycles == [] + + @pytest.mark.asyncio + async def test_run_without_step_name_disables_polling(self, tmp_path: Path): + """Test that run() without step_name disables polling.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch("forge.sandbox.runner.ReviewCyclePoller") as mock_poller_class, + ): + result = await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + # No step_name provided + ) + + # Poller should not be created when step_name is None + mock_poller_class.assert_not_called() + assert result.review_cycles == [] + + +# --------------------------------------------------------------------------- +# Background polling task tests +# --------------------------------------------------------------------------- + + +class TestBackgroundPollingTask: + """Tests for the background polling task during container execution.""" + + @pytest.mark.asyncio + async def test_polling_task_started_with_step_name(self, tmp_path: Path): + """Test that polling task is started when step_name is provided.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 1.0 + runner.settings.auto_review_record_polled_files = None + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + # Track if polling was started + poller_created = False + + def create_poller(*_args, **_kwargs): + nonlocal poller_created + poller_created = True + mock_poller = MagicMock() + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[]) + mock_poller.stop = MagicMock() + mock_poller.step_name = "implement_task" + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder"), + ): + await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="implement_task", + ) + + assert poller_created is True + + @pytest.mark.asyncio + async def test_polling_task_cancelled_on_container_exit(self, tmp_path: Path): + """Test that polling task is cancelled when container exits.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 1.0 + runner.settings.auto_review_record_polled_files = None + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + stop_called = False + + def create_poller(*_args, **_kwargs): + mock_poller = MagicMock() + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[]) + + def stop(): + nonlocal stop_called + stop_called = True + + mock_poller.stop = stop + mock_poller.step_name = "implement_task" + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder"), + ): + await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="implement_task", + ) + + assert stop_called is True + + +# --------------------------------------------------------------------------- +# Review cycle collection tests +# --------------------------------------------------------------------------- + + +class TestReviewCycleCollection: + """Tests for collecting review cycles into ContainerResult.""" + + @pytest.mark.asyncio + async def test_detected_cycles_added_to_result(self, tmp_path: Path): + """Test that detected review cycles are added to the result.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 0.1 + runner.settings.auto_review_record_polled_files = "log" + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + # Simulate a review cycle file being detected during final poll + detected_cycle = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="LGTM", + skill="local-code-review", + elapsed_seconds=5.0, + timestamp="2024-01-15T10:30:00Z", + ) + + def create_poller(*_args, **_kwargs): + mock_poller = MagicMock() + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[detected_cycle]) + mock_poller.stop = MagicMock() + mock_poller.step_name = "implement_task" + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder"), + patch("forge.sandbox.runner.record_review_cycle"), + patch("forge.sandbox.runner.record_review_verdict"), + patch("forge.sandbox.runner.observe_review_duration"), + ): + result = await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="implement_task", + ) + + assert len(result.review_cycles) == 1 + assert result.review_cycles[0].verdict == "approved" + assert result.review_cycles[0].skill == "local-code-review" + + @pytest.mark.asyncio + async def test_cycles_collected_even_on_timeout(self, tmp_path: Path): + """Test that cycles are collected even when container times out.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 1 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 0.1 + runner.settings.auto_review_record_polled_files = None + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(side_effect=TimeoutError()) + mock_process.returncode = None + + # Pre-collected cycle (simulating one detected before timeout) + timeout_cycle = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="rejected", + feedback="Partial review", + skill="local-code-review", + elapsed_seconds=2.0, + timestamp="2024-01-15T10:30:00Z", + ) + + cycles_collected = [] + + def create_poller(*_args, **_kwargs): + mock_poller = MagicMock() + + async def poll_iter(): + # Yield one cycle before timeout + cycles_collected.append(timeout_cycle) + yield [timeout_cycle] + # Then wait forever (simulating ongoing polling) + await asyncio.sleep(1000) + + mock_poller.poll = MagicMock(return_value=poll_iter()) + mock_poller.poll_once = AsyncMock(return_value=[]) + mock_poller.stop = MagicMock() + mock_poller.step_name = "implement_task" + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch.object(runner, "_stop_timed_out_container", new=AsyncMock()), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder"), + patch("forge.sandbox.runner.record_review_cycle"), + patch("forge.sandbox.runner.record_review_verdict"), + patch("forge.sandbox.runner.observe_review_duration"), + ): + result = await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="implement_task", + ) + + # Result should indicate failure + assert result.success is False + assert "Timeout" in (result.error_message or "") + # But cycles collected should still be present + assert len(result.review_cycles) >= 0 # May have collected the cycle + + +# --------------------------------------------------------------------------- +# Metrics recording tests +# --------------------------------------------------------------------------- + + +class TestMetricsRecording: + """Tests for Prometheus metrics recording during polling.""" + + @pytest.mark.asyncio + async def test_metrics_recorded_for_detected_cycles(self, tmp_path: Path): + """Test that metrics are recorded for each detected cycle.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 0.1 + runner.settings.auto_review_record_polled_files = None + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + detected_cycle = ReviewCycleData( + cycle=1, + max_cycles=3, + verdict="approved", + feedback="", + skill="implement-task", + elapsed_seconds=10.5, + timestamp="2024-01-15T10:30:00Z", + ) + + def create_poller(*_args, **_kwargs): + mock_poller = MagicMock() + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[detected_cycle]) + mock_poller.stop = MagicMock() + mock_poller.step_name = "implement_task" + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder"), + patch("forge.sandbox.runner.record_review_cycle") as mock_cycle, + patch("forge.sandbox.runner.record_review_verdict") as mock_verdict, + patch("forge.sandbox.runner.observe_review_duration") as mock_duration, + ): + await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="implement_task", + ) + + # Verify metrics were recorded + mock_cycle.assert_called_with("implement-task", "implement_task") + mock_verdict.assert_called_with("implement-task", "implement_task", "approved") + mock_duration.assert_called_with("implement-task", "implement_task", 10.5) + + +# --------------------------------------------------------------------------- +# Step name path organization tests +# --------------------------------------------------------------------------- + + +class TestStepNamePathOrganization: + """Tests for step-name based path organization.""" + + @pytest.mark.asyncio + async def test_step_name_passed_to_poller(self, tmp_path: Path): + """Test that step_name is passed to the poller correctly.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 1.0 + runner.settings.auto_review_record_polled_files = None + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + captured_step_name = None + + def create_poller(workspace_path=None, step_name=None, settings=None): + nonlocal captured_step_name + captured_step_name = step_name + mock_poller = MagicMock() + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[]) + mock_poller.stop = MagicMock() + mock_poller.step_name = step_name + # Suppress unused warnings + _ = workspace_path, settings + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder"), + ): + await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="local_review", + ) + + assert captured_step_name == "local_review" + + @pytest.mark.asyncio + async def test_step_name_passed_to_recorder(self, tmp_path: Path): + """Test that step_name is passed to the recorder correctly.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 1.0 + runner.settings.auto_review_record_polled_files = "log" + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + captured_recorder_step_name = None + + def create_poller(**kwargs): + mock_poller = MagicMock() + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[]) + mock_poller.stop = MagicMock() + mock_poller.step_name = kwargs.get("step_name", "") + return mock_poller + + def create_recorder(step_name=None, mode=None, recording_dir=None): + nonlocal captured_recorder_step_name + captured_recorder_step_name = step_name + mock_recorder = MagicMock() + mock_recorder.record = MagicMock() + mock_recorder.record_file = MagicMock() + # Suppress unused warnings + _ = mode, recording_dir + return mock_recorder + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch( + "forge.sandbox.runner.ReviewCycleRecorder", + side_effect=create_recorder, + ), + ): + await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="fix_ci", + ) + + assert captured_recorder_step_name == "fix_ci" + + +# --------------------------------------------------------------------------- +# Helper classes for async iteration mocking +# --------------------------------------------------------------------------- + + +class AsyncIteratorMock: + """Mock for async iterators that yields a list of items once and then stops.""" + + def __init__(self, items: list): + self.items = items + self.index = 0 + + def __aiter__(self): + return self + + async def __anext__(self): + if self.index < len(self.items): + item = self.items[self.index] + self.index += 1 + return item + raise StopAsyncIteration + + +# --------------------------------------------------------------------------- +# _sweep_review_cycles() tests +# --------------------------------------------------------------------------- + + +class TestSweepReviewCycles: + """Tests for the _sweep_review_cycles() post-execution sweep.""" + + def test_sweep_finds_missed_file(self, tmp_path: Path, caplog): + """Test that sweep catches files missed during async polling.""" + import json + import logging + + runner = _runner_without_init() + + # Create a review cycle file that was NOT processed by the poller + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "approved", + "feedback": "Looks good", + "skill": "local-review", + "elapsed_seconds": 5.5, + "timestamp": "2024-01-15T10:30:00Z", + } + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text(json.dumps(cycle_data)) + + # Empty processed files set - nothing was caught during polling + processed_files: set[str] = set() + collected_cycles: list[ReviewCycleData] = [] + + # Mock recorder + mock_recorder = MagicMock() + + with caplog.at_level(logging.WARNING): + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Should have found the missed file + assert len(collected_cycles) == 1 + assert collected_cycles[0].cycle == 1 + assert collected_cycles[0].verdict == "approved" + assert collected_cycles[0].skill == "local-review" + + # Should log a warning about missed files + assert "Sweep caught 1 review cycle file(s) missed" in caplog.text + assert step_name in caplog.text + + def test_sweep_deduplicates_against_processed_files(self, tmp_path: Path): + """Test that sweep skips files already processed by async poller.""" + import json + + runner = _runner_without_init() + + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + # Create two cycle files + for i in [1, 2]: + cycle_data = { + "cycle": i, + "max_cycles": 3, + "verdict": "approved", + "feedback": f"Review {i}", + "skill": "local-review", + "elapsed_seconds": float(i), + "timestamp": f"2024-01-15T10:3{i}:00Z", + } + cycle_file = cycle_dir / f"review_cycle_{i}.json" + cycle_file.write_text(json.dumps(cycle_data)) + + # Simulate that cycle_1 was already processed + cycle_1_path = str(cycle_dir / "review_cycle_1.json") + processed_files: set[str] = {cycle_1_path} + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Should only find cycle_2 (cycle_1 was already processed) + assert len(collected_cycles) == 1 + assert collected_cycles[0].cycle == 2 + + def test_sweep_no_warning_when_no_missed_files(self, tmp_path: Path, caplog): + """Test that no warning is logged when all files were already processed.""" + import json + import logging + + runner = _runner_without_init() + + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "approved", + "feedback": "", + "skill": "local-review", + "elapsed_seconds": 5.0, + "timestamp": "2024-01-15T10:30:00Z", + } + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text(json.dumps(cycle_data)) + + # File was already processed + processed_files: set[str] = {str(cycle_file)} + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + with caplog.at_level(logging.WARNING): + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Should not find any new files + assert len(collected_cycles) == 0 + + # Should not log warning about missed files + assert "Sweep caught" not in caplog.text + + def test_sweep_handles_nonexistent_directory(self, tmp_path: Path): + """Test that sweep handles missing .forge/{step} directory gracefully.""" + runner = _runner_without_init() + + # Don't create the directory + processed_files: set[str] = set() + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + # Should not raise + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name="nonexistent_step", + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + assert len(collected_cycles) == 0 + + def test_sweep_handles_invalid_json(self, tmp_path: Path, caplog): + """Test that sweep handles invalid JSON files gracefully.""" + import logging + + runner = _runner_without_init() + + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + # Create an invalid JSON file + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text("not valid json {") + + processed_files: set[str] = set() + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + with caplog.at_level(logging.WARNING): + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Should not have collected any cycles + assert len(collected_cycles) == 0 + + # Should log warning about parse failure + assert "Failed to parse review cycle file" in caplog.text + + def test_sweep_handles_missing_required_fields(self, tmp_path: Path, caplog): + """Test that sweep handles JSON with missing required fields.""" + import json + import logging + + runner = _runner_without_init() + + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + # Create JSON missing required fields + cycle_data = {"verdict": "approved", "feedback": "Missing fields"} + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text(json.dumps(cycle_data)) + + processed_files: set[str] = set() + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + with caplog.at_level(logging.WARNING): + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Should not have collected any cycles + assert len(collected_cycles) == 0 + + # Should log warning about invalid data + assert "Invalid review cycle data" in caplog.text + + def test_sweep_handles_empty_file(self, tmp_path: Path, caplog): + """Test that sweep handles empty files gracefully.""" + import logging + + runner = _runner_without_init() + + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + # Create an empty file + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text("") + + processed_files: set[str] = set() + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + with caplog.at_level(logging.WARNING): + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Should not have collected any cycles + assert len(collected_cycles) == 0 + + # Should log warning about empty file + assert "Empty review cycle file" in caplog.text + + def test_sweep_emits_metrics(self, tmp_path: Path): + """Test that sweep emits Prometheus metrics for caught files.""" + import json + + runner = _runner_without_init() + + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "rejected", + "feedback": "Needs work", + "skill": "local-review", + "elapsed_seconds": 8.5, + "timestamp": "2024-01-15T10:30:00Z", + } + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text(json.dumps(cycle_data)) + + processed_files: set[str] = set() + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + with ( + patch("forge.sandbox.runner.record_review_cycle") as mock_cycle, + patch("forge.sandbox.runner.record_review_verdict") as mock_verdict, + patch("forge.sandbox.runner.observe_review_duration") as mock_duration, + ): + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Verify metrics were emitted + mock_cycle.assert_called_once_with("local-review", step_name) + mock_verdict.assert_called_once_with("local-review", step_name, "rejected") + mock_duration.assert_called_once_with("local-review", step_name, 8.5) + + def test_sweep_records_via_recorder(self, tmp_path: Path): + """Test that sweep uses recorder to record and copy files.""" + import json + + runner = _runner_without_init() + + step_name = "implement_task" + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "approved", + "feedback": "", + "skill": "local-review", + "elapsed_seconds": 5.0, + "timestamp": "2024-01-15T10:30:00Z", + } + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text(json.dumps(cycle_data)) + + processed_files: set[str] = set() + collected_cycles: list[ReviewCycleData] = [] + + mock_recorder = MagicMock() + + runner._sweep_review_cycles( + workspace_path=tmp_path, + step_name=step_name, + processed_files=processed_files, + collected_cycles=collected_cycles, + recorder=mock_recorder, + ) + + # Verify recorder methods were called + mock_recorder.record.assert_called_once() + mock_recorder.record_file.assert_called_once_with(cycle_file) + + +class TestSweepIntegrationWithRun: + """Tests for sweep integration with ContainerRunner.run().""" + + @pytest.mark.asyncio + async def test_fast_exit_files_caught_by_sweep(self, tmp_path: Path, caplog): + """Test that files written just before container exit are caught by sweep.""" + import json + import logging + + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 1.0 + runner.settings.auto_review_record_polled_files = "log" + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + step_name = "implement_task" + + # Create a file that simulates being written just before container exit + # (not caught by async polling) + cycle_dir = tmp_path / ".forge" / step_name + cycle_dir.mkdir(parents=True) + cycle_data = { + "cycle": 1, + "max_cycles": 3, + "verdict": "approved", + "feedback": "Fast exit", + "skill": "fast-review", + "elapsed_seconds": 1.0, + "timestamp": "2024-01-15T10:30:00Z", + } + cycle_file = cycle_dir / "review_cycle_1.json" + cycle_file.write_text(json.dumps(cycle_data)) + + def create_poller(**kwargs): + mock_poller = MagicMock() + # Poller returns no files during polling and poll_once + # (simulating fast exit where files are written after polling stops) + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[]) + mock_poller.stop = MagicMock() + mock_poller.step_name = kwargs.get("step_name", "") + # Empty processed files - nothing was caught during async polling + mock_poller._processed_files = set() + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder") as mock_recorder_class, + patch("forge.sandbox.runner.record_review_cycle"), + patch("forge.sandbox.runner.record_review_verdict"), + patch("forge.sandbox.runner.observe_review_duration"), + caplog.at_level(logging.WARNING), + ): + mock_recorder = MagicMock() + mock_recorder_class.return_value = mock_recorder + + result = await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name=step_name, + ) + + # The sweep should have caught the file + assert len(result.review_cycles) == 1 + assert result.review_cycles[0].verdict == "approved" + assert result.review_cycles[0].skill == "fast-review" + + # Should log warning about missed files + assert "Sweep caught 1 review cycle file(s) missed" in caplog.text + + @pytest.mark.asyncio + async def test_sweep_runs_after_async_polling(self, tmp_path: Path): + """Test that sweep is called after container exits.""" + runner = _runner_without_init() + runner.settings = MagicMock() + runner.settings.container_image = "test:latest" + runner.settings.container_timeout = 60 + runner.settings.container_memory = "1g" + runner.settings.container_cpus = "1" + runner.settings.container_keep = False + runner.settings.auto_review_poll_interval = 1.0 + runner.settings.auto_review_record_polled_files = None + + mock_process = MagicMock() + mock_process.communicate = AsyncMock(return_value=(b"output", b"")) + mock_process.returncode = 0 + + sweep_called = False + original_sweep = runner._sweep_review_cycles + + def mock_sweep(*args, **kwargs): + nonlocal sweep_called + sweep_called = True + return original_sweep(*args, **kwargs) + + def create_poller(**kwargs): + mock_poller = MagicMock() + mock_poller.poll = MagicMock(return_value=AsyncIteratorMock([])) + mock_poller.poll_once = AsyncMock(return_value=[]) + mock_poller.stop = MagicMock() + mock_poller.step_name = kwargs.get("step_name", "") + mock_poller._processed_files = set() + return mock_poller + + with ( + patch.object(runner, "_build_container_name", return_value="test-container"), + patch.object(runner, "_build_podman_command", return_value=["podman", "run"]), + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + return_value=mock_process, + ), + patch( + "forge.sandbox.runner.ReviewCyclePoller", + side_effect=create_poller, + ), + patch("forge.sandbox.runner.ReviewCycleRecorder"), + patch.object(runner, "_sweep_review_cycles", side_effect=mock_sweep), + ): + await runner.run( + workspace_path=tmp_path, + task_summary="Test task", + task_description="Test description", + step_name="implement_task", + ) + + assert sweep_called is True diff --git a/tests/unit/workflow/nodes/test_implementation.py b/tests/unit/workflow/nodes/test_implementation.py index 7c673597..07a2812f 100644 --- a/tests/unit/workflow/nodes/test_implementation.py +++ b/tests/unit/workflow/nodes/test_implementation.py @@ -56,7 +56,6 @@ def _make_successful_runner(): class TestImplementTaskStartedComment: - @pytest.mark.asyncio async def test_posts_comment_on_task_ticket_before_container(self): """A comment is posted on the task ticket (not parent) when implementation starts.""" @@ -184,7 +183,6 @@ async def test_passes_trace_context_to_container_runner(self): class TestImplementationNodeRouting: - @pytest.mark.asyncio async def test_feature_missing_workspace_uses_feature_implementation_node(self): """Feature implementation failures must resume at implement_task.""" @@ -272,3 +270,33 @@ async def test_bug_container_failure_keeps_bug_implementation_node(self): assert result["current_node"] == "implement_bug_fix" assert result["last_error"] == "container failed" assert result["retry_count"] == 1 + + +class TestImplementTaskStepName: + """Tests for step_name propagation to ContainerRunner.""" + + @pytest.mark.asyncio + async def test_passes_step_name_implement_task_to_runner(self): + """ContainerRunner.run() is called with step_name='implement_task'.""" + from forge.workflow.nodes.implementation import implement_task + + mock_jira = _make_mock_jira() + runner = _make_successful_runner() + + with ( + patch( + "forge.workflow.nodes.implementation.JiraClient", + return_value=mock_jira, + ), + patch( + "forge.workflow.nodes.implementation.ContainerRunner", + return_value=runner, + ), + patch("forge.workflow.nodes.implementation.get_settings"), + ): + await implement_task(_make_state()) + + # Verify step_name was passed + runner.run.assert_called_once() + call_kwargs = runner.run.call_args.kwargs + assert call_kwargs.get("step_name") == "implement_task" diff --git a/tests/unit/workflow/nodes/test_local_reviewer.py b/tests/unit/workflow/nodes/test_local_reviewer.py index 7a78a0ff..56092467 100644 --- a/tests/unit/workflow/nodes/test_local_reviewer.py +++ b/tests/unit/workflow/nodes/test_local_reviewer.py @@ -366,3 +366,69 @@ async def test_feature_no_qualitative_fields_set(self, base_feature_review_state "qualitative_retry_count" not in result or result.get("qualitative_retry_count") is None ) assert "local_review_verdict" not in result or result.get("local_review_verdict") is None + + +class TestLocalReviewStepName: + """Tests for step_name propagation to ContainerRunner.""" + + @pytest.mark.asyncio + async def test_bug_review_passes_step_name_local_review(self, base_bug_review_state): + """Bug qualitative review passes step_name='local_review' to ContainerRunner.""" + captured_kwargs = [] + + class _CapturingRunner: + async def run(self, **kwargs): + captured_kwargs.append(kwargs) + result = MagicMock() + result.success = True + result.exit_code = 0 + result.stdout = "verdict: adequate\n\nfeedback: Good." + result.stderr = "" + return result + + mock_git = _make_mock_git() + mock_workspace = MagicMock() + + with ( + patch( + "forge.workflow.nodes.local_reviewer.ContainerRunner", + return_value=_CapturingRunner(), + ), + patch("forge.workflow.nodes.local_reviewer.GitOperations", return_value=mock_git), + patch("forge.workflow.nodes.local_reviewer.Workspace", return_value=mock_workspace), + ): + await local_review_changes(base_bug_review_state) + + assert captured_kwargs, "runner.run was not called" + assert captured_kwargs[0].get("step_name") == "local_review" + + @pytest.mark.asyncio + async def test_feature_review_passes_step_name_local_review(self, base_feature_review_state): + """Feature review passes step_name='local_review' to ContainerRunner.""" + captured_kwargs = [] + + class _CapturingRunner: + async def run(self, **kwargs): + captured_kwargs.append(kwargs) + result = MagicMock() + result.success = True + result.exit_code = 0 + result.stdout = "No issues found." + result.stderr = "" + return result + + mock_git = _make_mock_git() + mock_workspace = MagicMock() + + with ( + patch( + "forge.workflow.nodes.local_reviewer.ContainerRunner", + return_value=_CapturingRunner(), + ), + patch("forge.workflow.nodes.local_reviewer.GitOperations", return_value=mock_git), + patch("forge.workflow.nodes.local_reviewer.Workspace", return_value=mock_workspace), + ): + await local_review_changes(base_feature_review_state) + + assert captured_kwargs, "runner.run was not called" + assert captured_kwargs[0].get("step_name") == "local_review"