Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
517 changes: 517 additions & 0 deletions docs/squash-merge-recovery-plan.md

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions src/whygraph/analyze/rationale.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ class CommitEvidence:

* ``"blame"`` — line-level attribution from the target's current
range (highest-precision signal).
* ``"pr-origin"`` — an original feature-branch commit recovered
from a squash-merged PR: when the queried lines blame to a
squash commit, they are re-blamed at the PR's ``head_sha`` so
each line maps back to the commit that actually authored it.
* ``"blame-walked"`` — surfaced only after blame walked past a
refactor-heavy commit. Still line-level, but one or more boring
commits were skipped to reach this author.
Expand Down
100 changes: 94 additions & 6 deletions src/whygraph/analyze/rationale_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import json
from collections.abc import Sequence
from dataclasses import dataclass

from whygraph.core.config import RationaleConfig
from whygraph.db.models import Commit, Issue, PullRequest
Expand Down Expand Up @@ -66,6 +67,53 @@ def _labels_suffix(raw: str) -> str:
return " [" + ", ".join(str(label) for label in labels) + "]"


def _json_list(raw: str | None) -> list:
"""Decode a JSON-encoded list column; empty list on anything malformed.

Mirrors :func:`whygraph.mcp.evidence._json_list` — kept local so the
formatters do not depend on the MCP layer.
"""
if not raw:
return []
try:
parsed = json.loads(raw)
except (TypeError, json.JSONDecodeError):
return []
return parsed if isinstance(parsed, list) else []


@dataclass(frozen=True, slots=True)
class _PrRenderCaps:
"""Size caps for rendering a PR block into the rationale prompt.

Passed explicitly into the module-level formatters (rather than read
from a global :func:`get_config`) so they stay pure and unit-testable.
Defaults mirror :class:`~whygraph.core.config.RationaleConfig`.

Attributes
----------
pr_roster_max_commits : int
Max squashed-commit headlines rendered into a PR block.
pr_discussion_max_comments : int
Max PR comments rendered into a PR block.
pr_comment_max_chars : int
Per-comment body clip.
"""

pr_roster_max_commits: int = 30
pr_discussion_max_comments: int = 20
pr_comment_max_chars: int = 500

@classmethod
def from_config(cls, config: RationaleConfig) -> "_PrRenderCaps":
"""Project the three rendering caps out of a :class:`RationaleConfig`."""
return cls(
pr_roster_max_commits=config.pr_roster_max_commits,
pr_discussion_max_comments=config.pr_discussion_max_comments,
pr_comment_max_chars=config.pr_comment_max_chars,
)


def _indent_block(text: str, prefix: str) -> str:
    """Return ``text`` with ``prefix`` prepended to each of its lines.

    Lines are split with :meth:`str.splitlines` and re-joined with
    ``"\\n"``, so a trailing newline is not preserved and an empty
    ``text`` yields an empty string.
    """
    indented = [f"{prefix}{row}" for row in text.splitlines()]
    return "\n".join(indented)
Expand All @@ -90,15 +138,38 @@ def _format_commit(commit: Commit) -> list[str]:
return lines


def _format_pr(pr: PullRequest) -> list[str]:
"""Render one pull request as the indented lines of an evidence block."""
def _format_pr(pr: PullRequest, caps: _PrRenderCaps = _PrRenderCaps()) -> list[str]:
    """Render one pull request as the indented lines of an evidence block.

    Appends the squashed-commit roster and the PR discussion so the LLM
    sees the narrative a squash merge collapsed. Both are clipped by
    ``caps`` to bound the prompt size; ``pr.commit_titles`` / ``pr.comments``
    are JSON-encoded list columns decoded via :func:`_json_list`.

    Parameters
    ----------
    pr : PullRequest
        The pull request row to render.
    caps : _PrRenderCaps, optional
        Size caps for the roster and the discussion. The default instance
        is created once at definition time, which is safe because
        ``_PrRenderCaps`` is frozen.

    Returns
    -------
    list[str]
        The rendered lines: header, title, then the optional body, roster
        and discussion sections. A section header is only emitted when at
        least one well-formed entry follows it.
    """
    when = f"merged {pr.merged_at}" if pr.merged_at else pr.state
    author = f"by {pr.author}" if pr.author else "by unknown"
    lines = [f" PR #{pr.number} {author} {when}{_labels_suffix(pr.labels)}"]
    lines.append(f" Title: {pr.title}")
    if pr.body and pr.body.strip():
        lines.append(" Body:")
        lines.append(_indent_block(pr.body.strip(), " "))

    # Drop malformed (non-dict) entries BEFORE applying the cap, so junk
    # neither eats into the budget nor leaves a header with nothing under it.
    titles = [c for c in _json_list(pr.commit_titles) if isinstance(c, dict)]
    titles = titles[: caps.pr_roster_max_commits]
    if titles:
        lines.append(" Squashed commits:")
        for c in titles:
            # str() guards against a non-string oid in the stored JSON.
            oid = str(c.get("oid") or "")[:9]
            lines.append(f" - {c.get('headline', '')} ({oid})")

    comments = [cm for cm in _json_list(pr.comments) if isinstance(cm, dict)]
    comments = comments[: caps.pr_discussion_max_comments]
    if comments:
        lines.append(" Discussion:")
        for cm in comments:
            who = cm.get("author") or "unknown"
            # str() guards against a non-string body in the stored JSON.
            body = str(cm.get("body") or "").strip()[: caps.pr_comment_max_chars]
            lines.append(_indent_block(f"[{who}] {body}", " "))
    return lines


Expand All @@ -118,13 +189,16 @@ def _format_issue(issue: Issue) -> list[str]:

# Human-readable description for each evidence source tag ("blame",
# "pr-origin", "blame-walked", ...), keyed by the tag string.
# NOTE(review): presumably looked up by _format_evidence to build the
# "Source:" line of a commit block — the lookup itself is not visible here.
_SOURCE_LABELS = {
"blame": "line-blame",
"pr-origin": "original commit recovered from a squash-merged PR",
"blame-walked": "line-blame (skipped a refactor commit)",
"predecessor-blame": "line-blame on a pre-rename predecessor file",
"area": "area-history (touched the file but not these lines)",
}


def _format_evidence(evidence: Sequence[CommitEvidence]) -> str:
def _format_evidence(
evidence: Sequence[CommitEvidence], caps: _PrRenderCaps = _PrRenderCaps()
) -> str:
"""Render an evidence bundle as the text payload for the rationale prompt.

Commits are formatted in the order given — the caller controls
Expand All @@ -135,6 +209,9 @@ def _format_evidence(evidence: Sequence[CommitEvidence]) -> str:
the row reached this bundle (line-blame, area-history, etc.), so the
LLM can weight precision-vs-coverage signals when synthesising the
rationale.

``caps`` bounds the per-PR roster / discussion rendering (see
:func:`_format_pr`).
"""
n_prs = sum(len(item.pull_requests) for item in evidence)
n_issues = sum(len(item.issues) for item in evidence)
Expand All @@ -149,7 +226,7 @@ def _format_evidence(evidence: Sequence[CommitEvidence]) -> str:
lines.insert(1, f" Source: {label}")
for pr in item.pull_requests:
lines.append("")
lines.extend(_format_pr(pr))
lines.extend(_format_pr(pr, caps))
for issue in item.issues:
lines.append("")
lines.extend(_format_issue(issue))
Expand Down Expand Up @@ -331,6 +408,11 @@ class RationaleGenerator:
``task`` should contain the
:data:`~whygraph.analyze.prompt.RATIONALE_PLACEHOLDER` token. Mostly
used in tests and one-off overrides.
caps : _PrRenderCaps, optional
Size caps for rendering each PR's squashed-commit roster and
discussion into the prompt. ``None`` (default) uses the
:class:`~whygraph.core.config.RationaleConfig` defaults;
:meth:`from_config` projects them from the loaded config.

Examples
--------
Expand All @@ -345,9 +427,11 @@ def __init__(
*,
timeout_sec: int | None = None,
rationale_prompt: Prompt | None = None,
caps: _PrRenderCaps | None = None,
) -> None:
self._client = client
self._timeout_sec = timeout_sec
self._caps = caps if caps is not None else _PrRenderCaps()
self._rationale_prompt = (
rationale_prompt
if rationale_prompt is not None
Expand Down Expand Up @@ -390,7 +474,11 @@ def from_config(
"""
factory = factory if factory is not None else LlmClientFactory()
client = factory.make(config.provider, model=config.model)
return cls(client, timeout_sec=config.timeout_sec)
return cls(
client,
timeout_sec=config.timeout_sec,
caps=_PrRenderCaps.from_config(config),
)

def generate(
self,
Expand Down Expand Up @@ -433,7 +521,7 @@ def generate(

# TODO: capping bundle size belongs to the future evidence-bundle
# builder — the generator neither truncates nor chunks its input.
bundle = _format_evidence(evidence)
bundle = _format_evidence(evidence, self._caps)
if symbol_context is not None:
bundle = f"{_format_symbol_context(symbol_context)}\n\n{bundle}"
task = render(
Expand Down
51 changes: 48 additions & 3 deletions src/whygraph/cli/commands/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@
from rich.table import Table
from rich.text import Text

from whygraph.scan import CodeGraphCrawler, Crawler, GitCrawler, GitHubCrawler
from whygraph.scan import (
CodeGraphCrawler,
Crawler,
GitCrawler,
GitHubCrawler,
PROriginEnricher,
)

from ..console import console

Expand Down Expand Up @@ -73,11 +79,25 @@
"auto-rescan git hooks (`whygraph hooks install`). Default: on."
),
)
@click.option(
"--pr-origins/--no-pr-origins",
"enrich_pr_origins",
default=True,
help=(
"Recover a squash-merged PR's original feature-branch commits — "
"one targeted `git fetch` of the gated PRs' heads, persisted as "
"`commit` rows flagged off the default-branch walk so they enrich "
"evidence without polluting area-history / refactor-walk. Needs "
"the network, so it always runs in the remote phase and is skipped "
"under `--no-remote`. Default: on."
),
)
def scan_cmd(
no_llm_descriptions: bool,
refresh_codegraph: bool,
codegraph_image: str | None,
remote: bool,
enrich_pr_origins: bool,
) -> None:
"""Run the source crawlers, then describe each commit with the LLM."""
# Lazy-imported so that --help and other lightweight CLI surfaces
Expand Down Expand Up @@ -121,6 +141,7 @@ def scan_cmd(
analyze_skip=analyze_skip,
codegraph_enabled=refresh_codegraph,
remote_enabled=remote,
pr_origins_enabled=enrich_pr_origins and github_client is not None,
)

scan_log_path = db_path.parent / "scan.log"
Expand All @@ -142,8 +163,11 @@ def scan_cmd(
if github_client is not None:
phase1.append(GitHubCrawler(progress, client=github_client))

# Phase 2 — the analyzer, started only once phase 1 has joined
# (it reads the commits phase 1 persisted).
# Phase 2 — started only once phase 1 has joined (it reads the
# commits + PRs phase 1 persisted). The analyzer and the PR-origin
# enricher run concurrently: analyze only touches main-walk commits,
# the enricher only inserts new on_default_branch=0 rows, so they
# never contend over the same commit row.
phase2: list[Crawler] = []
if descriptor is not None:
phase2.append(
Expand All @@ -155,6 +179,18 @@ def scan_cmd(
large_commit_file_count=config.analyze.large_commit_file_count,
)
)
# The enricher needs PR rows (the GitHub crawler ran) and the
# network for its fetch — so it is gated on a resolved client, which
# is itself None under --no-remote.
if enrich_pr_origins and github_client is not None:
phase2.append(
PROriginEnricher(
progress,
repository=repository,
min_commits=config.analyze.pr_origin_min_commits,
large_commit_file_count=config.analyze.large_commit_file_count,
)
)

if codegraph_crawler is not None:
codegraph_crawler.start()
Expand Down Expand Up @@ -257,6 +293,7 @@ def _render_scan_panel(
analyze_skip: str | None,
codegraph_enabled: bool,
remote_enabled: bool,
pr_origins_enabled: bool,
) -> None:
"""Print a summary panel of what the upcoming scan will collect.

Expand Down Expand Up @@ -316,6 +353,14 @@ def _render_scan_panel(
("LLM descriptions", Text(f"skipped — {analyze_skip}", style="yellow"))
)
rows.append(("Worker threads", str(config.scan_max_workers)))
rows.append(
(
"PR commit recovery",
"recover squash-merged PR commits"
if pr_origins_enabled
else Text("skipped", style="yellow"),
)
)

grid = Table.grid(padding=(0, 3))
grid.add_column(style="bold cyan", justify="right", no_wrap=True)
Expand Down
49 changes: 46 additions & 3 deletions src/whygraph/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,13 +243,20 @@ class AnalyzeConfig:
timeout_sec : int or None
Per-call timeout forwarded into :class:`CompletionRequest`.
``None`` (default) defers to the bound adapter's default.
pr_origin_min_commits : int
Commit-rich half of the squash-merge enrichment gate
(:mod:`whygraph.scan.pr_origin_enricher`). A squash-merged PR has
its original feature-branch commits recovered when it collapsed at
least this many commits (the file-bulk half reuses
``large_commit_file_count``). Must be ``>= 1``.
"""

provider: str = "anthropic"
model: str | None = None
max_diff_chars: int = 50_000
large_commit_file_count: int = 30
timeout_sec: int | None = None
pr_origin_min_commits: int = 5


@dataclass(frozen=True, slots=True)
Expand Down Expand Up @@ -277,11 +284,24 @@ class RationaleConfig:
timeout_sec : int or None
Per-call timeout forwarded into :class:`CompletionRequest`.
``None`` (default) defers to the bound adapter's default.
pr_roster_max_commits : int
Cap on how many squashed-commit headlines are rendered into a
single PR block in the rationale prompt. Bounds the prompt size
when a squash collapsed a long feature branch. Must be ``>= 1``.
pr_discussion_max_comments : int
Cap on how many PR comments are rendered into a single PR block
in the rationale prompt. Must be ``>= 1``.
pr_comment_max_chars : int
Per-comment body clip applied before rendering a PR comment into
the rationale prompt. Must be ``>= 1``.
"""

provider: str = "anthropic"
model: str | None = None
timeout_sec: int | None = None
pr_roster_max_commits: int = 30
pr_discussion_max_comments: int = 20
pr_comment_max_chars: int = 500


@dataclass(frozen=True, slots=True)
Expand Down Expand Up @@ -449,9 +469,12 @@ def __post_init__(self) -> None:
ConfigError
If ``log_level`` is not a known :class:`LogLevel` name, if
``scan_max_workers`` is less than ``1``, if ``scan_provider``
is not one of ``"off"`` / ``"github"`` / ``"auto"``, or if
``analyze.max_diff_chars`` or ``analyze.large_commit_file_count``
is less than ``1``.
is not one of ``"off"`` / ``"github"`` / ``"auto"``, if
``analyze.max_diff_chars``, ``analyze.large_commit_file_count``
or ``analyze.pr_origin_min_commits`` is less than ``1``, or if
any of the ``rationale`` PR-rendering caps
(``pr_roster_max_commits``, ``pr_discussion_max_comments``,
``pr_comment_max_chars``) is less than ``1``.
"""
try:
LogLevel[self.log_level.upper()]
Expand All @@ -476,6 +499,26 @@ def __post_init__(self) -> None:
"analyze.large_commit_file_count must be >= 1, "
f"got {self.analyze.large_commit_file_count}"
)
if self.analyze.pr_origin_min_commits < 1:
raise ConfigError(
"analyze.pr_origin_min_commits must be >= 1, "
f"got {self.analyze.pr_origin_min_commits}"
)
if self.rationale.pr_roster_max_commits < 1:
raise ConfigError(
"rationale.pr_roster_max_commits must be >= 1, "
f"got {self.rationale.pr_roster_max_commits}"
)
if self.rationale.pr_discussion_max_comments < 1:
raise ConfigError(
"rationale.pr_discussion_max_comments must be >= 1, "
f"got {self.rationale.pr_discussion_max_comments}"
)
if self.rationale.pr_comment_max_chars < 1:
raise ConfigError(
"rationale.pr_comment_max_chars must be >= 1, "
f"got {self.rationale.pr_comment_max_chars}"
)

@classmethod
def from_toml(cls, path: Path) -> Config:
Expand Down
Loading
Loading