mtrdesign · cvetty · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/docs/squash-merge-recovery-plan.md b/docs/squash-merge-recovery-plan.md
diff --git a/src/whygraph/analyze/rationale.py b/src/whygraph/analyze/rationale.py
@@ -39,6 +39,10 @@ class CommitEvidence:
 
         * ``"blame"`` — line-level attribution from the target's current
           range (highest-precision signal).
+        * ``"pr-origin"`` — an original feature-branch commit recovered
+          from a squash-merged PR: when the queried lines blame to a
+          squash commit, they are re-blamed at the PR's ``head_sha`` so
+          each line maps back to the commit that actually authored it.
         * ``"blame-walked"`` — surfaced only after blame walked past a
           refactor-heavy commit. Still line-level, but one or more boring
           commits were skipped to reach this author.

diff --git a/src/whygraph/analyze/rationale_generator.py b/src/whygraph/analyze/rationale_generator.py
@@ -21,6 +21,7 @@
 
 import json
 from collections.abc import Sequence
+from dataclasses import dataclass
 
 from whygraph.core.config import RationaleConfig
 from whygraph.db.models import Commit, Issue, PullRequest
@@ -66,6 +67,53 @@ def _labels_suffix(raw: str) -> str:
     return "  [" + ", ".join(str(label) for label in labels) + "]"
 
 
+def _json_list(raw: str | None) -> list:
+    """Decode a JSON-encoded list column; ``[]`` for empty, bad or non-list JSON.
+
+    Mirrors :func:`whygraph.mcp.evidence._json_list` — kept local so the
+    formatters do not depend on the MCP layer.
+    """
+    decoded = None
+    if raw:
+        try:
+            decoded = json.loads(raw)
+        except (TypeError, json.JSONDecodeError):
+            decoded = None  # malformed payload: fall through to []
+    return decoded if isinstance(decoded, list) else []
+
+
+@dataclass(frozen=True, slots=True)
+class _PrRenderCaps:
+    """Limits applied when a PR block is rendered into the rationale prompt.
+
+    Handed to the module-level formatters as an argument (not looked up via
+    a global :func:`get_config`) so they stay pure and unit-testable.
+    Defaults mirror :class:`~whygraph.core.config.RationaleConfig`.
+
+    Attributes
+    ----------
+    pr_roster_max_commits : int
+        Upper bound on squashed-commit headlines shown per PR block.
+    pr_discussion_max_comments : int
+        Upper bound on PR comments shown per PR block.
+    pr_comment_max_chars : int
+        Length each comment body is clipped to.
+    """
+
+    pr_roster_max_commits: int = 30
+    pr_discussion_max_comments: int = 20
+    pr_comment_max_chars: int = 500
+
+    @classmethod
+    def from_config(cls, config: RationaleConfig) -> "_PrRenderCaps":
+        """Copy the three rendering caps off a :class:`RationaleConfig`."""
+        # Positional order matches the field declaration order above.
+        roster = config.pr_roster_max_commits
+        discussion = config.pr_discussion_max_comments
+        clip = config.pr_comment_max_chars
+        return cls(roster, discussion, clip)
+
+
 def _indent_block(text: str, prefix: str) -> str:
     """Indent every line of ``text`` by ``prefix``."""
     return "\n".join(prefix + line for line in text.splitlines())
@@ -90,15 +138,38 @@ def _format_commit(commit: Commit) -> list[str]:
     return lines
 
 
-def _format_pr(pr: PullRequest) -> list[str]:
-    """Render one pull request as the indented lines of an evidence block."""
+def _format_pr(pr: PullRequest, caps: _PrRenderCaps = _PrRenderCaps()) -> list[str]:
+    """Render one pull request as the indented lines of an evidence block.
+
+    Appends the squashed-commit roster and the PR discussion so the LLM
+    sees the narrative a squash merge collapsed. ``pr.commit_titles`` and
+    ``pr.comments`` are JSON list columns (see :func:`_json_list`); non-dict
+    entries are dropped *before* the ``caps`` limits are applied.
+    """
     when = f"merged {pr.merged_at}" if pr.merged_at else pr.state
     author = f"by {pr.author}" if pr.author else "by unknown"
     lines = [f"  PR #{pr.number}  {author}  {when}{_labels_suffix(pr.labels)}"]
     lines.append(f"    Title: {pr.title}")
     if pr.body and pr.body.strip():
         lines.append("    Body:")
         lines.append(_indent_block(pr.body.strip(), "      "))
+    # Drop non-dict entries first, then cap: junk must neither eat into the
+    # limit nor leave a section header with nothing beneath it.
+    titles = [c for c in _json_list(pr.commit_titles) if isinstance(c, dict)]
+    titles = titles[: caps.pr_roster_max_commits]
+    if titles:
+        lines.append("    Squashed commits:")
+        for c in titles:
+            oid = str(c.get("oid") or "")[:9]
+            lines.append(f"      - {c.get('headline') or ''}  ({oid})")
+    comments = [cm for cm in _json_list(pr.comments) if isinstance(cm, dict)]
+    comments = comments[: caps.pr_discussion_max_comments]
+    if comments:
+        lines.append("    Discussion:")
+        for cm in comments:
+            who = cm.get("author") or "unknown"
+            body = str(cm.get("body") or "").strip()[: caps.pr_comment_max_chars]
+            lines.append(_indent_block(f"[{who}] {body}", "      "))
     return lines
 
 
@@ -118,13 +189,16 @@ def _format_issue(issue: Issue) -> list[str]:
 
 _SOURCE_LABELS = {
     "blame": "line-blame",
+    "pr-origin": "original commit recovered from a squash-merged PR",
     "blame-walked": "line-blame (skipped a refactor commit)",
     "predecessor-blame": "line-blame on a pre-rename predecessor file",
     "area": "area-history (touched the file but not these lines)",
 }
 
 
-def _format_evidence(evidence: Sequence[CommitEvidence]) -> str:
+def _format_evidence(
+    evidence: Sequence[CommitEvidence], caps: _PrRenderCaps = _PrRenderCaps()
+) -> str:
     """Render an evidence bundle as the text payload for the rationale prompt.
 
     Commits are formatted in the order given — the caller controls
@@ -135,6 +209,9 @@ def _format_evidence(evidence: Sequence[CommitEvidence]) -> str:
     the row reached this bundle (line-blame, area-history, etc.), so the
     LLM can weight precision-vs-coverage signals when synthesising the
     rationale.
+
+    ``caps`` bounds the per-PR roster / discussion rendering (see
+    :func:`_format_pr`).
     """
     n_prs = sum(len(item.pull_requests) for item in evidence)
     n_issues = sum(len(item.issues) for item in evidence)
@@ -149,7 +226,7 @@ def _format_evidence(evidence: Sequence[CommitEvidence]) -> str:
         lines.insert(1, f"  Source: {label}")
         for pr in item.pull_requests:
             lines.append("")
-            lines.extend(_format_pr(pr))
+            lines.extend(_format_pr(pr, caps))
         for issue in item.issues:
             lines.append("")
             lines.extend(_format_issue(issue))
@@ -331,6 +408,11 @@ class RationaleGenerator:
         ``task`` should contain the
         :data:`~whygraph.analyze.prompt.RATIONALE_PLACEHOLDER` token. Mostly
         used in tests and one-off overrides.
+    caps : _PrRenderCaps, optional
+        Size caps for rendering each PR's squashed-commit roster and
+        discussion into the prompt. ``None`` (default) falls back to
+        ``_PrRenderCaps()``, whose defaults mirror ``RationaleConfig``;
+        :meth:`from_config` projects them from the loaded config.
 
     Examples
     --------
@@ -345,9 +427,11 @@ def __init__(
         *,
         timeout_sec: int | None = None,
         rationale_prompt: Prompt | None = None,
+        caps: _PrRenderCaps | None = None,
     ) -> None:
         self._client = client
         self._timeout_sec = timeout_sec
+        self._caps = caps if caps is not None else _PrRenderCaps()
         self._rationale_prompt = (
             rationale_prompt
             if rationale_prompt is not None
@@ -390,7 +474,11 @@ def from_config(
         """
         factory = factory if factory is not None else LlmClientFactory()
         client = factory.make(config.provider, model=config.model)
-        return cls(client, timeout_sec=config.timeout_sec)
+        return cls(
+            client,
+            timeout_sec=config.timeout_sec,
+            caps=_PrRenderCaps.from_config(config),
+        )
 
     def generate(
         self,
@@ -433,7 +521,7 @@ def generate(
 
         # TODO: capping bundle size belongs to the future evidence-bundle
         # builder — the generator neither truncates nor chunks its input.
-        bundle = _format_evidence(evidence)
+        bundle = _format_evidence(evidence, self._caps)
         if symbol_context is not None:
             bundle = f"{_format_symbol_context(symbol_context)}\n\n{bundle}"
         task = render(

diff --git a/src/whygraph/cli/commands/scan.py b/src/whygraph/cli/commands/scan.py
@@ -12,7 +12,13 @@
 from rich.table import Table
 from rich.text import Text
 
-from whygraph.scan import CodeGraphCrawler, Crawler, GitCrawler, GitHubCrawler
+from whygraph.scan import (
+    CodeGraphCrawler,
+    Crawler,
+    GitCrawler,
+    GitHubCrawler,
+    PROriginEnricher,
+)
 
 from ..console import console
 
@@ -73,11 +79,25 @@
         "auto-rescan git hooks (`whygraph hooks install`). Default: on."
     ),
 )
+@click.option(
+    "--pr-origins/--no-pr-origins",
+    "enrich_pr_origins",
+    default=True,
+    help=(
+        "Recover a squash-merged PR's original feature-branch commits — "
+        "one targeted `git fetch` of the gated PRs' heads, persisted as "
+        "`commit` rows flagged off the default-branch walk so they enrich "
+        "evidence without polluting area-history / refactor-walk. Needs "
+        "the network, so it always runs in the remote phase and is skipped "
+        "under `--no-remote`. Default: on."
+    ),
+)
 def scan_cmd(
     no_llm_descriptions: bool,
     refresh_codegraph: bool,
     codegraph_image: str | None,
     remote: bool,
+    enrich_pr_origins: bool,
 ) -> None:
     """Run the source crawlers, then describe each commit with the LLM."""
     # Lazy-imported so that --help and other lightweight CLI surfaces
@@ -121,6 +141,7 @@ def scan_cmd(
         analyze_skip=analyze_skip,
         codegraph_enabled=refresh_codegraph,
         remote_enabled=remote,
+        pr_origins_enabled=enrich_pr_origins and github_client is not None,
     )
 
     scan_log_path = db_path.parent / "scan.log"
@@ -142,8 +163,11 @@ def scan_cmd(
         if github_client is not None:
             phase1.append(GitHubCrawler(progress, client=github_client))
 
-        # Phase 2 — the analyzer, started only once phase 1 has joined
-        # (it reads the commits phase 1 persisted).
+        # Phase 2 — started only once phase 1 has joined (it reads the
+        # commits + PRs phase 1 persisted). The analyzer and the PR-origin
+        # enricher run concurrently: the analyzer only touches main-walk
+        # commits and the enricher only inserts new on_default_branch=0
+        # rows, so they never contend over the same commit row.
         phase2: list[Crawler] = []
         if descriptor is not None:
             phase2.append(
@@ -155,6 +179,18 @@ def scan_cmd(
                     large_commit_file_count=config.analyze.large_commit_file_count,
                 )
             )
+        # The enricher needs PR rows (the GitHub crawler ran) and the
+        # network for its fetch — so it is gated on a resolved client, which
+        # is itself None under --no-remote.
+        if enrich_pr_origins and github_client is not None:
+            phase2.append(
+                PROriginEnricher(
+                    progress,
+                    repository=repository,
+                    min_commits=config.analyze.pr_origin_min_commits,
+                    large_commit_file_count=config.analyze.large_commit_file_count,
+                )
+            )
 
         if codegraph_crawler is not None:
             codegraph_crawler.start()
@@ -257,6 +293,7 @@ def _render_scan_panel(
     analyze_skip: str | None,
     codegraph_enabled: bool,
     remote_enabled: bool,
+    pr_origins_enabled: bool,
 ) -> None:
     """Print a summary panel of what the upcoming scan will collect.
 
@@ -316,6 +353,14 @@ def _render_scan_panel(
             ("LLM descriptions", Text(f"skipped — {analyze_skip}", style="yellow"))
         )
     rows.append(("Worker threads", str(config.scan_max_workers)))
+    rows.append(
+        (
+            "PR commit recovery",
+            "recover squash-merged PR commits"
+            if pr_origins_enabled
+            else Text("skipped", style="yellow"),
+        )
+    )
 
     grid = Table.grid(padding=(0, 3))
     grid.add_column(style="bold cyan", justify="right", no_wrap=True)

diff --git a/src/whygraph/core/config.py b/src/whygraph/core/config.py
@@ -243,13 +243,20 @@ class AnalyzeConfig:
     timeout_sec : int or None
         Per-call timeout forwarded into :class:`CompletionRequest`.
         ``None`` (default) defers to the bound adapter's default.
+    pr_origin_min_commits : int
+        Commit-rich half of the squash-merge enrichment gate
+        (:mod:`whygraph.scan.pr_origin_enricher`). A squash-merged PR has
+        its original feature-branch commits recovered when it collapsed at
+        least this many commits (the file-bulk half reuses
+        ``large_commit_file_count``). Must be ``>= 1``.
     """
 
     provider: str = "anthropic"
     model: str | None = None
     max_diff_chars: int = 50_000
     large_commit_file_count: int = 30
     timeout_sec: int | None = None
+    pr_origin_min_commits: int = 5
 
 
 @dataclass(frozen=True, slots=True)
@@ -277,11 +284,24 @@ class RationaleConfig:
     timeout_sec : int or None
         Per-call timeout forwarded into :class:`CompletionRequest`.
         ``None`` (default) defers to the bound adapter's default.
+    pr_roster_max_commits : int
+        Cap on how many squashed-commit headlines are rendered into a
+        single PR block in the rationale prompt. Bounds the prompt size
+        when a squash collapsed a long feature branch. Must be ``>= 1``.
+    pr_discussion_max_comments : int
+        Cap on how many PR comments are rendered into a single PR block
+        in the rationale prompt. Must be ``>= 1``.
+    pr_comment_max_chars : int
+        Per-comment body clip applied before rendering a PR comment into
+        the rationale prompt. Must be ``>= 1``.
     """
 
     provider: str = "anthropic"
     model: str | None = None
     timeout_sec: int | None = None
+    pr_roster_max_commits: int = 30
+    pr_discussion_max_comments: int = 20
+    pr_comment_max_chars: int = 500
 
 
 @dataclass(frozen=True, slots=True)
@@ -449,9 +469,12 @@ def __post_init__(self) -> None:
         ConfigError
             If ``log_level`` is not a known :class:`LogLevel` name, if
             ``scan_max_workers`` is less than ``1``, if ``scan_provider``
-            is not one of ``"off"`` / ``"github"`` / ``"auto"``, or if
-            ``analyze.max_diff_chars`` or ``analyze.large_commit_file_count``
-            is less than ``1``.
+            is not one of ``"off"`` / ``"github"`` / ``"auto"``, if
+            ``analyze.max_diff_chars``, ``analyze.large_commit_file_count``
+            or ``analyze.pr_origin_min_commits`` is less than ``1``, or if
+            any of the ``rationale`` PR-rendering caps
+            (``pr_roster_max_commits``, ``pr_discussion_max_comments``,
+            ``pr_comment_max_chars``) is less than ``1``.
         """
         try:
             LogLevel[self.log_level.upper()]
@@ -476,6 +499,26 @@ def __post_init__(self) -> None:
                 "analyze.large_commit_file_count must be >= 1, "
                 f"got {self.analyze.large_commit_file_count}"
             )
+        if self.analyze.pr_origin_min_commits < 1:
+            raise ConfigError(
+                "analyze.pr_origin_min_commits must be >= 1, "
+                f"got {self.analyze.pr_origin_min_commits}"
+            )
+        if self.rationale.pr_roster_max_commits < 1:
+            raise ConfigError(
+                "rationale.pr_roster_max_commits must be >= 1, "
+                f"got {self.rationale.pr_roster_max_commits}"
+            )
+        if self.rationale.pr_discussion_max_comments < 1:
+            raise ConfigError(
+                "rationale.pr_discussion_max_comments must be >= 1, "
+                f"got {self.rationale.pr_discussion_max_comments}"
+            )
+        if self.rationale.pr_comment_max_chars < 1:
+            raise ConfigError(
+                "rationale.pr_comment_max_chars must be >= 1, "
+                f"got {self.rationale.pr_comment_max_chars}"
+            )
 
     @classmethod
     def from_toml(cls, path: Path) -> Config: