Skip to content

Commit 81fd409

Browse files
committed
feat(pycg): bound the fixpoint with --pycg-max-iter; stop following in-tree deps
Two robustness fixes for level-2 PyCG, motivated by odoo divergence analysis.

1. max_iter cap (--pycg-max-iter, default 50). PyCG runs its PostProcessor fixpoint with max_iter=-1 (until convergence). Its abstract domain is field-sensitive access paths with no k-limiting/widening, so on heavy metaclass/mixin code the def set balloons (measured: 23 odoo ORM files -> 7.3k defs at pass 0, 8.4k at pass 1) and convergence may need many O(defs^2) passes. Capping the number of passes returns a sound-but-incomplete graph (the edges found so far) and guarantees termination even with --pycg-shard-timeout 0 (a setting under which a single diverging shard previously hung forever). The cap is threaded through _run_pycg_batch and the Ray worker. Note: the wall-clock timeout is still the guard for shards whose individual passes exceed it.

2. Dependency exclusion. PyCG bounds analysis to its package dir via "if mod_dir not in mod.__file__". The whole-project path used package=project_dir, but an in-tree .codeanalyzer venv / site-packages lives under project_dir, so PyCG followed imports into dependencies and exploded. The whole-project path now runs inside a symlink mini-project (as the shards already do) whose root mirrors only the SKIP_DIRS-filtered source, so deps resolve outside mod_dir and stay ghost nodes.

Add test/test_pycg_sharding.py (max_iter threading; in-tree dep stays a ghost and its internals are never analysed).
1 parent 3985d69 commit 81fd409

5 files changed

Lines changed: 112 additions & 13 deletions

File tree

codeanalyzer/__main__.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,22 @@ def main(
200200
),
201201
),
202202
] = ShardStrategy.JEDI,
203+
pycg_max_iter: Annotated[
204+
int,
205+
typer.Option(
206+
"--pycg-max-iter",
207+
help=(
208+
"Cap on PyCG's fixpoint passes per shard/project (level 2; "
209+
"default 50). PyCG iterates until its points-to state stops "
210+
"changing, but its access-path domain has no convergence bound, "
211+
"so heavy metaclass/mixin code (e.g. an ORM) can loop with each "
212+
"pass costing seconds. The cap returns a sound-but-incomplete "
213+
"call graph instead of looping until the timeout kills it. "
214+
"Set to -1 for PyCG's unbounded run-to-convergence behaviour."
215+
),
216+
min=-1,
217+
),
218+
] = 50,
203219
):
204220
options = AnalysisOptions(
205221
input=input,
@@ -224,6 +240,7 @@ def main(
224240
pycg_shard_ceiling=pycg_shard_ceiling,
225241
pycg_shard_timeout=pycg_shard_timeout,
226242
pycg_shard_strategy=pycg_shard_strategy,
243+
pycg_max_iter=pycg_max_iter,
227244
)
228245

229246
_set_log_level(options.verbosity)

codeanalyzer/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,7 @@ def _get_pycg_call_graph(
683683
shard_ceiling=self.options.pycg_shard_ceiling,
684684
shard_timeout=self.options.pycg_shard_timeout,
685685
shard_strategy=self.options.pycg_shard_strategy,
686+
max_iter=self.options.pycg_max_iter,
686687
using_ray=self.using_ray,
687688
)
688689
return pycg.build_call_graph_edges(symbol_table, jedi_edges=jedi_edges)

codeanalyzer/options/options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,4 @@ class AnalysisOptions:
6161
pycg_shard_ceiling: int = 100
6262
pycg_shard_timeout: int = 120
6363
pycg_shard_strategy: ShardStrategy = ShardStrategy.JEDI
64+
pycg_max_iter: int = 50

codeanalyzer/semantic_analysis/pycg/pycg_analysis.py

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -154,13 +154,14 @@ def _pycg_shard_worker(
154154
entry_points: List[str],
155155
package_dir: str,
156156
prefix: str,
157+
max_iter: int = -1,
157158
) -> List[tuple]:
158159
"""Run PyCG on one shard; called in a Ray worker process.
159160
160161
Returns a list of ``(source, target, weight)`` tuples that the caller
161162
converts to :class:`PyCallEdge` objects. This function is a plain
162163
module-level callable so it can be pickled by Ray without capturing any
163-
class-level state.
164+
class-level state. *max_iter* caps PyCG's fixpoint passes (-1 = unbounded).
164165
"""
165166
import importlib
166167
import sys
@@ -191,7 +192,7 @@ def _pycg_shard_worker(
191192
cg = CallGraphGenerator(
192193
entry_points=entry_points,
193194
package=package_dir,
194-
max_iter=-1,
195+
max_iter=max_iter,
195196
operation="call-graph",
196197
)
197198
cg.analyze()
@@ -366,7 +367,22 @@ class PyCG:
366367
# --pycg-shard-timeout. Set to 0 to disable.
367368
_PYCG_SHARD_TIMEOUT: int = 120
368369

369-
# Directory names that should never be fed to PyCG as entry points.
370+
# Cap on PyCG's outer fixpoint passes. PyCG runs PostProcessor until the
371+
# def/scope/MRO state stops changing; its abstract domain (field-sensitive
372+
# access paths, no k-limiting or widening) has no ascending-chain bound, so
373+
# on heavy metaclass/mixin code (e.g. an ORM) the def set can balloon into
374+
# the thousands and each O(defs^2) pass costs seconds — convergence, if it
375+
# comes, takes many passes. A finite cap turns "loop until killed" into a
376+
# sound-but-incomplete result that still returns the edges found so far.
377+
# 50 is generous — well-behaved code converges in well under 20 passes —
378+
# while bounding the pathological case. Override via --pycg-max-iter;
379+
# -1 restores PyCG's unbounded run-to-convergence behaviour.
380+
_PYCG_MAX_ITER: int = 50
381+
382+
# Directory names that should never be fed to PyCG as entry points, nor
383+
# followed into during import resolution (an in-tree .codeanalyzer venv /
384+
# site-packages lives under project_dir and would otherwise be pulled into
385+
# the package bound and analysed — see _shard_symlink_root).
370386
_SKIP_DIRS: frozenset = frozenset({
371387
".codeanalyzer", ".git", "__pycache__",
372388
"venv", ".venv", "virtualenv", "env", ".env",
@@ -382,6 +398,7 @@ def __init__(
382398
shard_ceiling: Optional[int] = None,
383399
shard_timeout: Optional[int] = None,
384400
shard_strategy: str = "jedi",
401+
max_iter: Optional[int] = None,
385402
using_ray: bool = False,
386403
) -> None:
387404
self.project_dir = Path(project_dir).resolve()
@@ -393,6 +410,7 @@ def __init__(
393410
self.shard_timeout = (
394411
shard_timeout if shard_timeout is not None else self._PYCG_SHARD_TIMEOUT
395412
)
413+
self.max_iter = max_iter if max_iter is not None else self._PYCG_MAX_ITER
396414
# "jedi": partition the Jedi module graph (SCC + Louvain) so coupled
397415
# modules co-compute and few edges are severed (see shard_planner).
398416
# "package": legacy one-shard-per-package-directory grouping.
@@ -519,7 +537,7 @@ def _run_pycg_batch(
519537
cg = self._CallGraphGenerator(
520538
entry_points=entry_points,
521539
package=str(package_dir),
522-
max_iter=-1,
540+
max_iter=self.max_iter,
523541
operation="call-graph",
524542
)
525543
cg.analyze()
@@ -656,7 +674,7 @@ def _build_sharded_planned_ray(self, plan: "ShardPlan") -> List[PyCallEdge]:
656674
continue
657675
root, eps = _materialize_shard_root(files, self.project_dir)
658676
roots.append(root)
659-
fut = remote_fn.remote(eps, str(root), "")
677+
fut = remote_fn.remote(eps, str(root), "", self.max_iter)
660678
futures.append(fut)
661679
meta[fut] = (idx, n)
662680

@@ -844,7 +862,7 @@ def _build_sharded_ray(self, shards: Dict[Path, List[str]]) -> List[PyCallEdge]:
844862
progress.advance()
845863
continue
846864
prefix = self._package_prefix(pkg_root, self.project_dir)
847-
fut = remote_fn.remote(files, str(pkg_root), prefix)
865+
fut = remote_fn.remote(files, str(pkg_root), prefix, self.max_iter)
848866
futures.append(fut)
849867
meta[fut] = (pkg_label, n)
850868

@@ -989,14 +1007,15 @@ def build_call_graph_edges(
9891007
)
9901008
return []
9911009
else:
992-
# Small project (≤ ceiling): whole-project analysis.
1010+
# Small project (≤ ceiling): whole-project analysis. Run inside a
1011+
# symlink mini-project mirroring only the (already SKIP_DIRS-filtered)
1012+
# entry points, so PyCG's package bound covers project source alone.
1013+
# Pointing PyCG at project_dir directly would put an in-tree
1014+
# .codeanalyzer venv / site-packages *under* mod_dir, and PyCG would
1015+
# follow imports into those dependencies and explode the analysis.
9931016
logger.info("PyCG: starting whole-project call graph analysis (%d files)", n_files)
994-
try:
995-
edges = self._run_pycg_batch(
996-
entry_points, self.project_dir, resolver, prefix=""
997-
)
998-
except PyCGExceptions.PyCGAnalysisError as exc:
999-
raise
1017+
with _shard_symlink_root(entry_points, self.project_dir) as (root, eps):
1018+
edges = self._run_pycg_batch(eps, root, resolver, prefix="")
10001019

10011020
elapsed = time.perf_counter() - t0
10021021
logger.info("✅ PyCG: %d edges in %.1fs", len(edges), elapsed)

test/test_pycg_sharding.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""Tests for PyCG executor scoping: dependency exclusion and the max_iter cap.
2+
3+
These drive the real PyCG wrapper, so they require ``pycg`` (a level-2 install
4+
dependency). They are deliberately tiny (a few files) so they run fast.
5+
"""
6+
from pathlib import Path
7+
8+
import pytest
9+
10+
# Import names are case-sensitive: the package these tests need is ``pycg``
# (lowercase). Asking for any other casing fails to import, which would
# silently skip every test in this module instead of running it.
pytest.importorskip("pycg")
11+
12+
from codeanalyzer.semantic_analysis.pycg.pycg_analysis import (
13+
PyCG,
14+
_PyCGCallableResolver,
15+
_shard_symlink_root,
16+
)
17+
18+
19+
def test_max_iter_default_and_override(tmp_path):
    """The fixpoint cap falls back to the class default and honours overrides.

    Without an explicit ``max_iter`` the wrapper must use
    ``PyCG._PYCG_MAX_ITER`` (pinned to 50 here); any explicit value,
    including ``-1``, must be stored unchanged.
    """
    default_cap = PyCG(tmp_path).max_iter
    assert default_cap == PyCG._PYCG_MAX_ITER
    assert PyCG._PYCG_MAX_ITER == 50

    # Explicit values win over the default, -1 included.
    for requested in (7, -1):
        assert PyCG(tmp_path, max_iter=requested).max_iter == requested
24+
25+
26+
def test_pycg_does_not_follow_into_in_tree_dependency(tmp_path):
    """A dependency bundled under ``.codeanalyzer`` must remain a ghost node.

    The project contains an ``app`` package that imports ``bigdep``, which
    lives in an in-tree ``.codeanalyzer/venv/site-packages`` directory.
    Running PyCG inside the symlink mini-project built from the app's entry
    points should record the call into ``bigdep`` as an edge while leaving
    the dependency's own functions unanalysed. Regression guard for the
    dep-reach blowup.
    """
    project = tmp_path

    # Project source: a single package whose ``run`` calls into the dependency.
    app_pkg = project / "app"
    app_pkg.mkdir()
    init_file = app_pkg / "__init__.py"
    main_file = app_pkg / "main.py"
    init_file.write_text("")
    main_file.write_text("import bigdep\ndef run():\n return bigdep.work()\n")

    # Bundled dependency: a ring of 50 functions plus the ``work`` entry. Had
    # PyCG descended into it, dozens of bigdep.fN nodes would show up.
    dep_pkg = project / ".codeanalyzer" / "venv" / "site-packages" / "bigdep"
    dep_pkg.mkdir(parents=True)
    chunks = [f"def f{i}(x):\n return f{(i + 1) % 50}(x)\n" for i in range(50)]
    chunks.append("def work():\n return f0(1)\n")
    (dep_pkg / "__init__.py").write_text("".join(chunks))

    analyzer = PyCG(project)
    analyzer._ensure_pycg_loaded()
    resolver = _PyCGCallableResolver(set())
    app_entry_points = [str(init_file), str(main_file)]
    with _shard_symlink_root(app_entry_points, project) as (root, eps):
        edges = analyzer._run_pycg_batch(eps, root, resolver, prefix="")

    # Collect every endpoint that appears on any edge.
    nodes = set()
    for edge in edges:
        nodes.update((edge.source, edge.target))

    # The dependency is visible as a call target ...
    assert any(name.startswith("bigdep") for name in nodes)
    # ... yet none of its internal functions were analysed ...
    assert not any(name.startswith("bigdep.f") for name in nodes)
    # ... and the genuine application edge survived.
    assert any(
        edge.source == "app.main.run" and edge.target == "bigdep.work"
        for edge in edges
    )

0 commit comments

Comments
 (0)