Skip to content

Commit 3985d69

Browse files
committed
fix(pycg): key the shard module graph by file path, not module_name
PyModule.module_name is only the file stem (py_file.stem), which collides heavily across a real project — every __init__.py, models.py, views.py shares a name. Keying the partition graph by module_name collapsed all same-stem files into a single node and, via the last-wins module_name->file_path map, silently dropped every colliding file but one from the shards. Observed on odoo: a 1028-file symbol table produced a graph of only 399 nodes (4 fat shards), so ~600 files were never handed to PyCG. Key graph nodes by file_path (unique) instead; carry module_name as a node attribute for readable reporting. plan_shards now emits file-path shards directly (no name->file remap) with a parallel module_shards name view. Add a regression test asserting every file lands in exactly one shard under stem collisions, and update graph tests for file-keyed nodes.
1 parent 5c02ba3 commit 3985d69

2 files changed

Lines changed: 81 additions & 42 deletions

File tree

codeanalyzer/semantic_analysis/pycg/shard_planner.py

Lines changed: 38 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161

6262

6363
# ----------------------------------------------------------------------------
64-
# Symbol-table walking: callable / class signature -> owning module
64+
# Symbol-table walking: callable / class signature -> defining file
6565
# ----------------------------------------------------------------------------
6666

6767
def _walk_callable_sigs(c: PyCallable) -> Iterator[str]:
@@ -80,22 +80,25 @@ def _walk_class_sigs(cls: PyClass) -> Iterator[str]:
8080
yield from _walk_class_sigs(inner)
8181

8282

83-
def _signature_to_module(symbol_table: Dict[str, PyModule]) -> Dict[str, str]:
84-
"""Map every callable/class signature in the project to its module name.
83+
def _signature_to_file(symbol_table: Dict[str, PyModule]) -> Dict[str, str]:
84+
"""Map every callable/class signature in the project to its defining file.
8585
86-
Built by walking each module's full nesting tree, so the mapping is exact
87-
rather than relying on longest-prefix matching against module names (which
88-
is ambiguous when one module name is a prefix of another).
86+
Built by walking each module's full nesting tree, recording the file that
87+
actually defines each callable. Files are the partition unit because
88+
``PyModule.module_name`` is only the file *stem* (``py_file.stem``), which
89+
collides heavily across a real project (every ``__init__.py``, ``models.py``
90+
…) — keying the module graph by name would collapse unrelated files into
91+
one node and silently drop files from shards. ``file_path`` is unique.
8992
"""
90-
sig_to_mod: Dict[str, str] = {}
93+
sig_to_file: Dict[str, str] = {}
9194
for module in symbol_table.values():
9295
for fn in module.functions.values():
9396
for sig in _walk_callable_sigs(fn):
94-
sig_to_mod[sig] = module.module_name
97+
sig_to_file[sig] = module.file_path
9598
for cls in module.classes.values():
9699
for sig in _walk_class_sigs(cls):
97-
sig_to_mod[sig] = module.module_name
98-
return sig_to_mod
100+
sig_to_file[sig] = module.file_path
101+
return sig_to_file
99102

100103

101104
# ----------------------------------------------------------------------------
@@ -133,22 +136,24 @@ def build_module_graph(
133136
symbol_table: Dict[str, PyModule],
134137
jedi_edges: List[PyCallEdge],
135138
) -> nx.DiGraph:
136-
"""Project Jedi callable→callable edges onto a weighted module DiGraph.
137-
138-
Every project module is a node (isolated modules included). Edge weight is
139-
the summed Jedi weight of cross-module call sites; intra-module edges and
140-
edges touching external/library symbols (no symbol-table entry) are
141-
dropped — they cannot influence how the project is partitioned.
139+
"""Project Jedi callable→callable edges onto a weighted file DiGraph.
140+
141+
Nodes are file paths (the unique, collision-free partition unit — see
142+
:func:`_signature_to_file`); each carries a ``module_name`` attribute for
143+
readable reporting. Every project file is a node (isolated files
144+
included). Edge weight is the summed Jedi weight of cross-file call sites;
145+
intra-file edges and edges touching external/library symbols (no
146+
symbol-table entry) are dropped — they cannot influence the partition.
142147
"""
143-
sig_to_mod = _signature_to_module(symbol_table)
148+
sig_to_file = _signature_to_file(symbol_table)
144149

145150
g = nx.DiGraph()
146151
for module in symbol_table.values():
147-
g.add_node(module.module_name, file_path=module.file_path)
152+
g.add_node(module.file_path, module_name=module.module_name)
148153

149154
for edge in jedi_edges:
150-
src = sig_to_mod.get(edge.source)
151-
dst = sig_to_mod.get(edge.target)
155+
src = sig_to_file.get(edge.source)
156+
dst = sig_to_file.get(edge.target)
152157
if src is None or dst is None or src == dst:
153158
continue
154159
if g.has_edge(src, dst):
@@ -355,23 +360,25 @@ def plan_shards(
355360
if merge_small:
356361
unit_shards = _merge_small(unit_shards, hu, unit_size, budget)
357362

358-
# Expand SCC units back to modules, then to file paths.
359-
module_shards: List[List[str]] = []
363+
# Expand SCC units back to file paths (graph nodes are files).
364+
file_shards: List[List[str]] = []
360365
for units in unit_shards:
361-
mods: Set[str] = set()
366+
files: Set[str] = set()
362367
for scc in units:
363-
mods |= unit_members[scc]
364-
if mods:
365-
module_shards.append(sorted(mods))
368+
files |= unit_members[scc]
369+
if files:
370+
file_shards.append(sorted(files))
366371

367-
mod_to_file = {n: g.nodes[n]["file_path"] for n in g.nodes}
368-
file_shards = [[mod_to_file[m] for m in mods] for mods in module_shards]
372+
# Parallel view in module names (file stems) for human-readable reporting.
373+
module_shards = [
374+
[g.nodes[f].get("module_name", f) for f in files] for files in file_shards
375+
]
369376

370377
# Metrics: how much Jedi edge weight does this partition sever?
371378
shard_of: Dict[str, int] = {}
372-
for idx, mods in enumerate(module_shards):
373-
for m in mods:
374-
shard_of[m] = idx
379+
for idx, files in enumerate(file_shards):
380+
for f in files:
381+
shard_of[f] = idx
375382
cut_weight = 0.0
376383
for u, v, w in g.edges(data="weight", default=1):
377384
if shard_of.get(u) != shard_of.get(v):

test/test_shard_planner.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ def _edge(src: str, dst: str, w: int = 1) -> PyCallEdge:
3131
return PyCallEdge(source=src, target=dst, weight=w, provenance=["jedi"])
3232

3333

34-
def _cut_ratio(g: nx.DiGraph, module_shards: List[List[str]]) -> float:
35-
shard_of = {m: i for i, mods in enumerate(module_shards) for m in mods}
34+
def _cut_ratio(g: nx.DiGraph, file_shards: List[List[str]]) -> float:
35+
# Graph nodes are file paths; partitions are lists of file paths.
36+
shard_of = {f: i for i, files in enumerate(file_shards) for f in files}
3637
total = cut = 0.0
3738
for u, v, w in g.edges(data="weight", default=1):
3839
total += w
@@ -45,27 +46,41 @@ def _cut_ratio(g: nx.DiGraph, module_shards: List[List[str]]) -> float:
4546
# build_module_graph
4647
# ----------------------------------------------------------------------------
4748

48-
def test_module_graph_projects_callables_to_modules():
49+
def test_module_graph_keys_nodes_by_file_not_name():
50+
# Two distinct files share the stem "models" (module_name collision); the
51+
# graph must keep them as separate nodes keyed by path, not collapse them.
52+
st = {
53+
"/pkg_a/models.py": _module("models", "/pkg_a/models.py", ["f"]),
54+
"/pkg_b/models.py": _module("models", "/pkg_b/models.py", ["h"]),
55+
}
56+
edges = [
57+
_edge("models.f", "models.h", 3), # but which file? mapped by definition
58+
]
59+
g = build_module_graph(st, edges)
60+
assert set(g.nodes) == {"/pkg_a/models.py", "/pkg_b/models.py"}
61+
assert g.nodes["/pkg_a/models.py"]["module_name"] == "models"
62+
63+
64+
def test_module_graph_projects_callables_to_files():
4965
st = {
5066
"/a.py": _module("a", "/a.py", ["f", "g"]),
5167
"/b.py": _module("b", "/b.py", ["h"]),
5268
}
5369
edges = [
54-
_edge("a.f", "b.h", 3), # cross-module -> kept
55-
_edge("a.f", "a.g", 5), # intra-module -> dropped
70+
_edge("a.f", "b.h", 3), # cross-file -> kept
71+
_edge("a.f", "a.g", 5), # intra-file -> dropped
5672
_edge("a.g", "ext.lib.x"), # external target -> dropped
5773
]
5874
g = build_module_graph(st, edges)
59-
assert set(g.nodes) == {"a", "b"}
60-
assert g.has_edge("a", "b") and g["a"]["b"]["weight"] == 3
61-
assert not g.has_edge("a", "a")
75+
assert set(g.nodes) == {"/a.py", "/b.py"}
76+
assert g.has_edge("/a.py", "/b.py") and g["/a.py"]["/b.py"]["weight"] == 3
6277
assert g.number_of_edges() == 1
6378

6479

65-
def test_isolated_modules_are_nodes():
80+
def test_isolated_files_are_nodes():
6681
st = {"/a.py": _module("a", "/a.py", ["f"]), "/b.py": _module("b", "/b.py", ["g"])}
6782
g = build_module_graph(st, []) # no edges
68-
assert set(g.nodes) == {"a", "b"}
83+
assert set(g.nodes) == {"/a.py", "/b.py"}
6984
assert g.number_of_edges() == 0
7085

7186

@@ -105,9 +120,11 @@ def test_every_module_assigned_exactly_once():
105120
def test_beats_naive_per_package_cut_ratio():
106121
st, edges = _coupled_clusters_project()
107122
g = build_module_graph(st, edges)
123+
# Naive baseline: one shard per top-level directory (e.g. /pkg_a/...).
108124
naive = {}
109125
for m in st.values():
110-
naive.setdefault(m.module_name.split(".")[0], []).append(m.module_name)
126+
top = m.file_path.split("/")[1]
127+
naive.setdefault(top, []).append(m.file_path)
111128
naive_ratio = _cut_ratio(g, list(naive.values()))
112129

113130
plan = plan_shards(st, edges, budget=4)
@@ -150,6 +167,21 @@ def test_determinism():
150167
assert norm(a) == norm(b)
151168

152169

170+
def test_no_file_dropped_on_stem_collision():
171+
# Regression: module_name is only the file stem, so many files collide on
172+
# name (every __init__.py, models.py, ...). Keying by name would drop all
173+
# but one per stem. Every FILE must land in exactly one shard.
174+
st, edges = {}, []
175+
for pkg in ("a", "b", "c", "d"):
176+
for stem in ("__init__", "models", "views"):
177+
path = f"/{pkg}/{stem}.py"
178+
st[path] = _module(stem, path, ["f"])
179+
plan = plan_shards(st, edges, budget=5)
180+
assigned = sorted(f for shard in plan.shards for f in shard)
181+
assert assigned == sorted(st.keys()) # all 12 files present
182+
assert len(assigned) == len(set(assigned)) # none duplicated
183+
184+
153185
def test_empty_project():
154186
plan = plan_shards({}, [], budget=4)
155187
assert plan.shards == []

0 commit comments

Comments
 (0)