Skip to content

Commit aab32e8

Browse files
committed
Add PyCG as Level 2 call graph backend
Wire PyCG as the call graph engine for analysis_level >= 2. PyCG's iterative name-pointer analysis recovers locally-scoped function calls, closures, and higher-order patterns that Jedi's type-inference misses. Edges from both backends are merged; edges seen by both carry provenance=["jedi","pycg"]. Entry-point filter excludes .codeanalyzer, venv, site-packages and other non-project directories so PyCG only analyses the project's own source. Result on test fixture: 6 edges (vs. 2 Jedi-only), recovering all locally-scoped function calls. Signed-off-by: Saurabh Sinha <sinha108@gmail.com>
1 parent 60be312 commit aab32e8

5 files changed

Lines changed: 274 additions & 189 deletions

File tree

codeanalyzer/core.py

Lines changed: 31 additions & 189 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@
1515
merge_edges,
1616
resolve_unresolved_constructors,
1717
)
18-
from codeanalyzer.semantic_analysis.codeql import CodeQLLoader
19-
from codeanalyzer.semantic_analysis.codeql.codeql_analysis import CodeQL
20-
from codeanalyzer.semantic_analysis.codeql.codeql_exceptions import CodeQLExceptions
18+
from codeanalyzer.semantic_analysis.pycg import PyCG, PyCGExceptions
2119
from codeanalyzer.syntactic_analysis.exceptions import SymbolTableBuilderRayError
2220
from codeanalyzer.syntactic_analysis.symbol_table_builder import SymbolTableBuilder
2321
from codeanalyzer.utils import ProgressBar
@@ -48,7 +46,7 @@ def _process_file_with_ray(py_file: Union[Path, str], project_dir: Union[Path, s
4846

4947

5048
class Codeanalyzer:
51-
"""Core functionality for CodeQL analysis.
49+
"""Core static analysis engine for Python projects.
5250
5351
Args:
5452
options (AnalysisOptions): Analysis configuration options containing all necessary parameters.
@@ -58,15 +56,12 @@ def __init__(self, options: AnalysisOptions) -> None:
5856
self.options = options
5957
self.project_dir = Path(options.input).resolve()
6058
self.skip_tests = options.skip_tests
61-
self.using_codeql = options.using_codeql
59+
self.analysis_level = options.analysis_level
6260
self.rebuild_analysis = options.rebuild_analysis
6361
self.cache_dir = (
6462
options.cache_dir.resolve() if options.cache_dir is not None else self.project_dir
6563
) / ".codeanalyzer"
6664
self.clear_cache = options.clear_cache
67-
self.db_path: Optional[Path] = None
68-
self.codeql_bin: Optional[Path] = None
69-
self.codeql_packs_dir: Optional[Path] = None
7065
self.virtualenv: Optional[Path] = None
7166
self.using_ray: bool = options.using_ray
7267
self.file_name: Optional[Path] = options.file_name
@@ -297,60 +292,6 @@ def __enter__(self) -> "Codeanalyzer":
297292
else:
298293
logger.warning("No package definition files found, skipping editable installation")
299294

300-
if self.using_codeql:
301-
logger.info(f"(Re-)initializing CodeQL analysis for {self.project_dir}")
302-
303-
# Resolve the CLI binary before anything else uses it: DB build
304-
# below needs it, and so does every subsequent query run.
305-
self.codeql_bin = self._ensure_codeql_bin()
306-
# Download the standard query library pack (idempotent). The
307-
# CLI install ships only the language extractors; the
308-
# ``codeql/python-all`` library pack must be fetched separately.
309-
self.codeql_packs_dir = self._ensure_codeql_packs(self.codeql_bin)
310-
311-
cache_root = self.cache_dir / "codeql"
312-
cache_root.mkdir(parents=True, exist_ok=True)
313-
self.db_path = cache_root / f"{self.project_dir.name}-db"
314-
self.db_path.mkdir(exist_ok=True)
315-
316-
checksum_file = self.db_path / ".checksum"
317-
current_checksum = self._compute_checksum(self.project_dir)
318-
319-
def is_cache_valid() -> bool:
320-
if not (self.db_path / "db-python").exists():
321-
return False
322-
if not checksum_file.exists():
323-
return False
324-
return checksum_file.read_text().strip() == current_checksum
325-
326-
if self.rebuild_analysis or not is_cache_valid():
327-
logger.info("Creating new CodeQL database...")
328-
329-
cmd = [
330-
str(self.codeql_bin),
331-
"database",
332-
"create",
333-
str(self.db_path),
334-
f"--source-root={self.project_dir}",
335-
"--language=python",
336-
"--overwrite",
337-
]
338-
339-
proc = subprocess.Popen(
340-
cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
341-
)
342-
_, err = proc.communicate()
343-
344-
if proc.returncode != 0:
345-
raise CodeQLExceptions.CodeQLDatabaseBuildException(
346-
f"Error building CodeQL database:\n{err.decode()}"
347-
)
348-
349-
checksum_file.write_text(current_checksum)
350-
351-
else:
352-
logger.info(f"Reusing cached CodeQL DB at {self.db_path}")
353-
354295
return self
355296

356297
def __exit__(self, *args, **kwargs) -> None:
@@ -378,24 +319,22 @@ def analyze(self) -> PyApplication:
378319
# Build symbol table from cached application if available (if not available, then build a new one)
379320
symbol_table = self._build_symbol_table(cached_pyapplication.symbol_table if cached_pyapplication else {})
380321

381-
# Build the call graph in four steps:
382-
# 1. Run CodeQL (when enabled). Produces resolved edges with
383-
# ``provenance=["codeql"]`` and augments ``PyCallsite``s
384-
# in-place — filling ``callee_signature`` for sites Jedi
385-
# couldn't resolve.
386-
# 2. Heuristic fallback for constructor calls neither Jedi nor
387-
# CodeQL could resolve (commonly classes nested inside
388-
# functions). Walks the symbol table by class short-name +
389-
# scope and writes ``<class>.__init__`` into the site.
390-
# 3. Derive Jedi edges from the now-fully-augmented symbol
391-
# table — these reflect every resolution the symbol table
392-
# contains, regardless of which pass put it there.
393-
# 4. Merge with CodeQL edges; provenance unions for edges both
394-
# backends saw.
395-
codeql_edges = self._get_call_graph(symbol_table, augment_sites=True)
322+
# Level 1: symbol table only — constructor heuristic still runs to
323+
# enrich PyCallsite.callee_signature inside the symbol table itself,
324+
# but no call_graph edge list is produced.
396325
resolve_unresolved_constructors(symbol_table)
397-
jedi_edges = jedi_call_graph_edges(symbol_table)
398-
call_graph = merge_edges(jedi_edges, codeql_edges)
326+
327+
call_graph = []
328+
if self.analysis_level >= 2:
329+
# Level 2: build call graph.
330+
# 1. Derive Jedi edges from the augmented symbol table.
331+
# 2. Run PyCG (iterative name-pointer analysis) for additional
332+
# edges — particularly locally-scoped function calls and
333+
# higher-order patterns that Jedi misses.
334+
# 3. Merge; provenance unions for edges seen by both backends.
335+
jedi_edges = jedi_call_graph_edges(symbol_table)
336+
pycg_edges = self._get_pycg_call_graph(symbol_table)
337+
call_graph = merge_edges(jedi_edges, pycg_edges)
399338

400339
# Recreate pyapplication
401340
app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build()
@@ -601,120 +540,23 @@ def _build_symbol_table(self, cached_symbol_table: Optional[Dict[str, PyModule]]
601540
logger.info("✅ Symbol table generation complete.")
602541
return symbol_table
603542

604-
def _ensure_codeql_packs(self, codeql_bin: Path) -> Path:
605-
"""Materialize a qlpack that depends on ``codeql/python-all``.
606-
607-
The CodeQL CLI install ships only the language extractors — query
608-
library packs (and their transitive dependencies like
609-
``codeql/concepts``) must be resolved separately. The canonical
610-
way is to declare the dependency in a ``qlpack.yml`` and run
611-
``codeql pack install`` in that directory; CodeQL writes a
612-
``codeql-pack.lock.yml`` and downloads everything needed.
613-
614-
We do this once per project under ``<cache_dir>/codeql/qlpack/``
615-
and return that directory. The query runner then writes its
616-
temporary ``.ql`` file inside this pack — colocation makes
617-
``import python`` resolve without any ``--additional-packs`` or
618-
``--search-path`` gymnastics.
619-
"""
620-
pack_dir = self.cache_dir / "codeql" / "qlpack"
621-
pack_dir.mkdir(parents=True, exist_ok=True)
622-
qlpack_yml = pack_dir / "qlpack.yml"
623-
lock_file = pack_dir / "codeql-pack.lock.yml"
624-
625-
if not qlpack_yml.exists():
626-
qlpack_yml.write_text(
627-
"name: codeanalyzer-deps\n"
628-
"version: 1.0.0\n"
629-
"dependencies:\n"
630-
' codeql/python-all: "*"\n'
631-
)
632-
633-
if lock_file.exists():
634-
logger.debug(f"CodeQL pack dependencies already installed in {pack_dir}")
635-
return pack_dir
636-
637-
logger.info(f"Installing CodeQL pack dependencies in {pack_dir}.")
638-
proc = subprocess.Popen(
639-
[str(codeql_bin), "pack", "install", str(pack_dir)],
640-
stdout=subprocess.PIPE,
641-
stderr=subprocess.PIPE,
642-
)
643-
_, err = proc.communicate()
644-
if proc.returncode != 0:
645-
raise CodeQLExceptions.CodeQLDatabaseBuildException(
646-
f"Failed to install CodeQL pack dependencies:\n"
647-
f"{(err or b'').decode(errors='replace')}"
648-
)
649-
return pack_dir
650-
651-
def _ensure_codeql_bin(self) -> Path:
652-
"""Locate (or download) the CodeQL CLI binary into the project cache.
653-
654-
Resolution order:
655-
1. An existing binary inside ``<cache_dir>/codeql/bin/`` —
656-
reused across runs on the same project.
657-
2. ``codeql`` already on the user's PATH — picked up verbatim.
658-
3. Otherwise, download into ``<cache_dir>/codeql/bin/``.
659-
660-
The project-local cache is preferred over PATH so the version we
661-
installed earlier wins over whatever the OS ships — keeps behavior
662-
deterministic when the user has both.
663-
"""
664-
bin_root = self.cache_dir / "codeql" / "bin"
665-
bin_root.mkdir(parents=True, exist_ok=True)
666-
667-
existing = next(
668-
(p for p in bin_root.rglob("codeql") if p.is_file()),
669-
None,
670-
)
671-
if existing and os.access(existing, os.X_OK):
672-
logger.debug(f"Reusing cached CodeQL CLI at {existing}")
673-
return existing.resolve()
674-
675-
on_path = shutil.which("codeql")
676-
if on_path:
677-
logger.debug(f"Using CodeQL CLI from PATH at {on_path}")
678-
return Path(on_path)
679-
680-
logger.info(f"CodeQL CLI not found; downloading into {bin_root}.")
681-
downloaded = CodeQLLoader.download_and_extract_codeql(bin_root)
682-
if not downloaded.exists() or not os.access(downloaded, os.X_OK):
683-
raise FileNotFoundError(
684-
f"CodeQL binary not executable after download: {downloaded}"
685-
)
686-
return downloaded
687-
688-
def _get_call_graph(
543+
def _get_pycg_call_graph(
689544
self,
690545
symbol_table: Dict[str, PyModule],
691-
augment_sites: bool = False,
692546
) -> List[PyCallEdge]:
693-
"""Build CodeQL-resolved call edges and optionally augment sites.
694-
695-
Returns an empty list when CodeQL isn't enabled or the database
696-
isn't available. Edges carry ``provenance=["codeql"]`` — merge
697-
with Jedi-derived edges via ``call_graph.merge_edges``.
547+
"""Build PyCG-resolved call edges.
698548
699-
When ``augment_sites`` is True, also mutates
700-
``PyCallable.call_sites`` in the symbol table to backfill
701-
``callee_signature`` for sites Jedi couldn't resolve. The single
702-
CodeQL query is shared (cached on the ``CodeQL`` instance) so
703-
this costs no extra DB work.
549+
Runs PyCG's iterative name-pointer analysis over the whole project
550+
and returns edges with ``provenance=["pycg"]``. Falls back to an
551+
empty list and logs a warning on any failure so the caller can
552+
continue with Jedi-only edges.
704553
"""
705-
if not self.using_codeql or self.db_path is None:
706-
return []
707554
try:
708-
cq = CodeQL(
709-
self.project_dir,
710-
self.db_path,
711-
codeql_bin=self.codeql_bin,
712-
codeql_packs_dir=self.codeql_packs_dir,
713-
)
714-
edges = cq.build_call_graph_edges(symbol_table)
715-
if augment_sites:
716-
cq.augment_call_sites(symbol_table)
717-
return edges
718-
except Exception as exc:
719-
logger.warning(f"CodeQL call-graph extraction failed: {exc}")
555+
pycg = PyCG(self.project_dir, skip_tests=self.skip_tests)
556+
return pycg.build_call_graph_edges(symbol_table)
557+
except PyCGExceptions.PyCGImportError as exc:
558+
logger.warning(f"PyCG not installed — level 2 edges will be Jedi-only: {exc}")
559+
return []
560+
except PyCGExceptions.PyCGAnalysisError as exc:
561+
logger.warning(f"PyCG analysis failed — level 2 edges will be Jedi-only: {exc}")
720562
return []
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
################################################################################
2+
# Copyright IBM Corporation 2025
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
################################################################################
16+
17+
from codeanalyzer.semantic_analysis.pycg.pycg_analysis import PyCG
18+
from codeanalyzer.semantic_analysis.pycg.pycg_exceptions import PyCGExceptions
19+
20+
__all__ = ["PyCG", "PyCGExceptions"]

0 commit comments

Comments
 (0)