From 96f249c1a852b0582a0fc6bae7543f7699fe7f7c Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 11:46:32 -0700 Subject: [PATCH 001/228] fix unit test setup --- src/graph_sitter/compiled/autocommit.pyx | 18 +----------------- .../git/repo_operator/repo_operator.py | 2 +- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/src/graph_sitter/compiled/autocommit.pyx b/src/graph_sitter/compiled/autocommit.pyx index 3d2a27c38..894e40f9d 100644 --- a/src/graph_sitter/compiled/autocommit.pyx +++ b/src/graph_sitter/compiled/autocommit.pyx @@ -1,6 +1,6 @@ import functools from collections.abc import Callable -from typing import Any, ParamSpec, TypeVar, Union, overload +from typing import Any, ParamSpec, TypeVar, Union import wrapt @@ -20,14 +20,6 @@ def is_outdated(c) -> bool: return False -@overload -def reader(wrapped: Callable[P, T]) -> Callable[P, T]: ... - - -@overload -def reader(wrapped: None = None, *, cache: bool | None = ...) -> Callable[[Callable[P, T]], Callable[P, T]]: ... - - def reader(wrapped: Callable[P, T] | None = None, *, cache: bool | None = None) -> Callable[P, T] | Callable[[Callable[P, T]], Callable[P, T]]: """Indicates this method is a read @@ -176,14 +168,6 @@ def update_dict(seen: set["Editable"], obj: "Editable", new_obj: "Editable"): assert not obj.is_outdated -@overload -def commiter(wrapped: Callable[P, T]) -> Callable[P, T]: ... - - -@overload -def commiter(wrapped: None = None, *, reset: bool = ...) -> Callable[[Callable[P, T]], Callable[P, T]]: ... - - def commiter(wrapped: Callable[P, T] | None = None, *, reset: bool = False) -> Callable[P, T] | Callable[[Callable[P, T]], Callable[P, T]]: """Indicates this method is part of a commit. 
There should be no writes within this method and reads will not be updated diff --git a/src/graph_sitter/git/repo_operator/repo_operator.py b/src/graph_sitter/git/repo_operator/repo_operator.py index 3b1f8099c..f08fd5c11 100644 --- a/src/graph_sitter/git/repo_operator/repo_operator.py +++ b/src/graph_sitter/git/repo_operator/repo_operator.py @@ -488,7 +488,7 @@ def _get_username_email(self) -> tuple[str, str] | None: def commit_changes(self, message: str, verify: bool = False) -> bool: """Returns True if a commit was made and False otherwise.""" - staged_changes = self.git_cli.git.diff("--staged") + staged_changes = self.git_cli.git.diff("--no-ext-diff", "--staged") if staged_changes: if self.bot_commit and (info := self._get_username_email()): user, email = info From 14884438fce4826e99700eeb3fc69df180c920d0 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 11:57:42 -0700 Subject: [PATCH 002/228] Add rust rewrite strategy --- rust-rewrite/strategy.md | 210 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 rust-rewrite/strategy.md diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md new file mode 100644 index 000000000..b964fbb10 --- /dev/null +++ b/rust-rewrite/strategy.md @@ -0,0 +1,210 @@ +# Rust Rewrite Strategy + +## Goal + +Replace the memory-heavy Python object graph with a compact Rust engine while preserving the current Python-facing API. The Python shell should remain the user and codemod interface; Rust should own parsing, indexing, symbol/import/export resolution, dependency graph storage, and eventually incremental invalidation. + +The main problem to solve is not just CPU time. The current architecture eagerly materializes the codebase as many Python objects, keeps tree-sitter nodes and parent/context/file links on those objects, stores the same objects in `rustworkx.PyDiGraph`, and maintains additional per-file node lists and range indexes. 
On very large repos this can inflate into tens of GB of resident memory. + +## Strategy + +Build a Rust core behind the existing Python API: + +1. Keep `Codebase`, `SourceFile`, `Symbol`, `Import`, `Export`, and related Python classes as compatibility handles. +2. Move canonical storage into Rust: + - interned paths, strings, import specifiers, symbol names + - compact `FileId`, `NodeId`, `SymbolId`, `ImportId`, `ExportId`, `EdgeId` + - arena/slotmap-backed records instead of Python objects + - adjacency tables or compressed graph storage instead of `PyDiGraph` payloads + - byte ranges and kind enums instead of persistent Python `tree_sitter.Node` wrappers for every node +3. Create Python wrappers lazily only when user code asks for them. +4. Run graph queries in Rust and return IDs or compact records; Python adapts those into existing objects/lists. +5. Port incrementally behind a backend flag, keeping the Python backend available until parity is proven. + +## Non-Goals + +- Do not rewrite the public codemod API first. +- Do not translate every Python class one-for-one into Rust. +- Do not make Rust own all edit formatting in the first slice. +- Do not remove the current Python backend until large-repo memory and parity targets are met. + +## Current Hot Spots To Replace + +- `CodebaseContext` owns a `rustworkx.PyDiGraph` of Python node payloads. +- `SourceFile` eagerly parses and stores all parsed nodes in `_nodes`. +- `Editable` objects keep `ts_node`, `ctx`, `parent`, and file/node IDs. +- Initial graph build parses every source file and then runs import/export/dependency passes over the aggregate node set. +- Dependency recomputation uses object methods and fixed-point list expansion rather than compact indexed frontiers. +- Public queries such as `codebase.symbols`, `codebase.imports`, and `codebase.files` materialize Python lists by filtering graph nodes. 
+ +## Target Architecture + +### Rust Crates + +- `graph_sitter_engine` + - core data model + - tree-sitter parsing + - compact indexes + - import/export/name/scope resolution + - dependency graph + - incremental invalidation + - debug dumps and benchmark hooks +- `graph_sitter_py` + - PyO3 bindings + - backend facade used by Python `CodebaseContext` + - lazy handle constructors + +### Python Integration + +- Add a backend option such as `CodebaseConfig(graph_backend="python" | "rust")`. +- Introduce an engine facade under `CodebaseContext`. +- Keep current Python objects for compatibility, but make Rust-backed versions hold IDs instead of owning canonical state. +- Keep the existing transaction manager initially; Rust should provide ranges and patch intents, not own all formatting in phase 1. + +### Data Model + +Minimum records for the vertical slice: + +- `FileRecord`: path ID, language, content hash, root range, per-file node ranges +- `SymbolRecord`: file ID, name ID, full-name ID, kind, parent symbol, scope, range, declaration range +- `ImportRecord`: file ID, module/name/alias IDs, import kind, range, statement range +- `ExportRecord`: file ID, exported name, target symbol/import/file, range +- `UsageRecord`: file ID, source node, target node, usage kind/type, match range +- `GraphEdge`: source ID, target ID, edge kind, optional usage ID + +## Multi-Agent Work Convention + +This file is the shared coordination ledger for helper agents. + +- Every task must be represented as a Markdown checkbox line. +- Use `[ ]` for open or claimed work and `[x]` for completed work. +- To claim a task, append `owner: <agent-name>` to the same checkbox line. +- To mark a task blocked, keep it unchecked and append `BLOCKED: <reason>`. +- When completing a task, change `[ ]` to `[x]` and append a short result note. +- Add new tasks under the relevant phase rather than creating a separate tracking file.
+- Each agent should append a short entry to `Agent Log` when it starts or finishes meaningful work. +- Avoid broad edits to sections owned by another active agent; add notes instead. +- Keep implementation-specific findings near the task they affect. + +Recommended task format: + +```md +- [ ] Short imperative task title. owner: agent-name. Notes: current finding or next action. +- [x] Completed task title. owner: agent-name. Result: concise outcome. +``` + +## Agent Hierarchy + +- [ ] Lead/RFC agent: maintain this strategy, define interfaces, arbitrate scope, and keep phases coherent. +- [ ] Benchmark agent: measure current memory/time by phase on small, medium, and huge repos. +- [ ] API inventory agent: enumerate public APIs and classify P0/P1/P2 compatibility requirements. +- [ ] Rust data-model agent: design compact arenas, IDs, interners, and graph storage. +- [ ] Parser/index agent: implement Rust tree-sitter extraction into compact IR. +- [ ] Resolver agent: port import, export, scope, name, superclass, and dependency resolution. +- [ ] PyO3 binding agent: expose Rust engine operations to the existing Python package. +- [ ] Incremental agent: design file add/reparse/delete invalidation and stable ID behavior. +- [ ] Parity/test agent: run existing tests against both backends and build golden graph snapshots. +- [ ] Packaging/CI agent: integrate Rust builds with the current hatch/Cython packaging and CI. + +## Phase 0: Baseline, RFC, And Contracts + +- [ ] Add memory benchmark harness for current Python backend. +- [ ] Measure cold parse RSS and wall time for representative repos. +- [ ] Measure graph node/edge counts, Python object counts, and per-phase allocation peaks. +- [ ] Document the exact current build phases with timings: file enumeration, parse, directory tree, config parse, import resolution, export resolution, dependency recompute. +- [ ] Inventory all public `Codebase` properties and methods. 
+- [ ] Inventory all public `SourceFile`, `Symbol`, `Import`, `Export`, and `Directory` APIs used by tests/docs. +- [ ] Define P0 compatibility surface for the first Rust backend slice. +- [ ] Define large-repo success targets for memory and time. +- [ ] Draft Rust engine RFC with module boundaries and Python integration points. +- [ ] Decide build tooling: `maturin`, setuptools-rust, or hatch custom hook. + +## Phase 1: Rust Engine Skeleton + +- [ ] Add Rust workspace/crate skeleton without changing default behavior. +- [ ] Add PyO3 module import smoke test. +- [ ] Add `graph_backend` config flag with default `python`. +- [ ] Add Rust engine facade object that can be constructed from `CodebaseContext`. +- [ ] Add a minimal debug API returning engine version and enabled features. +- [ ] Add CI job that builds the Rust extension on supported Python versions. +- [ ] Add benchmark command that can select `--backend python|rust`. + +## Phase 2: Parser And Compact Index Vertical Slice + +- [ ] Implement Rust file discovery input format from Python repo operator. +- [ ] Implement tree-sitter parser setup for Python. +- [ ] Implement tree-sitter parser setup for TypeScript/TSX. +- [ ] Extract file records with path, language, content hash, and root ranges. +- [ ] Extract top-level Python classes, functions, and globals. +- [ ] Extract top-level TypeScript classes, functions, interfaces, type aliases, enums, and globals. +- [ ] Extract imports for Python. +- [ ] Extract imports and exports for TypeScript. +- [ ] Build path and string interners. +- [ ] Expose `files`, `symbols`, `classes`, `functions`, `imports`, and `exports` ID queries through PyO3. +- [ ] Add golden snapshots for compact IR on small Python fixtures. +- [ ] Add golden snapshots for compact IR on small TypeScript fixtures. + +## Phase 3: Resolution And Dependency Graph + +- [ ] Port Python import resolution rules. +- [ ] Port TypeScript relative import resolution rules. 
+- [ ] Port TypeScript config/path alias handling. +- [ ] Represent external modules compactly. +- [ ] Implement import-to-file and import-to-symbol edges. +- [ ] Implement export-to-symbol/import/file edges. +- [ ] Implement lexical scope tables for name resolution. +- [ ] Implement symbol usage extraction by identifier ranges. +- [ ] Implement dependency edge construction from usage records. +- [ ] Implement superclass/interface dependency edges. +- [ ] Add graph debug dump for nodes, edges, and usage metadata. +- [ ] Add parity tests comparing Python backend and Rust backend graph edges on fixtures. + +## Phase 4: Lazy Python Compatibility Layer + +- [ ] Define Python handle base class that stores engine reference and stable ID. +- [ ] Implement Rust-backed file handles for P0 `SourceFile` APIs. +- [ ] Implement Rust-backed symbol handles for P0 `Symbol`, `Class`, and `Function` APIs. +- [ ] Implement Rust-backed import handles for P0 `Import` APIs. +- [ ] Implement Rust-backed export handles for P0 TypeScript `Export` APIs. +- [ ] Make `Codebase.files` return lazy handles under Rust backend. +- [ ] Make `Codebase.symbols`, `classes`, `functions`, `imports`, and `exports` return lazy handles under Rust backend. +- [ ] Preserve existing sorting behavior for public query results. +- [ ] Add fallback path to Python backend for unsupported methods. +- [ ] Add tests that verify no full Python object graph is materialized for simple list queries. + +## Phase 5: Incremental Sync And Edits + +- [ ] Define stable ID behavior across file reparse. +- [ ] Implement add file in Rust backend. +- [ ] Implement delete file in Rust backend. +- [ ] Implement reparse changed file in Rust backend. +- [ ] Implement dependency invalidation frontier based on changed imports, exports, symbols, and usages. +- [ ] Integrate Rust backend with existing `apply_diffs`. +- [ ] Integrate Rust backend with existing transaction commit flow. 
+- [ ] Preserve Python transaction manager as first edit backend. +- [ ] Add parity tests for rename/move/add-import flows on Rust backend. +- [ ] Add stress tests for repeated incremental edits. + +## Phase 6: Hardening And Rollout + +- [ ] Run full unit suite with Python backend. +- [ ] Run full unit suite with Rust backend where supported. +- [ ] Add large-repo memory regression benchmark to CI or nightly. +- [ ] Add feature flag documentation. +- [ ] Add migration notes for unsupported APIs. +- [ ] Decide default backend criteria. +- [ ] Flip default to Rust only after memory, speed, and parity targets are met. +- [ ] Keep Python backend available for one release after Rust becomes default. + +## Acceptance Targets + +- [ ] Cold parse memory on a representative huge repo is less than 25% of current Python backend. +- [ ] Cold parse wall time is no slower than current Python backend, with a target of at least 2x faster. +- [ ] P0 query APIs have parity with current behavior. +- [ ] Existing unit tests pass for Python backend throughout the rewrite. +- [ ] Rust backend has golden snapshots for graph IR and dependency edges. +- [ ] Unsupported Python APIs fail explicitly or fall back to Python backend. + +## Agent Log + +- [ ] 2026-06-18: Initial strategy file created on `rust-rewrite` branch. owner: codex. Notes: ready for helper agents to claim phase tasks. From f815e24e9854fa5fc0fe04a3f1eef05aa9529b40 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 12:31:53 -0700 Subject: [PATCH 003/228] Record rust rewrite agent roster --- rust-rewrite/strategy.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index b964fbb10..1a0458e80 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -106,6 +106,16 @@ Recommended task format: - [ ] Parity/test agent: run existing tests against both backends and build golden graph snapshots. 
- [ ] Packaging/CI agent: integrate Rust builds with the current hatch/Cython packaging and CI. +## Active Worktrees + +- [ ] Benchmarks/profiling. owner: Poincare. Agent: `019edc37-802c-7223-8d37-75a51b65abbd`. Branch: `codex/rust-rewrite-benchmarks`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-benchmarks`. +- [ ] API inventory. owner: Dewey. Agent: `019edc37-82ff-7b92-9fac-5364e2d8098b`. Branch: `codex/rust-rewrite-api-inventory`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-api-inventory`. +- [ ] Rust data model. owner: Pasteur. Agent: `019edc37-859c-71b2-b884-ab7a2bfc707e`. Branch: `codex/rust-rewrite-data-model`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-data-model`. +- [ ] Parser/index vertical slice. owner: Meitner. Agent: `019edc37-8867-7a83-a18e-b0ec0ca29d11`. Branch: `codex/rust-rewrite-parser-index`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-parser-index`. +- [ ] Resolver/dependency algorithms. owner: Gauss. Agent: `019edc37-8c34-7f93-b0ae-746cbd579962`. Branch: `codex/rust-rewrite-resolver`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-resolver`. +- [ ] Rust engine skeleton. owner: Beauvoir. Agent: `019edc37-8f2d-7dd3-b3ed-a1f9e1b191a7`. Branch: `codex/rust-rewrite-engine-skeleton`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-engine-skeleton`. +- [ ] PyO3/Python compatibility. owner: queued. Branch: `codex/rust-rewrite-pyo3-compat`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-pyo3-compat`. Notes: agent spawn queued until an active helper completes. + ## Phase 0: Baseline, RFC, And Contracts - [ ] Add memory benchmark harness for current Python backend. @@ -208,3 +218,4 @@ Recommended task format: ## Agent Log - [ ] 2026-06-18: Initial strategy file created on `rust-rewrite` branch. owner: codex. Notes: ready for helper agents to claim phase tasks. 
+- [ ] 2026-06-18: Integrator created seven worktrees and spawned six helper agents; PyO3 compatibility is queued due to agent concurrency limit. owner: codex. From fe6051608e27198f686872008d62ba10aff10385 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 12:59:15 -0700 Subject: [PATCH 004/228] Integrate rust rewrite planning artifacts --- Cargo.toml | 16 + crates/graph-sitter-engine/Cargo.toml | 11 + crates/graph-sitter-engine/src/lib.rs | 64 +++ crates/graph-sitter-py/Cargo.toml | 21 + crates/graph-sitter-py/src/lib.rs | 131 +++++ rust-rewrite/api-inventory.md | 274 +++++++++ rust-rewrite/benchmarks.md | 103 ++++ rust-rewrite/data-model.md | 567 +++++++++++++++++++ rust-rewrite/engine-skeleton.md | 24 + rust-rewrite/parser-index.md | 340 +++++++++++ rust-rewrite/resolution-algorithms.md | 309 ++++++++++ rust-rewrite/strategy.md | 38 +- rust-rewrite/tools/measure_python_backend.py | 402 +++++++++++++ 13 files changed, 2283 insertions(+), 17 deletions(-) create mode 100644 Cargo.toml create mode 100644 crates/graph-sitter-engine/Cargo.toml create mode 100644 crates/graph-sitter-engine/src/lib.rs create mode 100644 crates/graph-sitter-py/Cargo.toml create mode 100644 crates/graph-sitter-py/src/lib.rs create mode 100644 rust-rewrite/api-inventory.md create mode 100644 rust-rewrite/benchmarks.md create mode 100644 rust-rewrite/data-model.md create mode 100644 rust-rewrite/engine-skeleton.md create mode 100644 rust-rewrite/parser-index.md create mode 100644 rust-rewrite/resolution-algorithms.md create mode 100644 rust-rewrite/tools/measure_python_backend.py diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000..f70a5907d --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,16 @@ +[workspace] +members = [ + "crates/graph-sitter-engine", + "crates/graph-sitter-py", +] +resolver = "2" + +[workspace.package] +version = "0.1.0" +edition = "2021" +license = "Apache-2.0" +repository = "https://github.com/codegen-sh/graph-sitter" + 
+[workspace.dependencies] +graph-sitter-engine = { path = "crates/graph-sitter-engine" } +pyo3 = "0.22" diff --git a/crates/graph-sitter-engine/Cargo.toml b/crates/graph-sitter-engine/Cargo.toml new file mode 100644 index 000000000..2a2ef9603 --- /dev/null +++ b/crates/graph-sitter-engine/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "graph-sitter-engine" +description = "Core Rust engine skeleton for graph-sitter" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true + +[lib] +name = "graph_sitter_engine" +path = "src/lib.rs" diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs new file mode 100644 index 000000000..913bac76b --- /dev/null +++ b/crates/graph-sitter-engine/src/lib.rs @@ -0,0 +1,64 @@ +#![forbid(unsafe_code)] + +const ENABLED_FEATURES: &[&str] = &["skeleton"]; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct EngineInfo { + version: &'static str, + enabled_features: &'static [&'static str], +} + +impl EngineInfo { + pub fn version(&self) -> &'static str { + self.version + } + + pub fn enabled_features(&self) -> &'static [&'static str] { + self.enabled_features + } +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct Engine; + +impl Engine { + pub fn new() -> Self { + Self + } + + pub fn debug_info(&self) -> EngineInfo { + debug_info() + } + + pub fn version(&self) -> &'static str { + engine_version() + } + + pub fn enabled_features(&self) -> &'static [&'static str] { + ENABLED_FEATURES + } +} + +pub fn engine_version() -> &'static str { + env!("CARGO_PKG_VERSION") +} + +pub fn debug_info() -> EngineInfo { + EngineInfo { + version: engine_version(), + enabled_features: ENABLED_FEATURES, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn debug_info_reports_version_and_skeleton_feature() { + let info = Engine::new().debug_info(); + + assert_eq!(info.version(), env!("CARGO_PKG_VERSION")); + assert_eq!(info.enabled_features(), 
["skeleton"]); + } +} diff --git a/crates/graph-sitter-py/Cargo.toml b/crates/graph-sitter-py/Cargo.toml new file mode 100644 index 000000000..f70c7e676 --- /dev/null +++ b/crates/graph-sitter-py/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "graph-sitter-py" +description = "PyO3 binding placeholder for graph-sitter-engine" +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true + +[lib] +name = "graph_sitter_py" +path = "src/lib.rs" +crate-type = ["cdylib", "rlib"] + +[features] +default = [] +pyo3-bindings = ["dep:pyo3"] +extension-module = ["pyo3-bindings", "pyo3/extension-module"] + +[dependencies] +graph-sitter-engine.workspace = true +pyo3 = { workspace = true, optional = true } diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs new file mode 100644 index 000000000..2b992a341 --- /dev/null +++ b/crates/graph-sitter-py/src/lib.rs @@ -0,0 +1,131 @@ +#![cfg_attr(not(feature = "pyo3-bindings"), forbid(unsafe_code))] + +pub fn engine_version() -> &'static str { + graph_sitter_engine::engine_version() +} + +pub fn enabled_features() -> &'static [&'static str] { + graph_sitter_engine::debug_info().enabled_features() +} + +#[cfg(feature = "pyo3-bindings")] +mod bindings { + use graph_sitter_engine::{self, Engine, EngineInfo}; + use pyo3::prelude::*; + + #[pyclass(name = "EngineInfo", module = "graph_sitter_py")] + #[derive(Debug, Clone, PartialEq, Eq)] + pub struct PyEngineInfo { + version: String, + enabled_features: Vec<String>, + } + + impl From<EngineInfo> for PyEngineInfo { + fn from(info: EngineInfo) -> Self { + Self { + version: info.version().to_owned(), + enabled_features: info + .enabled_features() + .iter() + .map(|feature| (*feature).to_owned()) + .collect(), + } + } + } + + #[pymethods] + impl PyEngineInfo { + #[getter] + fn version(&self) -> &str { + &self.version + } + + #[getter] + fn enabled_features(&self) -> Vec<String> { + self.enabled_features.clone() + } + + fn __repr__(&self) -> String { +
format!( + "EngineInfo(version={:?}, enabled_features={:?})", + self.version, self.enabled_features + ) + } + } + + #[pyclass(name = "Engine", module = "graph_sitter_py")] + #[derive(Debug, Default, Clone)] + pub struct PyEngine { + inner: Engine, + } + + #[pymethods] + impl PyEngine { + #[new] + fn new() -> Self { + Self { + inner: Engine::new(), + } + } + + #[getter] + fn version(&self) -> &str { + self.inner.version() + } + + fn enabled_features(&self) -> Vec<String> { + self.inner + .enabled_features() + .iter() + .map(|feature| (*feature).to_owned()) + .collect() + } + + fn debug_info(&self) -> PyEngineInfo { + self.inner.debug_info().into() + } + } + + #[pyfunction(name = "engine_version")] + fn py_engine_version() -> &'static str { + graph_sitter_engine::engine_version() + } + + #[pyfunction(name = "debug_info")] + fn py_debug_info() -> PyEngineInfo { + graph_sitter_engine::debug_info().into() + } + + #[pymodule] + fn graph_sitter_py(m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::<PyEngineInfo>()?; + m.add_class::<PyEngine>()?; + m.add_function(wrap_pyfunction!(py_engine_version, m)?)?; + m.add_function(wrap_pyfunction!(py_debug_info, m)?)?; + Ok(()) + } + + #[cfg(test)] + mod tests { + use super::*; + + #[test] + fn debug_info_forwards_core_engine_metadata() { + let info = py_debug_info(); + + assert_eq!(info.version, graph_sitter_engine::engine_version()); + assert_eq!(info.enabled_features, vec!["skeleton".to_owned()]); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn forwards_core_engine_metadata_without_python_linking() { + assert_eq!(engine_version(), graph_sitter_engine::engine_version()); + assert_eq!(enabled_features(), ["skeleton"]); + } +} diff --git a/rust-rewrite/api-inventory.md b/rust-rewrite/api-inventory.md new file mode 100644 index 000000000..9b0122f15 --- /dev/null +++ b/rust-rewrite/api-inventory.md @@ -0,0 +1,274 @@ +# Rust Rewrite API Inventory + +Inventory date: 2026-06-18 + +Scope: Python-facing public APIs that the Rust backend
must preserve for `Codebase`, `File`/`SourceFile`, `Symbol`, `Import`, `Export`, and `Directory`. This inventory prioritizes APIs referenced by API docs, unit tests, and codemod examples/workflows. Source references point to the current Python implementation. + +Priority meanings: + +- P0: First Rust backend slice must preserve behavior and return shapes. It may still return Python compatibility handles, but query results, ordering, exceptions, and basic resolution semantics must match. +- P1: Public and used enough to preserve, but can initially fall back to the Python backend or existing transaction manager. Most edit/search/AST-manipulation APIs are here. +- P2: Preserve as explicit fallback, compatibility shim, or documented unsupported behavior for the Rust backend. These are Git/GitHub, AI, visualization, diagnostics, or low-level/internal APIs. + +## P0 Compatibility Surface + +### Codebase + +Source references: `src/graph_sitter/core/codebase.py:259`, `src/graph_sitter/core/codebase.py:286`, `src/graph_sitter/core/codebase.py:338`, `src/graph_sitter/core/codebase.py:351`, `src/graph_sitter/core/codebase.py:366`, `src/graph_sitter/core/codebase.py:399`, `src/graph_sitter/core/codebase.py:409`, `src/graph_sitter/core/codebase.py:421`, `src/graph_sitter/core/codebase.py:432`, `src/graph_sitter/core/codebase.py:443`, `src/graph_sitter/core/codebase.py:455`, `src/graph_sitter/core/codebase.py:529`, `src/graph_sitter/core/codebase.py:551`, `src/graph_sitter/core/codebase.py:596`, `src/graph_sitter/core/codebase.py:609`, `src/graph_sitter/core/codebase.py:631`, `src/graph_sitter/core/codebase.py:644`, `src/graph_sitter/core/codebase.py:671`, `src/graph_sitter/core/codebase.py:687`, `src/graph_sitter/core/codebase.py:711`, `src/graph_sitter/core/codebase.py:803`, `src/graph_sitter/core/codebase.py:846`, `src/graph_sitter/core/codebase.py:1331`, `src/graph_sitter/core/codebase.py:1405`, `src/graph_sitter/core/codebase.py:1452`. 
+ +Docs/tests/codemods evidence: `docs/api-reference/core/Codebase.mdx`, `tests/unit/sdk/core/test_codebase.py`, `tests/unit/sdk/python/codebase/test_codebase.py`, `src/codemods/**`, `docs/tutorials/**`. + +- Construction and metadata: + - `Codebase(...)` constructor surface and config behavior. + - `Codebase.from_files(...)` and `Codebase.from_string(...)` for fixture/test construction. + - `Codebase.from_repo(...)` should keep its Python checkout/setup behavior; the Rust engine can start after the repo path and config are resolved. + - `codebase.name` and `codebase.language`. +- File and directory queries: + - `codebase.files(...)`, including `extensions=None`, `extensions="*"`, `extensions=[...]`, source-file-only default behavior, and alphabetical sorting. + - `codebase.has_file(filepath, ignore_case=False)` and `codebase.get_file(filepath, optional=False, ignore_case=False)`. + - `codebase.directories`, `codebase.has_directory(dir_path, ignore_case=False)`, and `codebase.get_directory(dir_path, optional=False, ignore_case=False)`. +- Graph-level node queries: + - `codebase.imports`. + - `codebase.exports` for TypeScript, including `NotImplementedError` on Python codebases. + - `codebase.symbols`, `codebase.classes`, `codebase.functions`, `codebase.global_vars`, `codebase.interfaces`, `codebase.types`. + - `codebase.has_symbol(name)`, `codebase.get_symbol(name, optional=False)`, `codebase.get_symbols(name)`, `codebase.get_class(name, optional=False)`, `codebase.get_function(name, optional=False)`. + - Ambiguity and missing-result errors for `get_symbol`, `get_class`, and `get_function`. +- Transaction compatibility: + - `codebase.commit(...)` and `codebase.reset(...)` must remain callable for codemod workflows. The first Rust slice should delegate to the existing Python transaction manager rather than porting edit application. 
+ +### File and SourceFile + +Source references: `src/graph_sitter/core/file.py:50`, `src/graph_sitter/core/file.py:121`, `src/graph_sitter/core/file.py:131`, `src/graph_sitter/core/file.py:168`, `src/graph_sitter/core/file.py:180`, `src/graph_sitter/core/file.py:191`, `src/graph_sitter/core/file.py:253`, `src/graph_sitter/core/file.py:411`, `src/graph_sitter/core/file.py:613`, `src/graph_sitter/core/file.py:633`, `src/graph_sitter/core/file.py:647`, `src/graph_sitter/core/file.py:669`, `src/graph_sitter/core/file.py:681`, `src/graph_sitter/core/file.py:696`, `src/graph_sitter/core/file.py:708`, `src/graph_sitter/core/file.py:734`, `src/graph_sitter/core/file.py:752`, `src/graph_sitter/core/file.py:773`, `src/graph_sitter/core/file.py:785`, `src/graph_sitter/core/file.py:797`, `src/graph_sitter/core/file.py:810`, `src/graph_sitter/core/file.py:826`, `src/graph_sitter/core/file.py:839`, `src/graph_sitter/core/file.py:921`, `src/graph_sitter/core/file.py:1174`, `src/graph_sitter/python/file.py:38`, `src/graph_sitter/python/file.py:85`, `src/graph_sitter/typescript/file.py:47`, `src/graph_sitter/typescript/file.py:61`, `src/graph_sitter/typescript/file.py:79`, `src/graph_sitter/typescript/file.py:91`, `src/graph_sitter/typescript/file.py:107`, `src/graph_sitter/typescript/file.py:121`, `src/graph_sitter/typescript/file.py:136`, `src/graph_sitter/typescript/file.py:148`, `src/graph_sitter/typescript/file.py:160`, `src/graph_sitter/typescript/file.py:174`, `src/graph_sitter/typescript/file.py:426`. + +Docs/tests/codemods evidence: `docs/api-reference/core/File.mdx`, `docs/api-reference/core/SourceFile.mdx`, `docs/api-reference/python/PyFile.mdx`, `docs/api-reference/typescript/TSFile.mdx`, `tests/unit/sdk/python/file/test_file_properties.py`, `tests/unit/sdk/typescript/file/test_file_import_statemets.py`, `tests/unit/sdk/typescript/export/test_export_resolve_export.py`. + +- File identity and content: + - `file.name`, `file.file_path`, `file.filepath`, `file.path`. 
+ - `file.content`, `file.content_bytes`, `file.source`. + - `file.directory`, `file.extension`, `file.is_binary`. + - `File.get_extensions()`, `PyFile.get_extensions()`, `TSFile.get_extensions()`. + - Class constructors used in tests: `File.from_content(...)`, language-specific `from_content(...)`, and `create_from_filepath(...)`. +- Source-file graph queries: + - `file.imports`, `file.import_statements`, `file.inbound_imports`, `file.importers`. + - `file.has_import(name_or_source)` and `file.get_import(name_or_source, optional=False)`. + - `file.symbols(...)`, including nested filtering behavior. + - `file.symbols_sorted_topologically`. + - `file.get_symbol(name, optional=False)`. + - `file.global_vars`, `file.get_global_var(name, optional=False)`. + - `file.classes`, `file.get_class(name, optional=False)`. + - `file.functions`, `file.get_function(name, optional=False)`. + - `file.find_by_byte_range(...)`. +- TypeScript-specific source-file queries: + - `file.exports`, `file.export_statements`, `file.default_exports`, `file.named_exports`, `file.get_export(name, optional=False)`. + - `file.interfaces`, `file.get_interface(name, optional=False)`. + - `file.types`, `file.get_type(name, optional=False)`. + - `file.get_namespace(name, optional=False)`. + - `file.promise_chains` should return the same shape if the Rust slice exposes TS expression indexes; otherwise route to Python initially. +- Import string helpers: + - `file.import_module_name(...)`. + - `PyFile.get_import_string(...)`. + - `TSFile.get_import_string(...)`. 
+ +### Symbol and Inherited Editable/Usable APIs + +Source references: `src/graph_sitter/core/symbol.py:41`, `src/graph_sitter/core/symbol.py:96`, `src/graph_sitter/core/symbol.py:141`, `src/graph_sitter/core/symbol.py:435`, `src/graph_sitter/core/interfaces/has_name.py:17`, `src/graph_sitter/core/interfaces/has_name.py:29`, `src/graph_sitter/core/interfaces/usable.py:25`, `src/graph_sitter/core/interfaces/usable.py:44`, `src/graph_sitter/core/interfaces/importable.py:44`, `src/graph_sitter/core/interfaces/editable.py:236`, `src/graph_sitter/core/interfaces/editable.py:372`, `src/graph_sitter/core/interfaces/editable.py:383`, `src/graph_sitter/core/interfaces/editable.py:1048`, `src/graph_sitter/python/symbol.py:33`, `src/graph_sitter/python/symbol.py:45`, `src/graph_sitter/typescript/symbol.py:35`, `src/graph_sitter/typescript/symbol.py:130`, `src/graph_sitter/typescript/symbol.py:407`. + +Docs/tests/codemods evidence: `docs/api-reference/core/Symbol.mdx`, `docs/api-reference/core/Editable.mdx`, `docs/api-reference/core/Usable.mdx`, `docs/api-reference/core/HasName.mdx`, `docs/api-reference/python/PySymbol.mdx`, `docs/api-reference/typescript/TSSymbol.mdx`, codemods under `src/codemods/`. + +- Symbol identity and source: + - `symbol.name`, `symbol.full_name`, `symbol.symbol_type`. + - `symbol.file`, `symbol.filepath`, `symbol.source`, `symbol.extended_source`, `symbol.extended_nodes`. + - Python `symbol.is_exported`. + - TypeScript export-facing metadata such as `symbol.export`, `symbol.exported_name`, `symbol.has_semicolon`, and `symbol.semicolon_node` where used by TS export/edit helpers. +- Graph relationships: + - `symbol.dependencies`. + - `symbol.usages` and `symbol.symbol_usages`. + - `symbol.descendant_symbols`. + - `symbol.function_calls`. +- Name/source helpers that must still work on compatibility handles: + - `symbol.get_name()`. + - `symbol.get_import_string(...)` for Python and TypeScript language subclasses. 
+ +### Import + +Source references: `src/graph_sitter/core/import_resolution.py:60`, `src/graph_sitter/core/import_resolution.py:165`, `src/graph_sitter/core/import_resolution.py:184`, `src/graph_sitter/core/import_resolution.py:202`, `src/graph_sitter/core/import_resolution.py:213`, `src/graph_sitter/core/import_resolution.py:224`, `src/graph_sitter/core/import_resolution.py:237`, `src/graph_sitter/core/import_resolution.py:252`, `src/graph_sitter/core/import_resolution.py:278`, `src/graph_sitter/core/import_resolution.py:291`, `src/graph_sitter/core/import_resolution.py:356`, `src/graph_sitter/core/import_resolution.py:379`, `src/graph_sitter/core/import_resolution.py:392`, `src/graph_sitter/core/import_resolution.py:526`, `src/graph_sitter/core/import_resolution.py:545`, `src/graph_sitter/python/import_resolution.py:33`, `src/graph_sitter/python/import_resolution.py:44`, `src/graph_sitter/python/import_resolution.py:63`, `src/graph_sitter/python/import_resolution.py:87`, `src/graph_sitter/python/import_resolution.py:331`, `src/graph_sitter/typescript/import_resolution.py:35`, `src/graph_sitter/typescript/import_resolution.py:58`, `src/graph_sitter/typescript/import_resolution.py:78`, `src/graph_sitter/typescript/import_resolution.py:93`, `src/graph_sitter/typescript/import_resolution.py:110`, `src/graph_sitter/typescript/import_resolution.py:137`, `src/graph_sitter/typescript/import_resolution.py:200`, `src/graph_sitter/typescript/import_resolution.py:548`, `src/graph_sitter/typescript/import_resolution.py:582`, `src/graph_sitter/typescript/import_resolution.py:603`. + +Docs/tests/codemods evidence: `docs/api-reference/core/Import.mdx`, `docs/api-reference/python/PyImport.mdx`, `docs/api-reference/typescript/TSImport.mdx`, `tests/unit/sdk/typescript/file/test_file_import_statemets.py`, TS export/import resolution tests, codemods under `src/codemods/`. 
+ +- Import identity: + - `import.name`, `import.source`, `import.module`, `import.symbol_name`, `import.alias`, `import.import_type`. + - `import.import_specifier`. +- Import predicates: + - `import.is_aliased_import`, `import.is_module_import`, `import.is_symbol_import`, `import.is_wildcard_import`, `import.is_dynamic`, `import.is_reexport`. + - TypeScript `import.is_type_import`, `import.is_default_import`, `import.namespace_imports`, `import.is_namespace_import`. +- Resolution: + - `import.from_file`, `import.to_file`. + - `import.imported_symbol`, `import.resolved_symbol`, `import.imported_exports`, `import.namespace`. + - Python `resolve_import(...)` and TypeScript `resolve_import(...)` semantics should be reflected through the public properties even if the function itself is not exposed as the first Rust boundary. +- Import string helpers: + - `import.get_import_string(...)`. + +### Export + +Source references: `src/graph_sitter/core/export.py:22`, `src/graph_sitter/core/export.py:41`, `src/graph_sitter/core/export.py:50`, `src/graph_sitter/core/export.py:61`, `src/graph_sitter/core/export.py:69`, `src/graph_sitter/core/export.py:80`, `src/graph_sitter/typescript/export.py:45`, `src/graph_sitter/typescript/export.py:236`, `src/graph_sitter/typescript/export.py:248`, `src/graph_sitter/typescript/export.py:274`, `src/graph_sitter/typescript/export.py:299`, `src/graph_sitter/typescript/export.py:312`, `src/graph_sitter/typescript/export.py:328`, `src/graph_sitter/typescript/export.py:339`, `src/graph_sitter/typescript/export.py:350`, `src/graph_sitter/typescript/export.py:365`, `src/graph_sitter/typescript/export.py:381`, `src/graph_sitter/typescript/export.py:523`, `src/graph_sitter/typescript/export.py:549`, `src/graph_sitter/typescript/export.py:561`, `src/graph_sitter/typescript/export.py:578`, `src/graph_sitter/typescript/export.py:617`. 
+ +Docs/tests/codemods evidence: `docs/api-reference/core/Export.mdx`, `docs/api-reference/typescript/TSExport.mdx`, `tests/unit/sdk/typescript/export/test_export_resolve_export.py`, TS export codemod examples. + +- Export identity and source: + - `export.name`, `export.source`, `export.exported_name` where exposed by TS-specific classes. + - `export.descendant_symbols`. +- Export predicates: + - `export.is_named_export`, `export.is_default_export`, `export.is_default_symbol_export`, `export.is_type_export`, `export.is_reexport`, `export.is_wildcard_export`, `export.is_module_export`, `export.is_aliased`, `export.is_external_export`. +- Resolution: + - `export.declared_symbol`, `export.exported_symbol`, `export.resolved_symbol`. + - Reexport and wildcard resolution must preserve current symbol/import/file targets. +- Import string helpers: + - `export.to_import_string(...)` and `export.get_import_string(...)`. + +### Directory + +Source references: `src/graph_sitter/core/directory.py:31`, `src/graph_sitter/core/directory.py:60`, `src/graph_sitter/core/directory.py:71`, `src/graph_sitter/core/directory.py:95`, `src/graph_sitter/core/directory.py:99`, `src/graph_sitter/core/directory.py:105`, `src/graph_sitter/core/directory.py:116`, `src/graph_sitter/core/directory.py:158`, `src/graph_sitter/core/directory.py:177`, `src/graph_sitter/core/directory.py:188`, `src/graph_sitter/core/directory.py:199`, `src/graph_sitter/core/directory.py:204`, `src/graph_sitter/core/directory.py:213`, `src/graph_sitter/core/directory.py:224`, `src/graph_sitter/core/directory.py:240`, `src/graph_sitter/core/interfaces/has_symbols.py:51`. + +Docs/tests/codemods evidence: `docs/api-reference/core/Directory.mdx`, `tests/unit/sdk/core/test_directory.py`, directory traversal examples in docs/codemods. + +- Directory identity and traversal: + - `directory.name`, `directory.path`, `directory.dirpath`, `directory.parent`. 
+ - `directory.files(...)`, `directory.subdirectories(...)`, `directory.items`, `directory.item_names`, `directory.file_names`, `directory.tree`. + - `directory.get_file(name)`, `directory.get_subdirectory(name)`. + - `__iter__`, `__contains__`, `__len__`, and `__getitem__`. +- Inherited recursive symbol queries from `HasSymbols`: + - `directory.symbols`, `directory.import_statements`, `directory.global_vars`, `directory.classes`, `directory.functions`, `directory.exports`, `directory.imports`. + - `directory.get_symbol(...)`, `directory.get_import_statement(...)`, `directory.get_global_var(...)`, `directory.get_class(...)`, `directory.get_function(...)`, `directory.get_export(...)`, `directory.get_import(...)`. + +## P1 Compatibility Surface + +P1 APIs should be preserved, but the first Rust backend can use the current Python implementation as a fallback. These APIs create or mutate files, edits, imports, exports, names, comments, or AST source ranges. + +### Codebase P1 + +Source references: `src/graph_sitter/core/codebase.py:325`, `src/graph_sitter/core/codebase.py:388`, `src/graph_sitter/core/codebase.py:476`, `src/graph_sitter/core/codebase.py:511`, `src/graph_sitter/core/codebase.py:748`, `src/graph_sitter/core/codebase.py:1012`, `src/graph_sitter/core/codebase.py:1185`, `src/graph_sitter/core/codebase.py:1196`, `src/graph_sitter/core/codebase.py:1293`, `src/graph_sitter/core/codebase.py:1310`. + +- `codebase.create_file(...)`. +- `codebase.create_directory(...)`. +- `codebase.codeowners`. +- `codebase.external_modules`. +- `codebase.get_relative_path(from_file, to_file)`. +- `codebase.find_by_span(span)`. +- `codebase.set_session_options(...)`. +- `codebase.ai(...)`, `codebase.ai_client`, and AI/session helpers, if enabled in the environment. +- `codebase.visualize(...)`, if graph handles can be mapped back to a display graph. 
+ +### File and SourceFile P1 + +Source references: `src/graph_sitter/core/file.py:238`, `src/graph_sitter/core/file.py:262`, `src/graph_sitter/core/file.py:294`, `src/graph_sitter/core/file.py:329`, `src/graph_sitter/core/file.py:359`, `src/graph_sitter/core/file.py:396`, `src/graph_sitter/core/file.py:976`, `src/graph_sitter/core/file.py:1027`, `src/graph_sitter/core/file.py:1047`, `src/graph_sitter/typescript/file.py:214`, `src/graph_sitter/typescript/file.py:230`, `src/graph_sitter/typescript/file.py:298`, `src/graph_sitter/typescript/file.py:322`, `src/graph_sitter/typescript/file.py:397`. + +- `file.write(...)`, `file.write_bytes(...)`. +- `file.edit(...)`, `file.replace(...)`, `file.remove(...)`. +- `file.rename(...)`, `file.update_filepath(...)`. +- `file.add_import(...)`. +- `file.add_symbol_from_source(...)`, `file.add_symbol(...)`. +- TypeScript `file.add_export_to_symbol(...)`. +- TypeScript `file.remove_unused_exports(...)`. +- TypeScript `file.has_export_statement_for_path(...)` and `file.get_export_statement_for_path(...)`. +- TypeScript `file.update_filepath(...)` behavior that also updates import paths. 
+ +### Editable and Symbol P1 + +Source references: `src/graph_sitter/core/symbol.py:123`, `src/graph_sitter/core/symbol.py:169`, `src/graph_sitter/core/symbol.py:179`, `src/graph_sitter/core/symbol.py:189`, `src/graph_sitter/core/symbol.py:204`, `src/graph_sitter/core/symbol.py:219`, `src/graph_sitter/core/symbol.py:242`, `src/graph_sitter/core/symbol.py:269`, `src/graph_sitter/core/symbol.py:408`, `src/graph_sitter/core/interfaces/has_name.py:51`, `src/graph_sitter/core/interfaces/has_name.py:64`, `src/graph_sitter/core/interfaces/has_name.py:79`, `src/graph_sitter/core/interfaces/usable.py:78`, `src/graph_sitter/core/interfaces/editable.py:394`, `src/graph_sitter/core/interfaces/editable.py:428`, `src/graph_sitter/core/interfaces/editable.py:483`, `src/graph_sitter/core/interfaces/editable.py:516`, `src/graph_sitter/core/interfaces/editable.py:571`, `src/graph_sitter/core/interfaces/editable.py:604`, `src/graph_sitter/core/interfaces/editable.py:633`, `src/graph_sitter/core/interfaces/editable.py:683`, `src/graph_sitter/core/interfaces/editable.py:859`, `src/graph_sitter/core/interfaces/editable.py:905`, `src/graph_sitter/core/interfaces/editable.py:936`, `src/graph_sitter/core/interfaces/editable.py:1040`, `src/graph_sitter/core/interfaces/editable.py:1084`, `src/graph_sitter/core/interfaces/editable.py:1090`, `src/graph_sitter/core/interfaces/editable.py:1098`, `src/graph_sitter/core/interfaces/editable.py:1106`, `src/graph_sitter/core/interfaces/editable.py:1115`, `src/graph_sitter/core/interfaces/editable.py:1132`, `src/graph_sitter/core/interfaces/editable.py:1140`, `src/graph_sitter/core/interfaces/editable.py:1148`. + +- `symbol.set_name(...)`, `symbol.rename(...)`, `symbol.edit(...)`, source setter behavior. +- `symbol.comment`, `symbol.inline_comment`, `symbol.set_comment(...)`, `symbol.add_comment(...)`, `symbol.set_inline_comment(...)`. 
+- `symbol.insert_before(...)`, `symbol.insert_after(...)`, `symbol.remove(...)`, `symbol.move_to_file(...)`, `symbol.add_keyword(...)`. +- `Editable.find_string_literals(...)`, `find(...)`, `search(...)`. +- `Editable.replace(...)`, `insert_before(...)`, `insert_after(...)`, `edit(...)`, `remove(...)`. +- `Editable.variable_usages`, `get_variable_usages(...)`. +- `Editable.flag(...)`, `reduce_condition(...)`. +- `Editable.is_wrapped_in(...)`, `parent_of_type(...)`, `parent_of_types(...)`, `is_child_of(...)`, `ancestors`, `parent_statement`, `parent_function`, `parent_class`. + +### Import, Export, and Directory P1 + +Source references: `src/graph_sitter/core/import_resolution.py:437`, `src/graph_sitter/core/import_resolution.py:458`, `src/graph_sitter/core/import_resolution.py:479`, `src/graph_sitter/core/import_resolution.py:503`, `src/graph_sitter/typescript/import_resolution.py:624`, `src/graph_sitter/typescript/export.py:413`, `src/graph_sitter/typescript/export.py:651`, `src/graph_sitter/core/directory.py:244`, `src/graph_sitter/core/directory.py:252`, `src/graph_sitter/core/directory.py:257`. + +- `import.set_import_module(...)`, `import.set_import_symbol_alias(...)`, `import.rename(...)`, `import.remove(...)`. +- TypeScript `import.set_import_module(...)` path-update behavior. +- `export.make_non_default(...)`, `export.reexport_symbol(...)`, and inherited `export.remove(...)`. +- `directory.update_filepath(...)`, `directory.remove(...)`, `directory.rename(...)`. + +## P2 Compatibility Surface + +P2 APIs are public or semi-public, but should not drive the first Rust data model. Preserve them through Python-side delegation, clear errors, or later parity work. 
+ +Source references: `src/graph_sitter/core/codebase.py:235`, `src/graph_sitter/core/codebase.py:241`, `src/graph_sitter/core/codebase.py:822`, `src/graph_sitter/core/codebase.py:833`, `src/graph_sitter/core/codebase.py:865`, `src/graph_sitter/core/codebase.py:931`, `src/graph_sitter/core/codebase.py:938`, `src/graph_sitter/core/codebase.py:974`, `src/graph_sitter/core/codebase.py:1116`, `src/graph_sitter/core/codebase.py:1542`, `src/graph_sitter/core/codebase.py:1546`. + +- Git and GitHub: + - `codebase.github`, `codebase.op`. + - `codebase.git_commit`, `codebase.default_branch`, `codebase.current_commit`, `codebase.checkout(...)`. + - `codebase.get_diffs(...)`, `codebase.get_diff(...)`. + - `codebase.create_pr(...)`, `codebase.create_pr_comment(...)`, `codebase.create_pr_review_comment(...)`. + - PR-diff helpers such as modified-symbol lookup should remain Python-side until Rust graph parity is proven. +- Diagnostics, logs, and visualization: + - `codebase.reset_logs()`. + - Rich repr and diagnostic properties relying on Python graph object counts. + - Visualization internals and `viz`/graph display helpers. +- Low-level/internal object access: + - `ctx`, `_op`, raw `ts_node`, `node_id`, `parent`, `get_nodes()`, `parse/sync/recompute` helpers, and language-specific noapidoc helpers such as `valid_symbol_names`/`valid_import_names`. + - These should not become the Rust public contract; if compatibility requires them, expose minimal Python shim objects or fail explicitly under the Rust backend. + +## APIs That Currently Materialize Full Lists + +These are the main memory-sensitive APIs. They should keep returning Python `list` objects for compatibility, but the Rust backend should generate compact ID lists first and wrap handles lazily. + +### Codebase-wide materializers + +- `codebase.files(...)` currently returns sorted Python file objects and may walk the repo operator for non-source files: `src/graph_sitter/core/codebase.py:286`. 
+- `codebase.directories` returns `list(self.ctx.directories.values())`: `src/graph_sitter/core/codebase.py:338`. +- `codebase.imports` returns `ctx.get_nodes(NodeType.IMPORT)`: `src/graph_sitter/core/codebase.py:351`. +- `codebase.exports` returns `ctx.get_nodes(NodeType.EXPORT)`: `src/graph_sitter/core/codebase.py:366`. +- `codebase.external_modules` returns `ctx.get_nodes(NodeType.EXTERNAL)`: `src/graph_sitter/core/codebase.py:388`. +- `codebase.symbols`, `classes`, `functions`, `global_vars`, `interfaces`, and `types` call `_symbols`, which scans `ctx.get_nodes(NodeType.SYMBOL)` and filters top-level symbols: `src/graph_sitter/core/codebase.py:273`, `src/graph_sitter/core/codebase.py:399`. +- `codebase.get_symbol(...)`, `get_symbols(...)`, `get_class(...)`, and `get_function(...)` scan those full lists: `src/graph_sitter/core/codebase.py:644`, `src/graph_sitter/core/codebase.py:671`, `src/graph_sitter/core/codebase.py:687`, `src/graph_sitter/core/codebase.py:711`. + +### SourceFile materializers + +- `SourceFile` inherits `Importable`, whose constructor appends each parsed node into `self.file._nodes`: `src/graph_sitter/core/interfaces/importable.py:37`. +- `file.get_nodes()` returns the per-file `_nodes` list: `src/graph_sitter/core/file.py:725`. +- `file.imports`, `file.import_statements`, `file.symbols`, `file.global_vars`, `file.classes`, and `file.functions` all filter or transform that per-file list: `src/graph_sitter/core/file.py:633`, `src/graph_sitter/core/file.py:669`, `src/graph_sitter/core/file.py:708`, `src/graph_sitter/core/file.py:773`, `src/graph_sitter/core/file.py:797`, `src/graph_sitter/core/file.py:826`. +- `file.symbols_sorted_topologically` constructs a subgraph of in-file symbol nodes: `src/graph_sitter/core/file.py:752`. +- `file.inbound_imports` combines `self.symbols`, `self.imports`, and `self.symbol_usages`: `src/graph_sitter/core/file.py:613`. 
+- TypeScript `file.exports`, `export_statements`, `default_exports`, `named_exports`, `interfaces`, and `types` materialize filtered lists: `src/graph_sitter/typescript/file.py:47`, `src/graph_sitter/typescript/file.py:61`, `src/graph_sitter/typescript/file.py:79`, `src/graph_sitter/typescript/file.py:91`, `src/graph_sitter/typescript/file.py:121`, `src/graph_sitter/typescript/file.py:148`. + +### Directory recursive materializers + +- `directory.files(...)` recursively collects files into a list: `src/graph_sitter/core/directory.py:116`. +- `directory.subdirectories(...)`, `items`, `item_names`, `file_names`, and `tree` all materialize directory children: `src/graph_sitter/core/directory.py:158`, `src/graph_sitter/core/directory.py:177`, `src/graph_sitter/core/directory.py:188`, `src/graph_sitter/core/directory.py:199`, `src/graph_sitter/core/directory.py:204`. +- `HasSymbols` recursively chains per-file properties for `symbols`, `imports`, `exports`, `classes`, `functions`, and globals: `src/graph_sitter/core/interfaces/has_symbols.py:51`. + +### Relationship materializers + +- `symbol.dependencies` traverses descendant symbols and dependency graph out-edges: `src/graph_sitter/core/interfaces/importable.py:44`. +- `symbol.usages` and `symbol.symbol_usages` traverse graph edges and collect usage objects: `src/graph_sitter/core/interfaces/usable.py:25`, `src/graph_sitter/core/interfaces/usable.py:44`. +- `import.imported_symbol`, `import.resolved_symbol`, `import.imported_exports`, `import.from_file`, and `import.to_file` resolve through graph edges and source-file/import lists: `src/graph_sitter/core/import_resolution.py:252`, `src/graph_sitter/core/import_resolution.py:278`, `src/graph_sitter/core/import_resolution.py:291`, `src/graph_sitter/core/import_resolution.py:356`, `src/graph_sitter/core/import_resolution.py:379`. 
+- `export.declared_symbol`, `export.exported_symbol`, and `export.resolved_symbol` resolve across TS export/import/file graph edges: `src/graph_sitter/typescript/export.py:350`, `src/graph_sitter/typescript/export.py:365`, `src/graph_sitter/typescript/export.py:381`. + +## Recommended First-Slice Compatibility Boundary + +The first Rust backend slice should be read-heavy and graph-oriented: + +- Parse Python and TypeScript/TSX source files into compact records for files, top-level symbols, classes, functions, globals, TypeScript interfaces/types, imports, exports, and ranges. +- Preserve public list-returning APIs by returning Python lists of lazy compatibility handles, but do not eagerly instantiate every Python node object during codebase construction. +- Preserve current public ordering: alphabetical sorting for `codebase.files`, sorted symbol/class/function lists where the Python API sorts today, and existing file-local ordering for imports/exports/symbols. +- Preserve path normalization, `optional=True` behavior, ambiguity errors, Python-vs-TypeScript export behavior, and `ignore_case` lookup behavior. +- Implement import/export resolution, dependency edges, and usage records in Rust before claiming parity for `import.resolved_symbol`, `import.imported_symbol`, `export.resolved_symbol`, `symbol.dependencies`, or `symbol.usages`. +- Keep edit APIs, transaction application, formatting, comments, AST parent navigation, AI, Git/GitHub, and visualization on the Python backend/fallback path for the first slice. +- Make unsupported P1/P2 APIs under the Rust backend explicit: either delegate to Python compatibility objects or raise a clear `NotImplementedError`. P0 APIs should not silently fall back to incomplete or behavior-changing approximations. +- Avoid exposing persistent Rust-owned tree-sitter node wrappers as the long-term contract. Use stable IDs plus byte ranges/source text and construct Python handles only on demand. 
+ +## Initial Rust Data Required For P0 + +- `FileRecord`: stable file ID, interned path/name/extension, language, content hash, source/binary flag, directory ID, root range. +- `DirectoryRecord`: stable directory ID, interned path/name, parent ID, sorted child file/directory ID indexes. +- `SymbolRecord`: stable symbol ID, file ID, kind, name/full-name IDs, top-level/nested flag, parent symbol ID, range, extended range, export metadata. +- `ImportRecord`: stable import ID, file ID, module/name/alias IDs, kind flags, statement range, target file/symbol/export IDs where resolved. +- `ExportRecord`: stable export ID, file ID, exported name, kind flags, declared/exported/resolved target IDs, range. +- `UsageRecord`: stable usage ID, source file/node ID, target symbol/import/export ID, usage kind, range. +- `GraphEdge`: compact dependency and resolution edges by ID, not Python object payloads. diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md new file mode 100644 index 000000000..1e1c32396 --- /dev/null +++ b/rust-rewrite/benchmarks.md @@ -0,0 +1,103 @@ +# Phase 0 Benchmarking And Profiling + +This document captures the first practical baseline plan for the Python backend before replacing the eager Python object graph with a Rust engine. + +## Goals + +- Measure cold `Codebase(...)` construction wall time and RSS for the current Python backend. +- Split the build into coarse phases that match today's implementation. +- Record graph size and Python object counts so memory regressions can be compared against graph scale. +- Keep the smoke benchmark runnable without a large external repository. + +## Current Build Phase Map + +The eager path is: + +1. `Codebase.__init__` validates inputs, builds `ProjectConfig`, and constructs `CodebaseContext`. +2. `CodebaseContext.__init__` creates `rustworkx.PyDiGraph`, indexes, parser, config parser, dependency manager, and language engine. +3. 
`CodebaseContext.build_graph` enumerates files with `RepoOperator.iter_files`. +4. `_process_diff_files` adds files: + - dependency manager / language engine startup if configured + - file existence checks for incremental runs + - new file parsing through `SourceFile.from_content` + - tree-sitter parse through `parse_file` + - eager Python object materialization through `SourceFile.parse` +5. `_process_diff_files` builds the directory tree with `build_directory_tree`. +6. TypeScript only: `config_parser.parse_configs` assigns nearest `tsconfig.json` data. +7. Unless `CodebaseConfig(disable_graph=True)` is set, graph resolution runs: + - import resolution through `Import.add_symbol_resolution_edge` + - TypeScript export dependency resolution through `TSExport.compute_export_dependencies` + - superclass/interface dependency resolution through `compute_superclass_dependencies` + - fixed-point dependency recompute through `_compute_dependencies` and `Importable.recompute` + +The known memory-heavy points are `SourceFile._nodes`, every `Editable` retaining `ts_node`, `ctx`, `parent`, and IDs, and `CodebaseContext._graph` storing Python payload objects plus `Edge` objects. + +## Harness + +`rust-rewrite/tools/measure_python_backend.py` is a standalone measurement harness. It runtime-wraps stable Python backend choke points and writes a JSON report. 
+ +Smoke test with a generated tiny Python git repo: + +```bash +uv run python rust-rewrite/tools/measure_python_backend.py --language python --json +``` + +Measure a real repo: + +```bash +uv run python rust-rewrite/tools/measure_python_backend.py /path/to/repo --language python --output /tmp/python-backend-baseline.json +``` + +Run multiple cold samples as separate processes: + +```bash +for i in 1 2 3 4 5; do + uv run python rust-rewrite/tools/measure_python_backend.py /path/to/repo --language python \ + --output "/tmp/python-backend-baseline-$i.json" +done +``` + +Isolate parse/object materialization from graph resolution: + +```bash +uv run python rust-rewrite/tools/measure_python_backend.py /path/to/repo --language python \ + --disable-graph --output /tmp/python-backend-parse-only.json +``` + +## Metrics + +The JSON report includes: + +- total constructor wall time +- process RSS before and after construction +- sampled process RSS peak for the full run +- `ru_maxrss` for process max RSS +- inclusive wall time and sampled RSS peak for each wrapped phase +- phase call counts and phase-specific counters, such as parsed bytes +- graph node and edge counts +- graph node counts by `NodeType` +- sum of per-file `_nodes` lengths +- optional `gc` object counts for `graph_sitter.*` classes + +Phase timings are inclusive and do not sum to total time because some wrappers are nested. RSS phase attribution uses a background sampler and should be treated as trend data, not allocator-accurate attribution. + +## Recommended Baseline Matrix + +Use pinned commits and record hardware, Python version, OS, and command line from the JSON metadata. 
+ +| Tier | Repo | Purpose | Minimum samples | +| --- | --- | --- | --- | +| Smoke | generated fixture | CI/local sanity check | 1 | +| Small | this repo or a compact fixture repo | stable regression signal | 5 | +| Medium | representative Python service or TS package | phase distribution | 5 | +| Huge | known memory-stressing monorepo | Rust rewrite target | 3 | + +For each real repo, capture both default graph mode and `--disable-graph` parse-only mode. The delta approximates resolution/dependency graph cost. + +## Open Questions + +- Which exact small, medium, and huge repositories should become canonical Phase 0 baselines? +- Should TypeScript baselines run with dependency manager and language engine flags off, on, or both? +- Do we need allocator-level attribution with `memray`, `tracemalloc`, or `py-spy` in addition to RSS sampling? +- What commit, dependency lockfile, and Python minor version should define the official baseline? +- Which memory target should be set for the first Rust vertical slice: total RSS, graph-only delta, or parse-only delta? diff --git a/rust-rewrite/data-model.md b/rust-rewrite/data-model.md new file mode 100644 index 000000000..d5928089e --- /dev/null +++ b/rust-rewrite/data-model.md @@ -0,0 +1,567 @@ +# Rust Data Model Proposal + +## Scope + +This document proposes the compact Rust-side storage model for replacing the current Python object graph while preserving the Python API as a lazy compatibility layer. It is based on inspection of: + +- `CodebaseContext`: `PyDiGraph[Importable, Edge]`, `filepath_idx`, external module index, graph build/reparse flow. +- `SourceFile`: eager file node plus per-file `_nodes`, `_range_index`, import/symbol/export query helpers. +- `Editable` and `Importable`: persistent `ts_node`, `ctx`, `parent`, `file_node_id`, `node_id`, `range`, and edit helpers. +- `Usage`, `RangeIndex`, `ResolutionStack`, `Import`, `Export`, and edge construction paths. 
+ +The important constraint is to avoid translating every Python semantic node into a PyO3-owned object. Rust should own compact records and return IDs; Python objects should be handles created only when user code asks for them. + +## Current Shape To Preserve + +The current graph endpoints are only semantic `Importable` objects: + +- files +- symbols +- imports +- exports +- external modules + +General expressions/statements are usually not graph endpoints, but many are still materialized as Python `Editable` objects for parent traversal, source/range access, edits, and dependency extraction. `RangeIndex` can additionally keep all parsed editables when `full_range_index` is enabled. + +Current graph edge kinds are: + +- `IMPORT_SYMBOL_RESOLUTION`: import to resolved symbol/import/export/file/external module. +- `EXPORT`: export to declared/exported symbol, import, file, or other export target. +- `SUBCLASS`: class/interface symbol to resolved superclass/interface. +- `SYMBOL_USAGE`: symbol/import/export/file owner to used symbol/import/export/file/external module, with `Usage` metadata. + +The Rust model should preserve those graph semantics, not the Python object ownership model. + +## Core Storage + +```rust +pub struct EngineStore { + pub schema_version: u16, + pub engine_epoch: u32, + + pub strings: StringInterner, + pub paths: PathInterner, + pub modules: StringInterner, + pub ts_kinds: StringInterner, + + pub files: Arena<FileRecord>, + pub syntax: Arena<SyntaxRecord>, + pub symbols: Arena<SymbolRecord>, + pub imports: Arena<ImportRecord>, + pub exports: Arena<ExportRecord>, + pub externals: Arena<ExternalRecord>, + pub scopes: Arena<ScopeRecord>, + pub usages: Arena<UsageRecord>, + + pub nodes: NodeTable, + pub graph: GraphStore, + pub indexes: IndexStore, +} +``` + +`EngineStore` is the sole owner of canonical codebase state. Records store IDs and interned keys, never Python object references, Rust references into other arenas, or persistent `tree_sitter::Node` wrappers.
+ +`Arena<T>` should be a dense `Vec<Option<T>>` with tombstones and a per-slot generation, or an equivalent slotmap. Dense vectors keep scans and dumps cache-friendly; generations let lazy Python handles fail clearly after invalidation instead of reading a reused slot. + +## IDs + +Use typed IDs internally: + +```rust +pub struct FileId(u32); +pub struct SyntaxId(u32); +pub struct SymbolId(u32); +pub struct ImportId(u32); +pub struct ExportId(u32); +pub struct ExternalId(u32); +pub struct ScopeId(u32); +pub struct UsageId(u32); +pub struct EdgeId(u32); +pub struct StringId(u32); +pub struct PathId(u32); +pub struct TsKindId(u16); +pub struct LineIndexId(u32); + +pub enum NodeRef { + File(FileId), + Symbol(SymbolId), + Import(ImportId), + Export(ExportId), + External(ExternalId), +} + +pub struct HandleKey { + pub node: NodeRef, + pub generation: u32, +} +``` + +The Python-facing `node_id` compatibility value should be an encoded `u64` or a `NodeId(u32)` lookup in `NodeTable`. The preferred shape is: + +```rust +pub struct NodeId(u32); + +pub struct NodeSlot { + pub node: NodeRef, + pub generation: u32, + pub alive: bool, +} +``` + +Compatibility handles store both `NodeId` and generation. `NodeId` values are not reused during a live engine epoch. On full rebuild, `engine_epoch` changes; on file reparse/delete, affected node generations change. A handle is valid only if `(engine_epoch, node_id, generation)` still matches. + +Future incremental stable IDs can be layered on top with `StableKey` fingerprints: + +```rust +pub enum StableKey { + File { normalized_path: PathId }, + Symbol { file: FileId, full_name: StringId, kind: SymbolKind, declaration_range_hash: u64 }, + Import { file: FileId, statement_range_hash: u64, local_index: u32 }, + Export { file: FileId, exported_name: Option<StringId>, statement_range_hash: u64, local_index: u32 }, +} +``` + +Do not make stable keys the primary storage key in the first slice.
Keep arena IDs compact, and use stable keys only to remap handles across reparses later. + +## Interning + +Intern these values: + +- normalized relative paths +- absolute paths only when needed for IO/debug +- module specifiers and import sources +- symbol names and full names +- aliases, exported names, namespaces +- tree-sitter kind strings +- language-specific small strings that appear many times + +Content is not string-interned. Each parsed file owns an `Arc<[u8]>` or equivalent immutable byte buffer for the current revision. Source slices are `(FileId, ByteRange)` views into that buffer. + +Path normalization invariants: + +- `FileRecord.path` is the repo-relative path used by public APIs. +- A separate absolute path cache can exist for IO, but graph identity uses the relative path. +- Case-insensitive lookups are an auxiliary index and must not change canonical path IDs. + +## Ranges + +All canonical ranges are byte ranges in UTF-8 file content: + +```rust +pub struct ByteRange { + pub start: u32, + pub end: u32, +} + +pub struct Point { + pub row: u32, + pub column: u32, +} + +pub struct SourceRange { + pub bytes: ByteRange, + pub start_point: Point, + pub end_point: Point, +} +``` + +`Point.column` must match tree-sitter semantics for the grammar bindings, which are byte columns, not Unicode scalar columns. Keep a per-file line index so byte to point and point to byte conversions are cheap and deterministic. + +Range invariants: + +- `start <= end <= file.content.len()`. +- A record's `file_id` owns every range it stores. +- Ranges are half-open byte ranges. +- Public line ranges keep current behavior: `start_point.row..=end_point.row`. +- Edit transactions operate on byte ranges, matching today's `Editable.edit`, `insert_at`, and `remove_byte_range`. + +## Syntax Anchors + +Rust should not store one Python `Editable` per syntax node. 
Store compact syntax anchors instead: + +```rust +pub struct SyntaxRecord { + pub file: FileId, + pub parent: Option<SyntaxId>, + pub kind: TsKindId, + pub range: SourceRange, + pub flags: SyntaxFlags, + pub first_child: Option<SyntaxId>, + pub next_sibling: Option<SyntaxId>, +} + +bitflags! { + pub struct SyntaxFlags: u16 { + const NAMED = 1 << 0; + const ERROR = 1 << 1; + const MISSING = 1 << 2; + const CANONICAL = 1 << 3; + const SEMANTIC_ANCHOR = 1 << 4; + } +} +``` + +Default mode should store only anchors required by semantic records and usage matches: + +- file root +- symbol declaration/name/body/extended ranges +- import statement/specifier/module/name/alias ranges +- export statement/name/value ranges +- usage match ranges +- edit anchors needed by P0 methods + +When `full_range_index` or LSP mode is enabled, store all named syntax nodes and the parent/child links required for `ast()`, cursor lookup, and range lookup. This preserves compatibility without paying that cost for every normal codebase load. + +## File Records + +```rust +pub struct FileRecord { + pub path: PathId, + pub language: LanguageKind, + pub content_hash: u64, + pub content_len: u32, + pub content: Arc<[u8]>, + pub line_index: LineIndexId, + pub root: SyntaxId, + pub root_range: SourceRange, + pub parse_status: ParseStatus, + pub file_epoch: u32, + + pub symbols: IdSpan, + pub imports: IdSpan, + pub exports: IdSpan, + pub syntax_nodes: IdSpan, +} +``` + +Per-file ID spans point into sorted side arrays in `IndexStore`, not embedded `Vec`s in every file. This keeps `FileRecord` small and allows bulk rebuild of file indexes after parse. + +File invariants: + +- `path` is unique among live files. +- Per-file symbols/imports/exports are sorted by `(start_byte, end_byte, local_order)`. +- Deleting a file tombstones all semantic records owned by the file and removes graph edges touching those records. +- Reparsing a file increments `file_epoch`; lazy handles with old epoch become stale. 
+ +## Symbol Records + +```rust +pub struct SymbolRecord { + pub node_id: NodeId, + pub file: FileId, + pub kind: SymbolKind, + pub language_kind: LanguageSymbolKind, + pub name: StringId, + pub full_name: StringId, + pub parent_symbol: Option<SymbolId>, + pub parent_scope: ScopeId, + pub declaration: SyntaxId, + pub name_syntax: Option<SyntaxId>, + pub body: Option<SyntaxId>, + pub extended_range: SourceRange, + pub declaration_range: SourceRange, + pub name_range: Option<SourceRange>, + pub flags: SymbolFlags, + pub local_order: u32, +} +``` + +Symbol invariants: + +- `node_id` maps back to `NodeRef::Symbol(self_id)`. +- `parent_symbol` is in the same file and must not form a cycle. +- `full_name` is the language-specific qualified name used by current public APIs. +- `is_top_level` is a flag derived during extraction, not recomputed by climbing Python parents. +- `descendant_symbols` is answered by a symbol tree index. + +## Import Records + +```rust +pub struct ImportRecord { + pub node_id: NodeId, + pub file: FileId, + pub import_type: ImportType, + pub statement: SyntaxId, + pub specifier: SyntaxId, + pub module: Option<StringId>, + pub symbol_name: Option<StringId>, + pub alias: Option<StringId>, + pub namespace: Option<StringId>, + pub is_type_only: bool, + pub is_dynamic: bool, + pub unique_range: SourceRange, + pub statement_range: SourceRange, + pub specifier_range: SourceRange, + pub module_range: Option<SourceRange>, + pub symbol_range: Option<SourceRange>, + pub alias_range: Option<SourceRange>, + pub resolved: Option<NodeRef>, + pub local_order: u32, +} +``` + +Import invariants: + +- `node_id` maps back to `NodeRef::Import(self_id)`. +- `unique_range` preserves current equality/hash behavior for multi-import statements. +- `resolved` is mirrored by one `IMPORT_SYMBOL_RESOLUTION` edge when resolution succeeds or an external module record is created. +- External module records are keyed by `(import.source, unique_import_name)`, matching the current `module::import_name` index. 
+- Wildcard imports expose `names` through a wildcard expansion index, not by materializing `WildcardImport` Python objects up front. + +## Export Records + +```rust +pub struct ExportRecord { + pub node_id: NodeId, + pub file: FileId, + pub export_kind: ExportKind, + pub name: Option<StringId>, + pub exported_name: Option<StringId>, + pub declared_symbol: Option<SymbolId>, + pub statement: SyntaxId, + pub name_syntax: Option<SyntaxId>, + pub value_syntax: Option<SyntaxId>, + pub statement_range: SourceRange, + pub name_range: Option<SourceRange>, + pub target: Option<NodeRef>, + pub flags: ExportFlags, + pub local_order: u32, +} +``` + +Export invariants: + +- `node_id` maps back to `NodeRef::Export(self_id)`. +- `target` is mirrored by an `EXPORT` edge when known. +- Wildcard exports target the source file node when current behavior does. +- `resolved_symbol` follows export/import edges with a visited set to preserve circular-chain behavior. + +## External Records + +```rust +pub struct ExternalRecord { + pub node_id: NodeId, + pub module: StringId, + pub import_name: StringId, + pub display_name: StringId, + pub first_import: ImportId, +} +``` + +External modules do not own file ranges. Any source/range shown for compatibility should come from `first_import` or the usage/import that reached the external. + +## Usage Records + +```rust +pub struct UsageRecord { + pub source: NodeRef, + pub target: NodeRef, + pub usage_symbol: NodeRef, + pub match_syntax: SyntaxId, + pub imported_by: Option<ImportId>, + pub usage_type: UsageType, + pub usage_kind: UsageKind, + pub match_range: SourceRange, +} +``` + +`source` is the graph edge source, matching the current `dest.node_id` emitted by `ResolutionStack.get_edges`. `usage_symbol` mirrors the current `Usage.usage_symbol` payload, which is usually `dest.parent_symbol` and may differ from `source` for nested symbols. `target` is the used node. `match_syntax` is the `Name`, `ChainedAttribute`, or `FunctionCall` anchor used for renames and source display. 
+ +Usage invariants: + +- Every `SYMBOL_USAGE` edge has exactly one `UsageId`. +- `UsageRecord.source == edge.source`. +- `UsageRecord.target == edge.target`. +- `UsageRecord.usage_symbol` is a live graph node. +- `match_syntax.file == source.file` when the source has a file. +- `usage_type` preserves `DIRECT`, `CHAINED`, `INDIRECT`, and `ALIASED` resolution stack semantics. +- `usage_kind` preserves body/type/decorator/import/export/subclass context. + +## Graph Storage + +```rust +pub struct EdgeRecord { + pub source: NodeRef, + pub target: NodeRef, + pub kind: EdgeKind, + pub usage: Option<UsageId>, +} + +pub struct GraphStore { + pub edges: Vec<EdgeRecord>, + pub out_offsets: Vec<u32>, + pub out_edges: Vec<EdgeId>, + pub in_offsets: Vec<u32>, + pub in_edges: Vec<EdgeId>, +} +``` + +During parsing/resolution, use mutable per-node edge vectors plus a dedupe set. After a phase completes, freeze into CSR-style adjacency arrays. Incremental reparses can rebuild adjacency for affected nodes first; whole-graph CSR rebuild is acceptable for the first vertical slice if it is simpler. + +Edge invariants: + +- Edge endpoints are live `NodeRef`s. +- Multi-edges are allowed only when their full edge key differs. +- Full edge key is `(source, target, kind, usage_key)`. +- `IMPORT_SYMBOL_RESOLUTION` source is always `Import`. +- `EXPORT` source is always `Export`. +- `SUBCLASS` source is always `Symbol`. +- `SYMBOL_USAGE` has `usage.is_some()`. +- Non-`SYMBOL_USAGE` edges have `usage.is_none()`. 
+ +## Indexes + +```rust +pub struct IndexStore { + pub path_to_file: HashMap<PathId, FileId>, + pub casefold_path_to_file: HashMap<StringId, FileId>, + pub external_by_key: HashMap<(StringId, StringId), ExternalId>, + + pub file_symbols: Vec<SymbolId>, + pub file_imports: Vec<ImportId>, + pub file_exports: Vec<ExportId>, + pub file_syntax: Vec<SyntaxId>, + + pub symbol_children: Vec<SymbolId>, + pub scope_bindings: ScopeBindingIndex, + pub import_names_by_file: NameBindingIndex, + pub exported_names_by_file: NameBindingIndex, + + pub range_index_by_file: HashMap<FileId, RangeIndex>, +} +``` + +`RangeIndex` should be compact and optional: + +```rust +pub struct RangeIndex { + pub by_start: Vec<SyntaxId>, + pub exact: HashMap<(ByteRange, TsKindId), SyntaxId>, + pub all_for_range: HashMap<ByteRange, Vec<SyntaxId>>, +} +``` + +Query patterns: + +- `Codebase.files`: scan live `FileRecord`s sorted by path, return lazy file handles. +- `Codebase.symbols/classes/functions`: scan `symbols`, filter flags/kind/top-level, return handles sorted by file and range. +- `Codebase.imports/exports`: scan arenas or per-file spans. +- `SourceFile.imports/symbols/exports`: use file spans in `IndexStore`; no graph scan required. +- `Import.imported_symbol`: follow the one import resolution edge, then optionally follow export edges. +- `Export.exported_symbol`: follow the one export edge. +- `Symbol.usages`: inspect incoming edges for the target node, filter `SYMBOL_USAGE`, load usage records, sort by match start byte descending. +- `Importable.dependencies`: inspect outgoing `SYMBOL_USAGE` edges from descendant symbol IDs, filter usage type, dedupe, and sort by file/range. +- `find_by_byte_range`: use `RangeIndex.exact` or `all_for_range`. +- Cursor lookup: binary search `RangeIndex.by_start`, then choose the smallest containing range. 
+ +## Lazy Python Compatibility + +Python classes remain compatibility handles: + +```text +PySourceFile -> EngineHandle +PySymbol -> EngineHandle +PyImport -> EngineHandle +PyExport -> EngineHandle +PyExternal -> EngineHandle +PyEditable -> EngineSyntaxHandle +PyUsage -> EngineHandle +``` + +Each handle stores: + +- `Arc` or equivalent engine owner +- typed ID or `NodeId` +- slot generation +- file epoch if the handle depends on file ranges/content + +Handle methods delegate to Rust for source, ranges, relationships, and graph queries. Python lists are built from returned IDs, not from prebuilt objects. + +Compatibility notes: + +- A weak handle cache can preserve object identity for repeated access without materializing the full graph. The cache is optional and must not be part of canonical state. +- `source`, `start_byte`, `end_byte`, `range`, `span`, and `github_url` are computed from records and file content. +- `file`, `parent_symbol`, `parent`, and `descendant_symbols` are ID lookups. +- Unsupported deep AST methods can reparse one file and build transient Python editables for that call. Those transient objects must not be inserted into canonical graph storage. +- Existing writer methods can initially emit byte-range edit intents using stored ranges, then let the Python transaction manager apply them. +- Stale handles should raise a clear invalidation error or fall back to resolving by `StableKey` once stable remap exists. + +## Debug Dumps + +Add Rust debug APIs early: + +```text +debug_dump_ir(format="jsonl", include_strings=true, include_snippets=false) +debug_dump_graph(format="jsonl") +debug_dump_ranges(file_id) +debug_check_invariants() +``` + +Dump format requirements: + +- Include `schema_version`, engine version, repo root hash/path, and language. +- Sort files by path, records by `(file, start_byte, local_order)`, and edges by `(source, kind, target, usage)`. +- Resolve interned strings in human-readable dumps. 
+- Include raw content hashes and byte ranges by default, not full file content. +- Include optional snippets only when requested. +- Emit enough usage data to compare with the Python backend: edge kind, source node, target node, usage symbol, usage type, usage kind, match range, imported_by. + +Invariant checker should validate: + +- live IDs and node table round trips +- path uniqueness +- range bounds +- edge endpoint kinds +- usage/edge consistency +- per-file sorted spans +- scope parent cycles +- duplicate edge keys +- external module key uniqueness + +## Memory Rationale + +The current model pays for: + +- Python object headers and dicts for semantic nodes and many expressions. +- Persistent tree-sitter node wrappers on `Editable`. +- Backrefs from every object to context, parent, and file. +- The same Python objects stored as rustworkx graph payloads. +- Per-file `_nodes` and optional range indexes containing Python object references. +- `Usage` objects that hold Python object references to match nodes, owner symbols, and imports. + +The proposed model replaces that with: + +- `u32` IDs and small enums instead of object pointers. +- interned strings instead of repeated Python strings. +- contiguous arenas for cache-friendly scans. +- edge payloads as `EdgeRecord` plus optional `UsageId`. +- syntax anchors as byte ranges rather than Python wrappers. +- optional full syntax/range tables only for debug/LSP modes. + +Expected record sizes should be in the tens of bytes for edges/usages and under roughly 100 bytes for most symbols/imports, before interned strings and content. The exact target should be validated by the benchmark agent, but the design removes the multiplicative Python object and graph payload overhead. + +## Migration Risks + +- Python identity and hashing: current equality relies on filepath, range, kind ID, and import unique ranges. Handles must reproduce that behavior even though canonical state is ID based. 
+- Sorting parity: public APIs rely on file/range/node ID order. Rust queries need explicit stable sort keys. +- Tree-sitter node access: any API exposing or depending on `ts_node` needs either a Rust-backed compatibility surface or a transient per-file reparse fallback. +- Range columns: tree-sitter points use byte columns. Accidentally switching to Unicode columns will break LSP and edit behavior for non-ASCII files. +- Wildcard imports and exports: current code lazily expands and invalidates wildcard-derived names. Rust needs explicit invalidation for files importing from wildcard providers. +- Conditional scope resolution: current `Name.resolve_name` has special conditional-block behavior. Scope tables need tests before Rust becomes authoritative. +- External modules: current identity is tied to import source plus unique node source. The Rust key must match enough behavior to avoid duplicate external nodes. +- Edits and stale handles: any committed edit invalidates ranges for at least one file. Handles must check file epoch before applying edits. +- Full range index memory: enabling all syntax anchors can be expensive. It must remain opt-in and visible in debug stats. +- Fallback materialization: unsupported APIs may temporarily materialize Python objects. This must be per-call/per-file and never recreate the full Python object graph behind PyO3. + +## First Slice Recommendation + +Implement the Rust data model in this order: + +1. Interners, typed IDs, arenas, node table, and file records. +2. Symbol/import/export/external records for Python and TypeScript top-level extraction. +3. Graph edge table with import/export/subclass/symbol usage edge kinds and debug dumps. +4. Per-file query indexes for files, symbols, imports, and exports. +5. Lazy Python handles returning source/ranges and ID-backed relationships. +6. Optional full range index for debug/LSP parity. 
+ +This gives the resolver and PyO3 agents a stable contract while keeping the first engine slice focused on compact canonical state rather than Python object emulation. diff --git a/rust-rewrite/engine-skeleton.md b/rust-rewrite/engine-skeleton.md new file mode 100644 index 000000000..edfe5f255 --- /dev/null +++ b/rust-rewrite/engine-skeleton.md @@ -0,0 +1,24 @@ +# Rust Engine Skeleton Notes + +## Layout + +- `Cargo.toml` defines a standalone Cargo workspace. It is not referenced by `pyproject.toml`, `hatch.toml`, or the current Python package build. +- `crates/graph-sitter-engine` is the dependency-free core crate. It exposes a minimal `Engine` plus `debug_info()` metadata API. +- `crates/graph-sitter-py` is a PyO3 placeholder crate. Its default build is a Rust-testable stub that forwards the same metadata API without linking Python. Enabling `pyo3-bindings` exposes a future Python extension module named `graph_sitter_py`. + +## Build Commands + +```sh +cargo fmt --all +cargo test --workspace +``` + +The PyO3 crate intentionally does not enable PyO3 by default so normal `cargo test --workspace` does not depend on a local Python development library. Build tooling can enable the crate feature later when producing a Python extension: + +```sh +cargo build -p graph-sitter-py --features extension-module +``` + +## Integration Choice + +This skeleton does not alter the Hatch/Cython Python packaging path. The current `hatch.toml` custom hook is disabled by default, so wiring Rust into wheels should be a separate packaging/CI task after the backend facade and import smoke test are defined. diff --git a/rust-rewrite/parser-index.md b/rust-rewrite/parser-index.md new file mode 100644 index 000000000..15a5eaa7d --- /dev/null +++ b/rust-rewrite/parser-index.md @@ -0,0 +1,340 @@ +# Rust Parser And Compact Index Plan + +## Purpose + +Phase 2 should replace the current eager Python AST/object construction with a Rust parser/indexer that emits a compact, snapshot-friendly IR. 
The IR should preserve enough structure for `files`, `symbols`, `classes`, `functions`, `imports`, and TypeScript `exports` queries, while leaving dependency resolution, expression modeling, edits, and Python object compatibility to later phases. + +Current behavior to match where relevant: + +- Parser setup maps `.py` to tree-sitter Python and `.js`, `.jsx`, `.ts`, `.tsx` to the TSX grammar (`src/graph_sitter/tree_sitter_parser.py`). +- Language semantic maps live in `PyNodeClasses` and `TSNodeClasses` (`src/graph_sitter/codebase/node_classes/*_node_classes.py`). +- Statement classification is currently custom code in `Parser.parse_py_statements` and `Parser.parse_ts_statements` (`src/graph_sitter/core/parser.py`). +- `SourceFile.parse` eagerly creates a `CodeBlock`, recursively parses statements, populates `file._nodes`, and stores Python payload objects in `rustworkx.PyDiGraph` (`src/graph_sitter/core/file.py`). +- Import/export resolution edges are separate graph phases after parse (`src/graph_sitter/codebase/codebase_context.py`), so the Rust parser slice should only extract unresolved import/export facts. + +## Compact IR + +Use append-only arenas plus interners. IDs are opaque integers scoped to the engine. + +### Shared Primitives + +- `FileId`, `SymbolId`, `ImportId`, `ExportId`, `ScopeId`, `StatementId`, `StringId`, `PathId`. +- `Range`: `start_byte`, `end_byte`, `start_point { row, column }`, `end_point { row, column }`. +- `NodeRef`: `file_id`, tree-sitter kind enum/string ID, `range`. Store this only for retained declarations/statements, not every tree-sitter node. +- `Language`: `Python`, `TypeScript`, `TSX`, `JavaScript`, `JSX`. Parser grammar may still be TSX for all JS/TS files to match the Python backend, while the file language should keep the extension-derived source kind. 
+ +### Records + +`FileRecord` + +- `file_id` +- `path_id`, `name_id`, extension-derived `language` +- `content_hash` +- `root_range` +- `parse_status`: `ok`, `tree_sitter_error`, `skipped_binary`, `skipped_minified` +- ordered lists: `top_level_symbols`, `imports`, `exports`, `scopes` + +`SymbolRecord` + +- `symbol_id` +- `file_id` +- `name_id` +- `full_name_id` +- `kind`: `Function`, `Class`, `GlobalVar`, `Interface`, `TypeAlias`, `Enum`, `Namespace` +- `parent_symbol_id: Option`; the first parser/index slice should normally be `None` because only top-level symbols are emitted +- `scope_id` +- `range`: extended source range when the current Python API would include decorators/export keywords +- `declaration_range`: actual declaration node range +- `name_range` +- `body_range: Option` +- flags: `decorated`, `async`, `default_exported`, `named_exported`, `type_only` + +`ImportRecord` + +- `import_id` +- `file_id` +- `scope_id` +- `statement_id` +- `kind`: reuse current `ImportType` shape: `DefaultExport`, `NamedExport`, `Wildcard`, `Module`, `SideEffect`, `Unknown` +- `module_id: Option`: raw module/source text, including Python leading dots or TS quotes stripped in a separate normalized field +- `imported_name_id: Option` +- `local_name_id: Option` +- `namespace_id: Option` +- `is_type_only` +- `is_future_import` +- `is_dynamic` +- `from_export` +- ranges: `statement_range`, `import_range`, `module_range`, `name_range`, `alias_range` + +`ExportRecord` (TypeScript/JS only in the first parser/index slice) + +- `export_id` +- `file_id` +- `scope_id` +- `statement_id` +- `kind`: `Named`, `Default`, `Wildcard`, `Namespace`, `ExportEquals`, `Unknown` +- `exported_name_id: Option` +- `local_name_id: Option` +- `source_module_id: Option` for re-exports +- `declared_symbol_id: Option` when the export declares a top-level symbol in the same statement +- `import_id: Option` when the export is a direct re-export modeled through an import fact +- `is_type_only` +- ranges: 
`statement_range`, `export_range`, `name_range`, `source_range` + +`ScopeRecord` + +- `scope_id` +- `file_id` +- `parent_scope_id: Option` +- `owner`: `File(FileId)` or `Symbol(SymbolId)` +- `kind`: `File`, `ClassBody`, `FunctionBody`, `ModuleBlock` +- `range` +- ordered child IDs for top-level symbols/imports/exports owned by this scope + +First-slice scopes are lookup boundaries and ownership containers, not full lexical environments. + +## Python Extraction Rules + +### Files + +- Parse only `.py`. +- Emit one `FileRecord` per readable, non-skipped file. +- The file scope owns top-level declarations and top-level imports. Nested import statements can be emitted with the nearest top-level symbol scope when found cheaply by range containment. + +### Top-Level Symbols + +Walk direct named children of the root `module`. + +- `decorated_definition` + - Read child field `definition`. + - If definition is `function_definition`, emit `Function`. + - If definition is `class_definition`, emit `Class`. + - `range` is the `decorated_definition`; `declaration_range` is the nested definition. + - `name_range` is the nested definition's `name` field. + - Set `decorated = true`. +- `function_definition` + - Emit `Function`. + - `name_range` is field `name`. + - `body_range` is field `body`. +- `class_definition` + - Emit `Class`. + - `name_range` is field `name`. + - `body_range` is field `body`. +- `expression_statement` containing top-level `assignment` or `augmented_assignment` + - Emit `GlobalVar` records for simple identifier names on the left side. + - For `pattern_list`, emit one `GlobalVar` per identifier in source order. + - For attribute/subscript left sides, store no phase-1 symbol; those are not importable globals in the same way and require expression modeling. + - Preserve the assignment statement range and the specific name range. + +Do not emit nested functions/classes/methods as `SymbolRecord` in the first vertical slice. 
Current Python can materialize them through recursive `CodeBlock` parsing, but the phase-1 query target is top-level symbols. + +### Imports + +Emit one `ImportRecord` per imported binding. Store raw syntax facts only; do not resolve to files or symbols. + +- `import_statement` + - For each `dotted_name`, emit `Module` with `module = name = alias = dotted_name`. + - For each `aliased_import`, emit `Module` with `module/name` from field `name` and `local_name` from field `alias`. +- `import_from_statement` + - `module` is field `module_name`; keep leading dots as raw text and also store `relative_level` if practical. + - For each `dotted_name`, emit `NamedExport` with `imported_name = local_name = dotted_name`. + - For each `aliased_import`, emit `NamedExport` with `imported_name` from field `name` and `local_name` from field `alias`. + - For `wildcard_import`, emit `Wildcard`; keep the current Python-backend-compatible local name empty or `*` in a dedicated wildcard field, not as a normal binding. +- `future_import_statement` + - Emit imports with `kind = SideEffect` and `is_future_import = true`, matching current backend behavior. + +### Python Exports + +Do not emit `ExportRecord` for Python in the first parser/index slice. Python importability is represented by top-level symbols, module imports, wildcard chains, and `__init__.py` rules in the resolver phase. + +## TypeScript, TSX, JavaScript Extraction Rules + +### Files + +- Include `.ts`, `.tsx`, `.js`, `.jsx`. +- For parity with the existing backend, parse all four extensions with the TSX grammar initially. Keep `FileRecord.language` extension-specific so a later parser split does not change public file identity. + +### Top-Level Symbols + +Walk direct named children of `program`, plus declarations wrapped by top-level `export_statement`. 
+ +Emit direct top-level declarations: + +- `function_declaration`, `generator_function_declaration` -> `Function` +- `class_declaration`, `abstract_class_declaration` -> `Class` +- `interface_declaration` -> `Interface` +- `type_alias_declaration` -> `TypeAlias` +- `enum_declaration` -> `Enum` +- `internal_module` -> `Namespace` +- `lexical_declaration` or `variable_declaration` + - If a `variable_declarator` value contains a top-level `arrow_function`, `function_expression`, or `generator_function` at depth <= 2, emit a `Function` named from the declarator's `name` field. + - Otherwise emit `GlobalVar` records for simple identifier declarator names. + - For object/array patterns, emit one `GlobalVar` per simple bound identifier in source order. Defer type-aware destructuring semantics. + +For `export_statement` with field `declaration`, emit the same symbol kinds from the declaration and attach `named_exported` or `default_exported` flags through the paired `ExportRecord`. + +Do not emit class methods, private fields, JSX elements, object-literal properties, call expressions, promise chains, or nested declarations in the first parser/index slice. + +### Static Imports + +For `import_statement`, emit one `ImportRecord` per current backend import object: + +- No `import_clause`: `import "./setup";` + - Emit `SideEffect`, `module = source`, no local binding. +- Identifier child of `import_clause`: `import Foo from "./m";` + - Emit `DefaultExport`, `imported_name = local_name = Foo`. +- `named_imports`: `import { a, b as c } from "./m";` + - Emit one `NamedExport` per `import_specifier`. + - `imported_name` is field `name`; `local_name` is field `alias` or `name`. + - Skip `comment` children. +- `namespace_import`: `import * as ns from "./m";` + - Emit `Wildcard`, `namespace/local_name = ns`. +- Type imports: `import type { T } from "./m";`, `import { type T } from "./m";` + - Set `is_type_only` on the statement-wide or specifier-specific import. 
If specifier-level detection is initially awkward in tree-sitter, snapshot it as a known gap rather than resolving incorrectly. + +### Dynamic Imports And Require + +The first vertical slice should include a small, syntax-only subset because existing file import tests expect `require` and dynamic `import()` to surface as imports: + +- Side-effect calls: `require("./m")`, `import("./m")`, `await import("./m")` in expression statements -> `SideEffect`, `is_dynamic = true`. +- Named module binding: `const pkg = require("./m")` or `const pkg = await import("./m")` -> `Module`, `local_name = pkg`, `is_dynamic = true`. +- Destructured binding: `const { a, b: c } = require("./m")` -> one `NamedExport` per simple property binding. +- Member access type/value import: `import("./m").SomeType` or `(await import("./m")).default` -> `NamedExport` or `DefaultExport` when the property is a simple identifier. + +Defer dynamic imports with computed module paths, conditional module expressions, nested object patterns, and non-literal source arguments. + +### Exports + +Emit unresolved `ExportRecord` facts and any directly declared symbols. + +- Declaration exports: + - `export function f() {}`, `export class C {}`, `export interface I {}`, `export type T = ...`, `export enum E {}`, `export namespace N {}`, `export const x = ...` + - Emit the declared `SymbolRecord`. + - Emit `ExportRecord(kind = Named, exported_name = symbol name, declared_symbol_id = symbol_id)`. +- Default declaration/value exports: + - `export default function f() {}`, `export default class C {}`, `export default foo`, `export = foo` + - Emit `ExportRecord(kind = Default)` or `ExportEquals`. + - If the statement declares a named top-level function/class/assignment, link `declared_symbol_id`. + - If anonymous/default value has no durable name, do not invent a `SymbolRecord`; keep only the export fact and value range. 
+- Named export clauses: + - `export { a, b as c };` + - Emit one `ExportRecord(kind = Named)` per `export_specifier`. + - `local_name = name`, `exported_name = alias or name`. +- Re-exports: + - `export { a, b as c } from "./m";` + - Emit one `ImportRecord(from_export = true)` per imported binding and one `ExportRecord` linked to that import. + - `source_module = "./m"`. + - `export { default as Foo } from "./m"` should set the import kind to `DefaultExport`. +- Wildcard re-exports: + - `export * from "./m";` -> `ExportRecord(kind = Wildcard, source_module = "./m")` plus a `Wildcard` import fact from the source. + - `export * as ns from "./m";` -> `ExportRecord(kind = Namespace, exported_name = ns, source_module = "./m")` plus a `Wildcard` import fact with namespace/local name `ns`. +- Type exports: + - `export type { T } from "./types";`, `export type T = ...` + - Set `is_type_only = true`. + +Do not resolve `ExportRecord` targets across files in the first parser/index slice. That belongs to Phase 3. + +## Ranges And Scopes + +Every retained record should be reconstructible from byte ranges against file content: + +- Store byte ranges for file root, declaration, full/extended source, names, module strings, aliases, and statement boundaries. +- Store point ranges for user-facing diagnostics and snapshots. +- Keep both `statement_range` and focused binding/export ranges because current `Import` and `ExportStatement` APIs distinguish a single binding from the whole statement. +- Ranges must be byte offsets from UTF-8 source bytes. Do not derive offsets from Python string indices. + +Minimal phase-1 scope rules: + +- Create one `File` scope per file. +- Create one owned body scope for each top-level class/function/namespace. +- Assign each import/export to the narrowest retained scope by range containment: file scope or nearest top-level symbol body scope. 
+- Do not create scopes for every `if`, `for`, `while`, `try`, match/switch case, lambda/arrow expression, or nested block in the first parser/index slice. +- Do not compute name lookup tables, hoisting, `global`/`nonlocal`, closure captures, or TypeScript block scoping in the first parser/index slice. + +## What The First Parser/Index Slice Must Not Eagerly Materialize + +- Python wrapper objects for every node. +- Persistent tree-sitter node handles after extraction. +- `CodeBlock`, `Statement`, `Expression`, `FunctionCall`, JSX, type-expression, decorator, comment, and docstring objects. +- `rustworkx` graph payloads or Python object graph edges. +- Dependency edges, symbol usage records, superclass/interface edges, import resolution edges, or export resolution edges. +- Full local-variable indexes inside functions/classes. +- External module records beyond unresolved import module strings. +- Directory tree, tsconfig path expansion, sys.path/import override resolution, and package `__init__.py` wildcard semantics. +- Edit/formatting metadata beyond source ranges needed by later lazy handles. + +## Golden Snapshots + +Add Rust IR snapshot tests that compare stable JSON, sorted by `(file_path, range_start, kind, name)` and using interned string values in the debug dump for readability. + +### Python Fixtures + +- `py_symbols_basic.py` + - module imports, `from` imports, aliases, wildcard import + - top-level decorated function, async function, class, simple globals, tuple assignment + - nested function/class/assignment present but absent from phase-1 symbols +- `py_relative_imports.py` + - `from . 
import x`, `from ..pkg.mod import A as B`, `from __future__ import annotations` + - verify raw module text, relative level, future flag +- `py_scopes.py` + - top-level import, import inside a function, import inside a class method + - verify import scope assignment without full nested statement materialization + +### TypeScript/TSX Fixtures + +- `ts_symbols_basic.ts` + - function, generator, class, abstract class, interface, type alias, enum, namespace, const global, arrow-function const +- `ts_imports.ts` + - default, named, aliased named, namespace, side-effect, type-only import +- `ts_dynamic_imports.js` + - `require`, `await import`, destructured require, side-effect require +- `ts_exports.ts` + - declaration exports, default exports, named export clause, re-export clause, wildcard re-export, namespace re-export, type export, export equals +- `tsx_component.tsx` + - JSX in a function component and exported component; verify parser accepts JSX but does not materialize JSX records +- `ts_scopes.ts` + - imports inside top-level function/class body plus top-level exports; verify minimal scope owners + +### Existing Tests To Mine For Source Cases + +- Python import cases: `tests/unit/sdk/python/import_resolution/` +- Python globals: `tests/unit/sdk/python/global_var/` +- TypeScript import cases: `tests/unit/sdk/typescript/file/test_file_import_statemets.py`, `tests/unit/sdk/typescript/import_resolution/` +- TypeScript export cases: `tests/unit/sdk/typescript/file/test_file_export_statements.py`, `tests/unit/sdk/typescript/export/` +- TypeScript globals and arrow functions: `tests/unit/sdk/typescript/global_var/`, `tests/unit/sdk/typescript/function/test_function_arrow.py` + +## Proposed First Vertical Slice + +1. Add Rust parser crate module boundaries and tree-sitter setup. + - `parser::language` maps paths to parser grammar and `Language`. + - `parser::parse_file(path, bytes)` returns parse status and root range. +2. Add arena records and interners. 
+ - `Index` owns files, symbols, imports, exports, scopes, strings, paths. + - JSON debug dump exposes stable, string-expanded snapshots. +3. Implement file discovery input from Python. + - Python passes `(relative_path, absolute_path, language, content bytes/hash)` or a repo-operator file list. + - Rust does not walk the filesystem independently in the first slice. +4. Implement Python extraction. + - File records, top-level class/function/global symbols, imports, ranges, file/top-level symbol scopes. + - Snapshot `py_symbols_basic.py`, `py_relative_imports.py`, and `py_scopes.py`. +5. Implement TypeScript/TSX extraction. + - File records, top-level declaration/global/function symbols, static imports, direct export facts, ranges, scopes. + - Snapshot `ts_symbols_basic.ts`, `ts_imports.ts`, `ts_exports.ts`, and `tsx_component.tsx`. +6. Add dynamic import/require subset. + - Snapshot `ts_dynamic_imports.js`. +7. Expose PyO3 debug/query APIs. + - `files() -> Vec<FileId>` + - `symbols(file_id?) -> Vec<SymbolId>` + - `classes()`, `functions()`, `imports()`, `exports()` + - record lookup APIs returning compact structs or JSON for tests +8. Add parity smoke tests against the Python backend counts/names for the fixture set. + - Compare file paths, symbol names/kinds, import local names/kinds/modules, export names/kinds/modules. + - Do not compare dependency edges or wrapper behavior in this phase. + +## Acceptance For The First Parser/Index Slice + +- Building the Rust index for fixture repos does not instantiate Python `SourceFile`, `Symbol`, `Import`, `Export`, `CodeBlock`, `Statement`, or expression objects. +- Snapshot debug output is deterministic across runs. +- Python and Rust backends agree on top-level file/symbol/import/export counts and names for the selected fixtures. +- Unsupported syntax is represented as an omitted record plus parse warning/debug gap, not as a placeholder Python object.
+- All records have byte ranges and point ranges sufficient to reconstruct source substrings from file bytes. diff --git a/rust-rewrite/resolution-algorithms.md b/rust-rewrite/resolution-algorithms.md new file mode 100644 index 000000000..34874f463 --- /dev/null +++ b/rust-rewrite/resolution-algorithms.md @@ -0,0 +1,309 @@ +# Resolution And Dependency Algorithm Inventory + +## Scope + +This inventory maps the current Python implementation that needs parity in the Rust rewrite. It focuses on: + +- Import resolution: `src/graph_sitter/core/import_resolution.py`, `src/graph_sitter/python/import_resolution.py`, `src/graph_sitter/typescript/import_resolution.py`, `src/graph_sitter/typescript/ts_config.py` +- Export resolution: `src/graph_sitter/core/export.py`, `src/graph_sitter/typescript/export.py`, `src/graph_sitter/core/statements/export_statement.py`, `src/graph_sitter/typescript/file.py` +- Name/scope and type-frame resolution: `src/graph_sitter/core/file.py`, `src/graph_sitter/core/function.py`, `src/graph_sitter/core/expressions/name.py`, `src/graph_sitter/core/expressions/chained_attribute.py`, `src/graph_sitter/compiled/resolution.pyx` +- Usage metadata and dependency edges: `src/graph_sitter/core/dataclasses/usage.py`, `src/graph_sitter/core/interfaces/importable.py`, `src/graph_sitter/core/interfaces/usable.py` +- Subclass/interface dependencies: `src/graph_sitter/core/interfaces/inherits.py`, `src/graph_sitter/core/symbol_groups/parents.py`, `src/graph_sitter/core/class_definition.py`, `src/graph_sitter/core/interface.py` +- Incremental recomputation: `src/graph_sitter/codebase/codebase_context.py`, especially `_process_diff_files` and `_compute_dependencies` + +## Current Graph Model + +The Python backend stores graph nodes as live Python objects in `rustworkx.PyDiGraph`. 
The resolver/dependency graph uses: + +| Concept | Current node/record | Important fields | +| --- | --- | --- | +| File | `SourceFile` | `node_id`, `filepath`, `_nodes`, `code_block`, `valid_symbol_names`, `valid_import_names` | +| Symbol | `Symbol` subclasses | `name`, `full_name`, `symbol_type`, `parent_symbol`, code ranges, nested `code_block` | +| Import | `Import` subclasses | `module`, `symbol_name`, `alias`, `import_type`, `_unique_node`, `to_file_id` | +| Export | `TSExport` | `name`, `exported_name`, `_declared_symbol`, `_exported_symbol`, `_value_node` | +| External module | `ExternalModule` | module/source name, originating import | +| Usage | `Usage` dataclass | `match`, `usage_symbol`, `imported_by`, `usage_type`, `kind` | + +Graph edges: + +| Edge kind | Direction | Meaning | +| --- | --- | --- | +| `IMPORT_SYMBOL_RESOLUTION` | import -> symbol/file/external | Import path/specifier resolution | +| `EXPORT` | export -> symbol/import/file | Export target resolution | +| `SUBCLASS` | class/interface -> class/interface/external | Resolved inheritance/implements relation | +| `SYMBOL_USAGE` | usage owner -> target | Dependency edge with `Usage` metadata | + +`UsageType` is an `IntFlag` with `DIRECT`, `CHAINED`, `INDIRECT`, and `ALIASED`. `UsageKind` records where the reference came from: subclass, typed parameter, type annotation, body, decorator, return type, type definition, exported symbol, wildcard export, generic, imported, wildcard import, or default value. + +## Build And Recomputation Pipeline + +`CodebaseContext._process_diff_files` is the orchestrator: + +1. Clear caches unless this is an incremental add-only update. +2. Start and wait for dependency manager/language engine if configured. +3. Normalize missing `ADD`/`REPARSE` paths into `DELETE`. +4. For deleted files, remove internal edges, unparse nodes, remove graph nodes, and collect predecessor nodes of removed nodes into `to_resolve`. +5. 
For reparsed files, remove internal edges, unparse children, reparse the same file node from disk, and enqueue the file plus all new nodes. +6. For added files, parse and enqueue the file plus all new nodes. +7. Rebuild directory tree and TypeScript configs. +8. For every import in `to_resolve`, remove old import-resolution edges, add new ones, and append `node.symbol_usages` to `to_resolve`. +9. For every export in `to_resolve`, remove old export edges, compute export edges, and append `node.symbol_usages` to `to_resolve`. +10. For every inherited symbol in `to_resolve`, remove old subclass edges and compute superclass dependencies. +11. Run `_compute_dependencies(to_resolve, incremental)`. + +`_compute_dependencies` is a fixed-point queue over Python objects. Each node recomputes outgoing `SYMBOL_USAGE` edges. In incremental mode, `Importable.recompute` removes old usage edges, calls `_compute_dependencies`, and returns `descendant_symbols + file.get_nodes(sort=False)`. In non-incremental mode, each fixed-point round appends every graph node not yet seen. This is correct enough for the object model, but it fans out far beyond the semantic delta. + +## Import Resolution Algorithms + +### Shared Import Flow + +`Import.add_symbol_resolution_edge` calls the language-specific `resolve_import`: + +- If it returns `None`, the import is unresolved internally and gets an `ExternalModule` target keyed by module/source. +- If it returns `symbol`, add `IMPORT_SYMBOL_RESOLUTION` import -> symbol unless it is a self-loop. +- If it returns `imports_file=True`, add `IMPORT_SYMBOL_RESOLUTION` import -> source file. +- `imported_symbol` follows a direct import-resolution edge and, for exports, follows `EXPORT` edges until a non-export target. +- `resolved_symbol` follows chains of imports and stops on cycles. 
+- `names` yields one binding for normal imports, expands wildcard imports through the resolved file's `valid_import_names`, and invalidates importer files when wildcard expansion changes. + +### Python + +`PyImport.resolve_import` resolves from `module`, `symbol_name`, `alias`, and `ImportType`: + +1. Pick `base_path` from the first project or an explicit retry. +2. Convert relative dot imports to absolute dotted paths based on the current file directory. +3. For module and wildcard imports, try `base_path/module/path.py`. +4. For named imports, first try `base_path/module/path/symbol.py` to support importing a submodule as the symbol. +5. Try configured `import_resolution_paths` and optionally `sys.path` before the default graph lookup. +6. Try direct file paths, then package `__init__.py`. +7. For `module.py` or `module/__init__.py`, look up `symbol_name` through `get_node_by_name`. +8. If a symbol is missing but a wildcard import chain can provide it, return the file as `imports_file=True`. +9. If unresolved from repo root, retry with `src`, then `test` if those directories exist. +10. Otherwise return `None` and let the shared layer create/reuse an external module node. + +Python `valid_import_names` extends the base file map for `__init__.py`: child files in the package directory are importable by file stem. + +### TypeScript And JavaScript + +`TSImport` parses static imports, re-export imports, side-effect imports, namespace imports, CommonJS `require`, and dynamic `import()` forms into the same import record shape. + +`TSImport.resolve_import`: + +1. Strip quotes from the import source. +2. Translate aliases through the nearest `TSConfig` if available. +3. Mark relative imports, prepend the project base path for non-prefixed sources, and normalize relative paths against the importing file directory. +4. If the path has no extension and an index file exists, prefer `index.ts`, `index.js`, `index.tsx`, then `index.jsx`. +5. 
Try both the import source and its extensionless stem with extensions: empty, `.ts`, `.d.ts`, `.tsx`, `.d.tsx`, `.js`, `.jsx`. +6. If the target file exists and the import is module-like (`MODULE`, `WILDCARD`, `DEFAULT_EXPORT`, or non-type `SIDE_EFFECT`), resolve to the file. +7. For named imports, resolve to `file.get_export(symbol_name)`. If the export is missing, return the file as `imports_file=True` so module re-export search can resolve later. +8. If no file matches, return `None` for external module handling. + +`TSImport.resolved_symbol` adds TypeScript-specific hops: + +- Default imports can collapse to the single default export's resolved symbol. +- Named imports that initially resolve to a file search module imports in that file with BFS to find re-exported named exports. +- Import chains are followed until a non-import target or a cycle. + +`TSConfig` precomputes alias maps from `extends`, `compilerOptions.baseUrl`, `paths`, `rootDirs`, `outDir`, project `references`, and explicit `import_resolution_overrides`. Alias lookup uses longest-prefix matching and has an optimization to skip non-`@`/`~` imports when all aliases use those prefixes. + +## Export Resolution Algorithms + +Only TypeScript has explicit export nodes. + +`ExportStatement` parses: + +- Declaration exports: exported function, class, variable, interface, type alias, enum, namespace. +- Value exports: `export default value`, `export = value`, object literals, assignment expressions, detached expression values. +- Source re-exports: `export { x as y } from "./m"`, `export * from "./m"`, `export * as ns from "./m"`. +- Local named exports: `export { local as public }`. + +`TSExport.compute_export_dependencies` creates `EXPORT` edges: + +- If the export declared a symbol, export -> declared symbol. +- If it names an existing local/imported symbol, export -> resolved local/import node. +- If it exports a value expression that is `Chainable`, export -> each resolved value target. 
+- If it is a bare wildcard export, export -> current file. +- Wildcard exports invalidate import-name caches in importer files. + +`TSFile.valid_import_names` is export-centric: + +- A single default export is stored under `default`. +- Each export contributes `export.names`: explicit exported names or expanded wildcard-export names. +- TypeScript imports therefore resolve importable names through file exports, not raw file symbols. + +`TSExport.resolved_symbol` follows export/import chains until it reaches a symbol, file, or external module, while tracking cycles. `TSExport._compute_dependencies` separately records usage edges for exported symbols or exported values using `UsageKind.EXPORTED_SYMBOL`. + +## Name, Scope, And Resolution Frames + +### Lexical Lookup + +The core lookup path is recursive and object-centric: + +- `Name._resolved_types` calls `resolve_name(self.source, self.start_byte)`. +- `Editable.resolve_name` delegates to the parent scope, falling back to the file. +- `SourceFile.resolve_name` looks in `valid_symbol_names`, which combines top-level symbols keyed by full name and imports keyed by import names/wildcards. If a candidate starts after the usage byte, it scans previous file symbols backward for the closest visible definition. +- `Function.resolve_name` checks function parameters and descendant symbols in reverse source order before delegating to the parent scope. +- `PyFunction.resolve_name` special-cases method receivers: the first parameter and `super()` resolve to the parent class for non-static methods. +- `TSFunction.resolve_name` special-cases `this` to the parent class. +- `ForLoopStatement.resolve_name` can bind loop variables from the iterable's resolved generic frames. +- `Name.resolve_name` optionally expands conditional-block alternatives when `conditional_type_resolution` is enabled. + +### Resolution Frames + +`Chainable.resolved_type_frames` returns one or more `ResolutionStack` frames with cycle protection. 
Frames carry: + +- `node`: current target or intermediate node +- `parent_frame`: next target in the chain +- `direct`, `aliased`, `chained`: usage classification flags +- `generics`: generic substitutions discovered along the way + +`ResolutionStack.get_edges` emits `SYMBOL_USAGE` edges from the destination owner to every graph node in the resolution stack. This preserves current API behavior where a symbol can be used by an import/export intermediary and by the final callsite. The edge's `Usage` stores the exact match node, owner symbol, usage type, usage kind, and optional importer. + +### Chained Attributes And Calls + +`ChainedAttribute._resolved_types`: + +- Resolves full names directly from `file.valid_import_names` for module-style imports. +- Otherwise resolves the object, then asks the top target to `resolve_attribute(attribute)`. +- If the top target has no attributes, it still yields the top target as a chained dependency and may adjust dict generics for common methods. +- `_compute_dependencies` records chained usage edges and also computes dependencies for the object unless it is `self` or `this`. + +`FunctionCall._resolved_types` resolves calls through the function name. Constructors resolve to their parent class. Functions with return types resolve to the return type, with generic substitution where possible. Unresolved calls still yield a frame for the call itself so dependency computation can continue. `_compute_dependencies` computes argument dependencies, generic type arguments, and then either adds usages for resolved function definitions or computes the name dependency directly. + +## Subclass And Interface Dependencies + +Classes and interfaces implement `Inherits`. + +- Python classes parse `superclasses` into a `Parents` collection. +- TypeScript classes parse `extends_clause` and `implements_clause` from `class_heritage`. +- TypeScript interfaces parse `extends_type_clause` into `parent_interfaces`. 
+- `Parents._compute_dependencies` records normal usage dependencies for parent type expressions and generic type arguments. +- `Parents.compute_superclass_dependencies` resolves each parent expression. If exactly one resolved target is on the graph, it adds a `SUBCLASS` edge from the class/interface to that target. Ambiguous or missing parents are logged and do not get `SUBCLASS` edges. +- `Inherits._get_superclasses` and `_get_subclasses` perform BFS over `SUBCLASS` successors/predecessors, matching the current Python MRO-like traversal. + +Parity requires both edge families: `SYMBOL_USAGE` for the inheritance expression and `SUBCLASS` for inheritance traversal APIs. + +## Where The Current Algorithm Fans Out + +The main fan-out points to avoid in Rust are: + +1. `to_resolve.extend(node.symbol_usages)` during import and export passes. A changed import/export pulls all current users of that object into the recompute queue, even if only one name or target changed. +2. `Importable.recompute(incremental=True)` returns `descendant_symbols + file.get_nodes(sort=False)`. Any changed node schedules the whole file's graph nodes plus nested descendants. +3. Non-incremental `_compute_dependencies` appends every graph node not yet seen on every fixed-point round. +4. Cache invalidation is coarse: `uncache_all()` and file-level `invalidate()` drop broad Python cached properties instead of specific name/export/import indexes. +5. Wildcard imports and exports invalidate importer files by object traversal, not by changed exported-name sets. +6. TypeScript re-export search uses BFS through module imports at query time, so a missing named export can repeatedly search the same module-import frontier. +7. `valid_symbol_names` and `valid_import_names` are derived from live object lists and can expand wildcard imports into many object wrappers. + +The Rust engine should compute semantic deltas first and only enqueue relations whose inputs changed. 
+ +## Required Rust Tables And Indexes + +### Canonical Records + +| Record | Required fields | +| --- | --- | +| `FileRecord` | `FileId`, path ID, language, content hash, parser generation, tsconfig ID, root range | +| `ScopeRecord` | `ScopeId`, file ID, parent scope, owner node, kind, range, hoist behavior | +| `SymbolRecord` | `SymbolId`, file ID, scope ID, name ID, full-name ID, kind, parent symbol, declaration range, body range | +| `ImportRecord` | `ImportId`, file ID, scope ID, module specifier ID, symbol name ID, alias ID, import type, statement range, specifier range | +| `ExportRecord` | `ExportId`, file ID, export name ID, declared symbol/import ID, local exported symbol name ID, value expression ID, export kind, statement range | +| `UsageSiteRecord` | `UsageSiteId`, file ID, scope ID, owner node ID, expression node ID, name/full-name IDs, match range, usage kind | +| `ExternalModuleRecord` | `ExternalId`, module specifier ID, import name ID | +| `GraphEdge` | source ID, target ID, edge kind, optional usage ID | +| `UsageRecord` | usage site, owner node, target node, imported-by import ID, usage type, usage kind, match range | + +### Lookup Indexes + +| Index | Purpose | +| --- | --- | +| `path_to_file` and `module_key_to_file` | O(1) candidate file lookup for Python/TS import paths and package/index files | +| `file_to_nodes`, `file_to_imports`, `file_to_exports`, `file_to_scopes` | Fast deletion/reparse and debug dumps | +| `scope_parent`, `scope_children`, `binding_by_scope_name` | Lexical name lookup without parent object recursion | +| `binding_visibility_by_name` | Resolve nearest visible binding before a usage byte | +| `file_importable_name` | `valid_import_names` equivalent for each file | +| `wildcard_import_expansion` and `wildcard_export_expansion` | Cache expanded names with source file/export generation | +| `import_resolution` | Import -> target file/symbol/export/external and reverse target -> imports | +| `export_target` | Export -> 
symbol/import/file/external and reverse target -> exports | +| `usage_by_owner`, `usage_by_target`, `usage_by_match` | Dependency queries, usages API, rename callsites | +| `edge_by_source_kind`, `edge_by_target_kind` | Efficient graph deletes and parity dumps | +| `subclass_succ`, `subclass_pred` | Superclass/subclass APIs | +| `tsconfig_for_file`, `alias_prefix_to_imports` | Narrow TypeScript alias invalidation | +| `unresolved_by_name`, `external_by_key` | Revisit unresolved references only when matching names/modules appear | + +## Compact Frontier And Invalidation Rules + +### Semantic Deltas + +For each changed file, compute deltas before invalidating dependents: + +- `PathDelta`: file added/deleted/moved or extension/index/package status changed. +- `ConfigDelta`: nearest tsconfig, alias map, baseUrl, paths, or references changed. +- `ImportDelta`: import specifier/module/type/alias changed, added, or removed. +- `ExportNameDelta`: importable names added/removed/retargeted for a file. +- `BindingDelta`: lexical bindings added/removed/renamed/retargeted by `(scope, name, visibility range)`. +- `UsageSiteDelta`: identifier/chained-attribute/function-call sites added/removed/changed owner or range. +- `InheritanceDelta`: parent type expressions or generic args changed. + +### Work Queues + +Use separate queues instead of one object queue: + +1. `ResolveImports`: import IDs whose module candidate set or specifier fields changed. +2. `ResolveExports`: export IDs whose declared/local/import target changed, plus wildcard re-exporters of changed export names. +3. `ResolveNames`: usage sites whose lexical binding candidates changed by name/scope/range. +4. `BuildUsageEdges`: usage sites whose resolution stack changed. +5. `BuildSubclassEdges`: inheritance expressions whose resolved target changed. +6. `PropagateNameExports`: files whose `file_importable_name` set changed. 
+ +### Frontier Rules + +- A changed import spec enqueues only that import for path resolution, then only usage sites bound to that import alias/name. +- A changed file path enqueues imports whose precomputed candidate path set includes the old or new path, plus unresolved imports with matching module suffix. +- A tsconfig change enqueues imports in files covered by that config and imports whose specifier matches changed alias prefixes. +- A file's `ExportNameDelta` enqueues imports targeting that file/name, wildcard imports from that file, and wildcard re-exporters whose expansion includes changed names. +- A `BindingDelta(scope, name)` enqueues usage sites with the same name in descendant scopes whose lookup path crosses the changed scope and whose usage byte is after the binding visibility point. +- A local symbol body change with no binding/import/export/name-set delta only enqueues usage sites inside that symbol owner. +- A parent class/interface expression change enqueues only that class/interface for `SUBCLASS` rebuild and its inheritance-expression usage edges. +- A target deletion enqueues reverse dependents from `usage_by_target`, `import_resolution` reverse index, `export_target` reverse index, and `subclass_pred`, but filtered by changed names where possible. + +The Rust fixed point should operate on relation generations: if a queue item recomputes to the same normalized output tuple, do not enqueue its dependents. + +## Rust Port Plan + +1. Extract compact import/export/scope/usage IR alongside the Python backend and produce debug snapshots without changing behavior. +2. Implement Python import path resolution in Rust with a candidate-path trace for parity debugging. +3. Implement TypeScript import path resolution, including tsconfig alias maps, index files, extension permutations, dynamic imports, and external module records. +4. Implement TypeScript export target resolution and file importable-name tables, including wildcard re-export expansion. +5. 
Implement lexical scope tables and name lookup for file, function, class, parameter, loop, `self`/`super()`, `this`, and conditional-resolution cases. +6. Implement resolution-stack edge emission so normalized `SYMBOL_USAGE` edges include intermediate import/export nodes and the current `UsageType`/`UsageKind`. +7. Implement `SUBCLASS` edge construction from parent/interface expressions and BFS query indexes for superclass/subclass APIs. +8. Add incremental relation generations and the compact work queues above. +9. Expose graph debug dumps through PyO3: nodes, imports, exports, usage sites, resolution stacks, and normalized edges. +10. Keep Python object APIs as wrappers over IDs only after graph edge parity is proven. + +## Edge Parity Tests + +Add Rust-vs-Python golden snapshots using normalized tuples: + +```text +(source_kind, source_file, source_range, source_name, + edge_kind, + target_kind, target_file, target_range, target_name, + usage_type, usage_kind, match_file, match_range, match_text, imported_by_key) +``` + +Required parity categories: + +| Category | Fixtures to cover | +| --- | --- | +| Python imports | module, named, aliased, wildcard, relative dots, package `__init__.py`, custom resolve paths, `src`/`test` fallback, external modules | +| TypeScript imports | default, named, alias, namespace, side-effect, `require`, dynamic import, directory index, extension fallback, tsconfig paths/baseUrl/references, external modules | +| TypeScript exports | declaration exports, default exports, `export =`, object value exports, named local exports, named re-exports, wildcard re-exports, aliased wildcard exports, type-only exports | +| Usage types | direct same-file references, imported references, indirect re-export chains, aliased imports/exports, chained module/class/namespace references | +| Usage kinds | body, decorator, subclass, generic, type annotation, typed parameter, return type, type definition, exported symbol, imported, default value | +| 
Name/scope | nested functions, parameter shadowing, definitions after usage, class methods, Python `self` and `super()`, TypeScript `this`, loop variables, conditional blocks | +| Subclass/interface | Python class bases, TS `extends`, TS `implements`, interface `extends`, generic parent types, external/ambiguous parents | +| Incremental | add file, delete file, reparse no-op, rename import target, change exported name, wildcard export name delta, tsconfig alias delta | + +Existing tests already cover many behavior assertions under `tests/unit/sdk/python/import_resolution`, `tests/unit/sdk/typescript/import_resolution`, `tests/unit/sdk/typescript/export`, `tests/unit/sdk/python/class_definition/test_class_dependencies.py`, `tests/unit/sdk/typescript/class_definition/test_class_dependencies.py`, `tests/unit/sdk/typescript/interface/test_interface_dependencies.py`, `tests/unit/sdk/python/file/test_file_reparse.py`, and `tests/unit/sdk/python/codebase/test_codebase_reset.py`. The Rust parity layer should reuse those fixture shapes and compare graph-edge snapshots directly. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 1a0458e80..a77a98be4 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -108,39 +108,41 @@ Recommended task format: ## Active Worktrees -- [ ] Benchmarks/profiling. owner: Poincare. Agent: `019edc37-802c-7223-8d37-75a51b65abbd`. Branch: `codex/rust-rewrite-benchmarks`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-benchmarks`. -- [ ] API inventory. owner: Dewey. Agent: `019edc37-82ff-7b92-9fac-5364e2d8098b`. Branch: `codex/rust-rewrite-api-inventory`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-api-inventory`. -- [ ] Rust data model. owner: Pasteur. Agent: `019edc37-859c-71b2-b884-ab7a2bfc707e`. Branch: `codex/rust-rewrite-data-model`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-data-model`. -- [ ] Parser/index vertical slice. owner: Meitner. 
Agent: `019edc37-8867-7a83-a18e-b0ec0ca29d11`. Branch: `codex/rust-rewrite-parser-index`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-parser-index`. -- [ ] Resolver/dependency algorithms. owner: Gauss. Agent: `019edc37-8c34-7f93-b0ae-746cbd579962`. Branch: `codex/rust-rewrite-resolver`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-resolver`. -- [ ] Rust engine skeleton. owner: Beauvoir. Agent: `019edc37-8f2d-7dd3-b3ed-a1f9e1b191a7`. Branch: `codex/rust-rewrite-engine-skeleton`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-engine-skeleton`. -- [ ] PyO3/Python compatibility. owner: queued. Branch: `codex/rust-rewrite-pyo3-compat`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-pyo3-compat`. Notes: agent spawn queued until an active helper completes. +- [x] Benchmarks/profiling. owner: Poincare. Agent: `019edc37-802c-7223-8d37-75a51b65abbd`. Branch: `codex/rust-rewrite-benchmarks`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-benchmarks`. Result: benchmark plan and Python backend harness committed. +- [x] API inventory. owner: Dewey. Agent: `019edc37-82ff-7b92-9fac-5364e2d8098b`. Branch: `codex/rust-rewrite-api-inventory`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-api-inventory`. Result: P0/P1/P2 API compatibility inventory committed. +- [x] Rust data model. owner: Pasteur. Agent: `019edc37-859c-71b2-b884-ab7a2bfc707e`. Branch: `codex/rust-rewrite-data-model`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-data-model`. Result: compact Rust-side schema and migration risks committed. +- [x] Parser/index vertical slice. owner: Meitner. Agent: `019edc37-8867-7a83-a18e-b0ec0ca29d11`. Branch: `codex/rust-rewrite-parser-index`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-parser-index`. Result: parser/index extraction plan committed. +- [x] Resolver/dependency algorithms. owner: Gauss. Agent: `019edc37-8c34-7f93-b0ae-746cbd579962`. Branch: `codex/rust-rewrite-resolver`. 
Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-resolver`. Result: resolver algorithm inventory and Rust port plan committed. +- [x] Rust engine skeleton. owner: Beauvoir. Agent: `019edc37-8f2d-7dd3-b3ed-a1f9e1b191a7`. Branch: `codex/rust-rewrite-engine-skeleton`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-engine-skeleton`. Result: standalone Cargo workspace and smoke tests committed. +- [ ] PyO3/Python compatibility. owner: Wegener. Agent: `019edc4e-72b1-7a00-8644-e43503f0cdc3`. Branch: `codex/rust-rewrite-pyo3-compat`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-pyo3-compat`. Notes: spawned after completed agents freed capacity. ## Phase 0: Baseline, RFC, And Contracts -- [ ] Add memory benchmark harness for current Python backend. +- [x] Add memory benchmark harness for current Python backend. owner: Poincare. Result: added `rust-rewrite/tools/measure_python_backend.py`. - [ ] Measure cold parse RSS and wall time for representative repos. - [ ] Measure graph node/edge counts, Python object counts, and per-phase allocation peaks. -- [ ] Document the exact current build phases with timings: file enumeration, parse, directory tree, config parse, import resolution, export resolution, dependency recompute. -- [ ] Inventory all public `Codebase` properties and methods. -- [ ] Inventory all public `SourceFile`, `Symbol`, `Import`, `Export`, and `Directory` APIs used by tests/docs. -- [ ] Define P0 compatibility surface for the first Rust backend slice. +- [x] Document the exact current build phases with timings: file enumeration, parse, directory tree, config parse, import resolution, export resolution, dependency recompute. owner: Poincare. Result: added phase map in `rust-rewrite/benchmarks.md`; representative repo timings remain open. +- [x] Inventory all public `Codebase` properties and methods. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. 
+- [x] Inventory all public `SourceFile`, `Symbol`, `Import`, `Export`, and `Directory` APIs used by tests/docs. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. +- [x] Define P0 compatibility surface for the first Rust backend slice. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [ ] Define large-repo success targets for memory and time. -- [ ] Draft Rust engine RFC with module boundaries and Python integration points. +- [x] Draft compact Rust data model with module boundaries and Python integration points. owner: Pasteur. Result: documented in `rust-rewrite/data-model.md`. +- [ ] Draft full Rust engine RFC with module boundaries and Python integration points. - [ ] Decide build tooling: `maturin`, setuptools-rust, or hatch custom hook. ## Phase 1: Rust Engine Skeleton -- [ ] Add Rust workspace/crate skeleton without changing default behavior. +- [x] Add Rust workspace/crate skeleton without changing default behavior. owner: Beauvoir. Result: added standalone Cargo workspace under `crates/`. - [ ] Add PyO3 module import smoke test. - [ ] Add `graph_backend` config flag with default `python`. - [ ] Add Rust engine facade object that can be constructed from `CodebaseContext`. -- [ ] Add a minimal debug API returning engine version and enabled features. +- [x] Add a minimal debug API returning engine version and enabled features. owner: Beauvoir. Result: added Rust `Engine::debug_info` and feature-gated PyO3 bindings. - [ ] Add CI job that builds the Rust extension on supported Python versions. - [ ] Add benchmark command that can select `--backend python|rust`. ## Phase 2: Parser And Compact Index Vertical Slice +- [x] Specify parser/index vertical slice and extraction rules. owner: Meitner. Result: documented in `rust-rewrite/parser-index.md`. - [ ] Implement Rust file discovery input format from Python repo operator. - [ ] Implement tree-sitter parser setup for Python. 
- [ ] Implement tree-sitter parser setup for TypeScript/TSX. @@ -156,6 +158,7 @@ Recommended task format: ## Phase 3: Resolution And Dependency Graph +- [x] Inventory current resolver/dependency algorithms and Rust relation-table plan. owner: Gauss. Result: documented in `rust-rewrite/resolution-algorithms.md`. - [ ] Port Python import resolution rules. - [ ] Port TypeScript relative import resolution rules. - [ ] Port TypeScript config/path alias handling. @@ -217,5 +220,6 @@ Recommended task format: ## Agent Log -- [ ] 2026-06-18: Initial strategy file created on `rust-rewrite` branch. owner: codex. Notes: ready for helper agents to claim phase tasks. -- [ ] 2026-06-18: Integrator created seven worktrees and spawned six helper agents; PyO3 compatibility is queued due to agent concurrency limit. owner: codex. +- [x] 2026-06-18: Initial strategy file created on `rust-rewrite` branch. owner: codex. Notes: ready for helper agents to claim phase tasks. +- [x] 2026-06-18: Integrator created seven worktrees and spawned six helper agents; PyO3 compatibility was queued due to agent concurrency limit. owner: codex. +- [x] 2026-06-18: Six completed helper branches reviewed and their artifacts staged for integration. owner: codex. Notes: PyO3 compatibility agent is now running as Wegener. 
diff --git a/rust-rewrite/tools/measure_python_backend.py b/rust-rewrite/tools/measure_python_backend.py new file mode 100644 index 000000000..6a22b05f3 --- /dev/null +++ b/rust-rewrite/tools/measure_python_backend.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import gc +import json +import os +import platform +import resource +import subprocess +import sys +import tempfile +import threading +import time +from collections import Counter, defaultdict +from contextlib import contextmanager +from dataclasses import dataclass, field +from functools import wraps +from pathlib import Path +from typing import Any + +REPO_ROOT = Path(__file__).resolve().parents[2] +SRC_ROOT = REPO_ROOT / "src" +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + + +def bytes_to_mb(value: float) -> float: + return value / (1024 * 1024) + + +def current_rss_bytes() -> int: + import psutil + + return int(psutil.Process(os.getpid()).memory_info().rss) + + +def max_rss_bytes() -> int: + rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + if sys.platform == "darwin": + return int(rss) + return int(rss * 1024) + + +@dataclass +class PhaseStats: + calls: int = 0 + wall_seconds: float = 0.0 + rss_peak_bytes: int = 0 + counters: dict[str, int] = field(default_factory=lambda: defaultdict(int)) + + +class Recorder: + def __init__(self, sample_interval: float) -> None: + self.sample_interval = sample_interval + self._lock = threading.Lock() + self._stack: list[str] = [] + self._stop = threading.Event() + self._thread: threading.Thread | None = None + self.phases: dict[str, PhaseStats] = defaultdict(PhaseStats) + self.rss_peak_bytes = 0 + + @contextmanager + def measure(self, phase: str): + with self._lock: + self._stack.append(phase) + start = time.perf_counter() + try: + yield + finally: + elapsed = time.perf_counter() - start + rss = current_rss_bytes() + with self._lock: + if self._stack and self._stack[-1] == phase: + 
self._stack.pop() + elif phase in self._stack: + self._stack.remove(phase) + stats = self.phases[phase] + stats.calls += 1 + stats.wall_seconds += elapsed + stats.rss_peak_bytes = max(stats.rss_peak_bytes, rss) + self.rss_peak_bytes = max(self.rss_peak_bytes, rss) + + def add_counter(self, phase: str, key: str, value: int) -> None: + with self._lock: + self.phases[phase].counters[key] += int(value) + + def start(self) -> None: + self._stop.clear() + self._thread = threading.Thread(target=self._sample_loop, name="rss-sampler", daemon=True) + self._thread.start() + + def stop(self) -> None: + self._stop.set() + if self._thread is not None: + self._thread.join(timeout=max(1.0, self.sample_interval * 4)) + self._sample_once() + + def _sample_loop(self) -> None: + while not self._stop.wait(self.sample_interval): + self._sample_once() + + def _sample_once(self) -> None: + rss = current_rss_bytes() + with self._lock: + self.rss_peak_bytes = max(self.rss_peak_bytes, rss) + if self._stack: + phase = self._stack[-1] + self.phases[phase].rss_peak_bytes = max(self.phases[phase].rss_peak_bytes, rss) + + def as_jsonable(self) -> list[dict[str, Any]]: + rows = [] + for name, stats in sorted(self.phases.items()): + rows.append( + { + "name": name, + "calls": stats.calls, + "wall_seconds": round(stats.wall_seconds, 6), + "rss_peak_mb": round(bytes_to_mb(stats.rss_peak_bytes), 3), + "counters": dict(sorted(stats.counters.items())), + } + ) + return rows + + +def patch_method( + recorder: Recorder, + patches: list[tuple[Any, str, Any]], + owner: Any, + method_name: str, + phase: str, +) -> None: + original = getattr(owner, method_name) + + @wraps(original) + def wrapped(*args, **kwargs): + with recorder.measure(phase): + return original(*args, **kwargs) + + setattr(owner, method_name, wrapped) + patches.append((owner, method_name, original)) + + +def patch_iter_files(recorder: Recorder, patches: list[tuple[Any, str, Any]], repo_operator_cls: Any) -> None: + original = 
repo_operator_cls.iter_files + + @wraps(original) + def wrapped(self, *args, **kwargs): + iterator = original(self, *args, **kwargs) + + def measured_iterator(): + yielded = 0 + while True: + try: + with recorder.measure("repo_iter_files"): + item = next(iterator) + except StopIteration: + break + yielded += 1 + yield item + recorder.add_counter("repo_iter_files", "items_yielded", yielded) + + return measured_iterator() + + repo_operator_cls.iter_files = wrapped + patches.append((repo_operator_cls, "iter_files", original)) + + +def install_instrumentation(recorder: Recorder) -> list[tuple[Any, str, Any]]: + import graph_sitter.core.file as file_module + import graph_sitter.tree_sitter_parser as parser_module + from graph_sitter.codebase.codebase_context import CodebaseContext + from graph_sitter.core.class_definition import Class + from graph_sitter.core.file import SourceFile + from graph_sitter.core.import_resolution import Import + from graph_sitter.core.interface import Interface + from graph_sitter.core.interfaces.importable import Importable + from graph_sitter.core.symbol_groups.parents import Parents + from graph_sitter.git.repo_operator.repo_operator import RepoOperator + from graph_sitter.typescript.config_parser import TSConfigParser + from graph_sitter.typescript.export import TSExport + + patches: list[tuple[Any, str, Any]] = [] + + original_parse_file = file_module.parse_file + + @wraps(original_parse_file) + def parse_file_wrapper(filepath, content): + if isinstance(content, str): + recorder.add_counter("tree_sitter_parse_file", "bytes", len(content.encode("utf-8"))) + with recorder.measure("tree_sitter_parse_file"): + return original_parse_file(filepath, content) + + file_module.parse_file = parse_file_wrapper + patches.append((file_module, "parse_file", original_parse_file)) + if parser_module.parse_file is original_parse_file: + parser_module.parse_file = parse_file_wrapper + patches.append((parser_module, "parse_file", original_parse_file)) + + 
patch_iter_files(recorder, patches, RepoOperator) + patch_method(recorder, patches, CodebaseContext, "build_graph", "build_graph_total") + patch_method(recorder, patches, CodebaseContext, "_process_diff_files", "process_diff_files_total") + patch_method(recorder, patches, CodebaseContext, "build_directory_tree", "directory_tree") + patch_method(recorder, patches, CodebaseContext, "_compute_dependencies", "dependency_fixed_point") + patch_method(recorder, patches, SourceFile, "parse", "sourcefile_object_parse") + patch_method(recorder, patches, Import, "add_symbol_resolution_edge", "import_resolution") + patch_method(recorder, patches, Importable, "recompute", "importable_recompute") + patch_method(recorder, patches, TSConfigParser, "parse_configs", "config_parse") + patch_method(recorder, patches, TSExport, "compute_export_dependencies", "export_resolution") + patch_method(recorder, patches, Class, "compute_superclass_dependencies", "superclass_resolution") + patch_method(recorder, patches, Interface, "compute_superclass_dependencies", "superclass_resolution") + patch_method(recorder, patches, Parents, "compute_superclass_dependencies", "superclass_resolution") + + return patches + + +def restore_patches(patches: list[tuple[Any, str, Any]]) -> None: + for owner, method_name, original in reversed(patches): + setattr(owner, method_name, original) + + +def run_git(repo_path: Path, *args: str) -> None: + subprocess.run(["git", *args], cwd=repo_path, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + +def create_python_fixture(base_dir: Path, file_count: int, functions_per_file: int) -> Path: + repo_path = base_dir / "python-smoke-repo" + package = repo_path / "pkg" + package.mkdir(parents=True) + (package / "__init__.py").write_text("from .module_0 import Class0\n", encoding="utf-8") + for idx in range(file_count): + previous_import = "" if idx == 0 else f"from .module_{idx - 1} import Class{idx - 1}, helper_{idx - 1}_0\n" + functions = "\n\n".join( 
+ [ + f"def helper_{idx}_{fn}(value: int) -> int:\n" + f" total = value + {idx} + {fn}\n" + " return total\n" + for fn in range(functions_per_file) + ] + ) + parent = f"Class{idx - 1}" if idx else "object" + inherited_call = f"helper_{idx - 1}_0(value)" if idx else "value" + content = ( + "from __future__ import annotations\n" + f"{previous_import}\n\n" + f"class Class{idx}({parent}):\n" + " def __init__(self, value: int) -> None:\n" + f" self.value = {inherited_call}\n\n" + " def compute(self) -> int:\n" + f" return helper_{idx}_0(self.value)\n\n" + f"{functions}\n" + ) + (package / f"module_{idx}.py").write_text(content, encoding="utf-8") + run_git(repo_path, "init") + run_git(repo_path, "add", ".") + return repo_path + + +def summarize_graph(codebase: Any) -> dict[str, Any]: + from graph_sitter.core.file import SourceFile + + ctx = codebase.ctx + nodes = list(ctx.nodes) + edges = list(ctx.edges) + node_types = Counter(getattr(node.node_type, "name", str(node.node_type)) for node in nodes) + files = [node for node in nodes if isinstance(node, SourceFile)] + return { + "nodes": len(nodes), + "edges": len(edges), + "node_types": dict(sorted(node_types.items())), + "source_files": len(files), + "source_file_nodes_total": sum(len(getattr(file, "_nodes", [])) for file in files), + "directories": len(getattr(ctx, "directories", {})), + } + + +def summarize_objects(skip: bool) -> dict[str, Any] | None: + if skip: + return None + gc.collect() + counts: Counter[str] = Counter() + total = 0 + for obj in gc.get_objects(): + cls = type(obj) + module = getattr(cls, "__module__", "") + if not isinstance(module, str): + continue + if module.startswith("graph_sitter"): + total += 1 + counts[f"{module}.{cls.__qualname__}"] += 1 + return { + "graph_sitter_objects": total, + "top_classes": counts.most_common(30), + } + + +def build_codebase(args: argparse.Namespace) -> tuple[Any, Path, bool, tempfile.TemporaryDirectory[str] | None]: + from graph_sitter.configs.models.codebase 
import CodebaseConfig + from graph_sitter.core.codebase import Codebase + + temp_dir: tempfile.TemporaryDirectory[str] | None = None + generated_fixture = False + if args.repo is None: + temp_dir = tempfile.TemporaryDirectory(prefix="graph-sitter-bench-") + repo_path = create_python_fixture(Path(temp_dir.name), args.fixture_files, args.fixture_functions) + generated_fixture = True + else: + repo_path = Path(args.repo).expanduser().resolve() + + config = CodebaseConfig(disable_graph=args.disable_graph) + language = None if args.language == "auto" else args.language + codebase = Codebase(str(repo_path), language=language, config=config) + return codebase, repo_path, generated_fixture, temp_dir + + +def make_report(args: argparse.Namespace) -> dict[str, Any]: + recorder = Recorder(sample_interval=args.sample_interval) + patches = install_instrumentation(recorder) + rss_start = current_rss_bytes() + start = time.perf_counter() + recorder.start() + temp_dir = None + try: + with recorder.measure("codebase_construct"): + codebase, repo_path, generated_fixture, temp_dir = build_codebase(args) + finally: + recorder.stop() + restore_patches(patches) + wall = time.perf_counter() - start + rss_end = current_rss_bytes() + + report = { + "metadata": { + "repo_path": str(repo_path), + "generated_fixture": generated_fixture, + "language": args.language, + "disable_graph": args.disable_graph, + "python": sys.version, + "platform": platform.platform(), + "sample_interval_seconds": args.sample_interval, + "command": " ".join(sys.argv), + }, + "totals": { + "wall_seconds": round(wall, 6), + "rss_start_mb": round(bytes_to_mb(rss_start), 3), + "rss_end_mb": round(bytes_to_mb(rss_end), 3), + "rss_peak_sampled_mb": round(bytes_to_mb(recorder.rss_peak_bytes), 3), + "max_rss_mb": round(bytes_to_mb(max_rss_bytes()), 3), + }, + "phases": recorder.as_jsonable(), + "graph": summarize_graph(codebase), + "objects": summarize_objects(args.skip_object_counts), + } + if temp_dir is not None: + 
temp_dir.cleanup() + return report + + +def print_human(report: dict[str, Any]) -> None: + totals = report["totals"] + graph = report["graph"] + print(f"repo: {report['metadata']['repo_path']}") + print(f"wall: {totals['wall_seconds']:.3f}s") + print(f"rss: start={totals['rss_start_mb']:.1f} MB end={totals['rss_end_mb']:.1f} MB peak={totals['rss_peak_sampled_mb']:.1f} MB max={totals['max_rss_mb']:.1f} MB") + print(f"graph: nodes={graph['nodes']} edges={graph['edges']} files={graph['source_files']} file_nodes={graph['source_file_nodes_total']}") + print("phases:") + for phase in report["phases"]: + print( + f" {phase['name']}: calls={phase['calls']} " + f"wall={phase['wall_seconds']:.3f}s rss_peak={phase['rss_peak_mb']:.1f} MB" + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Measure current graph-sitter Python backend cold parse RSS and wall time.") + parser.add_argument("repo", nargs="?", help="Path to a git repository. If omitted, a tiny Python fixture repo is generated.") + parser.add_argument("--language", choices=["auto", "python", "typescript"], default="auto", help="Language passed to Codebase.") + parser.add_argument("--disable-graph", action="store_true", help="Set CodebaseConfig(disable_graph=True) to isolate parse/object materialization.") + parser.add_argument("--fixture-files", type=int, default=8, help="Generated fixture Python module count when repo is omitted.") + parser.add_argument("--fixture-functions", type=int, default=8, help="Generated helper functions per fixture module when repo is omitted.") + parser.add_argument("--sample-interval", type=float, default=0.01, help="RSS sampling interval in seconds.") + parser.add_argument("--skip-object-counts", action="store_true", help="Skip post-run gc object counting.") + parser.add_argument("--output", type=Path, help="Optional path to write JSON report.") + parser.add_argument("--json", action="store_true", help="Print JSON report instead of a human 
summary.") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + report = make_report(args) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(report, indent=2, sort_keys=True) + "\n", encoding="utf-8") + if args.json: + print(json.dumps(report, indent=2, sort_keys=True)) + else: + print_human(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From 37e61c7a6b9e505d3ddc7e65bcc2c6ef979c69c3 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:00:24 -0700 Subject: [PATCH 005/228] Integrate Python compatibility plan --- rust-rewrite/python-compat.md | 268 ++++++++++++++++++++++++++++++++++ rust-rewrite/strategy.md | 4 +- 2 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 rust-rewrite/python-compat.md diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md new file mode 100644 index 000000000..46982e91d --- /dev/null +++ b/rust-rewrite/python-compat.md @@ -0,0 +1,268 @@ +# Python/PyO3 Compatibility Plan + +## Objective + +Preserve the current Python shell and codemod API while allowing a Rust engine to own canonical graph storage. The compatibility layer must not recreate today's full Python object graph when the Rust backend is selected. Python objects should be lightweight handles over Rust IDs and should only be created for files, symbols, imports, exports, or usages that user code actually accesses. + +## Current Python Shape + +Key findings from the current code: + +- `Codebase` is the user facade. Public list properties such as `files`, `symbols`, `classes`, `functions`, `imports`, and `exports` mostly call `CodebaseContext.get_nodes(...)`, then sort/filter Python objects. +- `CodebaseContext` owns a `rustworkx.PyDiGraph` whose node payloads are Python objects. 
It also owns `filepath_idx`, directory state, parser/config/dependency managers, transaction state, and import/export/dependency graph mutation helpers. +- `SourceFile.__init__` immediately adds itself to the graph, parses the tree-sitter root, fills `file._nodes`, and registers the file path. +- `Importable.__init__` adds most child nodes to the graph and appends each child to `file._nodes`. +- `Editable` assumes a persistent `tree_sitter.Node`, `ctx`, `parent`, and `file_node_id`. Many inherited methods rely on `ts_node`, `parent`, and a populated Python graph. +- The compiled setup is Cython-based today under `graph_sitter.compiled`; wheel builds use a Hatch Cython hook. `cibuildwheel` already installs Rust toolchains, but no Rust extension build hook is active. + +These constructors make "subclass the current objects and call `super().__init__`" the wrong default for Rust-backed objects. The Rust path needs separate lazy handle initialization that bypasses eager graph insertion and only materializes the Python tree on explicit fallback. + +## Backend Flag Shape + +Add a first-class graph backend setting to `CodebaseConfig` without changing the default behavior: + +```python +class GraphBackend(StrEnum): + PYTHON = "python" + RUST = "rust" + AUTO = "auto" + + +class RustFallbackMode(StrEnum): + PYTHON = "python" + ERROR = "error" + + +class CodebaseConfig(BaseConfig): + graph_backend: GraphBackend = GraphBackend.PYTHON + rust_fallback: RustFallbackMode = RustFallbackMode.PYTHON +``` + +Environment variables follow the existing `BaseConfig` prefix behavior: + +- `CODEBASE_GRAPH_BACKEND=python|rust|auto` +- `CODEBASE_RUST_FALLBACK=python|error` + +Selection policy: + +- `python`: always use the current `PyDiGraph` backend. +- `rust`: require the PyO3 extension and supported language/config. If unavailable or unsupported, obey `rust_fallback`. 
+- `auto`: try Rust only when the language and config are known supported; otherwise use Python without warning unless debug logging is enabled. + +`use_pink` should remain separate from `graph_backend`. Pink currently acts as an alternate file listing/file IO path for some modes, not as the graph engine. Initial Rust graph work should reject or fall back when `use_pink == PinkMode.ALL_FILES`, because `Codebase.files` is already delegated to `codegen_sdk_pink` in that mode. + +## Backend Facade + +Introduce a narrow internal facade owned by `CodebaseContext`: + +```python +class GraphBackendFacade(Protocol): + kind: Literal["python", "rust"] + generation: int + + def build(self, repo_operator: RepoOperator) -> None: ... + def apply_diffs(self, diff_list: list[DiffLite]) -> None: ... + + def get_file(self, file_path: os.PathLike, *, ignore_case: bool = False) -> SourceFile | None: ... + def get_node(self, node_id: int) -> Importable: ... + def get_nodes(self, node_type: NodeType | None = None, exclude_type: NodeType | None = None) -> list[Importable]: ... + + def successors(self, node_id: int, *, edge_type: EdgeType | None = None, sort: bool = True) -> Sequence[Importable]: ... + def predecessors(self, node_id: int, edge_type: EdgeType | None = None) -> Sequence[Importable]: ... + def in_edges(self, node_id: int) -> list[EdgeRecord]: ... + def out_edges(self, node_id: int) -> list[EdgeRecord]: ... +``` + +Implementation split: + +- `PythonGraphBackend` wraps the existing `CodebaseContext` graph fields and behavior. This is a mechanical extraction target and keeps default behavior identical. +- `RustGraphBackend` wraps a PyO3 `Engine` object and exposes the same query surface by converting Rust IDs into lazy Python handles. + +Migration order: + +1. Add the config flag and facade with `PythonGraphBackend` only. +2. Add PyO3 import smoke test and `RustGraphBackend.engine_version()`. +3. Route only read/list APIs through the facade. +4. 
Add Rust-backed query methods one family at a time. +5. Keep graph mutation and transaction-heavy APIs on Python or explicit fallback until Rust patch intents exist. + +## PyO3 Surface + +Expose a private extension module, for example `graph_sitter._rust`, with one main PyO3 class: + +```python +class Engine: + @staticmethod + def version() -> str: ... + + def build(self, input: BuildInput) -> BuildReport: ... + def apply_diffs(self, diffs: list[DiffRecord]) -> InvalidationReport: ... + + def files(self) -> list[int]: ... + def symbols(self, kind: SymbolKind | None = None, top_level_only: bool = True) -> list[int]: ... + def imports(self) -> list[int]: ... + def exports(self) -> list[int]: ... + + def file_record(self, id: int) -> FileRecord: ... + def symbol_record(self, id: int) -> SymbolRecord: ... + def import_record(self, id: int) -> ImportRecord: ... + def export_record(self, id: int) -> ExportRecord: ... + + def successors(self, object_ref: ObjectRef, edge_type: EdgeType | None) -> list[ObjectRef]: ... + def predecessors(self, object_ref: ObjectRef, edge_type: EdgeType | None) -> list[ObjectRef]: ... + def source_slice(self, file_id: int, start_byte: int, end_byte: int) -> str: ... +``` + +Rust can keep typed IDs internally. Python needs a compatibility `node_id: int`, so `RustGraphBackend` should maintain a per-context mapping between Python node IDs and typed Rust refs: + +- `python_node_id -> ObjectRef(kind, rust_id)` +- `ObjectRef(kind, rust_id) -> python_node_id` + +This preserves current APIs that pass `node_id` back to `ctx.get_node(...)` while avoiding assumptions that Rust IDs are globally interchangeable with today's `PyDiGraph` IDs. + +## Lazy Handle Classes + +Use a handle mixin plus concrete public-class subclasses to preserve `isinstance` behavior where practical: + +```python +class RustHandleMixin: + _ctx: CodebaseContext + _backend: RustGraphBackend + _ref: ObjectRef + _node_id: int + _generation: int + _record_cache: object | None + _materialized: Importable | None + + @property + def node_id(self) -> int: ...
+ def _record(self): ... + def _ensure_current(self) -> None: ... + def _materialize(self, reason: str) -> Importable: ... +``` + +Concrete handle classes: + +- `RustSourceFile(RustHandleMixin, SourceFile)` +- `RustPyFile(RustSourceFile, PyFile)` +- `RustTSFile(RustSourceFile, TSFile)` +- `RustSymbol(RustHandleMixin, Symbol)` +- `RustPySymbol`, `RustTSSymbol`, plus class/function/interface/type/global-var variants as needed for user-visible type checks +- `RustImport(RustHandleMixin, Import)` +- `RustPyImport`, `RustTSImport` +- `RustExport(RustHandleMixin, Export)`, TypeScript only at first + +These classes must not call the eager base constructors. Construction happens through a factory: + +```python +handle = backend.handle_for(ObjectRef(kind="symbol", id=42)) +``` + +The factory should use a `WeakValueDictionary` keyed by `(generation, kind, rust_id)` so repeated access can preserve object identity while alive without pinning every graph node in memory. + +Field-backed P0 properties should read from Rust records and avoid materialization: + +- common: `node_id`, `node_type`, `filepath`, `file_path`, `path`, `name`, `source`, `start_byte`, `end_byte`, `start_point`, `end_point`, `range` +- files: `content`, `content_bytes`, `extension`, `imports`, `symbols`, TypeScript `exports` +- symbols: `symbol_type`, `full_name`, `is_top_level`, `file`, `parent_symbol` when Rust has parent IDs +- imports: `module`, `symbol_name`, `alias`, `import_type`, `from_file`, `to_file`, `imported_symbol`, `resolved_symbol` +- exports: `name`, `exported_name`, `exported_symbol`, `resolved_symbol`, `is_named_export`, `is_module_export` + +Properties that need `ts_node`, `code_block`, arbitrary parent traversal, formatting-specific edit behavior, or Python-only resolver details should call `_materialize(...)` or raise in strict mode. + +## Lazy Object Lifecycle + +1. `Codebase` construction creates `CodebaseContext`. +2. `CodebaseContext` resolves the backend from config. +3. 
Python backend follows the existing eager graph path. +4. Rust backend builds Rust indexes and records, but no Python `SourceFile`, `Symbol`, `Import`, or `Export` objects are created during build. +5. Public list queries ask the engine for sorted IDs and wrap only those returned IDs in handles. +6. Handle metadata is loaded on first property access and cached per handle. +7. Nested queries are also ID based. For example, `file.symbols` asks Rust for symbol IDs in that file and wraps only those IDs. +8. A handle records the backend generation. After `apply_diffs`, handles either rebind through stable IDs or become outdated and follow the existing stale-node semantics. +9. If user code requests unsupported Python behavior, the handle uses the fallback policy below. + +Avoiding full materialization: + +- Do not keep `file._nodes` for Rust-backed files. Expose `get_nodes(...)` by querying Rust for IDs. +- Do not create persistent Python `tree_sitter.Node` wrappers for every record. Use ranges and source slices. +- Do not back Rust handles with `PyDiGraph` node payloads. If a compatibility `node_id` is needed, it is a facade ID, not a graph index. +- Do not call `sort_editables` on a hidden eager graph. Either engine returns stable sorted IDs, or handles expose the small set of sort fields needed by existing callers. + +## Fallback Policy + +Fallback has two levels. + +Cold fallback: + +- Used when the Rust extension is missing, the language/config is unsupported, engine build fails, or `use_pink == PinkMode.ALL_FILES`. +- If `rust_fallback == "python"` or `graph_backend == "auto"`, log the reason and build the current Python backend. +- If `rust_fallback == "error"` and `graph_backend == "rust"`, raise a `RustBackendUnavailableError` with the exact unsupported feature or import/build failure. 
+ +Method fallback: + +- Read-only, file-local unsupported behavior can materialize one file through the current parser, locate the matching Python object by `(kind, range, name)`, and delegate the method. +- Graph-wide unsupported behavior, dependency recomputation, and resolver operations that require a populated `PyDiGraph` should promote the whole context to the Python backend unless strict mode is enabled. +- Mutations should initially prefer Python promotion. Direct Rust-handle range edits can come later as patch intents, but structural helpers such as `move_to_file`, `add_import`, `remove_unused_exports`, or usage-based `rename` need Python graph semantics until Rust owns those flows. +- On any promotion, clear Rust handle caches, increment context generation, and make old handles outdated rather than half-valid. + +Strict behavior: + +- In `rust_fallback == "error"`, unsupported method access raises `RustBackendUnsupportedError(method=..., handle=..., reason=...)`. +- Tests should run some parity slices in strict mode to catch accidental Python promotion. + +## Packaging Impact + +Current packaging state: + +- `hatch.toml` uses a Hatch Cython hook to compile selected `graph_sitter.compiled` modules. +- `pyproject.toml` uses `hatchling.build`. +- `cibuildwheel` already installs Rust on Linux and macOS, but no PyO3 build hook is configured. + +Recommended packaging path: + +- Add a Rust workspace with `graph_sitter_engine` and `graph_sitter_py`. +- Publish the PyO3 module as `graph_sitter._rust` so the public package namespace stays stable. +- Keep the extension optional at import time. Default `graph_backend="python"` must work without the Rust binary. +- Use a Hatch-compatible Rust build hook or a small custom Hatch hook that invokes `maturin` for the PyO3 crate and adds the built extension to wheel artifacts. +- Add `maturin` or the selected hook to `build-system.requires` and build hook dependencies when implementation starts. 
+- Ensure `sdist` includes `Cargo.toml`, `Cargo.lock` if policy chooses locked builds, crate sources, and any tree-sitter grammar inputs required by Rust. +- Keep Cython modules in place. The Rust handle layer can still import `graph_sitter.compiled.sort`, `autocommit`, and `utils` for the Python backend and fallback paths. +- Start with CPython-version-specific wheels rather than `abi3` unless PyO3 and tree-sitter dependencies are confirmed compatible with `abi3`. +- Add a CI smoke job that imports `graph_sitter._rust`, checks `Engine.version()`, and builds a minimal Python fixture with `CODEBASE_GRAPH_BACKEND=rust`. + +## Initial Tests + +Config and selection: + +- `CodebaseConfig().graph_backend == "python"` keeps current behavior. +- `CODEBASE_GRAPH_BACKEND=rust` selects Rust when the extension is importable. +- `graph_backend="auto"` falls back to Python for unsupported languages/config without changing user-facing `Codebase` construction. +- `graph_backend="rust", rust_fallback="error"` raises on missing extension or unsupported feature. + +Facade parity: + +- Existing small Python fixtures: compare `files`, `symbols`, `classes`, `functions`, and `imports` names, paths, ranges, and sort order between Python and Rust backends. +- Existing small TypeScript fixtures: compare `files`, `symbols`, `classes`, `functions`, `imports`, and `exports` names, paths, ranges, and sort order. +- `get_file`, `has_file`, `get_symbol`, `get_class`, and `get_function` return compatible results. + +Lazy behavior: + +- Rust backend construction does not call eager `SourceFile.__init__`, `Symbol.__init__`, `Import.__init__`, or `Export.__init__`. +- `codebase.files` creates handles only for returned files and does not populate `ctx._graph` with Python file payloads. +- `codebase.symbols` creates top-level symbol handles only, not every parsed AST node. +- `file.symbols`, `file.imports`, and TypeScript `file.exports` only wrap IDs for that file. 
+- Handle properties `name`, `filepath`, `source`, `start_byte`, and `end_byte` do not materialize Python tree-sitter nodes. + +Fallback: + +- Accessing an unsupported file-local property materializes only the containing file in non-strict fallback mode. +- Accessing an unsupported graph-wide mutation promotes to Python backend in non-strict fallback mode. +- The same unsupported accesses raise `RustBackendUnsupportedError` in strict mode. +- Old handles become outdated after promotion or `apply_diffs`. + +Packaging: + +- Wheel build includes both existing Cython extensions and `graph_sitter._rust`. +- Importing `graph_sitter` with `graph_backend="python"` succeeds if `graph_sitter._rust` is absent. +- Importing `graph_sitter._rust` succeeds in CI wheels for supported Python versions and platforms. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index a77a98be4..3075fcc71 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -114,7 +114,7 @@ Recommended task format: - [x] Parser/index vertical slice. owner: Meitner. Agent: `019edc37-8867-7a83-a18e-b0ec0ca29d11`. Branch: `codex/rust-rewrite-parser-index`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-parser-index`. Result: parser/index extraction plan committed. - [x] Resolver/dependency algorithms. owner: Gauss. Agent: `019edc37-8c34-7f93-b0ae-746cbd579962`. Branch: `codex/rust-rewrite-resolver`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-resolver`. Result: resolver algorithm inventory and Rust port plan committed. - [x] Rust engine skeleton. owner: Beauvoir. Agent: `019edc37-8f2d-7dd3-b3ed-a1f9e1b191a7`. Branch: `codex/rust-rewrite-engine-skeleton`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-engine-skeleton`. Result: standalone Cargo workspace and smoke tests committed. -- [ ] PyO3/Python compatibility. owner: Wegener. Agent: `019edc4e-72b1-7a00-8644-e43503f0cdc3`. Branch: `codex/rust-rewrite-pyo3-compat`. 
Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-pyo3-compat`. Notes: spawned after completed agents freed capacity. +- [x] PyO3/Python compatibility. owner: Wegener. Agent: `019edc4e-72b1-7a00-8644-e43503f0cdc3`. Branch: `codex/rust-rewrite-pyo3-compat`. Worktree: `/Users/jayhack/CS/CODEGEN/graph-sitter-rust-pyo3-compat`. Result: compatibility plan committed. ## Phase 0: Baseline, RFC, And Contracts @@ -174,6 +174,7 @@ Recommended task format: ## Phase 4: Lazy Python Compatibility Layer +- [x] Plan Python/PyO3 compatibility layer and lazy handle migration. owner: Wegener. Result: documented in `rust-rewrite/python-compat.md`. - [ ] Define Python handle base class that stores engine reference and stable ID. - [ ] Implement Rust-backed file handles for P0 `SourceFile` APIs. - [ ] Implement Rust-backed symbol handles for P0 `Symbol`, `Class`, and `Function` APIs. @@ -223,3 +224,4 @@ Recommended task format: - [x] 2026-06-18: Initial strategy file created on `rust-rewrite` branch. owner: codex. Notes: ready for helper agents to claim phase tasks. - [x] 2026-06-18: Integrator created seven worktrees and spawned six helper agents; PyO3 compatibility was queued due to agent concurrency limit. owner: codex. - [x] 2026-06-18: Six completed helper branches reviewed and their artifacts staged for integration. owner: codex. Notes: PyO3 compatibility agent is now running as Wegener. +- [x] 2026-06-18: PyO3 compatibility helper completed and its planning artifact was staged for integration. owner: codex. 
From b12020dfdc3a51f692c73013242838649586e51a Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:26:14 -0700 Subject: [PATCH 006/228] Implement Rust Python compact index slice --- .gitignore | 1 + Cargo.lock | 371 ++++++++++++++ Cargo.toml | 4 + crates/graph-sitter-engine/Cargo.toml | 8 + .../examples/index_python.rs | 45 ++ crates/graph-sitter-engine/src/lib.rs | 456 +++++++++++++++++- crates/graph-sitter-py/src/lib.rs | 7 +- rust-rewrite/benchmarks.md | 47 ++ rust-rewrite/strategy.md | 16 +- .../tools/compare_rust_python_index.py | 233 +++++++++ 10 files changed, 1178 insertions(+), 10 deletions(-) create mode 100644 Cargo.lock create mode 100644 crates/graph-sitter-engine/examples/index_python.rs create mode 100644 rust-rewrite/tools/compare_rust_python_index.py diff --git a/.gitignore b/.gitignore index 8b55b255c..dc8e3c720 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ alembic_versions_backup **/*.c **/build/ **/dist/ +target/ **/*.so **/.diffs/** **/.coverage* diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 000000000..b24474d2b --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,371 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "cc" +version = "1.2.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "graph-sitter-engine" +version = "0.1.0" +dependencies = [ + "serde", + "serde_json", + "tree-sitter", + "tree-sitter-python", +] + +[[package]] +name = "graph-sitter-py" +version = "0.1.0" +dependencies = [ + "graph-sitter-engine", + "pyo3", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = 
"2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.22.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "tree-sitter" +version = "0.26.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dab76d0b724ba557954125188cf0633a1ca43199ced82d95c7b9c32cc3de1f3" +dependencies = [ + "cc", + "regex", + "regex-syntax", + "serde_json", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" + +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bf85fd39652e740bf60f46f4cda9492c3a9ad75880575bf14960f775cb74a1c" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index f70a5907d..f8943d5df 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,7 @@ repository = "https://github.com/codegen-sh/graph-sitter" [workspace.dependencies] graph-sitter-engine = { path = "crates/graph-sitter-engine" } pyo3 = "0.22" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +tree-sitter = "0.26" +tree-sitter-python = "0.25" diff --git a/crates/graph-sitter-engine/Cargo.toml b/crates/graph-sitter-engine/Cargo.toml index 2a2ef9603..ae39702f1 100644 --- a/crates/graph-sitter-engine/Cargo.toml +++ b/crates/graph-sitter-engine/Cargo.toml @@ -9,3 +9,11 @@ repository.workspace = true [lib] name = "graph_sitter_engine" path = "src/lib.rs" + +[dependencies] +serde.workspace = true +tree-sitter.workspace = true +tree-sitter-python.workspace = true + +[dev-dependencies] +serde_json.workspace = true diff --git a/crates/graph-sitter-engine/examples/index_python.rs b/crates/graph-sitter-engine/examples/index_python.rs new file mode 100644 index 000000000..20fb79ef5 --- /dev/null +++ b/crates/graph-sitter-engine/examples/index_python.rs @@ -0,0 +1,45 @@ +use graph_sitter_engine::index_python_path; +use std::env; +use std::error::Error; +use std::time::Instant; + +fn main() -> Result<(), Box> { + let mut args = env::args().skip(1); + let Some(repo_path) = args.next() else { + eprintln!("usage: cargo run -p graph-sitter-engine --example index_python -- [--json]"); + std::process::exit(2); + }; + let json = args.any(|arg| arg == "--json"); + + let started = Instant::now(); + let index = index_python_path(&repo_path)?; + let elapsed = started.elapsed(); + let summary = index.summary(); + + if json { + println!( + "{}", + serde_json::json!({ + "repo_path": repo_path, + "wall_seconds": elapsed.as_secs_f64(), + "summary": summary, + }) + ); + } else { + println!("repo: {repo_path}"); + println!("wall: {:.6}s", elapsed.as_secs_f64()); + println!( + "index: files={} symbols={} classes={} functions={} imports={} bytes={} lines={} files_with_errors={}", + summary.files, + 
summary.symbols, + summary.classes, + summary.functions, + summary.imports, + summary.bytes, + summary.lines, + summary.files_with_errors + ); + } + + Ok(()) +} diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 913bac76b..30f9c708f 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1,6 +1,13 @@ #![forbid(unsafe_code)] -const ENABLED_FEATURES: &[&str] = &["skeleton"]; +use serde::Serialize; +use std::fmt; +use std::fs; +use std::io; +use std::path::{Path, PathBuf}; +use tree_sitter::{Node, Parser, Range, Tree}; + +const ENABLED_FEATURES: &[&str] = &["skeleton", "python-index"]; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct EngineInfo { @@ -37,6 +44,13 @@ impl Engine { pub fn enabled_features(&self) -> &'static [&'static str] { ENABLED_FEATURES } + + pub fn index_python_path( + &self, + repo_path: impl AsRef, + ) -> Result { + PythonIndexer::new()?.index_path(repo_path) + } } pub fn engine_version() -> &'static str { @@ -50,15 +64,451 @@ pub fn debug_info() -> EngineInfo { } } +pub fn index_python_path(repo_path: impl AsRef) -> Result { + Engine::new().index_python_path(repo_path) +} + +#[derive(Debug)] +pub enum IndexError { + Io { path: PathBuf, source: io::Error }, + ParseFailed { path: PathBuf }, + Language(tree_sitter::LanguageError), +} + +impl fmt::Display for IndexError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Io { path, source } => write!(f, "failed to read {}: {source}", path.display()), + Self::ParseFailed { path } => { + write!(f, "tree-sitter failed to parse {}", path.display()) + } + Self::Language(source) => { + write!(f, "failed to load tree-sitter Python language: {source}") + } + } + } +} + +impl std::error::Error for IndexError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Io { source, .. 
} => Some(source), + Self::Language(source) => Some(source), + Self::ParseFailed { .. } => None, + } + } +} + +impl From for IndexError { + fn from(value: tree_sitter::LanguageError) -> Self { + Self::Language(value) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct PythonIndex { + pub files: Vec, + pub symbols: Vec, + pub imports: Vec, +} + +impl PythonIndex { + pub fn summary(&self) -> IndexSummary { + IndexSummary { + files: self.files.len(), + symbols: self.symbols.len(), + classes: self + .symbols + .iter() + .filter(|symbol| symbol.kind == SymbolKind::Class) + .count(), + functions: self + .symbols + .iter() + .filter(|symbol| symbol.kind == SymbolKind::Function) + .count(), + imports: self.imports.len(), + bytes: self.files.iter().map(|file| file.byte_len).sum(), + lines: self.files.iter().map(|file| file.line_count).sum(), + files_with_errors: self.files.iter().filter(|file| file.has_error).count(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct IndexSummary { + pub files: usize, + pub symbols: usize, + pub classes: usize, + pub functions: usize, + pub imports: usize, + pub bytes: usize, + pub lines: usize, + pub files_with_errors: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct FileRecord { + pub id: u32, + pub path: String, + pub byte_len: usize, + pub line_count: usize, + pub has_error: bool, + pub root_range: SourceRange, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum SymbolKind { + Class, + Function, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct SymbolRecord { + pub id: u32, + pub file_id: u32, + pub name: String, + pub kind: SymbolKind, + pub range: SourceRange, + pub name_range: SourceRange, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ImportKind { + Import, + FromImport, + FutureImport, +} + +#[derive(Debug, Clone, PartialEq, 
Eq, Serialize)] +pub struct ImportRecord { + pub id: u32, + pub file_id: u32, + pub kind: ImportKind, + pub module: Option, + pub name: Option, + pub alias: Option, + pub range: SourceRange, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +pub struct SourceRange { + pub start_byte: usize, + pub end_byte: usize, + pub start_row: usize, + pub start_column: usize, + pub end_row: usize, + pub end_column: usize, +} + +impl From for SourceRange { + fn from(value: Range) -> Self { + Self { + start_byte: value.start_byte, + end_byte: value.end_byte, + start_row: value.start_point.row, + start_column: value.start_point.column, + end_row: value.end_point.row, + end_column: value.end_point.column, + } + } +} + +struct PythonIndexer { + parser: Parser, +} + +impl PythonIndexer { + fn new() -> Result { + let mut parser = Parser::new(); + parser.set_language(&tree_sitter_python::LANGUAGE.into())?; + Ok(Self { parser }) + } + + fn index_path(mut self, repo_path: impl AsRef) -> Result { + let repo_path = repo_path.as_ref(); + let mut index = PythonIndex { + files: Vec::new(), + symbols: Vec::new(), + imports: Vec::new(), + }; + let mut paths = Vec::new(); + collect_python_files(repo_path, &mut paths)?; + paths.sort(); + + for path in paths { + let file_id = index.files.len() as u32; + let content = fs::read_to_string(&path).map_err(|source| IndexError::Io { + path: path.clone(), + source, + })?; + let tree = self + .parser + .parse(&content, None) + .ok_or_else(|| IndexError::ParseFailed { path: path.clone() })?; + let root = tree.root_node(); + let relative_path = path + .strip_prefix(repo_path) + .unwrap_or(path.as_path()) + .to_string_lossy() + .replace('\\', "/"); + + index.files.push(FileRecord { + id: file_id, + path: relative_path, + byte_len: content.len(), + line_count: line_count(&content), + has_error: root.has_error(), + root_range: root.range().into(), + }); + extract_python_file(file_id, &content, &tree, &mut index); + } + + Ok(index) + } +} + +fn 
collect_python_files(dir: &Path, out: &mut Vec) -> Result<(), IndexError> { + let entries = fs::read_dir(dir).map_err(|source| IndexError::Io { + path: dir.to_path_buf(), + source, + })?; + for entry in entries { + let entry = entry.map_err(|source| IndexError::Io { + path: dir.to_path_buf(), + source, + })?; + let path = entry.path(); + let file_type = entry.file_type().map_err(|source| IndexError::Io { + path: path.clone(), + source, + })?; + if file_type.is_dir() { + if should_skip_dir(&path) { + continue; + } + collect_python_files(&path, out)?; + } else if file_type.is_file() && path.extension().and_then(|ext| ext.to_str()) == Some("py") + { + out.push(path); + } + } + Ok(()) +} + +fn should_skip_dir(path: &Path) -> bool { + matches!( + path.file_name().and_then(|name| name.to_str()), + Some( + ".git" | ".hg" | ".svn" | ".venv" | "venv" | "__pycache__" | "node_modules" | "target" + ) + ) +} + +fn extract_python_file(file_id: u32, source: &str, tree: &Tree, index: &mut PythonIndex) { + let root = tree.root_node(); + let mut cursor = root.walk(); + for child in root.named_children(&mut cursor) { + extract_top_level_node(file_id, source, child, index); + } +} + +fn extract_top_level_node(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { + match node.kind() { + "class_definition" => push_symbol(file_id, source, node, SymbolKind::Class, index), + "function_definition" => push_symbol(file_id, source, node, SymbolKind::Function, index), + "decorated_definition" => { + if let Some(definition) = + first_child_of_kind(node, &["class_definition", "function_definition"]) + { + let kind = if definition.kind() == "class_definition" { + SymbolKind::Class + } else { + SymbolKind::Function + }; + push_symbol_with_range(file_id, source, definition, node.range(), kind, index); + } + } + "import_statement" => push_import_statement(file_id, source, node, index), + "import_from_statement" | "future_import_statement" => { + push_from_import_statement(file_id, 
source, node, index) + } + _ => {} + } +} + +fn push_symbol( + file_id: u32, + source: &str, + node: Node<'_>, + kind: SymbolKind, + index: &mut PythonIndex, +) { + push_symbol_with_range(file_id, source, node, node.range(), kind, index); +} + +fn push_symbol_with_range( + file_id: u32, + source: &str, + node: Node<'_>, + declaration_range: Range, + kind: SymbolKind, + index: &mut PythonIndex, +) { + let Some(name_node) = node.child_by_field_name("name") else { + return; + }; + let Ok(name) = name_node.utf8_text(source.as_bytes()) else { + return; + }; + index.symbols.push(SymbolRecord { + id: index.symbols.len() as u32, + file_id, + name: name.to_owned(), + kind, + range: declaration_range.into(), + name_range: name_node.range().into(), + }); +} + +fn push_import_statement(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { + let text = node_text(source, node); + let imports = text + .trim_start_matches("import") + .split(',') + .map(str::trim) + .filter(|part| !part.is_empty()); + + for import in imports { + let (name, alias) = split_alias(import); + index.imports.push(ImportRecord { + id: index.imports.len() as u32, + file_id, + kind: ImportKind::Import, + module: None, + name: Some(name.to_owned()), + alias: alias.map(str::to_owned), + range: node.range().into(), + }); + } +} + +fn push_from_import_statement(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { + let text = node_text(source, node); + let stripped = text.trim(); + let kind = if node.kind() == "future_import_statement" { + ImportKind::FutureImport + } else { + ImportKind::FromImport + }; + let Some(after_from) = stripped.strip_prefix("from ") else { + return; + }; + let Some((module, names)) = after_from.split_once(" import ") else { + return; + }; + + for import in names + .split(',') + .map(str::trim) + .filter(|part| !part.is_empty()) + { + let (name, alias) = split_alias(import); + index.imports.push(ImportRecord { + id: index.imports.len() as u32, + 
file_id, + kind, + module: Some(module.trim().to_owned()), + name: Some(name.to_owned()), + alias: alias.map(str::to_owned), + range: node.range().into(), + }); + } +} + +fn first_child_of_kind<'tree>(node: Node<'tree>, kinds: &[&str]) -> Option> { + let mut cursor = node.walk(); + let child = node + .named_children(&mut cursor) + .find(|child| kinds.iter().any(|kind| child.kind() == *kind)); + child +} + +fn split_alias(import: &str) -> (&str, Option<&str>) { + if let Some((name, alias)) = import.split_once(" as ") { + (name.trim(), Some(alias.trim())) + } else { + (import.trim(), None) + } +} + +fn node_text<'source>(source: &'source str, node: Node<'_>) -> &'source str { + &source[node.start_byte()..node.end_byte()] +} + +fn line_count(source: &str) -> usize { + if source.is_empty() { + 0 + } else { + source + .as_bytes() + .iter() + .filter(|byte| **byte == b'\n') + .count() + + usize::from(!source.ends_with('\n')) + } +} + #[cfg(test)] mod tests { use super::*; + use std::fs; + use std::time::{SystemTime, UNIX_EPOCH}; #[test] - fn debug_info_reports_version_and_skeleton_feature() { + fn debug_info_reports_version_and_python_index_feature() { let info = Engine::new().debug_info(); assert_eq!(info.version(), env!("CARGO_PKG_VERSION")); - assert_eq!(info.enabled_features(), ["skeleton"]); + assert_eq!(info.enabled_features(), ["skeleton", "python-index"]); + } + + #[test] + fn indexes_python_files_without_materializing_python_objects() { + let repo = temp_repo_path("index-python"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write( + repo.join("pkg/mod.py"), + "from __future__ import annotations\nfrom .base import Base as RenamedBase\nimport os, sys as system\n\n@decorator\nclass Service(RenamedBase):\n pass\n\ndef helper(value):\n return value\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + assert_eq!(index.summary().files, 1); + assert_eq!(index.summary().classes, 1); + 
assert_eq!(index.summary().functions, 1); + assert_eq!(index.summary().imports, 4); + assert_eq!(index.symbols[0].name, "Service"); + assert_eq!(index.symbols[1].name, "helper"); + assert!(index + .imports + .iter() + .any(|import| import.module.as_deref() == Some(".base"))); + assert!(index + .imports + .iter() + .any(|import| import.alias.as_deref() == Some("system"))); + } + + fn temp_repo_path(prefix: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("graph-sitter-{prefix}-{nanos}")) } } diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index 2b992a341..4ae86fa6a 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -114,7 +114,10 @@ mod bindings { let info = py_debug_info(); assert_eq!(info.version, graph_sitter_engine::engine_version()); - assert_eq!(info.enabled_features, vec!["skeleton".to_owned()]); + assert_eq!( + info.enabled_features, + vec!["skeleton".to_owned(), "python-index".to_owned()] + ); } } } @@ -126,6 +129,6 @@ mod tests { #[test] fn forwards_core_engine_metadata_without_python_linking() { assert_eq!(engine_version(), graph_sitter_engine::engine_version()); - assert_eq!(enabled_features(), ["skeleton"]); + assert_eq!(enabled_features(), ["skeleton", "python-index"]); } } diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 1e1c32396..d44e73fb2 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -64,6 +64,31 @@ uv run python rust-rewrite/tools/measure_python_backend.py /path/to/repo --langu --disable-graph --output /tmp/python-backend-parse-only.json ``` +`rust-rewrite/tools/compare_rust_python_index.py` compares that current Python backend path with the Rust compact Python indexer. It builds the Rust release example once, generates or accepts a repo, and samples the Rust indexer process RSS. 
+ +Generated fixture comparison: + +```bash +uv run python rust-rewrite/tools/compare_rust_python_index.py \ + --fixture-files 150 --fixture-functions 20 \ + --output /tmp/graph-sitter-rust-compare.json +``` + +Current repo comparison: + +```bash +uv run python rust-rewrite/tools/compare_rust_python_index.py . \ + --output /tmp/graph-sitter-rust-compare-repo.json +``` + +Compare against the current full Python graph instead of parse/object materialization only: + +```bash +uv run python rust-rewrite/tools/compare_rust_python_index.py . \ + --python-full-graph \ + --output /tmp/graph-sitter-rust-compare-repo-full.json +``` + ## Metrics The JSON report includes: @@ -94,6 +119,28 @@ Use pinned commits and record hardware, Python version, OS, and command line fro For each real repo, capture both default graph mode and `--disable-graph` parse-only mode. The delta approximates resolution/dependency graph cost. +## Initial Rust Index Evidence + +These measurements are for the first Rust vertical slice only: repo walk, tree-sitter Python parsing, top-level class/function extraction, and import extraction into compact Rust records. This is not yet full `Codebase` API parity and does not yet include dependency graph resolution. + +Commands were run on this branch on 2026-06-18. 
+ +| Input | Python mode | Python wall | Python max RSS | Rust index wall | Rust process wall | Rust sampled RSS | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| Generated fixture, 150 modules x 20 helpers | `--disable-graph` | 0.460s | 166.3 MB | 0.047s | 0.281s | 3.3 MB | 9.875x | 50.918x | +| Generated fixture, 150 modules x 20 helpers | full graph | 1.147s | 208.5 MB | 0.038s | 0.051s | 3.1 MB | 30.502x | 66.380x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.874s | 531.9 MB | 0.317s | 0.333s | 7.6 MB | 9.069x | 70.045x | +| `graph-sitter` repo checkout | full graph | 7.448s | 788.8 MB | 0.331s | 0.342s | 7.6 MB | 22.480x | 103.877x | + +The most conservative current-repo comparison is parse/object materialization only: Rust is about 9x faster and about 70x lower RSS for the implemented compact-index slice. Against today's full graph construction on this repo, Rust is about 22x faster and about 104x lower RSS for the same implemented slice. + +Important caveats: + +- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions, and imports. +- The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. +- The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. +- The generated fixture and this repo are useful proof points, but the huge-repo target still needs canonical pinned baselines. + ## Open Questions - Which exact small, medium, and huge repositories should become canonical Phase 0 baselines? diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 3075fcc71..91cbb48f5 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -119,7 +119,8 @@ Recommended task format: ## Phase 0: Baseline, RFC, And Contracts - [x] Add memory benchmark harness for current Python backend. 
owner: Poincare. Result: added `rust-rewrite/tools/measure_python_backend.py`. -- [ ] Measure cold parse RSS and wall time for representative repos. +- [x] Measure initial cold parse RSS and wall time for generated fixture and this repo. owner: codex. Result: recorded in `rust-rewrite/benchmarks.md`. +- [ ] Measure cold parse RSS and wall time for canonical small, medium, and huge repos. - [ ] Measure graph node/edge counts, Python object counts, and per-phase allocation peaks. - [x] Document the exact current build phases with timings: file enumeration, parse, directory tree, config parse, import resolution, export resolution, dependency recompute. owner: Poincare. Result: added phase map in `rust-rewrite/benchmarks.md`; representative repo timings remain open. - [x] Inventory all public `Codebase` properties and methods. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. @@ -138,18 +139,22 @@ Recommended task format: - [ ] Add Rust engine facade object that can be constructed from `CodebaseContext`. - [x] Add a minimal debug API returning engine version and enabled features. owner: Beauvoir. Result: added Rust `Engine::debug_info` and feature-gated PyO3 bindings. - [ ] Add CI job that builds the Rust extension on supported Python versions. -- [ ] Add benchmark command that can select `--backend python|rust`. +- [x] Add benchmark command comparing Python backend with Rust compact indexer. owner: codex. Result: added `rust-rewrite/tools/compare_rust_python_index.py`. +- [ ] Add benchmark command that can select full `Codebase` `--backend python|rust` once Rust backend is wired into Python. ## Phase 2: Parser And Compact Index Vertical Slice - [x] Specify parser/index vertical slice and extraction rules. owner: Meitner. Result: documented in `rust-rewrite/parser-index.md`. +- [x] Implement standalone Rust Python file discovery for the first compact-index slice. owner: codex. Result: recursive repo walk with common generated/cache directory skips. 
- [ ] Implement Rust file discovery input format from Python repo operator. -- [ ] Implement tree-sitter parser setup for Python. +- [x] Implement tree-sitter parser setup for Python. owner: codex. Result: `graph-sitter-engine` uses `tree-sitter-python` and indexes Python files. - [ ] Implement tree-sitter parser setup for TypeScript/TSX. - [ ] Extract file records with path, language, content hash, and root ranges. -- [ ] Extract top-level Python classes, functions, and globals. +- [x] Extract file records with path, byte length, line count, error status, and root ranges for Python. owner: codex. +- [x] Extract top-level Python classes and functions. owner: codex. Result: compact `SymbolRecord` extraction for class/function definitions and decorated definitions. +- [ ] Extract top-level Python globals. - [ ] Extract top-level TypeScript classes, functions, interfaces, type aliases, enums, and globals. -- [ ] Extract imports for Python. +- [x] Extract imports for Python. owner: codex. Result: compact `ImportRecord` extraction for `import`, `from`, and future imports. - [ ] Extract imports and exports for TypeScript. - [ ] Build path and string interners. - [ ] Expose `files`, `symbols`, `classes`, `functions`, `imports`, and `exports` ID queries through PyO3. @@ -225,3 +230,4 @@ Recommended task format: - [x] 2026-06-18: Integrator created seven worktrees and spawned six helper agents; PyO3 compatibility was queued due to agent concurrency limit. owner: codex. - [x] 2026-06-18: Six completed helper branches reviewed and their artifacts staged for integration. owner: codex. Notes: PyO3 compatibility agent is now running as Wegener. - [x] 2026-06-18: PyO3 compatibility helper completed and its planning artifact was staged for integration. owner: codex. +- [x] 2026-06-18: Implemented first Rust Python compact-index slice and benchmark comparison; initial measurements show 9x-22x wall-time improvement and 70x-104x RSS improvement on this repo for the implemented slice. 
owner: codex. diff --git a/rust-rewrite/tools/compare_rust_python_index.py b/rust-rewrite/tools/compare_rust_python_index.py new file mode 100644 index 000000000..501f0b641 --- /dev/null +++ b/rust-rewrite/tools/compare_rust_python_index.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +import platform +import subprocess +import sys +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +TOOLS_DIR = Path(__file__).resolve().parent +REPO_ROOT = TOOLS_DIR.parents[1] +if str(TOOLS_DIR) not in sys.path: + sys.path.insert(0, str(TOOLS_DIR)) + +from measure_python_backend import bytes_to_mb, create_python_fixture # noqa: E402 + + +@dataclass +class SampledProcess: + command: list[str] + wall_seconds: float + rss_peak_mb: float + stdout: str + stderr: str + + +def sample_process(command: list[str], *, cwd: Path, sample_interval: float) -> SampledProcess: + import psutil + + start = time.perf_counter() + process = subprocess.Popen(command, cwd=cwd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + ps_process = psutil.Process(process.pid) + rss_peak = 0 + while process.poll() is None: + try: + rss_peak = max(rss_peak, int(ps_process.memory_info().rss)) + except psutil.NoSuchProcess: + break + time.sleep(sample_interval) + stdout, stderr = process.communicate() + try: + rss_peak = max(rss_peak, int(ps_process.memory_info().rss)) + except psutil.NoSuchProcess: + pass + wall = time.perf_counter() - start + if process.returncode != 0: + msg = f"command failed with exit {process.returncode}: {' '.join(command)}\n{stderr}" + raise RuntimeError(msg) + return SampledProcess( + command=command, + wall_seconds=wall, + rss_peak_mb=round(bytes_to_mb(rss_peak), 3), + stdout=stdout, + stderr=stderr, + ) + + +def run_json(command: list[str], *, cwd: Path) -> dict[str, Any]: + result = subprocess.run(command, cwd=cwd, check=True, 
capture_output=True, text=True) + return parse_json_output(result.stdout) + + +def parse_json_output(output: str) -> dict[str, Any]: + start = output.find("{") + end = output.rfind("}") + if start == -1 or end == -1 or end < start: + msg = f"command did not emit JSON output:\n{output}" + raise ValueError(msg) + return json.loads(output[start : end + 1]) + + +def rust_example_path() -> Path: + exe = "index_python.exe" if os.name == "nt" else "index_python" + return REPO_ROOT / "target" / "release" / "examples" / exe + + +def build_rust_example() -> None: + subprocess.run( + ["cargo", "build", "--release", "-p", "graph-sitter-engine", "--example", "index_python"], + cwd=REPO_ROOT, + check=True, + ) + + +def run_python_backend(repo_path: Path, *, disable_graph: bool) -> dict[str, Any]: + command = [ + sys.executable, + str(TOOLS_DIR / "measure_python_backend.py"), + str(repo_path), + "--language", + "python", + "--skip-object-counts", + "--json", + ] + if disable_graph: + command.append("--disable-graph") + return run_json(command, cwd=REPO_ROOT) + + +def run_rust_index(repo_path: Path, *, sample_interval: float) -> dict[str, Any]: + command = [str(rust_example_path()), str(repo_path), "--json"] + sampled = sample_process(command, cwd=REPO_ROOT, sample_interval=sample_interval) + report = parse_json_output(sampled.stdout) + report["process"] = { + "command": " ".join(command), + "wall_seconds": round(sampled.wall_seconds, 6), + "rss_peak_mb": sampled.rss_peak_mb, + } + return report + + +def ratio(numerator: float, denominator: float) -> float | None: + if denominator <= 0: + return None + return round(numerator / denominator, 3) + + +def make_report(args: argparse.Namespace) -> dict[str, Any]: + temp_dir: tempfile.TemporaryDirectory[str] | None = None + if args.repo is None: + temp_dir = tempfile.TemporaryDirectory(prefix="graph-sitter-rust-compare-") + repo_path = create_python_fixture(Path(temp_dir.name), args.fixture_files, args.fixture_functions) + 
generated_fixture = True + else: + repo_path = Path(args.repo).expanduser().resolve() + generated_fixture = False + + try: + if not args.skip_build: + build_rust_example() + python_report = run_python_backend(repo_path, disable_graph=args.python_disable_graph) + rust_report = run_rust_index(repo_path, sample_interval=args.sample_interval) + finally: + if temp_dir is not None: + temp_dir.cleanup() + + python_totals = python_report["totals"] + rust_process = rust_report["process"] + comparison = { + "python_to_rust_wall_ratio": ratio(python_totals["wall_seconds"], rust_report["wall_seconds"]), + "python_to_rust_process_wall_ratio": ratio(python_totals["wall_seconds"], rust_process["wall_seconds"]), + "python_to_rust_peak_rss_ratio": ratio(python_totals["max_rss_mb"], rust_process["rss_peak_mb"]), + "python_wall_seconds": python_totals["wall_seconds"], + "rust_index_wall_seconds": round(rust_report["wall_seconds"], 6), + "rust_process_wall_seconds": rust_process["wall_seconds"], + "python_max_rss_mb": python_totals["max_rss_mb"], + "rust_sampled_rss_peak_mb": rust_process["rss_peak_mb"], + } + return { + "metadata": { + "repo_path": str(repo_path), + "generated_fixture": generated_fixture, + "fixture_files": args.fixture_files if generated_fixture else None, + "fixture_functions": args.fixture_functions if generated_fixture else None, + "python_disable_graph": args.python_disable_graph, + "python": sys.version, + "platform": platform.platform(), + "sample_interval_seconds": args.sample_interval, + }, + "comparison": comparison, + "python_backend": python_report, + "rust_index": rust_report, + } + + +def print_human(report: dict[str, Any]) -> None: + metadata = report["metadata"] + comparison = report["comparison"] + python_graph = report["python_backend"]["graph"] + rust_summary = report["rust_index"]["summary"] + print(f"repo: {metadata['repo_path']}") + print(f"python disable_graph: {metadata['python_disable_graph']}") + print( + "python backend: " + 
f"wall={comparison['python_wall_seconds']:.3f}s " + f"max_rss={comparison['python_max_rss_mb']:.1f} MB " + f"nodes={python_graph['nodes']} edges={python_graph['edges']} file_nodes={python_graph['source_file_nodes_total']}" + ) + print( + "rust index: " + f"wall={comparison['rust_index_wall_seconds']:.3f}s " + f"process_wall={comparison['rust_process_wall_seconds']:.3f}s " + f"rss_peak={comparison['rust_sampled_rss_peak_mb']:.1f} MB " + f"files={rust_summary['files']} symbols={rust_summary['symbols']} imports={rust_summary['imports']}" + ) + print( + "ratios: " + f"wall={comparison['python_to_rust_wall_ratio']}x " + f"process_wall={comparison['python_to_rust_process_wall_ratio']}x " + f"rss={comparison['python_to_rust_peak_rss_ratio']}x" + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Compare current Python backend parse/index cost with the Rust compact Python indexer.") + parser.add_argument("repo", nargs="?", help="Path to a git repository. 
If omitted, a generated Python fixture is used.") + parser.add_argument("--fixture-files", type=int, default=150, help="Generated fixture module count when repo is omitted.") + parser.add_argument("--fixture-functions", type=int, default=20, help="Generated helper functions per module when repo is omitted.") + parser.add_argument("--sample-interval", type=float, default=0.005, help="RSS sampling interval for the Rust process.") + parser.add_argument("--skip-build", action="store_true", help="Do not build the Rust example before running it.") + parser.add_argument( + "--python-full-graph", + action="store_false", + dest="python_disable_graph", + help="Compare against the full Python graph instead of parse/object materialization only.", + ) + parser.add_argument("--output", type=Path, help="Optional path to write JSON report.") + parser.add_argument("--json", action="store_true", help="Print JSON report instead of a human summary.") + parser.set_defaults(python_disable_graph=True) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + report = make_report(args) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(report, indent=2, sort_keys=True) + "\n", encoding="utf-8") + if args.json: + print(json.dumps(report, indent=2, sort_keys=True)) + else: + print_human(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From dce2daebc79989ecd2df15da12b2ceb8ad9b43e5 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:31:56 -0700 Subject: [PATCH 007/228] Expose Rust Python index through PyO3 --- Cargo.lock | 1 + crates/graph-sitter-py/Cargo.toml | 1 + crates/graph-sitter-py/src/lib.rs | 175 +++++++++++++++++++++++++++++- rust-rewrite/engine-skeleton.md | 10 ++ rust-rewrite/python-compat.md | 8 ++ rust-rewrite/strategy.md | 4 +- 6 files changed, 197 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b24474d2b..1dcf547bd 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -61,6 +61,7 @@ version = "0.1.0" dependencies = [ "graph-sitter-engine", "pyo3", + "serde_json", ] [[package]] diff --git a/crates/graph-sitter-py/Cargo.toml b/crates/graph-sitter-py/Cargo.toml index f70c7e676..e3239d213 100644 --- a/crates/graph-sitter-py/Cargo.toml +++ b/crates/graph-sitter-py/Cargo.toml @@ -19,3 +19,4 @@ extension-module = ["pyo3-bindings", "pyo3/extension-module"] [dependencies] graph-sitter-engine.workspace = true pyo3 = { workspace = true, optional = true } +serde_json.workspace = true diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index 4ae86fa6a..582bc32eb 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -10,8 +10,10 @@ pub fn enabled_features() -> &'static [&'static str] { #[cfg(feature = "pyo3-bindings")] mod bindings { - use graph_sitter_engine::{self, Engine, EngineInfo}; + use graph_sitter_engine::{self, Engine, EngineInfo, IndexSummary, PythonIndex}; + use pyo3::exceptions::{PyRuntimeError, PyValueError}; use pyo3::prelude::*; + use std::path::Path; #[pyclass(name = "EngineInfo", module = "graph_sitter_py")] #[derive(Debug, Clone, PartialEq, Eq)] @@ -53,6 +55,119 @@ mod bindings { } } + #[pyclass(name = "IndexSummary", module = "graph_sitter_py")] + #[derive(Debug, Clone, PartialEq, Eq)] + pub struct PyIndexSummary { + #[pyo3(get)] + files: usize, + #[pyo3(get)] + symbols: usize, + #[pyo3(get)] + classes: usize, + #[pyo3(get)] + functions: usize, + #[pyo3(get)] + imports: usize, + #[pyo3(get)] + bytes: usize, + #[pyo3(get)] + lines: usize, + #[pyo3(get)] + files_with_errors: usize, + } + + impl From for PyIndexSummary { + fn from(summary: IndexSummary) -> Self { + Self { + files: summary.files, + symbols: summary.symbols, + classes: summary.classes, + functions: summary.functions, + imports: summary.imports, + bytes: summary.bytes, + lines: summary.lines, + files_with_errors: summary.files_with_errors, + } + } + } + + 
#[pymethods] + impl PyIndexSummary { + fn as_dict(&self) -> std::collections::BTreeMap<&'static str, usize> { + std::collections::BTreeMap::from([ + ("files", self.files), + ("symbols", self.symbols), + ("classes", self.classes), + ("functions", self.functions), + ("imports", self.imports), + ("bytes", self.bytes), + ("lines", self.lines), + ("files_with_errors", self.files_with_errors), + ]) + } + + fn __repr__(&self) -> String { + format!( + "IndexSummary(files={}, symbols={}, classes={}, functions={}, imports={}, bytes={}, lines={}, files_with_errors={})", + self.files, + self.symbols, + self.classes, + self.functions, + self.imports, + self.bytes, + self.lines, + self.files_with_errors + ) + } + } + + #[pyclass(name = "PythonIndex", module = "graph_sitter_py")] + #[derive(Debug, Clone, PartialEq, Eq)] + pub struct PyPythonIndex { + inner: PythonIndex, + } + + impl From for PyPythonIndex { + fn from(inner: PythonIndex) -> Self { + Self { inner } + } + } + + #[pymethods] + impl PyPythonIndex { + fn summary(&self) -> PyIndexSummary { + self.inner.summary().into() + } + + fn to_json(&self) -> PyResult { + serde_json::to_string(&self.inner) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + + #[getter] + fn file_count(&self) -> usize { + self.inner.files.len() + } + + #[getter] + fn symbol_count(&self) -> usize { + self.inner.symbols.len() + } + + #[getter] + fn import_count(&self) -> usize { + self.inner.imports.len() + } + + fn __repr__(&self) -> String { + let summary = self.inner.summary(); + format!( + "PythonIndex(files={}, symbols={}, imports={})", + summary.files, summary.symbols, summary.imports + ) + } + } + #[pyclass(name = "Engine", module = "graph_sitter_py")] #[derive(Debug, Default, Clone)] pub struct PyEngine { @@ -84,6 +199,10 @@ mod bindings { fn debug_info(&self) -> PyEngineInfo { self.inner.debug_info().into() } + + fn index_python_path(&self, repo_path: &str) -> PyResult { + index_python_path_impl(repo_path) + } } 
#[pyfunction(name = "engine_version")] @@ -96,18 +215,41 @@ mod bindings { graph_sitter_engine::debug_info().into() } + #[pyfunction(name = "index_python_path")] + fn py_index_python_path(repo_path: &str) -> PyResult { + index_python_path_impl(repo_path) + } + + fn index_python_path_impl(repo_path: &str) -> PyResult { + let path = Path::new(repo_path); + if !path.exists() { + return Err(PyValueError::new_err(format!( + "repo path does not exist: {repo_path}" + ))); + } + graph_sitter_engine::index_python_path(path) + .map(PyPythonIndex::from) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + #[pymodule] fn graph_sitter_py(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add_function(wrap_pyfunction!(py_engine_version, m)?)?; m.add_function(wrap_pyfunction!(py_debug_info, m)?)?; + m.add_function(wrap_pyfunction!(py_index_python_path, m)?)?; Ok(()) } #[cfg(test)] mod tests { use super::*; + use std::fs; + use std::path::PathBuf; + use std::time::{SystemTime, UNIX_EPOCH}; #[test] fn debug_info_forwards_core_engine_metadata() { @@ -119,6 +261,37 @@ mod bindings { vec!["skeleton".to_owned(), "python-index".to_owned()] ); } + + #[test] + fn py_engine_indexes_python_path() { + let repo = temp_repo_path("py-binding-index"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write( + repo.join("pkg/mod.py"), + "import os\n\nclass Service:\n pass\n\ndef helper():\n return os.getcwd()\n", + ) + .unwrap(); + + let index = PyEngine::new() + .index_python_path(repo.to_str().unwrap()) + .unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let summary = index.summary(); + assert_eq!(summary.files, 1); + assert_eq!(summary.classes, 1); + assert_eq!(summary.functions, 1); + assert_eq!(summary.imports, 1); + assert!(index.to_json().unwrap().contains("\"Service\"")); + } + + fn temp_repo_path(prefix: &str) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() 
+ .as_nanos(); + std::env::temp_dir().join(format!("graph-sitter-{prefix}-{nanos}")) + } } } diff --git a/rust-rewrite/engine-skeleton.md b/rust-rewrite/engine-skeleton.md index edfe5f255..479dfe9bc 100644 --- a/rust-rewrite/engine-skeleton.md +++ b/rust-rewrite/engine-skeleton.md @@ -19,6 +19,16 @@ The PyO3 crate intentionally does not enable PyO3 by default so normal `cargo te cargo build -p graph-sitter-py --features extension-module ``` +On macOS, local extension smoke tests currently need PyO3 pointed at the active Python interpreter and dynamic lookup linker flags: + +```sh +PYO3_PYTHON="$(uv run python -c 'import sys; print(sys.executable)')" \ +RUSTFLAGS="-C link-arg=-undefined -C link-arg=dynamic_lookup" \ +cargo build --release -p graph-sitter-py --features extension-module +``` + +The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, and `index_python_path`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index. + ## Integration Choice This skeleton does not alter the Hatch/Cython Python packaging path. The current `hatch.toml` custom hook is disabled by default, so wiring Rust into wheels should be a separate packaging/CI task after the backend facade and import smoke test are defined. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 46982e91d..264d23e77 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -113,6 +113,14 @@ class Engine: def source_slice(file_id: int, start_byte: int, end_byte: int) -> str: ... ``` +Current implemented bridge status: + +- `crates/graph-sitter-py` builds a PyO3 module named `graph_sitter_py` behind the `extension-module` feature. +- `Engine.index_python_path(repo_path)` and module-level `index_python_path(repo_path)` return a compact `PythonIndex` for Python files. 
+- `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, import, byte, line, and error counts. +- `PythonIndex.to_json()` serializes the compact Rust records for debug and benchmark use. +- This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet return stable Python compatibility handles. + Rust can keep typed IDs internally. Python needs a compatibility `node_id: int`, so `RustGraphBackend` should maintain a per-context mapping between Python node IDs and typed Rust refs: - `python_node_id -> ObjectRef(kind, rust_id)` diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 91cbb48f5..3f087225b 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -134,7 +134,7 @@ Recommended task format: ## Phase 1: Rust Engine Skeleton - [x] Add Rust workspace/crate skeleton without changing default behavior. owner: Beauvoir. Result: added standalone Cargo workspace under `crates/`. -- [ ] Add PyO3 module import smoke test. +- [x] Add PyO3 module import smoke test. owner: codex. Result: built the extension module and imported it from Python, then indexed this repo through `index_python_path`. - [ ] Add `graph_backend` config flag with default `python`. - [ ] Add Rust engine facade object that can be constructed from `CodebaseContext`. - [x] Add a minimal debug API returning engine version and enabled features. owner: Beauvoir. Result: added Rust `Engine::debug_info` and feature-gated PyO3 bindings. @@ -157,6 +157,7 @@ Recommended task format: - [x] Extract imports for Python. owner: codex. Result: compact `ImportRecord` extraction for `import`, `from`, and future imports. - [ ] Extract imports and exports for TypeScript. - [ ] Build path and string interners. +- [x] Expose compact Python index summary and JSON through PyO3. owner: codex. 
Result: added `PythonIndex`, `IndexSummary`, `Engine.index_python_path`, and module-level `index_python_path`. - [ ] Expose `files`, `symbols`, `classes`, `functions`, `imports`, and `exports` ID queries through PyO3. - [ ] Add golden snapshots for compact IR on small Python fixtures. - [ ] Add golden snapshots for compact IR on small TypeScript fixtures. @@ -231,3 +232,4 @@ Recommended task format: - [x] 2026-06-18: Six completed helper branches reviewed and their artifacts staged for integration. owner: codex. Notes: PyO3 compatibility agent is now running as Wegener. - [x] 2026-06-18: PyO3 compatibility helper completed and its planning artifact was staged for integration. owner: codex. - [x] 2026-06-18: Implemented first Rust Python compact-index slice and benchmark comparison; initial measurements show 9x-22x wall-time improvement and 70x-104x RSS improvement on this repo for the implemented slice. owner: codex. +- [x] 2026-06-18: Exposed the compact Python index through the PyO3 module and verified a Python import smoke against this repo. owner: codex. Notes: extension returned 1127 files, 3117 symbols, and 6414 imports for the current checkout. 
From 9d9898537716127c220cfc0605511c33dc42a97e Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:40:05 -0700 Subject: [PATCH 008/228] Wire Rust compact index into Python context --- crates/graph-sitter-engine/src/lib.rs | 75 +++++++++++- crates/graph-sitter-py/src/lib.rs | 48 ++++++++ rust-rewrite/benchmarks.md | 22 ++++ rust-rewrite/engine-skeleton.md | 2 +- rust-rewrite/python-compat.md | 3 + rust-rewrite/strategy.md | 12 +- rust-rewrite/tools/measure_rust_facade.py | 112 ++++++++++++++++++ src/graph_sitter/codebase/codebase_context.py | 52 +++++++- src/graph_sitter/codebase/rust_backend.py | 75 ++++++++++++ src/graph_sitter/configs/models/codebase.py | 15 ++- src/graph_sitter/core/codebase.py | 7 ++ tests/unit/sdk/codebase/test_rust_backend.py | 102 ++++++++++++++++ 12 files changed, 517 insertions(+), 8 deletions(-) create mode 100644 rust-rewrite/tools/measure_rust_facade.py create mode 100644 src/graph_sitter/codebase/rust_backend.py create mode 100644 tests/unit/sdk/codebase/test_rust_backend.py diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 30f9c708f..eea40cf50 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -51,6 +51,18 @@ impl Engine { ) -> Result { PythonIndexer::new()?.index_path(repo_path) } + + pub fn index_python_paths( + &self, + repo_path: impl AsRef, + file_paths: I, + ) -> Result + where + I: IntoIterator, + P: AsRef, + { + PythonIndexer::new()?.index_paths(repo_path, file_paths) + } } pub fn engine_version() -> &'static str { @@ -68,6 +80,17 @@ pub fn index_python_path(repo_path: impl AsRef) -> Result( + repo_path: impl AsRef, + file_paths: I, +) -> Result +where + I: IntoIterator, + P: AsRef, +{ + Engine::new().index_python_paths(repo_path, file_paths) +} + #[derive(Debug)] pub enum IndexError { Io { path: PathBuf, source: io::Error }, @@ -229,13 +252,45 @@ impl PythonIndexer { fn index_path(mut self, repo_path: impl AsRef) -> 
Result { let repo_path = repo_path.as_ref(); + let mut paths = Vec::new(); + collect_python_files(repo_path, &mut paths)?; + self.index_absolute_paths(repo_path, paths) + } + + fn index_paths( + mut self, + repo_path: impl AsRef, + file_paths: I, + ) -> Result + where + I: IntoIterator, + P: AsRef, + { + let repo_path = repo_path.as_ref(); + let paths = file_paths + .into_iter() + .map(|path| { + let path = path.as_ref(); + if path.is_absolute() { + path.to_path_buf() + } else { + repo_path.join(path) + } + }) + .collect(); + self.index_absolute_paths(repo_path, paths) + } + + fn index_absolute_paths( + &mut self, + repo_path: &Path, + mut paths: Vec, + ) -> Result { let mut index = PythonIndex { files: Vec::new(), symbols: Vec::new(), imports: Vec::new(), }; - let mut paths = Vec::new(); - collect_python_files(repo_path, &mut paths)?; paths.sort(); for path in paths { @@ -504,6 +559,22 @@ mod tests { .any(|import| import.alias.as_deref() == Some("system"))); } + #[test] + fn indexes_only_requested_python_paths() { + let repo = temp_repo_path("index-python-paths"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/included.py"), "class Included:\n pass\n").unwrap(); + fs::write(repo.join("pkg/skipped.py"), "class Skipped:\n pass\n").unwrap(); + + let index = index_python_paths(&repo, ["pkg/included.py"]).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + assert_eq!(index.summary().files, 1); + assert_eq!(index.files[0].path, "pkg/included.py"); + assert_eq!(index.summary().classes, 1); + assert_eq!(index.symbols[0].name, "Included"); + } + fn temp_repo_path(prefix: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index 582bc32eb..899d64cd6 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -203,6 +203,14 @@ mod bindings { fn index_python_path(&self, repo_path: &str) -> PyResult { 
index_python_path_impl(repo_path) } + + fn index_python_paths( + &self, + repo_path: &str, + file_paths: Vec, + ) -> PyResult { + index_python_paths_impl(repo_path, file_paths) + } } #[pyfunction(name = "engine_version")] @@ -220,6 +228,11 @@ mod bindings { index_python_path_impl(repo_path) } + #[pyfunction(name = "index_python_paths")] + fn py_index_python_paths(repo_path: &str, file_paths: Vec) -> PyResult { + index_python_paths_impl(repo_path, file_paths) + } + fn index_python_path_impl(repo_path: &str) -> PyResult { let path = Path::new(repo_path); if !path.exists() { @@ -232,6 +245,21 @@ mod bindings { .map_err(|error| PyRuntimeError::new_err(error.to_string())) } + fn index_python_paths_impl( + repo_path: &str, + file_paths: Vec, + ) -> PyResult { + let path = Path::new(repo_path); + if !path.exists() { + return Err(PyValueError::new_err(format!( + "repo path does not exist: {repo_path}" + ))); + } + graph_sitter_engine::index_python_paths(path, file_paths) + .map(PyPythonIndex::from) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + #[pymodule] fn graph_sitter_py(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; @@ -241,6 +269,7 @@ mod bindings { m.add_function(wrap_pyfunction!(py_engine_version, m)?)?; m.add_function(wrap_pyfunction!(py_debug_info, m)?)?; m.add_function(wrap_pyfunction!(py_index_python_path, m)?)?; + m.add_function(wrap_pyfunction!(py_index_python_paths, m)?)?; Ok(()) } @@ -285,6 +314,25 @@ mod bindings { assert!(index.to_json().unwrap().contains("\"Service\"")); } + #[test] + fn py_engine_indexes_selected_python_paths() { + let repo = temp_repo_path("py-binding-index-paths"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/included.py"), "class Included:\n pass\n").unwrap(); + fs::write(repo.join("pkg/skipped.py"), "class Skipped:\n pass\n").unwrap(); + + let index = PyEngine::new() + .index_python_paths(repo.to_str().unwrap(), vec!["pkg/included.py".to_owned()]) + .unwrap(); + 
fs::remove_dir_all(&repo).unwrap(); + + let summary = index.summary(); + assert_eq!(summary.files, 1); + assert_eq!(summary.classes, 1); + assert!(index.to_json().unwrap().contains("\"Included\"")); + assert!(!index.to_json().unwrap().contains("\"Skipped\"")); + } + fn temp_repo_path(prefix: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index d44e73fb2..b0d25d3a9 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -89,6 +89,15 @@ uv run python rust-rewrite/tools/compare_rust_python_index.py . \ --output /tmp/graph-sitter-rust-compare-repo-full.json ``` +`rust-rewrite/tools/measure_rust_facade.py` measures the Python-facing Rust compact-index facade. It expects the PyO3 extension module to be importable as `graph_sitter_py`. By default it discovers files through the same Python `RepoOperator.iter_files(...)` filters used by `CodebaseContext`, then passes that selected list into Rust: + +```bash +PYTHONPATH=/path/to/dir/containing/graph_sitter_py_extension \ + uv run python rust-rewrite/tools/measure_rust_facade.py . --json +``` + +Use `--raw-rust-walk` to measure Rust's standalone recursive walk instead of Python-selected file discovery. + ## Metrics The JSON report includes: @@ -134,9 +143,22 @@ Commands were run on this branch on 2026-06-18. The most conservative current-repo comparison is parse/object materialization only: Rust is about 9x faster and about 70x lower RSS for the implemented compact-index slice. Against today's full graph construction on this repo, Rust is about 22x faster and about 104x lower RSS for the same implemented slice. +## Python-Facing Rust Facade Evidence + +These measurements use the new Python shell integration path: Python discovers files with `RepoOperator.iter_files(...)`, the selected file list is passed to the PyO3 extension, and Rust builds the compact index. 
This includes Python interpreter/import overhead and is therefore a higher RSS number than the standalone Rust process, but it is the relevant measurement for an opt-in Python shell path. + +Commands were run on this branch on 2026-06-18 after adding selected-file PyO3 indexing. + +| Input | Python mode | Python wall | Python max RSS | Rust facade wall | Rust facade max RSS | Python files | Rust selected files | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| `graph-sitter` repo checkout | `--disable-graph` | 2.961s | 532.3 MB | 0.632s | 114.3 MB | 1129 | 1129 | 4.685x | 4.657x | + +This shell-facing number is intentionally more conservative than the standalone Rust process benchmark because it includes Python startup, imports, and repo file discovery. The important result is that the selected-file integration preserves Python file-discovery parity for the current repo while still cutting parse/index wall time and process max RSS substantially for the implemented compact-index slice. + Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions, and imports. +- The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. - The generated fixture and this repo are useful proof points, but the huge-repo target still needs canonical pinned baselines. 
diff --git a/rust-rewrite/engine-skeleton.md b/rust-rewrite/engine-skeleton.md index 479dfe9bc..eeef84317 100644 --- a/rust-rewrite/engine-skeleton.md +++ b/rust-rewrite/engine-skeleton.md @@ -27,7 +27,7 @@ RUSTFLAGS="-C link-arg=-undefined -C link-arg=dynamic_lookup" \ cargo build --release -p graph-sitter-py --features extension-module ``` -The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, and `index_python_path`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index. +The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, `index_python_path`, and `index_python_paths`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index at that commit. The Python shell integration now uses `index_python_paths` so Rust indexes the exact file list returned by `RepoOperator.iter_files(...)`. ## Integration Choice diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 264d23e77..0a9e92847 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -117,8 +117,11 @@ Current implemented bridge status: - `crates/graph-sitter-py` builds a PyO3 module named `graph_sitter_py` behind the `extension-module` feature. - `Engine.index_python_path(repo_path)` and module-level `index_python_path(repo_path)` return a compact `PythonIndex` for Python files. +- `Engine.index_python_paths(repo_path, file_paths)` and module-level `index_python_paths(repo_path, file_paths)` index an explicit Python file list. The Python shell integration uses this path so Rust sees the same `RepoOperator.iter_files(...)` selection as the current Python backend. - `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, import, byte, line, and error counts. 
- `PythonIndex.to_json()` serializes the compact Rust records for debug and benchmark use. +- `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. +- `Codebase.rust_index_summary` exposes the attached compact summary for shell smoke checks. - This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet return stable Python compatibility handles. Rust can keep typed IDs internally. Python needs a compatibility `node_id: int`, so `RustGraphBackend` should maintain a per-context mapping between Python node IDs and typed Rust refs: diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 3f087225b..441643f21 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -127,6 +127,8 @@ Recommended task format: - [x] Inventory all public `SourceFile`, `Symbol`, `Import`, `Export`, and `Directory` APIs used by tests/docs. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [x] Define P0 compatibility surface for the first Rust backend slice. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [ ] Define large-repo success targets for memory and time. +- [ ] Select pinned large Python repo commits for golden parity and latency benchmarks. Notes: Airflow is a good first candidate. +- [ ] Build golden reference/import/dependency graph snapshots for the pinned large Python repo commits. - [x] Draft compact Rust data model with module boundaries and Python integration points. owner: Pasteur. Result: documented in `rust-rewrite/data-model.md`. - [ ] Draft full Rust engine RFC with module boundaries and Python integration points. - [ ] Decide build tooling: `maturin`, setuptools-rust, or hatch custom hook. @@ -135,18 +137,20 @@ Recommended task format: - [x] Add Rust workspace/crate skeleton without changing default behavior. 
owner: Beauvoir. Result: added standalone Cargo workspace under `crates/`. - [x] Add PyO3 module import smoke test. owner: codex. Result: built the extension module and imported it from Python, then indexed this repo through `index_python_path`. -- [ ] Add `graph_backend` config flag with default `python`. -- [ ] Add Rust engine facade object that can be constructed from `CodebaseContext`. +- [x] Add `graph_backend` config flag with default `python`. owner: codex. Result: added `GraphBackend` and `RustFallbackMode` to `CodebaseConfig`. +- [x] Add compact Rust index facade that can be constructed from `CodebaseContext`. owner: codex. Result: `ctx.rust_index` builds through the optional PyO3 extension when `graph_backend` is `rust` or `auto`. +- [ ] Add full Rust engine facade object that can back existing `CodebaseContext` graph query APIs. - [x] Add a minimal debug API returning engine version and enabled features. owner: Beauvoir. Result: added Rust `Engine::debug_info` and feature-gated PyO3 bindings. - [ ] Add CI job that builds the Rust extension on supported Python versions. - [x] Add benchmark command comparing Python backend with Rust compact indexer. owner: codex. Result: added `rust-rewrite/tools/compare_rust_python_index.py`. +- [x] Add benchmark command for the Python-facing Rust facade. owner: codex. Result: added `rust-rewrite/tools/measure_rust_facade.py`. - [ ] Add benchmark command that can select full `Codebase` `--backend python|rust` once Rust backend is wired into Python. ## Phase 2: Parser And Compact Index Vertical Slice - [x] Specify parser/index vertical slice and extraction rules. owner: Meitner. Result: documented in `rust-rewrite/parser-index.md`. - [x] Implement standalone Rust Python file discovery for the first compact-index slice. owner: codex. Result: recursive repo walk with common generated/cache directory skips. -- [ ] Implement Rust file discovery input format from Python repo operator. 
+- [x] Implement Rust file discovery input format from Python repo operator. owner: codex. Result: added selected-file `index_python_paths` API and pass `RepoOperator.iter_files(...)` results from `CodebaseContext`. - [x] Implement tree-sitter parser setup for Python. owner: codex. Result: `graph-sitter-engine` uses `tree-sitter-python` and indexes Python files. - [ ] Implement tree-sitter parser setup for TypeScript/TSX. - [ ] Extract file records with path, language, content hash, and root ranges. @@ -210,6 +214,7 @@ Recommended task format: - [ ] Run full unit suite with Python backend. - [ ] Run full unit suite with Rust backend where supported. - [ ] Add large-repo memory regression benchmark to CI or nightly. +- [ ] Add pinned large-repo parity test for reference graph, import graph, dependency graph, and latency/RSS. - [ ] Add feature flag documentation. - [ ] Add migration notes for unsupported APIs. - [ ] Decide default backend criteria. @@ -233,3 +238,4 @@ Recommended task format: - [x] 2026-06-18: PyO3 compatibility helper completed and its planning artifact was staged for integration. owner: codex. - [x] 2026-06-18: Implemented first Rust Python compact-index slice and benchmark comparison; initial measurements show 9x-22x wall-time improvement and 70x-104x RSS improvement on this repo for the implemented slice. owner: codex. - [x] 2026-06-18: Exposed the compact Python index through the PyO3 module and verified a Python import smoke against this repo. owner: codex. Notes: extension returned 1127 files, 3117 symbols, and 6414 imports for the current checkout. +- [x] 2026-06-18: Added Python-shell Rust index integration behind `CodebaseConfig(graph_backend=...)`, selected-file PyO3 indexing from `RepoOperator`, and a facade benchmark. owner: codex. Notes: selected-file facade matched Python's 1129-file discovery and ran 4.7x faster with 4.7x lower process max RSS than Python parse/object materialization on this checkout. 
diff --git a/rust-rewrite/tools/measure_rust_facade.py b/rust-rewrite/tools/measure_rust_facade.py new file mode 100644 index 000000000..e467198d7 --- /dev/null +++ b/rust-rewrite/tools/measure_rust_facade.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import dataclasses +import json +import platform +import resource +import sys +import time +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +SRC_ROOT = REPO_ROOT / "src" +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from graph_sitter.codebase.codebase_context import GLOBAL_FILE_IGNORE_LIST, get_node_classes # noqa: E402 +from graph_sitter.codebase.config import ProjectConfig # noqa: E402 +from graph_sitter.codebase.rust_backend import RustIndexBackend # noqa: E402 +from graph_sitter.shared.enums.programming_language import ProgrammingLanguage # noqa: E402 + + +def bytes_to_mb(value: float) -> float: + return value / (1024 * 1024) + + +def max_rss_bytes() -> int: + rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + if sys.platform == "darwin": + return int(rss) + return int(rss * 1024) + + +def discover_python_files(repo: Path) -> tuple[Path, list[str]]: + project = ProjectConfig.from_path(str(repo), programming_language=ProgrammingLanguage.PYTHON) + node_classes = get_node_classes(ProgrammingLanguage.PYTHON) + extensions = node_classes.file_cls.get_extensions() + file_paths = [ + str(filepath) + for filepath, _ in project.repo_operator.iter_files( + subdirs=project.subdirectories, + extensions=extensions, + ignore_list=GLOBAL_FILE_IGNORE_LIST, + ) + ] + return Path(project.repo_operator.repo_path).resolve(), file_paths + + +def make_report(repo: Path, *, raw_rust_walk: bool) -> dict: + start = time.perf_counter() + if raw_rust_walk: + repo_root = repo + file_paths = None + else: + repo_root, file_paths = discover_python_files(repo) + backend = RustIndexBackend.build(repo_root, file_paths=file_paths) + 
wall = time.perf_counter() - start + return { + "metadata": { + "repo_path": str(repo), + "repo_root": str(repo_root), + "raw_rust_walk": raw_rust_walk, + "selected_file_count": None if file_paths is None else len(file_paths), + "python": sys.version, + "platform": platform.platform(), + "engine_version": backend.engine_version, + }, + "totals": { + "wall_seconds": round(wall, 6), + "max_rss_mb": round(bytes_to_mb(max_rss_bytes()), 3), + }, + "summary": dataclasses.asdict(backend.summary), + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Measure the Python-facing Rust compact index facade.") + parser.add_argument("repo", nargs="?", default=".", help="Path to the Python repository to index.") + parser.add_argument("--raw-rust-walk", action="store_true", help="Use Rust's recursive file walk instead of Python RepoOperator file discovery.") + parser.add_argument("--output", type=Path, help="Optional path to write JSON report.") + parser.add_argument("--json", action="store_true", help="Print JSON report instead of a human summary.") + return parser.parse_args() + + +def print_human(report: dict) -> None: + totals = report["totals"] + summary = report["summary"] + print(f"repo: {report['metadata']['repo_path']}") + print(f"repo root: {report['metadata']['repo_root']}") + print(f"engine: {report['metadata']['engine_version']}") + print(f"raw rust walk: {report['metadata']['raw_rust_walk']}") + print(f"selected files: {report['metadata']['selected_file_count']}") + print(f"rust facade: wall={totals['wall_seconds']:.3f}s max_rss={totals['max_rss_mb']:.1f} MB") + print(f"summary: files={summary['files']} symbols={summary['symbols']} imports={summary['imports']}") + + +def main() -> int: + args = parse_args() + report = make_report(Path(args.repo).expanduser().resolve(), raw_rust_walk=args.raw_rust_walk) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(report, 
indent=2, sort_keys=True) + "\n", encoding="utf-8") + if args.json: + print(json.dumps(report, indent=2, sort_keys=True)) + else: + print_human(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/graph_sitter/codebase/codebase_context.py b/src/graph_sitter/codebase/codebase_context.py index a52e7ac94..19c728ade 100644 --- a/src/graph_sitter/codebase/codebase_context.py +++ b/src/graph_sitter/codebase/codebase_context.py @@ -21,7 +21,7 @@ from graph_sitter.codebase.validation import get_edges, post_reset_validation from graph_sitter.compiled.sort import sort_editables from graph_sitter.compiled.utils import uncache_all -from graph_sitter.configs.models.codebase import CodebaseConfig, PinkMode +from graph_sitter.configs.models.codebase import CodebaseConfig, GraphBackend, PinkMode, RustFallbackMode from graph_sitter.configs.models.secrets import SecretsConfig from graph_sitter.core.autocommit import AutoCommit, commiter from graph_sitter.core.directory import Directory @@ -43,6 +43,7 @@ from graph_sitter.codebase.io.io import IO from graph_sitter.codebase.node_classes.node_classes import NodeClasses from graph_sitter.codebase.progress.progress import Progress + from graph_sitter.codebase.rust_backend import RustIndexBackend from graph_sitter.core.dataclasses.usage import Usage from graph_sitter.core.expressions import Expression from graph_sitter.core.external_module import ExternalModule @@ -125,6 +126,8 @@ class CodebaseContext: filepath_idx: dict[str, NodeId] _ext_module_idx: dict[str, NodeId] flags: Flags + rust_index: RustIndexBackend | None + rust_backend_error: str | None session_options: SessionOptions = SessionOptions() projects: list[ProjectConfig] unapplied_diffs: list[DiffLite] @@ -186,6 +189,8 @@ def __init__( self.dependency_manager = get_dependency_manager(context.programming_language, self) self.language_engine = get_language_engine(context.programming_language, self) self.programming_language = 
context.programming_language + self.rust_index = None + self.rust_backend_error = None # Raise warning if language is not supported if self.programming_language is ProgrammingLanguage.UNSUPPORTED or self.programming_language is ProgrammingLanguage.OTHER: @@ -199,6 +204,8 @@ def __init__( msg = "allow_external must be set to True when py_resolve_syspath is enabled" raise ValueError(msg) + self._build_rust_index_if_configured() + # Build the graph if not self.config.exp_lazy_graph and self.config.use_pink != PinkMode.ALL_FILES: self.build_graph(context.repo_operator) @@ -215,6 +222,49 @@ def __init__( def __repr__(self): return self.__class__.__name__ + def _build_rust_index_if_configured(self) -> None: + if self.config.graph_backend == GraphBackend.PYTHON: + return + + reason = self._rust_index_unsupported_reason() + if reason is not None: + self._handle_rust_backend_unavailable(reason) + return + + from graph_sitter.codebase.rust_backend import RustBackendUnavailableError, RustIndexBackend, RustIndexBuildError + + try: + file_paths = self._rust_index_file_paths() + self.rust_index = RustIndexBackend.build(self.repo_path, file_paths=file_paths) + except (RustBackendUnavailableError, RustIndexBuildError) as error: + self._handle_rust_backend_unavailable(str(error)) + + def _rust_index_file_paths(self) -> list[str]: + if self.config.disable_file_parse: + return [] + repo_operator = self.projects[0].repo_operator + return [ + str(filepath) + for filepath, _ in repo_operator.iter_files( + subdirs=self.projects[0].subdirectories, + extensions=self.extensions, + ignore_list=GLOBAL_FILE_IGNORE_LIST, + ) + ] + + def _rust_index_unsupported_reason(self) -> str | None: + if self.programming_language is not ProgrammingLanguage.PYTHON: + return "Rust compact index currently supports Python codebases only" + if self.config.use_pink == PinkMode.ALL_FILES: + return "Rust compact index cannot be combined with PinkMode.ALL_FILES" + return None + + def 
_handle_rust_backend_unavailable(self, reason: str) -> None: + self.rust_backend_error = reason + if self.config.graph_backend == GraphBackend.RUST and self.config.rust_fallback == RustFallbackMode.ERROR: + raise RuntimeError(reason) + logger.warning("Rust graph backend unavailable; using Python graph backend. Reason: %s", reason) + @cached_property def _graph(self) -> PyDiGraph[Importable, Edge]: if not self.__graph_ready: diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py new file mode 100644 index 000000000..1ddd1446b --- /dev/null +++ b/src/graph_sitter/codebase/rust_backend.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from dataclasses import dataclass +from importlib import import_module +from pathlib import Path +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Sequence + + +class RustBackendUnavailableError(RuntimeError): + """Raised when the optional Rust backend extension cannot be loaded.""" + + +class RustIndexBuildError(RuntimeError): + """Raised when the Rust backend extension loads but cannot index the repo.""" + + +@dataclass(frozen=True) +class RustIndexSummary: + files: int + symbols: int + classes: int + functions: int + imports: int + bytes: int + lines: int + files_with_errors: int + + @classmethod + def from_object(cls, summary: Any) -> RustIndexSummary: + if hasattr(summary, "as_dict"): + data = dict(summary.as_dict()) + elif isinstance(summary, dict): + data = summary + else: + data = {field: getattr(summary, field) for field in cls.__dataclass_fields__} + return cls(**{field: int(data[field]) for field in cls.__dataclass_fields__}) + + +@dataclass +class RustIndexBackend: + repo_path: Path + extension: Any + index: Any + summary: RustIndexSummary + + @classmethod + def build(cls, repo_path: str | Path, file_paths: Sequence[str] | None = None) -> RustIndexBackend: + path = Path(repo_path).resolve() + try: + extension = 
import_module("graph_sitter_py") + except ImportError as error: + message = "Rust graph backend extension `graph_sitter_py` is not installed" + raise RustBackendUnavailableError(message) from error + + try: + if file_paths is None: + index = extension.index_python_path(str(path)) + else: + index = extension.index_python_paths(str(path), list(file_paths)) + summary = RustIndexSummary.from_object(index.summary()) + except Exception as error: + message = f"Rust graph backend failed to index {path}" + raise RustIndexBuildError(message) from error + + return cls(repo_path=path, extension=extension, index=index, summary=summary) + + @property + def engine_version(self) -> str: + return str(self.extension.engine_version()) + + def to_json(self) -> str: + return str(self.index.to_json()) diff --git a/src/graph_sitter/configs/models/codebase.py b/src/graph_sitter/configs/models/codebase.py index 84c109739..f3a383740 100644 --- a/src/graph_sitter/configs/models/codebase.py +++ b/src/graph_sitter/configs/models/codebase.py @@ -1,4 +1,4 @@ -from enum import IntEnum, auto +from enum import IntEnum, StrEnum, auto from pydantic import Field @@ -14,6 +14,17 @@ class PinkMode(IntEnum): NON_SOURCE_FILES = auto() +class GraphBackend(StrEnum): + PYTHON = "python" + RUST = "rust" + AUTO = "auto" + + +class RustFallbackMode(StrEnum): + PYTHON = "python" + ERROR = "error" + + class CodebaseConfig(BaseConfig): def __init__(self, prefix: str = "CODEBASE", *args, **kwargs) -> None: super().__init__(prefix=prefix, *args, **kwargs) @@ -39,6 +50,8 @@ def __init__(self, prefix: str = "CODEBASE", *args, **kwargs) -> None: unpacking_assignment_partial_removal: bool = True conditional_type_resolution: bool = False use_pink: PinkMode = PinkMode.OFF + graph_backend: GraphBackend = GraphBackend.PYTHON + rust_fallback: RustFallbackMode = RustFallbackMode.PYTHON DefaultCodebaseConfig = CodebaseConfig() diff --git a/src/graph_sitter/core/codebase.py b/src/graph_sitter/core/codebase.py index 
cf3280379..949f37e1d 100644 --- a/src/graph_sitter/core/codebase.py +++ b/src/graph_sitter/core/codebase.py @@ -266,6 +266,13 @@ def language(self) -> ProgrammingLanguage: """The programming language of the repository.""" return self.ctx.programming_language + @property + @noapidoc + def rust_index_summary(self): + if self.ctx.rust_index is None: + return None + return self.ctx.rust_index.summary + #################################################################################################################### # NODES #################################################################################################################### diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py new file mode 100644 index 000000000..bc73317da --- /dev/null +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -0,0 +1,102 @@ +import sys +from types import ModuleType + +import pytest + +from graph_sitter.codebase.factory.get_session import get_codebase_session +from graph_sitter.configs.models.codebase import CodebaseConfig, GraphBackend, RustFallbackMode + + +class FakeSummary: + def as_dict(self): + return { + "files": 1, + "symbols": 2, + "classes": 1, + "functions": 1, + "imports": 1, + "bytes": 64, + "lines": 8, + "files_with_errors": 0, + } + + +class FakeIndex: + def summary(self): + return FakeSummary() + + def to_json(self): + return '{"files":[],"symbols":[],"imports":[]}' + + +def install_fake_rust_extension(monkeypatch: pytest.MonkeyPatch) -> tuple[list[str], list[list[str]]]: + indexed_paths: list[str] = [] + selected_paths: list[list[str]] = [] + module = ModuleType("graph_sitter_py") + module.engine_version = lambda: "test-rust-engine" + + def index_python_path(path: str): + indexed_paths.append(path) + return FakeIndex() + + def index_python_paths(path: str, file_paths: list[str]): + indexed_paths.append(path) + selected_paths.append(file_paths) + return FakeIndex() + + module.index_python_path = 
index_python_path + module.index_python_paths = index_python_paths + monkeypatch.setitem(sys.modules, "graph_sitter_py", module) + return indexed_paths, selected_paths + + +def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): + indexed_paths, selected_paths = install_fake_rust_extension(monkeypatch) + config = CodebaseConfig(graph_backend=GraphBackend.RUST) + + with get_codebase_session( + tmpdir=tmp_path, + files={"pkg/service.py": "import os\n\nclass Service:\n pass\n\ndef helper():\n return os.getcwd()\n"}, + config=config, + verify_output=False, + ) as codebase: + assert codebase.ctx.rust_index is not None + assert codebase.ctx.rust_index.engine_version == "test-rust-engine" + assert codebase.ctx.rust_index.summary.files == 1 + assert codebase.ctx.rust_index.summary.classes == 1 + assert codebase.ctx.rust_index.summary.functions == 1 + assert codebase.ctx.rust_index.summary.imports == 1 + assert codebase.rust_index_summary == codebase.ctx.rust_index.summary + assert indexed_paths == [str(tmp_path.resolve())] + assert selected_paths == [["pkg/service.py"]] + + +def test_missing_rust_extension_falls_back_to_python_graph(monkeypatch, tmp_path): + monkeypatch.setitem(sys.modules, "graph_sitter_py", None) + config = CodebaseConfig(graph_backend=GraphBackend.RUST, rust_fallback=RustFallbackMode.PYTHON) + + with get_codebase_session( + tmpdir=tmp_path, + files={"app.py": "def run():\n return 1\n"}, + config=config, + verify_output=False, + ) as codebase: + assert codebase.ctx.rust_index is None + assert codebase.rust_index_summary is None + assert "graph_sitter_py" in codebase.ctx.rust_backend_error + assert len(codebase.files) == 1 + assert len(codebase.functions) == 1 + + +def test_missing_rust_extension_can_fail_strictly(monkeypatch, tmp_path): + monkeypatch.setitem(sys.modules, "graph_sitter_py", None) + config = CodebaseConfig(graph_backend=GraphBackend.RUST, rust_fallback=RustFallbackMode.ERROR) + + with pytest.raises(RuntimeError, 
match="graph_sitter_py"): + with get_codebase_session( + tmpdir=tmp_path, + files={"app.py": "def run():\n return 1\n"}, + config=config, + verify_output=False, + ): + pass From a9ccbf76948896f12fdc5353106c7060af251b44 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:45:15 -0700 Subject: [PATCH 009/228] Add Rust compact Python import resolution --- .../examples/index_python.rs | 3 +- crates/graph-sitter-engine/src/lib.rs | 232 ++++++++++++++++++ crates/graph-sitter-py/src/lib.rs | 39 ++- rust-rewrite/benchmarks.md | 10 +- rust-rewrite/engine-skeleton.md | 2 +- rust-rewrite/python-compat.md | 3 +- rust-rewrite/resolution-algorithms.md | 7 + rust-rewrite/strategy.md | 4 +- .../tools/compare_rust_python_index.py | 3 +- rust-rewrite/tools/measure_rust_facade.py | 8 +- src/graph_sitter/codebase/rust_backend.py | 1 + tests/unit/sdk/codebase/test_rust_backend.py | 2 + 12 files changed, 300 insertions(+), 14 deletions(-) diff --git a/crates/graph-sitter-engine/examples/index_python.rs b/crates/graph-sitter-engine/examples/index_python.rs index 20fb79ef5..8055967f7 100644 --- a/crates/graph-sitter-engine/examples/index_python.rs +++ b/crates/graph-sitter-engine/examples/index_python.rs @@ -29,12 +29,13 @@ fn main() -> Result<(), Box> { println!("repo: {repo_path}"); println!("wall: {:.6}s", elapsed.as_secs_f64()); println!( - "index: files={} symbols={} classes={} functions={} imports={} bytes={} lines={} files_with_errors={}", + "index: files={} symbols={} classes={} functions={} imports={} import_resolutions={} bytes={} lines={} files_with_errors={}", summary.files, summary.symbols, summary.classes, summary.functions, summary.imports, + summary.import_resolutions, summary.bytes, summary.lines, summary.files_with_errors diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index eea40cf50..ce3deede3 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1,6 +1,7 @@ 
#![forbid(unsafe_code)] use serde::Serialize; +use std::collections::HashMap; use std::fmt; use std::fs; use std::io; @@ -133,6 +134,7 @@ pub struct PythonIndex { pub files: Vec, pub symbols: Vec, pub imports: Vec, + pub import_resolutions: Vec, } impl PythonIndex { @@ -151,6 +153,7 @@ impl PythonIndex { .filter(|symbol| symbol.kind == SymbolKind::Function) .count(), imports: self.imports.len(), + import_resolutions: self.import_resolutions.len(), bytes: self.files.iter().map(|file| file.byte_len).sum(), lines: self.files.iter().map(|file| file.line_count).sum(), files_with_errors: self.files.iter().filter(|file| file.has_error).count(), @@ -165,6 +168,7 @@ pub struct IndexSummary { pub classes: usize, pub functions: usize, pub imports: usize, + pub import_resolutions: usize, pub bytes: usize, pub lines: usize, pub files_with_errors: usize, @@ -174,6 +178,7 @@ pub struct IndexSummary { pub struct FileRecord { pub id: u32, pub path: String, + pub module_name: Option, pub byte_len: usize, pub line_count: usize, pub has_error: bool, @@ -216,6 +221,15 @@ pub struct ImportRecord { pub range: SourceRange, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct ImportResolutionRecord { + pub id: u32, + pub import_id: u32, + pub source_file_id: u32, + pub target_file_id: u32, + pub target_symbol_id: Option, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] pub struct SourceRange { pub start_byte: usize, @@ -290,6 +304,7 @@ impl PythonIndexer { files: Vec::new(), symbols: Vec::new(), imports: Vec::new(), + import_resolutions: Vec::new(), }; paths.sort(); @@ -312,6 +327,7 @@ impl PythonIndexer { index.files.push(FileRecord { id: file_id, + module_name: python_module_name(&relative_path), path: relative_path, byte_len: content.len(), line_count: line_count(&content), @@ -321,6 +337,7 @@ impl PythonIndexer { extract_python_file(file_id, &content, &tree, &mut index); } + resolve_python_imports(&mut index); Ok(index) } } @@ -516,6 +533,174 @@ fn 
line_count(source: &str) -> usize { } } +fn resolve_python_imports(index: &mut PythonIndex) { + let module_to_file: HashMap<&str, u32> = index + .files + .iter() + .filter_map(|file| file.module_name.as_deref().map(|module| (module, file.id))) + .collect(); + let symbol_to_id: HashMap<(u32, &str), u32> = index + .symbols + .iter() + .map(|symbol| ((symbol.file_id, symbol.name.as_str()), symbol.id)) + .collect(); + + let mut resolutions = Vec::new(); + for import in &index.imports { + let Some(source_file) = index.files.get(import.file_id as usize) else { + continue; + }; + let resolution = match import.kind { + ImportKind::Import => { + resolve_plain_import(import, &module_to_file).map(|target_file_id| { + ImportResolutionRecord { + id: resolutions.len() as u32, + import_id: import.id, + source_file_id: import.file_id, + target_file_id, + target_symbol_id: None, + } + }) + } + ImportKind::FromImport | ImportKind::FutureImport => resolve_from_import( + import, + source_file, + &module_to_file, + &symbol_to_id, + resolutions.len() as u32, + ), + }; + if let Some(resolution) = resolution { + resolutions.push(resolution); + } + } + index.import_resolutions = resolutions; +} + +fn resolve_plain_import(import: &ImportRecord, module_to_file: &HashMap<&str, u32>) -> Option { + let name = import.name.as_deref()?; + module_to_file.get(name).copied() +} + +fn resolve_from_import( + import: &ImportRecord, + source_file: &FileRecord, + module_to_file: &HashMap<&str, u32>, + symbol_to_id: &HashMap<(u32, &str), u32>, + resolution_id: u32, +) -> Option { + let module = import.module.as_deref()?; + let resolved_module = resolve_module_name(source_file, module)?; + let import_name = import.name.as_deref(); + + if let Some(target_file_id) = module_to_file.get(resolved_module.as_str()).copied() { + let target_symbol_id = + import_name.and_then(|name| symbol_to_id.get(&(target_file_id, name)).copied()); + if target_symbol_id.is_some() || import_name == Some("*") { + return 
Some(ImportResolutionRecord { + id: resolution_id, + import_id: import.id, + source_file_id: import.file_id, + target_file_id, + target_symbol_id, + }); + } + } + + let import_name = import_name?; + let child_module = join_module(&resolved_module, import_name); + if let Some(target_file_id) = module_to_file.get(child_module.as_str()).copied() { + return Some(ImportResolutionRecord { + id: resolution_id, + import_id: import.id, + source_file_id: import.file_id, + target_file_id, + target_symbol_id: None, + }); + } + + module_to_file + .get(resolved_module.as_str()) + .copied() + .map(|target_file_id| ImportResolutionRecord { + id: resolution_id, + import_id: import.id, + source_file_id: import.file_id, + target_file_id, + target_symbol_id: None, + }) +} + +fn resolve_module_name(source_file: &FileRecord, raw_module: &str) -> Option { + if !raw_module.starts_with('.') { + return Some(raw_module.to_owned()); + } + + let dot_count = raw_module + .as_bytes() + .iter() + .take_while(|byte| **byte == b'.') + .count(); + let suffix = &raw_module[dot_count..]; + let mut package_parts = source_package_name(source_file) + .map(|package| { + package + .split('.') + .filter(|part| !part.is_empty()) + .map(str::to_owned) + .collect::>() + }) + .unwrap_or_default(); + let ascend = dot_count.saturating_sub(1); + if ascend > package_parts.len() { + return None; + } + let keep = package_parts.len() - ascend; + package_parts.truncate(keep); + if !suffix.is_empty() { + package_parts.extend( + suffix + .split('.') + .filter(|part| !part.is_empty()) + .map(str::to_owned), + ); + } + Some(package_parts.join(".")) +} + +fn source_package_name(file: &FileRecord) -> Option<&str> { + let module = file.module_name.as_deref()?; + if file.path.ends_with("/__init__.py") || file.path == "__init__.py" { + Some(module) + } else { + module.rsplit_once('.').map(|(package, _)| package) + } +} + +fn join_module(parent: &str, child: &str) -> String { + if parent.is_empty() { + child.to_owned() + } else 
{ + format!("{parent}.{child}") + } +} + +fn python_module_name(path: &str) -> Option { + let without_suffix = path.strip_suffix(".py")?; + let module = without_suffix + .strip_suffix("/__init__") + .unwrap_or(without_suffix) + .split('/') + .filter(|part| !part.is_empty()) + .collect::>() + .join("."); + if module.is_empty() { + None + } else { + Some(module) + } +} + #[cfg(test)] mod tests { use super::*; @@ -547,6 +732,7 @@ mod tests { assert_eq!(index.summary().classes, 1); assert_eq!(index.summary().functions, 1); assert_eq!(index.summary().imports, 4); + assert_eq!(index.summary().import_resolutions, 0); assert_eq!(index.symbols[0].name, "Service"); assert_eq!(index.symbols[1].name, "helper"); assert!(index @@ -575,6 +761,52 @@ mod tests { assert_eq!(index.symbols[0].name, "Included"); } + #[test] + fn resolves_internal_python_imports_to_files_and_symbols() { + let repo = temp_repo_path("resolve-python-imports"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/__init__.py"), "").unwrap(); + fs::write(repo.join("pkg/base.py"), "class Base:\n pass\n").unwrap(); + fs::write( + repo.join("pkg/service.py"), + "from __future__ import annotations\nfrom .base import Base\nfrom . 
import base\nimport pkg.base\nimport os\n\nclass Service(Base):\n pass\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + assert_eq!(index.summary().files, 3); + assert_eq!(index.summary().classes, 2); + assert_eq!(index.summary().imports, 5); + assert_eq!(index.summary().import_resolutions, 3); + + let base_file_id = index + .files + .iter() + .find(|file| file.path == "pkg/base.py") + .unwrap() + .id; + let base_symbol_id = index + .symbols + .iter() + .find(|symbol| symbol.file_id == base_file_id && symbol.name == "Base") + .unwrap() + .id; + assert!(index.import_resolutions.iter().any(|resolution| { + resolution.target_file_id == base_file_id + && resolution.target_symbol_id == Some(base_symbol_id) + })); + assert_eq!( + index + .import_resolutions + .iter() + .filter(|resolution| resolution.target_file_id == base_file_id) + .count(), + 3 + ); + } + fn temp_repo_path(prefix: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index 899d64cd6..ef34c50fe 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -69,6 +69,8 @@ mod bindings { #[pyo3(get)] imports: usize, #[pyo3(get)] + import_resolutions: usize, + #[pyo3(get)] bytes: usize, #[pyo3(get)] lines: usize, @@ -84,6 +86,7 @@ mod bindings { classes: summary.classes, functions: summary.functions, imports: summary.imports, + import_resolutions: summary.import_resolutions, bytes: summary.bytes, lines: summary.lines, files_with_errors: summary.files_with_errors, @@ -100,6 +103,7 @@ mod bindings { ("classes", self.classes), ("functions", self.functions), ("imports", self.imports), + ("import_resolutions", self.import_resolutions), ("bytes", self.bytes), ("lines", self.lines), ("files_with_errors", self.files_with_errors), @@ -108,12 +112,13 @@ mod bindings { fn __repr__(&self) -> String { format!( - 
"IndexSummary(files={}, symbols={}, classes={}, functions={}, imports={}, bytes={}, lines={}, files_with_errors={})", + "IndexSummary(files={}, symbols={}, classes={}, functions={}, imports={}, import_resolutions={}, bytes={}, lines={}, files_with_errors={})", self.files, self.symbols, self.classes, self.functions, self.imports, + self.import_resolutions, self.bytes, self.lines, self.files_with_errors @@ -159,11 +164,16 @@ mod bindings { self.inner.imports.len() } + #[getter] + fn import_resolution_count(&self) -> usize { + self.inner.import_resolutions.len() + } + fn __repr__(&self) -> String { let summary = self.inner.summary(); format!( - "PythonIndex(files={}, symbols={}, imports={})", - summary.files, summary.symbols, summary.imports + "PythonIndex(files={}, symbols={}, imports={}, import_resolutions={})", + summary.files, summary.symbols, summary.imports, summary.import_resolutions ) } } @@ -333,6 +343,29 @@ mod bindings { assert!(!index.to_json().unwrap().contains("\"Skipped\"")); } + #[test] + fn py_engine_exposes_import_resolution_count() { + let repo = temp_repo_path("py-binding-import-resolution"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/__init__.py"), "").unwrap(); + fs::write(repo.join("pkg/base.py"), "class Base:\n pass\n").unwrap(); + fs::write( + repo.join("pkg/service.py"), + "from .base import Base\n\nclass Service(Base):\n pass\n", + ) + .unwrap(); + + let index = PyEngine::new() + .index_python_path(repo.to_str().unwrap()) + .unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let summary = index.summary(); + assert_eq!(summary.import_resolutions, 1); + assert_eq!(index.import_resolution_count(), 1); + assert!(index.to_json().unwrap().contains("import_resolutions")); + } + fn temp_repo_path(prefix: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index b0d25d3a9..13238547e 100644 --- a/rust-rewrite/benchmarks.md 
+++ b/rust-rewrite/benchmarks.md @@ -149,15 +149,15 @@ These measurements use the new Python shell integration path: Python discovers f Commands were run on this branch on 2026-06-18 after adding selected-file PyO3 indexing. -| Input | Python mode | Python wall | Python max RSS | Rust facade wall | Rust facade max RSS | Python files | Rust selected files | Wall ratio | RSS ratio | -| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.961s | 532.3 MB | 0.632s | 114.3 MB | 1129 | 1129 | 4.685x | 4.657x | +| Input | Python mode | Python wall | Python max RSS | Rust facade wall | Rust facade max RSS | Python files | Rust selected files | Rust import resolutions | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| `graph-sitter` repo checkout | `--disable-graph` | 2.938s | 533.3 MB | 0.687s | 116.5 MB | 1129 | 1129 | 432 | 4.277x | 4.579x | -This shell-facing number is intentionally more conservative than the standalone Rust process benchmark because it includes Python startup, imports, and repo file discovery. The important result is that the selected-file integration preserves Python file-discovery parity for the current repo while still cutting parse/index wall time and process max RSS substantially for the implemented compact-index slice. +This shell-facing number is intentionally more conservative than the standalone Rust process benchmark because it includes Python startup, imports, and repo file discovery. The important result is that the selected-file integration preserves Python file-discovery parity for the current repo while still cutting parse/index/import-resolution wall time and process max RSS substantially for the implemented compact graph slice. Important caveats: -- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions, and imports. 
+- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions, imports, and internal import-resolution records for indexed Python modules. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. diff --git a/rust-rewrite/engine-skeleton.md b/rust-rewrite/engine-skeleton.md index eeef84317..09eca3669 100644 --- a/rust-rewrite/engine-skeleton.md +++ b/rust-rewrite/engine-skeleton.md @@ -27,7 +27,7 @@ RUSTFLAGS="-C link-arg=-undefined -C link-arg=dynamic_lookup" \ cargo build --release -p graph-sitter-py --features extension-module ``` -The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, `index_python_path`, and `index_python_paths`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index at that commit. The Python shell integration now uses `index_python_paths` so Rust indexes the exact file list returned by `RepoOperator.iter_files(...)`. +The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, `index_python_path`, and `index_python_paths`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index at that commit. The Python shell integration now uses `index_python_paths` so Rust indexes the exact file list returned by `RepoOperator.iter_files(...)`. 
The compact index now also includes internal Python `import_resolutions` records for the first import graph slice. ## Integration Choice diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 0a9e92847..74c7702e7 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -118,8 +118,9 @@ Current implemented bridge status: - `crates/graph-sitter-py` builds a PyO3 module named `graph_sitter_py` behind the `extension-module` feature. - `Engine.index_python_path(repo_path)` and module-level `index_python_path(repo_path)` return a compact `PythonIndex` for Python files. - `Engine.index_python_paths(repo_path, file_paths)` and module-level `index_python_paths(repo_path, file_paths)` index an explicit Python file list. The Python shell integration uses this path so Rust sees the same `RepoOperator.iter_files(...)` selection as the current Python backend. -- `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, import, byte, line, and error counts. +- `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, import, import-resolution, byte, line, and error counts. - `PythonIndex.to_json()` serializes the compact Rust records for debug and benchmark use. +- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `Codebase.rust_index_summary` exposes the attached compact summary for shell smoke checks. - This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet return stable Python compatibility handles. 
diff --git a/rust-rewrite/resolution-algorithms.md b/rust-rewrite/resolution-algorithms.md index 34874f463..e4829c873 100644 --- a/rust-rewrite/resolution-algorithms.md +++ b/rust-rewrite/resolution-algorithms.md @@ -55,6 +55,13 @@ Graph edges: ## Import Resolution Algorithms +Current Rust implementation status: + +- The compact Rust Python index now builds `ImportResolutionRecord` rows for internal imports whose targets are in the selected Python file set. +- Covered forms: `import pkg.mod`, `from pkg.mod import Symbol`, `from .mod import Symbol`, and `from . import mod` when the target file or top-level symbol exists in the compact index. +- External imports intentionally remain unresolved rather than materializing external module records. +- Full parity remains open for configured `import_resolution_paths`, `py_resolve_syspath`, wildcard import expansion, package `valid_import_names`, fallback `src`/`test` roots, and every TypeScript import/export rule. + ### Shared Import Flow `Import.add_symbol_resolution_edge` calls the language-specific `resolve_import`: diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 441643f21..15d7467fd 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -170,10 +170,11 @@ Recommended task format: - [x] Inventory current resolver/dependency algorithms and Rust relation-table plan. owner: Gauss. Result: documented in `rust-rewrite/resolution-algorithms.md`. - [ ] Port Python import resolution rules. +- [x] Implement compact Python import-to-file and import-to-symbol resolution for indexed internal modules. owner: codex. Result: Rust now emits `ImportResolutionRecord` rows for direct, absolute `from`, and relative `from` imports when targets are inside the selected file set. - [ ] Port TypeScript relative import resolution rules. - [ ] Port TypeScript config/path alias handling. - [ ] Represent external modules compactly. -- [ ] Implement import-to-file and import-to-symbol edges. 
+- [ ] Implement full import-to-file and import-to-symbol edges for all Python and TypeScript rules. - [ ] Implement export-to-symbol/import/file edges. - [ ] Implement lexical scope tables for name resolution. - [ ] Implement symbol usage extraction by identifier ranges. @@ -239,3 +240,4 @@ Recommended task format: - [x] 2026-06-18: Implemented first Rust Python compact-index slice and benchmark comparison; initial measurements show 9x-22x wall-time improvement and 70x-104x RSS improvement on this repo for the implemented slice. owner: codex. - [x] 2026-06-18: Exposed the compact Python index through the PyO3 module and verified a Python import smoke against this repo. owner: codex. Notes: extension returned 1127 files, 3117 symbols, and 6414 imports for the current checkout. - [x] 2026-06-18: Added Python-shell Rust index integration behind `CodebaseConfig(graph_backend=...)`, selected-file PyO3 indexing from `RepoOperator`, and a facade benchmark. owner: codex. Notes: selected-file facade matched Python's 1129-file discovery and ran 4.7x faster with 4.7x lower process max RSS than Python parse/object materialization on this checkout. +- [x] 2026-06-18: Added compact Rust Python import resolution records. owner: codex. Notes: the Python-facing Rust facade now emits 432 internal import-resolution records on this checkout and remains 4.3x faster with 4.6x lower process max RSS than Python parse/object materialization. 
diff --git a/rust-rewrite/tools/compare_rust_python_index.py b/rust-rewrite/tools/compare_rust_python_index.py index 501f0b641..8eafac430 100644 --- a/rust-rewrite/tools/compare_rust_python_index.py +++ b/rust-rewrite/tools/compare_rust_python_index.py @@ -187,7 +187,8 @@ def print_human(report: dict[str, Any]) -> None: f"wall={comparison['rust_index_wall_seconds']:.3f}s " f"process_wall={comparison['rust_process_wall_seconds']:.3f}s " f"rss_peak={comparison['rust_sampled_rss_peak_mb']:.1f} MB " - f"files={rust_summary['files']} symbols={rust_summary['symbols']} imports={rust_summary['imports']}" + f"files={rust_summary['files']} symbols={rust_summary['symbols']} " + f"imports={rust_summary['imports']} import_resolutions={rust_summary['import_resolutions']}" ) print( "ratios: " diff --git a/rust-rewrite/tools/measure_rust_facade.py b/rust-rewrite/tools/measure_rust_facade.py index e467198d7..665d74d79 100644 --- a/rust-rewrite/tools/measure_rust_facade.py +++ b/rust-rewrite/tools/measure_rust_facade.py @@ -92,7 +92,13 @@ def print_human(report: dict) -> None: print(f"raw rust walk: {report['metadata']['raw_rust_walk']}") print(f"selected files: {report['metadata']['selected_file_count']}") print(f"rust facade: wall={totals['wall_seconds']:.3f}s max_rss={totals['max_rss_mb']:.1f} MB") - print(f"summary: files={summary['files']} symbols={summary['symbols']} imports={summary['imports']}") + print( + "summary: " + f"files={summary['files']} " + f"symbols={summary['symbols']} " + f"imports={summary['imports']} " + f"import_resolutions={summary['import_resolutions']}" + ) def main() -> int: diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py index 1ddd1446b..2af065a9c 100644 --- a/src/graph_sitter/codebase/rust_backend.py +++ b/src/graph_sitter/codebase/rust_backend.py @@ -24,6 +24,7 @@ class RustIndexSummary: classes: int functions: int imports: int + import_resolutions: int bytes: int lines: int files_with_errors: int diff 
--git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index bc73317da..d1d9db47e 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -15,6 +15,7 @@ def as_dict(self): "classes": 1, "functions": 1, "imports": 1, + "import_resolutions": 1, "bytes": 64, "lines": 8, "files_with_errors": 0, @@ -66,6 +67,7 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.ctx.rust_index.summary.classes == 1 assert codebase.ctx.rust_index.summary.functions == 1 assert codebase.ctx.rust_index.summary.imports == 1 + assert codebase.ctx.rust_index.summary.import_resolutions == 1 assert codebase.rust_index_summary == codebase.ctx.rust_index.summary assert indexed_paths == [str(tmp_path.resolve())] assert selected_paths == [["pkg/service.py"]] From 5162919d756a78c497c1bbf78d972b14bdaabb30 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:49:17 -0700 Subject: [PATCH 010/228] Expose Rust compact graph records --- crates/graph-sitter-engine/src/lib.rs | 85 ++++++++++++ crates/graph-sitter-py/src/lib.rs | 27 ++++ rust-rewrite/benchmarks.md | 2 +- rust-rewrite/engine-skeleton.md | 2 +- rust-rewrite/python-compat.md | 2 + rust-rewrite/strategy.md | 4 +- src/graph_sitter/codebase/rust_backend.py | 130 +++++++++++++++++++ tests/unit/sdk/codebase/test_rust_backend.py | 112 ++++++++++++++++ 8 files changed, 361 insertions(+), 3 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index ce3deede3..e8a381b61 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -807,6 +807,91 @@ mod tests { ); } + #[test] + fn compact_python_graph_snapshot_is_stable() { + let repo = temp_repo_path("compact-python-graph-snapshot"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/__init__.py"), "").unwrap(); + 
fs::write(repo.join("pkg/base.py"), "class Base:\n pass\n").unwrap(); + fs::write( + repo.join("pkg/service.py"), + "from .base import Base\nfrom . import base\nimport pkg.base\nimport os\n\nclass Service(Base):\n pass\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let files = index + .files + .iter() + .map(|file| { + serde_json::json!({ + "id": file.id, + "path": file.path, + "module_name": file.module_name, + }) + }) + .collect::>(); + let symbols = index + .symbols + .iter() + .map(|symbol| { + serde_json::json!({ + "id": symbol.id, + "file_id": symbol.file_id, + "name": symbol.name, + "kind": symbol.kind, + }) + }) + .collect::>(); + let imports = index + .imports + .iter() + .map(|import| { + serde_json::json!({ + "id": import.id, + "file_id": import.file_id, + "kind": import.kind, + "module": import.module, + "name": import.name, + "alias": import.alias, + }) + }) + .collect::>(); + + assert_eq!( + serde_json::json!({ + "files": files, + "symbols": symbols, + "imports": imports, + "import_resolutions": index.import_resolutions, + }), + serde_json::json!({ + "files": [ + {"id": 0, "path": "pkg/__init__.py", "module_name": "pkg"}, + {"id": 1, "path": "pkg/base.py", "module_name": "pkg.base"}, + {"id": 2, "path": "pkg/service.py", "module_name": "pkg.service"} + ], + "symbols": [ + {"id": 0, "file_id": 1, "name": "Base", "kind": "class"}, + {"id": 1, "file_id": 2, "name": "Service", "kind": "class"} + ], + "imports": [ + {"id": 0, "file_id": 2, "kind": "from_import", "module": ".base", "name": "Base", "alias": null}, + {"id": 1, "file_id": 2, "kind": "from_import", "module": ".", "name": "base", "alias": null}, + {"id": 2, "file_id": 2, "kind": "import", "module": null, "name": "pkg.base", "alias": null}, + {"id": 3, "file_id": 2, "kind": "import", "module": null, "name": "os", "alias": null} + ], + "import_resolutions": [ + {"id": 0, "import_id": 0, "source_file_id": 2, "target_file_id": 1, 
"target_symbol_id": 0}, + {"id": 1, "import_id": 1, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null}, + {"id": 2, "import_id": 2, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null} + ] + }) + ); + } + fn temp_repo_path(prefix: &str) -> PathBuf { let nanos = SystemTime::now() .duration_since(UNIX_EPOCH) diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index ef34c50fe..9d662a802 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -149,6 +149,26 @@ mod bindings { .map_err(|error| PyRuntimeError::new_err(error.to_string())) } + fn files_json(&self) -> PyResult { + serde_json::to_string(&self.inner.files) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + + fn symbols_json(&self) -> PyResult { + serde_json::to_string(&self.inner.symbols) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + + fn imports_json(&self) -> PyResult { + serde_json::to_string(&self.inner.imports) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + + fn import_resolutions_json(&self) -> PyResult { + serde_json::to_string(&self.inner.import_resolutions) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + #[getter] fn file_count(&self) -> usize { self.inner.files.len() @@ -363,6 +383,13 @@ mod bindings { let summary = index.summary(); assert_eq!(summary.import_resolutions, 1); assert_eq!(index.import_resolution_count(), 1); + assert!(index.files_json().unwrap().contains("\"pkg/base.py\"")); + assert!(index.symbols_json().unwrap().contains("\"Base\"")); + assert!(index.imports_json().unwrap().contains("\".base\"")); + assert!(index + .import_resolutions_json() + .unwrap() + .contains("target_symbol_id")); assert!(index.to_json().unwrap().contains("import_resolutions")); } diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 13238547e..410a5e327 100644 --- a/rust-rewrite/benchmarks.md +++ 
b/rust-rewrite/benchmarks.md @@ -151,7 +151,7 @@ Commands were run on this branch on 2026-06-18 after adding selected-file PyO3 i | Input | Python mode | Python wall | Python max RSS | Rust facade wall | Rust facade max RSS | Python files | Rust selected files | Rust import resolutions | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.938s | 533.3 MB | 0.687s | 116.5 MB | 1129 | 1129 | 432 | 4.277x | 4.579x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.802s | 533.5 MB | 0.424s | 114.3 MB | 1129 | 1129 | 432 | 6.614x | 4.668x | This shell-facing number is intentionally more conservative than the standalone Rust process benchmark because it includes Python startup, imports, and repo file discovery. The important result is that the selected-file integration preserves Python file-discovery parity for the current repo while still cutting parse/index/import-resolution wall time and process max RSS substantially for the implemented compact graph slice. diff --git a/rust-rewrite/engine-skeleton.md b/rust-rewrite/engine-skeleton.md index 09eca3669..18dfe744b 100644 --- a/rust-rewrite/engine-skeleton.md +++ b/rust-rewrite/engine-skeleton.md @@ -27,7 +27,7 @@ RUSTFLAGS="-C link-arg=-undefined -C link-arg=dynamic_lookup" \ cargo build --release -p graph-sitter-py --features extension-module ``` -The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, `index_python_path`, and `index_python_paths`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index at that commit. The Python shell integration now uses `index_python_paths` so Rust indexes the exact file list returned by `RepoOperator.iter_files(...)`. The compact index now also includes internal Python `import_resolutions` records for the first import graph slice. 
+The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, `index_python_path`, and `index_python_paths`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index at that commit. The Python shell integration now uses `index_python_paths` so Rust indexes the exact file list returned by `RepoOperator.iter_files(...)`. The compact index now also includes internal Python `import_resolutions` records for the first import graph slice, plus record-family JSON methods for files, symbols, imports, and import resolutions. ## Integration Choice diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 74c7702e7..765c27b5f 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -120,6 +120,8 @@ Current implemented bridge status: - `Engine.index_python_paths(repo_path, file_paths)` and module-level `index_python_paths(repo_path, file_paths)` index an explicit Python file list. The Python shell integration uses this path so Rust sees the same `RepoOperator.iter_files(...)` selection as the current Python backend. - `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, import, import-resolution, byte, line, and error counts. - `PythonIndex.to_json()` serializes the compact Rust records for debug and benchmark use. +- `PythonIndex.files_json()`, `symbols_json()`, `imports_json()`, and `import_resolutions_json()` expose each record family without forcing callers to deserialize the full index payload. +- `RustIndexBackend.files`, `.symbols`, `.imports`, and `.import_resolutions` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. 
- `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `Codebase.rust_index_summary` exposes the attached compact summary for shell smoke checks. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 15d7467fd..13502d58f 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -162,8 +162,9 @@ Recommended task format: - [ ] Extract imports and exports for TypeScript. - [ ] Build path and string interners. - [x] Expose compact Python index summary and JSON through PyO3. owner: codex. Result: added `PythonIndex`, `IndexSummary`, `Engine.index_python_path`, and module-level `index_python_path`. +- [x] Expose compact Python file, symbol, import, and import-resolution records through PyO3/Python facade. owner: codex. Result: added record-family JSON methods and typed Python dataclass accessors on `RustIndexBackend`. - [ ] Expose `files`, `symbols`, `classes`, `functions`, `imports`, and `exports` ID queries through PyO3. -- [ ] Add golden snapshots for compact IR on small Python fixtures. +- [x] Add golden snapshots for compact IR on small Python fixtures. owner: codex. Result: added deterministic compact graph snapshot covering files, symbols, imports, and import resolutions. - [ ] Add golden snapshots for compact IR on small TypeScript fixtures. ## Phase 3: Resolution And Dependency Graph @@ -241,3 +242,4 @@ Recommended task format: - [x] 2026-06-18: Exposed the compact Python index through the PyO3 module and verified a Python import smoke against this repo. owner: codex. Notes: extension returned 1127 files, 3117 symbols, and 6414 imports for the current checkout. - [x] 2026-06-18: Added Python-shell Rust index integration behind `CodebaseConfig(graph_backend=...)`, selected-file PyO3 indexing from `RepoOperator`, and a facade benchmark. owner: codex. 
Notes: selected-file facade matched Python's 1129-file discovery and ran 4.7x faster with 4.7x lower process max RSS than Python parse/object materialization on this checkout. - [x] 2026-06-18: Added compact Rust Python import resolution records. owner: codex. Notes: the Python-facing Rust facade now emits 432 internal import-resolution records on this checkout and remains 4.3x faster with 4.6x lower process max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added typed Python facade accessors and a deterministic compact graph snapshot for record-level parity testing. owner: codex. Notes: this prepares the large-repo golden import/reference graph workflow. diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py index 2af065a9c..d549e2b55 100644 --- a/src/graph_sitter/codebase/rust_backend.py +++ b/src/graph_sitter/codebase/rust_backend.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json from dataclasses import dataclass from importlib import import_module from pathlib import Path @@ -17,6 +18,20 @@ class RustIndexBuildError(RuntimeError): """Raised when the Rust backend extension loads but cannot index the repo.""" +@dataclass(frozen=True) +class RustSourceRange: + start_byte: int + end_byte: int + start_row: int + start_column: int + end_row: int + end_column: int + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> RustSourceRange: + return cls(**{field: int(data[field]) for field in cls.__dataclass_fields__}) + + @dataclass(frozen=True) class RustIndexSummary: files: int @@ -40,12 +55,103 @@ def from_object(cls, summary: Any) -> RustIndexSummary: return cls(**{field: int(data[field]) for field in cls.__dataclass_fields__}) +@dataclass(frozen=True) +class RustFileRecord: + id: int + path: str + module_name: str | None + byte_len: int + line_count: int + has_error: bool + root_range: RustSourceRange + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> RustFileRecord: + 
return cls( + id=int(data["id"]), + path=str(data["path"]), + module_name=data["module_name"], + byte_len=int(data["byte_len"]), + line_count=int(data["line_count"]), + has_error=bool(data["has_error"]), + root_range=RustSourceRange.from_dict(data["root_range"]), + ) + + +@dataclass(frozen=True) +class RustSymbolRecord: + id: int + file_id: int + name: str + kind: str + range: RustSourceRange + name_range: RustSourceRange + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> RustSymbolRecord: + return cls( + id=int(data["id"]), + file_id=int(data["file_id"]), + name=str(data["name"]), + kind=str(data["kind"]), + range=RustSourceRange.from_dict(data["range"]), + name_range=RustSourceRange.from_dict(data["name_range"]), + ) + + +@dataclass(frozen=True) +class RustImportRecord: + id: int + file_id: int + kind: str + module: str | None + name: str | None + alias: str | None + range: RustSourceRange + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> RustImportRecord: + return cls( + id=int(data["id"]), + file_id=int(data["file_id"]), + kind=str(data["kind"]), + module=data["module"], + name=data["name"], + alias=data["alias"], + range=RustSourceRange.from_dict(data["range"]), + ) + + +@dataclass(frozen=True) +class RustImportResolutionRecord: + id: int + import_id: int + source_file_id: int + target_file_id: int + target_symbol_id: int | None + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> RustImportResolutionRecord: + target_symbol_id = data["target_symbol_id"] + return cls( + id=int(data["id"]), + import_id=int(data["import_id"]), + source_file_id=int(data["source_file_id"]), + target_file_id=int(data["target_file_id"]), + target_symbol_id=None if target_symbol_id is None else int(target_symbol_id), + ) + + @dataclass class RustIndexBackend: repo_path: Path extension: Any index: Any summary: RustIndexSummary + _files: list[RustFileRecord] | None = None + _symbols: list[RustSymbolRecord] | None = None + _imports: 
list[RustImportRecord] | None = None + _import_resolutions: list[RustImportResolutionRecord] | None = None @classmethod def build(cls, repo_path: str | Path, file_paths: Sequence[str] | None = None) -> RustIndexBackend: @@ -72,5 +178,29 @@ def build(cls, repo_path: str | Path, file_paths: Sequence[str] | None = None) - def engine_version(self) -> str: return str(self.extension.engine_version()) + @property + def files(self) -> list[RustFileRecord]: + if self._files is None: + self._files = [RustFileRecord.from_dict(record) for record in json.loads(self.index.files_json())] + return self._files + + @property + def symbols(self) -> list[RustSymbolRecord]: + if self._symbols is None: + self._symbols = [RustSymbolRecord.from_dict(record) for record in json.loads(self.index.symbols_json())] + return self._symbols + + @property + def imports(self) -> list[RustImportRecord]: + if self._imports is None: + self._imports = [RustImportRecord.from_dict(record) for record in json.loads(self.index.imports_json())] + return self._imports + + @property + def import_resolutions(self) -> list[RustImportResolutionRecord]: + if self._import_resolutions is None: + self._import_resolutions = [RustImportResolutionRecord.from_dict(record) for record in json.loads(self.index.import_resolutions_json())] + return self._import_resolutions + def to_json(self) -> str: return str(self.index.to_json()) diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index d1d9db47e..4971e2775 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -1,3 +1,4 @@ +import json import sys from types import ModuleType @@ -29,6 +30,113 @@ def summary(self): def to_json(self): return '{"files":[],"symbols":[],"imports":[]}' + def files_json(self): + return json.dumps( + [ + { + "id": 0, + "path": "pkg/service.py", + "module_name": "pkg.service", + "byte_len": 64, + "line_count": 8, + "has_error": False, + 
"root_range": { + "start_byte": 0, + "end_byte": 64, + "start_row": 0, + "start_column": 0, + "end_row": 8, + "end_column": 0, + }, + } + ] + ) + + def symbols_json(self): + return json.dumps( + [ + { + "id": 0, + "file_id": 0, + "name": "Service", + "kind": "class", + "range": { + "start_byte": 11, + "end_byte": 31, + "start_row": 2, + "start_column": 0, + "end_row": 3, + "end_column": 8, + }, + "name_range": { + "start_byte": 17, + "end_byte": 24, + "start_row": 2, + "start_column": 6, + "end_row": 2, + "end_column": 13, + }, + }, + { + "id": 1, + "file_id": 0, + "name": "helper", + "kind": "function", + "range": { + "start_byte": 33, + "end_byte": 64, + "start_row": 5, + "start_column": 0, + "end_row": 8, + "end_column": 0, + }, + "name_range": { + "start_byte": 37, + "end_byte": 43, + "start_row": 5, + "start_column": 4, + "end_row": 5, + "end_column": 10, + }, + }, + ] + ) + + def imports_json(self): + return json.dumps( + [ + { + "id": 0, + "file_id": 0, + "kind": "import", + "module": None, + "name": "os", + "alias": None, + "range": { + "start_byte": 0, + "end_byte": 9, + "start_row": 0, + "start_column": 0, + "end_row": 0, + "end_column": 9, + }, + } + ] + ) + + def import_resolutions_json(self): + return json.dumps( + [ + { + "id": 0, + "import_id": 0, + "source_file_id": 0, + "target_file_id": 0, + "target_symbol_id": 0, + } + ] + ) + def install_fake_rust_extension(monkeypatch: pytest.MonkeyPatch) -> tuple[list[str], list[list[str]]]: indexed_paths: list[str] = [] @@ -68,6 +176,10 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.ctx.rust_index.summary.functions == 1 assert codebase.ctx.rust_index.summary.imports == 1 assert codebase.ctx.rust_index.summary.import_resolutions == 1 + assert codebase.ctx.rust_index.files[0].path == "pkg/service.py" + assert codebase.ctx.rust_index.symbols[0].name == "Service" + assert codebase.ctx.rust_index.imports[0].name == "os" + assert 
codebase.ctx.rust_index.import_resolutions[0].target_symbol_id == 0 assert codebase.rust_index_summary == codebase.ctx.rust_index.summary assert indexed_paths == [str(tmp_path.resolve())] assert selected_paths == [["pkg/service.py"]] From 7b3283e8910e648ff2067187730d9d27cb10c38f Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:53:37 -0700 Subject: [PATCH 011/228] Extract Python globals in Rust compact index --- .../examples/index_python.rs | 3 +- crates/graph-sitter-engine/src/lib.rs | 104 +++++++++++++++--- crates/graph-sitter-py/src/lib.rs | 21 +++- rust-rewrite/benchmarks.md | 8 +- rust-rewrite/engine-skeleton.md | 2 +- rust-rewrite/parser-index.md | 5 + rust-rewrite/python-compat.md | 4 +- rust-rewrite/strategy.md | 3 +- .../tools/compare_rust_python_index.py | 1 + rust-rewrite/tools/measure_rust_facade.py | 1 + src/graph_sitter/codebase/rust_backend.py | 1 + tests/unit/sdk/codebase/test_rust_backend.py | 2 + 12 files changed, 126 insertions(+), 29 deletions(-) diff --git a/crates/graph-sitter-engine/examples/index_python.rs b/crates/graph-sitter-engine/examples/index_python.rs index 8055967f7..67704ebb2 100644 --- a/crates/graph-sitter-engine/examples/index_python.rs +++ b/crates/graph-sitter-engine/examples/index_python.rs @@ -29,11 +29,12 @@ fn main() -> Result<(), Box> { println!("repo: {repo_path}"); println!("wall: {:.6}s", elapsed.as_secs_f64()); println!( - "index: files={} symbols={} classes={} functions={} imports={} import_resolutions={} bytes={} lines={} files_with_errors={}", + "index: files={} symbols={} classes={} functions={} global_variables={} imports={} import_resolutions={} bytes={} lines={} files_with_errors={}", summary.files, summary.symbols, summary.classes, summary.functions, + summary.global_variables, summary.imports, summary.import_resolutions, summary.bytes, diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index e8a381b61..0ddd5d360 100644 --- 
a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -152,6 +152,11 @@ impl PythonIndex { .iter() .filter(|symbol| symbol.kind == SymbolKind::Function) .count(), + global_variables: self + .symbols + .iter() + .filter(|symbol| symbol.kind == SymbolKind::GlobalVariable) + .count(), imports: self.imports.len(), import_resolutions: self.import_resolutions.len(), bytes: self.files.iter().map(|file| file.byte_len).sum(), @@ -167,6 +172,7 @@ pub struct IndexSummary { pub symbols: usize, pub classes: usize, pub functions: usize, + pub global_variables: usize, pub imports: usize, pub import_resolutions: usize, pub bytes: usize, @@ -190,6 +196,7 @@ pub struct FileRecord { pub enum SymbolKind { Class, Function, + GlobalVariable, } #[derive(Debug, Clone, PartialEq, Eq, Serialize)] @@ -407,6 +414,16 @@ fn extract_top_level_node(file_id: u32, source: &str, node: Node<'_>, index: &mu "import_from_statement" | "future_import_statement" => { push_from_import_statement(file_id, source, node, index) } + "assignment" | "annotated_assignment" => { + push_global_assignment(file_id, source, node, index) + } + "expression_statement" => { + if let Some(assignment) = + first_child_of_kind(node, &["assignment", "annotated_assignment"]) + { + push_global_assignment(file_id, source, assignment, index); + } + } _ => {} } } @@ -445,6 +462,40 @@ fn push_symbol_with_range( }); } +fn push_global_assignment(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { + let Some(left) = node.child_by_field_name("left") else { + return; + }; + let mut targets = Vec::new(); + collect_assignment_targets(left, &mut targets); + for target in targets { + let Ok(name) = target.utf8_text(source.as_bytes()) else { + continue; + }; + index.symbols.push(SymbolRecord { + id: index.symbols.len() as u32, + file_id, + name: name.to_owned(), + kind: SymbolKind::GlobalVariable, + range: node.range().into(), + name_range: target.range().into(), + }); + } +} + +fn 
collect_assignment_targets<'tree>(node: Node<'tree>, out: &mut Vec<Node<'tree>>) { + match node.kind() { + "identifier" => out.push(node), + "pattern_list" | "tuple_pattern" | "list_pattern" => { + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_assignment_targets(child, out); + } + } + _ => {} + } +} + fn push_import_statement(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { let text = node_text(source, node); let imports = text @@ -731,6 +782,7 @@ mod tests { assert_eq!(index.summary().files, 1); assert_eq!(index.summary().classes, 1); assert_eq!(index.summary().functions, 1); + assert_eq!(index.summary().global_variables, 0); assert_eq!(index.summary().imports, 4); assert_eq!(index.summary().import_resolutions, 0); assert_eq!(index.symbols[0].name, "Service"); @@ -766,10 +818,14 @@ mod tests { let repo = temp_repo_path("resolve-python-imports"); fs::create_dir_all(repo.join("pkg")).unwrap(); fs::write(repo.join("pkg/__init__.py"), "").unwrap(); - fs::write(repo.join("pkg/base.py"), "class Base:\n pass\n").unwrap(); + fs::write( + repo.join("pkg/base.py"), + "CONSTANT = 'base'\nclass Base:\n pass\n", + ) + .unwrap(); fs::write( repo.join("pkg/service.py"), - "from __future__ import annotations\nfrom .base import Base\nfrom . import base\nimport pkg.base\nimport os\n\nclass Service(Base):\n pass\n", + "from __future__ import annotations\nfrom .base import Base, CONSTANT\nfrom .
import base\nimport pkg.base\nimport os\n\nclass Service(Base):\n pass\n", ) .unwrap(); @@ -778,8 +834,9 @@ mod tests { assert_eq!(index.summary().files, 3); assert_eq!(index.summary().classes, 2); - assert_eq!(index.summary().imports, 5); - assert_eq!(index.summary().import_resolutions, 3); + assert_eq!(index.summary().global_variables, 1); + assert_eq!(index.summary().imports, 6); + assert_eq!(index.summary().import_resolutions, 4); let base_file_id = index .files @@ -793,17 +850,27 @@ mod tests { .find(|symbol| symbol.file_id == base_file_id && symbol.name == "Base") .unwrap() .id; + let constant_symbol_id = index + .symbols + .iter() + .find(|symbol| symbol.file_id == base_file_id && symbol.name == "CONSTANT") + .unwrap() + .id; assert!(index.import_resolutions.iter().any(|resolution| { resolution.target_file_id == base_file_id && resolution.target_symbol_id == Some(base_symbol_id) })); + assert!(index.import_resolutions.iter().any(|resolution| { + resolution.target_file_id == base_file_id + && resolution.target_symbol_id == Some(constant_symbol_id) + })); assert_eq!( index .import_resolutions .iter() .filter(|resolution| resolution.target_file_id == base_file_id) .count(), - 3 + 4 ); } @@ -812,10 +879,14 @@ mod tests { let repo = temp_repo_path("compact-python-graph-snapshot"); fs::create_dir_all(repo.join("pkg")).unwrap(); fs::write(repo.join("pkg/__init__.py"), "").unwrap(); - fs::write(repo.join("pkg/base.py"), "class Base:\n pass\n").unwrap(); + fs::write( + repo.join("pkg/base.py"), + "CONSTANT = 'base'\nclass Base:\n pass\n", + ) + .unwrap(); fs::write( repo.join("pkg/service.py"), - "from .base import Base\nfrom . import base\nimport pkg.base\nimport os\n\nclass Service(Base):\n pass\n", + "from .base import Base, CONSTANT\nfrom . 
import base\nimport pkg.base\nimport os\n\nclass Service(Base):\n pass\n", ) .unwrap(); @@ -874,19 +945,22 @@ mod tests { {"id": 2, "path": "pkg/service.py", "module_name": "pkg.service"} ], "symbols": [ - {"id": 0, "file_id": 1, "name": "Base", "kind": "class"}, - {"id": 1, "file_id": 2, "name": "Service", "kind": "class"} + {"id": 0, "file_id": 1, "name": "CONSTANT", "kind": "global_variable"}, + {"id": 1, "file_id": 1, "name": "Base", "kind": "class"}, + {"id": 2, "file_id": 2, "name": "Service", "kind": "class"} ], "imports": [ {"id": 0, "file_id": 2, "kind": "from_import", "module": ".base", "name": "Base", "alias": null}, - {"id": 1, "file_id": 2, "kind": "from_import", "module": ".", "name": "base", "alias": null}, - {"id": 2, "file_id": 2, "kind": "import", "module": null, "name": "pkg.base", "alias": null}, - {"id": 3, "file_id": 2, "kind": "import", "module": null, "name": "os", "alias": null} + {"id": 1, "file_id": 2, "kind": "from_import", "module": ".base", "name": "CONSTANT", "alias": null}, + {"id": 2, "file_id": 2, "kind": "from_import", "module": ".", "name": "base", "alias": null}, + {"id": 3, "file_id": 2, "kind": "import", "module": null, "name": "pkg.base", "alias": null}, + {"id": 4, "file_id": 2, "kind": "import", "module": null, "name": "os", "alias": null} ], "import_resolutions": [ - {"id": 0, "import_id": 0, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": 0}, - {"id": 1, "import_id": 1, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null}, - {"id": 2, "import_id": 2, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null} + {"id": 0, "import_id": 0, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": 1}, + {"id": 1, "import_id": 1, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": 0}, + {"id": 2, "import_id": 2, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null}, + {"id": 3, "import_id": 3, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null} ] 
}) ); diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index 9d662a802..ac77a91ed 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -67,6 +67,8 @@ mod bindings { #[pyo3(get)] functions: usize, #[pyo3(get)] + global_variables: usize, + #[pyo3(get)] imports: usize, #[pyo3(get)] import_resolutions: usize, @@ -85,6 +87,7 @@ mod bindings { symbols: summary.symbols, classes: summary.classes, functions: summary.functions, + global_variables: summary.global_variables, imports: summary.imports, import_resolutions: summary.import_resolutions, bytes: summary.bytes, @@ -102,6 +105,7 @@ mod bindings { ("symbols", self.symbols), ("classes", self.classes), ("functions", self.functions), + ("global_variables", self.global_variables), ("imports", self.imports), ("import_resolutions", self.import_resolutions), ("bytes", self.bytes), @@ -112,11 +116,12 @@ mod bindings { fn __repr__(&self) -> String { format!( - "IndexSummary(files={}, symbols={}, classes={}, functions={}, imports={}, import_resolutions={}, bytes={}, lines={}, files_with_errors={})", + "IndexSummary(files={}, symbols={}, classes={}, functions={}, global_variables={}, imports={}, import_resolutions={}, bytes={}, lines={}, files_with_errors={})", self.files, self.symbols, self.classes, self.functions, + self.global_variables, self.imports, self.import_resolutions, self.bytes, @@ -368,10 +373,14 @@ mod bindings { let repo = temp_repo_path("py-binding-import-resolution"); fs::create_dir_all(repo.join("pkg")).unwrap(); fs::write(repo.join("pkg/__init__.py"), "").unwrap(); - fs::write(repo.join("pkg/base.py"), "class Base:\n pass\n").unwrap(); + fs::write( + repo.join("pkg/base.py"), + "CONSTANT = 'base'\nclass Base:\n pass\n", + ) + .unwrap(); fs::write( repo.join("pkg/service.py"), - "from .base import Base\n\nclass Service(Base):\n pass\n", + "from .base import Base, CONSTANT\n\nclass Service(Base):\n pass\n", ) .unwrap(); @@ -381,9 +390,11 @@ mod 
bindings { fs::remove_dir_all(&repo).unwrap(); let summary = index.summary(); - assert_eq!(summary.import_resolutions, 1); - assert_eq!(index.import_resolution_count(), 1); + assert_eq!(summary.global_variables, 1); + assert_eq!(summary.import_resolutions, 2); + assert_eq!(index.import_resolution_count(), 2); assert!(index.files_json().unwrap().contains("\"pkg/base.py\"")); + assert!(index.symbols_json().unwrap().contains("\"CONSTANT\"")); assert!(index.symbols_json().unwrap().contains("\"Base\"")); assert!(index.imports_json().unwrap().contains("\".base\"")); assert!(index diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 410a5e327..2807d5ae4 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -149,15 +149,15 @@ These measurements use the new Python shell integration path: Python discovers f Commands were run on this branch on 2026-06-18 after adding selected-file PyO3 indexing. -| Input | Python mode | Python wall | Python max RSS | Rust facade wall | Rust facade max RSS | Python files | Rust selected files | Rust import resolutions | Wall ratio | RSS ratio | -| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.802s | 533.5 MB | 0.424s | 114.3 MB | 1129 | 1129 | 432 | 6.614x | 4.668x | +| Input | Python mode | Python wall | Python max RSS | Rust facade wall | Rust facade max RSS | Python files | Rust selected files | Rust globals | Rust import resolutions | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| `graph-sitter` repo checkout | `--disable-graph` | 2.987s | 535.0 MB | 0.692s | 115.3 MB | 1129 | 1129 | 799 | 432 | 4.317x | 4.638x | This shell-facing number is intentionally more conservative than the standalone Rust process benchmark because it includes Python startup, imports, and repo file discovery. 
The important result is that the selected-file integration preserves Python file-discovery parity for the current repo while still cutting parse/index/import-resolution wall time and process max RSS substantially for the implemented compact graph slice. Important caveats: -- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions, imports, and internal import-resolution records for indexed Python modules. +- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, and internal import-resolution records for indexed Python modules. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. diff --git a/rust-rewrite/engine-skeleton.md b/rust-rewrite/engine-skeleton.md index 18dfe744b..b83c41ba6 100644 --- a/rust-rewrite/engine-skeleton.md +++ b/rust-rewrite/engine-skeleton.md @@ -27,7 +27,7 @@ RUSTFLAGS="-C link-arg=-undefined -C link-arg=dynamic_lookup" \ cargo build --release -p graph-sitter-py --features extension-module ``` -The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, `index_python_path`, and `index_python_paths`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index at that commit. The Python shell integration now uses `index_python_paths` so Rust indexes the exact file list returned by `RepoOperator.iter_files(...)`. 
The compact index now also includes internal Python `import_resolutions` records for the first import graph slice, plus record-family JSON methods for files, symbols, imports, and import resolutions. +The current module exports `Engine`, `EngineInfo`, `PythonIndex`, `IndexSummary`, `engine_version`, `debug_info`, `index_python_path`, and `index_python_paths`. A successful smoke import on this repo returned 1127 files, 3117 symbols, and 6414 imports for the compact Python index at that commit. The Python shell integration now uses `index_python_paths` so Rust indexes the exact file list returned by `RepoOperator.iter_files(...)`. The compact index now includes top-level Python classes, functions, simple globals, internal Python `import_resolutions` records for the first import graph slice, plus record-family JSON methods for files, symbols, imports, and import resolutions. ## Integration Choice diff --git a/rust-rewrite/parser-index.md b/rust-rewrite/parser-index.md index 15a5eaa7d..b1cfe1fd2 100644 --- a/rust-rewrite/parser-index.md +++ b/rust-rewrite/parser-index.md @@ -95,6 +95,11 @@ First-slice scopes are lookup boundaries and ownership containers, not full lexi ## Python Extraction Rules +Current implemented Rust status: + +- Files, top-level classes/functions, top-level simple global assignments, Python imports, and compact internal import-resolution records are implemented for Python. +- Global extraction currently covers simple identifier targets in top-level `assignment` and `annotated_assignment` nodes, including identifiers nested in tuple/list/pattern lists. Attribute and subscript assignment targets remain intentionally skipped. + ### Files - Parse only `.py`. 
diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 765c27b5f..d81a226fd 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -118,11 +118,11 @@ Current implemented bridge status: - `crates/graph-sitter-py` builds a PyO3 module named `graph_sitter_py` behind the `extension-module` feature. - `Engine.index_python_path(repo_path)` and module-level `index_python_path(repo_path)` return a compact `PythonIndex` for Python files. - `Engine.index_python_paths(repo_path, file_paths)` and module-level `index_python_paths(repo_path, file_paths)` index an explicit Python file list. The Python shell integration uses this path so Rust sees the same `RepoOperator.iter_files(...)` selection as the current Python backend. -- `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, import, import-resolution, byte, line, and error counts. +- `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, global-variable, import, import-resolution, byte, line, and error counts. - `PythonIndex.to_json()` serializes the compact Rust records for debug and benchmark use. - `PythonIndex.files_json()`, `symbols_json()`, `imports_json()`, and `import_resolutions_json()` expose each record family without forcing callers to deserialize the full index payload. - `RustIndexBackend.files`, `.symbols`, `.imports`, and `.import_resolutions` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. -- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. +- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. 
Target symbols now include top-level classes, functions, and simple top-level globals. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `Codebase.rust_index_summary` exposes the attached compact summary for shell smoke checks. - This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet return stable Python compatibility handles. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 13502d58f..cfd75d305 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -156,7 +156,7 @@ Recommended task format: - [ ] Extract file records with path, language, content hash, and root ranges. - [x] Extract file records with path, byte length, line count, error status, and root ranges for Python. owner: codex. - [x] Extract top-level Python classes and functions. owner: codex. Result: compact `SymbolRecord` extraction for class/function definitions and decorated definitions. -- [ ] Extract top-level Python globals. +- [x] Extract top-level Python globals. owner: codex. Result: added compact global-variable symbol records for simple top-level assignments and annotated assignments. - [ ] Extract top-level TypeScript classes, functions, interfaces, type aliases, enums, and globals. - [x] Extract imports for Python. owner: codex. Result: compact `ImportRecord` extraction for `import`, `from`, and future imports. - [ ] Extract imports and exports for TypeScript. @@ -243,3 +243,4 @@ Recommended task format: - [x] 2026-06-18: Added Python-shell Rust index integration behind `CodebaseConfig(graph_backend=...)`, selected-file PyO3 indexing from `RepoOperator`, and a facade benchmark. owner: codex. 
Notes: selected-file facade matched Python's 1129-file discovery and ran 4.7x faster with 4.7x lower process max RSS than Python parse/object materialization on this checkout. - [x] 2026-06-18: Added compact Rust Python import resolution records. owner: codex. Notes: the Python-facing Rust facade now emits 432 internal import-resolution records on this checkout and remains 4.3x faster with 4.6x lower process max RSS than Python parse/object materialization. - [x] 2026-06-18: Added typed Python facade accessors and a deterministic compact graph snapshot for record-level parity testing. owner: codex. Notes: this prepares the large-repo golden import/reference graph workflow. +- [x] 2026-06-18: Added compact Rust extraction for top-level Python globals and symbol-target import resolution for imported globals. owner: codex. diff --git a/rust-rewrite/tools/compare_rust_python_index.py b/rust-rewrite/tools/compare_rust_python_index.py index 8eafac430..f2970ee85 100644 --- a/rust-rewrite/tools/compare_rust_python_index.py +++ b/rust-rewrite/tools/compare_rust_python_index.py @@ -188,6 +188,7 @@ def print_human(report: dict[str, Any]) -> None: f"process_wall={comparison['rust_process_wall_seconds']:.3f}s " f"rss_peak={comparison['rust_sampled_rss_peak_mb']:.1f} MB " f"files={rust_summary['files']} symbols={rust_summary['symbols']} " + f"global_variables={rust_summary['global_variables']} " f"imports={rust_summary['imports']} import_resolutions={rust_summary['import_resolutions']}" ) print( diff --git a/rust-rewrite/tools/measure_rust_facade.py b/rust-rewrite/tools/measure_rust_facade.py index 665d74d79..6fce70f23 100644 --- a/rust-rewrite/tools/measure_rust_facade.py +++ b/rust-rewrite/tools/measure_rust_facade.py @@ -96,6 +96,7 @@ def print_human(report: dict) -> None: "summary: " f"files={summary['files']} " f"symbols={summary['symbols']} " + f"global_variables={summary['global_variables']} " f"imports={summary['imports']} " 
f"import_resolutions={summary['import_resolutions']}" ) diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py index d549e2b55..8d52cbd0b 100644 --- a/src/graph_sitter/codebase/rust_backend.py +++ b/src/graph_sitter/codebase/rust_backend.py @@ -38,6 +38,7 @@ class RustIndexSummary: symbols: int classes: int functions: int + global_variables: int imports: int import_resolutions: int bytes: int diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index 4971e2775..c573ed7ae 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -15,6 +15,7 @@ def as_dict(self): "symbols": 2, "classes": 1, "functions": 1, + "global_variables": 0, "imports": 1, "import_resolutions": 1, "bytes": 64, @@ -174,6 +175,7 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.ctx.rust_index.summary.files == 1 assert codebase.ctx.rust_index.summary.classes == 1 assert codebase.ctx.rust_index.summary.functions == 1 + assert codebase.ctx.rust_index.summary.global_variables == 0 assert codebase.ctx.rust_index.summary.imports == 1 assert codebase.ctx.rust_index.summary.import_resolutions == 1 assert codebase.ctx.rust_index.files[0].path == "pkg/service.py" From 87bf3c5502207c1b120d4ffbfff5b73bcb120f8f Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 13:58:50 -0700 Subject: [PATCH 012/228] Skip Python graph in Rust compact mode --- rust-rewrite/benchmarks.md | 15 +++ rust-rewrite/python-compat.md | 3 +- rust-rewrite/strategy.md | 9 +- .../tools/measure_codebase_rust_backend.py | 125 ++++++++++++++++++ src/graph_sitter/codebase/codebase_context.py | 9 +- src/graph_sitter/core/codebase.py | 41 ++++++ tests/unit/sdk/codebase/test_rust_backend.py | 10 ++ 7 files changed, 207 insertions(+), 5 deletions(-) create mode 100644 rust-rewrite/tools/measure_codebase_rust_backend.py diff --git 
a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 2807d5ae4..a5a7cc2d1 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -98,6 +98,13 @@ PYTHONPATH=/path/to/dir/containing/graph_sitter_py_extension \ Use `--raw-rust-walk` to measure Rust's standalone recursive walk instead of Python-selected file discovery. +`rust-rewrite/tools/measure_codebase_rust_backend.py` measures actual `Codebase(...)` construction with `CodebaseConfig(graph_backend="rust", rust_fallback="error")`. It verifies that the lazy Python graph is blocked and reports compact Rust record counts: + +```bash +PYTHONPATH=/path/to/dir/containing/graph_sitter_py_extension \ + uv run python rust-rewrite/tools/measure_codebase_rust_backend.py . --json +``` + ## Metrics The JSON report includes: @@ -155,6 +162,14 @@ Commands were run on this branch on 2026-06-18 after adding selected-file PyO3 i This shell-facing number is intentionally more conservative than the standalone Rust process benchmark because it includes Python startup, imports, and repo file discovery. The important result is that the selected-file integration preserves Python file-discovery parity for the current repo while still cutting parse/index/import-resolution wall time and process max RSS substantially for the implemented compact graph slice. +## Rust `Codebase` Construction Evidence + +These measurements use real `Codebase(...)` construction with `CodebaseConfig(graph_backend="rust", rust_fallback="error")`. In this mode, once the compact Rust index builds successfully, `CodebaseContext` does not build the eager Python graph. Existing Python graph APIs are intentionally blocked for now rather than silently materializing the memory-heavy graph. 
+ +| Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust globals | Rust import resolutions | Python graph blocked | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | +| `graph-sitter` repo checkout | `--disable-graph` | 2.777s | 535.6 MB | 0.696s | 115.8 MB | 1130 | 1130 | 801 | 432 | yes | 3.992x | 4.627x | + Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, and internal import-resolution records for indexed Python modules. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index d81a226fd..ee7319340 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -124,7 +124,8 @@ Current implemented bridge status: - `RustIndexBackend.files`, `.symbols`, `.imports`, and `.import_resolutions` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. -- `Codebase.rust_index_summary` exposes the attached compact summary for shell smoke checks. +- `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Existing Python graph APIs are blocked in that mode until lazy compatibility handles exist. 
+- `Codebase.rust_index_summary`, `.rust_files`, `.rust_symbols`, `.rust_classes`, `.rust_functions`, `.rust_global_vars`, `.rust_imports`, and `.rust_import_resolutions` expose the attached compact records for shell smoke checks and golden tests. - This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet return stable Python compatibility handles. Rust can keep typed IDs internally. Python needs a compatibility `node_id: int`, so `RustGraphBackend` should maintain a per-context mapping between Python node IDs and typed Rust refs: diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index cfd75d305..10865a57f 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -127,8 +127,8 @@ Recommended task format: - [x] Inventory all public `SourceFile`, `Symbol`, `Import`, `Export`, and `Directory` APIs used by tests/docs. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [x] Define P0 compatibility surface for the first Rust backend slice. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [ ] Define large-repo success targets for memory and time. -- [ ] Select pinned large Python repo commits for golden parity and latency benchmarks. Notes: Airflow is a good first candidate. -- [ ] Build golden reference/import/dependency graph snapshots for the pinned large Python repo commits. +- [ ] Select pinned large Python repo commits for golden parity and latency benchmarks. Notes: Airflow is a good first candidate; record the exact upstream URL, commit SHA, Python version, and checkout/bootstrap command. +- [ ] Build golden reference/import/dependency graph snapshots for the pinned large Python repo commits. Notes: fixtures should assert file/module records, import graph edges, symbol reference graph edges, dependency graph edges, and deterministic sort order. 
- [x] Draft compact Rust data model with module boundaries and Python integration points. owner: Pasteur. Result: documented in `rust-rewrite/data-model.md`. - [ ] Draft full Rust engine RFC with module boundaries and Python integration points. - [ ] Decide build tooling: `maturin`, setuptools-rust, or hatch custom hook. @@ -139,11 +139,13 @@ Recommended task format: - [x] Add PyO3 module import smoke test. owner: codex. Result: built the extension module and imported it from Python, then indexed this repo through `index_python_path`. - [x] Add `graph_backend` config flag with default `python`. owner: codex. Result: added `GraphBackend` and `RustFallbackMode` to `CodebaseConfig`. - [x] Add compact Rust index facade that can be constructed from `CodebaseContext`. owner: codex. Result: `ctx.rust_index` builds through the optional PyO3 extension when `graph_backend` is `rust` or `auto`. +- [x] Skip eager Python graph construction in opt-in Rust compact mode. owner: codex. Result: `CodebaseConfig(graph_backend="rust")` leaves the Python graph unbuilt when the Rust compact index succeeds. - [ ] Add full Rust engine facade object that can back existing `CodebaseContext` graph query APIs. - [x] Add a minimal debug API returning engine version and enabled features. owner: Beauvoir. Result: added Rust `Engine::debug_info` and feature-gated PyO3 bindings. - [ ] Add CI job that builds the Rust extension on supported Python versions. - [x] Add benchmark command comparing Python backend with Rust compact indexer. owner: codex. Result: added `rust-rewrite/tools/compare_rust_python_index.py`. - [x] Add benchmark command for the Python-facing Rust facade. owner: codex. Result: added `rust-rewrite/tools/measure_rust_facade.py`. +- [x] Add benchmark command for real `Codebase` construction with the Rust compact backend. owner: codex. Result: added `rust-rewrite/tools/measure_codebase_rust_backend.py`. 
- [ ] Add benchmark command that can select full `Codebase` `--backend python|rust` once Rust backend is wired into Python. ## Phase 2: Parser And Compact Index Vertical Slice @@ -216,7 +218,7 @@ Recommended task format: - [ ] Run full unit suite with Python backend. - [ ] Run full unit suite with Rust backend where supported. - [ ] Add large-repo memory regression benchmark to CI or nightly. -- [ ] Add pinned large-repo parity test for reference graph, import graph, dependency graph, and latency/RSS. +- [ ] Add pinned large-repo parity test for reference graph, import graph, dependency graph, and latency/RSS. Notes: run against the exact checked-out commit and emit backend, wall time, max RSS, file count, node/edge counts, and mismatch summaries. - [ ] Add feature flag documentation. - [ ] Add migration notes for unsupported APIs. - [ ] Decide default backend criteria. @@ -244,3 +246,4 @@ Recommended task format: - [x] 2026-06-18: Added compact Rust Python import resolution records. owner: codex. Notes: the Python-facing Rust facade now emits 432 internal import-resolution records on this checkout and remains 4.3x faster with 4.6x lower process max RSS than Python parse/object materialization. - [x] 2026-06-18: Added typed Python facade accessors and a deterministic compact graph snapshot for record-level parity testing. owner: codex. Notes: this prepares the large-repo golden import/reference graph workflow. - [x] 2026-06-18: Added compact Rust extraction for top-level Python globals and symbol-target import resolution for imported globals. owner: codex. +- [x] 2026-06-18: Made opt-in `CodebaseConfig(graph_backend="rust")` skip eager Python graph construction and expose compact `rust_*` record properties on `Codebase`. owner: codex. Notes: current checkout constructs 4.0x faster with 4.6x lower process max RSS than Python parse/object materialization while blocking lazy Python graph materialization. 
diff --git a/rust-rewrite/tools/measure_codebase_rust_backend.py b/rust-rewrite/tools/measure_codebase_rust_backend.py new file mode 100644 index 000000000..2c522c033 --- /dev/null +++ b/rust-rewrite/tools/measure_codebase_rust_backend.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import platform +import resource +import sys +import time +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +SRC_ROOT = REPO_ROOT / "src" +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from graph_sitter.configs.models.codebase import CodebaseConfig, GraphBackend, RustFallbackMode # noqa: E402 +from graph_sitter.core.codebase import Codebase # noqa: E402 + + +def bytes_to_mb(value: float) -> float: + return value / (1024 * 1024) + + +def max_rss_bytes() -> int: + rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + if sys.platform == "darwin": + return int(rss) + return int(rss * 1024) + + +def make_report(repo: Path) -> dict: + config = CodebaseConfig(graph_backend=GraphBackend.RUST, rust_fallback=RustFallbackMode.ERROR) + start = time.perf_counter() + codebase = Codebase(str(repo), language="python", config=config) + wall = time.perf_counter() - start + python_graph_blocked = False + try: + len(codebase.files) + except RuntimeError: + python_graph_blocked = True + + summary = codebase.rust_index_summary + return { + "metadata": { + "repo_path": str(repo), + "python": sys.version, + "platform": platform.platform(), + "python_graph_blocked": python_graph_blocked, + }, + "totals": { + "wall_seconds": round(wall, 6), + "max_rss_mb": round(bytes_to_mb(max_rss_bytes()), 3), + }, + "summary": { + "files": summary.files, + "symbols": summary.symbols, + "classes": summary.classes, + "functions": summary.functions, + "global_variables": summary.global_variables, + "imports": summary.imports, + "import_resolutions": summary.import_resolutions, + "bytes": summary.bytes, + 
"lines": summary.lines, + "files_with_errors": summary.files_with_errors, + }, + "records": { + "rust_files": len(codebase.rust_files), + "rust_symbols": len(codebase.rust_symbols), + "rust_classes": len(codebase.rust_classes), + "rust_functions": len(codebase.rust_functions), + "rust_global_vars": len(codebase.rust_global_vars), + "rust_imports": len(codebase.rust_imports), + "rust_import_resolutions": len(codebase.rust_import_resolutions), + }, + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Measure Codebase construction with the opt-in compact Rust backend.") + parser.add_argument("repo", nargs="?", default=".", help="Path to the Python repository to index.") + parser.add_argument("--output", type=Path, help="Optional path to write JSON report.") + parser.add_argument("--json", action="store_true", help="Print JSON report instead of a human summary.") + return parser.parse_args() + + +def print_human(report: dict) -> None: + totals = report["totals"] + summary = report["summary"] + records = report["records"] + print(f"repo: {report['metadata']['repo_path']}") + print(f"rust Codebase: wall={totals['wall_seconds']:.3f}s max_rss={totals['max_rss_mb']:.1f} MB") + print(f"python graph blocked: {report['metadata']['python_graph_blocked']}") + print( + "summary: " + f"files={summary['files']} " + f"symbols={summary['symbols']} " + f"global_variables={summary['global_variables']} " + f"imports={summary['imports']} " + f"import_resolutions={summary['import_resolutions']}" + ) + print( + "records: " + f"files={records['rust_files']} " + f"symbols={records['rust_symbols']} " + f"imports={records['rust_imports']} " + f"import_resolutions={records['rust_import_resolutions']}" + ) + + +def main() -> int: + args = parse_args() + report = make_report(Path(args.repo).expanduser().resolve()) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(report, indent=2, 
sort_keys=True) + "\n", encoding="utf-8") + if args.json: + print(json.dumps(report, indent=2, sort_keys=True)) + else: + print_human(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/graph_sitter/codebase/codebase_context.py b/src/graph_sitter/codebase/codebase_context.py index 19c728ade..c9636c0a4 100644 --- a/src/graph_sitter/codebase/codebase_context.py +++ b/src/graph_sitter/codebase/codebase_context.py @@ -207,7 +207,7 @@ def __init__( self._build_rust_index_if_configured() # Build the graph - if not self.config.exp_lazy_graph and self.config.use_pink != PinkMode.ALL_FILES: + if not self.rust_compact_mode and not self.config.exp_lazy_graph and self.config.use_pink != PinkMode.ALL_FILES: self.build_graph(context.repo_operator) try: self.synced_commit = context.repo_operator.head_commit @@ -222,6 +222,10 @@ def __init__( def __repr__(self): return self.__class__.__name__ + @property + def rust_compact_mode(self) -> bool: + return self.config.graph_backend == GraphBackend.RUST and self.rust_index is not None + def _build_rust_index_if_configured(self) -> None: if self.config.graph_backend == GraphBackend.PYTHON: return @@ -268,6 +272,9 @@ def _handle_rust_backend_unavailable(self, reason: str) -> None: @cached_property def _graph(self) -> PyDiGraph[Importable, Edge]: if not self.__graph_ready: + if self.rust_compact_mode: + msg = "Python graph is not built when CodebaseConfig(graph_backend='rust') uses the compact Rust backend; use rust_* record APIs or select graph_backend='python'" + raise RuntimeError(msg) logger.info("Lazily Computing Graph") self.build_graph(self.projects[0].repo_operator) return self.__graph diff --git a/src/graph_sitter/core/codebase.py b/src/graph_sitter/core/codebase.py index 949f37e1d..d9d7ab610 100644 --- a/src/graph_sitter/core/codebase.py +++ b/src/graph_sitter/core/codebase.py @@ -273,6 +273,47 @@ def rust_index_summary(self): return None return self.ctx.rust_index.summary + @property + 
@noapidoc + def rust_files(self): + return self._require_rust_index().files + + @property + @noapidoc + def rust_symbols(self): + return self._require_rust_index().symbols + + @property + @noapidoc + def rust_classes(self): + return [symbol for symbol in self.rust_symbols if symbol.kind == "class"] + + @property + @noapidoc + def rust_functions(self): + return [symbol for symbol in self.rust_symbols if symbol.kind == "function"] + + @property + @noapidoc + def rust_global_vars(self): + return [symbol for symbol in self.rust_symbols if symbol.kind == "global_variable"] + + @property + @noapidoc + def rust_imports(self): + return self._require_rust_index().imports + + @property + @noapidoc + def rust_import_resolutions(self): + return self._require_rust_index().import_resolutions + + def _require_rust_index(self): + if self.ctx.rust_index is None: + msg = "Rust compact index is unavailable; construct Codebase with CodebaseConfig(graph_backend='rust')" + raise RuntimeError(msg) + return self.ctx.rust_index + #################################################################################################################### # NODES #################################################################################################################### diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index c573ed7ae..d6cc6a30e 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -168,8 +168,10 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): tmpdir=tmp_path, files={"pkg/service.py": "import os\n\nclass Service:\n pass\n\ndef helper():\n return os.getcwd()\n"}, config=config, + verify_input=False, verify_output=False, ) as codebase: + assert codebase.ctx.rust_compact_mode is True assert codebase.ctx.rust_index is not None assert codebase.ctx.rust_index.engine_version == "test-rust-engine" assert codebase.ctx.rust_index.summary.files 
== 1 @@ -183,8 +185,15 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.ctx.rust_index.imports[0].name == "os" assert codebase.ctx.rust_index.import_resolutions[0].target_symbol_id == 0 assert codebase.rust_index_summary == codebase.ctx.rust_index.summary + assert codebase.rust_files[0].path == "pkg/service.py" + assert codebase.rust_classes[0].name == "Service" + assert codebase.rust_functions[0].name == "helper" + assert codebase.rust_imports[0].name == "os" + assert codebase.rust_import_resolutions[0].target_symbol_id == 0 assert indexed_paths == [str(tmp_path.resolve())] assert selected_paths == [["pkg/service.py"]] + with pytest.raises(RuntimeError, match="Python graph is not built"): + len(codebase.files) def test_missing_rust_extension_falls_back_to_python_graph(monkeypatch, tmp_path): @@ -197,6 +206,7 @@ def test_missing_rust_extension_falls_back_to_python_graph(monkeypatch, tmp_path config=config, verify_output=False, ) as codebase: + assert codebase.ctx.rust_compact_mode is False assert codebase.ctx.rust_index is None assert codebase.rust_index_summary is None assert "graph_sitter_py" in codebase.ctx.rust_backend_error From 748532f06d3581b9797add8b31e15b4c9505bbbb Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:07:21 -0700 Subject: [PATCH 013/228] Add Rust compact read handles --- rust-rewrite/benchmarks.md | 8 +- rust-rewrite/python-compat.md | 6 +- rust-rewrite/strategy.md | 10 +- .../tools/measure_codebase_rust_backend.py | 17 +- src/graph_sitter/codebase/rust_backend.py | 416 ++++++++++++++++++ src/graph_sitter/core/codebase.py | 29 ++ tests/unit/sdk/codebase/test_rust_backend.py | 20 +- 7 files changed, 494 insertions(+), 12 deletions(-) diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index a5a7cc2d1..73a852c37 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -164,11 +164,11 @@ This shell-facing number is intentionally more conservative than 
the standalone ## Rust `Codebase` Construction Evidence -These measurements use real `Codebase(...)` construction with `CodebaseConfig(graph_backend="rust", rust_fallback="error")`. In this mode, once the compact Rust index builds successfully, `CodebaseContext` does not build the eager Python graph. Existing Python graph APIs are intentionally blocked for now rather than silently materializing the memory-heavy graph. +These measurements use real `Codebase(...)` construction with `CodebaseConfig(graph_backend="rust", rust_fallback="error")`. In this mode, once the compact Rust index builds successfully, `CodebaseContext` does not build the eager Python graph. The Rust path now exercises public `Codebase.files`, `symbols`, `classes`, `functions`, `global_vars`, and `imports` compatibility handles while `CodebaseContext.nodes` remains blocked so the old graph cannot be materialized accidentally. -| Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust globals | Rust import resolutions | Python graph blocked | Wall ratio | RSS ratio | -| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.777s | 535.6 MB | 0.696s | 115.8 MB | 1130 | 1130 | 801 | 432 | yes | 3.992x | 4.627x | +| Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Python graph blocked | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | +| `graph-sitter` repo checkout | `--disable-graph` | 2.728s | 537.5 MB | 0.510s | 116.0 MB | 1130 | 1130 | 3954 | 6460 | 432 | yes | 5.348x | 4.636x | Important caveats: diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index ee7319340..8e290798a 100644 --- 
a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -124,9 +124,11 @@ Current implemented bridge status: - `RustIndexBackend.files`, `.symbols`, `.imports`, and `.import_resolutions` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. -- `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Existing Python graph APIs are blocked in that mode until lazy compatibility handles exist. +- `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. - `Codebase.rust_index_summary`, `.rust_files`, `.rust_symbols`, `.rust_classes`, `.rust_functions`, `.rust_global_vars`, `.rust_imports`, and `.rust_import_resolutions` expose the attached compact records for shell smoke checks and golden tests. -- This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet return stable Python compatibility handles. +- `Codebase.files`, `.symbols`, `.classes`, `.functions`, `.global_vars`, `.imports`, `get_file(...)`, `get_symbol(...)`, `get_class(...)`, and `get_function(...)` now return lightweight compact handles in strict Rust mode for Python codebases. +- Compact file handles expose basic identity/content plus file-local `symbols`, `classes`, `functions`, `global_vars`, and `imports`. 
Compact symbol and import handles expose basic identity/source and implemented import-resolution targets. Edit-heavy and dependency/reference graph methods are still unsupported until the full lazy engine facade exists. +- This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet provide full P0 `SourceFile`, `Symbol`, or `Import` parity. Rust can keep typed IDs internally. Python needs a compatibility `node_id: int`, so `RustGraphBackend` should maintain a per-context mapping between Python node IDs and typed Rust refs: diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 10865a57f..6132b6b13 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -189,16 +189,17 @@ Recommended task format: ## Phase 4: Lazy Python Compatibility Layer - [x] Plan Python/PyO3 compatibility layer and lazy handle migration. owner: Wegener. Result: documented in `rust-rewrite/python-compat.md`. -- [ ] Define Python handle base class that stores engine reference and stable ID. +- [x] Add temporary Python compact handle base for Rust record-backed read APIs. owner: codex. Result: added `RustCompactHandle` with stable compact node IDs for files, symbols, and imports. - [ ] Implement Rust-backed file handles for P0 `SourceFile` APIs. - [ ] Implement Rust-backed symbol handles for P0 `Symbol`, `Class`, and `Function` APIs. - [ ] Implement Rust-backed import handles for P0 `Import` APIs. - [ ] Implement Rust-backed export handles for P0 TypeScript `Export` APIs. -- [ ] Make `Codebase.files` return lazy handles under Rust backend. -- [ ] Make `Codebase.symbols`, `classes`, `functions`, `imports`, and `exports` return lazy handles under Rust backend. +- [x] Make `Codebase.files` return compact read handles under the Python Rust backend. owner: codex. 
+- [x] Make `Codebase.symbols`, `classes`, `functions`, `global_vars`, and `imports` return compact read handles under the Python Rust backend. owner: codex. +- [ ] Make TypeScript `Codebase.exports`, `interfaces`, and `types` return lazy handles under Rust backend. - [ ] Preserve existing sorting behavior for public query results. - [ ] Add fallback path to Python backend for unsupported methods. -- [ ] Add tests that verify no full Python object graph is materialized for simple list queries. +- [x] Add tests that verify no full Python object graph is materialized for simple list queries. owner: codex. ## Phase 5: Incremental Sync And Edits @@ -247,3 +248,4 @@ Recommended task format: - [x] 2026-06-18: Added typed Python facade accessors and a deterministic compact graph snapshot for record-level parity testing. owner: codex. Notes: this prepares the large-repo golden import/reference graph workflow. - [x] 2026-06-18: Added compact Rust extraction for top-level Python globals and symbol-target import resolution for imported globals. owner: codex. - [x] 2026-06-18: Made opt-in `CodebaseConfig(graph_backend="rust")` skip eager Python graph construction and expose compact `rust_*` record properties on `Codebase`. owner: codex. Notes: current checkout constructs 4.0x faster with 4.6x lower process max RSS than Python parse/object materialization while blocking lazy Python graph materialization. +- [x] 2026-06-18: Added lightweight Rust compact handles for Python `Codebase.files`, `symbols`, `classes`, `functions`, `global_vars`, `imports`, and basic `get_*` queries. owner: codex. Notes: current checkout constructs and exercises public read handles 5.3x faster with 4.6x lower process max RSS than Python parse/object materialization while keeping `CodebaseContext.nodes` blocked. 
diff --git a/rust-rewrite/tools/measure_codebase_rust_backend.py b/rust-rewrite/tools/measure_codebase_rust_backend.py index 2c522c033..c1d5a992a 100644 --- a/rust-rewrite/tools/measure_codebase_rust_backend.py +++ b/rust-rewrite/tools/measure_codebase_rust_backend.py @@ -36,7 +36,7 @@ def make_report(repo: Path) -> dict: wall = time.perf_counter() - start python_graph_blocked = False try: - len(codebase.files) + len(codebase.ctx.nodes) except RuntimeError: python_graph_blocked = True @@ -73,6 +73,14 @@ def make_report(repo: Path) -> dict: "rust_imports": len(codebase.rust_imports), "rust_import_resolutions": len(codebase.rust_import_resolutions), }, + "compat_handles": { + "files": len(codebase.files), + "symbols": len(codebase.symbols), + "classes": len(codebase.classes), + "functions": len(codebase.functions), + "global_vars": len(codebase.global_vars), + "imports": len(codebase.imports), + }, } @@ -88,6 +96,7 @@ def print_human(report: dict) -> None: totals = report["totals"] summary = report["summary"] records = report["records"] + compat_handles = report["compat_handles"] print(f"repo: {report['metadata']['repo_path']}") print(f"rust Codebase: wall={totals['wall_seconds']:.3f}s max_rss={totals['max_rss_mb']:.1f} MB") print(f"python graph blocked: {report['metadata']['python_graph_blocked']}") @@ -106,6 +115,12 @@ def print_human(report: dict) -> None: f"imports={records['rust_imports']} " f"import_resolutions={records['rust_import_resolutions']}" ) + print( + "compat handles: " + f"files={compat_handles['files']} " + f"symbols={compat_handles['symbols']} " + f"imports={compat_handles['imports']}" + ) def main() -> int: diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py index 8d52cbd0b..4ae2fca21 100644 --- a/src/graph_sitter/codebase/rust_backend.py +++ b/src/graph_sitter/codebase/rust_backend.py @@ -9,6 +9,11 @@ if TYPE_CHECKING: from collections.abc import Sequence + from graph_sitter.codebase.codebase_context 
import CodebaseContext + +from graph_sitter._proxy import proxy_property +from graph_sitter.enums import ImportType, NodeType, SymbolType + class RustBackendUnavailableError(RuntimeError): """Raised when the optional Rust backend extension cannot be loaded.""" @@ -153,6 +158,14 @@ class RustIndexBackend: _symbols: list[RustSymbolRecord] | None = None _imports: list[RustImportRecord] | None = None _import_resolutions: list[RustImportResolutionRecord] | None = None + _file_handles: list[RustCompactFile] | None = None + _symbol_handles: list[RustCompactSymbol] | None = None + _import_handles: list[RustCompactImport] | None = None + _file_handles_by_id: dict[int, RustCompactFile] | None = None + _symbol_handles_by_id: dict[int, RustCompactSymbol] | None = None + _symbols_by_file_id: dict[int, list[RustCompactSymbol]] | None = None + _imports_by_file_id: dict[int, list[RustCompactImport]] | None = None + _import_resolutions_by_import_id: dict[int, RustImportResolutionRecord] | None = None @classmethod def build(cls, repo_path: str | Path, file_paths: Sequence[str] | None = None) -> RustIndexBackend: @@ -203,5 +216,408 @@ def import_resolutions(self) -> list[RustImportResolutionRecord]: self._import_resolutions = [RustImportResolutionRecord.from_dict(record) for record in json.loads(self.index.import_resolutions_json())] return self._import_resolutions + @property + def file_handles(self) -> list[RustCompactFile]: + if self._file_handles is None: + self._file_handles = [RustCompactFile(self, record) for record in self.files] + return self._file_handles + + @property + def symbol_handles(self) -> list[RustCompactSymbol]: + if self._symbol_handles is None: + self._symbol_handles = [RustCompactSymbol(self, record) for record in self.symbols] + return self._symbol_handles + + @property + def import_handles(self) -> list[RustCompactImport]: + if self._import_handles is None: + self._import_handles = [RustCompactImport(self, record) for record in self.imports] + return 
self._import_handles + + def get_file_handle(self, filepath: str, *, ignore_case: bool = False) -> RustCompactFile | None: + path = Path(filepath.replace("\\", "/")) + if path.is_absolute(): + try: + path = path.resolve().relative_to(self.repo_path) + except ValueError: + return None + normalized = path.as_posix() + if normalized.startswith("./"): + normalized = normalized[2:] + if ignore_case: + normalized = normalized.lower() + return next((file for file in self.file_handles if file.filepath.lower() == normalized), None) + return next((file for file in self.file_handles if file.filepath == normalized), None) + + def symbols_for_file(self, file_id: int) -> list[RustCompactSymbol]: + if self._symbols_by_file_id is None: + symbols_by_file_id: dict[int, list[RustCompactSymbol]] = {} + for symbol in self.symbol_handles: + symbols_by_file_id.setdefault(symbol.record.file_id, []).append(symbol) + self._symbols_by_file_id = symbols_by_file_id + return self._symbols_by_file_id.get(file_id, []) + + def imports_for_file(self, file_id: int) -> list[RustCompactImport]: + if self._imports_by_file_id is None: + imports_by_file_id: dict[int, list[RustCompactImport]] = {} + for import_handle in self.import_handles: + imports_by_file_id.setdefault(import_handle.record.file_id, []).append(import_handle) + self._imports_by_file_id = imports_by_file_id + return self._imports_by_file_id.get(file_id, []) + + def file_handle_by_id(self, file_id: int) -> RustCompactFile | None: + if self._file_handles_by_id is None: + self._file_handles_by_id = {file.record.id: file for file in self.file_handles} + return self._file_handles_by_id.get(file_id) + + def symbol_handle_by_id(self, symbol_id: int) -> RustCompactSymbol | None: + if self._symbol_handles_by_id is None: + self._symbol_handles_by_id = {symbol.record.id: symbol for symbol in self.symbol_handles} + return self._symbol_handles_by_id.get(symbol_id) + + def import_resolution_for_import(self, import_id: int) -> RustImportResolutionRecord 
| None: + if self._import_resolutions_by_import_id is None: + self._import_resolutions_by_import_id = {resolution.import_id: resolution for resolution in self.import_resolutions} + return self._import_resolutions_by_import_id.get(import_id) + def to_json(self) -> str: return str(self.index.to_json()) + + +@dataclass(frozen=True) +class _RustCompatTreeNode: + range: RustSourceRange + kind_id: int + + @property + def start_byte(self) -> int: + return self.range.start_byte + + @property + def end_byte(self) -> int: + return self.range.end_byte + + @property + def start_point(self) -> tuple[int, int]: + return (self.range.start_row, self.range.start_column) + + @property + def end_point(self) -> tuple[int, int]: + return (self.range.end_row, self.range.end_column) + + +@dataclass(frozen=True) +class RustCompactName: + source: str + + def __str__(self) -> str: + return self.source + + +class RustCompactHandle: + node_type: NodeType + + def __init__(self, backend: RustIndexBackend, node_id: int, source_range: RustSourceRange) -> None: + self.backend = backend + self.ctx: CodebaseContext | None = None + self.node_id = node_id + self.ts_node = _RustCompatTreeNode(source_range, int(self.node_type)) + + def __hash__(self) -> int: + return hash((self.node_type, self.node_id)) + + @property + def start_byte(self) -> int: + return self.ts_node.start_byte + + @property + def end_byte(self) -> int: + return self.ts_node.end_byte + + @property + def start_point(self) -> tuple[int, int]: + return self.ts_node.start_point + + @property + def end_point(self) -> tuple[int, int]: + return self.ts_node.end_point + + @property + def range(self) -> RustSourceRange: + return self.ts_node.range + + def _unsupported(self, method: str) -> RuntimeError: + return RuntimeError(f"{method} is not supported by the compact Rust handle yet") + + +class RustCompactFile(RustCompactHandle): + node_type = NodeType.FILE + + def __init__(self, backend: RustIndexBackend, record: RustFileRecord) -> None: + 
self.record = record + super().__init__(backend, record.id, record.root_range) + self.file_path = record.path + self.filepath = record.path + self.path = backend.repo_path / record.path + self.name = self.path.stem + self._binary = False + + def __repr__(self) -> str: + return f"RustCompactFile(filepath={self.filepath!r})" + + @property + def file(self) -> RustCompactFile: + return self + + @property + def module_name(self) -> str | None: + return self.record.module_name + + @property + def extension(self) -> str: + return self.path.suffix + + @property + def is_binary(self) -> bool: + return self._binary + + @property + def content_bytes(self) -> bytes: + return self.path.read_bytes() + + @property + def content(self) -> str: + return self.content_bytes.decode("utf-8") + + @property + def source(self) -> str: + return self.content + + @property + def imports(self) -> list[RustCompactImport]: + return self.backend.imports_for_file(self.record.id) + + @proxy_property + def symbols(self, nested: bool = False) -> list[RustCompactSymbol]: + if nested: + return self.backend.symbols_for_file(self.record.id) + return [symbol for symbol in self.backend.symbols_for_file(self.record.id) if symbol.is_top_level] + + @property + def global_vars(self) -> list[RustCompactSymbol]: + return [symbol for symbol in self.symbols if symbol.symbol_type == SymbolType.GlobalVar] + + @property + def classes(self) -> list[RustCompactSymbol]: + return [symbol for symbol in self.symbols if symbol.symbol_type == SymbolType.Class] + + @property + def functions(self) -> list[RustCompactSymbol]: + return [symbol for symbol in self.symbols if symbol.symbol_type == SymbolType.Function] + + def get_symbol(self, name: str) -> RustCompactSymbol | None: + return next((symbol for symbol in self.symbols if symbol.name == name), None) + + def get_global_var(self, name: str) -> RustCompactSymbol | None: + return next((symbol for symbol in self.global_vars if symbol.name == name), None) + + def 
get_class(self, name: str) -> RustCompactSymbol | None: + return next((symbol for symbol in self.classes if symbol.name == name), None) + + def get_function(self, name: str) -> RustCompactSymbol | None: + return next((symbol for symbol in self.functions if symbol.name == name), None) + + def has_import(self, symbol_alias: str) -> bool: + return any(import_handle.name == symbol_alias for import_handle in self.imports) + + def get_import(self, symbol_alias: str) -> RustCompactImport | None: + return next((import_handle for import_handle in self.imports if import_handle.name == symbol_alias), None) + + +class RustCompactSymbol(RustCompactHandle): + node_type = NodeType.SYMBOL + + def __init__(self, backend: RustIndexBackend, record: RustSymbolRecord) -> None: + self.record = record + super().__init__(backend, record.id, record.range) + self.name = record.name + self._name_node = RustCompactName(record.name) + self.is_top_level = True + + def __repr__(self) -> str: + return f"RustCompactSymbol(name={self.name!r}, filepath={self.filepath!r})" + + @property + def symbol_type(self) -> SymbolType: + return { + "class": SymbolType.Class, + "function": SymbolType.Function, + "global_variable": SymbolType.GlobalVar, + }[self.record.kind] + + @property + def file(self) -> RustCompactFile: + file = self.backend.file_handle_by_id(self.record.file_id) + if file is None: + msg = f"Rust compact symbol {self.record.id} references missing file {self.record.file_id}" + raise RuntimeError(msg) + return file + + @property + def filepath(self) -> str: + return self.file.filepath + + @property + def full_name(self) -> str: + return self.name + + @property + def source(self) -> str: + return self.file.content_bytes[self.start_byte : self.end_byte].decode("utf-8") + + @property + def extended_source(self) -> str: + return self.source + + @property + def extended_nodes(self) -> list[RustCompactSymbol]: + return [self] + + @property + def dependencies(self) -> list[object]: + return [] + + 
@property + def usages(self) -> list[object]: + return [] + + @property + def symbol_usages(self) -> list[object]: + return [] + + @property + def descendant_symbols(self) -> list[RustCompactSymbol]: + return [] + + @property + def function_calls(self) -> list[object]: + return [] + + @property + def is_exported(self) -> bool: + return False + + def get_name(self) -> str: + return self.name + + +class RustCompactImport(RustCompactHandle): + node_type = NodeType.IMPORT + + def __init__(self, backend: RustIndexBackend, record: RustImportRecord) -> None: + self.record = record + super().__init__(backend, record.id, record.range) + self.module = RustCompactName(self._module_source()) if self._module_source() is not None else None + self.symbol_name = RustCompactName(record.name) if record.name is not None else None + alias = record.alias or record.name + self.alias = RustCompactName(alias) if alias is not None else None + self.import_type = self._import_type() + self.import_statement = self + + def __repr__(self) -> str: + return f"RustCompactImport(source={self.source!r}, filepath={self.filepath!r})" + + @property + def file(self) -> RustCompactFile: + file = self.backend.file_handle_by_id(self.record.file_id) + if file is None: + msg = f"Rust compact import {self.record.id} references missing file {self.record.file_id}" + raise RuntimeError(msg) + return file + + @property + def filepath(self) -> str: + return self.file.filepath + + @property + def source(self) -> str: + return self.file.content_bytes[self.start_byte : self.end_byte].decode("utf-8") + + @property + def name(self) -> str | None: + return self.alias.source if self.alias is not None else None + + @property + def import_specifier(self) -> str | None: + if self.symbol_name is not None: + return self.symbol_name.source + return self.module.source if self.module is not None else None + + @property + def from_file(self) -> RustCompactFile | None: + resolution = 
self.backend.import_resolution_for_import(self.record.id) + if resolution is None: + return None + return self.backend.file_handle_by_id(resolution.target_file_id) + + @property + def to_file(self) -> RustCompactFile: + return self.file + + @property + def imported_symbol(self) -> RustCompactSymbol | RustCompactFile | None: + resolution = self.backend.import_resolution_for_import(self.record.id) + if resolution is None: + return None + if resolution.target_symbol_id is not None: + return self.backend.symbol_handle_by_id(resolution.target_symbol_id) + return self.backend.file_handle_by_id(resolution.target_file_id) + + @property + def resolved_symbol(self) -> RustCompactSymbol | RustCompactFile | None: + return self.imported_symbol + + @property + def imported_exports(self) -> list[RustCompactSymbol | RustCompactFile]: + imported = self.imported_symbol + return [] if imported is None else [imported] + + @property + def namespace(self) -> str | None: + if not self.is_module_import(): + return None + return self.name + + @property + def is_dynamic(self) -> bool: + return False + + @property + def is_reexport(self) -> bool: + return False + + def is_aliased_import(self) -> bool: + return self.record.alias is not None and self.record.alias != self.record.name + + def is_module_import(self) -> bool: + return self.import_type in {ImportType.MODULE, ImportType.WILDCARD} + + def is_symbol_import(self) -> bool: + return not self.is_module_import() + + def is_wildcard_import(self) -> bool: + return self.import_type == ImportType.WILDCARD + + def _module_source(self) -> str | None: + if self.record.kind == "import": + return self.record.name + return self.record.module + + def _import_type(self) -> ImportType: + if self.record.name == "*": + return ImportType.WILDCARD + if self.record.kind == "import": + return ImportType.MODULE + if self.record.kind in {"from_import", "future_import"}: + return ImportType.NAMED_EXPORT + return ImportType.UNKNOWN diff --git 
a/src/graph_sitter/core/codebase.py b/src/graph_sitter/core/codebase.py index d9d7ab610..a3a1208b0 100644 --- a/src/graph_sitter/core/codebase.py +++ b/src/graph_sitter/core/codebase.py @@ -314,12 +314,25 @@ def _require_rust_index(self): raise RuntimeError(msg) return self.ctx.rust_index + def _rust_compact_files(self, extensions: list[str] | Literal["*"] | None = None): + if isinstance(extensions, str) and extensions != "*": + msg = "extensions must be a list of extensions or '*'" + raise ValueError(msg) + files = self._require_rust_index().file_handles + if isinstance(extensions, list): + allowed = set(extensions) + files = [file for file in files if file.extension in allowed] + return sorted(files, key=lambda file: file.name) + #################################################################################################################### # NODES #################################################################################################################### @noapidoc def _symbols(self, symbol_type: SymbolType | None = None) -> list[TSymbol | TClass | TFunction | TGlobalVar]: + if self.ctx.rust_compact_mode: + symbols = self._require_rust_index().symbol_handles + return [x for x in symbols if symbol_type is None or x.symbol_type == symbol_type] matches: list[Symbol] = self.ctx.get_nodes(NodeType.SYMBOL) return [x for x in matches if x.is_top_level and (symbol_type is None or x.symbol_type == symbol_type)] @@ -349,6 +362,8 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T Returns: list[TSourceFile]: A sorted list of source files in the codebase. """ + if self.ctx.rust_compact_mode: + return self._rust_compact_files(extensions) if self.ctx.config.use_pink == PinkMode.ALL_FILES: return self._pink_codebase.files if extensions is None and len(self.ctx.get_nodes(NodeType.FILE)) > 0: @@ -409,6 +424,8 @@ def imports(self) -> list[TImport]: list[TImport]: A list of Import nodes representing all imports in the codebase. 
TImport can be PyImport for Python codebases or TSImport for TypeScript codebases. """ + if self.ctx.rust_compact_mode: + return self._require_rust_index().import_handles return self.ctx.get_nodes(NodeType.IMPORT) @property @@ -498,6 +515,8 @@ def interfaces(self) -> list[TInterface]: Returns: list[TInterface]: A list of Interface objects defined in the codebase's source files. """ + if self.ctx.rust_compact_mode: + return [] return self._symbols(symbol_type=SymbolType.Interface) @property @@ -510,6 +529,8 @@ def types(self) -> list[TTypeAlias]: Returns: list[TTypeAlias]: A list of all type aliases defined in the codebase. """ + if self.ctx.rust_compact_mode: + return [] return self._symbols(symbol_type=SymbolType.Type) #################################################################################################################### @@ -615,6 +636,14 @@ def get_file(self, filepath: str, *, optional: bool = False, ignore_case: bool = if self.ctx.config.use_pink == PinkMode.ALL_FILES: absolute_path = self.ctx.to_absolute(filepath) return self._pink_codebase.get_file(absolute_path) + if self.ctx.rust_compact_mode: + file = self._require_rust_index().get_file_handle(filepath, ignore_case=ignore_case) + if file is not None: + return file + if not optional: + msg = f"File {filepath} not found in Rust compact index. Use optional=True to return None instead." 
+ raise ValueError(msg) + return None # Try to get the file from the graph first file = self.ctx.get_file(filepath, ignore_case=ignore_case) if file is not None: diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index d6cc6a30e..2425a2111 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -192,8 +192,26 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.rust_import_resolutions[0].target_symbol_id == 0 assert indexed_paths == [str(tmp_path.resolve())] assert selected_paths == [["pkg/service.py"]] + + assert len(codebase.files) == 1 + assert codebase.files[0].filepath == "pkg/service.py" + assert codebase.files[0].content.startswith("import os") + assert codebase.get_file("pkg/service.py") == codebase.files[0] + assert codebase.has_file("PKG/SERVICE.PY", ignore_case=True) + assert [symbol.name for symbol in codebase.symbols] == ["Service", "helper"] + assert [symbol.name for symbol in codebase.classes] == ["Service"] + assert [symbol.name for symbol in codebase.functions] == ["helper"] + assert codebase.get_symbol("Service").source.startswith("class Service") + assert codebase.get_class("Service").file == codebase.files[0] + assert codebase.get_function("helper").filepath == "pkg/service.py" + assert codebase.files[0].classes[0].name == "Service" + assert codebase.files[0].functions[0].name == "helper" + assert codebase.imports[0].source == "import os" + assert codebase.imports[0].is_module_import() + assert codebase.imports[0].from_file == codebase.files[0] + assert codebase.imports[0].imported_symbol == codebase.classes[0] with pytest.raises(RuntimeError, match="Python graph is not built"): - len(codebase.files) + len(codebase.ctx.nodes) def test_missing_rust_extension_falls_back_to_python_graph(monkeypatch, tmp_path): From 10d5b58eeb1f2760154027480b420433d7eb050b Mon Sep 17 00:00:00 2001 From: Jay Hack Date: 
Thu, 18 Jun 2026 14:13:16 -0700 Subject: [PATCH 014/228] Add Rust compact Python reference records --- .../examples/index_python.rs | 3 +- crates/graph-sitter-engine/src/lib.rs | 256 +++++++++++++++++- crates/graph-sitter-py/src/lib.rs | 29 +- rust-rewrite/benchmarks.md | 8 +- rust-rewrite/python-compat.md | 6 +- rust-rewrite/strategy.md | 4 +- .../tools/compare_rust_python_index.py | 3 +- .../tools/measure_codebase_rust_backend.py | 8 +- rust-rewrite/tools/measure_rust_facade.py | 3 +- src/graph_sitter/codebase/rust_backend.py | 33 +++ src/graph_sitter/core/codebase.py | 5 + tests/unit/sdk/codebase/test_rust_backend.py | 26 ++ 12 files changed, 356 insertions(+), 28 deletions(-) diff --git a/crates/graph-sitter-engine/examples/index_python.rs b/crates/graph-sitter-engine/examples/index_python.rs index 67704ebb2..159e12d44 100644 --- a/crates/graph-sitter-engine/examples/index_python.rs +++ b/crates/graph-sitter-engine/examples/index_python.rs @@ -29,7 +29,7 @@ fn main() -> Result<(), Box> { println!("repo: {repo_path}"); println!("wall: {:.6}s", elapsed.as_secs_f64()); println!( - "index: files={} symbols={} classes={} functions={} global_variables={} imports={} import_resolutions={} bytes={} lines={} files_with_errors={}", + "index: files={} symbols={} classes={} functions={} global_variables={} imports={} import_resolutions={} references={} bytes={} lines={} files_with_errors={}", summary.files, summary.symbols, summary.classes, @@ -37,6 +37,7 @@ fn main() -> Result<(), Box> { summary.global_variables, summary.imports, summary.import_resolutions, + summary.references, summary.bytes, summary.lines, summary.files_with_errors diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 0ddd5d360..b5d3221a1 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -135,6 +135,7 @@ pub struct PythonIndex { pub symbols: Vec, pub imports: Vec, pub import_resolutions: Vec, + pub references: Vec, 
} impl PythonIndex { @@ -159,6 +160,7 @@ impl PythonIndex { .count(), imports: self.imports.len(), import_resolutions: self.import_resolutions.len(), + references: self.references.len(), bytes: self.files.iter().map(|file| file.byte_len).sum(), lines: self.files.iter().map(|file| file.line_count).sum(), files_with_errors: self.files.iter().filter(|file| file.has_error).count(), @@ -175,6 +177,7 @@ pub struct IndexSummary { pub global_variables: usize, pub imports: usize, pub import_resolutions: usize, + pub references: usize, pub bytes: usize, pub lines: usize, pub files_with_errors: usize, @@ -237,6 +240,17 @@ pub struct ImportResolutionRecord { pub target_symbol_id: Option, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct ReferenceRecord { + pub id: u32, + pub source_file_id: u32, + pub source_symbol_id: Option, + pub target_symbol_id: u32, + pub import_id: Option, + pub name: String, + pub range: SourceRange, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] pub struct SourceRange { pub start_byte: usize, @@ -264,6 +278,14 @@ struct PythonIndexer { parser: Parser, } +#[derive(Debug, Clone, PartialEq, Eq)] +struct ReferenceCandidate { + source_file_id: u32, + source_symbol_id: Option, + name: String, + range: SourceRange, +} + impl PythonIndexer { fn new() -> Result { let mut parser = Parser::new(); @@ -312,7 +334,9 @@ impl PythonIndexer { symbols: Vec::new(), imports: Vec::new(), import_resolutions: Vec::new(), + references: Vec::new(), }; + let mut reference_candidates = Vec::new(); paths.sort(); for path in paths { @@ -341,10 +365,17 @@ impl PythonIndexer { has_error: root.has_error(), root_range: root.range().into(), }); - extract_python_file(file_id, &content, &tree, &mut index); + extract_python_file( + file_id, + &content, + &tree, + &mut index, + &mut reference_candidates, + ); } resolve_python_imports(&mut index); + resolve_python_references(&mut index, reference_candidates); Ok(index) } } @@ -386,18 +417,50 @@ fn 
should_skip_dir(path: &Path) -> bool { ) } -fn extract_python_file(file_id: u32, source: &str, tree: &Tree, index: &mut PythonIndex) { +fn extract_python_file( + file_id: u32, + source: &str, + tree: &Tree, + index: &mut PythonIndex, + reference_candidates: &mut Vec, +) { let root = tree.root_node(); let mut cursor = root.walk(); for child in root.named_children(&mut cursor) { - extract_top_level_node(file_id, source, child, index); + extract_top_level_node(file_id, source, child, index, reference_candidates); } } -fn extract_top_level_node(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { +fn extract_top_level_node( + file_id: u32, + source: &str, + node: Node<'_>, + index: &mut PythonIndex, + reference_candidates: &mut Vec, +) { match node.kind() { - "class_definition" => push_symbol(file_id, source, node, SymbolKind::Class, index), - "function_definition" => push_symbol(file_id, source, node, SymbolKind::Function, index), + "class_definition" => { + push_symbol(file_id, source, node, SymbolKind::Class, index).inspect(|symbol_id| { + collect_symbol_reference_candidates( + file_id, + *symbol_id, + source, + node, + reference_candidates, + ); + }); + } + "function_definition" => { + push_symbol(file_id, source, node, SymbolKind::Function, index).inspect(|symbol_id| { + collect_symbol_reference_candidates( + file_id, + *symbol_id, + source, + node, + reference_candidates, + ); + }); + } "decorated_definition" => { if let Some(definition) = first_child_of_kind(node, &["class_definition", "function_definition"]) @@ -407,7 +470,16 @@ fn extract_top_level_node(file_id: u32, source: &str, node: Node<'_>, index: &mu } else { SymbolKind::Function }; - push_symbol_with_range(file_id, source, definition, node.range(), kind, index); + push_symbol_with_range(file_id, source, definition, node.range(), kind, index) + .inspect(|symbol_id| { + collect_symbol_reference_candidates( + file_id, + *symbol_id, + source, + node, + reference_candidates, + ); + }); } } 
"import_statement" => push_import_statement(file_id, source, node, index), @@ -434,8 +506,8 @@ fn push_symbol( node: Node<'_>, kind: SymbolKind, index: &mut PythonIndex, -) { - push_symbol_with_range(file_id, source, node, node.range(), kind, index); +) -> Option { + push_symbol_with_range(file_id, source, node, node.range(), kind, index) } fn push_symbol_with_range( @@ -445,21 +517,23 @@ fn push_symbol_with_range( declaration_range: Range, kind: SymbolKind, index: &mut PythonIndex, -) { +) -> Option { let Some(name_node) = node.child_by_field_name("name") else { - return; + return None; }; let Ok(name) = name_node.utf8_text(source.as_bytes()) else { - return; + return None; }; + let symbol_id = index.symbols.len() as u32; index.symbols.push(SymbolRecord { - id: index.symbols.len() as u32, + id: symbol_id, file_id, name: name.to_owned(), kind, range: declaration_range.into(), name_range: name_node.range().into(), }); + Some(symbol_id) } fn push_global_assignment(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { @@ -496,6 +570,64 @@ fn collect_assignment_targets<'tree>(node: Node<'tree>, out: &mut Vec, + out: &mut Vec, +) { + let excluded_name_range = symbol_node + .child_by_field_name("name") + .map(|name_node| name_node.range()); + collect_identifier_candidates( + file_id, + Some(source_symbol_id), + source, + symbol_node, + excluded_name_range, + out, + ); +} + +fn collect_identifier_candidates( + file_id: u32, + source_symbol_id: Option, + source: &str, + node: Node<'_>, + excluded_range: Option, + out: &mut Vec, +) { + if node.kind() == "identifier" && !range_matches(node.range(), excluded_range) { + if let Ok(name) = node.utf8_text(source.as_bytes()) { + out.push(ReferenceCandidate { + source_file_id: file_id, + source_symbol_id, + name: name.to_owned(), + range: node.range().into(), + }); + } + } + + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_identifier_candidates( + file_id, + 
source_symbol_id, + source, + child, + excluded_range, + out, + ); + } +} + +fn range_matches(range: Range, other: Option) -> bool { + other + .map(|other| range.start_byte == other.start_byte && range.end_byte == other.end_byte) + .unwrap_or(false) +} + fn push_import_statement(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { let text = node_text(source, node); let imports = text @@ -628,6 +760,78 @@ fn resolve_python_imports(index: &mut PythonIndex) { index.import_resolutions = resolutions; } +fn resolve_python_references(index: &mut PythonIndex, candidates: Vec) { + let symbol_to_id: HashMap<(u32, &str), u32> = index + .symbols + .iter() + .map(|symbol| ((symbol.file_id, symbol.name.as_str()), symbol.id)) + .collect(); + let resolution_by_import_id: HashMap = index + .import_resolutions + .iter() + .map(|resolution| (resolution.import_id, resolution)) + .collect(); + let mut imported_symbol_by_binding: HashMap<(u32, String), (u32, u32)> = HashMap::new(); + + for import in &index.imports { + let Some(resolution) = resolution_by_import_id.get(&import.id) else { + continue; + }; + let Some(target_symbol_id) = resolution.target_symbol_id else { + continue; + }; + let Some(binding) = import_binding_name(import) else { + continue; + }; + imported_symbol_by_binding.insert((import.file_id, binding), (target_symbol_id, import.id)); + } + + let mut references = Vec::new(); + for candidate in candidates { + let imported_target = imported_symbol_by_binding + .get(&(candidate.source_file_id, candidate.name.clone())) + .copied(); + let same_file_target = symbol_to_id + .get(&(candidate.source_file_id, candidate.name.as_str())) + .copied() + .map(|symbol_id| (symbol_id, None)); + let Some((target_symbol_id, import_id)) = imported_target + .map(|(symbol_id, import_id)| (symbol_id, Some(import_id))) + .or(same_file_target) + else { + continue; + }; + if candidate.source_symbol_id == Some(target_symbol_id) { + continue; + } + + 
references.push(ReferenceRecord { + id: references.len() as u32, + source_file_id: candidate.source_file_id, + source_symbol_id: candidate.source_symbol_id, + target_symbol_id, + import_id, + name: candidate.name, + range: candidate.range, + }); + } + index.references = references; +} + +fn import_binding_name(import: &ImportRecord) -> Option { + if let Some(alias) = import.alias.as_deref() { + return Some(alias.to_owned()); + } + match import.kind { + ImportKind::Import => import + .name + .as_deref() + .and_then(|name| name.split('.').next()) + .map(str::to_owned), + ImportKind::FromImport | ImportKind::FutureImport => import.name.clone(), + } +} + fn resolve_plain_import(import: &ImportRecord, module_to_file: &HashMap<&str, u32>) -> Option { let name = import.name.as_deref()?; module_to_file.get(name).copied() @@ -785,6 +989,7 @@ mod tests { assert_eq!(index.summary().global_variables, 0); assert_eq!(index.summary().imports, 4); assert_eq!(index.summary().import_resolutions, 0); + assert_eq!(index.summary().references, 0); assert_eq!(index.symbols[0].name, "Service"); assert_eq!(index.symbols[1].name, "helper"); assert!(index @@ -837,6 +1042,7 @@ mod tests { assert_eq!(index.summary().global_variables, 1); assert_eq!(index.summary().imports, 6); assert_eq!(index.summary().import_resolutions, 4); + assert_eq!(index.summary().references, 1); let base_file_id = index .files @@ -872,6 +1078,12 @@ mod tests { .count(), 4 ); + assert!(index.references.iter().any(|reference| { + reference.name == "Base" + && reference.source_symbol_id.is_some() + && reference.target_symbol_id == base_symbol_id + && reference.import_id.is_some() + })); } #[test] @@ -930,6 +1142,20 @@ mod tests { }) }) .collect::>(); + let references = index + .references + .iter() + .map(|reference| { + serde_json::json!({ + "id": reference.id, + "source_file_id": reference.source_file_id, + "source_symbol_id": reference.source_symbol_id, + "target_symbol_id": reference.target_symbol_id, + "import_id": 
reference.import_id, + "name": reference.name, + }) + }) + .collect::>(); assert_eq!( serde_json::json!({ @@ -937,6 +1163,7 @@ mod tests { "symbols": symbols, "imports": imports, "import_resolutions": index.import_resolutions, + "references": references, }), serde_json::json!({ "files": [ @@ -961,6 +1188,9 @@ mod tests { {"id": 1, "import_id": 1, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": 0}, {"id": 2, "import_id": 2, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null}, {"id": 3, "import_id": 3, "source_file_id": 2, "target_file_id": 1, "target_symbol_id": null} + ], + "references": [ + {"id": 0, "source_file_id": 2, "source_symbol_id": 2, "target_symbol_id": 1, "import_id": 0, "name": "Base"} ] }) ); diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index ac77a91ed..88cffaeae 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -73,6 +73,8 @@ mod bindings { #[pyo3(get)] import_resolutions: usize, #[pyo3(get)] + references: usize, + #[pyo3(get)] bytes: usize, #[pyo3(get)] lines: usize, @@ -90,6 +92,7 @@ mod bindings { global_variables: summary.global_variables, imports: summary.imports, import_resolutions: summary.import_resolutions, + references: summary.references, bytes: summary.bytes, lines: summary.lines, files_with_errors: summary.files_with_errors, @@ -108,6 +111,7 @@ mod bindings { ("global_variables", self.global_variables), ("imports", self.imports), ("import_resolutions", self.import_resolutions), + ("references", self.references), ("bytes", self.bytes), ("lines", self.lines), ("files_with_errors", self.files_with_errors), @@ -116,7 +120,7 @@ mod bindings { fn __repr__(&self) -> String { format!( - "IndexSummary(files={}, symbols={}, classes={}, functions={}, global_variables={}, imports={}, import_resolutions={}, bytes={}, lines={}, files_with_errors={})", + "IndexSummary(files={}, symbols={}, classes={}, functions={}, global_variables={}, 
imports={}, import_resolutions={}, references={}, bytes={}, lines={}, files_with_errors={})", self.files, self.symbols, self.classes, @@ -124,6 +128,7 @@ mod bindings { self.global_variables, self.imports, self.import_resolutions, + self.references, self.bytes, self.lines, self.files_with_errors @@ -174,6 +179,11 @@ mod bindings { .map_err(|error| PyRuntimeError::new_err(error.to_string())) } + fn references_json(&self) -> PyResult { + serde_json::to_string(&self.inner.references) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + #[getter] fn file_count(&self) -> usize { self.inner.files.len() @@ -194,11 +204,20 @@ mod bindings { self.inner.import_resolutions.len() } + #[getter] + fn reference_count(&self) -> usize { + self.inner.references.len() + } + fn __repr__(&self) -> String { let summary = self.inner.summary(); format!( - "PythonIndex(files={}, symbols={}, imports={}, import_resolutions={})", - summary.files, summary.symbols, summary.imports, summary.import_resolutions + "PythonIndex(files={}, symbols={}, imports={}, import_resolutions={}, references={})", + summary.files, + summary.symbols, + summary.imports, + summary.import_resolutions, + summary.references ) } } @@ -392,7 +411,9 @@ mod bindings { let summary = index.summary(); assert_eq!(summary.global_variables, 1); assert_eq!(summary.import_resolutions, 2); + assert_eq!(summary.references, 1); assert_eq!(index.import_resolution_count(), 2); + assert_eq!(index.reference_count(), 1); assert!(index.files_json().unwrap().contains("\"pkg/base.py\"")); assert!(index.symbols_json().unwrap().contains("\"CONSTANT\"")); assert!(index.symbols_json().unwrap().contains("\"Base\"")); @@ -401,7 +422,9 @@ mod bindings { .import_resolutions_json() .unwrap() .contains("target_symbol_id")); + assert!(index.references_json().unwrap().contains("\"Base\"")); assert!(index.to_json().unwrap().contains("import_resolutions")); + assert!(index.to_json().unwrap().contains("references")); } fn 
temp_repo_path(prefix: &str) -> PathBuf { diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 73a852c37..055f3c4c3 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -166,13 +166,13 @@ This shell-facing number is intentionally more conservative than the standalone These measurements use real `Codebase(...)` construction with `CodebaseConfig(graph_backend="rust", rust_fallback="error")`. In this mode, once the compact Rust index builds successfully, `CodebaseContext` does not build the eager Python graph. The Rust path now exercises public `Codebase.files`, `symbols`, `classes`, `functions`, `global_vars`, and `imports` compatibility handles while `CodebaseContext.nodes` remains blocked so the old graph cannot be materialized accidentally. -| Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Python graph blocked | Wall ratio | RSS ratio | -| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.728s | 537.5 MB | 0.510s | 116.0 MB | 1130 | 1130 | 3954 | 6460 | 432 | yes | 5.348x | 4.636x | +| Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Python graph blocked | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | +| `graph-sitter` repo checkout | `--disable-graph` | 2.825s | 537.2 MB | 0.568s | 131.9 MB | 1130 | 1130 | 3955 | 6460 | 432 | 3666 | yes | 4.976x | 4.073x | Important caveats: -- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, and internal import-resolution records for indexed Python 
modules. +- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, internal import-resolution records, and first-slice top-level Python symbol reference records for indexed Python modules. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 8e290798a..90d878433 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -121,11 +121,13 @@ Current implemented bridge status: - `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, global-variable, import, import-resolution, byte, line, and error counts. - `PythonIndex.to_json()` serializes the compact Rust records for debug and benchmark use. - `PythonIndex.files_json()`, `symbols_json()`, `imports_json()`, and `import_resolutions_json()` expose each record family without forcing callers to deserialize the full index payload. -- `RustIndexBackend.files`, `.symbols`, `.imports`, and `.import_resolutions` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. +- `PythonIndex.references_json()` exposes compact symbol reference records. +- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, and `.references` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. 
- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside top-level Python classes/functions. Full lexical scoping, nested references, attributes, and module references remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. -- `Codebase.rust_index_summary`, `.rust_files`, `.rust_symbols`, `.rust_classes`, `.rust_functions`, `.rust_global_vars`, `.rust_imports`, and `.rust_import_resolutions` expose the attached compact records for shell smoke checks and golden tests. +- `Codebase.rust_index_summary`, `.rust_files`, `.rust_symbols`, `.rust_classes`, `.rust_functions`, `.rust_global_vars`, `.rust_imports`, `.rust_import_resolutions`, and `.rust_references` expose the attached compact records for shell smoke checks and golden tests. - `Codebase.files`, `.symbols`, `.classes`, `.functions`, `.global_vars`, `.imports`, `get_file(...)`, `get_symbol(...)`, `get_class(...)`, and `get_function(...)` now return lightweight compact handles in strict Rust mode for Python codebases. - Compact file handles expose basic identity/content plus file-local `symbols`, `classes`, `functions`, `global_vars`, and `imports`. Compact symbol and import handles expose basic identity/source and implemented import-resolution targets. 
Edit-heavy and dependency/reference graph methods are still unsupported until the full lazy engine facade exists. - This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet provide full P0 `SourceFile`, `Symbol`, or `Import` parity. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 6132b6b13..1ef1fa06a 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -180,7 +180,8 @@ Recommended task format: - [ ] Implement full import-to-file and import-to-symbol edges for all Python and TypeScript rules. - [ ] Implement export-to-symbol/import/file edges. - [ ] Implement lexical scope tables for name resolution. -- [ ] Implement symbol usage extraction by identifier ranges. +- [x] Implement first compact Python symbol reference extraction by identifier ranges. owner: codex. Result: records same-file and imported top-level symbol references inside top-level Python classes/functions. +- [ ] Expand symbol usage extraction to nested scopes, attributes, module references, and full lexical shadowing behavior. - [ ] Implement dependency edge construction from usage records. - [ ] Implement superclass/interface dependency edges. - [ ] Add graph debug dump for nodes, edges, and usage metadata. @@ -249,3 +250,4 @@ Recommended task format: - [x] 2026-06-18: Added compact Rust extraction for top-level Python globals and symbol-target import resolution for imported globals. owner: codex. - [x] 2026-06-18: Made opt-in `CodebaseConfig(graph_backend="rust")` skip eager Python graph construction and expose compact `rust_*` record properties on `Codebase`. owner: codex. Notes: current checkout constructs 4.0x faster with 4.6x lower process max RSS than Python parse/object materialization while blocking lazy Python graph materialization. 
- [x] 2026-06-18: Added lightweight Rust compact handles for Python `Codebase.files`, `symbols`, `classes`, `functions`, `global_vars`, `imports`, and basic `get_*` queries. owner: codex. Notes: current checkout constructs and exercises public read handles 5.3x faster with 4.6x lower process max RSS than Python parse/object materialization while keeping `CodebaseContext.nodes` blocked. +- [x] 2026-06-18: Added compact Python `ReferenceRecord` extraction for same-file and imported top-level symbol references inside top-level classes/functions. owner: codex. Notes: current checkout emits 3,666 compact references and remains 5.0x faster with 4.1x lower process max RSS than Python parse/object materialization. diff --git a/rust-rewrite/tools/compare_rust_python_index.py b/rust-rewrite/tools/compare_rust_python_index.py index f2970ee85..a5e17d7fd 100644 --- a/rust-rewrite/tools/compare_rust_python_index.py +++ b/rust-rewrite/tools/compare_rust_python_index.py @@ -189,7 +189,8 @@ def print_human(report: dict[str, Any]) -> None: f"rss_peak={comparison['rust_sampled_rss_peak_mb']:.1f} MB " f"files={rust_summary['files']} symbols={rust_summary['symbols']} " f"global_variables={rust_summary['global_variables']} " - f"imports={rust_summary['imports']} import_resolutions={rust_summary['import_resolutions']}" + f"imports={rust_summary['imports']} import_resolutions={rust_summary['import_resolutions']} " + f"references={rust_summary['references']}" ) print( "ratios: " diff --git a/rust-rewrite/tools/measure_codebase_rust_backend.py b/rust-rewrite/tools/measure_codebase_rust_backend.py index c1d5a992a..e699ba188 100644 --- a/rust-rewrite/tools/measure_codebase_rust_backend.py +++ b/rust-rewrite/tools/measure_codebase_rust_backend.py @@ -60,6 +60,7 @@ def make_report(repo: Path) -> dict: "global_variables": summary.global_variables, "imports": summary.imports, "import_resolutions": summary.import_resolutions, + "references": summary.references, "bytes": summary.bytes, "lines": 
summary.lines, "files_with_errors": summary.files_with_errors, @@ -72,6 +73,7 @@ def make_report(repo: Path) -> dict: "rust_global_vars": len(codebase.rust_global_vars), "rust_imports": len(codebase.rust_imports), "rust_import_resolutions": len(codebase.rust_import_resolutions), + "rust_references": len(codebase.rust_references), }, "compat_handles": { "files": len(codebase.files), @@ -106,14 +108,16 @@ def print_human(report: dict) -> None: f"symbols={summary['symbols']} " f"global_variables={summary['global_variables']} " f"imports={summary['imports']} " - f"import_resolutions={summary['import_resolutions']}" + f"import_resolutions={summary['import_resolutions']} " + f"references={summary['references']}" ) print( "records: " f"files={records['rust_files']} " f"symbols={records['rust_symbols']} " f"imports={records['rust_imports']} " - f"import_resolutions={records['rust_import_resolutions']}" + f"import_resolutions={records['rust_import_resolutions']} " + f"references={records['rust_references']}" ) print( "compat handles: " diff --git a/rust-rewrite/tools/measure_rust_facade.py b/rust-rewrite/tools/measure_rust_facade.py index 6fce70f23..1d0d92a6f 100644 --- a/rust-rewrite/tools/measure_rust_facade.py +++ b/rust-rewrite/tools/measure_rust_facade.py @@ -98,7 +98,8 @@ def print_human(report: dict) -> None: f"symbols={summary['symbols']} " f"global_variables={summary['global_variables']} " f"imports={summary['imports']} " - f"import_resolutions={summary['import_resolutions']}" + f"import_resolutions={summary['import_resolutions']} " + f"references={summary['references']}" ) diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py index 4ae2fca21..0f7105e29 100644 --- a/src/graph_sitter/codebase/rust_backend.py +++ b/src/graph_sitter/codebase/rust_backend.py @@ -46,6 +46,7 @@ class RustIndexSummary: global_variables: int imports: int import_resolutions: int + references: int bytes: int lines: int files_with_errors: int @@ 
-148,6 +149,31 @@ def from_dict(cls, data: dict[str, Any]) -> RustImportResolutionRecord: ) +@dataclass(frozen=True) +class RustReferenceRecord: + id: int + source_file_id: int + source_symbol_id: int | None + target_symbol_id: int + import_id: int | None + name: str + range: RustSourceRange + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> RustReferenceRecord: + source_symbol_id = data["source_symbol_id"] + import_id = data["import_id"] + return cls( + id=int(data["id"]), + source_file_id=int(data["source_file_id"]), + source_symbol_id=None if source_symbol_id is None else int(source_symbol_id), + target_symbol_id=int(data["target_symbol_id"]), + import_id=None if import_id is None else int(import_id), + name=str(data["name"]), + range=RustSourceRange.from_dict(data["range"]), + ) + + @dataclass class RustIndexBackend: repo_path: Path @@ -158,6 +184,7 @@ class RustIndexBackend: _symbols: list[RustSymbolRecord] | None = None _imports: list[RustImportRecord] | None = None _import_resolutions: list[RustImportResolutionRecord] | None = None + _references: list[RustReferenceRecord] | None = None _file_handles: list[RustCompactFile] | None = None _symbol_handles: list[RustCompactSymbol] | None = None _import_handles: list[RustCompactImport] | None = None @@ -216,6 +243,12 @@ def import_resolutions(self) -> list[RustImportResolutionRecord]: self._import_resolutions = [RustImportResolutionRecord.from_dict(record) for record in json.loads(self.index.import_resolutions_json())] return self._import_resolutions + @property + def references(self) -> list[RustReferenceRecord]: + if self._references is None: + self._references = [RustReferenceRecord.from_dict(record) for record in json.loads(self.index.references_json())] + return self._references + @property def file_handles(self) -> list[RustCompactFile]: if self._file_handles is None: diff --git a/src/graph_sitter/core/codebase.py b/src/graph_sitter/core/codebase.py index a3a1208b0..89cc63876 100644 --- 
a/src/graph_sitter/core/codebase.py +++ b/src/graph_sitter/core/codebase.py @@ -308,6 +308,11 @@ def rust_imports(self): def rust_import_resolutions(self): return self._require_rust_index().import_resolutions + @property + @noapidoc + def rust_references(self): + return self._require_rust_index().references + def _require_rust_index(self): if self.ctx.rust_index is None: msg = "Rust compact index is unavailable; construct Codebase with CodebaseConfig(graph_backend='rust')" diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index 2425a2111..c75659c2b 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -18,6 +18,7 @@ def as_dict(self): "global_variables": 0, "imports": 1, "import_resolutions": 1, + "references": 1, "bytes": 64, "lines": 8, "files_with_errors": 0, @@ -138,6 +139,28 @@ def import_resolutions_json(self): ] ) + def references_json(self): + return json.dumps( + [ + { + "id": 0, + "source_file_id": 0, + "source_symbol_id": 1, + "target_symbol_id": 0, + "import_id": 0, + "name": "Service", + "range": { + "start_byte": 51, + "end_byte": 58, + "start_row": 6, + "start_column": 11, + "end_row": 6, + "end_column": 18, + }, + } + ] + ) + def install_fake_rust_extension(monkeypatch: pytest.MonkeyPatch) -> tuple[list[str], list[list[str]]]: indexed_paths: list[str] = [] @@ -180,16 +203,19 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.ctx.rust_index.summary.global_variables == 0 assert codebase.ctx.rust_index.summary.imports == 1 assert codebase.ctx.rust_index.summary.import_resolutions == 1 + assert codebase.ctx.rust_index.summary.references == 1 assert codebase.ctx.rust_index.files[0].path == "pkg/service.py" assert codebase.ctx.rust_index.symbols[0].name == "Service" assert codebase.ctx.rust_index.imports[0].name == "os" assert codebase.ctx.rust_index.import_resolutions[0].target_symbol_id == 0 + assert 
codebase.ctx.rust_index.references[0].target_symbol_id == 0 assert codebase.rust_index_summary == codebase.ctx.rust_index.summary assert codebase.rust_files[0].path == "pkg/service.py" assert codebase.rust_classes[0].name == "Service" assert codebase.rust_functions[0].name == "helper" assert codebase.rust_imports[0].name == "os" assert codebase.rust_import_resolutions[0].target_symbol_id == 0 + assert codebase.rust_references[0].name == "Service" assert indexed_paths == [str(tmp_path.resolve())] assert selected_paths == [["pkg/service.py"]] From 533f33a1587357685d5cc253249ffa7e8af49ad6 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:18:57 -0700 Subject: [PATCH 015/228] Add Rust compact Python dependency records --- .../examples/index_python.rs | 3 +- crates/graph-sitter-engine/src/lib.rs | 87 ++++++++++++++++++- crates/graph-sitter-py/src/lib.rs | 29 ++++++- rust-rewrite/benchmarks.md | 8 +- rust-rewrite/python-compat.md | 8 +- rust-rewrite/strategy.md | 4 +- .../tools/compare_rust_python_index.py | 2 +- .../tools/measure_codebase_rust_backend.py | 8 +- rust-rewrite/tools/measure_rust_facade.py | 3 +- src/graph_sitter/codebase/rust_backend.py | 31 +++++++ src/graph_sitter/core/codebase.py | 5 ++ tests/unit/sdk/codebase/test_rust_backend.py | 19 ++++ 12 files changed, 190 insertions(+), 17 deletions(-) diff --git a/crates/graph-sitter-engine/examples/index_python.rs b/crates/graph-sitter-engine/examples/index_python.rs index 159e12d44..b9f422cde 100644 --- a/crates/graph-sitter-engine/examples/index_python.rs +++ b/crates/graph-sitter-engine/examples/index_python.rs @@ -29,7 +29,7 @@ fn main() -> Result<(), Box> { println!("repo: {repo_path}"); println!("wall: {:.6}s", elapsed.as_secs_f64()); println!( - "index: files={} symbols={} classes={} functions={} global_variables={} imports={} import_resolutions={} references={} bytes={} lines={} files_with_errors={}", + "index: files={} symbols={} classes={} functions={} global_variables={} imports={} 
import_resolutions={} references={} dependencies={} bytes={} lines={} files_with_errors={}", summary.files, summary.symbols, summary.classes, @@ -38,6 +38,7 @@ fn main() -> Result<(), Box> { summary.imports, summary.import_resolutions, summary.references, + summary.dependencies, summary.bytes, summary.lines, summary.files_with_errors diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index b5d3221a1..55e5e5fa2 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1,7 +1,7 @@ #![forbid(unsafe_code)] use serde::Serialize; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::fmt; use std::fs; use std::io; @@ -136,6 +136,7 @@ pub struct PythonIndex { pub imports: Vec, pub import_resolutions: Vec, pub references: Vec, + pub dependencies: Vec, } impl PythonIndex { @@ -161,6 +162,7 @@ impl PythonIndex { imports: self.imports.len(), import_resolutions: self.import_resolutions.len(), references: self.references.len(), + dependencies: self.dependencies.len(), bytes: self.files.iter().map(|file| file.byte_len).sum(), lines: self.files.iter().map(|file| file.line_count).sum(), files_with_errors: self.files.iter().filter(|file| file.has_error).count(), @@ -178,6 +180,7 @@ pub struct IndexSummary { pub imports: usize, pub import_resolutions: usize, pub references: usize, + pub dependencies: usize, pub bytes: usize, pub lines: usize, pub files_with_errors: usize, @@ -251,6 +254,17 @@ pub struct ReferenceRecord { pub range: SourceRange, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct DependencyRecord { + pub id: u32, + pub source_symbol_id: u32, + pub target_symbol_id: u32, + pub source_file_id: u32, + pub target_file_id: u32, + pub reference_ids: Vec, + pub reference_count: usize, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] pub struct SourceRange { pub start_byte: usize, @@ -335,6 +349,7 @@ impl PythonIndexer { imports: Vec::new(), 
import_resolutions: Vec::new(), references: Vec::new(), + dependencies: Vec::new(), }; let mut reference_candidates = Vec::new(); paths.sort(); @@ -376,6 +391,7 @@ impl PythonIndexer { resolve_python_imports(&mut index); resolve_python_references(&mut index, reference_candidates); + build_python_dependencies(&mut index); Ok(index) } } @@ -818,6 +834,49 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec = index + .symbols + .iter() + .map(|symbol| (symbol.id, symbol.file_id)) + .collect(); + let mut dependency_reference_ids: BTreeMap<(u32, u32), Vec> = BTreeMap::new(); + + for reference in &index.references { + let Some(source_symbol_id) = reference.source_symbol_id else { + continue; + }; + dependency_reference_ids + .entry((source_symbol_id, reference.target_symbol_id)) + .or_default() + .push(reference.id); + } + + let dependencies = dependency_reference_ids + .into_iter() + .filter_map(|((source_symbol_id, target_symbol_id), reference_ids)| { + let source_file_id = symbol_file_ids.get(&source_symbol_id).copied()?; + let target_file_id = symbol_file_ids.get(&target_symbol_id).copied()?; + Some(DependencyRecord { + id: 0, + source_symbol_id, + target_symbol_id, + source_file_id, + target_file_id, + reference_count: reference_ids.len(), + reference_ids, + }) + }) + .enumerate() + .map(|(id, mut dependency)| { + dependency.id = id as u32; + dependency + }) + .collect(); + + index.dependencies = dependencies; +} + fn import_binding_name(import: &ImportRecord) -> Option { if let Some(alias) = import.alias.as_deref() { return Some(alias.to_owned()); @@ -990,6 +1049,7 @@ mod tests { assert_eq!(index.summary().imports, 4); assert_eq!(index.summary().import_resolutions, 0); assert_eq!(index.summary().references, 0); + assert_eq!(index.summary().dependencies, 0); assert_eq!(index.symbols[0].name, "Service"); assert_eq!(index.symbols[1].name, "helper"); assert!(index @@ -1043,6 +1103,7 @@ mod tests { assert_eq!(index.summary().imports, 6); 
assert_eq!(index.summary().import_resolutions, 4); assert_eq!(index.summary().references, 1); + assert_eq!(index.summary().dependencies, 1); let base_file_id = index .files @@ -1084,6 +1145,11 @@ mod tests { && reference.target_symbol_id == base_symbol_id && reference.import_id.is_some() })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.target_symbol_id == base_symbol_id + && dependency.reference_count == 1 + && dependency.reference_ids == vec![0] + })); } #[test] @@ -1156,6 +1222,21 @@ mod tests { }) }) .collect::>(); + let dependencies = index + .dependencies + .iter() + .map(|dependency| { + serde_json::json!({ + "id": dependency.id, + "source_symbol_id": dependency.source_symbol_id, + "target_symbol_id": dependency.target_symbol_id, + "source_file_id": dependency.source_file_id, + "target_file_id": dependency.target_file_id, + "reference_ids": dependency.reference_ids, + "reference_count": dependency.reference_count, + }) + }) + .collect::>(); assert_eq!( serde_json::json!({ @@ -1164,6 +1245,7 @@ mod tests { "imports": imports, "import_resolutions": index.import_resolutions, "references": references, + "dependencies": dependencies, }), serde_json::json!({ "files": [ @@ -1191,6 +1273,9 @@ mod tests { ], "references": [ {"id": 0, "source_file_id": 2, "source_symbol_id": 2, "target_symbol_id": 1, "import_id": 0, "name": "Base"} + ], + "dependencies": [ + {"id": 0, "source_symbol_id": 2, "target_symbol_id": 1, "source_file_id": 2, "target_file_id": 1, "reference_ids": [0], "reference_count": 1} ] }) ); diff --git a/crates/graph-sitter-py/src/lib.rs b/crates/graph-sitter-py/src/lib.rs index 88cffaeae..721848f22 100644 --- a/crates/graph-sitter-py/src/lib.rs +++ b/crates/graph-sitter-py/src/lib.rs @@ -75,6 +75,8 @@ mod bindings { #[pyo3(get)] references: usize, #[pyo3(get)] + dependencies: usize, + #[pyo3(get)] bytes: usize, #[pyo3(get)] lines: usize, @@ -93,6 +95,7 @@ mod bindings { imports: summary.imports, import_resolutions: 
summary.import_resolutions, references: summary.references, + dependencies: summary.dependencies, bytes: summary.bytes, lines: summary.lines, files_with_errors: summary.files_with_errors, @@ -112,6 +115,7 @@ mod bindings { ("imports", self.imports), ("import_resolutions", self.import_resolutions), ("references", self.references), + ("dependencies", self.dependencies), ("bytes", self.bytes), ("lines", self.lines), ("files_with_errors", self.files_with_errors), @@ -120,7 +124,7 @@ mod bindings { fn __repr__(&self) -> String { format!( - "IndexSummary(files={}, symbols={}, classes={}, functions={}, global_variables={}, imports={}, import_resolutions={}, references={}, bytes={}, lines={}, files_with_errors={})", + "IndexSummary(files={}, symbols={}, classes={}, functions={}, global_variables={}, imports={}, import_resolutions={}, references={}, dependencies={}, bytes={}, lines={}, files_with_errors={})", self.files, self.symbols, self.classes, @@ -129,6 +133,7 @@ mod bindings { self.imports, self.import_resolutions, self.references, + self.dependencies, self.bytes, self.lines, self.files_with_errors @@ -184,6 +189,11 @@ mod bindings { .map_err(|error| PyRuntimeError::new_err(error.to_string())) } + fn dependencies_json(&self) -> PyResult { + serde_json::to_string(&self.inner.dependencies) + .map_err(|error| PyRuntimeError::new_err(error.to_string())) + } + #[getter] fn file_count(&self) -> usize { self.inner.files.len() @@ -209,15 +219,21 @@ mod bindings { self.inner.references.len() } + #[getter] + fn dependency_count(&self) -> usize { + self.inner.dependencies.len() + } + fn __repr__(&self) -> String { let summary = self.inner.summary(); format!( - "PythonIndex(files={}, symbols={}, imports={}, import_resolutions={}, references={})", + "PythonIndex(files={}, symbols={}, imports={}, import_resolutions={}, references={}, dependencies={})", summary.files, summary.symbols, summary.imports, summary.import_resolutions, - summary.references + summary.references, + 
summary.dependencies ) } } @@ -412,8 +428,10 @@ mod bindings { assert_eq!(summary.global_variables, 1); assert_eq!(summary.import_resolutions, 2); assert_eq!(summary.references, 1); + assert_eq!(summary.dependencies, 1); assert_eq!(index.import_resolution_count(), 2); assert_eq!(index.reference_count(), 1); + assert_eq!(index.dependency_count(), 1); assert!(index.files_json().unwrap().contains("\"pkg/base.py\"")); assert!(index.symbols_json().unwrap().contains("\"CONSTANT\"")); assert!(index.symbols_json().unwrap().contains("\"Base\"")); @@ -423,8 +441,13 @@ mod bindings { .unwrap() .contains("target_symbol_id")); assert!(index.references_json().unwrap().contains("\"Base\"")); + assert!(index + .dependencies_json() + .unwrap() + .contains("reference_count")); assert!(index.to_json().unwrap().contains("import_resolutions")); assert!(index.to_json().unwrap().contains("references")); + assert!(index.to_json().unwrap().contains("dependencies")); } fn temp_repo_path(prefix: &str) -> PathBuf { diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 055f3c4c3..1d746c336 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -166,13 +166,13 @@ This shell-facing number is intentionally more conservative than the standalone These measurements use real `Codebase(...)` construction with `CodebaseConfig(graph_backend="rust", rust_fallback="error")`. In this mode, once the compact Rust index builds successfully, `CodebaseContext` does not build the eager Python graph. The Rust path now exercises public `Codebase.files`, `symbols`, `classes`, `functions`, `global_vars`, and `imports` compatibility handles while `CodebaseContext.nodes` remains blocked so the old graph cannot be materialized accidentally. 
-| Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Python graph blocked | Wall ratio | RSS ratio | -| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.825s | 537.2 MB | 0.568s | 131.9 MB | 1130 | 1130 | 3955 | 6460 | 432 | 3666 | yes | 4.976x | 4.073x | +| Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | +| `graph-sitter` repo checkout | `--disable-graph` | 2.818s | 539.5 MB | 0.617s | 132.1 MB | 1130 | 1130 | 3956 | 6460 | 432 | 3669 | 2020 | yes | 4.568x | 4.085x | Important caveats: -- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, internal import-resolution records, and first-slice top-level Python symbol reference records for indexed Python modules. +- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, internal import-resolution records, first-slice top-level Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. 
- The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 90d878433..ebcaa2e17 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -118,16 +118,18 @@ Current implemented bridge status: - `crates/graph-sitter-py` builds a PyO3 module named `graph_sitter_py` behind the `extension-module` feature. - `Engine.index_python_path(repo_path)` and module-level `index_python_path(repo_path)` return a compact `PythonIndex` for Python files. - `Engine.index_python_paths(repo_path, file_paths)` and module-level `index_python_paths(repo_path, file_paths)` index an explicit Python file list. The Python shell integration uses this path so Rust sees the same `RepoOperator.iter_files(...)` selection as the current Python backend. -- `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, global-variable, import, import-resolution, byte, line, and error counts. +- `PythonIndex.summary()` returns `IndexSummary` with file, symbol, class, function, global-variable, import, import-resolution, reference, dependency, byte, line, and error counts. - `PythonIndex.to_json()` serializes the compact Rust records for debug and benchmark use. - `PythonIndex.files_json()`, `symbols_json()`, `imports_json()`, and `import_resolutions_json()` expose each record family without forcing callers to deserialize the full index payload. - `PythonIndex.references_json()` exposes compact symbol reference records. -- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, and `.references` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. 
+- `PythonIndex.dependencies_json()` exposes compact dependency edge records. +- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. - Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside top-level Python classes/functions. Full lexical scoping, nested references, attributes, and module references remain future work. +- Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. -- `Codebase.rust_index_summary`, `.rust_files`, `.rust_symbols`, `.rust_classes`, `.rust_functions`, `.rust_global_vars`, `.rust_imports`, `.rust_import_resolutions`, and `.rust_references` expose the attached compact records for shell smoke checks and golden tests. +- `Codebase.rust_index_summary`, `.rust_files`, `.rust_symbols`, `.rust_classes`, `.rust_functions`, `.rust_global_vars`, `.rust_imports`, `.rust_import_resolutions`, `.rust_references`, and `.rust_dependencies` expose the attached compact records for shell smoke checks and golden tests. 
- `Codebase.files`, `.symbols`, `.classes`, `.functions`, `.global_vars`, `.imports`, `get_file(...)`, `get_symbol(...)`, `get_class(...)`, and `get_function(...)` now return lightweight compact handles in strict Rust mode for Python codebases. - Compact file handles expose basic identity/content plus file-local `symbols`, `classes`, `functions`, `global_vars`, and `imports`. Compact symbol and import handles expose basic identity/source and implemented import-resolution targets. Edit-heavy and dependency/reference graph methods are still unsupported until the full lazy engine facade exists. - This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet provide full P0 `SourceFile`, `Symbol`, or `Import` parity. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 1ef1fa06a..40156763d 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -182,7 +182,8 @@ Recommended task format: - [ ] Implement lexical scope tables for name resolution. - [x] Implement first compact Python symbol reference extraction by identifier ranges. owner: codex. Result: records same-file and imported top-level symbol references inside top-level Python classes/functions. - [ ] Expand symbol usage extraction to nested scopes, attributes, module references, and full lexical shadowing behavior. -- [ ] Implement dependency edge construction from usage records. +- [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. +- [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. - [ ] Add graph debug dump for nodes, edges, and usage metadata. 
- [ ] Add parity tests comparing Python backend and Rust backend graph edges on fixtures. @@ -251,3 +252,4 @@ Recommended task format: - [x] 2026-06-18: Made opt-in `CodebaseConfig(graph_backend="rust")` skip eager Python graph construction and expose compact `rust_*` record properties on `Codebase`. owner: codex. Notes: current checkout constructs 4.0x faster with 4.6x lower process max RSS than Python parse/object materialization while blocking lazy Python graph materialization. - [x] 2026-06-18: Added lightweight Rust compact handles for Python `Codebase.files`, `symbols`, `classes`, `functions`, `global_vars`, `imports`, and basic `get_*` queries. owner: codex. Notes: current checkout constructs and exercises public read handles 5.3x faster with 4.6x lower process max RSS than Python parse/object materialization while keeping `CodebaseContext.nodes` blocked. - [x] 2026-06-18: Added compact Python `ReferenceRecord` extraction for same-file and imported top-level symbol references inside top-level classes/functions. owner: codex. Notes: current checkout emits 3,666 compact references and remains 5.0x faster with 4.1x lower process max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added compact Python `DependencyRecord` construction from references. owner: codex. Notes: current checkout emits 2,020 de-duplicated dependency edges and remains 4.6x faster with 4.1x lower process max RSS than Python parse/object materialization. 
diff --git a/rust-rewrite/tools/compare_rust_python_index.py b/rust-rewrite/tools/compare_rust_python_index.py index a5e17d7fd..ac6384a95 100644 --- a/rust-rewrite/tools/compare_rust_python_index.py +++ b/rust-rewrite/tools/compare_rust_python_index.py @@ -190,7 +190,7 @@ def print_human(report: dict[str, Any]) -> None: f"files={rust_summary['files']} symbols={rust_summary['symbols']} " f"global_variables={rust_summary['global_variables']} " f"imports={rust_summary['imports']} import_resolutions={rust_summary['import_resolutions']} " - f"references={rust_summary['references']}" + f"references={rust_summary['references']} dependencies={rust_summary['dependencies']}" ) print( "ratios: " diff --git a/rust-rewrite/tools/measure_codebase_rust_backend.py b/rust-rewrite/tools/measure_codebase_rust_backend.py index e699ba188..70193cd5c 100644 --- a/rust-rewrite/tools/measure_codebase_rust_backend.py +++ b/rust-rewrite/tools/measure_codebase_rust_backend.py @@ -61,6 +61,7 @@ def make_report(repo: Path) -> dict: "imports": summary.imports, "import_resolutions": summary.import_resolutions, "references": summary.references, + "dependencies": summary.dependencies, "bytes": summary.bytes, "lines": summary.lines, "files_with_errors": summary.files_with_errors, @@ -74,6 +75,7 @@ def make_report(repo: Path) -> dict: "rust_imports": len(codebase.rust_imports), "rust_import_resolutions": len(codebase.rust_import_resolutions), "rust_references": len(codebase.rust_references), + "rust_dependencies": len(codebase.rust_dependencies), }, "compat_handles": { "files": len(codebase.files), @@ -109,7 +111,8 @@ def print_human(report: dict) -> None: f"global_variables={summary['global_variables']} " f"imports={summary['imports']} " f"import_resolutions={summary['import_resolutions']} " - f"references={summary['references']}" + f"references={summary['references']} " + f"dependencies={summary['dependencies']}" ) print( "records: " @@ -117,7 +120,8 @@ def print_human(report: dict) -> None: 
f"symbols={records['rust_symbols']} " f"imports={records['rust_imports']} " f"import_resolutions={records['rust_import_resolutions']} " - f"references={records['rust_references']}" + f"references={records['rust_references']} " + f"dependencies={records['rust_dependencies']}" ) print( "compat handles: " diff --git a/rust-rewrite/tools/measure_rust_facade.py b/rust-rewrite/tools/measure_rust_facade.py index 1d0d92a6f..eb29e16a9 100644 --- a/rust-rewrite/tools/measure_rust_facade.py +++ b/rust-rewrite/tools/measure_rust_facade.py @@ -99,7 +99,8 @@ def print_human(report: dict) -> None: f"global_variables={summary['global_variables']} " f"imports={summary['imports']} " f"import_resolutions={summary['import_resolutions']} " - f"references={summary['references']}" + f"references={summary['references']} " + f"dependencies={summary['dependencies']}" ) diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py index 0f7105e29..2c8d60aa5 100644 --- a/src/graph_sitter/codebase/rust_backend.py +++ b/src/graph_sitter/codebase/rust_backend.py @@ -47,6 +47,7 @@ class RustIndexSummary: imports: int import_resolutions: int references: int + dependencies: int bytes: int lines: int files_with_errors: int @@ -174,6 +175,29 @@ def from_dict(cls, data: dict[str, Any]) -> RustReferenceRecord: ) +@dataclass(frozen=True) +class RustDependencyRecord: + id: int + source_symbol_id: int + target_symbol_id: int + source_file_id: int + target_file_id: int + reference_ids: list[int] + reference_count: int + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> RustDependencyRecord: + return cls( + id=int(data["id"]), + source_symbol_id=int(data["source_symbol_id"]), + target_symbol_id=int(data["target_symbol_id"]), + source_file_id=int(data["source_file_id"]), + target_file_id=int(data["target_file_id"]), + reference_ids=[int(reference_id) for reference_id in data["reference_ids"]], + reference_count=int(data["reference_count"]), + ) + + @dataclass 
class RustIndexBackend: repo_path: Path @@ -185,6 +209,7 @@ class RustIndexBackend: _imports: list[RustImportRecord] | None = None _import_resolutions: list[RustImportResolutionRecord] | None = None _references: list[RustReferenceRecord] | None = None + _dependencies: list[RustDependencyRecord] | None = None _file_handles: list[RustCompactFile] | None = None _symbol_handles: list[RustCompactSymbol] | None = None _import_handles: list[RustCompactImport] | None = None @@ -249,6 +274,12 @@ def references(self) -> list[RustReferenceRecord]: self._references = [RustReferenceRecord.from_dict(record) for record in json.loads(self.index.references_json())] return self._references + @property + def dependencies(self) -> list[RustDependencyRecord]: + if self._dependencies is None: + self._dependencies = [RustDependencyRecord.from_dict(record) for record in json.loads(self.index.dependencies_json())] + return self._dependencies + @property def file_handles(self) -> list[RustCompactFile]: if self._file_handles is None: diff --git a/src/graph_sitter/core/codebase.py b/src/graph_sitter/core/codebase.py index 89cc63876..d4c1c4d53 100644 --- a/src/graph_sitter/core/codebase.py +++ b/src/graph_sitter/core/codebase.py @@ -313,6 +313,11 @@ def rust_import_resolutions(self): def rust_references(self): return self._require_rust_index().references + @property + @noapidoc + def rust_dependencies(self): + return self._require_rust_index().dependencies + def _require_rust_index(self): if self.ctx.rust_index is None: msg = "Rust compact index is unavailable; construct Codebase with CodebaseConfig(graph_backend='rust')" diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index c75659c2b..7bf3ab600 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -19,6 +19,7 @@ def as_dict(self): "imports": 1, "import_resolutions": 1, "references": 1, + "dependencies": 1, "bytes": 64, "lines": 
8, "files_with_errors": 0, @@ -161,6 +162,21 @@ def references_json(self): ] ) + def dependencies_json(self): + return json.dumps( + [ + { + "id": 0, + "source_symbol_id": 1, + "target_symbol_id": 0, + "source_file_id": 0, + "target_file_id": 0, + "reference_ids": [0], + "reference_count": 1, + } + ] + ) + def install_fake_rust_extension(monkeypatch: pytest.MonkeyPatch) -> tuple[list[str], list[list[str]]]: indexed_paths: list[str] = [] @@ -204,11 +220,13 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.ctx.rust_index.summary.imports == 1 assert codebase.ctx.rust_index.summary.import_resolutions == 1 assert codebase.ctx.rust_index.summary.references == 1 + assert codebase.ctx.rust_index.summary.dependencies == 1 assert codebase.ctx.rust_index.files[0].path == "pkg/service.py" assert codebase.ctx.rust_index.symbols[0].name == "Service" assert codebase.ctx.rust_index.imports[0].name == "os" assert codebase.ctx.rust_index.import_resolutions[0].target_symbol_id == 0 assert codebase.ctx.rust_index.references[0].target_symbol_id == 0 + assert codebase.ctx.rust_index.dependencies[0].target_symbol_id == 0 assert codebase.rust_index_summary == codebase.ctx.rust_index.summary assert codebase.rust_files[0].path == "pkg/service.py" assert codebase.rust_classes[0].name == "Service" @@ -216,6 +234,7 @@ def test_codebase_context_builds_opt_in_rust_index(monkeypatch, tmp_path): assert codebase.rust_imports[0].name == "os" assert codebase.rust_import_resolutions[0].target_symbol_id == 0 assert codebase.rust_references[0].name == "Service" + assert codebase.rust_dependencies[0].reference_ids == [0] assert indexed_paths == [str(tmp_path.resolve())] assert selected_paths == [["pkg/service.py"]] From 4443e6847ee4ef407afb79e207647e42f9e641f4 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:23:32 -0700 Subject: [PATCH 016/228] Add pinned large repo benchmark runner --- rust-rewrite/benchmarks.md | 13 +- 
rust-rewrite/strategy.md | 11 +- .../tools/benchmark_pinned_python_repo.py | 268 ++++++++++++++++++ 3 files changed, 287 insertions(+), 5 deletions(-) create mode 100644 rust-rewrite/tools/benchmark_pinned_python_repo.py diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 1d746c336..4c7fa8784 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -105,6 +105,14 @@ PYTHONPATH=/path/to/dir/containing/graph_sitter_py_extension \ uv run python rust-rewrite/tools/measure_codebase_rust_backend.py . --json ``` +`rust-rewrite/tools/benchmark_pinned_python_repo.py` prepares a pinned external Python repository, builds the PyO3 extension, runs the Python parse/object-materialization harness, runs the Rust compact `Codebase` harness, and fails if the configured wall/RSS ratio gates are not met. The default pinned repo is Apache Airflow `2.10.5`, resolved to commit `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`: + +```bash +uv run python rust-rewrite/tools/benchmark_pinned_python_repo.py \ + --output /tmp/graph-sitter-airflow-2.10.5-benchmark.json \ + --json +``` + ## Metrics The JSON report includes: @@ -169,6 +177,7 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | | `graph-sitter` repo checkout | `--disable-graph` | 2.818s | 539.5 MB | 0.617s | 132.1 MB | 1130 | 1130 | 3956 | 6460 | 432 | 3669 | 2020 | yes | 4.568x | 4.085x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.863s | 3471.4 MB | 3.194s | 351.3 MB | 4789 | 4789 | 23663 | 40580 | 19011 | 95292 | 35489 | yes | 6.218x 
| 9.882x | Important caveats: @@ -176,11 +185,11 @@ Important caveats: - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. -- The generated fixture and this repo are useful proof points, but the huge-repo target still needs canonical pinned baselines. +- The generated fixture, this repo, and the pinned Airflow baseline are useful proof points, but full parity snapshots and additional canonical repos are still open. ## Open Questions -- Which exact small, medium, and huge repositories should become canonical Phase 0 baselines? +- Which additional small, medium, and huge repositories should become canonical Phase 0 baselines? - Should TypeScript baselines run with dependency manager and language engine flags off, on, or both? - Do we need allocator-level attribution with `memray`, `tracemalloc`, or `py-spy` in addition to RSS sampling? - What commit, dependency lockfile, and Python minor version should define the official baseline? diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 40156763d..563c36811 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -120,14 +120,17 @@ Recommended task format: - [x] Add memory benchmark harness for current Python backend. owner: Poincare. Result: added `rust-rewrite/tools/measure_python_backend.py`. - [x] Measure initial cold parse RSS and wall time for generated fixture and this repo. owner: codex. Result: recorded in `rust-rewrite/benchmarks.md`. 
-- [ ] Measure cold parse RSS and wall time for canonical small, medium, and huge repos. +- [x] Add pinned Python repository benchmark harness. owner: codex. Result: added `rust-rewrite/tools/benchmark_pinned_python_repo.py` to clone/fetch a pinned repo, build the PyO3 extension, run Python and Rust `Codebase` measurements, and enforce wall/RSS/file-count gates. +- [x] Measure first canonical huge Python repo cold parse/Rust compact backend baseline. owner: codex. Result: Apache Airflow `2.10.5` at `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf` records 4,789 Python files, 6.218x wall improvement, and 9.882x max-RSS improvement for the current compact Rust slice. +- [ ] Measure cold parse RSS and wall time for additional canonical small, medium, and huge repos. - [ ] Measure graph node/edge counts, Python object counts, and per-phase allocation peaks. - [x] Document the exact current build phases with timings: file enumeration, parse, directory tree, config parse, import resolution, export resolution, dependency recompute. owner: Poincare. Result: added phase map in `rust-rewrite/benchmarks.md`; representative repo timings remain open. - [x] Inventory all public `Codebase` properties and methods. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [x] Inventory all public `SourceFile`, `Symbol`, `Import`, `Export`, and `Directory` APIs used by tests/docs. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [x] Define P0 compatibility surface for the first Rust backend slice. owner: Dewey. Result: documented in `rust-rewrite/api-inventory.md`. - [ ] Define large-repo success targets for memory and time. -- [ ] Select pinned large Python repo commits for golden parity and latency benchmarks. Notes: Airflow is a good first candidate; record the exact upstream URL, commit SHA, Python version, and checkout/bootstrap command. +- [x] Select first pinned large Python repo commit for golden parity and latency benchmarks. owner: codex. 
Result: Apache Airflow `2.10.5`, upstream `https://github.com/apache/airflow.git`, ref `refs/tags/2.10.5`, commit `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`, measured with Python 3.13.11 on macOS. +- [ ] Select additional pinned large Python repo commits for golden parity and latency benchmarks. - [ ] Build golden reference/import/dependency graph snapshots for the pinned large Python repo commits. Notes: fixtures should assert file/module records, import graph edges, symbol reference graph edges, dependency graph edges, and deterministic sort order. - [x] Draft compact Rust data model with module boundaries and Python integration points. owner: Pasteur. Result: documented in `rust-rewrite/data-model.md`. - [ ] Draft full Rust engine RFC with module boundaries and Python integration points. @@ -146,7 +149,7 @@ Recommended task format: - [x] Add benchmark command comparing Python backend with Rust compact indexer. owner: codex. Result: added `rust-rewrite/tools/compare_rust_python_index.py`. - [x] Add benchmark command for the Python-facing Rust facade. owner: codex. Result: added `rust-rewrite/tools/measure_rust_facade.py`. - [x] Add benchmark command for real `Codebase` construction with the Rust compact backend. owner: codex. Result: added `rust-rewrite/tools/measure_codebase_rust_backend.py`. -- [ ] Add benchmark command that can select full `Codebase` `--backend python|rust` once Rust backend is wired into Python. +- [x] Add benchmark command that can select full `Codebase` `--backend python|rust` once Rust backend is wired into Python. owner: codex. Result: `benchmark_pinned_python_repo.py` runs Python and Rust `Codebase` measurements in child processes for pinned external repos. ## Phase 2: Parser And Compact Index Vertical Slice @@ -221,6 +224,7 @@ Recommended task format: - [ ] Run full unit suite with Python backend. - [ ] Run full unit suite with Rust backend where supported. - [ ] Add large-repo memory regression benchmark to CI or nightly. 
+- [x] Add pinned large-repo latency/RSS benchmark harness. owner: codex. Result: Airflow `2.10.5` benchmark command emits backend, wall time, max RSS, file count, node/edge counts, compact Rust record counts, mismatch summaries, and pass/fail gates. - [ ] Add pinned large-repo parity test for reference graph, import graph, dependency graph, and latency/RSS. Notes: run against the exact checked-out commit and emit backend, wall time, max RSS, file count, node/edge counts, and mismatch summaries. - [ ] Add feature flag documentation. - [ ] Add migration notes for unsupported APIs. @@ -253,3 +257,4 @@ Recommended task format: - [x] 2026-06-18: Added lightweight Rust compact handles for Python `Codebase.files`, `symbols`, `classes`, `functions`, `global_vars`, `imports`, and basic `get_*` queries. owner: codex. Notes: current checkout constructs and exercises public read handles 5.3x faster with 4.6x lower process max RSS than Python parse/object materialization while keeping `CodebaseContext.nodes` blocked. - [x] 2026-06-18: Added compact Python `ReferenceRecord` extraction for same-file and imported top-level symbol references inside top-level classes/functions. owner: codex. Notes: current checkout emits 3,666 compact references and remains 5.0x faster with 4.1x lower process max RSS than Python parse/object materialization. - [x] 2026-06-18: Added compact Python `DependencyRecord` construction from references. owner: codex. Notes: current checkout emits 2,020 de-duplicated dependency edges and remains 4.6x faster with 4.1x lower process max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added first pinned large-repo benchmark runner and Airflow baseline. owner: codex. Notes: Apache Airflow `2.10.5` at `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf` matched 4,789 Python files and measured 6.218x faster wall time with 9.882x lower max RSS for the current compact Rust `Codebase` slice. 
diff --git a/rust-rewrite/tools/benchmark_pinned_python_repo.py b/rust-rewrite/tools/benchmark_pinned_python_repo.py new file mode 100644 index 000000000..2ff61bb61 --- /dev/null +++ b/rust-rewrite/tools/benchmark_pinned_python_repo.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +import platform +import shutil +import subprocess +import sys +import sysconfig +from pathlib import Path +from typing import Any + +TOOLS_DIR = Path(__file__).resolve().parent +REPO_ROOT = TOOLS_DIR.parents[1] + +DEFAULT_REPO_NAME = "apache-airflow-2.10.5" +DEFAULT_REPO_URL = "https://github.com/apache/airflow.git" +DEFAULT_REF = "refs/tags/2.10.5" +DEFAULT_EXPECTED_COMMIT = "b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf" +DEFAULT_CACHE_DIR = Path("/tmp/graph-sitter-pinned-repos") +DEFAULT_EXTENSION_DIR = Path("/tmp/graph_sitter_py_pinned_benchmark") + + +def run(command: list[str], *, cwd: Path, env: dict[str, str] | None = None, timeout: int | None = None) -> subprocess.CompletedProcess[str]: + return subprocess.run(command, cwd=cwd, env=env, timeout=timeout, check=True, capture_output=True, text=True) + + +def parse_json_output(output: str) -> dict[str, Any]: + start = output.find("{") + end = output.rfind("}") + if start == -1 or end == -1 or end < start: + msg = f"command did not emit JSON output:\n{output}" + raise ValueError(msg) + return json.loads(output[start : end + 1]) + + +def git(repo: Path, *args: str, timeout: int | None = None) -> str: + result = run(["git", *args], cwd=repo, timeout=timeout) + return result.stdout.strip() + + +def prepare_pinned_repo(args: argparse.Namespace) -> tuple[Path, str]: + checkout = args.cache_dir / args.name + if args.reset_checkout and checkout.exists(): + shutil.rmtree(checkout) + checkout.parent.mkdir(parents=True, exist_ok=True) + + if not (checkout / ".git").exists(): + checkout.mkdir(parents=True, exist_ok=True) + git(checkout, "init", timeout=args.timeout) + 
git(checkout, "remote", "add", "origin", args.repo_url, timeout=args.timeout) + else: + existing_url = git(checkout, "remote", "get-url", "origin", timeout=args.timeout) + if existing_url != args.repo_url: + git(checkout, "remote", "set-url", "origin", args.repo_url, timeout=args.timeout) + + if not args.skip_fetch: + git(checkout, "fetch", "--depth=1", "origin", args.ref, timeout=args.timeout) + git(checkout, "checkout", "--detach", "FETCH_HEAD", timeout=args.timeout) + actual_commit = git(checkout, "rev-parse", "HEAD", timeout=args.timeout) + if args.expected_commit and actual_commit != args.expected_commit: + msg = f"expected {args.expected_commit} for {args.ref}, got {actual_commit}" + raise RuntimeError(msg) + return checkout, actual_commit + + +def build_rust_extension(extension_dir: Path, *, timeout: int | None) -> Path: + env = os.environ.copy() + env["PYO3_PYTHON"] = sys.executable + if sys.platform == "darwin": + dynamic_lookup_flags = "-C link-arg=-undefined -C link-arg=dynamic_lookup" + env["RUSTFLAGS"] = f"{env.get('RUSTFLAGS', '')} {dynamic_lookup_flags}".strip() + + subprocess.run( + ["cargo", "build", "--release", "-p", "graph-sitter-py", "--features", "extension-module"], + cwd=REPO_ROOT, + env=env, + timeout=timeout, + check=True, + ) + + if sys.platform == "darwin": + source = REPO_ROOT / "target/release/libgraph_sitter_py.dylib" + elif os.name == "nt": + source = REPO_ROOT / "target/release/graph_sitter_py.dll" + else: + source = REPO_ROOT / "target/release/libgraph_sitter_py.so" + if not source.exists(): + msg = f"built extension artifact not found: {source}" + raise FileNotFoundError(msg) + + extension_dir.mkdir(parents=True, exist_ok=True) + target = extension_dir / f"graph_sitter_py{sysconfig.get_config_var('EXT_SUFFIX')}" + shutil.copy2(source, target) + return target + + +def run_python_backend(repo: Path, args: argparse.Namespace) -> dict[str, Any]: + command = [ + sys.executable, + str(TOOLS_DIR / "measure_python_backend.py"), + 
str(repo), + "--language", + "python", + "--skip-object-counts", + "--sample-interval", + str(args.sample_interval), + "--json", + ] + if args.python_disable_graph: + command.append("--disable-graph") + result = run(command, cwd=REPO_ROOT, timeout=args.timeout) + return parse_json_output(result.stdout) + + +def run_rust_codebase(repo: Path, args: argparse.Namespace) -> dict[str, Any]: + env = os.environ.copy() + pythonpath = env.get("PYTHONPATH") + env["PYTHONPATH"] = str(args.extension_dir) if not pythonpath else f"{args.extension_dir}{os.pathsep}{pythonpath}" + command = [sys.executable, str(TOOLS_DIR / "measure_codebase_rust_backend.py"), str(repo), "--json"] + result = run(command, cwd=REPO_ROOT, env=env, timeout=args.timeout) + return parse_json_output(result.stdout) + + +def ratio(numerator: float, denominator: float) -> float | None: + if denominator <= 0: + return None + return round(numerator / denominator, 3) + + +def make_report(args: argparse.Namespace) -> dict[str, Any]: + repo, actual_commit = prepare_pinned_repo(args) + extension_path = None + if not args.skip_build_extension: + extension_path = build_rust_extension(args.extension_dir, timeout=args.timeout) + + python_report = run_python_backend(repo, args) + rust_report = run_rust_codebase(repo, args) + + python_totals = python_report["totals"] + python_graph = python_report["graph"] + rust_totals = rust_report["totals"] + rust_summary = rust_report["summary"] + wall_ratio = ratio(python_totals["wall_seconds"], rust_totals["wall_seconds"]) + rss_ratio = ratio(python_totals["max_rss_mb"], rust_totals["max_rss_mb"]) + + report = { + "metadata": { + "name": args.name, + "repo_url": args.repo_url, + "ref": args.ref, + "commit": actual_commit, + "checkout": str(repo), + "python": sys.version, + "platform": platform.platform(), + "python_disable_graph": args.python_disable_graph, + "sample_interval_seconds": args.sample_interval, + "extension_path": str(extension_path) if extension_path else None, + }, + 
"comparison": { + "python_to_rust_wall_ratio": wall_ratio, + "python_to_rust_rss_ratio": rss_ratio, + "python_wall_seconds": python_totals["wall_seconds"], + "rust_wall_seconds": rust_totals["wall_seconds"], + "python_max_rss_mb": python_totals["max_rss_mb"], + "rust_max_rss_mb": rust_totals["max_rss_mb"], + "python_source_files": python_graph["source_files"], + "rust_files": rust_summary["files"], + "file_count_match": python_graph["source_files"] == rust_summary["files"], + "rust_symbols": rust_summary["symbols"], + "rust_imports": rust_summary["imports"], + "rust_import_resolutions": rust_summary["import_resolutions"], + "rust_references": rust_summary["references"], + "rust_dependencies": rust_summary["dependencies"], + "python_nodes": python_graph["nodes"], + "python_edges": python_graph["edges"], + }, + "python_backend": python_report, + "rust_codebase": rust_report, + } + validate_report(report, args) + return report + + +def validate_report(report: dict[str, Any], args: argparse.Namespace) -> None: + comparison = report["comparison"] + failures = [] + wall_ratio = comparison["python_to_rust_wall_ratio"] + rss_ratio = comparison["python_to_rust_rss_ratio"] + if args.require_file_count_match and not comparison["file_count_match"]: + failures.append(f"file count mismatch: python={comparison['python_source_files']} rust={comparison['rust_files']}") + if wall_ratio is None or wall_ratio < args.min_wall_ratio: + failures.append(f"wall ratio {wall_ratio}x is below required {args.min_wall_ratio}x") + if rss_ratio is None or rss_ratio < args.min_rss_ratio: + failures.append(f"RSS ratio {rss_ratio}x is below required {args.min_rss_ratio}x") + if failures: + raise RuntimeError("; ".join(failures)) + + +def print_human(report: dict[str, Any]) -> None: + metadata = report["metadata"] + comparison = report["comparison"] + print(f"repo: {metadata['name']} {metadata['commit']}") + print(f"checkout: {metadata['checkout']}") + print(f"python disable_graph: 
{metadata['python_disable_graph']}") + print( + "python backend: " + f"wall={comparison['python_wall_seconds']:.3f}s " + f"max_rss={comparison['python_max_rss_mb']:.1f} MB " + f"files={comparison['python_source_files']} nodes={comparison['python_nodes']} edges={comparison['python_edges']}" + ) + print( + "rust Codebase: " + f"wall={comparison['rust_wall_seconds']:.3f}s " + f"max_rss={comparison['rust_max_rss_mb']:.1f} MB " + f"files={comparison['rust_files']} symbols={comparison['rust_symbols']} imports={comparison['rust_imports']} " + f"import_resolutions={comparison['rust_import_resolutions']} references={comparison['rust_references']} dependencies={comparison['rust_dependencies']}" + ) + print( + "ratios: " + f"wall={comparison['python_to_rust_wall_ratio']}x " + f"rss={comparison['python_to_rust_rss_ratio']}x " + f"file_count_match={comparison['file_count_match']}" + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark a pinned large Python repository against the compact Rust Codebase backend.") + parser.add_argument("--name", default=DEFAULT_REPO_NAME, help="Stable name for the pinned repository checkout.") + parser.add_argument("--repo-url", default=DEFAULT_REPO_URL, help="Git repository URL.") + parser.add_argument("--ref", default=DEFAULT_REF, help="Remote ref or commit to fetch.") + parser.add_argument("--expected-commit", default=DEFAULT_EXPECTED_COMMIT, help="Expected resolved commit SHA. 
Pass an empty string to disable.") + parser.add_argument("--cache-dir", type=Path, default=DEFAULT_CACHE_DIR, help="Directory for reusable pinned checkouts.") + parser.add_argument("--extension-dir", type=Path, default=DEFAULT_EXTENSION_DIR, help="Directory for the built PyO3 extension module.") + parser.add_argument("--reset-checkout", action="store_true", help="Delete and recreate the cached checkout before running.") + parser.add_argument("--skip-fetch", action="store_true", help="Do not fetch before checkout; useful for offline reruns with FETCH_HEAD present.") + parser.add_argument("--skip-build-extension", action="store_true", help="Reuse an existing graph_sitter_py extension in --extension-dir.") + parser.add_argument("--python-full-graph", action="store_false", dest="python_disable_graph", help="Measure the full Python graph instead of parse/object materialization only.") + parser.add_argument("--sample-interval", type=float, default=0.01, help="RSS sampling interval for the Python backend harness.") + parser.add_argument("--timeout", type=int, default=900, help="Timeout in seconds for clone/build/benchmark child commands.") + parser.add_argument("--min-wall-ratio", type=float, default=1.0, help="Fail unless Python wall time divided by Rust wall time is at least this value.") + parser.add_argument("--min-rss-ratio", type=float, default=1.0, help="Fail unless Python max RSS divided by Rust max RSS is at least this value.") + parser.add_argument("--allow-file-count-mismatch", action="store_false", dest="require_file_count_match", help="Do not fail if Python and Rust file counts differ.") + parser.add_argument("--output", type=Path, help="Optional path to write JSON report.") + parser.add_argument("--json", action="store_true", help="Print JSON report instead of a human summary.") + parser.set_defaults(python_disable_graph=True, require_file_count_match=True) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + if args.expected_commit 
== "": + args.expected_commit = None + report = make_report(args) + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(report, indent=2, sort_keys=True) + "\n", encoding="utf-8") + if args.json: + print(json.dumps(report, indent=2, sort_keys=True)) + else: + print_human(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From a1e140f01f9268793ca6fbe1180f5c974b1f07c1 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:27:58 -0700 Subject: [PATCH 017/228] Add pinned Airflow compact graph snapshot --- rust-rewrite/benchmarks.md | 36 +- .../apache-airflow-2.10.5-rust-compact.json | 1208 +++++++++++++++++ rust-rewrite/strategy.md | 6 +- .../tools/snapshot_pinned_python_repo.py | 393 ++++++ .../test_pinned_airflow_snapshot.py | 25 + 5 files changed, 1666 insertions(+), 2 deletions(-) create mode 100644 rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json create mode 100644 rust-rewrite/tools/snapshot_pinned_python_repo.py create mode 100644 tests/integration/rust_rewrite/test_pinned_airflow_snapshot.py diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 4c7fa8784..277e5b328 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -113,6 +113,25 @@ uv run python rust-rewrite/tools/benchmark_pinned_python_repo.py \ --json ``` +`rust-rewrite/tools/snapshot_pinned_python_repo.py` verifies a deterministic compact Rust graph snapshot for the same pinned Airflow checkout. 
The committed golden stores counts, stable SHA-256 digests, and sorted sample rows for files, symbols, imports, import resolutions, references, and dependencies: + +```bash +uv run python rust-rewrite/tools/snapshot_pinned_python_repo.py +``` + +Refresh the committed snapshot after intentional compact-IR changes: + +```bash +uv run python rust-rewrite/tools/snapshot_pinned_python_repo.py --update +``` + +The same check is available as an opt-in pytest integration test: + +```bash +GRAPH_SITTER_RUN_PINNED_AIRFLOW_SNAPSHOT=1 \ + uv run pytest tests/integration/rust_rewrite/test_pinned_airflow_snapshot.py -q +``` + ## Metrics The JSON report includes: @@ -179,13 +198,28 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | `graph-sitter` repo checkout | `--disable-graph` | 2.818s | 539.5 MB | 0.617s | 132.1 MB | 1130 | 1130 | 3956 | 6460 | 432 | 3669 | 2020 | yes | 4.568x | 4.085x | | Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.863s | 3471.4 MB | 3.194s | 351.3 MB | 4789 | 4789 | 23663 | 40580 | 19011 | 95292 | 35489 | yes | 6.218x | 9.882x | +## Pinned Compact Snapshot Evidence + +The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json`. It was generated from Apache Airflow `2.10.5` at commit `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`. 
+ +| Graph family | Count | SHA-256 | +| --- | ---: | --- | +| Files | 4789 | `226e8cb32dc0a23ec956e97b036e7c505037df979cce7182514f39a43b07cb80` | +| Symbols | 23663 | `02fd17a7c0ba4f8fa0f29dcdfc642bffcc8116c20b86f5519f15fd0447d08781` | +| Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | +| Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | +| References | 95292 | `677270b43e9578c64f08d85f8635d5bf4bea027ad513649e8767d1147633af5c` | +| Dependencies | 35489 | `0e18b4147f49a3bc58ae8bab3972535b0df3cbede968c7e14324e2b23fb31f70` | + +The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. + Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, internal import-resolution records, first-slice top-level Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. -- The generated fixture, this repo, and the pinned Airflow baseline are useful proof points, but full parity snapshots and additional canonical repos are still open. 
+- The generated fixture, this repo, and the pinned Airflow baseline are useful proof points, but Python-vs-Rust semantic parity snapshots and additional canonical repos are still open. ## Open Questions diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json new file mode 100644 index 000000000..80ff0bbaa --- /dev/null +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -0,0 +1,1208 @@ +{ + "graphs": { + "dependencies": { + "count": 35489, + "samples": [ + { + "reference_count": 1, + "source_file": "airflow/__init__.py", + "source_symbol": "airflow/__init__.py:function:__getattr__@4048", + "target_file": "airflow/__init__.py", + "target_symbol": "airflow/__init__.py:global_variable:__lazy_imports@3362" + }, + { + "reference_count": 1, + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:configure_internal_api@2850", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:class:AirflowConfigParser@6263" + }, + { + "reference_count": 1, + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:configure_internal_api@2850", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" + }, + { + "reference_count": 1, + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_file": "airflow/__main__.py", + "target_symbol": "airflow/__main__.py:function:configure_internal_api@2850" + }, + { + "reference_count": 1, + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:function:write_webserver_configuration_if_needed@93544" + }, + { + "reference_count": 2, + "source_file": "airflow/api/__init__.py", + "source_symbol": 
"airflow/api/__init__.py:function:load_auth@1058", + "target_file": "airflow/api/__init__.py", + "target_symbol": "airflow/api/__init__.py:global_variable:log@1018" + }, + { + "reference_count": 1, + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "reference_count": 1, + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:AirflowConfigException@1913" + }, + { + "reference_count": 1, + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" + }, + { + "reference_count": 2, + "source_file": "airflow/api/auth/backend/default.py", + "source_symbol": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", + "target_file": "airflow/api/auth/backend/default.py", + "target_symbol": "airflow/api/auth/backend/default.py:global_variable:T@1078" + }, + { + "reference_count": 2, + "source_file": "airflow/api/auth/backend/deny_all.py", + "source_symbol": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", + "target_file": "airflow/api/auth/backend/deny_all.py", + "target_symbol": "airflow/api/auth/backend/deny_all.py:global_variable:T@1091" + }, + { + "reference_count": 6, + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:class:_KerberosAuth@2887" + }, + { + "reference_count": 1, + "source_file": 
"airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" + }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" + }, + { + "reference_count": 4, + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" + }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_file": "airflow/utils/net.py", + "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" + }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:requires_authentication@4936", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_forbidden@3931" + }, + { + "reference_count": 1, + "source_file": 
"airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:requires_authentication@4936", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989" + }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:requires_authentication@4936", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_unauthorized@3781" + } + ], + "sha256": "0e18b4147f49a3bc58ae8bab3972535b0df3cbede968c7e14324e2b23fb31f70" + }, + "files": { + "count": 4789, + "samples": [ + { + "byte_len": 5673, + "has_error": false, + "line_count": 139, + "module_name": "airflow", + "path": "airflow/__init__.py" + }, + { + "byte_len": 4273, + "has_error": false, + "line_count": 90, + "module_name": "airflow.__main__", + "path": "airflow/__main__.py" + }, + { + "byte_len": 0, + "has_error": false, + "line_count": 0, + "module_name": "airflow._vendor", + "path": "airflow/_vendor/__init__.py" + }, + { + "byte_len": 1678, + "has_error": false, + "line_count": 48, + "module_name": "airflow.api", + "path": "airflow/api/__init__.py" + }, + { + "byte_len": 787, + "has_error": false, + "line_count": 17, + "module_name": "airflow.api.auth", + "path": "airflow/api/auth/__init__.py" + }, + { + "byte_len": 787, + "has_error": false, + "line_count": 17, + "module_name": "airflow.api.auth.backend", + "path": "airflow/api/auth/backend/__init__.py" + }, + { + "byte_len": 1723, + "has_error": false, + "line_count": 52, + "module_name": "airflow.api.auth.backend.basic_auth", + "path": "airflow/api/auth/backend/basic_auth.py" + }, + { + "byte_len": 1343, + "has_error": false, + "line_count": 42, + "module_name": "airflow.api.auth.backend.default", + "path": 
"airflow/api/auth/backend/default.py" + }, + { + "byte_len": 1357, + "has_error": false, + "line_count": 44, + "module_name": "airflow.api.auth.backend.deny_all", + "path": "airflow/api/auth/backend/deny_all.py" + }, + { + "byte_len": 6542, + "has_error": false, + "line_count": 182, + "module_name": "airflow.api.auth.backend.kerberos_auth", + "path": "airflow/api/auth/backend/kerberos_auth.py" + }, + { + "byte_len": 1773, + "has_error": false, + "line_count": 55, + "module_name": "airflow.api.auth.backend.session", + "path": "airflow/api/auth/backend/session.py" + }, + { + "byte_len": 1730, + "has_error": false, + "line_count": 46, + "module_name": "airflow.api.client", + "path": "airflow/api/client/__init__.py" + }, + { + "byte_len": 2731, + "has_error": false, + "line_count": 94, + "module_name": "airflow.api.client.api_client", + "path": "airflow/api/client/api_client.py" + }, + { + "byte_len": 6616, + "has_error": false, + "line_count": 164, + "module_name": "airflow.api.client.json_client", + "path": "airflow/api/client/json_client.py" + }, + { + "byte_len": 3788, + "has_error": false, + "line_count": 93, + "module_name": "airflow.api.client.local_client", + "path": "airflow/api/client/local_client.py" + }, + { + "byte_len": 787, + "has_error": false, + "line_count": 17, + "module_name": "airflow.api.common", + "path": "airflow/api/common/__init__.py" + }, + { + "byte_len": 3268, + "has_error": false, + "line_count": 89, + "module_name": "airflow.api.common.airflow_health", + "path": "airflow/api/common/airflow_health.py" + }, + { + "byte_len": 4086, + "has_error": false, + "line_count": 108, + "module_name": "airflow.api.common.delete_dag", + "path": "airflow/api/common/delete_dag.py" + }, + { + "byte_len": 2138, + "has_error": false, + "line_count": 56, + "module_name": "airflow.api.common.experimental", + "path": "airflow/api/common/experimental/__init__.py" + }, + { + "byte_len": 1135, + "has_error": false, + "line_count": 30, + "module_name": 
"airflow.api.common.experimental.delete_dag", + "path": "airflow/api/common/experimental/delete_dag.py" + } + ], + "sha256": "226e8cb32dc0a23ec956e97b036e7c505037df979cce7182514f39a43b07cb80" + }, + "import_resolutions": { + "count": 19011, + "samples": [ + { + "import": "airflow/__init__.py:from_import:airflow:configuration:@2460", + "source_file": "airflow/__init__.py", + "target_file": "airflow/configuration.py", + "target_symbol": null + }, + { + "import": "airflow/__init__.py:from_import:airflow:settings:@2460", + "source_file": "airflow/__init__.py", + "target_file": "airflow/settings.py", + "target_symbol": null + }, + { + "import": "airflow/__main__.py:from_import:airflow.cli:cli_parser:@1630", + "source_file": "airflow/__main__.py", + "target_file": "airflow/cli/cli_parser.py", + "target_symbol": null + }, + { + "import": "airflow/__main__.py:from_import:airflow.configuration:AirflowConfigParser:@1665", + "source_file": "airflow/__main__.py", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:class:AirflowConfigParser@6263" + }, + { + "import": "airflow/__main__.py:from_import:airflow.configuration:write_webserver_configuration_if_needed:@1665", + "source_file": "airflow/__main__.py", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:function:write_webserver_configuration_if_needed@93544" + }, + { + "import": "airflow/__main__.py:from_import:airflow.exceptions:AirflowException:@1760", + "source_file": "airflow/__main__.py", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" + }, + { + "import": "airflow/__main__.py:from_import:airflow:configuration:@1596", + "source_file": "airflow/__main__.py", + "target_file": "airflow/configuration.py", + "target_symbol": null + }, + { + "import": "airflow/api/__init__.py:from_import:airflow.configuration:conf:@906", + "source_file": "airflow/api/__init__.py", + "target_file": 
"airflow/configuration.py", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "import": "airflow/api/__init__.py:from_import:airflow.exceptions:AirflowConfigException:@945", + "source_file": "airflow/api/__init__.py", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:AirflowConfigException@1913" + }, + { + "import": "airflow/api/__init__.py:from_import:airflow.exceptions:AirflowException:@945", + "source_file": "airflow/api/__init__.py", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" + }, + { + "import": "airflow/api/auth/backend/basic_auth.py:from_import:airflow.exceptions:RemovedInAirflow3Warning:@1101", + "source_file": "airflow/api/auth/backend/basic_auth.py", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:RemovedInAirflow3Warning@16092" + }, + { + "import": "airflow/api/auth/backend/basic_auth.py:import::airflow.providers.fab.auth_manager.api.auth.backend.basic_auth:fab_basic_auth@1013", + "source_file": "airflow/api/auth/backend/basic_auth.py", + "target_file": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py", + "target_symbol": null + }, + { + "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.configuration:conf:@2525", + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.exceptions:RemovedInAirflow3Warning:@838", + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:RemovedInAirflow3Warning@16092" + }, + { + "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.utils.airflow_flask_app:get_airflow_app:@894", + 
"source_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_file": "airflow/utils/airflow_flask_app.py", + "target_symbol": "airflow/utils/airflow_flask_app.py:function:get_airflow_app@1176" + }, + { + "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.utils.net:getfqdn:@2564", + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_file": "airflow/utils/net.py", + "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" + }, + { + "import": "airflow/api/auth/backend/session.py:from_import:airflow.exceptions:RemovedInAirflow3Warning:@981", + "source_file": "airflow/api/auth/backend/session.py", + "target_file": "airflow/exceptions.py", + "target_symbol": "airflow/exceptions.py:class:RemovedInAirflow3Warning@16092" + }, + { + "import": "airflow/api/auth/backend/session.py:from_import:airflow.www.extensions.init_auth_manager:get_auth_manager:@1037", + "source_file": "airflow/api/auth/backend/session.py", + "target_file": "airflow/www/extensions/init_auth_manager.py", + "target_symbol": "airflow/www/extensions/init_auth_manager.py:function:get_auth_manager@2003" + }, + { + "import": "airflow/api/client/__init__.py:from_import:airflow.configuration:conf:@977", + "source_file": "airflow/api/client/__init__.py", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "import": "airflow/api/client/__init__.py:from_import:airflow:api:@953", + "source_file": "airflow/api/client/__init__.py", + "target_file": "airflow/api/__init__.py", + "target_symbol": null + } + ], + "sha256": "84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9" + }, + "imports": { + "count": 40580, + "samples": [ + { + "alias": null, + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:future_import:__future__:annotations:@787", + "kind": "future_import", + "module": "__future__", + "name": "annotations", + "range": [ + 787, + 821 + ] + }, + { + "alias": 
null, + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:import::os:@847", + "kind": "import", + "module": null, + "name": "os", + "range": [ + 847, + 856 + ] + }, + { + "alias": null, + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:import::sys:@857", + "kind": "import", + "module": null, + "name": "sys", + "range": [ + 857, + 867 + ] + }, + { + "alias": null, + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:import::warnings:@868", + "kind": "import", + "module": null, + "name": "warnings", + "range": [ + 868, + 883 + ] + }, + { + "alias": null, + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:from_import:typing:TYPE_CHECKING:@884", + "kind": "from_import", + "module": "typing", + "name": "TYPE_CHECKING", + "range": [ + 884, + 916 + ] + }, + { + "alias": null, + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:from_import:airflow:configuration:@2460", + "kind": "from_import", + "module": "airflow", + "name": "configuration", + "range": [ + 2460, + 2503 + ] + }, + { + "alias": null, + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:from_import:airflow:settings:@2460", + "kind": "from_import", + "module": "airflow", + "name": "settings", + "range": [ + 2460, + 2503 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:future_import:__future__:annotations:@864", + "kind": "future_import", + "module": "__future__", + "name": "annotations", + "range": [ + 864, + 898 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:import::os:@900", + "kind": "import", + "module": null, + "name": "os", + "range": [ + 900, + 909 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:from_import:argparse:Namespace:@910", + "kind": "from_import", + "module": "argparse", + "name": "Namespace", + "range": [ + 910, + 940 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": 
"airflow/__main__.py:import::argcomplete:@942", + "kind": "import", + "module": null, + "name": "argcomplete", + "range": [ + 942, + 960 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:from_import:airflow:configuration:@1596", + "kind": "from_import", + "module": "airflow", + "name": "configuration", + "range": [ + 1596, + 1629 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:from_import:airflow.cli:cli_parser:@1630", + "kind": "from_import", + "module": "airflow.cli", + "name": "cli_parser", + "range": [ + 1630, + 1664 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:from_import:airflow.configuration:AirflowConfigParser:@1665", + "kind": "from_import", + "module": "airflow.configuration", + "name": "AirflowConfigParser", + "range": [ + 1665, + 1759 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:from_import:airflow.configuration:write_webserver_configuration_if_needed:@1665", + "kind": "from_import", + "module": "airflow.configuration", + "name": "write_webserver_configuration_if_needed", + "range": [ + 1665, + 1759 + ] + }, + { + "alias": null, + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:from_import:airflow.exceptions:AirflowException:@1760", + "kind": "from_import", + "module": "airflow.exceptions", + "name": "AirflowException", + "range": [ + 1760, + 1807 + ] + }, + { + "alias": null, + "file": "airflow/api/__init__.py", + "key": "airflow/api/__init__.py:future_import:__future__:annotations:@818", + "kind": "future_import", + "module": "__future__", + "name": "annotations", + "range": [ + 818, + 852 + ] + }, + { + "alias": null, + "file": "airflow/api/__init__.py", + "key": "airflow/api/__init__.py:import::logging:@854", + "kind": "import", + "module": null, + "name": "logging", + "range": [ + 854, + 868 + ] + }, + { + "alias": null, + "file": 
"airflow/api/__init__.py", + "key": "airflow/api/__init__.py:from_import:importlib:import_module:@869", + "kind": "from_import", + "module": "importlib", + "name": "import_module", + "range": [ + 869, + 904 + ] + }, + { + "alias": null, + "file": "airflow/api/__init__.py", + "key": "airflow/api/__init__.py:from_import:airflow.configuration:conf:@906", + "kind": "from_import", + "module": "airflow.configuration", + "name": "conf", + "range": [ + 906, + 944 + ] + } + ], + "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" + }, + "references": { + "count": 95292, + "samples": [ + { + "import": null, + "name": "__lazy_imports", + "range": [ + 4169, + 4183 + ], + "source_file": "airflow/__init__.py", + "source_symbol": "airflow/__init__.py:function:__getattr__@4048", + "target_symbol": "airflow/__init__.py:global_variable:__lazy_imports@3362" + }, + { + "import": "airflow/__main__.py:from_import:airflow.configuration:write_webserver_configuration_if_needed:@1665", + "name": "write_webserver_configuration_if_needed", + "range": [ + 2738, + 2777 + ], + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_symbol": "airflow/configuration.py:function:write_webserver_configuration_if_needed@93544" + }, + { + "import": null, + "name": "configure_internal_api", + "range": [ + 2788, + 2810 + ], + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_symbol": "airflow/__main__.py:function:configure_internal_api@2850" + }, + { + "import": "airflow/__main__.py:from_import:airflow.configuration:AirflowConfigParser:@1665", + "name": "AirflowConfigParser", + "range": [ + 2896, + 2915 + ], + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:configure_internal_api@2850", + "target_symbol": "airflow/configuration.py:class:AirflowConfigParser@6263" + }, + { + "import": 
"airflow/__main__.py:from_import:airflow.exceptions:AirflowException:@1760", + "name": "AirflowException", + "range": [ + 3863, + 3879 + ], + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:configure_internal_api@2850", + "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" + }, + { + "import": "airflow/api/__init__.py:from_import:airflow.configuration:conf:@906", + "name": "conf", + "range": [ + 1199, + 1203 + ], + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "import": "airflow/api/__init__.py:from_import:airflow.exceptions:AirflowConfigException:@945", + "name": "AirflowConfigException", + "range": [ + 1243, + 1265 + ], + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_symbol": "airflow/exceptions.py:class:AirflowConfigException@1913" + }, + { + "import": null, + "name": "log", + "range": [ + 1419, + 1422 + ], + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_symbol": "airflow/api/__init__.py:global_variable:log@1018" + }, + { + "import": null, + "name": "log", + "range": [ + 1541, + 1544 + ], + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_symbol": "airflow/api/__init__.py:global_variable:log@1018" + }, + { + "import": "airflow/api/__init__.py:from_import:airflow.exceptions:AirflowException:@945", + "name": "AirflowException", + "range": [ + 1636, + 1652 + ], + "source_file": "airflow/api/__init__.py", + "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", + "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" + }, + { + "import": null, + "name": "T", + "range": [ + 1151, + 1152 + ], + "source_file": 
"airflow/api/auth/backend/default.py", + "source_symbol": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", + "target_symbol": "airflow/api/auth/backend/default.py:global_variable:T@1078" + }, + { + "import": null, + "name": "T", + "range": [ + 1329, + 1330 + ], + "source_file": "airflow/api/auth/backend/default.py", + "source_symbol": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", + "target_symbol": "airflow/api/auth/backend/default.py:global_variable:T@1078" + }, + { + "import": null, + "name": "T", + "range": [ + 1164, + 1165 + ], + "source_file": "airflow/api/auth/backend/deny_all.py", + "source_symbol": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", + "target_symbol": "airflow/api/auth/backend/deny_all.py:global_variable:T@1091" + }, + { + "import": null, + "name": "T", + "range": [ + 1343, + 1344 + ], + "source_file": "airflow/api/auth/backend/deny_all.py", + "source_symbol": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", + "target_symbol": "airflow/api/auth/backend/deny_all.py:global_variable:T@1091" + }, + { + "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.utils.net:getfqdn:@2564", + "name": "getfqdn", + "range": [ + 3232, + 3239 + ], + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" + }, + { + "import": null, + "name": "log", + "range": [ + 3246, + 3249 + ], + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" + }, + { + "import": null, + "name": "_KERBEROS_SERVICE", + "range": [ + 3320, + 3337 + ], + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + 
"source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" + }, + { + "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.configuration:conf:@2525", + "name": "conf", + "range": [ + 3454, + 3458 + ], + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "import": null, + "name": "log", + "range": [ + 3503, + 3506 + ], + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" + }, + { + "import": null, + "name": "log", + "range": [ + 3674, + 3677 + ], + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" + } + ], + "sha256": "677270b43e9578c64f08d85f8635d5bf4bea027ad513649e8767d1147633af5c" + }, + "symbols": { + "count": 23663, + "samples": [ + { + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:function:__getattr__@4048", + "kind": "function", + "name": "__getattr__", + "name_range": [ + 4048, + 4059 + ], + "range": [ + 4044, + 5292 + ] + }, + { + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:global_variable:__all__@2505", + "kind": "global_variable", + "name": "__all__", + "name_range": [ + 2505, + 2512 + ], + "range": [ + 2505, + 2578 + ] + }, + { + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:global_variable:__lazy_imports@3362", + "kind": "global_variable", + "name": "__lazy_imports", + "name_range": [ + 3362, + 3376 + ], + "range": [ + 
3362, + 3698 + ] + }, + { + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:global_variable:__path__@2728", + "kind": "global_variable", + "name": "__path__", + "name_range": [ + 2728, + 2736 + ], + "range": [ + 2728, + 2792 + ] + }, + { + "file": "airflow/__init__.py", + "key": "airflow/__init__.py:global_variable:__version__@823", + "kind": "global_variable", + "name": "__version__", + "name_range": [ + 823, + 834 + ], + "range": [ + 823, + 845 + ] + }, + { + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:function:configure_internal_api@2850", + "kind": "function", + "name": "configure_internal_api", + "name_range": [ + 2850, + 2872 + ], + "range": [ + 2846, + 4232 + ] + }, + { + "file": "airflow/__main__.py", + "key": "airflow/__main__.py:function:main@1814", + "kind": "function", + "name": "main", + "name_range": [ + 1814, + 1818 + ], + "range": [ + 1810, + 2843 + ] + }, + { + "file": "airflow/api/__init__.py", + "key": "airflow/api/__init__.py:function:load_auth@1058", + "kind": "function", + "name": "load_auth", + "name_range": [ + 1058, + 1067 + ], + "range": [ + 1054, + 1677 + ] + }, + { + "file": "airflow/api/__init__.py", + "key": "airflow/api/__init__.py:global_variable:log@1018", + "kind": "global_variable", + "name": "log", + "name_range": [ + 1018, + 1021 + ], + "range": [ + 1018, + 1051 + ] + }, + { + "file": "airflow/api/auth/backend/basic_auth.py", + "key": "airflow/api/auth/backend/basic_auth.py:function:auth_current_user@1530", + "kind": "function", + "name": "auth_current_user", + "name_range": [ + 1530, + 1547 + ], + "range": [ + 1526, + 1611 + ] + }, + { + "file": "airflow/api/auth/backend/basic_auth.py", + "key": "airflow/api/auth/backend/basic_auth.py:function:init_app@1480", + "kind": "function", + "name": "init_app", + "name_range": [ + 1480, + 1488 + ], + "range": [ + 1476, + 1523 + ] + }, + { + "file": "airflow/api/auth/backend/basic_auth.py", + "key": 
"airflow/api/auth/backend/basic_auth.py:function:requires_authentication@1618", + "kind": "function", + "name": "requires_authentication", + "name_range": [ + 1618, + 1641 + ], + "range": [ + 1614, + 1722 + ] + }, + { + "file": "airflow/api/auth/backend/basic_auth.py", + "key": "airflow/api/auth/backend/basic_auth.py:global_variable:CLIENT_AUTH@1240", + "kind": "global_variable", + "name": "CLIENT_AUTH", + "name_range": [ + 1240, + 1251 + ], + "range": [ + 1240, + 1288 + ] + }, + { + "file": "airflow/api/auth/backend/default.py", + "key": "airflow/api/auth/backend/default.py:function:init_app@1018", + "kind": "function", + "name": "init_app", + "name_range": [ + 1018, + 1026 + ], + "range": [ + 1014, + 1075 + ] + }, + { + "file": "airflow/api/auth/backend/default.py", + "key": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", + "kind": "function", + "name": "requires_authentication", + "name_range": [ + 1117, + 1140 + ], + "range": [ + 1113, + 1342 + ] + }, + { + "file": "airflow/api/auth/backend/default.py", + "key": "airflow/api/auth/backend/default.py:global_variable:CLIENT_AUTH@963", + "kind": "global_variable", + "name": "CLIENT_AUTH", + "name_range": [ + 963, + 974 + ], + "range": [ + 963, + 1011 + ] + }, + { + "file": "airflow/api/auth/backend/default.py", + "key": "airflow/api/auth/backend/default.py:global_variable:T@1078", + "kind": "global_variable", + "name": "T", + "name_range": [ + 1078, + 1079 + ], + "range": [ + 1078, + 1110 + ] + }, + { + "file": "airflow/api/auth/backend/deny_all.py", + "key": "airflow/api/auth/backend/deny_all.py:function:init_app@1039", + "kind": "function", + "name": "init_app", + "name_range": [ + 1039, + 1047 + ], + "range": [ + 1035, + 1088 + ] + }, + { + "file": "airflow/api/auth/backend/deny_all.py", + "key": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", + "kind": "function", + "name": "requires_authentication", + "name_range": [ + 1130, + 1153 + ], + "range": [ + 
1126, + 1356 + ] + }, + { + "file": "airflow/api/auth/backend/deny_all.py", + "key": "airflow/api/auth/backend/deny_all.py:global_variable:CLIENT_AUTH@984", + "kind": "global_variable", + "name": "CLIENT_AUTH", + "name_range": [ + 984, + 995 + ], + "range": [ + 984, + 1032 + ] + } + ], + "sha256": "02fd17a7c0ba4f8fa0f29dcdfc642bffcc8116c20b86f5519f15fd0447d08781" + } + }, + "integrity": { + "bad_dependency_reference_counts": 0, + "bad_dependency_reference_targets": 0, + "missing_dependency_links": 0, + "missing_import_resolution_links": 0, + "missing_reference_links": 0 + }, + "metadata": { + "commit": "b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf", + "name": "apache-airflow-2.10.5", + "ref": "refs/tags/2.10.5", + "repo_url": "https://github.com/apache/airflow.git" + }, + "schema_version": 1, + "summary": { + "bytes": 36617627, + "classes": 5379, + "dependencies": 35489, + "files": 4789, + "files_with_errors": 0, + "functions": 6145, + "global_variables": 12139, + "import_resolutions": 19011, + "imports": 40580, + "lines": 924514, + "references": 95292, + "symbols": 23663 + } +} diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 563c36811..23a129bdb 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -131,7 +131,8 @@ Recommended task format: - [ ] Define large-repo success targets for memory and time. - [x] Select first pinned large Python repo commit for golden parity and latency benchmarks. owner: codex. Result: Apache Airflow `2.10.5`, upstream `https://github.com/apache/airflow.git`, ref `refs/tags/2.10.5`, commit `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`, measured with Python 3.13.11 on macOS. - [ ] Select additional pinned large Python repo commits for golden parity and latency benchmarks. -- [ ] Build golden reference/import/dependency graph snapshots for the pinned large Python repo commits. 
Notes: fixtures should assert file/module records, import graph edges, symbol reference graph edges, dependency graph edges, and deterministic sort order. +- [x] Build first compact Rust golden graph snapshot for the pinned large Python repo commit. owner: codex. Result: committed `rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json` with stable files, symbols, imports, import-resolution, reference, and dependency counts/hashes/samples plus integrity checks. +- [ ] Compare golden reference/import/dependency graph snapshots against the Python backend semantics for the pinned large Python repo commits. Notes: fixtures should assert file/module records, import graph edges, symbol reference graph edges, dependency graph edges, and deterministic sort order. - [x] Draft compact Rust data model with module boundaries and Python integration points. owner: Pasteur. Result: documented in `rust-rewrite/data-model.md`. - [ ] Draft full Rust engine RFC with module boundaries and Python integration points. - [ ] Decide build tooling: `maturin`, setuptools-rust, or hatch custom hook. @@ -189,6 +190,7 @@ Recommended task format: - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. - [ ] Add graph debug dump for nodes, edges, and usage metadata. +- [x] Add compact Rust graph debug snapshot for pinned Airflow. owner: codex. Result: `snapshot_pinned_python_repo.py` normalizes compact records by stable paths/symbol keys and emits deterministic counts, hashes, and sample rows for large-repo review. - [ ] Add parity tests comparing Python backend and Rust backend graph edges on fixtures. ## Phase 4: Lazy Python Compatibility Layer @@ -225,6 +227,7 @@ Recommended task format: - [ ] Run full unit suite with Rust backend where supported. - [ ] Add large-repo memory regression benchmark to CI or nightly. - [x] Add pinned large-repo latency/RSS benchmark harness. 
owner: codex. Result: Airflow `2.10.5` benchmark command emits backend, wall time, max RSS, file count, node/edge counts, compact Rust record counts, mismatch summaries, and pass/fail gates. +- [x] Add opt-in pinned large-repo compact snapshot test. owner: codex. Result: `tests/integration/rust_rewrite/test_pinned_airflow_snapshot.py` runs the committed Airflow compact golden check when `GRAPH_SITTER_RUN_PINNED_AIRFLOW_SNAPSHOT=1`. - [ ] Add pinned large-repo parity test for reference graph, import graph, dependency graph, and latency/RSS. Notes: run against the exact checked-out commit and emit backend, wall time, max RSS, file count, node/edge counts, and mismatch summaries. - [ ] Add feature flag documentation. - [ ] Add migration notes for unsupported APIs. @@ -258,3 +261,4 @@ Recommended task format: - [x] 2026-06-18: Added compact Python `ReferenceRecord` extraction for same-file and imported top-level symbol references inside top-level classes/functions. owner: codex. Notes: current checkout emits 3,666 compact references and remains 5.0x faster with 4.1x lower process max RSS than Python parse/object materialization. - [x] 2026-06-18: Added compact Python `DependencyRecord` construction from references. owner: codex. Notes: current checkout emits 2,020 de-duplicated dependency edges and remains 4.6x faster with 4.1x lower process max RSS than Python parse/object materialization. - [x] 2026-06-18: Added first pinned large-repo benchmark runner and Airflow baseline. owner: codex. Notes: Apache Airflow `2.10.5` at `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf` matched 4,789 Python files and measured 6.218x faster wall time with 9.882x lower max RSS for the current compact Rust `Codebase` slice. +- [x] 2026-06-18: Added first pinned Airflow compact graph golden. owner: codex. 
Notes: committed stable hashes/samples for 4,789 files, 23,663 symbols, 40,580 imports, 19,011 import resolutions, 95,292 references, and 35,489 dependencies; the opt-in pytest wrapper can verify it against the pinned checkout. diff --git a/rust-rewrite/tools/snapshot_pinned_python_repo.py b/rust-rewrite/tools/snapshot_pinned_python_repo.py new file mode 100644 index 000000000..658139452 --- /dev/null +++ b/rust-rewrite/tools/snapshot_pinned_python_repo.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import hashlib +import json +import resource +import sys +import time +from pathlib import Path +from typing import Any + +TOOLS_DIR = Path(__file__).resolve().parent +REPO_ROOT = TOOLS_DIR.parents[1] +SRC_ROOT = REPO_ROOT / "src" +if str(TOOLS_DIR) not in sys.path: + sys.path.insert(0, str(TOOLS_DIR)) +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from benchmark_pinned_python_repo import ( # noqa: E402 + DEFAULT_CACHE_DIR, + DEFAULT_EXPECTED_COMMIT, + DEFAULT_EXTENSION_DIR, + DEFAULT_REF, + DEFAULT_REPO_NAME, + DEFAULT_REPO_URL, + build_rust_extension, + prepare_pinned_repo, +) + +DEFAULT_EXPECTED_SNAPSHOT = REPO_ROOT / "rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json" +SNAPSHOT_SCHEMA_VERSION = 1 + + +def bytes_to_mb(value: float) -> float: + return value / (1024 * 1024) + + +def max_rss_bytes() -> int: + rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + if sys.platform == "darwin": + return int(rss) + return int(rss * 1024) + + +def stable_json(data: Any) -> str: + return json.dumps(data, sort_keys=True, separators=(",", ":")) + + +def row_digest(rows: list[dict[str, Any]]) -> str: + digest = hashlib.sha256() + for row in rows: + digest.update(stable_json(row).encode("utf-8")) + digest.update(b"\n") + return digest.hexdigest() + + +def compact_record_set(rows: list[dict[str, Any]], *, sample_size: int) -> dict[str, Any]: + return { + "count": len(rows), + "sha256": 
row_digest(rows), + "samples": rows[:sample_size], + } + + +def symbol_key(symbol: Any, file_by_id: dict[int, Any]) -> str: + file = file_by_id[symbol.file_id] + return f"{file.path}:{symbol.kind}:{symbol.name}@{symbol.name_range.start_byte}" + + +def import_key(import_record: Any, file_by_id: dict[int, Any]) -> str: + file = file_by_id[import_record.file_id] + module = import_record.module if import_record.module is not None else "" + name = import_record.name if import_record.name is not None else "" + alias = import_record.alias if import_record.alias is not None else "" + return f"{file.path}:{import_record.kind}:{module}:{name}:{alias}@{import_record.range.start_byte}" + + +def make_file_rows(codebase: Any) -> list[dict[str, Any]]: + rows = [ + { + "path": file.path, + "module_name": file.module_name, + "byte_len": file.byte_len, + "line_count": file.line_count, + "has_error": file.has_error, + } + for file in codebase.rust_files + ] + return sorted(rows, key=lambda row: row["path"]) + + +def make_symbol_rows(codebase: Any, file_by_id: dict[int, Any]) -> list[dict[str, Any]]: + rows = [ + { + "key": symbol_key(symbol, file_by_id), + "file": file_by_id[symbol.file_id].path, + "kind": symbol.kind, + "name": symbol.name, + "range": [symbol.range.start_byte, symbol.range.end_byte], + "name_range": [symbol.name_range.start_byte, symbol.name_range.end_byte], + } + for symbol in codebase.rust_symbols + ] + return sorted(rows, key=lambda row: (row["file"], row["kind"], row["name"], row["name_range"])) + + +def make_import_rows(codebase: Any, file_by_id: dict[int, Any]) -> list[dict[str, Any]]: + rows = [ + { + "key": import_key(import_record, file_by_id), + "file": file_by_id[import_record.file_id].path, + "kind": import_record.kind, + "module": import_record.module, + "name": import_record.name, + "alias": import_record.alias, + "range": [import_record.range.start_byte, import_record.range.end_byte], + } + for import_record in codebase.rust_imports + ] + return 
sorted(rows, key=lambda row: (row["file"], row["range"], row["kind"], row["module"] or "", row["name"] or "", row["alias"] or "")) + + +def make_import_resolution_rows( + codebase: Any, + file_by_id: dict[int, Any], + symbol_by_id: dict[int, Any], + import_by_id: dict[int, Any], +) -> list[dict[str, Any]]: + rows = [] + for resolution in codebase.rust_import_resolutions: + target_symbol = None if resolution.target_symbol_id is None else symbol_by_id[resolution.target_symbol_id] + rows.append( + { + "import": import_key(import_by_id[resolution.import_id], file_by_id), + "source_file": file_by_id[resolution.source_file_id].path, + "target_file": file_by_id[resolution.target_file_id].path, + "target_symbol": None if target_symbol is None else symbol_key(target_symbol, file_by_id), + } + ) + return sorted(rows, key=lambda row: (row["source_file"], row["import"], row["target_file"], row["target_symbol"] or "")) + + +def make_reference_rows( + codebase: Any, + file_by_id: dict[int, Any], + symbol_by_id: dict[int, Any], + import_by_id: dict[int, Any], +) -> list[dict[str, Any]]: + rows = [] + for reference in codebase.rust_references: + source_symbol = None if reference.source_symbol_id is None else symbol_by_id[reference.source_symbol_id] + rows.append( + { + "source_file": file_by_id[reference.source_file_id].path, + "source_symbol": None if source_symbol is None else symbol_key(source_symbol, file_by_id), + "target_symbol": symbol_key(symbol_by_id[reference.target_symbol_id], file_by_id), + "import": None if reference.import_id is None else import_key(import_by_id[reference.import_id], file_by_id), + "name": reference.name, + "range": [reference.range.start_byte, reference.range.end_byte], + } + ) + return sorted(rows, key=lambda row: (row["source_file"], row["range"], row["source_symbol"] or "", row["target_symbol"], row["name"])) + + +def make_dependency_rows( + codebase: Any, + file_by_id: dict[int, Any], + symbol_by_id: dict[int, Any], +) -> list[dict[str, Any]]: + 
rows = [ + { + "source_file": file_by_id[dependency.source_file_id].path, + "source_symbol": symbol_key(symbol_by_id[dependency.source_symbol_id], file_by_id), + "target_file": file_by_id[dependency.target_file_id].path, + "target_symbol": symbol_key(symbol_by_id[dependency.target_symbol_id], file_by_id), + "reference_count": dependency.reference_count, + } + for dependency in codebase.rust_dependencies + ] + return sorted(rows, key=lambda row: (row["source_symbol"], row["target_symbol"], row["reference_count"])) + + +def validate_integrity(codebase: Any) -> dict[str, int]: + file_ids = {file.id for file in codebase.rust_files} + symbol_ids = {symbol.id for symbol in codebase.rust_symbols} + import_ids = {import_record.id for import_record in codebase.rust_imports} + reference_by_id = {reference.id: reference for reference in codebase.rust_references} + + missing_import_resolution_links = 0 + for resolution in codebase.rust_import_resolutions: + missing_import_resolution_links += int(resolution.import_id not in import_ids) + missing_import_resolution_links += int(resolution.source_file_id not in file_ids) + missing_import_resolution_links += int(resolution.target_file_id not in file_ids) + if resolution.target_symbol_id is not None: + missing_import_resolution_links += int(resolution.target_symbol_id not in symbol_ids) + + missing_reference_links = 0 + for reference in codebase.rust_references: + missing_reference_links += int(reference.source_file_id not in file_ids) + missing_reference_links += int(reference.target_symbol_id not in symbol_ids) + if reference.source_symbol_id is not None: + missing_reference_links += int(reference.source_symbol_id not in symbol_ids) + if reference.import_id is not None: + missing_reference_links += int(reference.import_id not in import_ids) + + missing_dependency_links = 0 + bad_dependency_reference_counts = 0 + bad_dependency_reference_targets = 0 + for dependency in codebase.rust_dependencies: + missing_dependency_links += 
int(dependency.source_file_id not in file_ids) + missing_dependency_links += int(dependency.target_file_id not in file_ids) + missing_dependency_links += int(dependency.source_symbol_id not in symbol_ids) + missing_dependency_links += int(dependency.target_symbol_id not in symbol_ids) + if dependency.reference_count != len(dependency.reference_ids): + bad_dependency_reference_counts += 1 + for reference_id in dependency.reference_ids: + reference = reference_by_id.get(reference_id) + if reference is None: + missing_dependency_links += 1 + continue + if reference.source_symbol_id != dependency.source_symbol_id or reference.target_symbol_id != dependency.target_symbol_id: + bad_dependency_reference_targets += 1 + + return { + "missing_import_resolution_links": missing_import_resolution_links, + "missing_reference_links": missing_reference_links, + "missing_dependency_links": missing_dependency_links, + "bad_dependency_reference_counts": bad_dependency_reference_counts, + "bad_dependency_reference_targets": bad_dependency_reference_targets, + } + + +def assert_integrity(integrity: dict[str, int]) -> None: + failures = [f"{name}={value}" for name, value in integrity.items() if value != 0] + if failures: + msg = "compact graph integrity check failed: " + ", ".join(failures) + raise RuntimeError(msg) + + +def make_snapshot(args: argparse.Namespace) -> tuple[dict[str, Any], dict[str, Any]]: + repo, actual_commit = prepare_pinned_repo(args) + extension_path = None + if not args.skip_build_extension: + extension_path = build_rust_extension(args.extension_dir, timeout=args.timeout) + if str(args.extension_dir) not in sys.path: + sys.path.insert(0, str(args.extension_dir)) + + from graph_sitter.configs.models.codebase import CodebaseConfig, GraphBackend, RustFallbackMode + from graph_sitter.core.codebase import Codebase + + start = time.perf_counter() + config = CodebaseConfig(graph_backend=GraphBackend.RUST, rust_fallback=RustFallbackMode.ERROR) + codebase = 
Codebase(str(repo), language="python", config=config) + wall = time.perf_counter() - start + + file_by_id = {file.id: file for file in codebase.rust_files} + symbol_by_id = {symbol.id: symbol for symbol in codebase.rust_symbols} + import_by_id = {import_record.id: import_record for import_record in codebase.rust_imports} + + file_rows = make_file_rows(codebase) + symbol_rows = make_symbol_rows(codebase, file_by_id) + import_rows = make_import_rows(codebase, file_by_id) + import_resolution_rows = make_import_resolution_rows(codebase, file_by_id, symbol_by_id, import_by_id) + reference_rows = make_reference_rows(codebase, file_by_id, symbol_by_id, import_by_id) + dependency_rows = make_dependency_rows(codebase, file_by_id, symbol_by_id) + integrity = validate_integrity(codebase) + assert_integrity(integrity) + + summary = codebase.rust_index_summary + snapshot = { + "schema_version": SNAPSHOT_SCHEMA_VERSION, + "metadata": { + "name": args.name, + "repo_url": args.repo_url, + "ref": args.ref, + "commit": actual_commit, + }, + "summary": { + "files": summary.files, + "symbols": summary.symbols, + "classes": summary.classes, + "functions": summary.functions, + "global_variables": summary.global_variables, + "imports": summary.imports, + "import_resolutions": summary.import_resolutions, + "references": summary.references, + "dependencies": summary.dependencies, + "bytes": summary.bytes, + "lines": summary.lines, + "files_with_errors": summary.files_with_errors, + }, + "graphs": { + "files": compact_record_set(file_rows, sample_size=args.sample_size), + "symbols": compact_record_set(symbol_rows, sample_size=args.sample_size), + "imports": compact_record_set(import_rows, sample_size=args.sample_size), + "import_resolutions": compact_record_set(import_resolution_rows, sample_size=args.sample_size), + "references": compact_record_set(reference_rows, sample_size=args.sample_size), + "dependencies": compact_record_set(dependency_rows, sample_size=args.sample_size), + }, + 
"integrity": integrity, + } + observation = { + "checkout": str(repo), + "extension_path": str(extension_path) if extension_path else None, + "wall_seconds": round(wall, 6), + "max_rss_mb": round(bytes_to_mb(max_rss_bytes()), 3), + } + return snapshot, observation + + +def compare_snapshot(actual: dict[str, Any], expected_path: Path) -> None: + expected = json.loads(expected_path.read_text(encoding="utf-8")) + if actual == expected: + return + + mismatches = [] + for key in ("metadata", "summary", "integrity"): + if actual.get(key) != expected.get(key): + mismatches.append(key) + for graph_name, graph in actual.get("graphs", {}).items(): + expected_graph = expected.get("graphs", {}).get(graph_name) + if graph != expected_graph: + mismatches.append(f"graphs.{graph_name}") + msg = f"snapshot mismatch against {expected_path}: {', '.join(mismatches)}" + raise AssertionError(msg) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Create or verify a deterministic compact Rust graph snapshot for a pinned Python repository.") + parser.add_argument("--name", default=DEFAULT_REPO_NAME, help="Stable name for the pinned repository checkout.") + parser.add_argument("--repo-url", default=DEFAULT_REPO_URL, help="Git repository URL.") + parser.add_argument("--ref", default=DEFAULT_REF, help="Remote ref or commit to fetch.") + parser.add_argument("--expected-commit", default=DEFAULT_EXPECTED_COMMIT, help="Expected resolved commit SHA. 
Pass an empty string to disable.") + parser.add_argument("--cache-dir", type=Path, default=DEFAULT_CACHE_DIR, help="Directory for reusable pinned checkouts.") + parser.add_argument("--extension-dir", type=Path, default=DEFAULT_EXTENSION_DIR, help="Directory for the built PyO3 extension module.") + parser.add_argument("--expected", type=Path, default=DEFAULT_EXPECTED_SNAPSHOT, help="Expected compact snapshot JSON path.") + parser.add_argument("--output", type=Path, help="Optional path to write the observed snapshot JSON.") + parser.add_argument("--update", action="store_true", help="Write the observed snapshot to --expected instead of comparing.") + parser.add_argument("--reset-checkout", action="store_true", help="Delete and recreate the cached checkout before running.") + parser.add_argument("--skip-fetch", action="store_true", help="Do not fetch before checkout; useful for offline reruns with FETCH_HEAD present.") + parser.add_argument("--skip-build-extension", action="store_true", help="Reuse an existing graph_sitter_py extension in --extension-dir.") + parser.add_argument("--sample-size", type=int, default=20, help="Number of sorted sample rows stored for each graph family.") + parser.add_argument("--timeout", type=int, default=900, help="Timeout in seconds for clone/build child commands.") + parser.add_argument("--json", action="store_true", help="Print the snapshot JSON instead of a human summary.") + return parser.parse_args() + + +def print_human(snapshot: dict[str, Any], observation: dict[str, Any], expected: Path) -> None: + summary = snapshot["summary"] + print(f"repo: {snapshot['metadata']['name']} {snapshot['metadata']['commit']}") + print(f"expected: {expected}") + print(f"checkout: {observation['checkout']}") + print(f"rust snapshot: wall={observation['wall_seconds']:.3f}s max_rss={observation['max_rss_mb']:.1f} MB") + print( + "summary: " + f"files={summary['files']} symbols={summary['symbols']} imports={summary['imports']} " + 
f"import_resolutions={summary['import_resolutions']} references={summary['references']} dependencies={summary['dependencies']}" + ) + print( + "hashes: " + f"files={snapshot['graphs']['files']['sha256']} " + f"imports={snapshot['graphs']['imports']['sha256']} " + f"references={snapshot['graphs']['references']['sha256']} " + f"dependencies={snapshot['graphs']['dependencies']['sha256']}" + ) + + +def main() -> int: + args = parse_args() + if args.expected_commit == "": + args.expected_commit = None + snapshot, observation = make_snapshot(args) + + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(json.dumps(snapshot, indent=2, sort_keys=True) + "\n", encoding="utf-8") + if args.update: + args.expected.parent.mkdir(parents=True, exist_ok=True) + args.expected.write_text(json.dumps(snapshot, indent=2, sort_keys=True) + "\n", encoding="utf-8") + else: + compare_snapshot(snapshot, args.expected) + + if args.json: + print(json.dumps({"observation": observation, "snapshot": snapshot}, indent=2, sort_keys=True)) + else: + print_human(snapshot, observation, args.expected) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/integration/rust_rewrite/test_pinned_airflow_snapshot.py b/tests/integration/rust_rewrite/test_pinned_airflow_snapshot.py new file mode 100644 index 000000000..aaa161e84 --- /dev/null +++ b/tests/integration/rust_rewrite/test_pinned_airflow_snapshot.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +import os +import shlex +import subprocess +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[3] + + +@pytest.mark.skipif( + os.environ.get("GRAPH_SITTER_RUN_PINNED_AIRFLOW_SNAPSHOT") != "1", + reason="set GRAPH_SITTER_RUN_PINNED_AIRFLOW_SNAPSHOT=1 to run the pinned Airflow Rust compact snapshot check", +) +def test_pinned_airflow_rust_compact_snapshot() -> None: + extra_args = 
shlex.split(os.environ.get("GRAPH_SITTER_PINNED_AIRFLOW_SNAPSHOT_ARGS", "")) + command = [ + sys.executable, + str(REPO_ROOT / "rust-rewrite/tools/snapshot_pinned_python_repo.py"), + *extra_args, + ] + subprocess.run(command, cwd=REPO_ROOT, check=True) From 5f7ca1af114e87683701855b73c88a117e99fd0a Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:34:27 -0700 Subject: [PATCH 018/228] Attribute Rust references to nested Python symbols --- crates/graph-sitter-engine/src/lib.rs | 336 ++++++++++++++---- rust-rewrite/benchmarks.md | 13 +- .../apache-airflow-2.10.5-rust-compact.json | 218 +++++++----- rust-rewrite/python-compat.md | 4 +- rust-rewrite/strategy.md | 4 +- .../tools/snapshot_pinned_python_repo.py | 3 + src/graph_sitter/codebase/rust_backend.py | 6 +- src/graph_sitter/core/codebase.py | 2 +- tests/unit/sdk/codebase/test_rust_backend.py | 4 + 9 files changed, 414 insertions(+), 176 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 55e5e5fa2..bf5ecb171 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -209,6 +209,8 @@ pub enum SymbolKind { pub struct SymbolRecord { pub id: u32, pub file_id: u32, + pub parent_symbol_id: Option, + pub is_top_level: bool, pub name: String, pub kind: SymbolKind, pub range: SourceRange, @@ -441,10 +443,25 @@ fn extract_python_file( reference_candidates: &mut Vec, ) { let root = tree.root_node(); + let mut excluded_name_ranges = Vec::new(); let mut cursor = root.walk(); for child in root.named_children(&mut cursor) { - extract_top_level_node(file_id, source, child, index, reference_candidates); + extract_top_level_node(file_id, source, child, index, &mut excluded_name_ranges); } + let symbol_ranges = index + .symbols + .iter() + .filter(|symbol| symbol.file_id == file_id) + .map(|symbol| (symbol.id, symbol.range)) + .collect::>(); + collect_identifier_candidates( + file_id, + source, + root, + &symbol_ranges, + 
&excluded_name_ranges, + reference_candidates, + ); } fn extract_top_level_node( @@ -452,30 +469,32 @@ fn extract_top_level_node( source: &str, node: Node<'_>, index: &mut PythonIndex, - reference_candidates: &mut Vec, + excluded_name_ranges: &mut Vec, ) { match node.kind() { "class_definition" => { - push_symbol(file_id, source, node, SymbolKind::Class, index).inspect(|symbol_id| { - collect_symbol_reference_candidates( - file_id, - *symbol_id, - source, - node, - reference_candidates, - ); - }); + extract_symbol_tree( + file_id, + source, + node, + node.range(), + SymbolKind::Class, + None, + index, + excluded_name_ranges, + ); } "function_definition" => { - push_symbol(file_id, source, node, SymbolKind::Function, index).inspect(|symbol_id| { - collect_symbol_reference_candidates( - file_id, - *symbol_id, - source, - node, - reference_candidates, - ); - }); + extract_symbol_tree( + file_id, + source, + node, + node.range(), + SymbolKind::Function, + None, + index, + excluded_name_ranges, + ); } "decorated_definition" => { if let Some(definition) = @@ -486,16 +505,16 @@ fn extract_top_level_node( } else { SymbolKind::Function }; - push_symbol_with_range(file_id, source, definition, node.range(), kind, index) - .inspect(|symbol_id| { - collect_symbol_reference_candidates( - file_id, - *symbol_id, - source, - node, - reference_candidates, - ); - }); + extract_symbol_tree( + file_id, + source, + definition, + node.range(), + kind, + None, + index, + excluded_name_ranges, + ); } } "import_statement" => push_import_statement(file_id, source, node, index), @@ -503,27 +522,118 @@ fn extract_top_level_node( push_from_import_statement(file_id, source, node, index) } "assignment" | "annotated_assignment" => { - push_global_assignment(file_id, source, node, index) + push_global_assignment(file_id, source, node, index, excluded_name_ranges) } "expression_statement" => { if let Some(assignment) = first_child_of_kind(node, &["assignment", "annotated_assignment"]) { - 
push_global_assignment(file_id, source, assignment, index); + push_global_assignment(file_id, source, assignment, index, excluded_name_ranges); } } _ => {} } } -fn push_symbol( +fn extract_symbol_tree( file_id: u32, source: &str, - node: Node<'_>, + definition: Node<'_>, + declaration_range: Range, kind: SymbolKind, + parent_symbol_id: Option, index: &mut PythonIndex, + excluded_name_ranges: &mut Vec, ) -> Option { - push_symbol_with_range(file_id, source, node, node.range(), kind, index) + let symbol_id = push_symbol_with_range( + file_id, + source, + definition, + declaration_range, + kind, + parent_symbol_id, + index, + )?; + if let Some(name_node) = definition.child_by_field_name("name") { + excluded_name_ranges.push(name_node.range().into()); + } + extract_nested_symbols( + file_id, + source, + definition, + Some(symbol_id), + index, + excluded_name_ranges, + ); + Some(symbol_id) +} + +fn extract_nested_symbols( + file_id: u32, + source: &str, + node: Node<'_>, + parent_symbol_id: Option, + index: &mut PythonIndex, + excluded_name_ranges: &mut Vec, +) { + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + match child.kind() { + "class_definition" => { + extract_symbol_tree( + file_id, + source, + child, + child.range(), + SymbolKind::Class, + parent_symbol_id, + index, + excluded_name_ranges, + ); + } + "function_definition" => { + extract_symbol_tree( + file_id, + source, + child, + child.range(), + SymbolKind::Function, + parent_symbol_id, + index, + excluded_name_ranges, + ); + } + "decorated_definition" => { + if let Some(definition) = + first_child_of_kind(child, &["class_definition", "function_definition"]) + { + let kind = if definition.kind() == "class_definition" { + SymbolKind::Class + } else { + SymbolKind::Function + }; + extract_symbol_tree( + file_id, + source, + definition, + child.range(), + kind, + parent_symbol_id, + index, + excluded_name_ranges, + ); + } + } + _ => extract_nested_symbols( + file_id, + source, 
+ child, + parent_symbol_id, + index, + excluded_name_ranges, + ), + } + } } fn push_symbol_with_range( @@ -532,6 +642,7 @@ fn push_symbol_with_range( node: Node<'_>, declaration_range: Range, kind: SymbolKind, + parent_symbol_id: Option, index: &mut PythonIndex, ) -> Option { let Some(name_node) = node.child_by_field_name("name") else { @@ -544,6 +655,8 @@ fn push_symbol_with_range( index.symbols.push(SymbolRecord { id: symbol_id, file_id, + parent_symbol_id, + is_top_level: parent_symbol_id.is_none(), name: name.to_owned(), kind, range: declaration_range.into(), @@ -552,7 +665,13 @@ fn push_symbol_with_range( Some(symbol_id) } -fn push_global_assignment(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { +fn push_global_assignment( + file_id: u32, + source: &str, + node: Node<'_>, + index: &mut PythonIndex, + excluded_name_ranges: &mut Vec, +) { let Some(left) = node.child_by_field_name("left") else { return; }; @@ -565,11 +684,14 @@ fn push_global_assignment(file_id: u32, source: &str, node: Node<'_>, index: &mu index.symbols.push(SymbolRecord { id: index.symbols.len() as u32, file_id, + parent_symbol_id: None, + is_top_level: true, name: name.to_owned(), kind: SymbolKind::GlobalVariable, range: node.range().into(), name_range: target.range().into(), }); + excluded_name_ranges.push(target.range().into()); } } @@ -586,62 +708,58 @@ fn collect_assignment_targets<'tree>(node: Node<'tree>, out: &mut Vec, - out: &mut Vec, -) { - let excluded_name_range = symbol_node - .child_by_field_name("name") - .map(|name_node| name_node.range()); - collect_identifier_candidates( - file_id, - Some(source_symbol_id), - source, - symbol_node, - excluded_name_range, - out, - ); -} - fn collect_identifier_candidates( file_id: u32, - source_symbol_id: Option, source: &str, node: Node<'_>, - excluded_range: Option, + symbol_ranges: &[(u32, SourceRange)], + excluded_ranges: &[SourceRange], out: &mut Vec, ) { - if node.kind() == "identifier" && 
!range_matches(node.range(), excluded_range) { + if matches!( + node.kind(), + "import_statement" | "import_from_statement" | "future_import_statement" + ) { + return; + } + + let range = node.range().into(); + if node.kind() == "identifier" && !range_matches_any(range, excluded_ranges) { if let Ok(name) = node.utf8_text(source.as_bytes()) { out.push(ReferenceCandidate { source_file_id: file_id, - source_symbol_id, + source_symbol_id: innermost_symbol_for_range(symbol_ranges, range), name: name.to_owned(), - range: node.range().into(), + range, }); } } let mut cursor = node.walk(); for child in node.named_children(&mut cursor) { - collect_identifier_candidates( - file_id, - source_symbol_id, - source, - child, - excluded_range, - out, - ); + collect_identifier_candidates(file_id, source, child, symbol_ranges, excluded_ranges, out); } } -fn range_matches(range: Range, other: Option) -> bool { - other - .map(|other| range.start_byte == other.start_byte && range.end_byte == other.end_byte) - .unwrap_or(false) +fn innermost_symbol_for_range( + symbol_ranges: &[(u32, SourceRange)], + range: SourceRange, +) -> Option { + symbol_ranges + .iter() + .filter(|(_, symbol_range)| contains_range(*symbol_range, range)) + .min_by_key(|(_, symbol_range)| symbol_range.end_byte - symbol_range.start_byte) + .map(|(symbol_id, _)| *symbol_id) +} + +fn contains_range(container: SourceRange, range: SourceRange) -> bool { + container.start_byte <= range.start_byte && range.end_byte <= container.end_byte +} + +fn range_matches_any(range: SourceRange, others: &[SourceRange]) -> bool { + others + .iter() + .any(|other| range.start_byte == other.start_byte && range.end_byte == other.end_byte) } fn push_import_statement(file_id: u32, source: &str, node: Node<'_>, index: &mut PythonIndex) { @@ -741,6 +859,7 @@ fn resolve_python_imports(index: &mut PythonIndex) { let symbol_to_id: HashMap<(u32, &str), u32> = index .symbols .iter() + .filter(|symbol| symbol.is_top_level) .map(|symbol| 
((symbol.file_id, symbol.name.as_str()), symbol.id)) .collect(); @@ -780,6 +899,7 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec = index .symbols .iter() + .filter(|symbol| symbol.is_top_level) .map(|symbol| ((symbol.file_id, symbol.name.as_str()), symbol.id)) .collect(); let resolution_by_import_id: HashMap = index @@ -1051,7 +1171,11 @@ mod tests { assert_eq!(index.summary().references, 0); assert_eq!(index.summary().dependencies, 0); assert_eq!(index.symbols[0].name, "Service"); + assert_eq!(index.symbols[0].parent_symbol_id, None); + assert!(index.symbols[0].is_top_level); assert_eq!(index.symbols[1].name, "helper"); + assert_eq!(index.symbols[1].parent_symbol_id, None); + assert!(index.symbols[1].is_top_level); assert!(index .imports .iter() @@ -1152,6 +1276,64 @@ mod tests { })); } + #[test] + fn attributes_references_to_innermost_python_symbol() { + let repo = temp_repo_path("nested-python-reference-sources"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/__init__.py"), "").unwrap(); + fs::write( + repo.join("pkg/base.py"), + "class Base:\n pass\n\ndef helper():\n return Base\n", + ) + .unwrap(); + fs::write( + repo.join("pkg/service.py"), + "from .base import Base, helper\n\nclass Service(Base):\n def run(self):\n return helper()\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let service = index + .symbols + .iter() + .find(|symbol| symbol.name == "Service") + .unwrap(); + let run = index + .symbols + .iter() + .find(|symbol| symbol.name == "run") + .unwrap(); + let helper = index + .symbols + .iter() + .find(|symbol| symbol.name == "helper") + .unwrap(); + let base = index + .symbols + .iter() + .find(|symbol| symbol.name == "Base") + .unwrap(); + + assert!(service.is_top_level); + assert!(!run.is_top_level); + assert_eq!(run.parent_symbol_id, Some(service.id)); + assert!(index.references.iter().any(|reference| { + reference.name == 
"Base" + && reference.source_symbol_id == Some(service.id) + && reference.target_symbol_id == base.id + })); + assert!(index.references.iter().any(|reference| { + reference.name == "helper" + && reference.source_symbol_id == Some(run.id) + && reference.target_symbol_id == helper.id + })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == run.id && dependency.target_symbol_id == helper.id + })); + } + #[test] fn compact_python_graph_snapshot_is_stable() { let repo = temp_repo_path("compact-python-graph-snapshot"); @@ -1189,6 +1371,8 @@ mod tests { serde_json::json!({ "id": symbol.id, "file_id": symbol.file_id, + "parent_symbol_id": symbol.parent_symbol_id, + "is_top_level": symbol.is_top_level, "name": symbol.name, "kind": symbol.kind, }) @@ -1254,9 +1438,9 @@ mod tests { {"id": 2, "path": "pkg/service.py", "module_name": "pkg.service"} ], "symbols": [ - {"id": 0, "file_id": 1, "name": "CONSTANT", "kind": "global_variable"}, - {"id": 1, "file_id": 1, "name": "Base", "kind": "class"}, - {"id": 2, "file_id": 2, "name": "Service", "kind": "class"} + {"id": 0, "file_id": 1, "parent_symbol_id": null, "is_top_level": true, "name": "CONSTANT", "kind": "global_variable"}, + {"id": 1, "file_id": 1, "parent_symbol_id": null, "is_top_level": true, "name": "Base", "kind": "class"}, + {"id": 2, "file_id": 2, "parent_symbol_id": null, "is_top_level": true, "name": "Service", "kind": "class"} ], "imports": [ {"id": 0, "file_id": 2, "kind": "from_import", "module": ".base", "name": "Base", "alias": null}, diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 277e5b328..ed6943fa8 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import 
resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.818s | 539.5 MB | 0.617s | 132.1 MB | 1130 | 1130 | 3956 | 6460 | 432 | 3669 | 2020 | yes | 4.568x | 4.085x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.863s | 3471.4 MB | 3.194s | 351.3 MB | 4789 | 4789 | 23663 | 40580 | 19011 | 95292 | 35489 | yes | 6.218x | 9.882x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.768s | 543.5 MB | 0.666s | 133.1 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4586 | 3093 | yes | 4.154x | 4.083x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.441s | 3470.4 MB | 3.473s | 368.5 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 112238 | 71348 | yes | 5.309x | 9.418x | ## Pinned Compact Snapshot Evidence @@ -205,17 +205,18 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Graph family | Count | SHA-256 | | --- | ---: | --- | | Files | 4789 | `226e8cb32dc0a23ec956e97b036e7c505037df979cce7182514f39a43b07cb80` | -| Symbols | 23663 | `02fd17a7c0ba4f8fa0f29dcdfc642bffcc8116c20b86f5519f15fd0447d08781` | +| Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 95292 | `677270b43e9578c64f08d85f8635d5bf4bea027ad513649e8767d1147633af5c` | -| Dependencies | 35489 | `0e18b4147f49a3bc58ae8bab3972535b0df3cbede968c7e14324e2b23fb31f70` | +| References | 112238 | `cd0796bb493a329acab25f666d956539e714aceb39c70e1d9c0376f950f6ed98` | +| Dependencies | 71348 | `c04d010154c2b501ba4cd497d4dcaf296155962e3518c3cfba8b22e929f508e1` | 
The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. Important caveats: -- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, imports, internal import-resolution records, first-slice top-level Python symbol reference records, and de-duplicated dependency records for indexed Python modules. +- The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. +- Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. 
diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index 80ff0bbaa..7f1e15732 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 35489, + "count": 71348, "samples": [ { "reference_count": 1, @@ -97,54 +97,54 @@ { "reference_count": 1, "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:decorated@5454", "target_file": "airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_forbidden@3931" }, { - "reference_count": 4, + "reference_count": 1, "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:decorated@5454", "target_file": "airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989" }, { "reference_count": 1, "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_file": "airflow/configuration.py", - "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:decorated@5454", + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": 
"airflow/api/auth/backend/kerberos_auth.py:function:_unauthorized@3781" }, { "reference_count": 1, "source_file": "airflow/api/auth/backend/kerberos_auth.py", "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_file": "airflow/utils/net.py", - "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" + "target_file": "airflow/api/auth/backend/kerberos_auth.py", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" }, { - "reference_count": 1, + "reference_count": 4, "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:requires_authentication@4936", + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", "target_file": "airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_forbidden@3931" + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" }, { "reference_count": 1, "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:requires_authentication@4936", - "target_file": "airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989" + "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_file": "airflow/configuration.py", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" }, { "reference_count": 1, "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:requires_authentication@4936", - "target_file": "airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_unauthorized@3781" + "source_symbol": 
"airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", + "target_file": "airflow/utils/net.py", + "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" } ], - "sha256": "0e18b4147f49a3bc58ae8bab3972535b0df3cbede968c7e14324e2b23fb31f70" + "sha256": "c04d010154c2b501ba4cd497d4dcaf296155962e3518c3cfba8b22e929f508e1" }, "files": { "count": 4789, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 95292, + "count": 112238, "samples": [ { "import": null, @@ -722,6 +722,17 @@ "source_symbol": "airflow/__main__.py:function:configure_internal_api@2850", "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" }, + { + "import": null, + "name": "main", + "range": [ + 4266, + 4270 + ], + "source_file": "airflow/__main__.py", + "source_symbol": null, + "target_symbol": "airflow/__main__.py:function:main@1814" + }, { "import": "airflow/api/__init__.py:from_import:airflow.configuration:conf:@906", "name": "conf", @@ -777,6 +788,17 @@ "source_symbol": "airflow/api/__init__.py:function:load_auth@1058", "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" }, + { + "import": "airflow/api/auth/backend/basic_auth.py:from_import:airflow.exceptions:RemovedInAirflow3Warning:@1101", + "name": "RemovedInAirflow3Warning", + "range": [ + 1428, + 1452 + ], + "source_file": "airflow/api/auth/backend/basic_auth.py", + "source_symbol": null, + "target_symbol": "airflow/exceptions.py:class:RemovedInAirflow3Warning@16092" + }, { "import": null, "name": "T", @@ -821,6 +843,17 @@ "source_symbol": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", "target_symbol": "airflow/api/auth/backend/deny_all.py:global_variable:T@1091" }, + { + "import": null, + "name": "KerberosService", + "range": [ + 3060, + 3075 + ], + "source_file": "airflow/api/auth/backend/kerberos_auth.py", + "source_symbol": 
"airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040", + "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:class:KerberosService@2728" + }, { "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.utils.net:getfqdn:@2564", "name": "getfqdn", @@ -853,48 +886,16 @@ "source_file": "airflow/api/auth/backend/kerberos_auth.py", "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" - }, - { - "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.configuration:conf:@2525", - "name": "conf", - "range": [ - 3454, - 3458 - ], - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_symbol": "airflow/configuration.py:global_variable:conf@102463" - }, - { - "import": null, - "name": "log", - "range": [ - 3503, - 3506 - ], - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" - }, - { - "import": null, - "name": "log", - "range": [ - 3674, - 3677 - ], - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" } ], - "sha256": "677270b43e9578c64f08d85f8635d5bf4bea027ad513649e8767d1147633af5c" + "sha256": "cd0796bb493a329acab25f666d956539e714aceb39c70e1d9c0376f950f6ed98" }, "symbols": { - "count": 23663, + "count": 52339, "samples": [ { "file": "airflow/__init__.py", + "is_top_level": true, "key": "airflow/__init__.py:function:__getattr__@4048", "kind": "function", "name": "__getattr__", @@ -902,6 +903,7 @@ 4048, 4059 ], + 
"parent_symbol": null, "range": [ 4044, 5292 @@ -909,6 +911,7 @@ }, { "file": "airflow/__init__.py", + "is_top_level": true, "key": "airflow/__init__.py:global_variable:__all__@2505", "kind": "global_variable", "name": "__all__", @@ -916,6 +919,7 @@ 2505, 2512 ], + "parent_symbol": null, "range": [ 2505, 2578 @@ -923,6 +927,7 @@ }, { "file": "airflow/__init__.py", + "is_top_level": true, "key": "airflow/__init__.py:global_variable:__lazy_imports@3362", "kind": "global_variable", "name": "__lazy_imports", @@ -930,6 +935,7 @@ 3362, 3376 ], + "parent_symbol": null, "range": [ 3362, 3698 @@ -937,6 +943,7 @@ }, { "file": "airflow/__init__.py", + "is_top_level": true, "key": "airflow/__init__.py:global_variable:__path__@2728", "kind": "global_variable", "name": "__path__", @@ -944,6 +951,7 @@ 2728, 2736 ], + "parent_symbol": null, "range": [ 2728, 2792 @@ -951,6 +959,7 @@ }, { "file": "airflow/__init__.py", + "is_top_level": true, "key": "airflow/__init__.py:global_variable:__version__@823", "kind": "global_variable", "name": "__version__", @@ -958,6 +967,7 @@ 823, 834 ], + "parent_symbol": null, "range": [ 823, 845 @@ -965,6 +975,7 @@ }, { "file": "airflow/__main__.py", + "is_top_level": true, "key": "airflow/__main__.py:function:configure_internal_api@2850", "kind": "function", "name": "configure_internal_api", @@ -972,6 +983,7 @@ 2850, 2872 ], + "parent_symbol": null, "range": [ 2846, 4232 @@ -979,6 +991,7 @@ }, { "file": "airflow/__main__.py", + "is_top_level": true, "key": "airflow/__main__.py:function:main@1814", "kind": "function", "name": "main", @@ -986,6 +999,7 @@ 1814, 1818 ], + "parent_symbol": null, "range": [ 1810, 2843 @@ -993,6 +1007,7 @@ }, { "file": "airflow/api/__init__.py", + "is_top_level": true, "key": "airflow/api/__init__.py:function:load_auth@1058", "kind": "function", "name": "load_auth", @@ -1000,6 +1015,7 @@ 1058, 1067 ], + "parent_symbol": null, "range": [ 1054, 1677 @@ -1007,6 +1023,7 @@ }, { "file": "airflow/api/__init__.py", + 
"is_top_level": true, "key": "airflow/api/__init__.py:global_variable:log@1018", "kind": "global_variable", "name": "log", @@ -1014,6 +1031,7 @@ 1018, 1021 ], + "parent_symbol": null, "range": [ 1018, 1051 @@ -1021,6 +1039,7 @@ }, { "file": "airflow/api/auth/backend/basic_auth.py", + "is_top_level": true, "key": "airflow/api/auth/backend/basic_auth.py:function:auth_current_user@1530", "kind": "function", "name": "auth_current_user", @@ -1028,6 +1047,7 @@ 1530, 1547 ], + "parent_symbol": null, "range": [ 1526, 1611 @@ -1035,6 +1055,7 @@ }, { "file": "airflow/api/auth/backend/basic_auth.py", + "is_top_level": true, "key": "airflow/api/auth/backend/basic_auth.py:function:init_app@1480", "kind": "function", "name": "init_app", @@ -1042,6 +1063,7 @@ 1480, 1488 ], + "parent_symbol": null, "range": [ 1476, 1523 @@ -1049,6 +1071,7 @@ }, { "file": "airflow/api/auth/backend/basic_auth.py", + "is_top_level": true, "key": "airflow/api/auth/backend/basic_auth.py:function:requires_authentication@1618", "kind": "function", "name": "requires_authentication", @@ -1056,6 +1079,7 @@ 1618, 1641 ], + "parent_symbol": null, "range": [ 1614, 1722 @@ -1063,6 +1087,7 @@ }, { "file": "airflow/api/auth/backend/basic_auth.py", + "is_top_level": true, "key": "airflow/api/auth/backend/basic_auth.py:global_variable:CLIENT_AUTH@1240", "kind": "global_variable", "name": "CLIENT_AUTH", @@ -1070,6 +1095,7 @@ 1240, 1251 ], + "parent_symbol": null, "range": [ 1240, 1288 @@ -1077,6 +1103,23 @@ }, { "file": "airflow/api/auth/backend/default.py", + "is_top_level": false, + "key": "airflow/api/auth/backend/default.py:function:decorated@1243", + "kind": "function", + "name": "decorated", + "name_range": [ + 1243, + 1252 + ], + "parent_symbol": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", + "range": [ + 1218, + 1311 + ] + }, + { + "file": "airflow/api/auth/backend/default.py", + "is_top_level": true, "key": "airflow/api/auth/backend/default.py:function:init_app@1018", "kind": 
"function", "name": "init_app", @@ -1084,6 +1127,7 @@ 1018, 1026 ], + "parent_symbol": null, "range": [ 1014, 1075 @@ -1091,6 +1135,7 @@ }, { "file": "airflow/api/auth/backend/default.py", + "is_top_level": true, "key": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", "kind": "function", "name": "requires_authentication", @@ -1098,6 +1143,7 @@ 1117, 1140 ], + "parent_symbol": null, "range": [ 1113, 1342 @@ -1105,6 +1151,7 @@ }, { "file": "airflow/api/auth/backend/default.py", + "is_top_level": true, "key": "airflow/api/auth/backend/default.py:global_variable:CLIENT_AUTH@963", "kind": "global_variable", "name": "CLIENT_AUTH", @@ -1112,6 +1159,7 @@ 963, 974 ], + "parent_symbol": null, "range": [ 963, 1011 @@ -1119,6 +1167,7 @@ }, { "file": "airflow/api/auth/backend/default.py", + "is_top_level": true, "key": "airflow/api/auth/backend/default.py:global_variable:T@1078", "kind": "global_variable", "name": "T", @@ -1126,6 +1175,7 @@ 1078, 1079 ], + "parent_symbol": null, "range": [ 1078, 1110 @@ -1133,48 +1183,38 @@ }, { "file": "airflow/api/auth/backend/deny_all.py", - "key": "airflow/api/auth/backend/deny_all.py:function:init_app@1039", + "is_top_level": false, + "key": "airflow/api/auth/backend/deny_all.py:function:decorated@1256", "kind": "function", - "name": "init_app", + "name": "decorated", "name_range": [ - 1039, - 1047 + 1256, + 1265 ], + "parent_symbol": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", "range": [ - 1035, - 1088 + 1231, + 1325 ] }, { "file": "airflow/api/auth/backend/deny_all.py", - "key": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", + "is_top_level": true, + "key": "airflow/api/auth/backend/deny_all.py:function:init_app@1039", "kind": "function", - "name": "requires_authentication", - "name_range": [ - 1130, - 1153 - ], - "range": [ - 1126, - 1356 - ] - }, - { - "file": "airflow/api/auth/backend/deny_all.py", - "key": 
"airflow/api/auth/backend/deny_all.py:global_variable:CLIENT_AUTH@984", - "kind": "global_variable", - "name": "CLIENT_AUTH", + "name": "init_app", "name_range": [ - 984, - 995 + 1039, + 1047 ], + "parent_symbol": null, "range": [ - 984, - 1032 + 1035, + 1088 ] } ], - "sha256": "02fd17a7c0ba4f8fa0f29dcdfc642bffcc8116c20b86f5519f15fd0447d08781" + "sha256": "d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699" } }, "integrity": { @@ -1193,16 +1233,16 @@ "schema_version": 1, "summary": { "bytes": 36617627, - "classes": 5379, - "dependencies": 35489, + "classes": 5665, + "dependencies": 71348, "files": 4789, "files_with_errors": 0, - "functions": 6145, + "functions": 34535, "global_variables": 12139, "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 95292, - "symbols": 23663 + "references": 112238, + "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index ebcaa2e17..e8c5ee6a4 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,13 +125,13 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside top-level Python classes/functions. Full lexical scoping, nested references, attributes, and module references remain future work. 
+- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Full lexical scoping, attributes, and module references remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. - `Codebase.rust_index_summary`, `.rust_files`, `.rust_symbols`, `.rust_classes`, `.rust_functions`, `.rust_global_vars`, `.rust_imports`, `.rust_import_resolutions`, `.rust_references`, and `.rust_dependencies` expose the attached compact records for shell smoke checks and golden tests. - `Codebase.files`, `.symbols`, `.classes`, `.functions`, `.global_vars`, `.imports`, `get_file(...)`, `get_symbol(...)`, `get_class(...)`, and `get_function(...)` now return lightweight compact handles in strict Rust mode for Python codebases. -- Compact file handles expose basic identity/content plus file-local `symbols`, `classes`, `functions`, `global_vars`, and `imports`. Compact symbol and import handles expose basic identity/source and implemented import-resolution targets. Edit-heavy and dependency/reference graph methods are still unsupported until the full lazy engine facade exists. 
+- Compact file handles expose basic identity/content plus file-local top-level `symbols`, `classes`, `functions`, `global_vars`, and `imports`; `file.symbols(nested=True)` exposes nested compact records. Compact symbol and import handles expose basic identity/source and implemented import-resolution targets. Edit-heavy and dependency/reference graph methods are still unsupported until the full lazy engine facade exists. - This surface is a bridge for the compact-index vertical slice. It is not yet the final lazy `CodebaseContext` backend facade and it does not yet provide full P0 `SourceFile`, `Symbol`, or `Import` parity. Rust can keep typed IDs internally. Python needs a compatibility `node_id: int`, so `RustGraphBackend` should maintain a per-context mapping between Python node IDs and typed Rust refs: diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 23a129bdb..cc59c8798 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -185,7 +185,8 @@ Recommended task format: - [ ] Implement export-to-symbol/import/file edges. - [ ] Implement lexical scope tables for name resolution. - [x] Implement first compact Python symbol reference extraction by identifier ranges. owner: codex. Result: records same-file and imported top-level symbol references inside top-level Python classes/functions. -- [ ] Expand symbol usage extraction to nested scopes, attributes, module references, and full lexical shadowing behavior. +- [x] Attribute compact Python references to nested class/function source symbols. owner: codex. Result: nested Python functions and methods are indexed as non-top-level compact symbols, while public `Codebase.functions` remains top-level-only. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, and local assignment/parameter scope exclusion. - [x] Implement first compact dependency edge construction from usage records. owner: codex. 
Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -262,3 +263,4 @@ Recommended task format: - [x] 2026-06-18: Added compact Python `DependencyRecord` construction from references. owner: codex. Notes: current checkout emits 2,020 de-duplicated dependency edges and remains 4.6x faster with 4.1x lower process max RSS than Python parse/object materialization. - [x] 2026-06-18: Added first pinned large-repo benchmark runner and Airflow baseline. owner: codex. Notes: Apache Airflow `2.10.5` at `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf` matched 4,789 Python files and measured 6.218x faster wall time with 9.882x lower max RSS for the current compact Rust `Codebase` slice. - [x] 2026-06-18: Added first pinned Airflow compact graph golden. owner: codex. Notes: committed stable hashes/samples for 4,789 files, 23,663 symbols, 40,580 imports, 19,011 import resolutions, 95,292 references, and 35,489 dependencies; the opt-in pytest wrapper can verify it against the pinned checkout. +- [x] 2026-06-18: Added nested Python function/method compact symbols and innermost reference source attribution. owner: codex. Notes: Airflow compact coverage now emits 52,339 symbols, 112,238 references, and 71,348 dependencies while staying 5.309x faster with 9.418x lower max RSS than Python parse/object materialization. 
diff --git a/rust-rewrite/tools/snapshot_pinned_python_repo.py b/rust-rewrite/tools/snapshot_pinned_python_repo.py index 658139452..e802da65d 100644 --- a/rust-rewrite/tools/snapshot_pinned_python_repo.py +++ b/rust-rewrite/tools/snapshot_pinned_python_repo.py @@ -92,9 +92,12 @@ def make_file_rows(codebase: Any) -> list[dict[str, Any]]: def make_symbol_rows(codebase: Any, file_by_id: dict[int, Any]) -> list[dict[str, Any]]: + symbol_by_id = {symbol.id: symbol for symbol in codebase.rust_symbols} rows = [ { "key": symbol_key(symbol, file_by_id), + "parent_symbol": None if symbol.parent_symbol_id is None else symbol_key(symbol_by_id[symbol.parent_symbol_id], file_by_id), + "is_top_level": symbol.is_top_level, "file": file_by_id[symbol.file_id].path, "kind": symbol.kind, "name": symbol.name, diff --git a/src/graph_sitter/codebase/rust_backend.py b/src/graph_sitter/codebase/rust_backend.py index 2c8d60aa5..195885086 100644 --- a/src/graph_sitter/codebase/rust_backend.py +++ b/src/graph_sitter/codebase/rust_backend.py @@ -90,6 +90,8 @@ def from_dict(cls, data: dict[str, Any]) -> RustFileRecord: class RustSymbolRecord: id: int file_id: int + parent_symbol_id: int | None + is_top_level: bool name: str kind: str range: RustSourceRange @@ -100,6 +102,8 @@ def from_dict(cls, data: dict[str, Any]) -> RustSymbolRecord: return cls( id=int(data["id"]), file_id=int(data["file_id"]), + parent_symbol_id=None if data.get("parent_symbol_id") is None else int(data["parent_symbol_id"]), + is_top_level=bool(data.get("is_top_level", True)), name=str(data["name"]), kind=str(data["kind"]), range=RustSourceRange.from_dict(data["range"]), @@ -506,7 +510,7 @@ def __init__(self, backend: RustIndexBackend, record: RustSymbolRecord) -> None: super().__init__(backend, record.id, record.range) self.name = record.name self._name_node = RustCompactName(record.name) - self.is_top_level = True + self.is_top_level = record.is_top_level def __repr__(self) -> str: return 
f"RustCompactSymbol(name={self.name!r}, filepath={self.filepath!r})" diff --git a/src/graph_sitter/core/codebase.py b/src/graph_sitter/core/codebase.py index d4c1c4d53..28c9cc83e 100644 --- a/src/graph_sitter/core/codebase.py +++ b/src/graph_sitter/core/codebase.py @@ -342,7 +342,7 @@ def _rust_compact_files(self, extensions: list[str] | Literal["*"] | None = None def _symbols(self, symbol_type: SymbolType | None = None) -> list[TSymbol | TClass | TFunction | TGlobalVar]: if self.ctx.rust_compact_mode: symbols = self._require_rust_index().symbol_handles - return [x for x in symbols if symbol_type is None or x.symbol_type == symbol_type] + return [x for x in symbols if x.is_top_level and (symbol_type is None or x.symbol_type == symbol_type)] matches: list[Symbol] = self.ctx.get_nodes(NodeType.SYMBOL) return [x for x in matches if x.is_top_level and (symbol_type is None or x.symbol_type == symbol_type)] diff --git a/tests/unit/sdk/codebase/test_rust_backend.py b/tests/unit/sdk/codebase/test_rust_backend.py index 7bf3ab600..b5ba92dba 100644 --- a/tests/unit/sdk/codebase/test_rust_backend.py +++ b/tests/unit/sdk/codebase/test_rust_backend.py @@ -61,6 +61,8 @@ def symbols_json(self): { "id": 0, "file_id": 0, + "parent_symbol_id": None, + "is_top_level": True, "name": "Service", "kind": "class", "range": { @@ -83,6 +85,8 @@ def symbols_json(self): { "id": 1, "file_id": 0, + "parent_symbol_id": None, + "is_top_level": True, "name": "helper", "kind": "function", "range": { From a190e79f550b25b7d697b59393389e050a558b39 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:39:09 -0700 Subject: [PATCH 019/228] Skip Rust references shadowed by Python locals --- crates/graph-sitter-engine/src/lib.rs | 198 +++++++++++++++++- rust-rewrite/benchmarks.md | 9 +- .../apache-airflow-2.10.5-rust-compact.json | 12 +- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +- 5 files changed, 210 insertions(+), 15 deletions(-) diff --git 
a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index bf5ecb171..7ac2fe1e8 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1,7 +1,7 @@ #![forbid(unsafe_code)] use serde::Serialize; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt; use std::fs; use std::io; @@ -454,11 +454,14 @@ fn extract_python_file( .filter(|symbol| symbol.file_id == file_id) .map(|symbol| (symbol.id, symbol.range)) .collect::>(); + let local_bindings_by_symbol_id = + collect_local_bindings(file_id, source, root, index, &symbol_ranges); collect_identifier_candidates( file_id, source, root, &symbol_ranges, + &local_bindings_by_symbol_id, &excluded_name_ranges, reference_candidates, ); @@ -708,11 +711,118 @@ fn collect_assignment_targets<'tree>(node: Node<'tree>, out: &mut Vec, + index: &PythonIndex, + symbol_ranges: &[(u32, SourceRange)], +) -> HashMap> { + let mut bindings: HashMap> = HashMap::new(); + + for symbol in index + .symbols + .iter() + .filter(|symbol| symbol.file_id == file_id) + { + if let Some(parent_symbol_id) = symbol.parent_symbol_id { + bindings + .entry(parent_symbol_id) + .or_default() + .insert(symbol.name.clone()); + } + } + + collect_local_bindings_from_node(source, root, symbol_ranges, &mut bindings); + bindings +} + +fn collect_local_bindings_from_node( + source: &str, + node: Node<'_>, + symbol_ranges: &[(u32, SourceRange)], + bindings: &mut HashMap>, +) { + match node.kind() { + "parameters" => { + if let Some(source_symbol_id) = + innermost_symbol_for_range(symbol_ranges, node.range().into()) + { + let mut targets = Vec::new(); + collect_parameter_targets(node, &mut targets); + push_local_binding_names(source, source_symbol_id, targets, bindings); + } + return; + } + "assignment" | "annotated_assignment" | "augmented_assignment" => { + if let Some(left) = node.child_by_field_name("left") { + if let Some(source_symbol_id) = + 
innermost_symbol_for_range(symbol_ranges, left.range().into()) + { + let mut targets = Vec::new(); + collect_assignment_targets(left, &mut targets); + push_local_binding_names(source, source_symbol_id, targets, bindings); + } + } + return; + } + _ => {} + } + + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_local_bindings_from_node(source, child, symbol_ranges, bindings); + } +} + +fn collect_parameter_targets<'tree>(node: Node<'tree>, out: &mut Vec>) { + match node.kind() { + "identifier" => out.push(node), + "typed_parameter" | "default_parameter" | "typed_default_parameter" => { + if let Some(name) = node.child_by_field_name("name") { + collect_parameter_targets(name, out); + } else if let Some(first_child) = first_named_child(node) { + collect_parameter_targets(first_child, out); + } + } + "list_splat_pattern" | "dictionary_splat_pattern" => { + if let Some(first_child) = first_named_child(node) { + collect_parameter_targets(first_child, out); + } + } + "parameters" => { + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_parameter_targets(child, out); + } + } + _ => {} + } +} + +fn push_local_binding_names( + source: &str, + source_symbol_id: u32, + targets: Vec>, + bindings: &mut HashMap>, +) { + for target in targets { + let Ok(name) = target.utf8_text(source.as_bytes()) else { + continue; + }; + bindings + .entry(source_symbol_id) + .or_default() + .insert(name.to_owned()); + } +} + fn collect_identifier_candidates( file_id: u32, source: &str, node: Node<'_>, symbol_ranges: &[(u32, SourceRange)], + local_bindings_by_symbol_id: &HashMap>, excluded_ranges: &[SourceRange], out: &mut Vec, ) { @@ -726,9 +836,13 @@ fn collect_identifier_candidates( let range = node.range().into(); if node.kind() == "identifier" && !range_matches_any(range, excluded_ranges) { if let Ok(name) = node.utf8_text(source.as_bytes()) { + let source_symbol_id = innermost_symbol_for_range(symbol_ranges, 
range); + if is_shadowed_local_binding(source_symbol_id, name, local_bindings_by_symbol_id) { + return; + } out.push(ReferenceCandidate { source_file_id: file_id, - source_symbol_id: innermost_symbol_for_range(symbol_ranges, range), + source_symbol_id, name: name.to_owned(), range, }); @@ -737,10 +851,28 @@ fn collect_identifier_candidates( let mut cursor = node.walk(); for child in node.named_children(&mut cursor) { - collect_identifier_candidates(file_id, source, child, symbol_ranges, excluded_ranges, out); + collect_identifier_candidates( + file_id, + source, + child, + symbol_ranges, + local_bindings_by_symbol_id, + excluded_ranges, + out, + ); } } +fn is_shadowed_local_binding( + source_symbol_id: Option, + name: &str, + local_bindings_by_symbol_id: &HashMap>, +) -> bool { + source_symbol_id + .and_then(|symbol_id| local_bindings_by_symbol_id.get(&symbol_id)) + .is_some_and(|bindings| bindings.contains(name)) +} + fn innermost_symbol_for_range( symbol_ranges: &[(u32, SourceRange)], range: SourceRange, @@ -825,6 +957,12 @@ fn first_child_of_kind<'tree>(node: Node<'tree>, kinds: &[&str]) -> Option) -> Option> { + let mut cursor = node.walk(); + let child = node.named_children(&mut cursor).next(); + child +} + fn split_alias(import: &str) -> (&str, Option<&str>) { if let Some((name, alias)) = import.split_once(" as ") { (name.trim(), Some(alias.trim())) @@ -1334,6 +1472,60 @@ mod tests { })); } + #[test] + fn skips_references_shadowed_by_python_parameters_and_locals() { + let repo = temp_repo_path("python-shadowed-reference-sources"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/__init__.py"), "").unwrap(); + fs::write( + repo.join("pkg/base.py"), + "class Base:\n pass\n\ndef helper():\n return Base\n", + ) + .unwrap(); + fs::write( + repo.join("pkg/service.py"), + "from .base import Base, helper\n\n\ +def shadowed(Base):\n helper = Base\n return helper, Base\n\n\ +def caller():\n return helper()\n", + ) + .unwrap(); + + let index = 
index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let shadowed = index + .symbols + .iter() + .find(|symbol| symbol.name == "shadowed") + .unwrap(); + let caller = index + .symbols + .iter() + .find(|symbol| symbol.name == "caller") + .unwrap(); + let helper = index + .symbols + .iter() + .find(|symbol| symbol.name == "helper") + .unwrap(); + let base = index + .symbols + .iter() + .find(|symbol| symbol.name == "Base") + .unwrap(); + + assert!(!index.references.iter().any(|reference| { + reference.source_symbol_id == Some(shadowed.id) + && (reference.target_symbol_id == base.id + || reference.target_symbol_id == helper.id) + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(caller.id) + && reference.name == "helper" + && reference.target_symbol_id == helper.id + })); + } + #[test] fn compact_python_graph_snapshot_is_stable() { let repo = temp_repo_path("compact-python-graph-snapshot"); diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index ed6943fa8..44230c875 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.768s | 543.5 MB | 0.666s | 133.1 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4586 | 3093 | yes | 4.154x | 4.083x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.441s | 3470.4 MB | 3.473s | 368.5 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 112238 | 71348 | yes | 5.309x | 
9.418x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.824s | 542.8 MB | 0.700s | 126.6 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4113 | 2953 | yes | 4.033x | 4.288x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 20.092s | 3469.7 MB | 3.981s | 260.6 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105739 | 68927 | yes | 5.048x | 13.315x | ## Pinned Compact Snapshot Evidence @@ -208,8 +208,8 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 112238 | `cd0796bb493a329acab25f666d956539e714aceb39c70e1d9c0376f950f6ed98` | -| Dependencies | 71348 | `c04d010154c2b501ba4cd497d4dcaf296155962e3518c3cfba8b22e929f508e1` | +| References | 105739 | `9329e3e2e1c03a13f9c0872b1b799c734457eec20c825168bcf8fc473578b72d` | +| Dependencies | 68927 | `76cf4451297dab436618c3ec4db94b9a30d2d645debd1d329637b33fce02120e` | The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. @@ -217,6 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. 
- Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. +- Function parameters, local assignment targets, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. 
diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index 7f1e15732..00e157e48 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 71348, + "count": 68927, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" } ], - "sha256": "c04d010154c2b501ba4cd497d4dcaf296155962e3518c3cfba8b22e929f508e1" + "sha256": "76cf4451297dab436618c3ec4db94b9a30d2d645debd1d329637b33fce02120e" }, "files": { "count": 4789, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 112238, + "count": 105739, "samples": [ { "import": null, @@ -888,7 +888,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" } ], - "sha256": "cd0796bb493a329acab25f666d956539e714aceb39c70e1d9c0376f950f6ed98" + "sha256": "9329e3e2e1c03a13f9c0872b1b799c734457eec20c825168bcf8fc473578b72d" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 71348, + "dependencies": 68927, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 112238, + "references": 105739, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index e8c5ee6a4..0996d0bd8 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. 
- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Full lexical scoping, attributes, and module references remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. 
diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index cc59c8798..74ca7600b 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -186,7 +186,8 @@ Recommended task format: - [ ] Implement lexical scope tables for name resolution. - [x] Implement first compact Python symbol reference extraction by identifier ranges. owner: codex. Result: records same-file and imported top-level symbol references inside top-level Python classes/functions. - [x] Attribute compact Python references to nested class/function source symbols. owner: codex. Result: nested Python functions and methods are indexed as non-top-level compact symbols, while public `Codebase.functions` remains top-level-only. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, and local assignment/parameter scope exclusion. +- [x] Exclude compact Python references shadowed by parameters, local assignments, and nested definitions. owner: codex. Result: avoids resolving local bindings to imported/top-level symbols and reduced Airflow compact references from 112,238 to 105,739. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, local imports, loop/with/except bindings, and `global`/`nonlocal`. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -264,3 +265,4 @@ Recommended task format: - [x] 2026-06-18: Added first pinned large-repo benchmark runner and Airflow baseline. owner: codex. 
Notes: Apache Airflow `2.10.5` at `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf` matched 4,789 Python files and measured 6.218x faster wall time with 9.882x lower max RSS for the current compact Rust `Codebase` slice. - [x] 2026-06-18: Added first pinned Airflow compact graph golden. owner: codex. Notes: committed stable hashes/samples for 4,789 files, 23,663 symbols, 40,580 imports, 19,011 import resolutions, 95,292 references, and 35,489 dependencies; the opt-in pytest wrapper can verify it against the pinned checkout. - [x] 2026-06-18: Added nested Python function/method compact symbols and innermost reference source attribution. owner: codex. Notes: Airflow compact coverage now emits 52,339 symbols, 112,238 references, and 71,348 dependencies while staying 5.309x faster with 9.418x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added first local-binding shadow filter for compact Python references. owner: codex. Notes: parameters, local assignments, and nested definitions no longer resolve to imported/top-level symbols; Airflow compact graph now emits 105,739 references and 68,927 dependencies while staying 5.048x faster with 13.315x lower max RSS than Python parse/object materialization. 
From c076fe4058b15458a71c0677d52f33dd0e9be626 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:44:41 -0700 Subject: [PATCH 020/228] Skip Rust references shadowed by local imports --- crates/graph-sitter-engine/src/lib.rs | 75 +++++++++++++++++++ rust-rewrite/benchmarks.md | 10 +-- .../apache-airflow-2.10.5-rust-compact.json | 12 +-- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +- 5 files changed, 90 insertions(+), 13 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 7ac2fe1e8..c88e91b08 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -766,6 +766,19 @@ fn collect_local_bindings_from_node( } return; } + "import_statement" | "import_from_statement" | "future_import_statement" => { + if let Some(source_symbol_id) = + innermost_symbol_for_range(symbol_ranges, node.range().into()) + { + for binding in local_import_binding_names(source, node) { + bindings + .entry(source_symbol_id) + .or_default() + .insert(binding); + } + } + return; + } _ => {} } @@ -817,6 +830,50 @@ fn push_local_binding_names( } } +fn local_import_binding_names(source: &str, node: Node<'_>) -> Vec { + match node.kind() { + "import_statement" => plain_import_binding_names(node_text(source, node)), + "import_from_statement" | "future_import_statement" => { + from_import_binding_names(node_text(source, node)) + } + _ => Vec::new(), + } +} + +fn plain_import_binding_names(text: &str) -> Vec { + text.trim() + .trim_start_matches("import") + .split(',') + .map(str::trim) + .filter(|part| !part.is_empty()) + .filter_map(|part| { + let (name, alias) = split_alias(part); + alias + .map(str::to_owned) + .or_else(|| name.split('.').next().map(str::to_owned)) + }) + .collect() +} + +fn from_import_binding_names(text: &str) -> Vec { + let stripped = text.trim(); + let Some(after_from) = stripped.strip_prefix("from ") else { + return Vec::new(); + }; + let Some((_, 
names)) = after_from.split_once(" import ") else { + return Vec::new(); + }; + names + .split(',') + .map(str::trim) + .filter(|part| !part.is_empty() && *part != "*") + .map(|part| { + let (name, alias) = split_alias(part); + alias.unwrap_or(name).to_owned() + }) + .collect() +} + fn collect_identifier_candidates( file_id: u32, source: &str, @@ -1485,7 +1542,9 @@ mod tests { fs::write( repo.join("pkg/service.py"), "from .base import Base, helper\n\n\ +other = object()\n\n\ def shadowed(Base):\n helper = Base\n return helper, Base\n\n\ +def import_shadowed():\n import other.module\n import other.module as helper\n from other import Base\n return helper, Base, other\n\n\ def caller():\n return helper()\n", ) .unwrap(); @@ -1503,11 +1562,21 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "caller") .unwrap(); + let import_shadowed = index + .symbols + .iter() + .find(|symbol| symbol.name == "import_shadowed") + .unwrap(); let helper = index .symbols .iter() .find(|symbol| symbol.name == "helper") .unwrap(); + let other = index + .symbols + .iter() + .find(|symbol| symbol.name == "other") + .unwrap(); let base = index .symbols .iter() @@ -1519,6 +1588,12 @@ def caller():\n return helper()\n", && (reference.target_symbol_id == base.id || reference.target_symbol_id == helper.id) })); + assert!(!index.references.iter().any(|reference| { + reference.source_symbol_id == Some(import_shadowed.id) + && (reference.target_symbol_id == base.id + || reference.target_symbol_id == helper.id + || reference.target_symbol_id == other.id) + })); assert!(index.references.iter().any(|reference| { reference.source_symbol_id == Some(caller.id) && reference.name == "helper" diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 44230c875..16c57fd78 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | 
Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.824s | 542.8 MB | 0.700s | 126.6 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4113 | 2953 | yes | 4.033x | 4.288x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 20.092s | 3469.7 MB | 3.981s | 260.6 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105739 | 68927 | yes | 5.048x | 13.315x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.978s | 543.6 MB | 0.713s | 126.4 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4113 | 2953 | yes | 4.177x | 4.301x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.534s | 3469.2 MB | 3.806s | 262.9 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105624 | 68869 | yes | 4.870x | 13.195x | ## Pinned Compact Snapshot Evidence @@ -208,8 +208,8 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 105739 | `9329e3e2e1c03a13f9c0872b1b799c734457eec20c825168bcf8fc473578b72d` | -| Dependencies | 68927 | `76cf4451297dab436618c3ec4db94b9a30d2d645debd1d329637b33fce02120e` | +| References | 105624 | `90264e3b168009f8285d85055f28b41b0442503b113b8cb64120ee6a5d3883d1` | +| Dependencies | 68869 | `2ec9bded47f9ea7add8b2084caa0eb7fe299998948e85afbb4102db1df5abddb` | The snapshot tool also validates internal compact graph integrity: 
import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. @@ -217,7 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. -- Function parameters, local assignment targets, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. +- Function parameters, local assignment targets, local imports, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. 
diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index 00e157e48..f5c234511 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 68927, + "count": 68869, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" } ], - "sha256": "76cf4451297dab436618c3ec4db94b9a30d2d645debd1d329637b33fce02120e" + "sha256": "2ec9bded47f9ea7add8b2084caa0eb7fe299998948e85afbb4102db1df5abddb" }, "files": { "count": 4789, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 105739, + "count": 105624, "samples": [ { "import": null, @@ -888,7 +888,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" } ], - "sha256": "9329e3e2e1c03a13f9c0872b1b799c734457eec20c825168bcf8fc473578b72d" + "sha256": "90264e3b168009f8285d85055f28b41b0442503b113b8cb64120ee6a5d3883d1" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 68927, + "dependencies": 68869, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 105739, + "references": 105624, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 0996d0bd8..b95143b60 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. 
- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, local imports, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. 
Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 74ca7600b..fec15b875 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -187,7 +187,8 @@ Recommended task format: - [x] Implement first compact Python symbol reference extraction by identifier ranges. owner: codex. Result: records same-file and imported top-level symbol references inside top-level Python classes/functions. - [x] Attribute compact Python references to nested class/function source symbols. owner: codex. Result: nested Python functions and methods are indexed as non-top-level compact symbols, while public `Codebase.functions` remains top-level-only. - [x] Exclude compact Python references shadowed by parameters, local assignments, and nested definitions. owner: codex. Result: avoids resolving local bindings to imported/top-level symbols and reduced Airflow compact references from 112,238 to 105,739. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, local imports, loop/with/except bindings, and `global`/`nonlocal`. +- [x] Exclude compact Python references shadowed by local imports. owner: codex. Result: avoids resolving function-local `import ... as ...`, `import pkg.mod`, and `from ... import ...` bindings to imported/top-level symbols; reduced Airflow compact references from 105,739 to 105,624 and dependencies from 68,927 to 68,869. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, loop/with/except bindings, and `global`/`nonlocal`. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. 
- [ ] Implement superclass/interface dependency edges. @@ -266,3 +267,4 @@ Recommended task format: - [x] 2026-06-18: Added first pinned Airflow compact graph golden. owner: codex. Notes: committed stable hashes/samples for 4,789 files, 23,663 symbols, 40,580 imports, 19,011 import resolutions, 95,292 references, and 35,489 dependencies; the opt-in pytest wrapper can verify it against the pinned checkout. - [x] 2026-06-18: Added nested Python function/method compact symbols and innermost reference source attribution. owner: codex. Notes: Airflow compact coverage now emits 52,339 symbols, 112,238 references, and 71,348 dependencies while staying 5.309x faster with 9.418x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added first local-binding shadow filter for compact Python references. owner: codex. Notes: parameters, local assignments, and nested definitions no longer resolve to imported/top-level symbols; Airflow compact graph now emits 105,739 references and 68,927 dependencies while staying 5.048x faster with 13.315x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added local-import shadow filtering for compact Python references. owner: codex. Notes: function-local imports no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,624 references and 68,869 dependencies while staying 4.870x faster with 13.195x lower max RSS than Python parse/object materialization. 
From 42bba51bc2f5ea9864d136f58082de9917b74f16 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:50:05 -0700 Subject: [PATCH 021/228] Skip Rust references shadowed by control flow bindings --- crates/graph-sitter-engine/src/lib.rs | 93 +++++++++++++++++-- rust-rewrite/benchmarks.md | 10 +- .../apache-airflow-2.10.5-rust-compact.json | 12 +-- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +- 5 files changed, 100 insertions(+), 21 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index c88e91b08..645308047 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -701,7 +701,7 @@ fn push_global_assignment( fn collect_assignment_targets<'tree>(node: Node<'tree>, out: &mut Vec>) { match node.kind() { "identifier" => out.push(node), - "pattern_list" | "tuple_pattern" | "list_pattern" => { + "as_pattern_target" | "pattern" | "pattern_list" | "tuple_pattern" | "list_pattern" => { let mut cursor = node.walk(); for child in node.named_children(&mut cursor) { collect_assignment_targets(child, out); @@ -756,16 +756,28 @@ fn collect_local_bindings_from_node( } "assignment" | "annotated_assignment" | "augmented_assignment" => { if let Some(left) = node.child_by_field_name("left") { - if let Some(source_symbol_id) = - innermost_symbol_for_range(symbol_ranges, left.range().into()) - { - let mut targets = Vec::new(); - collect_assignment_targets(left, &mut targets); - push_local_binding_names(source, source_symbol_id, targets, bindings); - } + push_local_binding_targets(source, left, symbol_ranges, bindings); } return; } + "for_statement" => { + if let Some(left) = node.child_by_field_name("left") { + push_local_binding_targets(source, left, symbol_ranges, bindings); + } + } + "with_statement" => { + if let Some(with_clause) = first_child_of_kind(node, &["with_clause"]) { + push_as_pattern_binding_targets(source, with_clause, symbol_ranges, bindings); + } 
+ } + "except_clause" => { + if let Some(alias) = node.child_by_field_name("alias") { + push_local_binding_targets(source, alias, symbol_ranges, bindings); + } + if let Some(value) = node.child_by_field_name("value") { + push_as_pattern_binding_targets(source, value, symbol_ranges, bindings); + } + } "import_statement" | "import_from_statement" | "future_import_statement" => { if let Some(source_symbol_id) = innermost_symbol_for_range(symbol_ranges, node.range().into()) @@ -788,6 +800,48 @@ fn collect_local_bindings_from_node( } } +fn push_local_binding_targets( + source: &str, + target_root: Node<'_>, + symbol_ranges: &[(u32, SourceRange)], + bindings: &mut HashMap>, +) { + if let Some(source_symbol_id) = + innermost_symbol_for_range(symbol_ranges, target_root.range().into()) + { + let mut targets = Vec::new(); + collect_assignment_targets(target_root, &mut targets); + push_local_binding_names(source, source_symbol_id, targets, bindings); + } +} + +fn push_as_pattern_binding_targets( + source: &str, + node: Node<'_>, + symbol_ranges: &[(u32, SourceRange)], + bindings: &mut HashMap>, +) { + let mut targets = Vec::new(); + collect_as_pattern_alias_targets(node, &mut targets); + for target in targets { + push_local_binding_targets(source, target, symbol_ranges, bindings); + } +} + +fn collect_as_pattern_alias_targets<'tree>(node: Node<'tree>, out: &mut Vec>) { + if node.kind() == "as_pattern" { + if let Some(alias) = node.child_by_field_name("alias") { + collect_assignment_targets(alias, out); + } + return; + } + + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_as_pattern_alias_targets(child, out); + } +} + fn collect_parameter_targets<'tree>(node: Node<'tree>, out: &mut Vec>) { match node.kind() { "identifier" => out.push(node), @@ -1543,8 +1597,10 @@ mod tests { repo.join("pkg/service.py"), "from .base import Base, helper\n\n\ other = object()\n\n\ +Error = Exception\n\n\ def shadowed(Base):\n helper = Base\n return 
helper, Base\n\n\ def import_shadowed():\n import other.module\n import other.module as helper\n from other import Base\n return helper, Base, other\n\n\ +def control_flow_shadowed(items, manager):\n for Base, helper in items:\n pass\n with manager as other:\n pass\n try:\n pass\n except Error as helper:\n return Base, helper, other\n\n\ def caller():\n return helper()\n", ) .unwrap(); @@ -1567,6 +1623,11 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "import_shadowed") .unwrap(); + let control_flow_shadowed = index + .symbols + .iter() + .find(|symbol| symbol.name == "control_flow_shadowed") + .unwrap(); let helper = index .symbols .iter() @@ -1582,6 +1643,11 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "Base") .unwrap(); + let error = index + .symbols + .iter() + .find(|symbol| symbol.name == "Error") + .unwrap(); assert!(!index.references.iter().any(|reference| { reference.source_symbol_id == Some(shadowed.id) @@ -1594,6 +1660,17 @@ def caller():\n return helper()\n", || reference.target_symbol_id == helper.id || reference.target_symbol_id == other.id) })); + assert!(!index.references.iter().any(|reference| { + reference.source_symbol_id == Some(control_flow_shadowed.id) + && (reference.target_symbol_id == base.id + || reference.target_symbol_id == helper.id + || reference.target_symbol_id == other.id) + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(control_flow_shadowed.id) + && reference.name == "Error" + && reference.target_symbol_id == error.id + })); assert!(index.references.iter().any(|reference| { reference.source_symbol_id == Some(caller.id) && reference.name == "helper" diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 16c57fd78..61f584ce8 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | 
Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.978s | 543.6 MB | 0.713s | 126.4 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4113 | 2953 | yes | 4.177x | 4.301x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.534s | 3469.2 MB | 3.806s | 262.9 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105624 | 68869 | yes | 4.870x | 13.195x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.908s | 544.8 MB | 0.695s | 125.0 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4101 | 2950 | yes | 4.184x | 4.359x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.661s | 3470.1 MB | 3.758s | 260.3 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105467 | 68848 | yes | 5.232x | 13.332x | ## Pinned Compact Snapshot Evidence @@ -208,8 +208,8 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 105624 | `90264e3b168009f8285d85055f28b41b0442503b113b8cb64120ee6a5d3883d1` | -| Dependencies | 68869 | `2ec9bded47f9ea7add8b2084caa0eb7fe299998948e85afbb4102db1df5abddb` | +| References | 105467 | `31d46be3ba07c666ca0c3c03639f0ee75e426c758d9655294cf3d7b7e6b9fe38` | +| Dependencies | 68848 | `000c01b9fe4230de809516b2b5e7ea8089d89a09e810536896f02c9b2a67b94a` | The snapshot tool also validates internal compact graph integrity: 
import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. @@ -217,7 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. -- Function parameters, local assignment targets, local imports, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. +- Function parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. 
diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index f5c234511..e9952919e 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 68869, + "count": 68848, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" } ], - "sha256": "2ec9bded47f9ea7add8b2084caa0eb7fe299998948e85afbb4102db1df5abddb" + "sha256": "000c01b9fe4230de809516b2b5e7ea8089d89a09e810536896f02c9b2a67b94a" }, "files": { "count": 4789, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 105624, + "count": 105467, "samples": [ { "import": null, @@ -888,7 +888,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" } ], - "sha256": "90264e3b168009f8285d85055f28b41b0442503b113b8cb64120ee6a5d3883d1" + "sha256": "31d46be3ba07c666ca0c3c03639f0ee75e426c758d9655294cf3d7b7e6b9fe38" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 68869, + "dependencies": 68848, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 105624, + "references": 105467, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index b95143b60..0d4a0a73d 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. 
- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, local imports, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. 
- `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index fec15b875..4d759edb6 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -188,7 +188,8 @@ Recommended task format: - [x] Attribute compact Python references to nested class/function source symbols. owner: codex. Result: nested Python functions and methods are indexed as non-top-level compact symbols, while public `Codebase.functions` remains top-level-only. - [x] Exclude compact Python references shadowed by parameters, local assignments, and nested definitions. owner: codex. Result: avoids resolving local bindings to imported/top-level symbols and reduced Airflow compact references from 112,238 to 105,739. - [x] Exclude compact Python references shadowed by local imports. owner: codex. Result: avoids resolving function-local `import ... as ...`, `import pkg.mod`, and `from ... import ...` bindings to imported/top-level symbols; reduced Airflow compact references from 105,739 to 105,624 and dependencies from 68,927 to 68,869. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, loop/with/except bindings, and `global`/`nonlocal`. +- [x] Exclude compact Python references shadowed by control-flow bindings. owner: codex. Result: avoids resolving `for` targets, `with ... as ...` targets, and `except ... as ...` targets to imported/top-level symbols; reduced Airflow compact references from 105,624 to 105,467 and dependencies from 68,869 to 68,848. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, pattern-match bindings, comprehensions, and `global`/`nonlocal`. - [x] Implement first compact dependency edge construction from usage records. owner: codex. 
Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -268,3 +269,4 @@ Recommended task format: - [x] 2026-06-18: Added nested Python function/method compact symbols and innermost reference source attribution. owner: codex. Notes: Airflow compact coverage now emits 52,339 symbols, 112,238 references, and 71,348 dependencies while staying 5.309x faster with 9.418x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added first local-binding shadow filter for compact Python references. owner: codex. Notes: parameters, local assignments, and nested definitions no longer resolve to imported/top-level symbols; Airflow compact graph now emits 105,739 references and 68,927 dependencies while staying 5.048x faster with 13.315x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added local-import shadow filtering for compact Python references. owner: codex. Notes: function-local imports no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,624 references and 68,869 dependencies while staying 4.870x faster with 13.195x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added control-flow binding shadow filtering for compact Python references. owner: codex. Notes: `for`, `with ... as ...`, and `except ... as ...` targets no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,467 references and 68,848 dependencies while staying 5.232x faster with 13.332x lower max RSS than Python parse/object materialization. 
From 07db26c1d3c72d55b49b9913892bdd69efce392a Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 14:55:56 -0700 Subject: [PATCH 022/228] Skip Rust references shadowed by match and comprehension bindings --- crates/graph-sitter-engine/src/lib.rs | 129 +++++++++++++++++++++++++- rust-rewrite/benchmarks.md | 6 +- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +- 4 files changed, 134 insertions(+), 7 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 645308047..b0a4ecb6c 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -765,6 +765,11 @@ fn collect_local_bindings_from_node( push_local_binding_targets(source, left, symbol_ranges, bindings); } } + "for_in_clause" => { + if let Some(left) = node.child_by_field_name("left") { + push_local_binding_targets(source, left, symbol_ranges, bindings); + } + } "with_statement" => { if let Some(with_clause) = first_child_of_kind(node, &["with_clause"]) { push_as_pattern_binding_targets(source, with_clause, symbol_ranges, bindings); @@ -778,6 +783,9 @@ fn collect_local_bindings_from_node( push_as_pattern_binding_targets(source, value, symbol_ranges, bindings); } } + "case_clause" => { + push_match_pattern_binding_targets(source, node, symbol_ranges, bindings); + } "import_statement" | "import_from_statement" | "future_import_statement" => { if let Some(source_symbol_id) = innermost_symbol_for_range(symbol_ranges, node.range().into()) @@ -800,6 +808,89 @@ fn collect_local_bindings_from_node( } } +fn push_match_pattern_binding_targets( + source: &str, + node: Node<'_>, + symbol_ranges: &[(u32, SourceRange)], + bindings: &mut HashMap>, +) { + let mut targets = Vec::new(); + collect_case_clause_binding_targets(node, &mut targets); + for target in targets { + push_local_binding_targets(source, target, symbol_ranges, bindings); + } +} + +fn collect_case_clause_binding_targets<'tree>(node: Node<'tree>, 
out: &mut Vec>) { + let mut cursor = node.walk(); + for (index, child) in node.children(&mut cursor).enumerate() { + if !child.is_named() || node.field_name_for_child(index as u32).is_some() { + continue; + } + if child.kind() == "case_pattern" { + collect_match_pattern_targets(child, out); + } + } +} + +fn collect_match_pattern_targets<'tree>(node: Node<'tree>, out: &mut Vec>) { + match node.kind() { + "identifier" => out.push(node), + "dotted_name" => { + let mut cursor = node.walk(); + let identifiers: Vec<_> = node + .named_children(&mut cursor) + .filter(|child| child.kind() == "identifier") + .collect(); + if identifiers.len() == 1 { + out.push(identifiers[0]); + } + } + "dict_pattern" => { + let mut cursor = node.walk(); + for child in node.children_by_field_name("value", &mut cursor) { + collect_match_pattern_targets(child, out); + } + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + if child.kind() == "splat_pattern" { + collect_match_pattern_targets(child, out); + } + } + } + "class_pattern" => { + let mut seen_constructor = false; + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + if !seen_constructor && child.kind() == "dotted_name" { + seen_constructor = true; + continue; + } + collect_match_pattern_targets(child, out); + } + } + "keyword_pattern" => { + let mut skipped_keyword = false; + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + if !skipped_keyword && child.kind() == "identifier" { + skipped_keyword = true; + continue; + } + collect_match_pattern_targets(child, out); + } + } + "case_pattern" | "as_pattern" | "list_pattern" | "tuple_pattern" | "splat_pattern" + | "union_pattern" => { + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_match_pattern_targets(child, out); + } + } + _ => {} + } +} + fn push_local_binding_targets( source: &str, target_root: Node<'_>, @@ -1590,17 +1681,19 @@ mod tests { 
fs::write(repo.join("pkg/__init__.py"), "").unwrap(); fs::write( repo.join("pkg/base.py"), - "class Base:\n pass\n\ndef helper():\n return Base\n", + "class Base:\n pass\n\nclass Point:\n pass\n\ndef helper():\n return Base\n", ) .unwrap(); fs::write( repo.join("pkg/service.py"), - "from .base import Base, helper\n\n\ + "from .base import Base, helper, Point\n\n\ other = object()\n\n\ Error = Exception\n\n\ def shadowed(Base):\n helper = Base\n return helper, Base\n\n\ def import_shadowed():\n import other.module\n import other.module as helper\n from other import Base\n return helper, Base, other\n\n\ def control_flow_shadowed(items, manager):\n for Base, helper in items:\n pass\n with manager as other:\n pass\n try:\n pass\n except Error as helper:\n return Base, helper, other\n\n\ +def comprehension_shadowed(items):\n return [Base + helper + other for Base, helper, other in items if Base]\n\n\ +def match_shadowed(subject):\n match subject:\n case Point(x=Base, y=helper) as other if Base:\n return Base, helper, other\n case {\"base\": Base, \"helper\": helper, **other}:\n return Base, helper, other\n\n\ def caller():\n return helper()\n", ) .unwrap(); @@ -1628,6 +1721,16 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "control_flow_shadowed") .unwrap(); + let comprehension_shadowed = index + .symbols + .iter() + .find(|symbol| symbol.name == "comprehension_shadowed") + .unwrap(); + let match_shadowed = index + .symbols + .iter() + .find(|symbol| symbol.name == "match_shadowed") + .unwrap(); let helper = index .symbols .iter() @@ -1648,6 +1751,11 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "Error") .unwrap(); + let point = index + .symbols + .iter() + .find(|symbol| symbol.name == "Point") + .unwrap(); assert!(!index.references.iter().any(|reference| { reference.source_symbol_id == Some(shadowed.id) @@ -1671,6 +1779,23 @@ def caller():\n return helper()\n", && reference.name == "Error" && 
reference.target_symbol_id == error.id })); + assert!(!index.references.iter().any(|reference| { + reference.source_symbol_id == Some(comprehension_shadowed.id) + && (reference.target_symbol_id == base.id + || reference.target_symbol_id == helper.id + || reference.target_symbol_id == other.id) + })); + assert!(!index.references.iter().any(|reference| { + reference.source_symbol_id == Some(match_shadowed.id) + && (reference.target_symbol_id == base.id + || reference.target_symbol_id == helper.id + || reference.target_symbol_id == other.id) + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(match_shadowed.id) + && reference.name == "Point" + && reference.target_symbol_id == point.id + })); assert!(index.references.iter().any(|reference| { reference.source_symbol_id == Some(caller.id) && reference.name == "helper" diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 61f584ce8..78c3bca71 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.908s | 544.8 MB | 0.695s | 125.0 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4101 | 2950 | yes | 4.184x | 4.359x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.661s | 3470.1 MB | 3.758s | 260.3 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105467 | 68848 | yes | 5.232x | 13.332x | +| `graph-sitter` repo checkout | `--disable-graph` | 3.064s | 544.6 MB | 0.802s | 125.2 MB | 
1133 | 1133 | 6505 | 6496 | 432 | 4089 | 2949 | yes | 3.820x | 4.350x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.531s | 3468.9 MB | 3.830s | 259.0 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105467 | 68848 | yes | 5.100x | 13.395x | ## Pinned Compact Snapshot Evidence @@ -217,7 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. -- Function parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. +- Function parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. 
- The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 0d4a0a73d..418e68e6b 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. 
Full lexical scoping, attributes, and module references remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 4d759edb6..dbf08ad43 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -189,7 +189,8 @@ Recommended task format: - [x] Exclude compact Python references shadowed by parameters, local assignments, and nested definitions. owner: codex. Result: avoids resolving local bindings to imported/top-level symbols and reduced Airflow compact references from 112,238 to 105,739. - [x] Exclude compact Python references shadowed by local imports. owner: codex. Result: avoids resolving function-local `import ... as ...`, `import pkg.mod`, and `from ... import ...` bindings to imported/top-level symbols; reduced Airflow compact references from 105,739 to 105,624 and dependencies from 68,927 to 68,869. - [x] Exclude compact Python references shadowed by control-flow bindings. owner: codex. Result: avoids resolving `for` targets, `with ... as ...` targets, and `except ... as ...` targets to imported/top-level symbols; reduced Airflow compact references from 105,624 to 105,467 and dependencies from 68,869 to 68,848. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, pattern-match bindings, comprehensions, and `global`/`nonlocal`. 
+- [x] Exclude compact Python references shadowed by comprehension targets and match-pattern captures. owner: codex. Result: avoids resolving comprehension loop targets and match capture patterns to imported/top-level symbols; reduced this checkout's compact references from 4,101 to 4,089 and dependencies from 2,950 to 2,949. The pinned Airflow `2.10.5` compact graph stayed at 105,467 references and 68,848 dependencies. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, lambda parameters, `global`/`nonlocal`, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -270,3 +271,4 @@ Recommended task format: - [x] 2026-06-18: Added first local-binding shadow filter for compact Python references. owner: codex. Notes: parameters, local assignments, and nested definitions no longer resolve to imported/top-level symbols; Airflow compact graph now emits 105,739 references and 68,927 dependencies while staying 5.048x faster with 13.315x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added local-import shadow filtering for compact Python references. owner: codex. Notes: function-local imports no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,624 references and 68,869 dependencies while staying 4.870x faster with 13.195x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added control-flow binding shadow filtering for compact Python references. owner: codex. Notes: `for`, `with ... as ...`, and `except ... 
as ...` targets no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,467 references and 68,848 dependencies while staying 5.232x faster with 13.332x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added comprehension and match-pattern capture shadow filtering for compact Python references. owner: codex. Notes: this checkout now emits 4,089 compact references and 2,949 dependencies; pinned Airflow remained graph-stable at 105,467 references and 68,848 dependencies while staying 5.100x faster with 13.395x lower max RSS than Python parse/object materialization. From ab737f4810990b485de3de2967767cc50bb2253d Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:01:20 -0700 Subject: [PATCH 023/228] Scope Rust reference shadowing for lambda parameters --- crates/graph-sitter-engine/src/lib.rs | 173 ++++++++++++++++++++++++-- rust-rewrite/benchmarks.md | 6 +- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +- 4 files changed, 171 insertions(+), 14 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index b0a4ecb6c..f95f454ed 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -302,6 +302,13 @@ struct ReferenceCandidate { range: SourceRange, } +#[derive(Debug, Clone, PartialEq, Eq)] +struct LocalBindingScope { + source_symbol_id: u32, + range: SourceRange, + names: HashSet, +} + impl PythonIndexer { fn new() -> Result { let mut parser = Parser::new(); @@ -454,7 +461,7 @@ fn extract_python_file( .filter(|symbol| symbol.file_id == file_id) .map(|symbol| (symbol.id, symbol.range)) .collect::>(); - let local_bindings_by_symbol_id = + let (local_bindings_by_symbol_id, local_binding_scopes) = collect_local_bindings(file_id, source, root, index, &symbol_ranges); collect_identifier_candidates( file_id, @@ -462,6 +469,7 @@ fn extract_python_file( root, &symbol_ranges, 
&local_bindings_by_symbol_id, + &local_binding_scopes, &excluded_name_ranges, reference_candidates, ); @@ -717,8 +725,9 @@ fn collect_local_bindings( root: Node<'_>, index: &PythonIndex, symbol_ranges: &[(u32, SourceRange)], -) -> HashMap> { +) -> (HashMap>, Vec) { let mut bindings: HashMap> = HashMap::new(); + let mut scoped_bindings: Vec = Vec::new(); for symbol in index .symbols @@ -733,8 +742,14 @@ fn collect_local_bindings( } } - collect_local_bindings_from_node(source, root, symbol_ranges, &mut bindings); - bindings + collect_local_bindings_from_node( + source, + root, + symbol_ranges, + &mut bindings, + &mut scoped_bindings, + ); + (bindings, scoped_bindings) } fn collect_local_bindings_from_node( @@ -742,6 +757,7 @@ fn collect_local_bindings_from_node( node: Node<'_>, symbol_ranges: &[(u32, SourceRange)], bindings: &mut HashMap>, + scoped_bindings: &mut Vec, ) { match node.kind() { "parameters" => { @@ -754,6 +770,9 @@ fn collect_local_bindings_from_node( } return; } + "lambda" => { + push_lambda_binding_scope(source, node, symbol_ranges, scoped_bindings); + } "assignment" | "annotated_assignment" | "augmented_assignment" => { if let Some(left) = node.child_by_field_name("left") { push_local_binding_targets(source, left, symbol_ranges, bindings); @@ -804,7 +823,41 @@ fn collect_local_bindings_from_node( let mut cursor = node.walk(); for child in node.named_children(&mut cursor) { - collect_local_bindings_from_node(source, child, symbol_ranges, bindings); + collect_local_bindings_from_node(source, child, symbol_ranges, bindings, scoped_bindings); + } +} + +fn push_lambda_binding_scope( + source: &str, + node: Node<'_>, + symbol_ranges: &[(u32, SourceRange)], + scoped_bindings: &mut Vec, +) { + let Some(parameters) = node.child_by_field_name("parameters") else { + return; + }; + let Some(body) = node.child_by_field_name("body") else { + return; + }; + let Some(source_symbol_id) = innermost_symbol_for_range(symbol_ranges, body.range().into()) + else { + 
return; + }; + + let mut targets = Vec::new(); + collect_parameter_targets(parameters, &mut targets); + let mut names = HashSet::new(); + for target in targets { + if let Ok(name) = target.utf8_text(source.as_bytes()) { + names.insert(name.to_owned()); + } + } + if !names.is_empty() { + scoped_bindings.push(LocalBindingScope { + source_symbol_id, + range: body.range().into(), + names, + }); } } @@ -948,7 +1001,7 @@ fn collect_parameter_targets<'tree>(node: Node<'tree>, out: &mut Vec collect_parameter_targets(first_child, out); } } - "parameters" => { + "parameters" | "lambda_parameters" => { let mut cursor = node.walk(); for child in node.named_children(&mut cursor) { collect_parameter_targets(child, out); @@ -1025,9 +1078,24 @@ fn collect_identifier_candidates( node: Node<'_>, symbol_ranges: &[(u32, SourceRange)], local_bindings_by_symbol_id: &HashMap>, + local_binding_scopes: &[LocalBindingScope], excluded_ranges: &[SourceRange], out: &mut Vec, ) { + if node.kind() == "lambda_parameters" { + collect_lambda_parameter_value_identifier_candidates( + file_id, + source, + node, + symbol_ranges, + local_bindings_by_symbol_id, + local_binding_scopes, + excluded_ranges, + out, + ); + return; + } + if matches!( node.kind(), "import_statement" | "import_from_statement" | "future_import_statement" @@ -1039,7 +1107,13 @@ fn collect_identifier_candidates( if node.kind() == "identifier" && !range_matches_any(range, excluded_ranges) { if let Ok(name) = node.utf8_text(source.as_bytes()) { let source_symbol_id = innermost_symbol_for_range(symbol_ranges, range); - if is_shadowed_local_binding(source_symbol_id, name, local_bindings_by_symbol_id) { + if is_shadowed_local_binding( + source_symbol_id, + name, + range, + local_bindings_by_symbol_id, + local_binding_scopes, + ) { return; } out.push(ReferenceCandidate { @@ -1059,20 +1133,78 @@ fn collect_identifier_candidates( child, symbol_ranges, local_bindings_by_symbol_id, + local_binding_scopes, excluded_ranges, out, ); } } +fn 
collect_lambda_parameter_value_identifier_candidates( + file_id: u32, + source: &str, + node: Node<'_>, + symbol_ranges: &[(u32, SourceRange)], + local_bindings_by_symbol_id: &HashMap>, + local_binding_scopes: &[LocalBindingScope], + excluded_ranges: &[SourceRange], + out: &mut Vec, +) { + match node.kind() { + "default_parameter" | "typed_default_parameter" => { + if let Some(value) = node.child_by_field_name("value") { + collect_identifier_candidates( + file_id, + source, + value, + symbol_ranges, + local_bindings_by_symbol_id, + local_binding_scopes, + excluded_ranges, + out, + ); + } + } + "lambda_parameters" => { + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_lambda_parameter_value_identifier_candidates( + file_id, + source, + child, + symbol_ranges, + local_bindings_by_symbol_id, + local_binding_scopes, + excluded_ranges, + out, + ); + } + } + _ => {} + } +} + fn is_shadowed_local_binding( source_symbol_id: Option, name: &str, + range: SourceRange, local_bindings_by_symbol_id: &HashMap>, + local_binding_scopes: &[LocalBindingScope], ) -> bool { - source_symbol_id - .and_then(|symbol_id| local_bindings_by_symbol_id.get(&symbol_id)) + let Some(source_symbol_id) = source_symbol_id else { + return false; + }; + if local_bindings_by_symbol_id + .get(&source_symbol_id) .is_some_and(|bindings| bindings.contains(name)) + { + return true; + } + local_binding_scopes.iter().any(|scope| { + scope.source_symbol_id == source_symbol_id + && contains_range(scope.range, range) + && scope.names.contains(name) + }) } fn innermost_symbol_for_range( @@ -1694,6 +1826,8 @@ def import_shadowed():\n import other.module\n import other.module as help def control_flow_shadowed(items, manager):\n for Base, helper in items:\n pass\n with manager as other:\n pass\n try:\n pass\n except Error as helper:\n return Base, helper, other\n\n\ def comprehension_shadowed(items):\n return [Base + helper + other for Base, helper, other in items if 
Base]\n\n\ def match_shadowed(subject):\n match subject:\n case Point(x=Base, y=helper) as other if Base:\n return Base, helper, other\n case {\"base\": Base, \"helper\": helper, **other}:\n return Base, helper, other\n\n\ +def lambda_shadowed():\n return (lambda Base, helper, *other: (Base, helper, other))\n\n\ +def lambda_default_ref():\n return (lambda local=Base: local)\n\n\ def caller():\n return helper()\n", ) .unwrap(); @@ -1731,6 +1865,16 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "match_shadowed") .unwrap(); + let lambda_shadowed = index + .symbols + .iter() + .find(|symbol| symbol.name == "lambda_shadowed") + .unwrap(); + let lambda_default_ref = index + .symbols + .iter() + .find(|symbol| symbol.name == "lambda_default_ref") + .unwrap(); let helper = index .symbols .iter() @@ -1796,6 +1940,17 @@ def caller():\n return helper()\n", && reference.name == "Point" && reference.target_symbol_id == point.id })); + assert!(!index.references.iter().any(|reference| { + reference.source_symbol_id == Some(lambda_shadowed.id) + && (reference.target_symbol_id == base.id + || reference.target_symbol_id == helper.id + || reference.target_symbol_id == other.id) + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(lambda_default_ref.id) + && reference.name == "Base" + && reference.target_symbol_id == base.id + })); assert!(index.references.iter().any(|reference| { reference.source_symbol_id == Some(caller.id) && reference.name == "helper" diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 78c3bca71..e569b1508 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust 
references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 3.064s | 544.6 MB | 0.802s | 125.2 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4089 | 2949 | yes | 3.820x | 4.350x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.531s | 3468.9 MB | 3.830s | 259.0 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105467 | 68848 | yes | 5.100x | 13.395x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.994s | 544.4 MB | 0.730s | 125.8 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4089 | 2949 | yes | 4.100x | 4.329x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 20.043s | 3469.3 MB | 4.024s | 257.8 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105467 | 68848 | yes | 4.981x | 13.456x | ## Pinned Compact Snapshot Evidence @@ -217,7 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. -- Function parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. 
+- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 418e68e6b..a940a2801 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. 
Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index dbf08ad43..56b5045f5 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -190,7 +190,8 @@ Recommended task format: - [x] Exclude compact Python references shadowed by local imports. owner: codex. Result: avoids resolving function-local `import ... 
as ...`, `import pkg.mod`, and `from ... import ...` bindings to imported/top-level symbols; reduced Airflow compact references from 105,739 to 105,624 and dependencies from 68,927 to 68,869. - [x] Exclude compact Python references shadowed by control-flow bindings. owner: codex. Result: avoids resolving `for` targets, `with ... as ...` targets, and `except ... as ...` targets to imported/top-level symbols; reduced Airflow compact references from 105,624 to 105,467 and dependencies from 68,869 to 68,848. - [x] Exclude compact Python references shadowed by comprehension targets and match-pattern captures. owner: codex. Result: avoids resolving comprehension loop targets and match capture patterns to imported/top-level symbols; reduced this checkout's compact references from 4,101 to 4,089 and dependencies from 2,950 to 2,949. The pinned Airflow `2.10.5` compact graph stayed at 105,467 references and 68,848 dependencies. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, lambda parameters, `global`/`nonlocal`, and order-sensitive scopes. +- [x] Exclude compact Python references shadowed by lambda parameters. owner: codex. Result: adds range-scoped lambda-body bindings so lambda parameters shadow inside the lambda body without hiding legitimate default-value references such as `lambda local=Base: local`; this checkout and pinned Airflow stayed graph-stable at 4,089 and 105,467 references respectively. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, `global`/`nonlocal`, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. 
- [ ] Implement superclass/interface dependency edges. @@ -272,3 +273,4 @@ Recommended task format: - [x] 2026-06-18: Added local-import shadow filtering for compact Python references. owner: codex. Notes: function-local imports no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,624 references and 68,869 dependencies while staying 4.870x faster with 13.195x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added control-flow binding shadow filtering for compact Python references. owner: codex. Notes: `for`, `with ... as ...`, and `except ... as ...` targets no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,467 references and 68,848 dependencies while staying 5.232x faster with 13.332x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added comprehension and match-pattern capture shadow filtering for compact Python references. owner: codex. Notes: this checkout now emits 4,089 compact references and 2,949 dependencies; pinned Airflow remained graph-stable at 105,467 references and 68,848 dependencies while staying 5.100x faster with 13.395x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added range-scoped lambda-parameter shadow filtering for compact Python references. owner: codex. Notes: lambda parameters now shadow only inside lambda bodies while default-value references still resolve outward; pinned Airflow stayed graph-stable at 105,467 references and 68,848 dependencies while staying 4.981x faster with 13.456x lower max RSS than Python parse/object materialization. 
From 4d45c3408e9122f6736d3c4aa3dc79c64ad331fa Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:06:11 -0700 Subject: [PATCH 024/228] Honor Python global declarations in Rust references --- crates/graph-sitter-engine/src/lib.rs | 73 ++++++++++++++++++- rust-rewrite/benchmarks.md | 10 +-- .../apache-airflow-2.10.5-rust-compact.json | 12 +-- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +- 5 files changed, 86 insertions(+), 15 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index f95f454ed..7320cbec7 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -727,6 +727,7 @@ fn collect_local_bindings( symbol_ranges: &[(u32, SourceRange)], ) -> (HashMap>, Vec) { let mut bindings: HashMap> = HashMap::new(); + let mut global_declarations: HashMap> = HashMap::new(); let mut scoped_bindings: Vec = Vec::new(); for symbol in index @@ -747,8 +748,14 @@ fn collect_local_bindings( root, symbol_ranges, &mut bindings, + &mut global_declarations, &mut scoped_bindings, ); + for (symbol_id, names) in global_declarations { + if let Some(bindings) = bindings.get_mut(&symbol_id) { + bindings.retain(|name| !names.contains(name)); + } + } (bindings, scoped_bindings) } @@ -757,6 +764,7 @@ fn collect_local_bindings_from_node( node: Node<'_>, symbol_ranges: &[(u32, SourceRange)], bindings: &mut HashMap>, + global_declarations: &mut HashMap>, scoped_bindings: &mut Vec, ) { match node.kind() { @@ -773,6 +781,19 @@ fn collect_local_bindings_from_node( "lambda" => { push_lambda_binding_scope(source, node, symbol_ranges, scoped_bindings); } + "global_statement" => { + if let Some(source_symbol_id) = + innermost_symbol_for_range(symbol_ranges, node.range().into()) + { + for name in declaration_names(source, node) { + global_declarations + .entry(source_symbol_id) + .or_default() + .insert(name); + } + } + return; + } "assignment" | "annotated_assignment" | 
"augmented_assignment" => { if let Some(left) = node.child_by_field_name("left") { push_local_binding_targets(source, left, symbol_ranges, bindings); @@ -823,10 +844,31 @@ fn collect_local_bindings_from_node( let mut cursor = node.walk(); for child in node.named_children(&mut cursor) { - collect_local_bindings_from_node(source, child, symbol_ranges, bindings, scoped_bindings); + collect_local_bindings_from_node( + source, + child, + symbol_ranges, + bindings, + global_declarations, + scoped_bindings, + ); } } +fn declaration_names(source: &str, node: Node<'_>) -> Vec { + let mut names = Vec::new(); + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + if child.kind() != "identifier" { + continue; + } + if let Ok(name) = child.utf8_text(source.as_bytes()) { + names.push(name.to_owned()); + } + } + names +} + fn push_lambda_binding_scope( source: &str, node: Node<'_>, @@ -1098,7 +1140,11 @@ fn collect_identifier_candidates( if matches!( node.kind(), - "import_statement" | "import_from_statement" | "future_import_statement" + "import_statement" + | "import_from_statement" + | "future_import_statement" + | "global_statement" + | "nonlocal_statement" ) { return; } @@ -1828,6 +1874,7 @@ def comprehension_shadowed(items):\n return [Base + helper + other for Base, def match_shadowed(subject):\n match subject:\n case Point(x=Base, y=helper) as other if Base:\n return Base, helper, other\n case {\"base\": Base, \"helper\": helper, **other}:\n return Base, helper, other\n\n\ def lambda_shadowed():\n return (lambda Base, helper, *other: (Base, helper, other))\n\n\ def lambda_default_ref():\n return (lambda local=Base: local)\n\n\ +def global_declared():\n global other\n other = Base\n return other\n\n\ def caller():\n return helper()\n", ) .unwrap(); @@ -1875,6 +1922,11 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "lambda_default_ref") .unwrap(); + let global_declared = index + .symbols + .iter() + .find(|symbol| 
symbol.name == "global_declared") + .unwrap(); let helper = index .symbols .iter() @@ -1951,6 +2003,23 @@ def caller():\n return helper()\n", && reference.name == "Base" && reference.target_symbol_id == base.id })); + assert_eq!( + index + .references + .iter() + .filter(|reference| { + reference.source_symbol_id == Some(global_declared.id) + && reference.name == "other" + && reference.target_symbol_id == other.id + }) + .count(), + 2 + ); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(global_declared.id) + && reference.name == "Base" + && reference.target_symbol_id == base.id + })); assert!(index.references.iter().any(|reference| { reference.source_symbol_id == Some(caller.id) && reference.name == "helper" diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index e569b1508..dc92c6dd9 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.994s | 544.4 MB | 0.730s | 125.8 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4089 | 2949 | yes | 4.100x | 4.329x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 20.043s | 3469.3 MB | 4.024s | 257.8 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105467 | 68848 | yes | 4.981x | 13.456x | +| `graph-sitter` repo checkout | `--disable-graph` | 3.047s | 543.9 MB | 0.698s | 125.4 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4122 | 2960 | yes | 4.364x | 4.337x | +| Apache Airflow `2.10.5` 
(`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.718s | 3470.5 MB | 3.953s | 259.1 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105607 | 68917 | yes | 4.987x | 13.393x | ## Pinned Compact Snapshot Evidence @@ -208,8 +208,8 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 105467 | `31d46be3ba07c666ca0c3c03639f0ee75e426c758d9655294cf3d7b7e6b9fe38` | -| Dependencies | 68848 | `000c01b9fe4230de809516b2b5e7ea8089d89a09e810536896f02c9b2a67b94a` | +| References | 105607 | `1d4a195687476b4f6605966075f6ef844b3ec93b0fb40b65451202d4901d8469` | +| Dependencies | 68917 | `daf4311756da7daa360ffedc08641e9b4675ebff2ccda708153843f64f1fc183` | The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. @@ -217,7 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. -- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... 
as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. +- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. 
diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index e9952919e..cc78dc6bc 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 68848, + "count": 68917, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" } ], - "sha256": "000c01b9fe4230de809516b2b5e7ea8089d89a09e810536896f02c9b2a67b94a" + "sha256": "daf4311756da7daa360ffedc08641e9b4675ebff2ccda708153843f64f1fc183" }, "files": { "count": 4789, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 105467, + "count": 105607, "samples": [ { "import": null, @@ -888,7 +888,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" } ], - "sha256": "31d46be3ba07c666ca0c3c03639f0ee75e426c758d9655294cf3d7b7e6b9fe38" + "sha256": "1d4a195687476b4f6605966075f6ef844b3ec93b0fb40b65451202d4901d8469" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 68848, + "dependencies": 68917, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 105467, + "references": 105607, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index a940a2801..caf57d8ce 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. 
- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. Full lexical scoping, attributes, and module references remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Full lexical scoping, attributes, and module references remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. 
Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 56b5045f5..a56d59593 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -191,7 +191,8 @@ Recommended task format: - [x] Exclude compact Python references shadowed by control-flow bindings. owner: codex. Result: avoids resolving `for` targets, `with ... as ...` targets, and `except ... as ...` targets to imported/top-level symbols; reduced Airflow compact references from 105,624 to 105,467 and dependencies from 68,869 to 68,848. - [x] Exclude compact Python references shadowed by comprehension targets and match-pattern captures. owner: codex. Result: avoids resolving comprehension loop targets and match capture patterns to imported/top-level symbols; reduced this checkout's compact references from 4,101 to 4,089 and dependencies from 2,950 to 2,949. The pinned Airflow `2.10.5` compact graph stayed at 105,467 references and 68,848 dependencies. - [x] Exclude compact Python references shadowed by lambda parameters. owner: codex. Result: adds range-scoped lambda-body bindings so lambda parameters shadow inside the lambda body without hiding legitimate default-value references such as `lambda local=Base: local`; this checkout and pinned Airflow stayed graph-stable at 4,089 and 105,467 references respectively. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, `global`/`nonlocal`, and order-sensitive scopes. +- [x] Preserve compact Python references for `global` declarations. owner: codex. 
Result: `global` names are removed from the function-local shadow set, so module-level writes and uses remain visible; Airflow compact coverage now emits 105,607 references and 68,917 dependencies. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, `nonlocal`, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -274,3 +275,4 @@ Recommended task format: - [x] 2026-06-18: Added control-flow binding shadow filtering for compact Python references. owner: codex. Notes: `for`, `with ... as ...`, and `except ... as ...` targets no longer resolve later uses to imported/top-level symbols; Airflow compact graph now emits 105,467 references and 68,848 dependencies while staying 5.232x faster with 13.332x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added comprehension and match-pattern capture shadow filtering for compact Python references. owner: codex. Notes: this checkout now emits 4,089 compact references and 2,949 dependencies; pinned Airflow remained graph-stable at 105,467 references and 68,848 dependencies while staying 5.100x faster with 13.395x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added range-scoped lambda-parameter shadow filtering for compact Python references. owner: codex. Notes: lambda parameters now shadow only inside lambda bodies while default-value references still resolve outward; pinned Airflow stayed graph-stable at 105,467 references and 68,848 dependencies while staying 4.981x faster with 13.456x lower max RSS than Python parse/object materialization. 
+- [x] 2026-06-18: Added `global` declaration handling for compact Python references. owner: codex. Notes: `global` declarations no longer hide module-level symbols behind local assignment shadows; Airflow compact graph now emits 105,607 references and 68,917 dependencies while staying 4.987x faster with 13.393x lower max RSS than Python parse/object materialization. From 6a732c12724cd1a3b56d550368cea580c0fe32c7 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:11:01 -0700 Subject: [PATCH 025/228] Skip Python attribute fields in Rust references --- crates/graph-sitter-engine/src/lib.rs | 39 +++++++++++++++++++ rust-rewrite/benchmarks.md | 9 +++-- .../apache-airflow-2.10.5-rust-compact.json | 12 +++--- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 6 ++- 5 files changed, 55 insertions(+), 13 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 7320cbec7..1c9fc5374 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1138,6 +1138,22 @@ fn collect_identifier_candidates( return; } + if node.kind() == "attribute" { + if let Some(object) = node.child_by_field_name("object") { + collect_identifier_candidates( + file_id, + source, + object, + symbol_ranges, + local_bindings_by_symbol_id, + local_binding_scopes, + excluded_ranges, + out, + ); + } + return; + } + if matches!( node.kind(), "import_statement" @@ -1875,6 +1891,7 @@ def match_shadowed(subject):\n match subject:\n case Point(x=Base, y=h def lambda_shadowed():\n return (lambda Base, helper, *other: (Base, helper, other))\n\n\ def lambda_default_ref():\n return (lambda local=Base: local)\n\n\ def global_declared():\n global other\n other = Base\n return other\n\n\ +def attribute_names_are_not_bare_references(obj):\n return obj.helper, other.helper, helper.attr\n\n\ def caller():\n return helper()\n", ) .unwrap(); @@ -1927,6 +1944,11 @@ def caller():\n return helper()\n", .iter() 
.find(|symbol| symbol.name == "global_declared") .unwrap(); + let attribute_names = index + .symbols + .iter() + .find(|symbol| symbol.name == "attribute_names_are_not_bare_references") + .unwrap(); let helper = index .symbols .iter() @@ -2020,6 +2042,23 @@ def caller():\n return helper()\n", && reference.name == "Base" && reference.target_symbol_id == base.id })); + assert_eq!( + index + .references + .iter() + .filter(|reference| { + reference.source_symbol_id == Some(attribute_names.id) + && reference.name == "helper" + && reference.target_symbol_id == helper.id + }) + .count(), + 1 + ); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(attribute_names.id) + && reference.name == "other" + && reference.target_symbol_id == other.id + })); assert!(index.references.iter().any(|reference| { reference.source_symbol_id == Some(caller.id) && reference.name == "helper" diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index dc92c6dd9..6e73f8a13 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 3.047s | 543.9 MB | 0.698s | 125.4 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4122 | 2960 | yes | 4.364x | 4.337x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.718s | 3470.5 MB | 3.953s | 259.1 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 105607 | 68917 | yes | 4.987x | 13.393x | +| `graph-sitter` repo checkout | 
`--disable-graph` | 2.915s | 544.0 MB | 0.708s | 120.9 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 4.120x | 4.498x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.408s | 3470.3 MB | 3.864s | 220.4 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 104622 | 68340 | yes | 5.023x | 15.744x | ## Pinned Compact Snapshot Evidence @@ -208,8 +208,8 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 105607 | `1d4a195687476b4f6605966075f6ef844b3ec93b0fb40b65451202d4901d8469` | -| Dependencies | 68917 | `daf4311756da7daa360ffedc08641e9b4675ebff2ccda708153843f64f1fc183` | +| References | 104622 | `3a6a5c38e0485307841d9ba55aecbc5578d341cea1a295e133fa2f34d93f8133` | +| Dependencies | 68340 | `69840447840f50c6513e72d85086271996df8ea7104f6e510079e5d105ae0c51` | The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. @@ -218,6 +218,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. 
- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. +- Attribute field names are skipped as bare-name references until full attribute/module resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. 
diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index cc78dc6bc..910376d33 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 68917, + "count": 68340, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" } ], - "sha256": "daf4311756da7daa360ffedc08641e9b4675ebff2ccda708153843f64f1fc183" + "sha256": "69840447840f50c6513e72d85086271996df8ea7104f6e510079e5d105ae0c51" }, "files": { "count": 4789, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 105607, + "count": 104622, "samples": [ { "import": null, @@ -888,7 +888,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" } ], - "sha256": "1d4a195687476b4f6605966075f6ef844b3ec93b0fb40b65451202d4901d8469" + "sha256": "3a6a5c38e0485307841d9ba55aecbc5578d341cea1a295e133fa2f34d93f8133" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 68917, + "dependencies": 68340, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 105607, + "references": 104622, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index caf57d8ce..561e5335d 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. 
- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Full lexical scoping, attributes, and module references remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. 
Full lexical scoping, full attribute/module resolution, and `nonlocal` remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index a56d59593..6d81f09e0 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -192,7 +192,8 @@ Recommended task format: - [x] Exclude compact Python references shadowed by comprehension targets and match-pattern captures. owner: codex. Result: avoids resolving comprehension loop targets and match capture patterns to imported/top-level symbols; reduced this checkout's compact references from 4,101 to 4,089 and dependencies from 2,950 to 2,949. The pinned Airflow `2.10.5` compact graph stayed at 105,467 references and 68,848 dependencies. - [x] Exclude compact Python references shadowed by lambda parameters. owner: codex. Result: adds range-scoped lambda-body bindings so lambda parameters shadow inside the lambda body without hiding legitimate default-value references such as `lambda local=Base: local`; this checkout and pinned Airflow stayed graph-stable at 4,089 and 105,467 references respectively. - [x] Preserve compact Python references for `global` declarations. owner: codex. Result: `global` names are removed from the function-local shadow set, so module-level writes and uses remain visible; Airflow compact coverage now emits 105,607 references and 68,917 dependencies. 
-- [ ] Expand symbol usage extraction to full lexical shadowing behavior, attributes, module references, `nonlocal`, and order-sensitive scopes. +- [x] Stop treating Python attribute field names as bare compact references. owner: codex. Result: scans the object side of attribute expressions but skips the field-name side; Airflow compact coverage now emits 104,622 references and 68,340 dependencies. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/module resolution, `nonlocal`, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -235,7 +236,7 @@ Recommended task format: - [ ] Add large-repo memory regression benchmark to CI or nightly. - [x] Add pinned large-repo latency/RSS benchmark harness. owner: codex. Result: Airflow `2.10.5` benchmark command emits backend, wall time, max RSS, file count, node/edge counts, compact Rust record counts, mismatch summaries, and pass/fail gates. - [x] Add opt-in pinned large-repo compact snapshot test. owner: codex. Result: `tests/integration/rust_rewrite/test_pinned_airflow_snapshot.py` runs the committed Airflow compact golden check when `GRAPH_SITTER_RUN_PINNED_AIRFLOW_SNAPSHOT=1`. -- [ ] Add pinned large-repo parity test for reference graph, import graph, dependency graph, and latency/RSS. Notes: run against the exact checked-out commit and emit backend, wall time, max RSS, file count, node/edge counts, and mismatch summaries. +- [ ] Add pinned large-repo parity test for reference graph, import graph, dependency graph, and latency/RSS. 
Notes: start with Apache Airflow `2.10.5` at commit `b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`; assert reference graph, import graph, dependency graph, deterministic ordering, and benchmark wall/RSS against the exact checkout before adding more canonical repos. - [ ] Add feature flag documentation. - [ ] Add migration notes for unsupported APIs. - [ ] Decide default backend criteria. @@ -276,3 +277,4 @@ Recommended task format: - [x] 2026-06-18: Added comprehension and match-pattern capture shadow filtering for compact Python references. owner: codex. Notes: this checkout now emits 4,089 compact references and 2,949 dependencies; pinned Airflow remained graph-stable at 105,467 references and 68,848 dependencies while staying 5.100x faster with 13.395x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added range-scoped lambda-parameter shadow filtering for compact Python references. owner: codex. Notes: lambda parameters now shadow only inside lambda bodies while default-value references still resolve outward; pinned Airflow stayed graph-stable at 105,467 references and 68,848 dependencies while staying 4.981x faster with 13.456x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added `global` declaration handling for compact Python references. owner: codex. Notes: `global` declarations no longer hide module-level symbols behind local assignment shadows; Airflow compact graph now emits 105,607 references and 68,917 dependencies while staying 4.987x faster with 13.393x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Skipped Python attribute field names as bare compact references. owner: codex. Notes: object-side references still resolve, but `obj.helper` no longer creates a false standalone `helper` dependency; Airflow compact graph now emits 104,622 references and 68,340 dependencies while staying 5.023x faster with 15.744x lower max RSS than Python parse/object materialization. 
From 10169d04682c3eb58ad33305751c08bd6617a11a Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:15:44 -0700 Subject: [PATCH 026/228] Scope Python comprehension bindings in Rust references --- crates/graph-sitter-engine/src/lib.rs | 87 +++++++++++++++++++++++++-- rust-rewrite/benchmarks.md | 6 +- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 2 + 4 files changed, 88 insertions(+), 9 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 1c9fc5374..acd84bf0a 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -781,6 +781,12 @@ fn collect_local_bindings_from_node( "lambda" => { push_lambda_binding_scope(source, node, symbol_ranges, scoped_bindings); } + "list_comprehension" + | "set_comprehension" + | "dictionary_comprehension" + | "generator_expression" => { + push_comprehension_binding_scope(source, node, symbol_ranges, scoped_bindings); + } "global_statement" => { if let Some(source_symbol_id) = innermost_symbol_for_range(symbol_ranges, node.range().into()) @@ -798,6 +804,16 @@ fn collect_local_bindings_from_node( if let Some(left) = node.child_by_field_name("left") { push_local_binding_targets(source, left, symbol_ranges, bindings); } + if let Some(right) = node.child_by_field_name("right") { + collect_local_bindings_from_node( + source, + right, + symbol_ranges, + bindings, + global_declarations, + scoped_bindings, + ); + } return; } "for_statement" => { @@ -805,11 +821,6 @@ fn collect_local_bindings_from_node( push_local_binding_targets(source, left, symbol_ranges, bindings); } } - "for_in_clause" => { - if let Some(left) = node.child_by_field_name("left") { - push_local_binding_targets(source, left, symbol_ranges, bindings); - } - } "with_statement" => { if let Some(with_clause) = first_child_of_kind(node, &["with_clause"]) { push_as_pattern_binding_targets(source, with_clause, symbol_ranges, bindings); @@ -869,6 +880,48 @@ fn 
declaration_names(source: &str, node: Node<'_>) -> Vec { names } +fn push_comprehension_binding_scope( + source: &str, + node: Node<'_>, + symbol_ranges: &[(u32, SourceRange)], + scoped_bindings: &mut Vec, +) { + let Some(source_symbol_id) = innermost_symbol_for_range(symbol_ranges, node.range().into()) + else { + return; + }; + + let mut targets = Vec::new(); + collect_comprehension_targets(node, &mut targets); + let mut names = HashSet::new(); + for target in targets { + if let Ok(name) = target.utf8_text(source.as_bytes()) { + names.insert(name.to_owned()); + } + } + if !names.is_empty() { + scoped_bindings.push(LocalBindingScope { + source_symbol_id, + range: node.range().into(), + names, + }); + } +} + +fn collect_comprehension_targets<'tree>(node: Node<'tree>, out: &mut Vec>) { + if node.kind() == "for_in_clause" { + if let Some(left) = node.child_by_field_name("left") { + collect_assignment_targets(left, out); + } + return; + } + + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + collect_comprehension_targets(child, out); + } +} + fn push_lambda_binding_scope( source: &str, node: Node<'_>, @@ -1887,6 +1940,7 @@ def shadowed(Base):\n helper = Base\n return helper, Base\n\n\ def import_shadowed():\n import other.module\n import other.module as helper\n from other import Base\n return helper, Base, other\n\n\ def control_flow_shadowed(items, manager):\n for Base, helper in items:\n pass\n with manager as other:\n pass\n try:\n pass\n except Error as helper:\n return Base, helper, other\n\n\ def comprehension_shadowed(items):\n return [Base + helper + other for Base, helper, other in items if Base]\n\n\ +def comprehension_scope_does_not_leak(items):\n values = [Base + helper for Base, helper in items]\n return Base, helper, other\n\n\ def match_shadowed(subject):\n match subject:\n case Point(x=Base, y=helper) as other if Base:\n return Base, helper, other\n case {\"base\": Base, \"helper\": helper, **other}:\n return Base, 
helper, other\n\n\ def lambda_shadowed():\n return (lambda Base, helper, *other: (Base, helper, other))\n\n\ def lambda_default_ref():\n return (lambda local=Base: local)\n\n\ @@ -1924,6 +1978,11 @@ def caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "comprehension_shadowed") .unwrap(); + let comprehension_scope_does_not_leak = index + .symbols + .iter() + .find(|symbol| symbol.name == "comprehension_scope_does_not_leak") + .unwrap(); let match_shadowed = index .symbols .iter() @@ -2003,6 +2062,24 @@ def caller():\n return helper()\n", || reference.target_symbol_id == helper.id || reference.target_symbol_id == other.id) })); + for (name, target_symbol_id) in [ + ("Base", base.id), + ("helper", helper.id), + ("other", other.id), + ] { + assert_eq!( + index + .references + .iter() + .filter(|reference| { + reference.source_symbol_id == Some(comprehension_scope_does_not_leak.id) + && reference.name == name + && reference.target_symbol_id == target_symbol_id + }) + .count(), + 1 + ); + } assert!(!index.references.iter().any(|reference| { reference.source_symbol_id == Some(match_shadowed.id) && (reference.target_symbol_id == base.id diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 6e73f8a13..6ac24dbd1 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.915s | 544.0 MB | 0.708s | 120.9 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 4.120x | 4.498x | -| Apache 
Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.408s | 3470.3 MB | 3.864s | 220.4 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 104622 | 68340 | yes | 5.023x | 15.744x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.882s | 546.2 MB | 0.681s | 123.2 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 4.235x | 4.435x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.898s | 3468.8 MB | 4.061s | 220.3 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 104622 | 68340 | yes | 4.899x | 15.745x | ## Pinned Compact Snapshot Evidence @@ -217,7 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. -- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. +- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... 
as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. Comprehension targets are scoped to the comprehension expression instead of leaking to the whole enclosing function. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. - Attribute field names are skipped as bare-name references until full attribute/module resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 561e5335d..646fafcd1 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. 
Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping, full attribute/module resolution, and `nonlocal` remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping, full attribute/module resolution, and `nonlocal` remain future work. 
- Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 6d81f09e0..6cc137931 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -190,6 +190,7 @@ Recommended task format: - [x] Exclude compact Python references shadowed by local imports. owner: codex. Result: avoids resolving function-local `import ... as ...`, `import pkg.mod`, and `from ... import ...` bindings to imported/top-level symbols; reduced Airflow compact references from 105,739 to 105,624 and dependencies from 68,927 to 68,869. - [x] Exclude compact Python references shadowed by control-flow bindings. owner: codex. Result: avoids resolving `for` targets, `with ... as ...` targets, and `except ... as ...` targets to imported/top-level symbols; reduced Airflow compact references from 105,624 to 105,467 and dependencies from 68,869 to 68,848. - [x] Exclude compact Python references shadowed by comprehension targets and match-pattern captures. owner: codex. Result: avoids resolving comprehension loop targets and match capture patterns to imported/top-level symbols; reduced this checkout's compact references from 4,101 to 4,089 and dependencies from 2,950 to 2,949. The pinned Airflow `2.10.5` compact graph stayed at 105,467 references and 68,848 dependencies. +- [x] Scope compact Python comprehension target shadows to comprehension expressions. owner: codex. 
Result: comprehension loop targets no longer hide later references in the enclosing function; current checkout and pinned Airflow stayed graph-stable at 4,110 and 104,622 references respectively after the attribute-field skip baseline. - [x] Exclude compact Python references shadowed by lambda parameters. owner: codex. Result: adds range-scoped lambda-body bindings so lambda parameters shadow inside the lambda body without hiding legitimate default-value references such as `lambda local=Base: local`; this checkout and pinned Airflow stayed graph-stable at 4,089 and 105,467 references respectively. - [x] Preserve compact Python references for `global` declarations. owner: codex. Result: `global` names are removed from the function-local shadow set, so module-level writes and uses remain visible; Airflow compact coverage now emits 105,607 references and 68,917 dependencies. - [x] Stop treating Python attribute field names as bare compact references. owner: codex. Result: scans the object side of attribute expressions but skips the field-name side; Airflow compact coverage now emits 104,622 references and 68,340 dependencies. @@ -278,3 +279,4 @@ Recommended task format: - [x] 2026-06-18: Added range-scoped lambda-parameter shadow filtering for compact Python references. owner: codex. Notes: lambda parameters now shadow only inside lambda bodies while default-value references still resolve outward; pinned Airflow stayed graph-stable at 105,467 references and 68,848 dependencies while staying 4.981x faster with 13.456x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added `global` declaration handling for compact Python references. owner: codex. Notes: `global` declarations no longer hide module-level symbols behind local assignment shadows; Airflow compact graph now emits 105,607 references and 68,917 dependencies while staying 4.987x faster with 13.393x lower max RSS than Python parse/object materialization. 
- [x] 2026-06-18: Skipped Python attribute field names as bare compact references. owner: codex. Notes: object-side references still resolve, but `obj.helper` no longer creates a false standalone `helper` dependency; Airflow compact graph now emits 104,622 references and 68,340 dependencies while staying 5.023x faster with 15.744x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Scoped comprehension target shadowing to comprehension expressions. owner: codex. Notes: prevents `[Base for Base in items]` from hiding later `Base` references in the enclosing function; Airflow compact graph stayed stable at 104,622 references and 68,340 dependencies while staying 4.899x faster with 15.745x lower max RSS than Python parse/object materialization. From cab2c20230ca39dacd85aee106dc002e2ccd4128 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:20:49 -0700 Subject: [PATCH 027/228] Resolve Python module attribute references in Rust --- crates/graph-sitter-engine/src/lib.rs | 181 +++++++++++++- rust-rewrite/benchmarks.md | 10 +- .../apache-airflow-2.10.5-rust-compact.json | 234 +++++++++--------- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +- 5 files changed, 296 insertions(+), 135 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index acd84bf0a..3b62aa8d4 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -299,6 +299,7 @@ struct ReferenceCandidate { source_file_id: u32, source_symbol_id: Option, name: String, + qualifier: Option, range: SourceRange, } @@ -1192,6 +1193,35 @@ fn collect_identifier_candidates( } if node.kind() == "attribute" { + if let (Some(object), Some(attribute)) = ( + node.child_by_field_name("object"), + node.child_by_field_name("attribute"), + ) { + let range = attribute.range().into(); + if attribute.kind() == "identifier" && !range_matches_any(range, excluded_ranges) { + if let 
(Ok(qualifier), Ok(name)) = ( + object.utf8_text(source.as_bytes()), + attribute.utf8_text(source.as_bytes()), + ) { + let source_symbol_id = innermost_symbol_for_range(symbol_ranges, range); + if !qualified_reference_is_shadowed( + source_symbol_id, + qualifier, + object.range().into(), + local_bindings_by_symbol_id, + local_binding_scopes, + ) { + out.push(ReferenceCandidate { + source_file_id: file_id, + source_symbol_id, + name: name.to_owned(), + qualifier: Some(qualifier.to_owned()), + range, + }); + } + } + } + } if let Some(object) = node.child_by_field_name("object") { collect_identifier_candidates( file_id, @@ -1235,6 +1265,7 @@ fn collect_identifier_candidates( source_file_id: file_id, source_symbol_id, name: name.to_owned(), + qualifier: None, range, }); } @@ -1255,6 +1286,23 @@ fn collect_identifier_candidates( } } +fn qualified_reference_is_shadowed( + source_symbol_id: Option, + qualifier: &str, + range: SourceRange, + local_bindings_by_symbol_id: &HashMap>, + local_binding_scopes: &[LocalBindingScope], +) -> bool { + let binding = qualifier.split('.').next().unwrap_or(qualifier); + is_shadowed_local_binding( + source_symbol_id, + binding, + range, + local_bindings_by_symbol_id, + local_binding_scopes, + ) +} + fn collect_lambda_parameter_value_identifier_candidates( file_id: u32, source: &str, @@ -1495,11 +1543,20 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec = HashMap::new(); + let mut imported_module_by_qualifier: HashMap<(u32, String), (u32, u32)> = HashMap::new(); for import in &index.imports { let Some(resolution) = resolution_by_import_id.get(&import.id) else { continue; }; + if resolution.target_symbol_id.is_none() { + for qualifier in import_module_qualifiers(import) { + imported_module_by_qualifier.insert( + (import.file_id, qualifier), + (resolution.target_file_id, import.id), + ); + } + } let Some(target_symbol_id) = resolution.target_symbol_id else { continue; }; @@ -1511,17 +1568,28 @@ fn 
resolve_python_references(index: &mut PythonIndex, candidates: Vec Vec { + let mut qualifiers = Vec::new(); + if let Some(alias) = import.alias.as_deref() { + qualifiers.push(alias.to_owned()); + } + match import.kind { + ImportKind::Import => { + if let Some(name) = import.name.as_deref() { + qualifiers.push(name.to_owned()); + } + } + ImportKind::FromImport | ImportKind::FutureImport => { + if import.alias.is_none() { + if let Some(name) = import.name.as_deref() { + qualifiers.push(name.to_owned()); + } + } + } + } + qualifiers.sort(); + qualifiers.dedup(); + qualifiers +} + fn build_python_dependencies(index: &mut PythonIndex) { let symbol_file_ids: HashMap = index .symbols @@ -1921,6 +2013,73 @@ mod tests { })); } + #[test] + fn resolves_python_module_attribute_references() { + let repo = temp_repo_path("python-module-attribute-references"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/__init__.py"), "").unwrap(); + fs::write( + repo.join("pkg/base.py"), + "class Base:\n pass\n\ndef helper():\n return Base\n", + ) + .unwrap(); + fs::write( + repo.join("pkg/service.py"), + "from . 
import base\nimport pkg.base as base_alias\nimport pkg.base\n\n\ +def caller():\n return base.helper(), base_alias.Base, pkg.base.helper()\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let caller = index + .symbols + .iter() + .find(|symbol| symbol.name == "caller") + .unwrap(); + let helper = index + .symbols + .iter() + .find(|symbol| symbol.name == "helper") + .unwrap(); + let base = index + .symbols + .iter() + .find(|symbol| symbol.name == "Base") + .unwrap(); + + assert_eq!( + index + .references + .iter() + .filter(|reference| { + reference.source_symbol_id == Some(caller.id) + && reference.name == "helper" + && reference.target_symbol_id == helper.id + && reference.import_id.is_some() + }) + .count(), + 2 + ); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(caller.id) + && reference.name == "Base" + && reference.target_symbol_id == base.id + && reference.import_id.is_some() + })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == caller.id + && dependency.target_symbol_id == helper.id + && dependency.reference_count == 2 + })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == caller.id + && dependency.target_symbol_id == base.id + && dependency.reference_count == 1 + })); + } + #[test] fn skips_references_shadowed_by_python_parameters_and_locals() { let repo = temp_repo_path("python-shadowed-reference-sources"); diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index 6ac24dbd1..d82a8b564 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust 
references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.882s | 546.2 MB | 0.681s | 123.2 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 4.235x | 4.435x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.898s | 3468.8 MB | 4.061s | 220.3 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 104622 | 68340 | yes | 4.899x | 15.745x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.957s | 545.9 MB | 0.723s | 121.5 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 4.089x | 4.491x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.308s | 3470.9 MB | 4.038s | 259.1 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109282 | 71534 | yes | 4.781x | 13.394x | ## Pinned Compact Snapshot Evidence @@ -208,8 +208,8 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 104622 | `3a6a5c38e0485307841d9ba55aecbc5578d341cea1a295e133fa2f34d93f8133` | -| Dependencies | 68340 | `69840447840f50c6513e72d85086271996df8ea7104f6e510079e5d105ae0c51` | +| References | 109282 | `105a18ff136264aa95dc28220ff664fb8599cc3b54fc33bf4e80544332a24a9f` | +| Dependencies | 71534 | `98241ae1ab983f1d345ffd43ee23ac61d4b6bba40404da1f74f858c237e5961c` | The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the 
snapshot can pass. @@ -218,7 +218,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. - Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. Comprehension targets are scoped to the comprehension expression instead of leaking to the whole enclosing function. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. -- Attribute field names are skipped as bare-name references until full attribute/module resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. +- Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` now resolve when the qualifier maps to an indexed internal Python module. Other attribute field names are skipped as bare-name references until full attribute/type resolution exists. 
The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index 910376d33..90d0e11d9 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 68340, + "count": 71534, "samples": [ { "reference_count": 1, @@ -31,6 +31,13 @@ "target_file": "airflow/__main__.py", "target_symbol": "airflow/__main__.py:function:configure_internal_api@2850" }, + { + "reference_count": 1, + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_file": "airflow/cli/cli_parser.py", + "target_symbol": "airflow/cli/cli_parser.py:function:get_parser@4978" + }, { "reference_count": 1, "source_file": "airflow/__main__.py", @@ -38,6 +45,13 @@ "target_file": "airflow/configuration.py", "target_symbol": "airflow/configuration.py:function:write_webserver_configuration_if_needed@93544" }, + { + "reference_count": 1, + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_file": "airflow/configuration.py", + "target_symbol": 
"airflow/configuration.py:global_variable:conf@102463" + }, { "reference_count": 2, "source_file": "airflow/api/__init__.py", @@ -66,6 +80,27 @@ "target_file": "airflow/exceptions.py", "target_symbol": "airflow/exceptions.py:class:AirflowException@1246" }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/basic_auth.py", + "source_symbol": "airflow/api/auth/backend/basic_auth.py:function:auth_current_user@1530", + "target_file": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py", + "target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:auth_current_user@1480" + }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/basic_auth.py", + "source_symbol": "airflow/api/auth/backend/basic_auth.py:function:init_app@1480", + "target_file": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py", + "target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:init_app@1416" + }, + { + "reference_count": 1, + "source_file": "airflow/api/auth/backend/basic_auth.py", + "source_symbol": "airflow/api/auth/backend/basic_auth.py:function:requires_authentication@1618", + "target_file": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py", + "target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:requires_authentication@2134" + }, { "reference_count": 2, "source_file": "airflow/api/auth/backend/default.py", @@ -107,44 +142,9 @@ "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:decorated@5454", "target_file": "airflow/api/auth/backend/kerberos_auth.py", "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989" - }, - { - "reference_count": 1, - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:decorated@5454", - "target_file": 
"airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_unauthorized@3781" - }, - { - "reference_count": 1, - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_file": "airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" - }, - { - "reference_count": 4, - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_file": "airflow/api/auth/backend/kerberos_auth.py", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" - }, - { - "reference_count": 1, - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_file": "airflow/configuration.py", - "target_symbol": "airflow/configuration.py:global_variable:conf@102463" - }, - { - "reference_count": 1, - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_file": "airflow/utils/net.py", - "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" } ], - "sha256": "69840447840f50c6513e72d85086271996df8ea7104f6e510079e5d105ae0c51" + "sha256": "98241ae1ab983f1d345ffd43ee23ac61d4b6bba40404da1f74f858c237e5961c" }, "files": { "count": 4789, @@ -665,8 +665,19 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 104622, + "count": 109282, "samples": [ + { + "import": "airflow/__init__.py:from_import:airflow:settings:@2460", + "name": "initialize", + "range": [ + 3255, + 3265 + ], + "source_file": "airflow/__init__.py", + "source_symbol": null, + "target_symbol": 
"airflow/settings.py:function:initialize@29763" + }, { "import": null, "name": "__lazy_imports", @@ -678,6 +689,50 @@ "source_symbol": "airflow/__init__.py:function:__getattr__@4048", "target_symbol": "airflow/__init__.py:global_variable:__lazy_imports@3362" }, + { + "import": "airflow/__init__.py:from_import:airflow:settings:@2460", + "name": "LAZY_LOAD_PROVIDERS", + "range": [ + 5311, + 5330 + ], + "source_file": "airflow/__init__.py", + "source_symbol": null, + "target_symbol": "airflow/settings.py:global_variable:LAZY_LOAD_PROVIDERS@32556" + }, + { + "import": "airflow/__init__.py:from_import:airflow:settings:@2460", + "name": "LAZY_LOAD_PLUGINS", + "range": [ + 5569, + 5586 + ], + "source_file": "airflow/__init__.py", + "source_symbol": null, + "target_symbol": "airflow/settings.py:global_variable:LAZY_LOAD_PLUGINS@32249" + }, + { + "import": "airflow/__main__.py:from_import:airflow:configuration:@1596", + "name": "conf", + "range": [ + 1847, + 1851 + ], + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_symbol": "airflow/configuration.py:global_variable:conf@102463" + }, + { + "import": "airflow/__main__.py:from_import:airflow.cli:cli_parser:@1630", + "name": "get_parser", + "range": [ + 2060, + 2070 + ], + "source_file": "airflow/__main__.py", + "source_symbol": "airflow/__main__.py:function:main@1814", + "target_symbol": "airflow/cli/cli_parser.py:function:get_parser@4978" + }, { "import": "airflow/__main__.py:from_import:airflow.configuration:write_webserver_configuration_if_needed:@1665", "name": "write_webserver_configuration_if_needed", @@ -800,95 +855,40 @@ "target_symbol": "airflow/exceptions.py:class:RemovedInAirflow3Warning@16092" }, { - "import": null, - "name": "T", - "range": [ - 1151, - 1152 - ], - "source_file": "airflow/api/auth/backend/default.py", - "source_symbol": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", - "target_symbol": 
"airflow/api/auth/backend/default.py:global_variable:T@1078" - }, - { - "import": null, - "name": "T", - "range": [ - 1329, - 1330 - ], - "source_file": "airflow/api/auth/backend/default.py", - "source_symbol": "airflow/api/auth/backend/default.py:function:requires_authentication@1117", - "target_symbol": "airflow/api/auth/backend/default.py:global_variable:T@1078" - }, - { - "import": null, - "name": "T", - "range": [ - 1164, - 1165 - ], - "source_file": "airflow/api/auth/backend/deny_all.py", - "source_symbol": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", - "target_symbol": "airflow/api/auth/backend/deny_all.py:global_variable:T@1091" - }, - { - "import": null, - "name": "T", - "range": [ - 1343, - 1344 - ], - "source_file": "airflow/api/auth/backend/deny_all.py", - "source_symbol": "airflow/api/auth/backend/deny_all.py:function:requires_authentication@1130", - "target_symbol": "airflow/api/auth/backend/deny_all.py:global_variable:T@1091" - }, - { - "import": null, - "name": "KerberosService", - "range": [ - 3060, - 3075 - ], - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:class:KerberosService@2728" - }, - { - "import": "airflow/api/auth/backend/kerberos_auth.py:from_import:airflow.utils.net:getfqdn:@2564", - "name": "getfqdn", + "import": "airflow/api/auth/backend/basic_auth.py:import::airflow.providers.fab.auth_manager.api.auth.backend.basic_auth:fab_basic_auth@1013", + "name": "init_app", "range": [ - 3232, - 3239 + 1512, + 1520 ], - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_symbol": "airflow/utils/net.py:function:getfqdn@1031" + "source_file": "airflow/api/auth/backend/basic_auth.py", + "source_symbol": 
"airflow/api/auth/backend/basic_auth.py:function:init_app@1480", + "target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:init_app@1416" }, { - "import": null, - "name": "log", + "import": "airflow/api/auth/backend/basic_auth.py:import::airflow.providers.fab.auth_manager.api.auth.backend.basic_auth:fab_basic_auth@1013", + "name": "auth_current_user", "range": [ - 3246, - 3249 + 1592, + 1609 ], - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:log@2686" + "source_file": "airflow/api/auth/backend/basic_auth.py", + "source_symbol": "airflow/api/auth/backend/basic_auth.py:function:auth_current_user@1530", + "target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:auth_current_user@1480" }, { - "import": null, - "name": "_KERBEROS_SERVICE", + "import": "airflow/api/auth/backend/basic_auth.py:import::airflow.providers.fab.auth_manager.api.auth.backend.basic_auth:fab_basic_auth@1013", + "name": "requires_authentication", "range": [ - 3320, - 3337 + 1689, + 1712 ], - "source_file": "airflow/api/auth/backend/kerberos_auth.py", - "source_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:init_app@3084", - "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:global_variable:_KERBEROS_SERVICE@3040" + "source_file": "airflow/api/auth/backend/basic_auth.py", + "source_symbol": "airflow/api/auth/backend/basic_auth.py:function:requires_authentication@1618", + "target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:requires_authentication@2134" } ], - "sha256": "3a6a5c38e0485307841d9ba55aecbc5578d341cea1a295e133fa2f34d93f8133" + "sha256": "105a18ff136264aa95dc28220ff664fb8599cc3b54fc33bf4e80544332a24a9f" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 
36617627, "classes": 5665, - "dependencies": 68340, + "dependencies": 71534, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 104622, + "references": 109282, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 646fafcd1..0d312f8f1 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. 
Full lexical scoping, full attribute/module resolution, and `nonlocal` remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` resolve when the qualifier points to an indexed internal Python module. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping, full attribute/type resolution, and `nonlocal` remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. 
diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index 6cc137931..a8f8c51f8 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -194,7 +194,8 @@ Recommended task format: - [x] Exclude compact Python references shadowed by lambda parameters. owner: codex. Result: adds range-scoped lambda-body bindings so lambda parameters shadow inside the lambda body without hiding legitimate default-value references such as `lambda local=Base: local`; this checkout and pinned Airflow stayed graph-stable at 4,089 and 105,467 references respectively. - [x] Preserve compact Python references for `global` declarations. owner: codex. Result: `global` names are removed from the function-local shadow set, so module-level writes and uses remain visible; Airflow compact coverage now emits 105,607 references and 68,917 dependencies. - [x] Stop treating Python attribute field names as bare compact references. owner: codex. Result: scans the object side of attribute expressions but skips the field-name side; Airflow compact coverage now emits 104,622 references and 68,340 dependencies. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/module resolution, `nonlocal`, and order-sensitive scopes. +- [x] Resolve compact Python references through imported module attributes. owner: codex. Result: resolves `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` when the qualifier maps to an indexed internal Python module; Airflow compact coverage now emits 109,282 references and 71,534 dependencies. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/type resolution, `nonlocal`, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. 
- [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -280,3 +281,4 @@ Recommended task format: - [x] 2026-06-18: Added `global` declaration handling for compact Python references. owner: codex. Notes: `global` declarations no longer hide module-level symbols behind local assignment shadows; Airflow compact graph now emits 105,607 references and 68,917 dependencies while staying 4.987x faster with 13.393x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Skipped Python attribute field names as bare compact references. owner: codex. Notes: object-side references still resolve, but `obj.helper` no longer creates a false standalone `helper` dependency; Airflow compact graph now emits 104,622 references and 68,340 dependencies while staying 5.023x faster with 15.744x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Scoped comprehension target shadowing to comprehension expressions. owner: codex. Notes: prevents `[Base for Base in items]` from hiding later `Base` references in the enclosing function; Airflow compact graph stayed stable at 104,622 references and 68,340 dependencies while staying 4.899x faster with 15.745x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added imported module member references to the compact Rust graph. owner: codex. Notes: `module.some_func`, `alias.SomeClass`, and exact `pkg.module.some_func` qualifiers now resolve through existing import-resolution rows; Airflow compact graph now emits 109,282 references and 71,534 dependencies while staying 4.781x faster with 13.394x lower max RSS than Python parse/object materialization. 
From 31b3ce0313b7ccba848adcaed7a6aa42498ddbf3 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:24:42 -0700 Subject: [PATCH 028/228] Honor Python nonlocal declarations in Rust references --- crates/graph-sitter-engine/src/lib.rs | 27 +++++++++++++++++++++++++++ rust-rewrite/benchmarks.md | 6 +++--- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 4 +++- 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 3b62aa8d4..9ccbb608d 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -801,6 +801,17 @@ fn collect_local_bindings_from_node( } return; } + "nonlocal_statement" => { + if let Some(source_symbol_id) = + innermost_symbol_for_range(symbol_ranges, node.range().into()) + { + bindings + .entry(source_symbol_id) + .or_default() + .extend(declaration_names(source, node)); + } + return; + } "assignment" | "annotated_assignment" | "augmented_assignment" => { if let Some(left) = node.child_by_field_name("left") { push_local_binding_targets(source, left, symbol_ranges, bindings); @@ -2103,6 +2114,7 @@ def comprehension_scope_does_not_leak(items):\n values = [Base + helper for B def match_shadowed(subject):\n match subject:\n case Point(x=Base, y=helper) as other if Base:\n return Base, helper, other\n case {\"base\": Base, \"helper\": helper, **other}:\n return Base, helper, other\n\n\ def lambda_shadowed():\n return (lambda Base, helper, *other: (Base, helper, other))\n\n\ def lambda_default_ref():\n return (lambda local=Base: local)\n\n\ +def nonlocal_declared():\n helper = Base\n def inner():\n nonlocal helper\n helper = Base\n return helper\n return inner\n\n\ def global_declared():\n global other\n other = Base\n return other\n\n\ def attribute_names_are_not_bare_references(obj):\n return obj.helper, other.helper, helper.attr\n\n\ def caller():\n return helper()\n", @@ -2157,6 +2169,11 @@ def 
caller():\n return helper()\n", .iter() .find(|symbol| symbol.name == "lambda_default_ref") .unwrap(); + let nonlocal_declared_inner = index + .symbols + .iter() + .find(|symbol| symbol.name == "inner") + .unwrap(); let global_declared = index .symbols .iter() @@ -2261,6 +2278,16 @@ def caller():\n return helper()\n", && reference.name == "Base" && reference.target_symbol_id == base.id })); + assert!(!index.references.iter().any(|reference| { + reference.source_symbol_id == Some(nonlocal_declared_inner.id) + && reference.name == "helper" + && reference.target_symbol_id == helper.id + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(nonlocal_declared_inner.id) + && reference.name == "Base" + && reference.target_symbol_id == base.id + })); assert_eq!( index .references diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index d82a8b564..c14e7ea10 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.957s | 545.9 MB | 0.723s | 121.5 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 4.089x | 4.491x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.308s | 3470.9 MB | 4.038s | 259.1 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109282 | 71534 | yes | 4.781x | 13.394x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.849s | 544.7 MB | 0.718s | 123.4 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 
| yes | 3.968x | 4.414x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.557s | 3470.3 MB | 3.979s | 262.0 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109282 | 71534 | yes | 4.663x | 13.244x | ## Pinned Compact Snapshot Evidence @@ -217,7 +217,7 @@ Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. -- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. Comprehension targets are scoped to the comprehension expression instead of leaking to the whole enclosing function. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. +- Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. 
Comprehension targets are scoped to the comprehension expression instead of leaking to the whole enclosing function. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. - Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` now resolve when the qualifier maps to an indexed internal Python module. Other attribute field names are skipped as bare-name references until full attribute/type resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 0d312f8f1..74b3bf467 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. 
Target symbols now include top-level classes, functions, and simple top-level globals. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, and nested definitions shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` resolve when the qualifier points to an indexed internal Python module. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping, full attribute/type resolution, and `nonlocal` remain future work. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. 
Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` resolve when the qualifier points to an indexed internal Python module. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping and full attribute/type resolution remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index a8f8c51f8..dbf4c8d3e 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -195,7 +195,8 @@ Recommended task format: - [x] Preserve compact Python references for `global` declarations. owner: codex. Result: `global` names are removed from the function-local shadow set, so module-level writes and uses remain visible; Airflow compact coverage now emits 105,607 references and 68,917 dependencies. - [x] Stop treating Python attribute field names as bare compact references. owner: codex. Result: scans the object side of attribute expressions but skips the field-name side; Airflow compact coverage now emits 104,622 references and 68,340 dependencies. - [x] Resolve compact Python references through imported module attributes. owner: codex. 
Result: resolves `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` when the qualifier maps to an indexed internal Python module; Airflow compact coverage now emits 109,282 references and 71,534 dependencies. -- [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/type resolution, `nonlocal`, and order-sensitive scopes. +- [x] Exclude compact Python references shadowed by `nonlocal` declarations. owner: codex. Result: prevents closure variables declared `nonlocal` from resolving to imported/top-level symbols in nested functions; this checkout and pinned Airflow stayed graph-stable at 4,110 and 109,282 references respectively. +- [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/type resolution, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. - [ ] Implement superclass/interface dependency edges. @@ -282,3 +283,4 @@ Recommended task format: - [x] 2026-06-18: Skipped Python attribute field names as bare compact references. owner: codex. Notes: object-side references still resolve, but `obj.helper` no longer creates a false standalone `helper` dependency; Airflow compact graph now emits 104,622 references and 68,340 dependencies while staying 5.023x faster with 15.744x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Scoped comprehension target shadowing to comprehension expressions. owner: codex. 
Notes: prevents `[Base for Base in items]` from hiding later `Base` references in the enclosing function; Airflow compact graph stayed stable at 104,622 references and 68,340 dependencies while staying 4.899x faster with 15.745x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added imported module member references to the compact Rust graph. owner: codex. Notes: `module.some_func`, `alias.SomeClass`, and exact `pkg.module.some_func` qualifiers now resolve through existing import-resolution rows; Airflow compact graph now emits 109,282 references and 71,534 dependencies while staying 4.781x faster with 13.394x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added `nonlocal` declaration shadowing for compact Python references. owner: codex. Notes: `nonlocal helper` inside nested functions no longer creates a false imported/top-level `helper` reference; Airflow compact graph stayed stable at 109,282 references and 71,534 dependencies while staying 4.663x faster with 13.244x lower max RSS than Python parse/object materialization. 
From 6de5180f346723de7bc23118757a32321d116681 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:29:35 -0700 Subject: [PATCH 029/228] Resolve Python package reexports in Rust --- crates/graph-sitter-engine/src/lib.rs | 113 ++++++++++++++++++ rust-rewrite/benchmarks.md | 11 +- .../apache-airflow-2.10.5-rust-compact.json | 14 +-- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 2 + 5 files changed, 129 insertions(+), 13 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 9ccbb608d..53e5b5042 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1539,6 +1539,66 @@ fn resolve_python_imports(index: &mut PythonIndex) { } } index.import_resolutions = resolutions; + resolve_python_reexport_imports(index); +} + +fn resolve_python_reexport_imports(index: &mut PythonIndex) { + let import_by_id: HashMap = index + .imports + .iter() + .map(|import| (import.id, import)) + .collect(); + + for _ in 0..index.import_resolutions.len() { + let resolution_by_import_id: HashMap = index + .import_resolutions + .iter() + .map(|resolution| (resolution.import_id, resolution)) + .collect(); + let mut reexported_symbol_by_file_binding: HashMap<(u32, String), u32> = HashMap::new(); + + for import in &index.imports { + let Some(binding) = import_binding_name(import) else { + continue; + }; + let Some(resolution) = resolution_by_import_id.get(&import.id) else { + continue; + }; + let Some(target_symbol_id) = resolution.target_symbol_id else { + continue; + }; + reexported_symbol_by_file_binding.insert((import.file_id, binding), target_symbol_id); + } + + let mut changed = false; + for resolution in &mut index.import_resolutions { + if resolution.target_symbol_id.is_some() { + continue; + } + let Some(import) = import_by_id.get(&resolution.import_id) else { + continue; + }; + if import.kind != ImportKind::FromImport { + continue; + } + let Some(name) = 
import.name.as_deref() else { + continue; + }; + if name == "*" { + continue; + } + if let Some(target_symbol_id) = + reexported_symbol_by_file_binding.get(&(resolution.target_file_id, name.to_owned())) + { + resolution.target_symbol_id = Some(*target_symbol_id); + changed = true; + } + } + + if !changed { + break; + } + } } fn resolve_python_references(index: &mut PythonIndex, candidates: Vec) { @@ -1966,6 +2026,59 @@ mod tests { })); } + #[test] + fn resolves_python_package_reexports_to_symbols() { + let repo = temp_repo_path("resolve-python-reexports"); + fs::create_dir_all(repo.join("pkg")).unwrap(); + fs::write(repo.join("pkg/__init__.py"), "from .base import Base\n").unwrap(); + fs::write(repo.join("pkg/base.py"), "class Base:\n pass\n").unwrap(); + fs::write( + repo.join("pkg/service.py"), + "from pkg import Base\n\nclass Service(Base):\n pass\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let base_file_id = index + .files + .iter() + .find(|file| file.path == "pkg/base.py") + .unwrap() + .id; + let base_symbol_id = index + .symbols + .iter() + .find(|symbol| symbol.file_id == base_file_id && symbol.name == "Base") + .unwrap() + .id; + let service = index + .symbols + .iter() + .find(|symbol| symbol.name == "Service") + .unwrap(); + + assert_eq!( + index + .import_resolutions + .iter() + .filter(|resolution| resolution.target_symbol_id == Some(base_symbol_id)) + .count(), + 2 + ); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(service.id) + && reference.name == "Base" + && reference.target_symbol_id == base_symbol_id + && reference.import_id.is_some() + })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == service.id + && dependency.target_symbol_id == base_symbol_id + })); + } + #[test] fn attributes_references_to_innermost_python_symbol() { let repo = temp_repo_path("nested-python-reference-sources"); diff 
--git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index c14e7ea10..b502861b5 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.849s | 544.7 MB | 0.718s | 123.4 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 3.968x | 4.414x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.557s | 3470.3 MB | 3.979s | 262.0 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109282 | 71534 | yes | 4.663x | 13.244x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.732s | 543.6 MB | 0.701s | 123.5 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 3.897x | 4.400x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.456s | 3469.7 MB | 4.045s | 260.8 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109655 | 71788 | yes | 4.562x | 13.307x | ## Pinned Compact Snapshot Evidence @@ -207,15 +207,16 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Files | 4789 | `226e8cb32dc0a23ec956e97b036e7c505037df979cce7182514f39a43b07cb80` | | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | -| Import resolutions | 19011 | `84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9` | -| References | 109282 | 
`105a18ff136264aa95dc28220ff664fb8599cc3b54fc33bf4e80544332a24a9f` | -| Dependencies | 71534 | `98241ae1ab983f1d345ffd43ee23ac61d4b6bba40404da1f74f858c237e5961c` | +| Import resolutions | 19011 | `e5d6d4f515bce5c73bfbd3cc354e0941bad57fdba3965fca3fb148a3421cc5a9` | +| References | 109655 | `21bdfa05e523c642534e28020fbc87e69d7dbfb80dcd2c1d064295d5173910e0` | +| Dependencies | 71788 | `ef46cd4dfa1aaf4a5232bd2e574e5b442944387c9d0e7456f2577be857fb6ac7` | The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. +- Direct package re-exports are resolved for indexed internal modules when the package file has a matching imported binding. Wildcard re-export expansion and ambiguous external re-export chains remain future work. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. - Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. 
Comprehension targets are scoped to the comprehension expression instead of leaking to the whole enclosing function. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. - Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` now resolve when the qualifier maps to an indexed internal Python module. Other attribute field names are skipped as bare-name references until full attribute/type resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index 90d0e11d9..768d6c438 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 71534, + "count": 71788, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989" } ], - "sha256": "98241ae1ab983f1d345ffd43ee23ac61d4b6bba40404da1f74f858c237e5961c" + "sha256": "ef46cd4dfa1aaf4a5232bd2e574e5b442944387c9d0e7456f2577be857fb6ac7" }, "files": { "count": 4789, @@ -416,7 +416,7 @@ "target_symbol": null } ], - "sha256": "84477dc0f9cd1caea726c1305b8c642ae2104769e8dbd1a9e97faa2f7726d8c9" + "sha256": "e5d6d4f515bce5c73bfbd3cc354e0941bad57fdba3965fca3fb148a3421cc5a9" }, "imports": { "count": 40580, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 109282, + "count": 109655, "samples": [ { "import": "airflow/__init__.py:from_import:airflow:settings:@2460", @@ -888,7 +888,7 @@ 
"target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:requires_authentication@2134" } ], - "sha256": "105a18ff136264aa95dc28220ff664fb8599cc3b54fc33bf4e80544332a24a9f" + "sha256": "21bdfa05e523c642534e28020fbc87e69d7dbfb80dcd2c1d064295d5173910e0" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 71534, + "dependencies": 71788, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 109282, + "references": 109655, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 74b3bf467..e39fe56e3 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -124,7 +124,7 @@ Current implemented bridge status: - `PythonIndex.references_json()` exposes compact symbol reference records. - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. -- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, and simple top-level globals. +- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, simple top-level globals, and direct package re-exports such as `from pkg import Symbol` when `pkg/__init__.py` re-exports the symbol from an internal module. 
- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` resolve when the qualifier points to an indexed internal Python module. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping and full attribute/type resolution remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index dbf4c8d3e..e960828a4 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -196,6 +196,7 @@ Recommended task format: - [x] Stop treating Python attribute field names as bare compact references. owner: codex. 
Result: scans the object side of attribute expressions but skips the field-name side; Airflow compact coverage now emits 104,622 references and 68,340 dependencies. - [x] Resolve compact Python references through imported module attributes. owner: codex. Result: resolves `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` when the qualifier maps to an indexed internal Python module; Airflow compact coverage now emits 109,282 references and 71,534 dependencies. - [x] Exclude compact Python references shadowed by `nonlocal` declarations. owner: codex. Result: prevents closure variables declared `nonlocal` from resolving to imported/top-level symbols in nested functions; this checkout and pinned Airflow stayed graph-stable at 4,110 and 109,282 references respectively. +- [x] Resolve direct Python package re-export imports. owner: codex. Result: `from pkg import Symbol` follows matching imported bindings in `pkg/__init__.py` to the original internal symbol; Airflow compact coverage now emits 109,655 references and 71,788 dependencies. - [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/type resolution, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. @@ -284,3 +285,4 @@ Recommended task format: - [x] 2026-06-18: Scoped comprehension target shadowing to comprehension expressions. owner: codex. Notes: prevents `[Base for Base in items]` from hiding later `Base` references in the enclosing function; Airflow compact graph stayed stable at 104,622 references and 68,340 dependencies while staying 4.899x faster with 15.745x lower max RSS than Python parse/object materialization. 
- [x] 2026-06-18: Added imported module member references to the compact Rust graph. owner: codex. Notes: `module.some_func`, `alias.SomeClass`, and exact `pkg.module.some_func` qualifiers now resolve through existing import-resolution rows; Airflow compact graph now emits 109,282 references and 71,534 dependencies while staying 4.781x faster with 13.394x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added `nonlocal` declaration shadowing for compact Python references. owner: codex. Notes: `nonlocal helper` inside nested functions no longer creates a false imported/top-level `helper` reference; Airflow compact graph stayed stable at 109,282 references and 71,534 dependencies while staying 4.663x faster with 13.244x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added direct Python package re-export import resolution. owner: codex. Notes: `from pkg import Symbol` now follows matching imported bindings in `pkg/__init__.py`; Airflow compact graph now emits 109,655 references and 71,788 dependencies while staying 4.562x faster with 13.307x lower max RSS than Python parse/object materialization. 
From d175b8362fa8c75195569205655d0d8cef368e07 Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:36:46 -0700 Subject: [PATCH 030/228] Resolve Python wildcard imports in Rust --- crates/graph-sitter-engine/src/lib.rs | 203 ++++++++++++++++-- rust-rewrite/benchmarks.md | 12 +- .../apache-airflow-2.10.5-rust-compact.json | 14 +- rust-rewrite/python-compat.md | 4 +- rust-rewrite/strategy.md | 2 + 5 files changed, 198 insertions(+), 37 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 53e5b5042..bb4f7bf9e 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -310,6 +310,8 @@ struct LocalBindingScope { names: HashSet, } +type ExportedSymbolsByFile = HashMap>; + impl PythonIndexer { fn new() -> Result { let mut parser = Parser::new(); @@ -1550,25 +1552,7 @@ fn resolve_python_reexport_imports(index: &mut PythonIndex) { .collect(); for _ in 0..index.import_resolutions.len() { - let resolution_by_import_id: HashMap = index - .import_resolutions - .iter() - .map(|resolution| (resolution.import_id, resolution)) - .collect(); - let mut reexported_symbol_by_file_binding: HashMap<(u32, String), u32> = HashMap::new(); - - for import in &index.imports { - let Some(binding) = import_binding_name(import) else { - continue; - }; - let Some(resolution) = resolution_by_import_id.get(&import.id) else { - continue; - }; - let Some(target_symbol_id) = resolution.target_symbol_id else { - continue; - }; - reexported_symbol_by_file_binding.insert((import.file_id, binding), target_symbol_id); - } + let exported_symbols_by_file = python_exported_symbols_by_file(index); let mut changed = false; for resolution in &mut index.import_resolutions { @@ -1587,8 +1571,9 @@ fn resolve_python_reexport_imports(index: &mut PythonIndex) { if name == "*" { continue; } - if let Some(target_symbol_id) = - reexported_symbol_by_file_binding.get(&(resolution.target_file_id, name.to_owned())) + if 
let Some(target_symbol_id) = exported_symbols_by_file + .get(&resolution.target_file_id) + .and_then(|exports| exports.get(name)) { resolution.target_symbol_id = Some(*target_symbol_id); changed = true; @@ -1601,6 +1586,62 @@ fn resolve_python_reexport_imports(index: &mut PythonIndex) { } } +fn python_exported_symbols_by_file(index: &PythonIndex) -> ExportedSymbolsByFile { + let resolution_by_import_id: HashMap = index + .import_resolutions + .iter() + .map(|resolution| (resolution.import_id, resolution)) + .collect(); + let mut exports: ExportedSymbolsByFile = HashMap::new(); + + for symbol in index.symbols.iter().filter(|symbol| symbol.is_top_level) { + exports + .entry(symbol.file_id) + .or_default() + .insert(symbol.name.clone(), symbol.id); + } + + for _ in 0..index.imports.len().max(1) { + let previous_exports = exports.clone(); + + for import in &index.imports { + if import.kind == ImportKind::FutureImport { + continue; + } + let Some(resolution) = resolution_by_import_id.get(&import.id) else { + continue; + }; + if is_wildcard_import(import) { + let Some(target_exports) = previous_exports.get(&resolution.target_file_id) else { + continue; + }; + let file_exports = exports.entry(import.file_id).or_default(); + for (name, target_symbol_id) in target_exports { + file_exports.insert(name.clone(), *target_symbol_id); + } + continue; + } + + let Some(binding) = import_binding_name(import) else { + continue; + }; + let Some(target_symbol_id) = resolution.target_symbol_id else { + continue; + }; + exports + .entry(import.file_id) + .or_default() + .insert(binding, target_symbol_id); + } + + if exports == previous_exports { + break; + } + } + + exports +} + fn resolve_python_references(index: &mut PythonIndex, candidates: Vec) { let symbol_to_id: HashMap<(u32, &str), u32> = index .symbols @@ -1613,6 +1654,7 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec = HashMap::new(); let mut imported_module_by_qualifier: HashMap<(u32, String), (u32, 
u32)> = HashMap::new(); @@ -1620,6 +1662,17 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec Option { .as_deref() .and_then(|name| name.split('.').next()) .map(str::to_owned), - ImportKind::FromImport | ImportKind::FutureImport => import.name.clone(), + ImportKind::FromImport | ImportKind::FutureImport => import + .name + .as_ref() + .filter(|name| name.as_str() != "*") + .cloned(), } } +fn is_wildcard_import(import: &ImportRecord) -> bool { + matches!( + import.kind, + ImportKind::FromImport | ImportKind::FutureImport + ) && import.name.as_deref() == Some("*") +} + fn resolve_plain_import(import: &ImportRecord, module_to_file: &HashMap<&str, u32>) -> Option { let name = import.name.as_deref()?; module_to_file.get(name).copied() @@ -2079,6 +2143,101 @@ mod tests { })); } + #[test] + fn resolves_python_wildcard_import_chains_to_symbols() { + let repo = temp_repo_path("resolve-python-wildcard-reexports"); + fs::create_dir_all(repo.join("pkg/inner")).unwrap(); + fs::write( + repo.join("pkg/base.py"), + "CONSTANT = 1\nclass Base:\n pass\n\ndef helper():\n return CONSTANT\n", + ) + .unwrap(); + fs::write( + repo.join("pkg/inner/__init__.py"), + "from ..base import *\nINNER = CONSTANT\n", + ) + .unwrap(); + fs::write(repo.join("pkg/__init__.py"), "from .inner import *\n").unwrap(); + fs::write(repo.join("facade.py"), "from pkg import *\n").unwrap(); + fs::write( + repo.join("service.py"), + "from pkg import Base\nfrom facade import *\n\nclass Service(Base):\n def run(self):\n return helper(), CONSTANT\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let base = index + .symbols + .iter() + .find(|symbol| symbol.name == "Base") + .unwrap(); + let constant = index + .symbols + .iter() + .find(|symbol| symbol.name == "CONSTANT") + .unwrap(); + let helper = index + .symbols + .iter() + .find(|symbol| symbol.name == "helper") + .unwrap(); + let inner = index + .symbols + .iter() + 
.find(|symbol| symbol.name == "INNER") + .unwrap(); + let service = index + .symbols + .iter() + .find(|symbol| symbol.name == "Service") + .unwrap(); + let run = index + .symbols + .iter() + .find(|symbol| symbol.name == "run") + .unwrap(); + + assert!(index + .import_resolutions + .iter() + .any(|resolution| { resolution.target_symbol_id == Some(base.id) })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(inner.id) + && reference.name == "CONSTANT" + && reference.target_symbol_id == constant.id + && reference.import_id.is_some() + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(service.id) + && reference.name == "Base" + && reference.target_symbol_id == base.id + && reference.import_id.is_some() + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(run.id) + && reference.name == "helper" + && reference.target_symbol_id == helper.id + && reference.import_id.is_some() + })); + assert!(index.references.iter().any(|reference| { + reference.source_symbol_id == Some(run.id) + && reference.name == "CONSTANT" + && reference.target_symbol_id == constant.id + && reference.import_id.is_some() + })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == service.id && dependency.target_symbol_id == base.id + })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == run.id && dependency.target_symbol_id == helper.id + })); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == run.id && dependency.target_symbol_id == constant.id + })); + } + #[test] fn attributes_references_to_innermost_python_symbol() { let repo = temp_repo_path("nested-python-reference-sources"); diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index b502861b5..e73803400 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 
@@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.732s | 543.6 MB | 0.701s | 123.5 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 3.897x | 4.400x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.456s | 3469.7 MB | 4.045s | 260.8 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109655 | 71788 | yes | 4.562x | 13.307x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.865s | 543.9 MB | 0.749s | 123.7 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 3.824x | 4.397x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.414s | 3469.6 MB | 4.040s | 264.1 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109743 | 71863 | yes | 4.806x | 13.136x | ## Pinned Compact Snapshot Evidence @@ -207,16 +207,16 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Files | 4789 | `226e8cb32dc0a23ec956e97b036e7c505037df979cce7182514f39a43b07cb80` | | Symbols | 52339 | `d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | -| Import resolutions | 19011 | `e5d6d4f515bce5c73bfbd3cc354e0941bad57fdba3965fca3fb148a3421cc5a9` | -| References | 109655 | `21bdfa05e523c642534e28020fbc87e69d7dbfb80dcd2c1d064295d5173910e0` | -| Dependencies | 71788 | `ef46cd4dfa1aaf4a5232bd2e574e5b442944387c9d0e7456f2577be857fb6ac7` | +| Import resolutions | 19011 | 
`84df9ba7bf069278f61ac2a4891d8b4cb38b25f4f63ce20dd77eada1ba654278` | +| References | 109743 | `d369c16c4c153e5902f301a5ecf9721c914fee3b0a5bcaf1ac9f837cb14099cb` | +| Dependencies | 71863 | `18e315d2d122a9c7808ac4b7544afa7d25b2d97bc78c0f5124f6554039a4a5c9` | The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. Important caveats: - The Rust indexer currently extracts a compact subset: files, top-level Python classes/functions/globals, nested Python class/function records for source attribution, imports, internal import-resolution records, first-slice Python symbol reference records, and de-duplicated dependency records for indexed Python modules. -- Direct package re-exports are resolved for indexed internal modules when the package file has a matching imported binding. Wildcard re-export expansion and ambiguous external re-export chains remain future work. +- Direct package re-exports and wildcard import/re-export chains are resolved for indexed internal modules when the package file exposes a matching imported binding. `__all__`, order-sensitive wildcard binding semantics, and ambiguous external re-export chains remain future work. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. - Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. 
Comprehension targets are scoped to the comprehension expression instead of leaking to the whole enclosing function. `global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. - Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` now resolve when the qualifier maps to an indexed internal Python module. Other attribute field names are skipped as bare-name references until full attribute/type resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index 768d6c438..20c33ca7e 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 71788, + "count": 71863, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989" } ], - "sha256": "ef46cd4dfa1aaf4a5232bd2e574e5b442944387c9d0e7456f2577be857fb6ac7" + "sha256": "18e315d2d122a9c7808ac4b7544afa7d25b2d97bc78c0f5124f6554039a4a5c9" }, "files": { "count": 4789, @@ -416,7 +416,7 @@ "target_symbol": null } ], - "sha256": "e5d6d4f515bce5c73bfbd3cc354e0941bad57fdba3965fca3fb148a3421cc5a9" + "sha256": "84df9ba7bf069278f61ac2a4891d8b4cb38b25f4f63ce20dd77eada1ba654278" }, "imports": { "count": 40580, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 109655, + "count": 109743, "samples": [ { "import": "airflow/__init__.py:from_import:airflow:settings:@2460", @@ -888,7 +888,7 @@ 
"target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:requires_authentication@2134" } ], - "sha256": "21bdfa05e523c642534e28020fbc87e69d7dbfb80dcd2c1d064295d5173910e0" + "sha256": "d369c16c4c153e5902f301a5ecf9721c914fee3b0a5bcaf1ac9f837cb14099cb" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 71788, + "dependencies": 71863, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 109655, + "references": 109743, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index e39fe56e3..8bc17ab00 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -124,8 +124,8 @@ Current implemented bridge status: - `PythonIndex.references_json()` exposes compact symbol reference records. - `PythonIndex.dependencies_json()` exposes compact dependency edge records. - `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. -- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, simple top-level globals, and direct package re-exports such as `from pkg import Symbol` when `pkg/__init__.py` re-exports the symbol from an internal module. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. 
Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` resolve when the qualifier points to an indexed internal Python module. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping and full attribute/type resolution remain future work. +- Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, simple top-level globals, direct package re-exports such as `from pkg import Symbol`, and named imports through wildcard-backed package files when the wildcard chain stays inside indexed internal modules. +- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations shadow imported/top-level names in this pass. 
Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` resolve when the qualifier points to an indexed internal Python module. Bare names imported through wildcard import chains resolve when the chain exposes an indexed internal symbol. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping and full attribute/type resolution remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index e960828a4..cbe96e77c 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -197,6 +197,7 @@ Recommended task format: - [x] Resolve compact Python references through imported module attributes. owner: codex. Result: resolves `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` when the qualifier maps to an indexed internal Python module; Airflow compact coverage now emits 109,282 references and 71,534 dependencies. - [x] Exclude compact Python references shadowed by `nonlocal` declarations. owner: codex. 
Result: prevents closure variables declared `nonlocal` from resolving to imported/top-level symbols in nested functions; this checkout and pinned Airflow stayed graph-stable at 4,110 and 109,282 references respectively. - [x] Resolve direct Python package re-export imports. owner: codex. Result: `from pkg import Symbol` follows matching imported bindings in `pkg/__init__.py` to the original internal symbol; Airflow compact coverage now emits 109,655 references and 71,788 dependencies. +- [x] Resolve Python wildcard import and re-export chains. owner: codex. Result: compact exported-name tables now propagate `from module import *` across indexed internal modules and feed named imports, references, and dependency edges; Airflow compact coverage now emits 109,743 references and 71,863 dependencies. - [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/type resolution, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. @@ -286,3 +287,4 @@ Recommended task format: - [x] 2026-06-18: Added imported module member references to the compact Rust graph. owner: codex. Notes: `module.some_func`, `alias.SomeClass`, and exact `pkg.module.some_func` qualifiers now resolve through existing import-resolution rows; Airflow compact graph now emits 109,282 references and 71,534 dependencies while staying 4.781x faster with 13.394x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added `nonlocal` declaration shadowing for compact Python references. owner: codex. 
Notes: `nonlocal helper` inside nested functions no longer creates a false imported/top-level `helper` reference; Airflow compact graph stayed stable at 109,282 references and 71,534 dependencies while staying 4.663x faster with 13.244x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added direct Python package re-export import resolution. owner: codex. Notes: `from pkg import Symbol` now follows matching imported bindings in `pkg/__init__.py`; Airflow compact graph now emits 109,655 references and 71,788 dependencies while staying 4.562x faster with 13.307x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added wildcard import/re-export chain resolution to the compact Rust graph. owner: codex. Notes: fixed-point exported-name tables now propagate `from module import *` across indexed internal modules; Airflow compact graph now emits 109,743 references and 71,863 dependencies while staying 4.806x faster with 13.136x lower max RSS than Python parse/object materialization. 
From 1a48aaac91d70c9f94b77dcf3270bbe4fd45487a Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:41:52 -0700 Subject: [PATCH 031/228] Resolve nested Python module attributes in Rust --- crates/graph-sitter-engine/src/lib.rs | 165 +++++++++++++++++- rust-rewrite/benchmarks.md | 10 +- .../apache-airflow-2.10.5-rust-compact.json | 12 +- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 2 + 5 files changed, 175 insertions(+), 16 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index bb4f7bf9e..0d1b056c7 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1643,6 +1643,12 @@ fn python_exported_symbols_by_file(index: &PythonIndex) -> ExportedSymbolsByFile } fn resolve_python_references(index: &mut PythonIndex, candidates: Vec) { + let module_to_file: HashMap<&str, u32> = index + .files + .iter() + .filter_map(|file| file.module_name.as_deref().map(|module| (module, file.id))) + .collect(); + let internal_module_prefixes = internal_python_module_prefixes(&index.files); let symbol_to_id: HashMap<(u32, &str), u32> = index .symbols .iter() @@ -1657,13 +1663,23 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec = HashMap::new(); let mut imported_module_by_qualifier: HashMap<(u32, String), (u32, u32)> = HashMap::new(); + let mut imported_module_prefix_by_binding: HashMap<(u32, String), (String, u32)> = + HashMap::new(); for import in &index.imports { - let Some(resolution) = resolution_by_import_id.get(&import.id) else { - continue; - }; + if let Some(source_file) = index.files.get(import.file_id as usize) { + for (binding, module_prefix) in + import_module_prefix_bindings(import, source_file, &internal_module_prefixes) + { + imported_module_prefix_by_binding + .insert((import.file_id, binding), (module_prefix, import.id)); + } + } + let resolution = resolution_by_import_id.get(&import.id); if is_wildcard_import(import) { - if 
let Some(target_exports) = exported_symbols_by_file.get(&resolution.target_file_id) { + if let Some(target_exports) = resolution + .and_then(|resolution| exported_symbols_by_file.get(&resolution.target_file_id)) + { for (binding, target_symbol_id) in target_exports { imported_symbol_by_binding.insert( (import.file_id, binding.clone()), @@ -1673,6 +1689,9 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec HashSet { + let mut prefixes = HashSet::new(); + for module in files.iter().filter_map(|file| file.module_name.as_deref()) { + let parts = module.split('.').collect::>(); + for i in 1..=parts.len() { + prefixes.insert(parts[..i].join(".")); + } + } + prefixes +} + +fn import_module_prefix_bindings( + import: &ImportRecord, + source_file: &FileRecord, + internal_module_prefixes: &HashSet, +) -> Vec<(String, String)> { + if is_wildcard_import(import) || import.kind == ImportKind::FutureImport { + return Vec::new(); + } + + let mut bindings = Vec::new(); + match import.kind { + ImportKind::Import => { + let Some(name) = import.name.as_deref() else { + return bindings; + }; + if let Some(alias) = import.alias.as_deref() { + if internal_module_prefixes.contains(name) { + bindings.push((alias.to_owned(), name.to_owned())); + } + } else if let Some(root) = name.split('.').next() { + if internal_module_prefixes.contains(root) { + bindings.push((root.to_owned(), root.to_owned())); + } + } + } + ImportKind::FromImport => { + let Some(module) = import + .module + .as_deref() + .and_then(|module| resolve_module_name(source_file, module)) + else { + return bindings; + }; + let Some(name) = import.name.as_deref() else { + return bindings; + }; + let binding = import.alias.as_deref().unwrap_or(name); + let module_prefix = join_module(&module, name); + if internal_module_prefixes.contains(&module_prefix) { + bindings.push((binding.to_owned(), module_prefix)); + } + } + ImportKind::FutureImport => {} + } + bindings +} + +fn resolve_imported_module_attribute( + 
source_file_id: u32, + qualifier: &str, + name: &str, + imported_module_prefix_by_binding: &HashMap<(u32, String), (String, u32)>, + module_to_file: &HashMap<&str, u32>, + symbol_to_id: &HashMap<(u32, &str), u32>, +) -> Option<(u32, Option)> { + let (binding, suffix) = qualifier + .split_once('.') + .map_or((qualifier, None), |(binding, suffix)| { + (binding, Some(suffix)) + }); + let (module_prefix, import_id) = + imported_module_prefix_by_binding.get(&(source_file_id, binding.to_owned()))?; + let target_module = suffix.map_or_else( + || module_prefix.clone(), + |suffix| join_module(module_prefix, suffix), + ); + let target_file_id = module_to_file.get(target_module.as_str()).copied()?; + let target_symbol_id = symbol_to_id.get(&(target_file_id, name)).copied()?; + Some((target_symbol_id, Some(*import_id))) +} + fn import_module_qualifiers(import: &ImportRecord) -> Vec { let mut qualifiers = Vec::new(); if let Some(alias) = import.alias.as_deref() { @@ -2363,6 +2474,52 @@ def caller():\n return base.helper(), base_alias.Base, pkg.base.helper()\n", })); } + #[test] + fn resolves_python_nested_module_attribute_references() { + let repo = temp_repo_path("python-nested-module-attribute-references"); + fs::create_dir_all(repo.join("a/b")).unwrap(); + fs::write(repo.join("a/b/c.py"), "def d():\n pass\n").unwrap(); + fs::write( + repo.join("consumer.py"), + "from a import b\nimport a.b\nimport a.b.c as c_alias\n\n\ +def caller():\n return b.c.d(), a.b.c.d(), c_alias.d()\n", + ) + .unwrap(); + + let index = index_python_path(&repo).unwrap(); + fs::remove_dir_all(&repo).unwrap(); + + let caller = index + .symbols + .iter() + .find(|symbol| symbol.name == "caller") + .unwrap(); + let d = index + .symbols + .iter() + .find(|symbol| symbol.name == "d") + .unwrap(); + + assert_eq!( + index + .references + .iter() + .filter(|reference| { + reference.source_symbol_id == Some(caller.id) + && reference.name == "d" + && reference.target_symbol_id == d.id + && 
reference.import_id.is_some() + }) + .count(), + 3 + ); + assert!(index.dependencies.iter().any(|dependency| { + dependency.source_symbol_id == caller.id + && dependency.target_symbol_id == d.id + && dependency.reference_count == 3 + })); + } + #[test] fn skips_references_shadowed_by_python_parameters_and_locals() { let repo = temp_repo_path("python-shadowed-reference-sources"); diff --git a/rust-rewrite/benchmarks.md b/rust-rewrite/benchmarks.md index e73803400..b107406b0 100644 --- a/rust-rewrite/benchmarks.md +++ b/rust-rewrite/benchmarks.md @@ -195,8 +195,8 @@ These measurements use real `Codebase(...)` construction with `CodebaseConfig(gr | Input | Python mode | Python wall | Python max RSS | Rust `Codebase` wall | Rust `Codebase` max RSS | Python files | Rust files | Rust symbols | Rust imports | Rust import resolutions | Rust references | Rust dependencies | Python graph blocked | Wall ratio | RSS ratio | | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | ---: | ---: | -| `graph-sitter` repo checkout | `--disable-graph` | 2.865s | 543.9 MB | 0.749s | 123.7 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 3.824x | 4.397x | -| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 19.414s | 3469.6 MB | 4.040s | 264.1 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109743 | 71863 | yes | 4.806x | 13.136x | +| `graph-sitter` repo checkout | `--disable-graph` | 2.933s | 544.8 MB | 0.752s | 123.7 MB | 1133 | 1133 | 6505 | 6496 | 432 | 4110 | 2953 | yes | 3.899x | 4.404x | +| Apache Airflow `2.10.5` (`b93c3db6b1641b0840bd15ac7d05bc58ff2cccbf`) | `--disable-graph` | 18.409s | 3469.2 MB | 4.209s | 268.1 MB | 4789 | 4789 | 52339 | 40580 | 19011 | 109817 | 71932 | yes | 4.374x | 12.940x | ## Pinned Compact Snapshot Evidence @@ -208,8 +208,8 @@ The first committed large-repo compact snapshot is `rust-rewrite/golden/apache-a | Symbols | 52339 | 
`d4b75c9c6d82b1d30424845c86b88c9fb18ca7748fc088c16b4cfca00de30699` | | Imports | 40580 | `fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0` | | Import resolutions | 19011 | `84df9ba7bf069278f61ac2a4891d8b4cb38b25f4f63ce20dd77eada1ba654278` | -| References | 109743 | `d369c16c4c153e5902f301a5ecf9721c914fee3b0a5bcaf1ac9f837cb14099cb` | -| Dependencies | 71863 | `18e315d2d122a9c7808ac4b7544afa7d25b2d97bc78c0f5124f6554039a4a5c9` | +| References | 109817 | `d7ab546586eb968f35dd1bf8f109db6a54b889af464a2c349e7af2e38ea60a8a` | +| Dependencies | 71932 | `cbf361a2b46e5ea2e5cad352c5abe8ab493869eb422cbdb77912484ea9fab1d1` | The snapshot tool also validates internal compact graph integrity: import-resolution links, reference links, dependency links, dependency reference counts, and dependency reference source/target consistency must all be zero-mismatch before the snapshot can pass. @@ -219,7 +219,7 @@ Important caveats: - Direct package re-exports and wildcard import/re-export chains are resolved for indexed internal modules when the package file exposes a matching imported binding. `__all__`, order-sensitive wildcard binding semantics, and ambiguous external re-export chains remain future work. - Public Python handles still expose top-level `Codebase.symbols`, `classes`, and `functions`; nested compact symbols are currently internal records for dependency-source precision and `file.symbols(nested=True)`. - Function parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations now shadow imported/top-level names in the compact reference pass, reducing false-positive dependency edges before full lexical scope tables exist. Comprehension targets are scoped to the comprehension expression instead of leaking to the whole enclosing function. 
`global` declarations now remove matching names from the local-shadow set so module-level writes and uses remain visible in the compact reference/dependency graph. -- Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` now resolve when the qualifier maps to an indexed internal Python module. Other attribute field names are skipped as bare-name references until full attribute/type resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. +- Imported module member references such as `module.some_func`, `alias.SomeClass`, `pkg.module.some_func`, and namespace-style nested module chains like `from a import b; b.c.d()` now resolve when the qualifier maps to an indexed internal Python module. Other attribute field names are skipped as bare-name references until full attribute/type resolution exists. The object side of an attribute expression is still scanned, so `helper.attr` preserves the `helper` reference while `obj.helper` no longer pretends `helper` is a standalone symbol use. - The Python-facing Rust facade uses Python's selected file list, but the compact Rust records are not yet full Python graph parity. Symbol and import totals should not be compared directly with current Python graph node totals until the resolver and lazy handle layers are implemented. - The Python backend numbers include the current eager Python object materialization and, in full graph mode, dependency edge computation. - The Rust RSS number is sampled from a short-lived release process; it is suitable for directional comparison, not allocator-level attribution. 
diff --git a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json index 20c33ca7e..d145cca50 100644 --- a/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json +++ b/rust-rewrite/golden/apache-airflow-2.10.5-rust-compact.json @@ -1,7 +1,7 @@ { "graphs": { "dependencies": { - "count": 71863, + "count": 71932, "samples": [ { "reference_count": 1, @@ -144,7 +144,7 @@ "target_symbol": "airflow/api/auth/backend/kerberos_auth.py:function:_gssapi_authenticate@3989" } ], - "sha256": "18e315d2d122a9c7808ac4b7544afa7d25b2d97bc78c0f5124f6554039a4a5c9" + "sha256": "cbf361a2b46e5ea2e5cad352c5abe8ab493869eb422cbdb77912484ea9fab1d1" }, "files": { "count": 4789, @@ -665,7 +665,7 @@ "sha256": "fe4a595d850f2f57f1eb1a5ca347ecfcc09259e31cd7b44306902c04de7275d0" }, "references": { - "count": 109743, + "count": 109817, "samples": [ { "import": "airflow/__init__.py:from_import:airflow:settings:@2460", @@ -888,7 +888,7 @@ "target_symbol": "airflow/providers/fab/auth_manager/api/auth/backend/basic_auth.py:function:requires_authentication@2134" } ], - "sha256": "d369c16c4c153e5902f301a5ecf9721c914fee3b0a5bcaf1ac9f837cb14099cb" + "sha256": "d7ab546586eb968f35dd1bf8f109db6a54b889af464a2c349e7af2e38ea60a8a" }, "symbols": { "count": 52339, @@ -1234,7 +1234,7 @@ "summary": { "bytes": 36617627, "classes": 5665, - "dependencies": 71863, + "dependencies": 71932, "files": 4789, "files_with_errors": 0, "functions": 34535, @@ -1242,7 +1242,7 @@ "import_resolutions": 19011, "imports": 40580, "lines": 924514, - "references": 109743, + "references": 109817, "symbols": 52339 } } diff --git a/rust-rewrite/python-compat.md b/rust-rewrite/python-compat.md index 8bc17ab00..78570c801 100644 --- a/rust-rewrite/python-compat.md +++ b/rust-rewrite/python-compat.md @@ -125,7 +125,7 @@ Current implemented bridge status: - `PythonIndex.dependencies_json()` exposes compact dependency edge records. 
- `RustIndexBackend.files`, `.symbols`, `.imports`, `.import_resolutions`, `.references`, and `.dependencies` parse those record-family payloads into typed Python dataclasses for shell/debug/golden-test use. - Rust currently emits compact `ImportResolutionRecord` rows for indexed internal Python modules: direct `import pkg.mod`, absolute `from pkg.mod import Symbol`, and relative `from .mod import Symbol` forms. Target symbols now include top-level classes, functions, simple top-level globals, direct package re-exports such as `from pkg import Symbol`, and named imports through wildcard-backed package files when the wildcard chain stays inside indexed internal modules. -- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Imported module member references such as `module.some_func`, `alias.SomeClass`, and `pkg.module.some_func` resolve when the qualifier points to an indexed internal Python module. Bare names imported through wildcard import chains resolve when the chain exposes an indexed internal symbol. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping and full attribute/type resolution remain future work. 
+- Rust currently emits compact `ReferenceRecord` rows for same-file and imported top-level symbol references inside Python symbols. Nested class/function records are used as source symbols when an identifier appears inside a method or nested function. Parameters, lambda parameters, local assignment targets, local imports, `for` targets, `with ... as ...` targets, `except ... as ...` targets, comprehension targets, match-pattern captures, nested definitions, and `nonlocal` declarations shadow imported/top-level names in this pass. Comprehension targets are scoped to their comprehension expression so they do not hide later uses in the enclosing function. `global` declarations are honored so declared names continue to resolve to module-level symbols/imports. Imported module member references such as `module.some_func`, `alias.SomeClass`, `pkg.module.some_func`, and namespace-style nested module chains such as `from a import b; b.c.d()` resolve when the qualifier points to an indexed internal Python module. Bare names imported through wildcard import chains resolve when the chain exposes an indexed internal symbol. Other attribute field names are not treated as bare references; the object side of an attribute expression is still scanned. Full lexical scoping and full attribute/type resolution remain future work. - Rust currently emits compact `DependencyRecord` rows by de-duplicating reference records into source-symbol to target-symbol edges with contributing reference IDs. Full lexical/reference coverage, external modules, and TypeScript remain future work. - `CodebaseConfig(graph_backend="rust" | "auto")` builds a `CodebaseContext.rust_index` compact index when the extension is available and the codebase is Python. - `CodebaseConfig(graph_backend="rust")` now keeps the eager Python graph unbuilt when the compact index succeeds. Raw Python graph APIs such as `CodebaseContext.nodes` remain blocked in that mode. 
diff --git a/rust-rewrite/strategy.md b/rust-rewrite/strategy.md index cbe96e77c..a976ece11 100644 --- a/rust-rewrite/strategy.md +++ b/rust-rewrite/strategy.md @@ -198,6 +198,7 @@ Recommended task format: - [x] Exclude compact Python references shadowed by `nonlocal` declarations. owner: codex. Result: prevents closure variables declared `nonlocal` from resolving to imported/top-level symbols in nested functions; this checkout and pinned Airflow stayed graph-stable at 4,110 and 109,282 references respectively. - [x] Resolve direct Python package re-export imports. owner: codex. Result: `from pkg import Symbol` follows matching imported bindings in `pkg/__init__.py` to the original internal symbol; Airflow compact coverage now emits 109,655 references and 71,788 dependencies. - [x] Resolve Python wildcard import and re-export chains. owner: codex. Result: compact exported-name tables now propagate `from module import *` across indexed internal modules and feed named imports, references, and dependency edges; Airflow compact coverage now emits 109,743 references and 71,863 dependencies. +- [x] Resolve nested Python module attribute references. owner: codex. Result: module-prefix bindings now resolve namespace-style chains such as `from a import b; b.c.d()` and `import a.b; a.b.c.d()` to indexed internal module symbols; Airflow compact coverage now emits 109,817 references and 71,932 dependencies. - [ ] Expand symbol usage extraction to full lexical shadowing behavior, full attribute/type resolution, and order-sensitive scopes. - [x] Implement first compact dependency edge construction from usage records. owner: codex. Result: emits de-duplicated Python `DependencyRecord` edges from compact references with contributing reference IDs. - [ ] Expand dependency edge construction to full lexical/reference coverage, external modules, and TypeScript. 
@@ -288,3 +289,4 @@ Recommended task format: - [x] 2026-06-18: Added `nonlocal` declaration shadowing for compact Python references. owner: codex. Notes: `nonlocal helper` inside nested functions no longer creates a false imported/top-level `helper` reference; Airflow compact graph stayed stable at 109,282 references and 71,534 dependencies while staying 4.663x faster with 13.244x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added direct Python package re-export import resolution. owner: codex. Notes: `from pkg import Symbol` now follows matching imported bindings in `pkg/__init__.py`; Airflow compact graph now emits 109,655 references and 71,788 dependencies while staying 4.562x faster with 13.307x lower max RSS than Python parse/object materialization. - [x] 2026-06-18: Added wildcard import/re-export chain resolution to the compact Rust graph. owner: codex. Notes: fixed-point exported-name tables now propagate `from module import *` across indexed internal modules; Airflow compact graph now emits 109,743 references and 71,863 dependencies while staying 4.806x faster with 13.136x lower max RSS than Python parse/object materialization. +- [x] 2026-06-18: Added nested module-prefix attribute resolution to the compact Rust graph. owner: codex. Notes: `from a import b; b.c.d()` and `import a.b; a.b.c.d()` now resolve through indexed internal module prefixes, including namespace-package-style prefixes without concrete `__init__.py` files; Airflow compact graph now emits 109,817 references and 71,932 dependencies while staying 4.374x faster with 12.940x lower max RSS than Python parse/object materialization. 
From fa70e981187900fabd839eb9bcf079ccc679ac1c Mon Sep 17 00:00:00 2001 From: Jay Hack Date: Thu, 18 Jun 2026 15:48:32 -0700 Subject: [PATCH 032/228] Honor static Python all exports in Rust --- crates/graph-sitter-engine/src/lib.rs | 175 ++++++++++++++++++++++++-- rust-rewrite/benchmarks.md | 6 +- rust-rewrite/python-compat.md | 2 +- rust-rewrite/strategy.md | 2 + 4 files changed, 171 insertions(+), 14 deletions(-) diff --git a/crates/graph-sitter-engine/src/lib.rs b/crates/graph-sitter-engine/src/lib.rs index 0d1b056c7..67cf4650c 100644 --- a/crates/graph-sitter-engine/src/lib.rs +++ b/crates/graph-sitter-engine/src/lib.rs @@ -1,7 +1,7 @@ #![forbid(unsafe_code)] use serde::Serialize; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt; use std::fs; use std::io; @@ -137,6 +137,8 @@ pub struct PythonIndex { pub import_resolutions: Vec, pub references: Vec, pub dependencies: Vec, + #[serde(skip)] + pub all_exports_by_file: HashMap>, } impl PythonIndex { @@ -362,6 +364,7 @@ impl PythonIndexer { import_resolutions: Vec::new(), references: Vec::new(), dependencies: Vec::new(), + all_exports_by_file: HashMap::new(), }; let mut reference_candidates = Vec::new(); paths.sort(); @@ -691,6 +694,19 @@ fn push_global_assignment( }; let mut targets = Vec::new(); collect_assignment_targets(left, &mut targets); + let defines_static_all_exports = targets.iter().any(|target| { + target + .utf8_text(source.as_bytes()) + .is_ok_and(|name| name == "__all__") + }); + if defines_static_all_exports { + if let Some(exports) = node + .child_by_field_name("right") + .and_then(|right| collect_static_all_exports(source, right)) + { + index.all_exports_by_file.insert(file_id, exports); + } + } for target in targets { let Ok(name) = target.utf8_text(source.as_bytes()) else { continue; @@ -709,6 +725,57 @@ fn push_global_assignment( } } +fn collect_static_all_exports(source: &str, node: Node<'_>) -> Option> { + match 
node.kind() { + "list" | "tuple" | "set" => { + let mut exports = BTreeSet::new(); + let mut cursor = node.walk(); + for child in node.named_children(&mut cursor) { + if child.kind() != "string" { + return None; + } + let value = python_string_literal_value(node_text(source, child))?; + exports.insert(value); + } + Some(exports) + } + "parenthesized_expression" => { + first_named_child(node).and_then(|child| collect_static_all_exports(source, child)) + } + _ => None, + } +} + +fn python_string_literal_value(text: &str) -> Option { + let mut literal = text.trim(); + let mut has_f_prefix = false; + while let Some(prefix) = literal.chars().next() { + if matches!(prefix, '\'' | '"') { + break; + } + if matches!(prefix, 'f' | 'F') { + has_f_prefix = true; + } + if matches!(prefix, 'r' | 'R' | 'b' | 'B' | 'u' | 'U' | 'f' | 'F') { + literal = &literal[prefix.len_utf8()..]; + } else { + return None; + } + } + if has_f_prefix { + return None; + } + for quote in ["'''", "\"\"\"", "'", "\""] { + if let Some(value) = literal + .strip_prefix(quote) + .and_then(|value| value.strip_suffix(quote)) + { + return Some(value.to_owned()); + } + } + None +} + fn collect_assignment_targets<'tree>(node: Node<'tree>, out: &mut Vec>) { match node.kind() { "identifier" => out.push(node), @@ -1616,7 +1683,9 @@ fn python_exported_symbols_by_file(index: &PythonIndex) -> ExportedSymbolsByFile continue; }; let file_exports = exports.entry(import.file_id).or_default(); - for (name, target_symbol_id) in target_exports { + for (name, target_symbol_id) in + wildcard_visible_exports(index, resolution.target_file_id, target_exports) + { file_exports.insert(name.clone(), *target_symbol_id); } continue; @@ -1642,6 +1711,20 @@ fn python_exported_symbols_by_file(index: &PythonIndex) -> ExportedSymbolsByFile exports } +fn wildcard_visible_exports<'a>( + index: &'a PythonIndex, + file_id: u32, + exports: &'a BTreeMap, +) -> Vec<(&'a String, &'a u32)> { + let Some(all_exports) = 
index.all_exports_by_file.get(&file_id) else { + return exports.iter().collect(); + }; + all_exports + .iter() + .filter_map(|name| exports.get_key_value(name)) + .collect() +} + fn resolve_python_references(index: &mut PythonIndex, candidates: Vec) { let module_to_file: HashMap<&str, u32> = index .files @@ -1677,14 +1760,18 @@ fn resolve_python_references(index: &mut PythonIndex, candidates: Vec