From 17b26104b3a88f02ea20b854015f9a03d5933db4 Mon Sep 17 00:00:00 2001 From: Gadi Evron Date: Mon, 8 Jun 2026 23:34:04 +0300 Subject: [PATCH 1/3] =?UTF-8?q?fix(parsers/ruby):=20u10=20call=5Fgraph=5Fb?= =?UTF-8?q?uilder=20=E2=80=94=20parenless/crossfile-class/builtin/method-o?= =?UTF-8?q?bject/substring-import=20(BUG-NEW=201,8,11,31,49)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local-only finder-fixes-54 (base master 368b559). TDD; 10 tests in the new tests/parsers/ruby/test_call_graph_builder_u*.py. Judge (vs prepared recommendation, independent re-derivation from raw): AGREE / SHIP-AS-IS. CM-B builtin pre-check scoped SAME-FILE (judge-confirmed: no global cross-file fallback; genuine-builtin non-link pinned by a negative test). Combined parser suite: 70 passed, 10 skipped. Local-only; not pushed. Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit 74153f98a94f9e8b95c39083d3d0eaf71ea7956e) --- .../parsers/ruby/call_graph_builder.py | 175 ++++++++++++++- .../tests/parsers/ruby/__init__.py | 0 .../ruby/test_call_graph_builder_u10.py | 210 ++++++++++++++++++ 3 files changed, 375 insertions(+), 10 deletions(-) create mode 100644 libs/openant-core/tests/parsers/ruby/__init__.py create mode 100644 libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py diff --git a/libs/openant-core/parsers/ruby/call_graph_builder.py b/libs/openant-core/parsers/ruby/call_graph_builder.py index 7e5d533b..a53c432b 100644 --- a/libs/openant-core/parsers/ruby/call_graph_builder.py +++ b/libs/openant-core/parsers/ruby/call_graph_builder.py @@ -176,20 +176,118 @@ def _extract_calls_from_code(self, code: str, caller_id: str) -> Set[str]: except Exception: return self._extract_calls_regex(code, caller_id) + # First pass: collect local-variable names (assignment LHS) and + # method-object bindings (`m = method(:sym)` / proc / lambda). These + # inform parenless-call precision [BUG 1] and `.call` resolution [BUG 31]. + local_vars: Set[str] = set() + method_object_bindings: Dict[str, str] = {} + stack = [tree.root_node] + while stack: + node = stack.pop() + if node.type == 'assignment': + lhs = node.children[0] if node.children else None + if lhs is not None and lhs.type == 'identifier': + var_name = self._node_str(lhs, code_bytes) + local_vars.add(var_name) + bound = self._method_object_target(node, code_bytes) + if bound: + method_object_bindings[var_name] = bound + stack.extend(reversed(node.children)) + + # Second pass: resolve calls and bare (parenless) identifier calls. stack = [tree.root_node] while stack: node = stack.pop() if node.type == 'call': - resolved = self._resolve_call_node(node, code_bytes, caller_file, caller_class) + resolved = self._resolve_call_node( + node, code_bytes, caller_file, caller_class, + method_object_bindings + ) + if resolved: + calls.add(resolved) + elif node.type == 'identifier': + resolved = self._resolve_bare_identifier( + node, code_bytes, caller_file, caller_class, local_vars + ) if resolved: calls.add(resolved) stack.extend(reversed(node.children)) return calls + def _node_str(self, node, source: bytes) -> str: + return source[node.start_byte:node.end_byte].decode('utf-8', errors='replace') + + def _method_object_target(self, assignment_node, source: bytes) -> Optional[str]: + """If RHS is method(:sym)/proc/lambda binding a named method, return the name. + + Tracks `m = method(:helper)` so a later `m.call` resolves to `helper`. + """ + rhs = assignment_node.children[-1] if assignment_node.children else None + if rhs is None or rhs.type != 'call': + return None + callee = None + arg_list = None + for child in rhs.children: + if child.type == 'identifier' and callee is None: + callee = self._node_str(child, source) + elif child.type == 'argument_list': + arg_list = child + if callee != 'method' or arg_list is None: + return None + for arg in arg_list.children: + if arg.type in ('simple_symbol', 'symbol'): + sym = self._node_str(arg, source) + return sym.lstrip(':') + return None + + def _resolve_bare_identifier(self, node, source: bytes, caller_file: str, + caller_class: Optional[str], + local_vars: Set[str]) -> Optional[str]: + """Resolve a bare (parenless) identifier used as a call. + + Precision guard: only treat a bare identifier as a call when its name + is a KNOWN user function, it is NOT a Ruby builtin, and it is NOT a + local variable / assignment target in this method body. We also skip + identifiers that are a structural part of a call/method/assignment + (their own handlers cover those) to avoid double-counting. + """ + parent = node.parent + if parent is not None and parent.type in ( + 'call', 'method', 'singleton_method', 'assignment', 'method_call', + ): + return None + name = self._node_str(node, source) + if name in local_vars: + return None + if self._is_builtin(name): + return None + if name not in self.functions_by_name: + return None + return self._resolve_simple_call(name, caller_file, caller_class) + def _resolve_call_node(self, node, source: bytes, caller_file: str, - caller_class: Optional[str]) -> Optional[str]: + caller_class: Optional[str], + method_object_bindings: Optional[Dict[str, str]] = None + ) -> Optional[str]: """Resolve a tree-sitter call node to a function ID.""" + method_object_bindings = method_object_bindings or {} + + # Method-object call: `m = method(:helper)` then `m.call` resolves to the + # bound target [BUG 31]. The positional heuristic below mis-parses a + # lowercase-variable receiver (it captures the var as the method name), + # so detect this precisely via tree-sitter's named receiver/method fields. + recv_field = node.child_by_field_name('receiver') + meth_field = node.child_by_field_name('method') + if recv_field is not None and meth_field is not None: + recv_text = source[recv_field.start_byte:recv_field.end_byte].decode( + 'utf-8', errors='replace') + meth_text = source[meth_field.start_byte:meth_field.end_byte].decode( + 'utf-8', errors='replace') + if meth_text == 'call' and recv_text in method_object_bindings: + target_name = method_object_bindings[recv_text] + return self._resolve_simple_call(target_name, caller_file, caller_class) + # Extract method name method_name = None receiver = None @@ -210,7 +308,16 @@ def _resolve_call_node(self, node, source: bytes, caller_file: str, if not method_name: return None + # A user method may share a name with a Ruby builtin (e.g. render, log, + # open). Don't let the builtin filter drop a call that resolves to a + # user-defined function visible in scope. Scope = same class or same + # file ONLY (no global cross-file single-match) so a genuine builtin + # call isn't wired to an unrelated same-named user method elsewhere. if self._is_builtin(method_name): + if receiver is None and self._has_scoped_user_function( + method_name, caller_file, caller_class + ): + return self._resolve_simple_call(method_name, caller_file, caller_class) return None # self.method(...) - same class @@ -247,9 +354,16 @@ def _resolve_simple_call(self, func_name: str, caller_file: str, file_imports = self.imports.get(caller_file, {}) for import_name, import_type in file_imports.items(): if import_type in ('require', 'require_relative'): - # Try matching import path to file + # Try matching import path to file by basename equality (the + # require's last path segment must equal the file's stem). An + # unanchored substring test (`import_name in file_path`) wrongly + # matched e.g. require 'auth' -> authentication.rb. + import_basename = import_name.replace('\\', '/').rsplit('/', 1)[-1] for file_path in self.functions_by_file: - if file_path.endswith(f"{import_name}.rb") or import_name in file_path: + file_stem = file_path.replace('\\', '/').rsplit('/', 1)[-1] + if file_stem.endswith('.rb'): + file_stem = file_stem[:-len('.rb')] + if file_stem == import_basename: file_funcs = self.functions_by_file[file_path] for func_id in file_funcs: func_data = self.functions.get(func_id, {}) @@ -264,6 +378,25 @@ def _resolve_simple_call(self, func_name: str, caller_file: str, return None + def _has_scoped_user_function(self, name: str, caller_file: str, + caller_class: Optional[str]) -> bool: + """True if `name` is a user function visible in the caller's scope. + + Scope is deliberately narrow (same class or same file). This lets a + user method that shadows a builtin name resolve, without globally + rescuing a genuine builtin call into an unrelated same-named method. + """ + if caller_class: + class_key = f"{caller_file}:{caller_class}" + for func_id in self.methods_by_class.get(class_key, []): + if self.functions.get(func_id, {}).get('name') == name: + return True + for func_id in self.functions_by_file.get(caller_file, []): + func_data = self.functions.get(func_id, {}) + if func_data.get('name') == name and not func_data.get('class_name'): + return True + return False + def _resolve_self_call(self, method_name: str, caller_file: str, caller_class: str) -> Optional[str]: """Resolve a self.method() call within a class.""" @@ -288,16 +421,38 @@ def _resolve_class_call(self, class_name: str, method_name: str, if func_data.get('name') == method_name: return func_id - # Check all files for the class + # Cross-file fallback. The previous behaviour linked the FIRST same-named + # class found ANYWHERE, which is unsound: it wired callers to unrelated + # files (and is wrong under same-name class collisions). Scope it: only + # accept a cross-file class when the caller's file require/require_relative + # resolves to the file that defines the class. for key, func_ids in self.methods_by_class.items(): - if key.endswith(f":{class_name}"): - for func_id in func_ids: - func_data = self.functions.get(func_id, {}) - if func_data.get('name') == method_name: - return func_id + if not key.endswith(f":{class_name}"): + continue + key_file = key.rsplit(':', 1)[0] + if not self._file_is_required_by(key_file, caller_file): + continue + for func_id in func_ids: + func_data = self.functions.get(func_id, {}) + if func_data.get('name') == method_name: + return func_id return None + def _file_is_required_by(self, target_file: str, caller_file: str) -> bool: + """True if caller_file has a require/require_relative resolving to target_file.""" + target_stem = target_file.replace('\\', '/').rsplit('/', 1)[-1] + if target_stem.endswith('.rb'): + target_stem = target_stem[:-len('.rb')] + file_imports = self.imports.get(caller_file, {}) + for import_name, import_type in file_imports.items(): + if import_type not in ('require', 'require_relative'): + continue + import_basename = import_name.replace('\\', '/').rsplit('/', 1)[-1] + if import_basename == target_stem: + return True + return False + def _extract_calls_regex(self, code: str, caller_id: str) -> Set[str]: """Fallback regex-based call extraction for unparseable code.""" calls = set() diff --git a/libs/openant-core/tests/parsers/ruby/__init__.py b/libs/openant-core/tests/parsers/ruby/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py b/libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py new file mode 100644 index 00000000..fec23b9c --- /dev/null +++ b/libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py @@ -0,0 +1,210 @@ +"""Regression tests for the Ruby call graph builder (u10 blind-fix batch). + +Five confirmed bugs in parsers/ruby/call_graph_builder.py, each driven through +the REAL pipeline (FunctionExtractor -> CallGraphBuilder) so the func_data shape +(indented `code`, `imports`, `class_name`, ...) matches production exactly. + + [1] parenless free-function call drops the edge (assert edge PRESENT) + [8] ClassName.method over-resolves to an unrelated file (assert edge ABSENT) + [11] user method named like a builtin gets filtered (assert edge PRESENT) + [31] m = method(:helper); m.call drops the edge (assert edge PRESENT) + [49] require 'auth' substring-matches authentication.rb (assert edge ABSENT) +""" + +import sys +import tempfile +from pathlib import Path + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.ruby.function_extractor import FunctionExtractor +from parsers.ruby.call_graph_builder import CallGraphBuilder + + +def _build(files: dict) -> tuple[dict, CallGraphBuilder]: + """Write Ruby sources to a temp repo, run the real two-stage pipeline.""" + repo = Path(tempfile.mkdtemp()) + for name, src in files.items(): + p = repo / name + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(src) + extractor = FunctionExtractor(str(repo)) + output = extractor.extract_all(list(files.keys())) + builder = CallGraphBuilder(output) + builder.build_call_graph() + return output, builder + + +def _fid(output: dict, suffix: str) -> str: + matches = [k for k in output["functions"] if k.endswith(suffix)] + assert len(matches) == 1, f"expected one func id ending {suffix!r}, got {matches}" + return matches[0] + + +# ----------------------------------------------------------------- [1] parenless +def test_parenless_free_function_call_is_an_edge(): + """`main` body is a bare `greet` (no parens) — the main->greet edge must exist.""" + output, builder = _build( + {"app.rb": "def greet\n puts 'hi'\nend\n\ndef main\n greet\nend\n"} + ) + caller = _fid(output, ":main") + callee = _fid(output, ":greet") + assert callee in builder.call_graph[caller], ( + f"parenless call edge missing: {caller} -> {callee}; got {builder.call_graph[caller]}" + ) + + +def test_parenless_local_variable_is_not_an_edge(): + """Precision guard: a bare identifier that is a LOCAL VAR must NOT become a call. + + `greet` here is a method name AND a local variable in `main`; the bare + reference `greet` on the last line is a variable read, not a call. + """ + output, builder = _build( + { + "app.rb": "def greet\n puts 'hi'\nend\n\n" + "def main\n greet = 1\n greet\nend\n" + } + ) + caller = _fid(output, ":main") + callee = _fid(output, ":greet") + assert callee not in builder.call_graph[caller], ( + f"local variable wrongly treated as a call: {caller} -> {callee}" + ) + + +def test_parenless_unknown_identifier_is_not_an_edge(): + """Precision guard: a bare identifier that is NOT a known function is not a call.""" + output, builder = _build( + {"app.rb": "def greet\n puts 'hi'\nend\n\ndef main\n unknown_thing\nend\n"} + ) + caller = _fid(output, ":main") + # No edge to anything (unknown_thing is not a function). + assert builder.call_graph[caller] == [], ( + f"unknown bare identifier produced an edge: {builder.call_graph[caller]}" + ) + + +# ------------------------------------------------- [8] cross-file over-resolution +def test_class_call_does_not_resolve_to_unrelated_file(): + """Worker is defined in an unrelated file with no require link — no edge.""" + output, builder = _build( + { + "caller.rb": "class Caller\n def run\n Worker.process()\n end\nend\n", + "decoy.rb": "class Worker\n def process\n 42\n end\nend\n", + } + ) + caller = _fid(output, ":Caller.run") + forbidden = _fid(output, "decoy.rb:Worker.process") + assert forbidden not in builder.call_graph[caller], ( + f"over-resolved to unrelated file: {caller} -> {forbidden}; " + f"got {builder.call_graph[caller]}" + ) + + +def test_class_call_still_resolves_when_required(): + """Recall guard for [8]: when caller requires the defining file, edge stays.""" + output, builder = _build( + { + "caller.rb": "require_relative 'worker'\n" + "class Caller\n def run\n Worker.process()\n end\nend\n", + "worker.rb": "class Worker\n def process\n 42\n end\nend\n", + } + ) + caller = _fid(output, ":Caller.run") + callee = _fid(output, "worker.rb:Worker.process") + assert callee in builder.call_graph[caller], ( + f"required class-call edge lost: {caller} -> {callee}; got {builder.call_graph[caller]}" + ) + + +# --------------------------------------------------------------- [11] builtin leak +def test_user_method_named_like_builtin_is_an_edge(): + """`render` is in RUBY_BUILTINS but here it is a user function in the same file.""" + output, builder = _build( + {"app.rb": "def render\n 1\nend\n\ndef main\n render()\nend\n"} + ) + caller = _fid(output, ":main") + callee = _fid(output, ":render") + assert callee in builder.call_graph[caller], ( + f"user method named like builtin filtered: {caller} -> {callee}; " + f"got {builder.call_graph[caller]}" + ) + + +def test_genuine_builtin_not_linked_to_unrelated_user_method(): + """Scope guard for [11]: a genuine builtin call must NOT link to a same-named + user method in an UNRELATED file.""" + output, builder = _build( + { + "user.rb": "def main\n puts('hi')\nend\n", + "other.rb": "class Box\n def puts\n 99\n end\nend\n", + } + ) + caller = _fid(output, "user.rb:main") + forbidden = _fid(output, "other.rb:Box.puts") + assert forbidden not in builder.call_graph[caller], ( + f"genuine builtin linked to unrelated user method: {caller} -> {forbidden}" + ) + + +# ---------------------------------------------------------- [31] method-object var +def test_method_object_call_is_an_edge(): + """`m = method(:helper); m.call` — the caller_fn->helper edge must exist.""" + output, builder = _build( + { + "m.rb": "def helper\n 42\nend\n\n" + "def caller_fn\n m = method(:helper)\n m.call\nend\n" + } + ) + caller = _fid(output, ":caller_fn") + callee = _fid(output, ":helper") + assert callee in builder.call_graph[caller], ( + f"method-object edge missing: {caller} -> {callee}; got {builder.call_graph[caller]}" + ) + + +# ------------------------------------------------ [49] substring import over-match +def test_require_does_not_substring_match_longer_filename(): + """require 'auth' must NOT match authentication.rb by substring. + + A same-named `shared_helper` is also defined in a third file so the + global unique-name fallback (resolution step 4) CANNOT fire — that + isolates the substring-import mechanism (step 3) as the only thing + that could (wrongly) produce the forbidden edge. + """ + output, builder = _build( + { + "main.rb": "require 'auth'\ndef caller_fn\n shared_helper()\nend\n", + "authentication.rb": "def shared_helper\n 1\nend\n", + "elsewhere.rb": "def shared_helper\n 2\nend\n", + } + ) + caller = _fid(output, "main.rb:caller_fn") + forbidden = "authentication.rb:shared_helper" + assert forbidden not in builder.call_graph[caller], ( + f"require substring over-matched: {caller} -> {forbidden}; " + f"got {builder.call_graph[caller]}" + ) + + +def test_require_resolves_by_basename_equality(): + """Recall guard for [49]: require 'auth' DOES resolve to auth.rb (basename eq). + + A same-named decoy defeats the unique-name fallback, so a passing edge + must come from the basename-equality require match (step 3). + """ + output, builder = _build( + { + "main.rb": "require 'auth'\ndef caller_fn\n shared_helper()\nend\n", + "auth.rb": "def shared_helper\n 1\nend\n", + "elsewhere.rb": "def shared_helper\n 2\nend\n", + } + ) + caller = _fid(output, "main.rb:caller_fn") + callee = "auth.rb:shared_helper" + assert callee in builder.call_graph[caller], ( + f"basename-equal require edge lost: {caller} -> {callee}; " + f"got {builder.call_graph[caller]}" + ) From 7bac117bab6c03f2dac5b224f486be90fbdc1ac8 Mon Sep 17 00:00:00 2001 From: Gadi Evron Date: Mon, 8 Jun 2026 23:57:43 +0300 Subject: [PATCH 2/3] =?UTF-8?q?fix(parsers/ruby):=20u11=20function=5Fextra?= =?UTF-8?q?ctor=20=E2=80=94=20nested/singleton/define=5Fmethod/alias/priva?= =?UTF-8?q?te/nested-class=20(BUG-NEW=206,18,24,42,44,50)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local-only finder-fixes-54. 8 tests. recurse method bodies; is_singleton before initialize; define_method(:sym) registers sym [24] and alias node registers name [42] (DISTINCT AST nodes, judge-confirmed); private/ public/protected keyword-state visibility model [44]; compose Outer::Inner [50]. Judge: AGREE / SHIP-AS-IS. Local-only; not pushed. Co-Authored-By: Claude Opus 4.8 (1M context) (cherry picked from commit 825c92d71c686234828215102da60eb584e0de7b) --- .../parsers/ruby/function_extractor.py | 222 +++++++++++++++--- .../ruby/test_function_extractor_u11.py | 140 +++++++++++ 2 files changed, 333 insertions(+), 29 deletions(-) create mode 100644 libs/openant-core/tests/parsers/ruby/test_function_extractor_u11.py diff --git a/libs/openant-core/parsers/ruby/function_extractor.py b/libs/openant-core/parsers/ruby/function_extractor.py index 798945bb..497a8c5e 100644 --- a/libs/openant-core/parsers/ruby/function_extractor.py +++ b/libs/openant-core/parsers/ruby/function_extractor.py @@ -127,15 +127,23 @@ def _get_parameters(self, node, source: bytes) -> List[str]: def _classify_function(self, func_name: str, class_name: Optional[str], module_name: Optional[str], is_singleton: bool, - file_path: str) -> str: + file_path: str, visibility: str = 'public') -> str: """Classify a function by its type/purpose.""" path_lower = file_path.lower() + # is_singleton must be checked BEFORE the initialize/constructor branch: + # `def self.initialize` is a class-level singleton method, not the + # instance constructor (Ruby's constructor is the instance `initialize`). + if is_singleton: + return 'singleton_method' + if func_name == 'initialize': return 'constructor' - if is_singleton: - return 'singleton_method' + # Explicit Ruby visibility keyword (`private` / `protected`) shadows the + # leading-underscore heuristic below. + if class_name and visibility in ('private', 'protected'): + return f'{visibility}_method' # Callbacks if func_name.startswith(('before_', 'after_', 'around_')): @@ -209,31 +217,89 @@ def _extract_imports(self, tree, source: bytes) -> Dict[str, str]: return imports + def _body_node(self, node): + """Return a node's body (`body` field, else a `body_statement` child).""" + body_node = node.child_by_field_name('body') + if body_node is None: + for child in node.children: + if child.type == 'body_statement': + body_node = child + break + return body_node + + def _push_body_in_order(self, stack, body_node, class_name, module_name) -> None: + """Push a class/module body's children, tracking the `private`/`public`/ + `protected` visibility keyword state across the body in document order. + + Pushed reversed so the LIFO stack pops them in document order; each child + carries the visibility in effect at its position. + """ + visibility = 'public' + annotated = [] + for child in body_node.children: + # Bare visibility keyword (no arguments) flips the body-level state. + if child.type == 'identifier': + kw = self._node_text(child, source=self._current_source) + if kw in ('private', 'public', 'protected'): + visibility = kw + continue + annotated.append((child, class_name, module_name, visibility)) + for entry in reversed(annotated): + stack.append(entry) + def _extract_functions_from_tree(self, tree, source: bytes, file_path: Path, relative_path: str) -> None: """Extract all method definitions from a parsed tree.""" - # Stack-based traversal: (node, class_name, module_name) - stack = [(tree.root_node, None, None)] + # `_push_body_in_order` reads source for visibility keywords. + self._current_source = source + # Stack-based traversal: (node, class_name, module_name, visibility) + stack = [(tree.root_node, None, None, 'public')] while stack: - node, class_name, module_name = stack.pop() + node, class_name, module_name, visibility = stack.pop() - if node.type == 'method': + if node.type in ('method', 'singleton_method'): self._process_method_node( node, source, relative_path, class_name, module_name, - is_singleton=False + is_singleton=(node.type == 'singleton_method'), + visibility=visibility, ) + # A nested `def` lives in the method's body — keep traversing so + # methods defined inside another method are not lost. Nested defs + # inherit the enclosing class but default to public visibility. + body_node = self._body_node(node) + if body_node: + for child in reversed(body_node.children): + stack.append((child, class_name, module_name, 'public')) + continue - elif node.type == 'singleton_method': - self._process_method_node( - node, source, relative_path, class_name, module_name, - is_singleton=True + elif node.type == 'alias': + # `alias ` — register the new name as a method. + self._process_alias_node( + node, source, relative_path, class_name, module_name, visibility ) + continue + + elif node.type == 'call': + # Metaprogramming: `define_method(:name) {...}` and + # `alias_method :new, :old` define methods via a call node. + if self._process_call_definition( + node, source, relative_path, class_name, module_name, visibility + ): + continue + for child in reversed(node.children): + stack.append((child, class_name, module_name, visibility)) elif node.type == 'class': # Extract class name name_node = node.child_by_field_name('name') - new_class_name = self._node_text(name_node, source) if name_node else None + local_class_name = self._node_text(name_node, source) if name_node else None + # Compose with the enclosing class so a nested class keeps its + # outer qualifier, e.g. `Outer::Inner`. + if local_class_name and class_name: + new_class_name = f"{class_name}::{local_class_name}" + else: + new_class_name = local_class_name # Extract superclass superclass = None @@ -271,11 +337,10 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: Path, } self.stats['total_classes'] += 1 - # Recurse into class body with updated class_name + # Recurse into class body with updated class_name + visibility tracking body_node = node.child_by_field_name('body') if body_node: - for child in reversed(body_node.children): - stack.append((child, new_class_name, module_name)) + self._push_body_in_order(stack, body_node, new_class_name, module_name) continue # Don't walk children again elif node.type == 'module': @@ -288,26 +353,18 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: Path, new_module_name = self._node_text(name_node, source) if name_node else module_name # Recurse into module body - body_node = node.child_by_field_name('body') - if body_node is None: - # Try finding body_statement child - for child in node.children: - if child.type == 'body_statement': - body_node = child - break - + body_node = self._body_node(node) if body_node: - for child in reversed(body_node.children): - stack.append((child, class_name, new_module_name)) + self._push_body_in_order(stack, body_node, class_name, new_module_name) continue # Don't walk children again else: for child in reversed(node.children): - stack.append((child, class_name, module_name)) + stack.append((child, class_name, module_name, visibility)) def _process_method_node(self, node, source: bytes, relative_path: str, class_name: Optional[str], module_name: Optional[str], - is_singleton: bool) -> None: + is_singleton: bool, visibility: str = 'public') -> None: """Process a single method or singleton_method node.""" name = self._get_method_name(node, source) if not name: @@ -319,7 +376,7 @@ def _process_method_node(self, node, source: bytes, relative_path: str, parameters = self._get_parameters(node, source) unit_type = self._classify_function( - name, class_name, module_name, is_singleton, relative_path + name, class_name, module_name, is_singleton, relative_path, visibility ) # Build qualified name and function ID @@ -359,6 +416,113 @@ def _process_method_node(self, node, source: bytes, relative_path: str, self.stats['by_type'][unit_type] = self.stats['by_type'].get(unit_type, 0) + 1 + def _register_synthetic_method(self, name: str, node, source: bytes, + relative_path: str, class_name: Optional[str], + module_name: Optional[str], visibility: str) -> None: + """Register a method defined by metaprogramming (define_method / alias / + alias_method), where there is no `method` AST node to mine. + + `node` supplies the source span/lines; parameters are unknown so they are + recorded empty. + """ + if not name: + return + + unit_type = self._classify_function( + name, class_name, module_name, False, relative_path, visibility + ) + + if class_name: + qualified_name = f"{class_name}.{name}" + elif module_name: + qualified_name = f"{module_name}.{name}" + else: + qualified_name = name + + func_id = f"{relative_path}:{qualified_name}" + if func_id in self.functions: + return # don't clobber a real def of the same name + + self.functions[func_id] = { + 'name': name, + 'qualified_name': qualified_name, + 'file_path': relative_path, + 'start_line': node.start_point[0] + 1, + 'end_line': node.end_point[0] + 1, + 'code': self._node_text(node, source), + 'class_name': class_name, + 'module_name': module_name, + 'parameters': [], + 'is_singleton': False, + 'unit_type': unit_type, + } + self.stats['total_functions'] += 1 + if class_name: + self.stats['total_methods'] += 1 + else: + self.stats['standalone_functions'] += 1 + self.stats['by_type'][unit_type] = self.stats['by_type'].get(unit_type, 0) + 1 + + def _process_alias_node(self, node, source: bytes, relative_path: str, + class_name: Optional[str], module_name: Optional[str], + visibility: str) -> None: + """Handle an `alias ` keyword node: register the new name.""" + # The `alias` node holds two name children (identifier / symbol); the + # first is the new (aliased) name. + names = [c for c in node.children + if c.type in ('identifier', 'simple_symbol', 'symbol', 'constant')] + if not names: + return + new_name = self._node_text(names[0], source).lstrip(':') + self._register_synthetic_method( + new_name, node, source, relative_path, class_name, module_name, visibility + ) + + def _process_call_definition(self, node, source: bytes, relative_path: str, + class_name: Optional[str], module_name: Optional[str], + visibility: str) -> bool: + """Handle method-defining call nodes (`define_method(:x)`, `alias_method + :new, :old`). Returns True if the call was a method definition (so the + traversal should not recurse into it). + """ + method_node = None + for child in node.children: + if child.type == 'identifier': + method_node = child + break + if method_node is None: + return False + call_name = self._node_text(method_node, source) + if call_name not in ('define_method', 'alias_method'): + return False + + arg_list = None + for child in node.children: + if child.type == 'argument_list': + arg_list = child + break + if arg_list is None: + return False + + # First symbol/string argument is the method name being defined. + sym_args = [c for c in arg_list.children + if c.type in ('simple_symbol', 'symbol', 'string')] + if not sym_args: + return False + first = sym_args[0] + if first.type == 'string': + new_name = ''.join( + self._node_text(sc, source) + for sc in first.children if sc.type == 'string_content' + ) + else: + new_name = self._node_text(first, source).lstrip(':') + + self._register_synthetic_method( + new_name, node, source, relative_path, class_name, module_name, visibility + ) + return True + def process_file(self, file_path: Path) -> None: """Process a single Ruby file.""" source = self.read_file(file_path) diff --git a/libs/openant-core/tests/parsers/ruby/test_function_extractor_u11.py b/libs/openant-core/tests/parsers/ruby/test_function_extractor_u11.py new file mode 100644 index 00000000..2569933d --- /dev/null +++ b/libs/openant-core/tests/parsers/ruby/test_function_extractor_u11.py @@ -0,0 +1,140 @@ +"""Regression tests for the Ruby function_extractor (u11 blind-fix batch). + +Six confirmed bugs in parsers/ruby/function_extractor.py, each driven through the +REAL FunctionExtractor on a temp .rb file so the parse/extract shape matches +production exactly. Assertions are on the exported `functions` dict. + + [6] nested `def` inside a `def` body is never extracted (assert present) + [18] `def self.initialize` mis-typed 'constructor' (assert singleton_method + is_singleton) + [24] `define_method(:sym){}` not registered (assert sym present) + [42] `alias`/`alias_method` aliased name not registered (assert new name present) + [44] method after bare `private` not marked private (assert unit_type private_method) + [50] method in nested class gets bare class_name (assert class_name 'Outer::Inner') +""" + +import sys +import tempfile +from pathlib import Path + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.ruby.function_extractor import FunctionExtractor + + +def _extract(src: str, name: str = "m.rb") -> dict: + """Write one Ruby source to a temp repo, run the real extractor, return functions.""" + repo = Path(tempfile.mkdtemp()) + (repo / name).write_text(src) + extractor = FunctionExtractor(str(repo)) + output = extractor.extract_all([name]) + return output["functions"] + + +def _find(functions: dict, suffix: str): + """Return the single func_data whose id ends with `suffix`, or None.""" + matches = [v for k, v in functions.items() if k.endswith(suffix)] + assert len(matches) <= 1, f"expected <=1 func id ending {suffix!r}, got {list(functions)}" + return matches[0] if matches else None + + +# ------------------------------------------------------------------ [6] nested def +def test_nested_def_is_extracted(): + """A `def inner` nested in `def outer`'s body must be extracted as a unit.""" + functions = _extract("def outer\n def inner\n 1\n end\nend\n") + assert _find(functions, ":inner") is not None, ( + f"nested def 'inner' missing; got {list(functions)}" + ) + # outer must still be present (no double-count regression) + assert _find(functions, ":outer") is not None + + +# ------------------------------------------------------ [18] self.initialize ordering +def test_self_initialize_is_singleton_not_constructor(): + """`def self.initialize` is a class singleton method, not the instance constructor.""" + functions = _extract( + "class Widget\n def self.initialize\n @count = 0\n end\nend\n", + "widget.rb", + ) + fn = _find(functions, ":Widget.initialize") + assert fn is not None, f"Widget.initialize missing; got {list(functions)}" + assert fn["is_singleton"] is True, f"expected is_singleton True; got {fn['is_singleton']}" + assert fn["unit_type"] == "singleton_method", ( + f"expected unit_type singleton_method; got {fn['unit_type']!r}" + ) + + +# ----------------------------------------------------------------- [24] define_method +def test_define_method_registers_symbol(): + """`define_method(:render){...}` must register `render` as a method unit.""" + functions = _extract( + 'def ctrl\n 1\nend\n\nclass Widget\n define_method(:render) do\n "x"\n end\nend\n' + ) + assert _find(functions, ":ctrl") is not None, "in-file control 'ctrl' missing (file parse)" + fn = _find(functions, ":render") or _find(functions, ".render") + assert fn is not None, f"define_method 'render' missing; got {list(functions)}" + assert fn["name"] == "render", f"expected name 'render'; got {fn['name']!r}" + assert fn["class_name"] == "Widget", f"expected class_name Widget; got {fn['class_name']!r}" + + +# ------------------------------------------------------------------------ [42] alias +def test_alias_keyword_registers_new_name(): + """`alias greet hello` must register `greet` as a method node.""" + functions = _extract( + 'def control\n 1\nend\nclass Greeter\n def hello\n "hi"\n end\n' + " alias greet hello\nend\n" + ) + assert _find(functions, ":Greeter.hello") is not None, "control method 'hello' missing" + fn = _find(functions, ":Greeter.greet") + assert fn is not None, f"aliased name 'greet' missing; got {list(functions)}" + assert fn["name"] == "greet" + assert fn["class_name"] == "Greeter" + + +def test_alias_method_call_registers_new_name(): + """`alias_method :greet, :hello` must register `greet` (distinct AST node from `alias`).""" + functions = _extract( + 'class Greeter\n def hello\n "hi"\n end\n alias_method :greet, :hello\nend\n' + ) + fn = _find(functions, ":Greeter.greet") + assert fn is not None, f"alias_method name 'greet' missing; got {list(functions)}" + assert fn["name"] == "greet" + + +# ------------------------------------------------------------------- [44] private kw +def test_method_after_private_keyword_is_private(): + """A method following a bare `private` keyword must be unit_type private_method.""" + functions = _extract( + "class Foo\n private\n\n def secret\n 42\n end\nend\n", "foo.rb" + ) + fn = _find(functions, ":Foo.secret") + assert fn is not None, f"Foo.secret missing; got {list(functions)}" + assert fn["unit_type"] == "private_method", ( + f"expected unit_type private_method; got {fn['unit_type']!r}" + ) + + +def test_method_before_private_keyword_stays_public(): + """Precision guard: a method declared BEFORE `private` must stay public ('method').""" + functions = _extract( + "class Foo\n def open_api\n 1\n end\n\n private\n\n def secret\n 2\n end\nend\n", + "foo2.rb", + ) + pub = _find(functions, ":Foo.open_api") + assert pub is not None and pub["unit_type"] == "method", ( + f"public method mis-typed: {pub['unit_type'] if pub else None}" + ) + + +# ----------------------------------------------------------- [50] nested class_name +def test_nested_class_method_has_composed_class_name(): + """A method in `class Inner` nested in `class Outer` must have class_name 'Outer::Inner'.""" + functions = _extract( + "class Outer\n class Inner\n def deep_method\n 1\n end\n end\nend\n", + "n.rb", + ) + fn = _find(functions, ".deep_method") + assert fn is not None, f"deep_method missing; got {list(functions)}" + assert fn["class_name"] == "Outer::Inner", ( + f"expected class_name 'Outer::Inner'; got {fn['class_name']!r}" + ) From f69717c336ae36929520c03d3a0e20cdab86de6f Mon Sep 17 00:00:00 2001 From: Gadi Evron Date: Tue, 9 Jun 2026 23:31:28 +0300 Subject: [PATCH 3/3] fix(parsers/ruby): guard alias resolution to single unconditional binding (BUG-NEW 31) Ruby hunk of the alias-guard (python hunk ships in the python PR). Local-only; not pushed. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../parsers/ruby/call_graph_builder.py | 42 +++++++++++++++++-- .../ruby/test_call_graph_builder_u10.py | 39 +++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) diff --git a/libs/openant-core/parsers/ruby/call_graph_builder.py b/libs/openant-core/parsers/ruby/call_graph_builder.py index a53c432b..9cbb61d2 100644 --- a/libs/openant-core/parsers/ruby/call_graph_builder.py +++ b/libs/openant-core/parsers/ruby/call_graph_builder.py @@ -179,8 +179,20 @@ def _extract_calls_from_code(self, code: str, caller_id: str) -> Set[str]: # First pass: collect local-variable names (assignment LHS) and # method-object bindings (`m = method(:sym)` / proc / lambda). These # inform parenless-call precision [BUG 1] and `.call` resolution [BUG 31]. + # + # SINGLE-UNCONDITIONAL-ASSIGNMENT GUARD for method-object bindings: a + # binding is kept ONLY when the variable is assigned exactly once AND at + # the method/program top level (a direct statement of the body, not + # nested inside an `if`/`unless`/`while`/`case`/`begin`/block). A var + # assigned 2+ times (last-write-wins) or bound conditionally is a + # "maybe" binding; resolving its `.call` would assert a maybe as + # definite, so we drop it and let `m.call` go unresolved (no edge). + # `local_vars` (for parenless precision) is NOT narrowed -- any + # assignment target is still a local variable for the bare-identifier + # guard, regardless of how many times / where it is bound. local_vars: Set[str] = set() - method_object_bindings: Dict[str, str] = {} + assign_counts: Dict[str, int] = {} + top_level_binding: Dict[str, str] = {} stack = [tree.root_node] while stack: node = stack.pop() @@ -189,11 +201,21 @@ def _extract_calls_from_code(self, code: str, caller_id: str) -> Set[str]: if lhs is not None and lhs.type == 'identifier': var_name = self._node_str(lhs, code_bytes) local_vars.add(var_name) - bound = self._method_object_target(node, code_bytes) - if bound: - method_object_bindings[var_name] = bound + assign_counts[var_name] = assign_counts.get(var_name, 0) + 1 + if self._is_top_level_statement(node): + bound = self._method_object_target(node, code_bytes) + if bound: + top_level_binding[var_name] = bound stack.extend(reversed(node.children)) + # Keep a method-object binding only if it is single + unconditional: + # exactly one assignment to that name, and that binding is top-level. + method_object_bindings: Dict[str, str] = { + name: bound + for name, bound in top_level_binding.items() + if assign_counts.get(name, 0) == 1 + } + # Second pass: resolve calls and bare (parenless) identifier calls. stack = [tree.root_node] while stack: @@ -218,6 +240,18 @@ def _extract_calls_from_code(self, code: str, caller_id: str) -> Set[str]: def _node_str(self, node, source: bytes) -> str: return source[node.start_byte:node.end_byte].decode('utf-8', errors='replace') + @staticmethod + def _is_top_level_statement(node) -> bool: + """True if `node` is a direct statement of a method/program body. + + Top-level means the parent is the method body (`body_statement`) or the + file program node. Anything nested in an `if`/`then`/`else`/`while`/ + `case`/`begin`/block has some other parent, so it is conditional/looped + and not a single-unconditional binding. + """ + parent = node.parent + return parent is not None and parent.type in ('body_statement', 'program') + def _method_object_target(self, assignment_node, source: bytes) -> Optional[str]: """If RHS is method(:sym)/proc/lambda binding a named method, return the name. diff --git a/libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py b/libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py index fec23b9c..50ab3ddc 100644 --- a/libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py +++ b/libs/openant-core/tests/parsers/ruby/test_call_graph_builder_u10.py @@ -165,6 +165,45 @@ def test_method_object_call_is_an_edge(): ) +# ------------------------------------------- [31] single-unconditional GUARD +def test_method_object_reassignment_not_resolved(): + """GUARD (reassignment): `m = method(:a); m = method(:b); m.call` is + last-write-wins, so the binding is a "maybe". The guard must NOT assert a + definite edge to EITHER target — pinned behavior: no edge at all.""" + output, builder = _build( + { + "m.rb": "def a\n 1\nend\n\ndef b\n 2\nend\n\n" + "def caller_fn\n m = method(:a)\n m = method(:b)\n m.call\nend\n" + } + ) + caller = _fid(output, ":caller_fn") + a_id = _fid(output, "m.rb:a") + b_id = _fid(output, "m.rb:b") + edges = builder.call_graph[caller] + assert a_id not in edges and b_id not in edges, ( + f"reassigned method-object asserted a maybe-binding as definite: {edges}" + ) + + +def test_method_object_conditional_binding_not_resolved(): + """GUARD (conditional): a binding inside an `if`/`else` branch is not + unconditional, so `m.call` must NOT resolve (no edge to either target).""" + output, builder = _build( + { + "m.rb": "def a\n 1\nend\n\ndef b\n 2\nend\n\n" + "def caller_fn(cond)\n if cond\n m = method(:a)\n else\n" + " m = method(:b)\n end\n m.call\nend\n" + } + ) + caller = _fid(output, ":caller_fn") + a_id = _fid(output, "m.rb:a") + b_id = _fid(output, "m.rb:b") + edges = builder.call_graph[caller] + assert a_id not in edges and b_id not in edges, ( + f"conditional method-object resolved despite non-unconditional binding: {edges}" + ) + + # ------------------------------------------------ [49] substring import over-match def test_require_does_not_substring_match_longer_filename(): """require 'auth' must NOT match authentication.rb by substring.