From 3b9b28be8870d69de420a09bda3f682474343939 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 30 Apr 2026 11:46:17 -0400 Subject: [PATCH] fix(#1442): scan_*_references reads raw JSON metadata instead of decoded codec output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `gc.scan_hash_references` and `gc.scan_schema_references` were calling `table.to_arrays(attr_name)`, which routes through `Expression.to_dicts` (`expression.py:899`) and runs `decode_attribute` (`codecs.py:518`) for each value. For the storage codecs (`<blob@store>`, `<npy@store>`, `<object@store>`, `<attach@store>`, `<filepath@store>`) this means downloading the bytes from external storage and deserializing them into the codec's runtime type — a `numpy.ndarray`, an `NpyRef`, an `ObjectRef`, raw bytes, or a local file-path string — none of which satisfy `_extract_*_refs`'s `isinstance(value, dict) and "path" in value` check. Both helpers therefore silently returned empty reference sets. Every populated schema reported `hash_referenced: 0` and `schema_paths_referenced: 0`, so every external file looked orphaned to `scan()` and a subsequent `collect()` would have deleted live data. The broad `try/except Exception` around the loop never fired because no exception was raised — `_extract_*_refs` returns `[]` for unrecognized shapes by design. The intended design (per `reference/specs/type-system.md`) is for GC to operate on the raw stored JSON metadata, not the decoded payload. Replace `table.to_arrays(attr_name)` with `table.proj(attr_name).cursor(as_dict=True)` in both helpers. The cursor yields the JSON column value directly: a Python dict on PostgreSQL (JSONB auto-decoded) or a JSON string on MySQL. `_extract_*_refs` already handles both shapes (`gc.py:138` string branch, `gc.py:145` dict branch), so this is backend-agnostic with no adapter dispatch. Side effect — `scan` is now a metadata-only operation. 
Previously it downloaded every external blob just to deserialize and discard the result via the silent type mismatch; on a 1 TB schema that meant 1 TB of egress to produce `referenced: 0`. After this change, scan touches only the JSON column on the database. Custom-codec authors are unaffected: reference discovery operates on the raw stored metadata regardless of what the codec's `decode()` returns, so third-party codecs following the documented `encode`/`decode` contract get correct GC for free. Tests ----- The existing `tests/integration/test_gc.py` mocks `scan_hash_references`, `scan_schema_references`, and `list_*_paths` directly, so the production code path through `to_arrays` → `decode_attribute` was never exercised end-to-end. The mocked tests stay (they cover orchestration: composition with `list_*_paths`, dry-run vs real, stat-key shape, format strings). Add a `TestScanWithLiveData` class with three non-mocked end-to-end tests, one per structurally distinct decoded-value type: - `test_scan_finds_active_blob_reference` — `<blob@local>` (decode → ndarray) - `test_scan_finds_active_npy_reference` — `<npy@local>` (decode → NpyRef) - `test_scan_finds_active_object_reference` — `<object@local>` (decode → ObjectRef) Each declares a small manual table, inserts one row, and asserts `scan(schema, store_name='local')` reports the expected `*_referenced` count > 0. Verified to fail on the pre-fix code: `{'hash_referenced': 0, 'hash_stored': 1, 'hash_orphaned': 1, ...}`. Adjacent -------- Register `gc` in `_lazy_modules` (`src/datajoint/__init__.py`). The `gc.py` module docstring and the user docs at `how-to/garbage-collection.md` both invoke GC as `dj.gc.scan(...)`, which previously raised `AttributeError` because `gc` wasn't lazily exposed at the package level. Pattern matches the existing `"diagram": (".diagram", None)` entry. 
Out of scope ------------ GC remains non-transaction-safe even after this fix — there is a TOCTOU window between scan and delete during which a concurrent transaction could insert a row referencing what looks like an orphan. A two-phase retrieval/removal API (quarantine → grace window → purge) is the right remedy and will be tracked as a separate enhancement issue. Fixes #1442 --- src/datajoint/__init__.py | 3 + src/datajoint/gc.py | 22 ++++--- tests/integration/test_gc.py | 118 +++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 8 deletions(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 05813e6ac..b1dba84e1 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -275,6 +275,9 @@ def FreeTable(conn_or_name, full_table_name: str | None = None) -> _FreeTable: "diagram": (".diagram", None), # Return the module itself # cli imports click "cli": (".cli", "cli"), + # gc — exposed lazily so `dj.gc.scan(...)` works as documented in gc.py + # and in the user docs (how-to/garbage-collection.md). + "gc": (".gc", None), # Return the module itself } diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index 7f083416b..8c87efd84 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -229,11 +229,14 @@ def scan_hash_references( if verbose: logger.info(f" Scanning {table_name}.{attr_name}") - # Fetch all values for this attribute + # Read raw JSON metadata via cursor — bypasses decode_attribute + # so we get the stored dict (PostgreSQL/JSONB) or JSON string + # (MySQL), not the decoded codec output. _extract_hash_refs + # handles both shapes. 
try: - values = table.to_arrays(attr_name) - for value in values: - for path, ref_store in _extract_hash_refs(value): + cursor = table.proj(attr_name).cursor(as_dict=True) + for row in cursor: + for path, ref_store in _extract_hash_refs(row[attr_name]): # Filter by store if specified if store_name is None or ref_store == store_name: referenced.add(path) @@ -291,11 +294,14 @@ def scan_schema_references( if verbose: logger.info(f" Scanning {table_name}.{attr_name}") - # Fetch all values for this attribute + # Read raw JSON metadata via cursor — bypasses decode_attribute + # so we get the stored dict (PostgreSQL/JSONB) or JSON string + # (MySQL), not the decoded codec output. _extract_schema_refs + # handles both shapes. try: - values = table.to_arrays(attr_name) - for value in values: - for path, ref_store in _extract_schema_refs(value): + cursor = table.proj(attr_name).cursor(as_dict=True) + for row in cursor: + for path, ref_store in _extract_schema_refs(row[attr_name]): # Filter by store if specified if store_name is None or ref_store == store_name: referenced.add(path) diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index 47ca0a96d..c9ea741bd 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -4,12 +4,43 @@ from unittest.mock import MagicMock, patch +import numpy as np import pytest +import datajoint as dj from datajoint import gc from datajoint.errors import DataJointError +# Tables used by TestScanWithLiveData. Defined at module scope so dj.Schema's +# context resolution can find them by class name; bound to a schema inside +# each fixture (see schema(...) calls below). 
+ + +class GcBlobTest(dj.Manual): + definition = """ + rid : int + --- + payload : <blob@local> + """ + + +class GcNpyTest(dj.Manual): + definition = """ + rid : int + --- + waveform : <npy@local> + """ + + +class GcObjectTest(dj.Manual): + definition = """ + rid : int + --- + results : <object@local> + """ + + class TestUsesHashStorage: """Tests for _uses_hash_storage helper function.""" @@ -347,3 +378,90 @@ def test_formats_collect_stats_actual(self): assert "Schema paths: 1" in result assert "2.00 MB" in result assert "Errors: 2" in result + + +class TestScanWithLiveData: + """End-to-end tests for gc.scan() against real schemas with external storage. + + Exercises the full production path: + scan_*_references → table.proj(attr).cursor() → raw JSON metadata. + + These are the regression tests that would have caught issue #1442 + (silent type mismatch when scan helpers iterated decoded codec outputs + instead of raw stored metadata). + """ + + @pytest.fixture + def schema_blob(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_blob" + schema = dj.Schema( + schema_name, + context={"GcBlobTest": GcBlobTest}, + connection=connection_test, + ) + schema(GcBlobTest) + yield schema + schema.drop() + + @pytest.fixture + def schema_npy(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_npy" + schema = dj.Schema( + schema_name, + context={"GcNpyTest": GcNpyTest}, + connection=connection_test, + ) + schema(GcNpyTest) + yield schema + schema.drop() + + @pytest.fixture + def schema_object(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_object" + schema = dj.Schema( + schema_name, + context={"GcObjectTest": GcObjectTest}, + connection=connection_test, + ) + schema(GcObjectTest) + yield schema + schema.drop() + + def test_scan_finds_active_blob_reference(self, schema_blob): + """scan() must report hash_referenced >= 1 for a populated column. 
+ + Decoded value type returned by BlobCodec.decode is numpy.ndarray, which + does not satisfy `_extract_hash_refs`'s dict/JSON-string check — this + test fails before the cursor-based fix in scan_hash_references. + """ + GcBlobTest.insert1({"rid": 1, "payload": np.arange(64, dtype="uint8")}) + + stats = gc.scan(schema_blob, store_name="local") + + assert stats["hash_referenced"] >= 1, f"scan should find the active reference; got {stats}" + + def test_scan_finds_active_npy_reference(self, schema_npy): + """scan() must report schema_paths_referenced >= 1 for a populated column. + + Decoded value type returned by NpyCodec.decode is NpyRef (lazy handle), + which does not satisfy `_extract_schema_refs`'s dict check — this test + fails before the cursor-based fix in scan_schema_references. + """ + GcNpyTest.insert1({"rid": 1, "waveform": np.arange(64, dtype="float32")}) + + stats = gc.scan(schema_npy, store_name="local") + + assert stats["schema_paths_referenced"] >= 1, f"scan should find the active reference; got {stats}" + + def test_scan_finds_active_object_reference(self, schema_object): + """scan() must report schema_paths_referenced >= 1 for a populated column. + + Decoded value type returned by ObjectCodec.decode is ObjectRef (lazy + handle), which does not satisfy `_extract_schema_refs`'s dict check — + this test fails before the cursor-based fix in scan_schema_references. + """ + GcObjectTest.insert1({"rid": 1, "results": b"hello-gc-test"}) + + stats = gc.scan(schema_object, store_name="local") + + assert stats["schema_paths_referenced"] >= 1, f"scan should find the active reference; got {stats}"