From 3b9b28be8870d69de420a09bda3f682474343939 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 30 Apr 2026 11:46:17 -0400 Subject: [PATCH] fix(#1442): scan_*_references reads raw JSON metadata instead of decoded codec output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `gc.scan_hash_references` and `gc.scan_schema_references` were calling `table.to_arrays(attr_name)`, which routes through `Expression.to_dicts` (`expression.py:899`) and runs `decode_attribute` (`codecs.py:518`) for each value. For the storage codecs (`<blob@store>`, `<npy@store>`, `<object@store>`, `<attach@store>`, `<filepath@store>`) this means downloading the bytes from external storage and deserializing them into the codec's runtime type — a `numpy.ndarray`, an `NpyRef`, an `ObjectRef`, raw bytes, or a local file-path string — none of which satisfy `_extract_*_refs`'s `isinstance(value, dict) and "path" in value` check. Both helpers therefore silently returned empty reference sets. Every populated schema reported `hash_referenced: 0` and `schema_paths_referenced: 0`, so every external file looked orphaned to `scan()` and a subsequent `collect()` would have deleted live data. The broad `try/except Exception` around the loop never fired because no exception was raised — `_extract_*_refs` returns `[]` for unrecognized shapes by design. The intended design (per `reference/specs/type-system.md`) is for GC to operate on the raw stored JSON metadata, not the decoded payload. Replace `table.to_arrays(attr_name)` with `table.proj(attr_name).cursor(as_dict=True)` in both helpers. The cursor yields the JSON column value directly: a Python dict on PostgreSQL (JSONB auto-decoded) or a JSON string on MySQL. `_extract_*_refs` already handles both shapes (`gc.py:138` string branch, `gc.py:145` dict branch), so this is backend-agnostic with no adapter dispatch. Side effect — `scan` is now a metadata-only operation. 
Previously it downloaded every external blob just to deserialize and discard the result via the silent type mismatch; on a 1 TB schema that meant 1 TB of egress to produce `referenced: 0`. After this change, scan touches only the JSON column on the database. Custom-codec authors are unaffected: reference discovery operates on the raw stored metadata regardless of what the codec's `decode()` returns, so third-party codecs following the documented `encode`/`decode` contract get correct GC for free. Tests ----- The existing `tests/integration/test_gc.py` mocks `scan_hash_references`, `scan_schema_references`, and `list_*_paths` directly, so the production code path through `to_arrays` → `decode_attribute` was never exercised end-to-end. The mocked tests stay (they cover orchestration: composition with `list_*_paths`, dry-run vs real, stat-key shape, format strings). Add a `TestScanWithLiveData` class with three non-mocked end-to-end tests, one per structurally distinct decoded-value type: - `test_scan_finds_active_blob_reference` — `<blob@local>` (decode → ndarray) - `test_scan_finds_active_npy_reference` — `<npy@local>` (decode → NpyRef) - `test_scan_finds_active_object_reference` — `<object@local>` (decode → ObjectRef) Each declares a small manual table, inserts one row, and asserts `scan(schema, store_name='local')` reports the expected `*_referenced` count > 0. Verified to fail on the pre-fix code: `{'hash_referenced': 0, 'hash_stored': 1, 'hash_orphaned': 1, ...}`. Adjacent -------- Register `gc` in `_lazy_modules` (`src/datajoint/__init__.py`). The `gc.py` module docstring and the user docs at `how-to/garbage-collection.md` both invoke GC as `dj.gc.scan(...)`, which previously raised `AttributeError` because `gc` wasn't lazily exposed at the package level. Pattern matches the existing `"diagram": (".diagram", None)` entry. 
Out of scope ------------ GC remains non-transaction-safe even after this fix — there is a TOCTOU window between scan and delete during which a concurrent transaction could insert a row referencing what looks like an orphan. A two-phase retrieval/removal API (quarantine → grace window → purge) is the right remedy and will be tracked as a separate enhancement issue. Fixes #1442 --- src/datajoint/__init__.py | 3 + src/datajoint/gc.py | 22 ++++--- tests/integration/test_gc.py | 118 +++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 8 deletions(-) diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py index 05813e6ac..b1dba84e1 100644 --- a/src/datajoint/__init__.py +++ b/src/datajoint/__init__.py @@ -275,6 +275,9 @@ def FreeTable(conn_or_name, full_table_name: str | None = None) -> _FreeTable: "diagram": (".diagram", None), # Return the module itself # cli imports click "cli": (".cli", "cli"), + # gc — exposed lazily so `dj.gc.scan(...)` works as documented in gc.py + # and in the user docs (how-to/garbage-collection.md). + "gc": (".gc", None), # Return the module itself } diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index 7f083416b..8c87efd84 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -229,11 +229,14 @@ def scan_hash_references( if verbose: logger.info(f" Scanning {table_name}.{attr_name}") - # Fetch all values for this attribute + # Read raw JSON metadata via cursor — bypasses decode_attribute + # so we get the stored dict (PostgreSQL/JSONB) or JSON string + # (MySQL), not the decoded codec output. _extract_hash_refs + # handles both shapes. 
try: - values = table.to_arrays(attr_name) - for value in values: - for path, ref_store in _extract_hash_refs(value): + cursor = table.proj(attr_name).cursor(as_dict=True) + for row in cursor: + for path, ref_store in _extract_hash_refs(row[attr_name]): # Filter by store if specified if store_name is None or ref_store == store_name: referenced.add(path) @@ -291,11 +294,14 @@ def scan_schema_references( if verbose: logger.info(f" Scanning {table_name}.{attr_name}") - # Fetch all values for this attribute + # Read raw JSON metadata via cursor — bypasses decode_attribute + # so we get the stored dict (PostgreSQL/JSONB) or JSON string + # (MySQL), not the decoded codec output. _extract_schema_refs + # handles both shapes. try: - values = table.to_arrays(attr_name) - for value in values: - for path, ref_store in _extract_schema_refs(value): + cursor = table.proj(attr_name).cursor(as_dict=True) + for row in cursor: + for path, ref_store in _extract_schema_refs(row[attr_name]): # Filter by store if specified if store_name is None or ref_store == store_name: referenced.add(path) diff --git a/tests/integration/test_gc.py b/tests/integration/test_gc.py index 47ca0a96d..c9ea741bd 100644 --- a/tests/integration/test_gc.py +++ b/tests/integration/test_gc.py @@ -4,12 +4,43 @@ from unittest.mock import MagicMock, patch +import numpy as np import pytest +import datajoint as dj from datajoint import gc from datajoint.errors import DataJointError +# Tables used by TestScanWithLiveData. Defined at module scope so dj.Schema's +# context resolution can find them by class name; bound to a schema inside +# each fixture (see schema(...) calls below). 
+ + +class GcBlobTest(dj.Manual): + definition = """ + rid : int + --- + payload : <blob@local> + """ + + +class GcNpyTest(dj.Manual): + definition = """ + rid : int + --- + waveform : <npy@local> + """ + + +class GcObjectTest(dj.Manual): + definition = """ + rid : int + --- + results : <object@local> + """ + + class TestUsesHashStorage: """Tests for _uses_hash_storage helper function.""" @@ -347,3 +378,90 @@ def test_formats_collect_stats_actual(self): assert "Schema paths: 1" in result assert "2.00 MB" in result assert "Errors: 2" in result + + +class TestScanWithLiveData: + """End-to-end tests for gc.scan() against real schemas with external storage. + + Exercises the full production path: + scan_*_references → table.proj(attr).cursor() → raw JSON metadata. + + These are the regression tests that would have caught issue #1442 + (silent type mismatch when scan helpers iterated decoded codec outputs + instead of raw stored metadata). + """ + + @pytest.fixture + def schema_blob(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_blob" + schema = dj.Schema( + schema_name, + context={"GcBlobTest": GcBlobTest}, + connection=connection_test, + ) + schema(GcBlobTest) + yield schema + schema.drop() + + @pytest.fixture + def schema_npy(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_npy" + schema = dj.Schema( + schema_name, + context={"GcNpyTest": GcNpyTest}, + connection=connection_test, + ) + schema(GcNpyTest) + yield schema + schema.drop() + + @pytest.fixture + def schema_object(self, connection_test, prefix, mock_stores): + schema_name = f"{prefix}_test_gc_e2e_object" + schema = dj.Schema( + schema_name, + context={"GcObjectTest": GcObjectTest}, + connection=connection_test, + ) + schema(GcObjectTest) + yield schema + schema.drop() + + def test_scan_finds_active_blob_reference(self, schema_blob): + """scan() must report hash_referenced >= 1 for a populated column. 
+ + Decoded value type returned by BlobCodec.decode is numpy.ndarray, which + does not satisfy `_extract_hash_refs`'s dict/JSON-string check — this + test fails before the cursor-based fix in scan_hash_references. + """ + GcBlobTest.insert1({"rid": 1, "payload": np.arange(64, dtype="uint8")}) + + stats = gc.scan(schema_blob, store_name="local") + + assert stats["hash_referenced"] >= 1, f"scan should find the active reference; got {stats}" + + def test_scan_finds_active_npy_reference(self, schema_npy): + """scan() must report schema_paths_referenced >= 1 for a populated column. + + Decoded value type returned by NpyCodec.decode is NpyRef (lazy handle), + which does not satisfy `_extract_schema_refs`'s dict check — this test + fails before the cursor-based fix in scan_schema_references. + """ + GcNpyTest.insert1({"rid": 1, "waveform": np.arange(64, dtype="float32")}) + + stats = gc.scan(schema_npy, store_name="local") + + assert stats["schema_paths_referenced"] >= 1, f"scan should find the active reference; got {stats}" + + def test_scan_finds_active_object_reference(self, schema_object): + """scan() must report schema_paths_referenced >= 1 for a populated column. + + Decoded value type returned by ObjectCodec.decode is ObjectRef (lazy + handle), which does not satisfy `_extract_schema_refs`'s dict check — + this test fails before the cursor-based fix in scan_schema_references. + """ + GcObjectTest.insert1({"rid": 1, "results": b"hello-gc-test"}) + + stats = gc.scan(schema_object, store_name="local") + + assert stats["schema_paths_referenced"] >= 1, f"scan should find the active reference; got {stats}"