Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/datajoint/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,9 @@ def FreeTable(conn_or_name, full_table_name: str | None = None) -> _FreeTable:
"diagram": (".diagram", None), # Return the module itself
# cli imports click
"cli": (".cli", "cli"),
# gc — exposed lazily so `dj.gc.scan(...)` works as documented in gc.py
# and in the user docs (how-to/garbage-collection.md).
"gc": (".gc", None), # Return the module itself
}


Expand Down
22 changes: 14 additions & 8 deletions src/datajoint/gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,14 @@ def scan_hash_references(
if verbose:
logger.info(f" Scanning {table_name}.{attr_name}")

# Fetch all values for this attribute
# Read raw JSON metadata via cursor — bypasses decode_attribute
# so we get the stored dict (PostgreSQL/JSONB) or JSON string
# (MySQL), not the decoded codec output. _extract_hash_refs
# handles both shapes.
try:
values = table.to_arrays(attr_name)
for value in values:
for path, ref_store in _extract_hash_refs(value):
cursor = table.proj(attr_name).cursor(as_dict=True)
for row in cursor:
for path, ref_store in _extract_hash_refs(row[attr_name]):
# Filter by store if specified
if store_name is None or ref_store == store_name:
referenced.add(path)
Expand Down Expand Up @@ -291,11 +294,14 @@ def scan_schema_references(
if verbose:
logger.info(f" Scanning {table_name}.{attr_name}")

# Fetch all values for this attribute
# Read raw JSON metadata via cursor — bypasses decode_attribute
# so we get the stored dict (PostgreSQL/JSONB) or JSON string
# (MySQL), not the decoded codec output. _extract_schema_refs
# handles both shapes.
try:
values = table.to_arrays(attr_name)
for value in values:
for path, ref_store in _extract_schema_refs(value):
cursor = table.proj(attr_name).cursor(as_dict=True)
for row in cursor:
for path, ref_store in _extract_schema_refs(row[attr_name]):
# Filter by store if specified
if store_name is None or ref_store == store_name:
referenced.add(path)
Expand Down
118 changes: 118 additions & 0 deletions tests/integration/test_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,43 @@

from unittest.mock import MagicMock, patch

import numpy as np
import pytest

import datajoint as dj
from datajoint import gc
from datajoint.errors import DataJointError


# Tables used by TestScanWithLiveData. Defined at module scope so dj.Schema's
# context resolution can find them by class name; bound to a schema inside
# each fixture (see schema(...) calls below).


class GcBlobTest(dj.Manual):
    """Manual table whose ``payload`` uses hash-addressed external storage.

    ``<blob@local>`` columns are the ones ``scan_hash_references`` must find
    (reported via ``hash_referenced`` in gc.scan stats).
    """

    definition = """
    rid : int
    ---
    payload : <blob@local>
    """


class GcNpyTest(dj.Manual):
    """Manual table whose ``waveform`` uses path-addressed external storage.

    ``<npy@local>`` columns are found by ``scan_schema_references``
    (reported via ``schema_paths_referenced`` in gc.scan stats).
    """

    definition = """
    rid : int
    ---
    waveform : <npy@local>
    """


class GcObjectTest(dj.Manual):
    """Manual table whose ``results`` uses path-addressed external storage.

    ``<object@local>`` columns are found by ``scan_schema_references``
    (reported via ``schema_paths_referenced`` in gc.scan stats).
    """

    definition = """
    rid : int
    ---
    results : <object@local>
    """


class TestUsesHashStorage:
"""Tests for _uses_hash_storage helper function."""

Expand Down Expand Up @@ -347,3 +378,90 @@ def test_formats_collect_stats_actual(self):
assert "Schema paths: 1" in result
assert "2.00 MB" in result
assert "Errors: 2" in result


class TestScanWithLiveData:
    """End-to-end tests for gc.scan() against real schemas with external storage.

    Exercises the full production path:
        scan_*_references -> table.proj(attr).cursor() -> raw JSON metadata.

    These are the regression tests that would have caught issue #1442
    (silent type mismatch when scan helpers iterated decoded codec outputs
    instead of raw stored metadata).
    """

    @staticmethod
    def _bound_schema(table_cls, suffix, connection, prefix):
        """Yield a schema named ``{prefix}_test_gc_e2e_{suffix}`` with table_cls declared in it.

        Shared implementation for the three fixtures below. The try/finally
        guarantees the schema is dropped even if table declaration (or the
        test body) raises after the schema itself was created — without it,
        a failure in ``schema(table_cls)`` would leak the schema into the
        test database.
        """
        schema = dj.Schema(
            f"{prefix}_test_gc_e2e_{suffix}",
            context={table_cls.__name__: table_cls},
            connection=connection,
        )
        try:
            schema(table_cls)
            yield schema
        finally:
            schema.drop()

    # Each fixture depends on mock_stores so the "local" external store is
    # configured before any table is declared.

    @pytest.fixture
    def schema_blob(self, connection_test, prefix, mock_stores):
        yield from self._bound_schema(GcBlobTest, "blob", connection_test, prefix)

    @pytest.fixture
    def schema_npy(self, connection_test, prefix, mock_stores):
        yield from self._bound_schema(GcNpyTest, "npy", connection_test, prefix)

    @pytest.fixture
    def schema_object(self, connection_test, prefix, mock_stores):
        yield from self._bound_schema(GcObjectTest, "object", connection_test, prefix)

    def test_scan_finds_active_blob_reference(self, schema_blob):
        """scan() must report hash_referenced >= 1 for a populated <blob@> column.

        Decoded value type returned by BlobCodec.decode is numpy.ndarray, which
        does not satisfy `_extract_hash_refs`'s dict/JSON-string check — this
        test fails before the cursor-based fix in scan_hash_references.
        """
        GcBlobTest.insert1({"rid": 1, "payload": np.arange(64, dtype="uint8")})

        stats = gc.scan(schema_blob, store_name="local")

        assert stats["hash_referenced"] >= 1, f"scan should find the active <blob@> reference; got {stats}"

    def test_scan_finds_active_npy_reference(self, schema_npy):
        """scan() must report schema_paths_referenced >= 1 for a populated <npy@> column.

        Decoded value type returned by NpyCodec.decode is NpyRef (lazy handle),
        which does not satisfy `_extract_schema_refs`'s dict check — this test
        fails before the cursor-based fix in scan_schema_references.
        """
        GcNpyTest.insert1({"rid": 1, "waveform": np.arange(64, dtype="float32")})

        stats = gc.scan(schema_npy, store_name="local")

        assert stats["schema_paths_referenced"] >= 1, f"scan should find the active <npy@> reference; got {stats}"

    def test_scan_finds_active_object_reference(self, schema_object):
        """scan() must report schema_paths_referenced >= 1 for a populated <object@> column.

        Decoded value type returned by ObjectCodec.decode is ObjectRef (lazy
        handle), which does not satisfy `_extract_schema_refs`'s dict check —
        this test fails before the cursor-based fix in scan_schema_references.
        """
        GcObjectTest.insert1({"rid": 1, "results": b"hello-gc-test"})

        stats = gc.scan(schema_object, store_name="local")

        assert stats["schema_paths_referenced"] >= 1, f"scan should find the active <object@> reference; got {stats}"
Loading