From 835f57ee3434f3a827ea62a3198aade62d79a795 Mon Sep 17 00:00:00 2001
From: Ari Angelo <hello@ari.nz>
Date: Tue, 12 May 2026 01:15:51 +0200
Subject: [PATCH 1/2] feat(tests): add parquet size checks and GeoJSON parity
 validation for HETA 1.2.0

---
 tests/aignostics/application/cli_test.py | 23 +++++++++++++++++++++--
 tests/constants_test.py                  |  6 ++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/tests/aignostics/application/cli_test.py b/tests/aignostics/application/cli_test.py
index c5d4a2e9..0673cd65 100644
--- a/tests/aignostics/application/cli_test.py
+++ b/tests/aignostics/application/cli_test.py
@@ -1111,8 +1111,8 @@ def test_cli_run_execute(runner: CliRunner, tmp_path: Path, record_property) ->
     results_dir = tmp_path / SPOT_1_FILENAME.replace(".tiff", "")
     assert results_dir.is_dir(), f"Expected directory {results_dir} not found"
     files_in_dir = list(results_dir.glob("*"))
-    assert len(files_in_dir) == 9, (
-        f"Expected 9 files in {results_dir}, but found {len(files_in_dir)}: {[f.name for f in files_in_dir]}"
+    assert len(files_in_dir) == 12, (
+        f"Expected 12 files in {results_dir}, but found {len(files_in_dir)}: {[f.name for f in files_in_dir]}"
     )
     print(f"Found files in {results_dir}:")
     for filename, expected_size, tolerance_percent in SPOT_1_EXPECTED_RESULT_FILES:
@@ -1133,6 +1133,25 @@ def test_cli_run_execute(runner: CliRunner, tmp_path: Path, record_property) ->
             f"({min_size} to {max_size} bytes, ±{tolerance_percent}% of {expected_size})"
         )
 
+    # Validate parquet <-> GeoJSON row count parity for the 3 paired outputs
+    import pandas as pd
+
+    parquet_geojson_pairs = [
+        ("tissue_qc_parquet_polygons.parquet", "tissue_qc_geojson_polygons.json"),
+        ("tissue_segmentation_parquet_polygons.parquet", "tissue_segmentation_geojson_polygons.json"),
+        ("cell_classification_parquet_polygons.parquet", "cell_classification_geojson_polygons.json"),
+    ]
+    for parquet_filename, geojson_filename in parquet_geojson_pairs:
+        parquet_path = results_dir / parquet_filename
+        geojson_path = results_dir / geojson_filename
+        parquet_row_count = len(pd.read_parquet(parquet_path))
+        with geojson_path.open() as f:
+            geojson_feature_count = len(json.load(f)["features"])
+        assert parquet_row_count == geojson_feature_count, (
+            f"Row count mismatch between {parquet_filename} ({parquet_row_count} rows) "
+            f"and {geojson_filename} ({geojson_feature_count} features)"
+        )
+
     # Validate the execute command exited successfully
     assert result.exit_code == 0
 
diff --git a/tests/constants_test.py b/tests/constants_test.py
index 0296cb0d..9aec7ee3 100644
--- a/tests/constants_test.py
+++ b/tests/constants_test.py
@@ -105,6 +105,9 @@
             ("tissue_segmentation_segmentation_map_image.tiff", 1783952, 10),
             ("tissue_segmentation_csv_class_information.csv", 446, 10),
             ("tissue_qc_csv_class_information.csv", 290, 10),
+            ("tissue_qc_parquet_polygons.parquet", 29049, 10),
+            ("tissue_segmentation_parquet_polygons.parquet", 56682, 10),
+            ("cell_classification_parquet_polygons.parquet", 838533, 10),
         ]
 
     case "staging":
@@ -146,6 +149,9 @@
             ("tissue_segmentation_segmentation_map_image.tiff", 1783952, 10),
             ("tissue_segmentation_csv_class_information.csv", 446, 10),
             ("tissue_qc_csv_class_information.csv", 290, 10),
+            ("tissue_qc_parquet_polygons.parquet", 29049, 10),
+            ("tissue_segmentation_parquet_polygons.parquet", 56682, 10),
+            ("cell_classification_parquet_polygons.parquet", 838533, 10),
         ]
 
     case _:

From 4bf84bb6c4814618aaa26a4e270db9dadf2d59e1 Mon Sep 17 00:00:00 2001
From: Ari Angelo <hello@ari.nz>
Date: Tue, 12 May 2026 09:44:17 +0200
Subject: [PATCH 2/2] chore(tests): add HETA 1.2.0 parquet outputs to SPOT_0
 and update gui_test file count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

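SPOT_0_EXPECTED_RESULT_FILES now lists the 3 new parquet artifacts
(tissue_qc, tissue_segmentation, cell_classification) from a HETA 1.2.0 run,
and the expected sizes of the 9 existing artifacts are re-baselined. The
same values are applied to both SPOT_0 lists in tests/constants_test.py.
gui_test updated to assert 12 result files and validate parquet↔GeoJSON row
count parity for all 3 paired outputs.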
---
 tests/aignostics/application/gui_test.py | 27 +++++++++++++--
 tests/constants_test.py                  | 42 ++++++++++++++----------
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/tests/aignostics/application/gui_test.py b/tests/aignostics/application/gui_test.py
index 59ba189e..0fd367f0 100644
--- a/tests/aignostics/application/gui_test.py
+++ b/tests/aignostics/application/gui_test.py
@@ -354,7 +354,7 @@ async def test_gui_download_dataset_via_application_to_run_cancel_to_find_back(
 @pytest.mark.flaky(retries=1, delay=5)
 @pytest.mark.timeout(timeout=60 * 10)
 @pytest.mark.sequential  # Helps on Linux with image analysis step otherwise timing out
-async def test_gui_run_download(  # noqa: PLR0915
+async def test_gui_run_download(  # noqa: PLR0914, PLR0915
     user: User, runner: CliRunner, tmp_path: Path, silent_logging: None, record_property
 ) -> None:
     """Test that the user can download a run result via the GUI."""
@@ -440,8 +440,8 @@ async def test_gui_run_download(  # noqa: PLR0915
 
         # Check for files in the results directory
         files_in_results_dir = list(results_dir.glob("*"))
-        assert len(files_in_results_dir) == 9, (
-            f"Expected 9 files in {results_dir}, but found {len(files_in_results_dir)}: "
+        assert len(files_in_results_dir) == 12, (
+            f"Expected 12 files in {results_dir}, but found {len(files_in_results_dir)}: "
             f"{[f.name for f in files_in_results_dir]}"
         )
 
@@ -464,6 +464,27 @@ async def test_gui_run_download(  # noqa: PLR0915
                 f"({min_size} to {max_size} bytes, ±{tolerance_percent}% of {expected_size})"
             )
 
+        # Validate parquet <-> GeoJSON row count parity for the 3 paired outputs
+        import json
+
+        import pandas as pd
+
+        parquet_geojson_pairs = [
+            ("tissue_qc_parquet_polygons.parquet", "tissue_qc_geojson_polygons.json"),
+            ("tissue_segmentation_parquet_polygons.parquet", "tissue_segmentation_geojson_polygons.json"),
+            ("cell_classification_parquet_polygons.parquet", "cell_classification_geojson_polygons.json"),
+        ]
+        for parquet_filename, geojson_filename in parquet_geojson_pairs:
+            parquet_path = results_dir / parquet_filename
+            geojson_path = results_dir / geojson_filename
+            parquet_row_count = len(pd.read_parquet(parquet_path))
+            with geojson_path.open() as f:
+                geojson_feature_count = len(json.load(f)["features"])
+            assert parquet_row_count == geojson_feature_count, (
+                f"Row count mismatch between {parquet_filename} ({parquet_row_count} rows) "
+                f"and {geojson_filename} ({geojson_feature_count} features)"
+            )
+
 
 @pytest.mark.integration
 @pytest.mark.sequential
diff --git a/tests/constants_test.py b/tests/constants_test.py
index 9aec7ee3..aa18676e 100644
--- a/tests/constants_test.py
+++ b/tests/constants_test.py
@@ -83,15 +83,18 @@
         # SPOT_0: uv run pytest tests/aignostics/application/gui_test.py::test_gui_run_download -s --no-cov
         # SPOT_1: uv run pytest tests/aignostics/application/cli_test.py::test_cli_run_execute -s --no-cov
         SPOT_0_EXPECTED_RESULT_FILES = [
-            ("tissue_qc_segmentation_map_image.tiff", 1642856, 10),
-            ("tissue_qc_geojson_polygons.json", 259955, 10),
-            ("tissue_segmentation_geojson_polygons.json", 887003, 10),
-            ("readout_generation_slide_readouts.csv", 303217, 10),
-            ("readout_generation_cell_readouts.csv", 1658344, 10),
-            ("cell_classification_geojson_polygons.json", 11218951, 10),
-            ("tissue_segmentation_segmentation_map_image.tiff", 2945078, 10),
-            ("tissue_segmentation_csv_class_information.csv", 452, 10),
-            ("tissue_qc_csv_class_information.csv", 285, 10),
+            ("tissue_qc_segmentation_map_image.tiff", 470150, 10),
+            ("tissue_qc_geojson_polygons.json", 171251, 10),
+            ("tissue_segmentation_geojson_polygons.json", 185516, 10),
+            ("readout_generation_slide_readouts.csv", 300205, 10),
+            ("readout_generation_cell_readouts.csv", 2417117, 10),
+            ("cell_classification_geojson_polygons.json", 16673412, 10),
+            ("tissue_segmentation_segmentation_map_image.tiff", 527264, 10),
+            ("tissue_segmentation_csv_class_information.csv", 443, 10),
+            ("tissue_qc_csv_class_information.csv", 286, 10),
+            ("tissue_qc_parquet_polygons.parquet", 34346, 10),
+            ("tissue_segmentation_parquet_polygons.parquet", 39185, 10),
+            ("cell_classification_parquet_polygons.parquet", 5476364, 10),
         ]
         SPOT_0_EXPECTED_CELLS_CLASSIFIED = (39798, 10)
 
@@ -127,15 +130,18 @@
 
         # See production block above for instructions on how to update these sizes.
         SPOT_0_EXPECTED_RESULT_FILES = [
-            ("tissue_qc_segmentation_map_image.tiff", 1642856, 10),
-            ("tissue_qc_geojson_polygons.json", 259955, 10),
-            ("tissue_segmentation_geojson_polygons.json", 887003, 10),
-            ("readout_generation_slide_readouts.csv", 303217, 10),
-            ("readout_generation_cell_readouts.csv", 1658344, 10),
-            ("cell_classification_geojson_polygons.json", 11218951, 10),
-            ("tissue_segmentation_segmentation_map_image.tiff", 2945078, 10),
-            ("tissue_segmentation_csv_class_information.csv", 452, 10),
-            ("tissue_qc_csv_class_information.csv", 285, 10),
+            ("tissue_qc_segmentation_map_image.tiff", 470150, 10),
+            ("tissue_qc_geojson_polygons.json", 171251, 10),
+            ("tissue_segmentation_geojson_polygons.json", 185516, 10),
+            ("readout_generation_slide_readouts.csv", 300205, 10),
+            ("readout_generation_cell_readouts.csv", 2417117, 10),
+            ("cell_classification_geojson_polygons.json", 16673412, 10),
+            ("tissue_segmentation_segmentation_map_image.tiff", 527264, 10),
+            ("tissue_segmentation_csv_class_information.csv", 443, 10),
+            ("tissue_qc_csv_class_information.csv", 286, 10),
+            ("tissue_qc_parquet_polygons.parquet", 34346, 10),
+            ("tissue_segmentation_parquet_polygons.parquet", 39185, 10),
+            ("cell_classification_parquet_polygons.parquet", 5476364, 10),
         ]
         SPOT_0_EXPECTED_CELLS_CLASSIFIED = (39798, 10)