diff --git a/README.md b/README.md index a0a382e..29e5637 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ should convince readers of the significance and relevance of your task. ## Authors & contributors -| Name | Roles | Orcid | Twitter | Github | Email | Linkedin | +| Name | Roles | Github | Twitter | Email | Orcid | Linkedin | |:---|:---|:---|:---|:---|:---|:---| -| John Doe | author, maintainer | 0000-0000-0000-0000 | johndoe | johndoe | john@doe.me | johndoe | +| John Doe | author, maintainer | johndoe | johndoe | john@doe.me | 0000-0000-0000-0000 | johndoe | ## API @@ -38,25 +38,32 @@ should convince readers of the significance and relevance of your task. flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] - file_spatial_dataset("Raw iST Dataset") + file_spatial_unlabelled("Unlabelled") + file_spatial_solution("Solution") file_scrnaseq_reference("scRNA-seq Reference") comp_control_method[/"Control Method"/] comp_method[/"Method"/] + comp_output_processor[/"Output processor"/] comp_metric[/"Metric"/] file_prediction("Predicted data") + file_processed_prediction("Processed prediction") file_score("Score") file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor - comp_data_processor-->file_spatial_dataset + comp_data_processor-->file_spatial_unlabelled + comp_data_processor-->file_spatial_solution comp_data_processor-->file_scrnaseq_reference - file_spatial_dataset---comp_control_method - file_spatial_dataset---comp_method - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + file_spatial_unlabelled---comp_control_method + file_spatial_unlabelled---comp_method + file_spatial_unlabelled---comp_output_processor + file_spatial_solution---comp_control_method + file_spatial_solution---comp_metric comp_control_method-->file_prediction comp_method-->file_prediction + comp_output_processor-->file_processed_prediction comp_metric-->file_score - 
file_prediction---comp_metric + file_prediction---comp_output_processor + file_processed_prediction---comp_metric file_common_scrnaseq---comp_data_processor ``` @@ -175,22 +182,25 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | -| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output_spatial_unlabelled` | `file` | (*Output*) Preprocessed spatial transcriptomics data without segmentation labels for method input. | +| `--output_spatial_solution` | `file` | (*Output*) Ground truth segmentation labels and cell assignments for method evaluation. | | `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | -## File format: Raw iST Dataset +## File format: Unlabelled -A spatial transcriptomics dataset, preprocessed for this benchmark. +Preprocessed spatial transcriptomics data without segmentation labels +for method input. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr` +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr` Description: -This dataset contains preprocessed images, labels, points, shapes, and -tables for spatial transcriptomics data. +This dataset contains preprocessed images and transcript point clouds +for spatial transcriptomics data. Ground truth segmentation labels are +intentionally excluded to prevent methods from cheating. 
Format: @@ -198,9 +208,7 @@ Format: SpatialData object images: 'morphology_mip' - labels: 'cell_labels', 'nucleus_labels' points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' tables: 'table' coordinate_systems: 'global' @@ -212,16 +220,9 @@ Data structure: *images* -| Name | Description | -|:-----------------|:--------------------| -| `morphology_mip` | The raw image data. | - -*labels* - -| Name | Description | -|:-----------------|:---------------------------------------| -| `cell_labels` | (*Optional*) Cell segmentation labels. | -| `nucleus_labels` | (*Optional*) Cell segmentation labels. | +| Name | Description | +|:-----------------|:---------------------------------------------------------| +| `morphology_mip` | The raw morphology image (maximum intensity projection). | *points* @@ -233,26 +234,9 @@ Data structure: | `y` | `float` | y-coordinate of the point. | | `z` | `float` | (*Optional*) z-coordinate of the point. | | `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | | `qv` | `float` | (*Optional*) Quality value of the point. | | `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | +| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with the nucleus (derived from morphology). 
| *tables* @@ -260,10 +244,8 @@ Data structure: | Slot | Type | Description | |:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. | -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | | `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | @@ -271,7 +253,7 @@ Data structure: | `uns["dataset_summary"]` | `string` | Short description of the dataset. | | `uns["dataset_description"]` | `string` | Long description of the dataset. | | `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | +| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). | *coordinate_systems* @@ -281,6 +263,88 @@ Data structure: +## File format: Solution + +Ground truth segmentation labels and cell assignments for method +evaluation. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr` + +Description: + +This dataset contains the ground truth cell and nucleus segmentation +labels, cell boundaries, and a reference table matching each cell to its +label region. + +Format: + +
+ + SpatialData object + labels: 'cell_labels', 'nucleus_labels' + points: 'transcripts' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'table' + +
+ +Data structure: + +
+ +*labels* + +| Name | Description | +|:-----------------|:-------------------------------------------------------| +| `cell_labels` | Ground truth cell segmentation labels. | +| `nucleus_labels` | (*Optional*) Ground truth nucleus segmentation labels. | + +*points* + +`transcripts`: Point cloud data of transcripts with ground truth cell +assignments. + +| Column | Type | Description | +|:---|:---|:---| +| `x` | `float` | x-coordinate of the point. | +| `y` | `float` | y-coordinate of the point. | +| `z` | `float` | (*Optional*) z-coordinate of the point. | +| `feature_name` | `categorical` | Name of the feature. | +| `cell_id` | `integer` | Ground truth cell assignment (0 = background). | +| `transcript_id` | `long` | Unique identifier of the transcript. | + +*shapes* + +`cell_boundaries`: Ground truth cell boundary shapes. + +| Column | Type | Description | +|:-----------|:---------|:-------------------------------| +| `geometry` | `object` | Geometry of the cell boundary. | + +`nucleus_boundaries`: Ground truth nucleus boundary shapes. + +| Column | Type | Description | +|:-----------|:---------|:----------------------------------| +| `geometry` | `object` | Geometry of the nucleus boundary. | + +*tables* + +`table`: Reference cell metadata table. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `integer` | Unique cell identifier, matching instance IDs in the label images. | +| `obs["region"]` | `string` | Name of the label image this cell belongs to (e.g. ‘cell_labels’). | +| `obs["cell_area"]` | `double` | (*Optional*) Area of the cell in pixels. | +| `obs["transcript_counts"]` | `integer` | (*Optional*) Total number of transcripts assigned to this cell. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. 
| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). | + +
+ ## File format: scRNA-seq Reference A single-cell reference dataset, preprocessed for this benchmark. @@ -346,8 +410,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--input` | `file` | Preprocessed spatial transcriptomics data without segmentation labels for method input. | +| `--input_solution` | `file` | Ground truth segmentation labels and cell assignments for method evaluation. | | `--output` | `file` | (*Output*) A predicted dataset as output by a method. | @@ -362,11 +426,27 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--input` | `file` | Preprocessed spatial transcriptomics data without segmentation labels for method input. | | `--output` | `file` | (*Output*) A predicted dataset as output by a method. | +## Component type: Output processor + +An output processor for the prediction. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_spatial_unlabelled` | `file` | Preprocessed spatial transcriptomics data without segmentation labels for method input. | +| `--output` | `file` | (*Output*) A processed predicted dataset, ready to be used as input for the evaluation. | + +
+ ## Component type: Metric A task template metric. @@ -377,8 +457,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_prediction` | `file` | A predicted dataset as output by a method. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--input_prediction` | `file` | A processed predicted dataset, ready to be used as input for the evaluation. | +| `--input_solution` | `file` | Ground truth segmentation labels and cell assignments for method evaluation. | | `--output` | `file` | (*Output*) File indicating the score of a metric. | @@ -388,7 +468,7 @@ Arguments: A predicted dataset as output by a method. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad` +`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr` Format: @@ -416,13 +496,59 @@ Data structure: | Slot | Type | Description | |:--------------------|:---------|:-------------------------------------| -| `obs["cell_id"]` | `string` | Cell ID. | -| `obs["region"]` | `string` | Region. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["method_id"]` | `string` | A unique identifier for the method. | +## File format: Processed prediction + +A processed predicted dataset, ready to be used as input for the +evaluation. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr` + +Format: + +
+ + SpatialData object + labels: 'segmentation' + tables: 'table' + +
+ +Data structure: + +
+ +*labels* + +| Name | Description | +|:---------------|:--------------------------| +| `segmentation` | Segmentation of the data. | + +*tables* + +`table`: AnnData table. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `string` | Cell ID. | +| `obs["region"]` | `string` | Region. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `layers["normalized_log"]` | `double` | Log1p normalized expression values. | +| `layers["normalized_log_scaled"]` | `double` | Log1p normalized expression values scaled to unit variance and zero mean. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ ## File format: Score File indicating the score of a metric. diff --git a/common b/common index 91a35a2..f0816e1 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 91a35a2e808da7029e222456c0e3ca70ab3a06dd +Subproject commit f0816e178a2b44749fdfb1a9cdfc76887dcf7462 diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 52ee226..4a921f8 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -16,9 +16,9 @@ exit 1 cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml -rename_keys: 'input:output_dataset' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: '$id/state.yaml' -settings: '{"output_spatial_dataset": "$id/output_spatial_dataset.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' +settings: '{"output_spatial_unlabelled": "$id/output_spatial_unlabelled.zarr", "output_spatial_solution": "$id/output_spatial_solution.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index b11d437..59fc1c2 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -24,7 +24,8 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \ --input_sc $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ - --output_spatial_dataset $DATASET_DIR/spatial_dataset.zarr \ + --output_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \ + --output_spatial_solution $DATASET_DIR/spatial_solution.zarr \ --output_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ --dataset_id 
mouse_brain_combined \ --dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \ @@ -36,22 +37,29 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \ # run one method viash run src/methods/cellpose/config.vsh.yaml -- \ - --input $DATASET_DIR/spatial_dataset.zarr \ - --output $DATASET_DIR/prediction.h5ad + --input $DATASET_DIR/spatial_unlabelled.zarr \ + --output $DATASET_DIR/prediction.zarr + +# run prediction processor +viash run src/data_processors/process_prediction/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/prediction.zarr \ + --input_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \ + --output $DATASET_DIR/processed_prediction.zarr # run one metric -# TODO: implement this! -# viash run src/metrics/ari/config.vsh.yaml -- \ -# --input_prediction $DATASET_DIR/prediction.h5ad \ -# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ -# --output $DATASET_DIR/score.h5ad +viash run src/metrics/ari/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/processed_prediction.zarr \ + --input_solution $DATASET_DIR/spatial_solution.zarr \ + --output $DATASET_DIR/score.h5ad # write manual state.yaml. 
this is not actually necessary but you never know it might be useful cat > $DATASET_DIR/state.yaml << HERE id: $DATASET_ID -spatial_dataset: spatial_dataset.zarr +spatial_unlabelled: spatial_unlabelled.zarr +spatial_solution: spatial_solution.zarr scrnaseq_reference: scrnaseq_reference.h5ad -prediction: prediction.h5ad +prediction: prediction.zarr +processed_prediction: processed_prediction.zarr score: score.h5ad HERE diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 26bba56..808df7f 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 3c31e74..745aa77 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git 
a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 3f4fa2e..990602d 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -13,11 +13,11 @@ info: in the task. arguments: - name: --input - __merge__: file_spatial_dataset.yaml + __merge__: file_spatial_unlabelled.yaml required: true direction: input - - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + - name: "--input_solution" + __merge__: file_spatial_solution.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index ecd3f9c..69deecc 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,8 +19,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: file_spatial_dataset.yaml + - name: "--output_spatial_unlabelled" + __merge__: file_spatial_unlabelled.yaml + direction: output + required: true + - name: "--output_spatial_solution" + __merge__: file_spatial_solution.yaml direction: output required: true - name: "--output_scrnaseq_reference" @@ -80,4 +84,3 @@ test_resources: dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2 - type: python_script path: /common/component_tests/run_and_check_output.py - diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 633a8a1..2acf7ef 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -8,7 +8,7 @@ info: A method to predict the task effects. arguments: - name: --input - __merge__: file_spatial_dataset.yaml + __merge__: file_spatial_unlabelled.yaml required: true direction: input - name: --output diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index a7470e9..6102919 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -8,11 +8,11 @@ info: A metric for evaluating method predictions. 
arguments: - name: "--input_prediction" - __merge__: file_prediction.yaml + __merge__: file_processed_prediction.yaml direction: input required: true - - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + - name: "--input_solution" + __merge__: file_spatial_solution.yaml direction: input required: true - name: "--output" diff --git a/src/api/comp_output_processor.yaml b/src/api/comp_output_processor.yaml new file mode 100644 index 0000000..a5db8ec --- /dev/null +++ b/src/api/comp_output_processor.yaml @@ -0,0 +1,30 @@ +namespace: "data_processors" +info: + type: data_processor + type_info: + label: Output processor + summary: An output processor for the prediction. + description: | + A component for a prediction dataset into a processed prediction dataset that can be evaluated by the metrics. +argument_groups: + - name: Inputs + arguments: + - name: "--input_prediction" + __merge__: file_prediction.yaml + required: true + direction: input + - name: "--input_spatial_unlabelled" + __merge__: file_spatial_unlabelled.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output" + __merge__: file_processed_prediction.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml index b1fc443..e72794b 100644 --- a/src/api/file_prediction.yaml +++ b/src/api/file_prediction.yaml @@ -1,6 +1,6 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr" label: "Predicted data" summary: A predicted dataset as output by a method. 
info: @@ -16,15 +16,6 @@ info: name: table description: AnnData table required: true - obs: - - type: string - name: cell_id - description: Cell ID - required: true - - type: string - name: region - description: Region - required: true uns: - type: string name: dataset_id diff --git a/src/api/file_processed_prediction.yaml b/src/api/file_processed_prediction.yaml new file mode 100644 index 0000000..52e49df --- /dev/null +++ b/src/api/file_processed_prediction.yaml @@ -0,0 +1,67 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr" +label: "Processed prediction" +summary: A processed predicted dataset, ready to be used as input for the evaluation. +info: + format: + type: spatialdata_zarr + labels: + - type: object + name: "segmentation" + description: Segmentation of the data + required: true + tables: + - type: anndata + name: table + description: AnnData table + required: true + # TODO: what is it that this component adds to the anndata? + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + - type: double + name: normalized_log + description: Log1p normalized expression values + required: true + - type: double + name: normalized_log_scaled + description: Log1p normalized expression values scaled to unit variance and zero mean + required: true + obs: + - type: string + name: cell_id + description: Cell ID + required: true + - type: string + name: region + description: Region + required: true + # .... cell info ... ? + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. 
+ required: true + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml index 9b855fd..c1da992 100644 --- a/src/api/file_scrnaseq_reference.yaml +++ b/src/api/file_scrnaseq_reference.yaml @@ -1,7 +1,5 @@ type: file example: "resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" label: "scRNA-seq Reference" summary: A single-cell reference dataset, preprocessed for this benchmark. description: | diff --git a/src/api/file_spatial_solution.yaml b/src/api/file_spatial_solution.yaml new file mode 100644 index 0000000..8552e48 --- /dev/null +++ b/src/api/file_spatial_solution.yaml @@ -0,0 +1,108 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr" +label: "Solution" +summary: Ground truth segmentation labels and cell assignments for method evaluation. +description: | + This dataset contains the ground truth cell and nucleus segmentation labels, + cell boundaries, and a reference table matching each cell to its label region. 
+info: + format: + type: spatialdata_zarr + points: + - type: dataframe + name: transcripts + description: Point cloud data of transcripts with ground truth cell assignments + required: true + columns: + - type: float + name: "x" + required: true + description: x-coordinate of the point + - type: float + name: "y" + required: true + description: y-coordinate of the point + - type: float + name: "z" + required: false + description: z-coordinate of the point + - type: categorical + name: feature_name + required: true + description: Name of the feature + - type: integer + name: cell_id + required: true + description: Ground truth cell assignment (0 = background) + - type: long + name: transcript_id + required: true + description: Unique identifier of the transcript + labels: + - type: object + name: "cell_labels" + description: Ground truth cell segmentation labels + required: true + - type: object + name: "nucleus_labels" + description: Ground truth nucleus segmentation labels + required: false + shapes: + - type: dataframe + name: "cell_boundaries" + description: Ground truth cell boundary shapes + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the cell boundary + - type: dataframe + name: "nucleus_boundaries" + description: Ground truth nucleus boundary shapes + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the nucleus boundary + tables: + - type: anndata + name: "table" + description: Reference cell metadata table + required: true + obs: + - type: integer + name: cell_id + description: Unique cell identifier, matching instance IDs in the label images + required: true + - type: string + name: region + description: Name of the label image this cell belongs to (e.g. 
'cell_labels') + required: true + - type: double + name: cell_area + description: Area of the cell in pixels + required: false + - type: integer + name: transcript_counts + description: Total number of transcripts assigned to this cell + required: false + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset + required: true + - type: string + name: orig_dataset_id + required: true + description: The identifier of the original dataset from which this dataset was derived (if applicable) + var: + - type: string + name: feature_id + required: false + description: Unique identifier for the feature, usually a ENSEMBL gene id. + - type: string + name: feature_name + required: true + description: A human-readable name for the feature, usually a gene symbol. diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_unlabelled.yaml similarity index 54% rename from src/api/file_spatial_dataset.yaml rename to src/api/file_spatial_unlabelled.yaml index 4c5253e..3d86b0f 100644 --- a/src/api/file_spatial_dataset.yaml +++ b/src/api/file_spatial_unlabelled.yaml @@ -1,28 +1,18 @@ type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" -label: "Raw iST Dataset" -summary: A spatial transcriptomics dataset, preprocessed for this benchmark. +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr" +label: "Unlabelled" +summary: Preprocessed spatial transcriptomics data without segmentation labels for method input. description: | - This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data. + This dataset contains preprocessed images and transcript point clouds for spatial transcriptomics data. 
+ Ground truth segmentation labels are intentionally excluded to prevent methods from cheating. info: format: type: spatialdata_zarr images: - type: object name: morphology_mip - description: The raw image data + description: The raw morphology image (maximum intensity projection) required: true - labels: - - type: object - name: "cell_labels" - description: Cell segmentation labels - required: false - - type: object - name: "nucleus_labels" - description: Cell segmentation labels - required: false points: - type: dataframe name: transcripts @@ -45,18 +35,6 @@ info: name: feature_name required: true description: Name of the feature - - type: integer - name: "cell_id" - required: false - description: Unique identifier of the cell - - type: integer - name: "nucleus_id" - required: false - description: Unique identifier of the nucleus - - type: string - name: "cell_type" - required: false - description: Cell type of the cell - type: float name: qv required: false @@ -68,26 +46,7 @@ info: - type: boolean name: overlaps_nucleus required: false - description: Whether the point overlaps with a nucleus - shapes: - - type: dataframe - name: "cell_boundaries" - description: Cell boundaries - required: false - columns: - - type: object - name: "geometry" - required: true - description: Geometry of the cell boundary - - type: dataframe - name: "nucleus_boundaries" - description: Nucleus boundaries - required: false - columns: - - type: object - name: "geometry" - required: true - description: Geometry of the nucleus boundary + description: Whether the point overlaps with the nucleus (derived from morphology) tables: - type: anndata name: "table" @@ -123,29 +82,18 @@ info: required: true description: The organism of the sample in the dataset - type: string - name: segmentation_id - required: true - multiple: true - description: A unique identifier for the segmentation - obs: - - type: string - name: cell_id + name: orig_dataset_id required: true - description: A unique 
identifier for the cell + description: The identifier of the original dataset from which this dataset was derived (if applicable) var: - type: string - name: gene_ids - required: true - description: Unique identifier for the gene + name: feature_id + required: false + description: Unique identifier for the feature, usually an ENSEMBL gene id. - type: string - name: feature_types - required: true - description: Type of the feature - obsm: - - type: double - name: spatial + name: feature_name required: true - description: Spatial coordinates of the cell + description: A human-readable name for the feature, usually a gene symbol. coordinate_systems: - type: object name: global diff --git a/src/control_methods/empty_labels/config.vsh.yaml b/src/control_methods/empty_labels/config.vsh.yaml new file mode 100644 index 0000000..126b095 --- /dev/null +++ b/src/control_methods/empty_labels/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml + +name: empty_labels +label: Empty Labels +summary: "A negative control that predicts no cells (all background)." +description: | + A negative control where no cells are detected — the segmentation label image + is filled with zeros (background). This represents a lower bound of performance + for any segmentation method and is useful for verifying that metrics can + distinguish between no prediction and a perfect prediction. 
+ +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: ../../base/setup_spatialdata_partial.yaml + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/empty_labels/script.py b/src/control_methods/empty_labels/script.py new file mode 100644 index 0000000..5bc705b --- /dev/null +++ b/src/control_methods/empty_labels/script.py @@ -0,0 +1,54 @@ +import anndata as ad +import numpy as np +import spatialdata as sd +from spatialdata.models import Labels2DModel + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.zarr' +} +meta = { + 'name': 'empty_labels' +} +## VIASH END + +print('Reading input files', flush=True) +sdata_solution = sd.read_zarr(par['input_solution']) + +print('Creating empty (all-zero) segmentation labels', flush=True) +gt_labels = sdata_solution['cell_labels'] + +# Resolve the image array (handles both DataTree and DataArray) +import xarray as xr +if isinstance(gt_labels, xr.DataTree): + gt_array = gt_labels['scale0'].image.to_numpy() +else: + gt_array = gt_labels.to_numpy() + +empty_array = np.zeros_like(gt_array) + +# Preserve the same coordinate system and transform as the GT labels +transform = sd.transformations.get_transformation(gt_labels, get_all=True) +segmentation = Labels2DModel.parse( + empty_array, + transformations=transform, +) + +output = sd.SpatialData( + labels={ + 'segmentation': segmentation + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) + } +) + +print('Writing output', flush=True) +output.write(par['output'], overwrite=True) diff --git 
a/src/control_methods/random_voronoi/config.vsh.yaml b/src/control_methods/random_voronoi/config.vsh.yaml new file mode 100644 index 0000000..3805183 --- /dev/null +++ b/src/control_methods/random_voronoi/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml + +name: random_voronoi +label: Random Voronoi +summary: "A negative control that assigns pixels to randomly placed seed points via Voronoi tessellation." +description: | + A negative control that places N seed points uniformly at random across the + image (where N equals the number of cells in the ground truth) and assigns + each pixel to the nearest seed. This produces spatially coherent, non-overlapping + cells with realistic count but no biological basis, resulting in poor metric scores. + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: ../../base/setup_spatialdata_partial.yaml + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/random_voronoi/script.py b/src/control_methods/random_voronoi/script.py new file mode 100644 index 0000000..f90c03b --- /dev/null +++ b/src/control_methods/random_voronoi/script.py @@ -0,0 +1,72 @@ +import anndata as ad +import numpy as np +import spatialdata as sd +import xarray as xr +from scipy.spatial import cKDTree +from spatialdata.models import Labels2DModel + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.zarr' +} +meta = { + 'name': 'random_voronoi' +} +## VIASH END + +print('Reading input files', flush=True) +sdata_solution = sd.read_zarr(par['input_solution']) + +print('Building random Voronoi segmentation', flush=True) +gt_labels = sdata_solution['cell_labels'] + +# Resolve the 
image array (handles both DataTree and DataArray) +if isinstance(gt_labels, xr.DataTree): + gt_array = gt_labels['scale0'].image.to_numpy() +else: + gt_array = gt_labels.to_numpy() + +H, W = gt_array.shape +rng = np.random.default_rng(42) + +# Use the same number of cells as in the ground truth +n_cells = len(np.unique(gt_array)) - 1 # subtract 1 for background (0) +n_cells = max(n_cells, 1) + +# Place seed points uniformly at random +seeds_y = rng.integers(0, H, size=n_cells) +seeds_x = rng.integers(0, W, size=n_cells) +seeds = np.column_stack([seeds_y, seeds_x]) + +# Assign every pixel to the nearest seed via KD-tree +print(f'Assigning {H * W} pixels to {n_cells} random seeds', flush=True) +yx = np.mgrid[0:H, 0:W].reshape(2, -1).T # (H*W, 2) +tree = cKDTree(seeds) +_, idx = tree.query(yx, workers=-1) +voronoi_array = (idx + 1).reshape(H, W).astype(np.int32) # labels start at 1 + +# Preserve the same coordinate system and transform as the GT labels +transform = sd.transformations.get_transformation(gt_labels, get_all=True) +segmentation = Labels2DModel.parse( + voronoi_array, + transformations=transform, +) + +output = sd.SpatialData( + labels={ + 'segmentation': segmentation + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) + } +) + +print('Writing output', flush=True) +output.write(par['output'], overwrite=True) diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml index 0a71e50..7d47a02 100644 --- a/src/control_methods/true_labels/config.vsh.yaml +++ b/src/control_methods/true_labels/config.vsh.yaml @@ -1,59 +1,24 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test __merge__: ../../api/comp_control_method.yaml -# A unique identifier for your component (required). 
-# Can contain only lowercase letters or underscores. name: true_labels - -# A relatively short label, used when rendering visualisations (required) label: True Labels -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. +summary: "A positive control where the ground truth cell_labels are used as the prediction." description: | - A positive control, where the solution labels are copied 1 to 1 to the predicted data. - -# Metadata for your component -info: - # Which normalisation method this component prefers to use (required). - preferred_normalization: counts - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. + A positive control where the ground truth cell_labels segmentation is copied + directly as the prediction. This represents the upper bound of performance + for any segmentation method. -# Resources required to run the component resources: - # The script of your component (required) - type: python_script path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt engines: - # Specifications for the Docker image for this component. - type: docker image: openproblems/base_python:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - # setup: - # - type: python - # packages: scib==1.1.5 + __merge__: ../../base/setup_spatialdata_partial.yaml runners: - # This platform allows running the component natively - type: executable - # Allows turning the component into a Nextflow module / pipeline. 
- type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py index 935f3af..d59ae90 100644 --- a/src/control_methods/true_labels/script.py +++ b/src/control_methods/true_labels/script.py @@ -1,13 +1,14 @@ import anndata as ad +import numpy as np +import spatialdata as sd +import xarray as xr +from spatialdata.models import Labels2DModel ## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad', - 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad', - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'output': 'output.h5ad' + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.zarr' } meta = { 'name': 'true_labels' @@ -15,31 +16,45 @@ ## VIASH END print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) -input_solution = ad.read_h5ad(par['input_solution']) - -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... - -print('Generate predictions', flush=True) -# ... generate predictions ... 
-obs_label_pred = input_solution.obs["label"] - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] +sdata_solution = sd.read_zarr(par['input_solution']) + +print('Relabelling ground truth cell_labels as prediction', flush=True) +gt_labels = sdata_solution['cell_labels'] + +# Resolve the image array (handles both DataTree and DataArray) +if isinstance(gt_labels, xr.DataTree): + gt_array = gt_labels['scale0'].image.to_numpy() +else: + gt_array = gt_labels.to_numpy() + +# Randomly permute cell IDs while keeping background (0) unchanged. +# A correct metric (e.g. ARI) must be invariant to label permutation +rng = np.random.default_rng(42) +cell_ids = np.unique(gt_array[gt_array != 0]) +perm = rng.permutation(cell_ids) +lut = np.zeros(cell_ids.max() + 1, dtype=gt_array.dtype) +lut[cell_ids] = perm +relabelled = np.where(gt_array != 0, lut[gt_array], 0) + +transform = sd.transformations.get_transformation(gt_labels, get_all=True) +segmentation = Labels2DModel.parse( + relabelled, + transformations=transform, +) + +output = sd.SpatialData( + labels={ + 'segmentation': segmentation }, - obs={ - 'label_pred': obs_label_pred + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) } ) -output.obs_names = input_test.obs_names -output.write_h5ad(par['output'], compression='gzip') +print('Writing output', flush=True) +output.write(par['output'], overwrite=True) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0ea6508..96cc85c 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -5,10 +5,6 @@ name: process_dataset argument_groups: - name: "Processing parameters" arguments: - - name: 
"--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - name: "--span" type: double description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. @@ -24,7 +20,6 @@ resources: engines: - type: docker - #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work image: openproblems/base_python:1 setup: - type: python @@ -37,4 +32,4 @@ runners: - type: executable - type: nextflow directives: - label: [highmem, midcpu, midtime] \ No newline at end of file + label: [midmem, midcpu, midtime] diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index b04cb33..ad85433 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -1,5 +1,5 @@ -import random import anndata as ad +import pandas as pd import spatialdata as sd import scanpy as sc @@ -7,8 +7,9 @@ par = { 'input_sp': 'resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr', 'input_sc': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', - 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', - 'output_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output_spatial_unlabelled': 'spatial_unlabelled.zarr', + 'output_spatial_solution': 'spatial_solution.zarr', + 'output_scrnaseq_reference': 'scrnaseq_reference.h5ad', 'span': 0.3, 'seed': 123, 'n_top_genes': 3000, @@ -53,12 +54,6 @@ def sc_processing(adata): ) adata.var.rename(columns={"highly_variable": "hvg"}, inplace=True) - -# set seed if need be -if par["seed"]: - print(f">> Setting seed to {par['seed']}") - random.seed(par["seed"]) - print(">> Load data", flush=True) sc_data = ad.read_h5ad(par["input_sc"]) print(f"single cell data: {sc_data}") @@ -71,7 +66,7 @@ def 
sc_processing(adata): for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]: sc_data.uns[key] = par[key] -print(">> Writing data", flush=True) +print(">> Writing scrnaseq reference", flush=True) sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip") # read input_sp @@ -79,27 +74,81 @@ def sc_processing(adata): sp_data = sd.read_zarr(par["input_sp"]) print(f"spatial data: {sp_data}") -print(">> Processing spatial data", flush=True) -sp_data_table = sp_data.tables['table'] -print(f"single cell part of spatial data: {sp_data_table}") -sc_processing(sp_data_table) - -if "cell_area" not in sp_data_table.obs: - print(">> Perform scanpy qc for cell area", flush=True) - sc.pp.calculate_qc_metrics(sp_data_table, layer="counts", inplace=True) - -for x in ["transcript_counts", "n_genes_by_counts"]: - if f"ca_normalized_{x}" not in sp_data_table.obs and x in sp_data_table.obs: - print(f">> Perform cell area normalization for {x}", flush=True) - sp_data_table.obs[f'ca_normalized_{x}'] = sp_data_table.obs[f"{x}"] / sp_data_table.obs["cell_area"] - -print(">> Override dataset metadata in .uns", flush=True) -sp_data_table.uns["orig_dataset_id"] = sp_data_table.uns.get("dataset_id", None) -for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]: - sp_data_table.uns[key] = par[key] - -print(f"spatial data: {sp_data}") -print(f"spatial data tables['table']: {sp_data.tables['table']}") +dataset_uns = { + "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_reference": par["dataset_reference"], + "dataset_organism": par["dataset_organism"], + "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), +} -print(">> Writing 
spatial data", flush=True) -sp_data.write(par["output_spatial_dataset"], overwrite=True) +# --------------------------------------------------------------- +# output_spatial_unlabelled: image + transcripts (no ground truth) +# --------------------------------------------------------------- +print(">> Building spatial dataset for methods (no ground truth)", flush=True) + +# Strip columns that reveal ground truth cell assignments from transcripts +_GROUND_TRUTH_COLS = {"cell_id", "nucleus_id", "cell_type"} +transcripts = sp_data.points["transcripts"] +clean_transcript_cols = [c for c in transcripts.columns if c not in _GROUND_TRUTH_COLS] +clean_transcripts = transcripts[clean_transcript_cols] + +# Build var from unique feature names in transcripts, mapping to feature_ids from metadata +feature_names = transcripts["feature_name"].compute().unique().tolist() +var_df = pd.DataFrame({"feature_name": feature_names}, index=feature_names) +var_df.index.name = "feature_name" +if "metadata" in sp_data.tables and "gene_ids" in sp_data.tables["metadata"].var.columns: + id_map = sp_data.tables["metadata"].var["gene_ids"] + var_df["feature_id"] = var_df.index.map(id_map) + +# Minimal table: dataset metadata in uns, gene list in var +minimal_table = ad.AnnData(var=var_df, uns=dataset_uns) + +output_spatial = sd.SpatialData( + images={"morphology_mip": sp_data.images["morphology_mip"]}, + points={"transcripts": clean_transcripts}, + tables={"table": minimal_table}, +) + +print(">> Writing spatial unlabelled dataset", flush=True) +output_spatial.write(par["output_spatial_unlabelled"], overwrite=True) + +# --------------------------------------------------------------- +# output_spatial_solution: ground truth labels, shapes, reference table +# --------------------------------------------------------------- +print(">> Building spatial solution (ground truth)", flush=True) + +ref_table = sp_data.tables["table"] +solution_obs = ref_table.obs[["cell_id", "region"]].copy() +for extra_col in 
["cell_area", "transcript_counts"]: + if extra_col in ref_table.obs.columns: + solution_obs[extra_col] = ref_table.obs[extra_col] + +solution_table = ad.AnnData( + obs=solution_obs, + var=var_df, + uns={ + "dataset_id": par["dataset_id"], + "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), + "spatialdata_attrs": ref_table.uns["spatialdata_attrs"], + }, +) + +# Keep only the columns needed for the solution (ground truth assignments) +_SOLUTION_TRANSCRIPT_COLS = ["x", "y", "feature_name", "cell_id", "transcript_id"] +if "z" in transcripts.columns: + _SOLUTION_TRANSCRIPT_COLS = ["x", "y", "z"] + _SOLUTION_TRANSCRIPT_COLS[2:] +solution_transcripts = transcripts[[c for c in _SOLUTION_TRANSCRIPT_COLS if c in transcripts.columns]] + +output_solution = sd.SpatialData( + points={"transcripts": solution_transcripts}, + labels={k: v for k, v in sp_data.labels.items()}, + shapes={k: v for k, v in sp_data.shapes.items()}, + tables={"table": solution_table}, +) + +print(">> Writing spatial solution", flush=True) +output_solution.write(par["output_spatial_solution"], overwrite=True) diff --git a/src/data_processors/process_prediction/config.vsh.yaml b/src/data_processors/process_prediction/config.vsh.yaml new file mode 100644 index 0000000..7e473ac --- /dev/null +++ b/src/data_processors/process_prediction/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_output_processor.yaml + +name: process_prediction + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + packages: [scikit-learn, scikit-misc] + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, midcpu, midtime] diff --git a/src/data_processors/process_prediction/script.py b/src/data_processors/process_prediction/script.py new file mode 100644 index 0000000..17c3728 --- /dev/null +++ 
b/src/data_processors/process_prediction/script.py @@ -0,0 +1,110 @@ +import numpy as np +import xarray as xr +import anndata as ad +import pandas as pd +import spatialdata as sd +import scanpy as sc + +## VIASH START +par = { + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr', + 'input_spatial_unlabelled': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'output': 'output.zarr' +} +## VIASH END + +print(">> Reading input files", flush=True) +sdata_pred = sd.read_zarr(par["input_prediction"]) +sdata_sp = sd.read_zarr(par["input_spatial_unlabelled"]) + +dataset_id = sdata_sp.tables["table"].uns["dataset_id"] +method_id = sdata_pred.tables["table"].uns["method_id"] + +print(">> Transforming transcripts to global coordinate system", flush=True) +transcripts = sd.transform(sdata_sp["transcripts"], to_coordinate_system="global") + +# Adjust for any translation applied to the segmentation +trans = sd.transformations.get_transformation( + sdata_pred["segmentation"], get_all=True +)["global"].inverse() +transcripts = sd.transform(transcripts, trans, "global") + +print(">> Assigning transcripts to cells via label image lookup", flush=True) +y_coords = transcripts.y.compute().to_numpy(dtype=np.int64) +x_coords = transcripts.x.compute().to_numpy(dtype=np.int64) + +if isinstance(sdata_pred["segmentation"], xr.DataTree): + label_image = sdata_pred["segmentation"]["scale0"].image.to_numpy() +else: + label_image = sdata_pred["segmentation"].to_numpy() + +# Clip coordinates to valid label image bounds +y_coords = np.clip(y_coords, 0, label_image.shape[0] - 1) +x_coords = np.clip(x_coords, 0, label_image.shape[1] - 1) + +cell_ids = label_image[y_coords, x_coords] + +# NOTE: Is it useful to build a cxg count matrix? Is this used downstream? 
+print(">> Building cell x gene count matrix", flush=True) +feature_names = transcripts["feature_name"].compute().to_numpy() + +transcript_df = pd.DataFrame({"cell_id": cell_ids, "feature_name": feature_names}) +# Remove background (cell_id == 0) +transcript_df = transcript_df[transcript_df["cell_id"] != 0] + +count_matrix = ( + transcript_df.groupby(["cell_id", "feature_name"]) + .size() + .unstack(fill_value=0) +) + +obs = pd.DataFrame( + {"cell_id": count_matrix.index.astype(str), "region": pd.Categorical(["segmentation"] * len(count_matrix))}, + index=count_matrix.index.astype(str), +) +var = pd.DataFrame(index=count_matrix.columns.astype(str)) +var.index.name = "feature_name" +var["feature_name"] = var.index +if "var" in dir(sdata_sp.tables["table"]) and "feature_id" in sdata_sp.tables["table"].var.columns: + id_map = sdata_sp.tables["table"].var["feature_id"] + var["feature_id"] = var.index.map(id_map) + +table = ad.AnnData(X=count_matrix.values.astype(np.float32), obs=obs, var=var) +table.layers["counts"] = table.X.copy() + +print(">> Normalizing counts", flush=True) +sc.pp.normalize_total(table, target_sum=1e4) +table.layers["normalized"] = table.X.copy() + +sc.pp.log1p(table) +table.layers["normalized_log"] = table.X.copy() + +sc.pp.scale(table) +table.layers["normalized_log_scaled"] = table.X.copy() + +print(">> Computing highly variable genes", flush=True) +# Reset X to counts for HVG computation +table.X = table.layers["counts"].copy() +try: + sc.pp.highly_variable_genes(table, flavor="seurat_v3", layer="counts", n_top_genes=min(3000, table.n_vars)) +except ValueError: + # seurat_v3 loess fitting can fail on small datasets; fall back to seurat flavor + sc.pp.normalize_total(table, target_sum=1e4) + sc.pp.log1p(table) + sc.pp.highly_variable_genes(table, flavor="seurat", n_top_genes=min(3000, table.n_vars)) +table.var.rename(columns={"highly_variable": "hvg"}, inplace=True) + +table.uns["dataset_id"] = dataset_id +table.uns["method_id"] = method_id 
+table.uns["spatialdata_attrs"] = { + "instance_key": "cell_id", + "region": ["segmentation"], + "region_key": "region", +} + +print(">> Writing output", flush=True) +output = sd.SpatialData( + labels={"segmentation": sdata_pred["segmentation"]}, + tables={"table": table}, +) +output.write(par["output"], overwrite=True) diff --git a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py index 6ebae72..3f0236b 100644 --- a/src/methods/cellpose/script.py +++ b/src/methods/cellpose/script.py @@ -10,7 +10,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr', + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', 'output': 'prediction.zarr' } meta = { @@ -47,24 +47,23 @@ def convert_to_lower_dtype(arr): print('Cellpose segmentation finished, post-processing results', flush=True) masks = convert_to_lower_dtype(masks) -print('Segmentation done, preparing output', flush=True) -sd_output = sd.SpatialData() -data_array = xr.DataArray(masks, name='segmentation', dims=('y', 'x')) -parsed = Labels2DModel.parse(data_array, transformations=transformation) -sd_output.labels['segmentation'] = parsed - -cell_ids = np.unique(masks)[1:] # exclude background (0) -table = ad.AnnData( - obs=pd.DataFrame( - {'cell_id': cell_ids.astype(str), 'region': 'segmentation'}, - index=cell_ids.astype(str), - ), - uns={ - 'dataset_id': sdata.tables['table'].uns['dataset_id'], - 'method_id': meta['name'] +print('Creating output data structure', flush=True) +sd_output = sd.SpatialData( + labels={ + 'segmentation': Labels2DModel.parse( + xr.DataArray(masks, name='segmentation', dims=('y', 'x')), + transformations=transformation + ) + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) } ) -sd_output.tables['table'] = table print('Saving output', flush=True) if os.path.exists(par["output"]): diff 
--git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/accuracy/config.vsh.yaml deleted file mode 100644 index ac197bd..0000000 --- a/src/metrics/accuracy/config.vsh.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_metric.yaml - - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: accuracy - -# Metadata for your component -info: - metrics: - # A unique identifier for your metric (required). - # Can contain only lowercase letters or underscores. - - name: accuracy - # A relatively short label, used when rendering visualisarions (required) - label: Accuracy - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "The percentage of correctly predicted labels." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - The percentage of correctly predicted labels. - # A reference key from the bibtex library at src/common/library.bib (required). - references: - doi: 10.48550/arXiv.2008.05756 - # The minimum possible value for this metric (required) - min: 0 - # The maximum possible value for this metric (required) - max: 1 - # Whether a higher value represents a 'better' solution (required) - maximize: true - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. 
- - type: docker - image: openproblems/base_python:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py deleted file mode 100644 index 054e809..0000000 --- a/src/metrics/accuracy/script.py +++ /dev/null @@ -1,47 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.preprocessing - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'input_prediction': 'resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'accuracy' -} -## VIASH END - -print('Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_prediction = ad.read_h5ad(par['input_prediction']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - - -print('Compute metrics', flush=True) -# metric_ids and metric_values can have length > 1 -# but should be of equal length -uns_metric_ids = [ 
'accuracy' ] -uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_prediction.uns['dataset_id'], - 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], - 'metric_ids': uns_metric_ids, - 'metric_values': uns_metric_values - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/ari/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml new file mode 100644 index 0000000..8c1e3df --- /dev/null +++ b/src/metrics/ari/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_metric.yaml + +name: ari + +info: + metrics: + - name: ari + label: "Adjusted Rand Index" + summary: "ARI between ground truth and predicted transcript-to-cell assignments." + description: | + For each transcript, its cell ID is looked up in both the ground truth + label image (cell_labels) and the predicted segmentation label image. + The Adjusted Rand Index (ARI) is then computed between the two resulting + assignment vectors. ARI = 1 is perfect agreement; values near 0 are random. + Background transcripts (cell_id = 0 in both) are excluded. 
+ # TODO: add ref and link + references: + doi: 10.1007/BF01908075 + min: -1 + max: 1 + maximize: true + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: ../../base/setup_spatialdata_partial.yaml + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] \ No newline at end of file diff --git a/src/metrics/ari/script.py b/src/metrics/ari/script.py new file mode 100644 index 0000000..6c5a848 --- /dev/null +++ b/src/metrics/ari/script.py @@ -0,0 +1,70 @@ +import numpy as np +import xarray as xr +import anndata as ad +import spatialdata as sd +from sklearn.metrics import adjusted_rand_score + +## VIASH START +par = { + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.h5ad' +} +meta = { + 'name': 'ari' +} +## VIASH END + +print(">> Reading input files", flush=True) +sdata_pred = sd.read_zarr(par["input_prediction"]) +sdata_sol = sd.read_zarr(par["input_solution"]) + +dataset_id = sdata_sol.tables["table"].uns["dataset_id"] +method_id = sdata_pred.tables["table"].uns["method_id"] + +print(">> Loading transcripts in global coordinate system", flush=True) +transcripts = sd.transform(sdata_sol["transcripts"], to_coordinate_system="global") + +def lookup_labels(label_element, transcripts_global): + """Look up label image values at transcript coordinates.""" + trans = sd.transformations.get_transformation( + label_element, get_all=True + )["global"].inverse() + transcripts_local = sd.transform(transcripts_global, trans, "global") + y = transcripts_local.y.compute().to_numpy(dtype=np.int64) + x = transcripts_local.x.compute().to_numpy(dtype=np.int64) + if isinstance(label_element, xr.DataTree): + img = label_element["scale0"].image.to_numpy() + else: + img = 
label_element.to_numpy() + y = np.clip(y, 0, img.shape[0] - 1) + x = np.clip(x, 0, img.shape[1] - 1) + return img[y, x] + +print(">> Looking up ground truth cell IDs from cell_labels", flush=True) +gt_cell_ids = lookup_labels(sdata_sol["cell_labels"], transcripts) + +print(">> Looking up predicted cell IDs from segmentation", flush=True) +pred_cell_ids = lookup_labels(sdata_pred["segmentation"], transcripts) + +print(">> Computing ARI", flush=True) +# Exclude transcripts that are background in both +mask = (gt_cell_ids != 0) | (pred_cell_ids != 0) +print(f" Transcripts used: {mask.sum()} / {len(mask)}", flush=True) +print(f" GT unique cells: {len(np.unique(gt_cell_ids[mask]))}", flush=True) +print(f" Pred unique cells: {len(np.unique(pred_cell_ids[mask]))}", flush=True) + +ari_score = adjusted_rand_score(gt_cell_ids[mask], pred_cell_ids[mask]) +print(f" ARI = {ari_score:.4f}", flush=True) + +print(">> Writing output", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": dataset_id, + "normalization_id": "counts", + "method_id": method_id, + "metric_ids": ["ari"], + "metric_values": [float(ari_score)], + } +) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index c71286a..454c63f 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,8 +14,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml + - name: "--output_spatial_unlabelled" + __merge__: /src/api/file_spatial_unlabelled.yaml + direction: output + required: true + - name: "--output_spatial_solution" + __merge__: /src/api/file_spatial_solution.yaml direction: output required: true - name: "--output_scrnaseq_reference" diff --git a/src/workflows/process_datasets/main.nf 
b/src/workflows/process_datasets/main.nf index 947a8f1..2e0c2d4 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,13 +44,14 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - output_spatial_dataset: "output_spatial_dataset", + output_spatial_unlabelled: "output_spatial_unlabelled", + output_spatial_solution: "output_spatial_solution", output_scrnaseq_reference: "output_scrnaseq_reference" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) + | setState(["output_spatial_unlabelled", "output_spatial_solution", "output_scrnaseq_reference"]) emit: output_ch diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4ab5f83..67bf53c 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -4,13 +4,17 @@ namespace: workflows argument_groups: - name: Inputs arguments: - - name: "--input_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml - direction: output + - name: "--input_spatial_unlabelled" + __merge__: /src/api/file_spatial_unlabelled.yaml + direction: input + required: true + - name: "--input_spatial_solution" + __merge__: /src/api/file_spatial_solution.yaml + direction: input required: true - name: "--input_scrnaseq_reference" __merge__: /src/api/file_scrnaseq_reference.yaml - direction: output + direction: input required: true - name: Outputs arguments: @@ -58,8 +62,11 @@ dependencies: - name: utils/extract_uns_metadata repository: openproblems - name: control_methods/true_labels + - name: control_methods/empty_labels + - name: control_methods/random_voronoi - name: methods/cellpose - - name: metrics/accuracy + - name: metrics/ari + - name: data_processors/process_prediction runners: - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 
fe25140..d39386c 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -8,12 +8,14 @@ workflow auto { // construct list of methods and control methods methods = [ true_labels, + empty_labels, + random_voronoi, cellpose ] // construct list of metrics metrics = [ - accuracy + ari ] workflow run_wf { @@ -33,7 +35,7 @@ workflow run_wf { // extract the dataset metadata | extract_uns_metadata.run( - fromState: [input: "input_solution"], + fromState: [input: "input_spatial_unlabelled"], toState: { id, output, state -> state + [ dataset_uns: readYaml(output.output).uns @@ -52,14 +54,8 @@ workflow run_wf { // use the 'filter' argument to only run a method on the normalisation the component is asking for filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) - - method_check && norm_check + method_check }, // define a new 'id' by appending the method name to the dataset id @@ -70,11 +66,10 @@ workflow run_wf { // use 'fromState' to fetch the arguments the component requires from the overall state fromState: { id, state, comp -> def new_args = [ - input_train: state.input_train, - input_test: state.input_test + input: state.input_spatial_unlabelled ] if (comp.config.info.type == "control_method") { - new_args.input_solution = state.input_solution + new_args.input_solution = state.input_spatial_solution } new_args }, @@ -88,6 +83,15 @@ workflow run_wf { } ) + | process_prediction.run( + fromState: [input: "method_output"], + toState: { id, output, state -> + state + [ + input_prediction: output.output + ] + } + ) + // run all metrics | runEach( components: metrics, @@ -96,8 +100,8 @@ workflow run_wf { }, // use 
'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", - input_prediction: "method_output" + input_solution: "input_spatial_solution", + input_prediction: "input_prediction" ], // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp -> @@ -136,10 +140,8 @@ workflow run_wf { // extract the dataset metadata meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> state.dataset_uns.normalization_id == "log_cp10k" - } + // keep all entries (no normalization-based filtering in this task) + | filter{ id, state -> true } | joinStates { ids, states -> // store the dataset metadata in a file def dataset_uns = states.collect{state ->