diff --git a/README.md b/README.md index a0a382e..29e5637 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ should convince readers of the significance and relevance of your task. ## Authors & contributors -| Name | Roles | Orcid | Twitter | Github | Email | Linkedin | +| Name | Roles | Github | Twitter | Email | Orcid | Linkedin | |:---|:---|:---|:---|:---|:---|:---| -| John Doe | author, maintainer | 0000-0000-0000-0000 | johndoe | johndoe | john@doe.me | johndoe | +| John Doe | author, maintainer | johndoe | johndoe | john@doe.me | 0000-0000-0000-0000 | johndoe | ## API @@ -38,25 +38,32 @@ should convince readers of the significance and relevance of your task. flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] - file_spatial_dataset("Raw iST Dataset") + file_spatial_unlabelled("Unlabelled") + file_spatial_solution("Solution") file_scrnaseq_reference("scRNA-seq Reference") comp_control_method[/"Control Method"/] comp_method[/"Method"/] + comp_output_processor[/"Output processor"/] comp_metric[/"Metric"/] file_prediction("Predicted data") + file_processed_prediction("Processed prediction") file_score("Score") file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor - comp_data_processor-->file_spatial_dataset + comp_data_processor-->file_spatial_unlabelled + comp_data_processor-->file_spatial_solution comp_data_processor-->file_scrnaseq_reference - file_spatial_dataset---comp_control_method - file_spatial_dataset---comp_method - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + file_spatial_unlabelled---comp_control_method + file_spatial_unlabelled---comp_method + file_spatial_unlabelled---comp_output_processor + file_spatial_solution---comp_control_method + file_spatial_solution---comp_metric comp_control_method-->file_prediction comp_method-->file_prediction + comp_output_processor-->file_processed_prediction comp_metric-->file_score - 
file_prediction---comp_metric + file_prediction---comp_output_processor + file_processed_prediction---comp_metric file_common_scrnaseq---comp_data_processor ``` @@ -175,22 +182,25 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | -| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output_spatial_unlabelled` | `file` | (*Output*) Preprocessed spatial transcriptomics data without segmentation labels for method input. | +| `--output_spatial_solution` | `file` | (*Output*) Ground truth segmentation labels and cell assignments for method evaluation. | | `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | -## File format: Raw iST Dataset +## File format: Unlabelled -A spatial transcriptomics dataset, preprocessed for this benchmark. +Preprocessed spatial transcriptomics data without segmentation labels +for method input. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr` +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr` Description: -This dataset contains preprocessed images, labels, points, shapes, and -tables for spatial transcriptomics data. +This dataset contains preprocessed images and transcript point clouds +for spatial transcriptomics data. Ground truth segmentation labels are +intentionally excluded to prevent methods from cheating. 
Format: @@ -198,9 +208,7 @@ Format: SpatialData object images: 'morphology_mip' - labels: 'cell_labels', 'nucleus_labels' points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' tables: 'table' coordinate_systems: 'global' @@ -212,16 +220,9 @@ Data structure: *images* -| Name | Description | -|:-----------------|:--------------------| -| `morphology_mip` | The raw image data. | - -*labels* - -| Name | Description | -|:-----------------|:---------------------------------------| -| `cell_labels` | (*Optional*) Cell segmentation labels. | -| `nucleus_labels` | (*Optional*) Cell segmentation labels. | +| Name | Description | +|:-----------------|:---------------------------------------------------------| +| `morphology_mip` | The raw morphology image (maximum intensity projection). | *points* @@ -233,26 +234,9 @@ Data structure: | `y` | `float` | y-coordinate of the point. | | `z` | `float` | (*Optional*) z-coordinate of the point. | | `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | | `qv` | `float` | (*Optional*) Quality value of the point. | | `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | +| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with the nucleus (derived from morphology). 
| *tables* @@ -260,10 +244,8 @@ Data structure: | Slot | Type | Description | |:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. | -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | | `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | @@ -271,7 +253,7 @@ Data structure: | `uns["dataset_summary"]` | `string` | Short description of the dataset. | | `uns["dataset_description"]` | `string` | Long description of the dataset. | | `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | +| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). | *coordinate_systems* @@ -281,6 +263,88 @@ Data structure: +## File format: Solution + +Ground truth segmentation labels and cell assignments for method +evaluation. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr` + +Description: + +This dataset contains the ground truth cell and nucleus segmentation +labels, cell boundaries, and a reference table matching each cell to its +label region. + +Format: + +
+ + SpatialData object + labels: 'cell_labels', 'nucleus_labels' + points: 'transcripts' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'table' + +
+ +Data structure: + +
+ +*labels* + +| Name | Description | +|:-----------------|:-------------------------------------------------------| +| `cell_labels` | Ground truth cell segmentation labels. | +| `nucleus_labels` | (*Optional*) Ground truth nucleus segmentation labels. | + +*points* + +`transcripts`: Point cloud data of transcripts with ground truth cell +assignments. + +| Column | Type | Description | +|:---|:---|:---| +| `x` | `float` | x-coordinate of the point. | +| `y` | `float` | y-coordinate of the point. | +| `z` | `float` | (*Optional*) z-coordinate of the point. | +| `feature_name` | `categorical` | Name of the feature. | +| `cell_id` | `integer` | Ground truth cell assignment (0 = background). | +| `transcript_id` | `long` | Unique identifier of the transcript. | + +*shapes* + +`cell_boundaries`: Ground truth cell boundary shapes. + +| Column | Type | Description | +|:-----------|:---------|:-------------------------------| +| `geometry` | `object` | Geometry of the cell boundary. | + +`nucleus_boundaries`: Ground truth nucleus boundary shapes. + +| Column | Type | Description | +|:-----------|:---------|:----------------------------------| +| `geometry` | `object` | Geometry of the nucleus boundary. | + +*tables* + +`table`: Reference cell metadata table. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `integer` | Unique cell identifier, matching instance IDs in the label images. | +| `obs["region"]` | `string` | Name of the label image this cell belongs to (e.g. ‘cell_labels’). | +| `obs["cell_area"]` | `double` | (*Optional*) Area of the cell in pixels. | +| `obs["transcript_counts"]` | `integer` | (*Optional*) Total number of transcripts assigned to this cell. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. 
| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). | + +
+ ## File format: scRNA-seq Reference A single-cell reference dataset, preprocessed for this benchmark. @@ -346,8 +410,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--input` | `file` | Preprocessed spatial transcriptomics data without segmentation labels for method input. | +| `--input_solution` | `file` | Ground truth segmentation labels and cell assignments for method evaluation. | | `--output` | `file` | (*Output*) A predicted dataset as output by a method. | @@ -362,11 +426,27 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--input` | `file` | Preprocessed spatial transcriptomics data without segmentation labels for method input. | | `--output` | `file` | (*Output*) A predicted dataset as output by a method. | +## Component type: Output processor + +An output processor for the prediction. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_spatial_unlabelled` | `file` | Preprocessed spatial transcriptomics data without segmentation labels for method input. | +| `--output` | `file` | (*Output*) A processed predicted dataset, ready to be used as input for the evaluation. | + +
+ ## Component type: Metric A task template metric. @@ -377,8 +457,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_prediction` | `file` | A predicted dataset as output by a method. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--input_prediction` | `file` | A processed predicted dataset, ready to be used as input for the evaluation. | +| `--input_solution` | `file` | Ground truth segmentation labels and cell assignments for method evaluation. | | `--output` | `file` | (*Output*) File indicating the score of a metric. | @@ -388,7 +468,7 @@ Arguments: A predicted dataset as output by a method. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad` +`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr` Format: @@ -416,13 +496,59 @@ Data structure: | Slot | Type | Description | |:--------------------|:---------|:-------------------------------------| -| `obs["cell_id"]` | `string` | Cell ID. | -| `obs["region"]` | `string` | Region. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["method_id"]` | `string` | A unique identifier for the method. | +## File format: Processed prediction + +A processed predicted dataset, ready to be used as input for the +evaluation. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr` + +Format: + +
+ + SpatialData object + labels: 'segmentation' + tables: 'table' + +
+ +Data structure: + +
+ +*labels* + +| Name | Description | +|:---------------|:--------------------------| +| `segmentation` | Segmentation of the data. | + +*tables* + +`table`: AnnData table. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `string` | Cell ID. | +| `obs["region"]` | `string` | Region. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `layers["normalized_log"]` | `double` | Log1p normalized expression values. | +| `layers["normalized_log_scaled"]` | `double` | Log1p normalized expression values scaled to unit variance and zero mean. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ ## File format: Score File indicating the score of a metric. diff --git a/common b/common index 91a35a2..f0816e1 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 91a35a2e808da7029e222456c0e3ca70ab3a06dd +Subproject commit f0816e178a2b44749fdfb1a9cdfc76887dcf7462 diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 52ee226..4a921f8 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -16,9 +16,9 @@ exit 1 cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml -rename_keys: 'input:output_dataset' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: '$id/state.yaml' -settings: '{"output_spatial_dataset": "$id/output_spatial_dataset.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' +settings: '{"output_spatial_unlabelled": "$id/output_spatial_unlabelled.zarr", "output_spatial_solution": "$id/output_spatial_solution.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index b11d437..59fc1c2 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -24,7 +24,8 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \ --input_sc $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ - --output_spatial_dataset $DATASET_DIR/spatial_dataset.zarr \ + --output_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \ + --output_spatial_solution $DATASET_DIR/spatial_solution.zarr \ --output_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ --dataset_id 
mouse_brain_combined \ --dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \ @@ -36,22 +37,29 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \ # run one method viash run src/methods/cellpose/config.vsh.yaml -- \ - --input $DATASET_DIR/spatial_dataset.zarr \ - --output $DATASET_DIR/prediction.h5ad + --input $DATASET_DIR/spatial_unlabelled.zarr \ + --output $DATASET_DIR/prediction.zarr + +# run prediction processor +viash run src/data_processors/process_prediction/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/prediction.zarr \ + --input_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \ + --output $DATASET_DIR/processed_prediction.zarr # run one metric -# TODO: implement this! -# viash run src/metrics/ari/config.vsh.yaml -- \ -# --input_prediction $DATASET_DIR/prediction.h5ad \ -# --input_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ -# --output $DATASET_DIR/score.h5ad +viash run src/metrics/ari/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/processed_prediction.zarr \ + --input_solution $DATASET_DIR/spatial_solution.zarr \ + --output $DATASET_DIR/score.h5ad # write manual state.yaml. 
this is not actually necessary but you never know it might be useful cat > $DATASET_DIR/state.yaml << HERE id: $DATASET_ID -spatial_dataset: spatial_dataset.zarr +spatial_unlabelled: spatial_unlabelled.zarr +spatial_solution: spatial_solution.zarr scrnaseq_reference: scrnaseq_reference.h5ad -prediction: prediction.h5ad +prediction: prediction.zarr +processed_prediction: processed_prediction.zarr score: score.h5ad HERE diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 26bba56..808df7f 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 3c31e74..745aa77 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git 
a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 3f4fa2e..990602d 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -13,11 +13,11 @@ info: in the task. arguments: - name: --input - __merge__: file_spatial_dataset.yaml + __merge__: file_spatial_unlabelled.yaml required: true direction: input - - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + - name: "--input_solution" + __merge__: file_spatial_solution.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index ecd3f9c..69deecc 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,8 +19,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: file_spatial_dataset.yaml + - name: "--output_spatial_unlabelled" + __merge__: file_spatial_unlabelled.yaml + direction: output + required: true + - name: "--output_spatial_solution" + __merge__: file_spatial_solution.yaml direction: output required: true - name: "--output_scrnaseq_reference" @@ -80,4 +84,3 @@ test_resources: dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2 - type: python_script path: /common/component_tests/run_and_check_output.py - diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 633a8a1..2acf7ef 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -8,7 +8,7 @@ info: A method to predict the task effects. arguments: - name: --input - __merge__: file_spatial_dataset.yaml + __merge__: file_spatial_unlabelled.yaml required: true direction: input - name: --output diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index a7470e9..6102919 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -8,11 +8,11 @@ info: A metric for evaluating method predictions. 
arguments: - name: "--input_prediction" - __merge__: file_prediction.yaml + __merge__: file_processed_prediction.yaml direction: input required: true - - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + - name: "--input_solution" + __merge__: file_spatial_solution.yaml direction: input required: true - name: "--output" diff --git a/src/api/comp_output_processor.yaml b/src/api/comp_output_processor.yaml new file mode 100644 index 0000000..a5db8ec --- /dev/null +++ b/src/api/comp_output_processor.yaml @@ -0,0 +1,30 @@ +namespace: "data_processors" +info: + type: data_processor + type_info: + label: Output processor + summary: An output processor for the prediction. + description: | + A component for a prediction dataset into a processed prediction dataset that can be evaluated by the metrics. +argument_groups: + - name: Inputs + arguments: + - name: "--input_prediction" + __merge__: file_prediction.yaml + required: true + direction: input + - name: "--input_spatial_unlabelled" + __merge__: file_spatial_unlabelled.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output" + __merge__: file_processed_prediction.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml index b1fc443..e72794b 100644 --- a/src/api/file_prediction.yaml +++ b/src/api/file_prediction.yaml @@ -1,6 +1,6 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr" label: "Predicted data" summary: A predicted dataset as output by a method. 
info: @@ -16,15 +16,6 @@ info: name: table description: AnnData table required: true - obs: - - type: string - name: cell_id - description: Cell ID - required: true - - type: string - name: region - description: Region - required: true uns: - type: string name: dataset_id diff --git a/src/api/file_processed_prediction.yaml b/src/api/file_processed_prediction.yaml new file mode 100644 index 0000000..52e49df --- /dev/null +++ b/src/api/file_processed_prediction.yaml @@ -0,0 +1,67 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr" +label: "Processed prediction" +summary: A processed predicted dataset, ready to be used as input for the evaluation. +info: + format: + type: spatialdata_zarr + labels: + - type: object + name: "segmentation" + description: Segmentation of the data + required: true + tables: + - type: anndata + name: table + description: AnnData table + required: true + # TODO: what is it that this component adds to the anndata? + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + - type: double + name: normalized_log + description: Log1p normalized expression values + required: true + - type: double + name: normalized_log_scaled + description: Log1p normalized expression values scaled to unit variance and zero mean + required: true + obs: + - type: string + name: cell_id + description: Cell ID + required: true + - type: string + name: region + description: Region + required: true + # .... cell info ... ? + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. 
+ required: true + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml index 9b855fd..c1da992 100644 --- a/src/api/file_scrnaseq_reference.yaml +++ b/src/api/file_scrnaseq_reference.yaml @@ -1,7 +1,5 @@ type: file example: "resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" label: "scRNA-seq Reference" summary: A single-cell reference dataset, preprocessed for this benchmark. description: | diff --git a/src/api/file_spatial_solution.yaml b/src/api/file_spatial_solution.yaml new file mode 100644 index 0000000..8552e48 --- /dev/null +++ b/src/api/file_spatial_solution.yaml @@ -0,0 +1,108 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr" +label: "Solution" +summary: Ground truth segmentation labels and cell assignments for method evaluation. +description: | + This dataset contains the ground truth cell and nucleus segmentation labels, + cell boundaries, and a reference table matching each cell to its label region. 
+info: + format: + type: spatialdata_zarr + points: + - type: dataframe + name: transcripts + description: Point cloud data of transcripts with ground truth cell assignments + required: true + columns: + - type: float + name: "x" + required: true + description: x-coordinate of the point + - type: float + name: "y" + required: true + description: y-coordinate of the point + - type: float + name: "z" + required: false + description: z-coordinate of the point + - type: categorical + name: feature_name + required: true + description: Name of the feature + - type: integer + name: cell_id + required: true + description: Ground truth cell assignment (0 = background) + - type: long + name: transcript_id + required: true + description: Unique identifier of the transcript + labels: + - type: object + name: "cell_labels" + description: Ground truth cell segmentation labels + required: true + - type: object + name: "nucleus_labels" + description: Ground truth nucleus segmentation labels + required: false + shapes: + - type: dataframe + name: "cell_boundaries" + description: Ground truth cell boundary shapes + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the cell boundary + - type: dataframe + name: "nucleus_boundaries" + description: Ground truth nucleus boundary shapes + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the nucleus boundary + tables: + - type: anndata + name: "table" + description: Reference cell metadata table + required: true + obs: + - type: integer + name: cell_id + description: Unique cell identifier, matching instance IDs in the label images + required: true + - type: string + name: region + description: Name of the label image this cell belongs to (e.g. 
'cell_labels') + required: true + - type: double + name: cell_area + description: Area of the cell in pixels + required: false + - type: integer + name: transcript_counts + description: Total number of transcripts assigned to this cell + required: false + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset + required: true + - type: string + name: orig_dataset_id + required: true + description: The identifier of the original dataset from which this dataset was derived (if applicable) + var: + - type: string + name: feature_id + required: false + description: Unique identifier for the feature, usually a ENSEMBL gene id. + - type: string + name: feature_name + required: true + description: A human-readable name for the feature, usually a gene symbol. diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_unlabelled.yaml similarity index 54% rename from src/api/file_spatial_dataset.yaml rename to src/api/file_spatial_unlabelled.yaml index 4c5253e..3d86b0f 100644 --- a/src/api/file_spatial_dataset.yaml +++ b/src/api/file_spatial_unlabelled.yaml @@ -1,28 +1,18 @@ type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" -label: "Raw iST Dataset" -summary: A spatial transcriptomics dataset, preprocessed for this benchmark. +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr" +label: "Unlabelled" +summary: Preprocessed spatial transcriptomics data without segmentation labels for method input. description: | - This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data. + This dataset contains preprocessed images and transcript point clouds for spatial transcriptomics data. 
+ Ground truth segmentation labels are intentionally excluded to prevent methods from cheating. info: format: type: spatialdata_zarr images: - type: object name: morphology_mip - description: The raw image data + description: The raw morphology image (maximum intensity projection) required: true - labels: - - type: object - name: "cell_labels" - description: Cell segmentation labels - required: false - - type: object - name: "nucleus_labels" - description: Cell segmentation labels - required: false points: - type: dataframe name: transcripts @@ -45,18 +35,6 @@ info: name: feature_name required: true description: Name of the feature - - type: integer - name: "cell_id" - required: false - description: Unique identifier of the cell - - type: integer - name: "nucleus_id" - required: false - description: Unique identifier of the nucleus - - type: string - name: "cell_type" - required: false - description: Cell type of the cell - type: float name: qv required: false @@ -68,26 +46,7 @@ info: - type: boolean name: overlaps_nucleus required: false - description: Whether the point overlaps with a nucleus - shapes: - - type: dataframe - name: "cell_boundaries" - description: Cell boundaries - required: false - columns: - - type: object - name: "geometry" - required: true - description: Geometry of the cell boundary - - type: dataframe - name: "nucleus_boundaries" - description: Nucleus boundaries - required: false - columns: - - type: object - name: "geometry" - required: true - description: Geometry of the nucleus boundary + description: Whether the point overlaps with the nucleus (derived from morphology) tables: - type: anndata name: "table" @@ -123,29 +82,18 @@ info: required: true description: The organism of the sample in the dataset - type: string - name: segmentation_id - required: true - multiple: true - description: A unique identifier for the segmentation - obs: - - type: string - name: cell_id + name: orig_dataset_id required: true - description: A unique 
identifier for the cell + description: The identifier of the original dataset from which this dataset was derived (if applicable) var: - type: string - name: gene_ids - required: true - description: Unique identifier for the gene + name: feature_id + required: false + description: Unique identifier for the feature, usually an ENSEMBL gene id. - type: string - name: feature_types - required: true - description: Type of the feature - obsm: - - type: double - name: spatial + name: feature_name required: true - description: Spatial coordinates of the cell + description: A human-readable name for the feature, usually a gene symbol. coordinate_systems: - type: object name: global diff --git a/src/control_methods/empty_labels/config.vsh.yaml b/src/control_methods/empty_labels/config.vsh.yaml new file mode 100644 index 0000000..126b095 --- /dev/null +++ b/src/control_methods/empty_labels/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml + +name: empty_labels +label: Empty Labels +summary: "A negative control that predicts no cells (all background)." +description: | + A negative control where no cells are detected — the segmentation label image + is filled with zeros (background). This represents a lower bound of performance + for any segmentation method and is useful for verifying that metrics can + distinguish between no prediction and a perfect prediction. 
+ +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: ../../base/setup_spatialdata_partial.yaml + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/empty_labels/script.py b/src/control_methods/empty_labels/script.py new file mode 100644 index 0000000..5bc705b --- /dev/null +++ b/src/control_methods/empty_labels/script.py @@ -0,0 +1,54 @@ +import anndata as ad +import numpy as np +import spatialdata as sd +from spatialdata.models import Labels2DModel + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.zarr' +} +meta = { + 'name': 'empty_labels' +} +## VIASH END + +print('Reading input files', flush=True) +sdata_solution = sd.read_zarr(par['input_solution']) + +print('Creating empty (all-zero) segmentation labels', flush=True) +gt_labels = sdata_solution['cell_labels'] + +# Resolve the image array (handles both DataTree and DataArray) +import xarray as xr +if isinstance(gt_labels, xr.DataTree): + gt_array = gt_labels['scale0'].image.to_numpy() +else: + gt_array = gt_labels.to_numpy() + +empty_array = np.zeros_like(gt_array) + +# Preserve the same coordinate system and transform as the GT labels +transform = sd.transformations.get_transformation(gt_labels, get_all=True) +segmentation = Labels2DModel.parse( + empty_array, + transformations=transform, +) + +output = sd.SpatialData( + labels={ + 'segmentation': segmentation + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) + } +) + +print('Writing output', flush=True) +output.write(par['output'], overwrite=True) diff --git 
a/src/control_methods/random_voronoi/config.vsh.yaml b/src/control_methods/random_voronoi/config.vsh.yaml new file mode 100644 index 0000000..3805183 --- /dev/null +++ b/src/control_methods/random_voronoi/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml + +name: random_voronoi +label: Random Voronoi +summary: "A negative control that assigns pixels to randomly placed seed points via Voronoi tessellation." +description: | + A negative control that places N seed points uniformly at random across the + image (where N equals the number of cells in the ground truth) and assigns + each pixel to the nearest seed. This produces spatially coherent, non-overlapping + cells with realistic count but no biological basis, resulting in poor metric scores. + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: ../../base/setup_spatialdata_partial.yaml + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/random_voronoi/script.py b/src/control_methods/random_voronoi/script.py new file mode 100644 index 0000000..f90c03b --- /dev/null +++ b/src/control_methods/random_voronoi/script.py @@ -0,0 +1,72 @@ +import anndata as ad +import numpy as np +import spatialdata as sd +import xarray as xr +from scipy.spatial import cKDTree +from spatialdata.models import Labels2DModel + +## VIASH START +par = { + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.zarr' +} +meta = { + 'name': 'random_voronoi' +} +## VIASH END + +print('Reading input files', flush=True) +sdata_solution = sd.read_zarr(par['input_solution']) + +print('Building random Voronoi segmentation', flush=True) +gt_labels = sdata_solution['cell_labels'] + +# Resolve the 
image array (handles both DataTree and DataArray) +if isinstance(gt_labels, xr.DataTree): + gt_array = gt_labels['scale0'].image.to_numpy() +else: + gt_array = gt_labels.to_numpy() + +H, W = gt_array.shape +rng = np.random.default_rng(42) + +# Use the same number of cells as in the ground truth +n_cells = len(np.unique(gt_array)) - 1 # subtract 1 for background (0) +n_cells = max(n_cells, 1) + +# Place seed points uniformly at random +seeds_y = rng.integers(0, H, size=n_cells) +seeds_x = rng.integers(0, W, size=n_cells) +seeds = np.column_stack([seeds_y, seeds_x]) + +# Assign every pixel to the nearest seed via KD-tree +print(f'Assigning {H * W} pixels to {n_cells} random seeds', flush=True) +yx = np.mgrid[0:H, 0:W].reshape(2, -1).T # (H*W, 2) +tree = cKDTree(seeds) +_, idx = tree.query(yx, workers=-1) +voronoi_array = (idx + 1).reshape(H, W).astype(np.int32) # labels start at 1 + +# Preserve the same coordinate system and transform as the GT labels +transform = sd.transformations.get_transformation(gt_labels, get_all=True) +segmentation = Labels2DModel.parse( + voronoi_array, + transformations=transform, +) + +output = sd.SpatialData( + labels={ + 'segmentation': segmentation + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) + } +) + +print('Writing output', flush=True) +output.write(par['output'], overwrite=True) diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml index 0a71e50..7d47a02 100644 --- a/src/control_methods/true_labels/config.vsh.yaml +++ b/src/control_methods/true_labels/config.vsh.yaml @@ -1,59 +1,24 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test __merge__: ../../api/comp_control_method.yaml -# A unique identifier for your component (required). 
-# Can contain only lowercase letters or underscores. name: true_labels - -# A relatively short label, used when rendering visualisations (required) label: True Labels -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. +summary: "A positive control where the ground truth cell_labels are used as the prediction." description: | - A positive control, where the solution labels are copied 1 to 1 to the predicted data. - -# Metadata for your component -info: - # Which normalisation method this component prefers to use (required). - preferred_normalization: counts - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. + A positive control where the ground truth cell_labels segmentation is copied + directly as the prediction. This represents the upper bound of performance + for any segmentation method. -# Resources required to run the component resources: - # The script of your component (required) - type: python_script path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt engines: - # Specifications for the Docker image for this component. - type: docker image: openproblems/base_python:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - # setup: - # - type: python - # packages: scib==1.1.5 + __merge__: ../../base/setup_spatialdata_partial.yaml runners: - # This platform allows running the component natively - type: executable - # Allows turning the component into a Nextflow module / pipeline. 
- type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py index 935f3af..d59ae90 100644 --- a/src/control_methods/true_labels/script.py +++ b/src/control_methods/true_labels/script.py @@ -1,13 +1,14 @@ import anndata as ad +import numpy as np +import spatialdata as sd +import xarray as xr +from spatialdata.models import Labels2DModel ## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad', - 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad', - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'output': 'output.h5ad' + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.zarr' } meta = { 'name': 'true_labels' @@ -15,31 +16,45 @@ ## VIASH END print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) -input_solution = ad.read_h5ad(par['input_solution']) - -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... - -print('Generate predictions', flush=True) -# ... generate predictions ... 
-obs_label_pred = input_solution.obs["label"] - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] +sdata_solution = sd.read_zarr(par['input_solution']) + +print('Relabelling ground truth cell_labels as prediction', flush=True) +gt_labels = sdata_solution['cell_labels'] + +# Resolve the image array (handles both DataTree and DataArray) +if isinstance(gt_labels, xr.DataTree): + gt_array = gt_labels['scale0'].image.to_numpy() +else: + gt_array = gt_labels.to_numpy() + +# Randomly permute cell IDs while keeping background (0) unchanged. +# A correct metric (e.g. ARI) must be invariant to label permutation +rng = np.random.default_rng(42) +cell_ids = np.unique(gt_array[gt_array != 0]) +perm = rng.permutation(cell_ids) +lut = np.zeros(cell_ids.max() + 1, dtype=gt_array.dtype) +lut[cell_ids] = perm +relabelled = np.where(gt_array != 0, lut[gt_array], 0) + +transform = sd.transformations.get_transformation(gt_labels, get_all=True) +segmentation = Labels2DModel.parse( + relabelled, + transformations=transform, +) + +output = sd.SpatialData( + labels={ + 'segmentation': segmentation }, - obs={ - 'label_pred': obs_label_pred + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) } ) -output.obs_names = input_test.obs_names -output.write_h5ad(par['output'], compression='gzip') +print('Writing output', flush=True) +output.write(par['output'], overwrite=True) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0ea6508..96cc85c 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -5,10 +5,6 @@ name: process_dataset argument_groups: - name: "Processing parameters" arguments: - - name: 
"--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - name: "--span" type: double description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. @@ -24,7 +20,6 @@ resources: engines: - type: docker - #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work image: openproblems/base_python:1 setup: - type: python @@ -37,4 +32,4 @@ runners: - type: executable - type: nextflow directives: - label: [highmem, midcpu, midtime] \ No newline at end of file + label: [midmem, midcpu, midtime] diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index b04cb33..ad85433 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -1,5 +1,5 @@ -import random import anndata as ad +import pandas as pd import spatialdata as sd import scanpy as sc @@ -7,8 +7,9 @@ par = { 'input_sp': 'resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr', 'input_sc': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', - 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', - 'output_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output_spatial_unlabelled': 'spatial_unlabelled.zarr', + 'output_spatial_solution': 'spatial_solution.zarr', + 'output_scrnaseq_reference': 'scrnaseq_reference.h5ad', 'span': 0.3, 'seed': 123, 'n_top_genes': 3000, @@ -53,12 +54,6 @@ def sc_processing(adata): ) adata.var.rename(columns={"highly_variable": "hvg"}, inplace=True) - -# set seed if need be -if par["seed"]: - print(f">> Setting seed to {par['seed']}") - random.seed(par["seed"]) - print(">> Load data", flush=True) sc_data = ad.read_h5ad(par["input_sc"]) print(f"single cell data: {sc_data}") @@ -71,7 +66,7 @@ def 
sc_processing(adata): for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]: sc_data.uns[key] = par[key] -print(">> Writing data", flush=True) +print(">> Writing scrnaseq reference", flush=True) sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip") # read input_sp @@ -79,27 +74,81 @@ def sc_processing(adata): sp_data = sd.read_zarr(par["input_sp"]) print(f"spatial data: {sp_data}") -print(">> Processing spatial data", flush=True) -sp_data_table = sp_data.tables['table'] -print(f"single cell part of spatial data: {sp_data_table}") -sc_processing(sp_data_table) - -if "cell_area" not in sp_data_table.obs: - print(">> Perform scanpy qc for cell area", flush=True) - sc.pp.calculate_qc_metrics(sp_data_table, layer="counts", inplace=True) - -for x in ["transcript_counts", "n_genes_by_counts"]: - if f"ca_normalized_{x}" not in sp_data_table.obs and x in sp_data_table.obs: - print(f">> Perform cell area normalization for {x}", flush=True) - sp_data_table.obs[f'ca_normalized_{x}'] = sp_data_table.obs[f"{x}"] / sp_data_table.obs["cell_area"] - -print(">> Override dataset metadata in .uns", flush=True) -sp_data_table.uns["orig_dataset_id"] = sp_data_table.uns.get("dataset_id", None) -for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]: - sp_data_table.uns[key] = par[key] - -print(f"spatial data: {sp_data}") -print(f"spatial data tables['table']: {sp_data.tables['table']}") +dataset_uns = { + "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_reference": par["dataset_reference"], + "dataset_organism": par["dataset_organism"], + "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), +} -print(">> Writing 
spatial data", flush=True) -sp_data.write(par["output_spatial_dataset"], overwrite=True) +# --------------------------------------------------------------- +# output_spatial_unlabelled: image + transcripts (no ground truth) +# --------------------------------------------------------------- +print(">> Building spatial dataset for methods (no ground truth)", flush=True) + +# Strip columns that reveal ground truth cell assignments from transcripts +_GROUND_TRUTH_COLS = {"cell_id", "nucleus_id", "cell_type"} +transcripts = sp_data.points["transcripts"] +clean_transcript_cols = [c for c in transcripts.columns if c not in _GROUND_TRUTH_COLS] +clean_transcripts = transcripts[clean_transcript_cols] + +# Build var from unique feature names in transcripts, mapping to feature_ids from metadata +feature_names = transcripts["feature_name"].compute().unique().tolist() +var_df = pd.DataFrame({"feature_name": feature_names}, index=feature_names) +var_df.index.name = "feature_name" +if "metadata" in sp_data.tables and "gene_ids" in sp_data.tables["metadata"].var.columns: + id_map = sp_data.tables["metadata"].var["gene_ids"] + var_df["feature_id"] = var_df.index.map(id_map) + +# Minimal table: dataset metadata in uns, gene list in var +minimal_table = ad.AnnData(var=var_df, uns=dataset_uns) + +output_spatial = sd.SpatialData( + images={"morphology_mip": sp_data.images["morphology_mip"]}, + points={"transcripts": clean_transcripts}, + tables={"table": minimal_table}, +) + +print(">> Writing spatial unlabelled dataset", flush=True) +output_spatial.write(par["output_spatial_unlabelled"], overwrite=True) + +# --------------------------------------------------------------- +# output_spatial_solution: ground truth labels, shapes, reference table +# --------------------------------------------------------------- +print(">> Building spatial solution (ground truth)", flush=True) + +ref_table = sp_data.tables["table"] +solution_obs = ref_table.obs[["cell_id", "region"]].copy() +for extra_col in 
["cell_area", "transcript_counts"]: + if extra_col in ref_table.obs.columns: + solution_obs[extra_col] = ref_table.obs[extra_col] + +solution_table = ad.AnnData( + obs=solution_obs, + var=var_df, + uns={ + "dataset_id": par["dataset_id"], + "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), + "spatialdata_attrs": ref_table.uns["spatialdata_attrs"], + }, +) + +# Keep only the columns needed for the solution (ground truth assignments) +_SOLUTION_TRANSCRIPT_COLS = ["x", "y", "feature_name", "cell_id", "transcript_id"] +if "z" in transcripts.columns: + _SOLUTION_TRANSCRIPT_COLS = ["x", "y", "z"] + _SOLUTION_TRANSCRIPT_COLS[2:] +solution_transcripts = transcripts[[c for c in _SOLUTION_TRANSCRIPT_COLS if c in transcripts.columns]] + +output_solution = sd.SpatialData( + points={"transcripts": solution_transcripts}, + labels={k: v for k, v in sp_data.labels.items()}, + shapes={k: v for k, v in sp_data.shapes.items()}, + tables={"table": solution_table}, +) + +print(">> Writing spatial solution", flush=True) +output_solution.write(par["output_spatial_solution"], overwrite=True) diff --git a/src/data_processors/process_prediction/config.vsh.yaml b/src/data_processors/process_prediction/config.vsh.yaml new file mode 100644 index 0000000..7e473ac --- /dev/null +++ b/src/data_processors/process_prediction/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_output_processor.yaml + +name: process_prediction + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + packages: [scikit-learn, scikit-misc] + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, midcpu, midtime] diff --git a/src/data_processors/process_prediction/script.py b/src/data_processors/process_prediction/script.py new file mode 100644 index 0000000..17c3728 --- /dev/null +++ 
b/src/data_processors/process_prediction/script.py @@ -0,0 +1,110 @@ +import numpy as np +import xarray as xr +import anndata as ad +import pandas as pd +import spatialdata as sd +import scanpy as sc + +## VIASH START +par = { + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr', + 'input_spatial_unlabelled': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'output': 'output.zarr' +} +## VIASH END + +print(">> Reading input files", flush=True) +sdata_pred = sd.read_zarr(par["input_prediction"]) +sdata_sp = sd.read_zarr(par["input_spatial_unlabelled"]) + +dataset_id = sdata_sp.tables["table"].uns["dataset_id"] +method_id = sdata_pred.tables["table"].uns["method_id"] + +print(">> Transforming transcripts to global coordinate system", flush=True) +transcripts = sd.transform(sdata_sp["transcripts"], to_coordinate_system="global") + +# Adjust for any translation applied to the segmentation +trans = sd.transformations.get_transformation( + sdata_pred["segmentation"], get_all=True +)["global"].inverse() +transcripts = sd.transform(transcripts, trans, "global") + +print(">> Assigning transcripts to cells via label image lookup", flush=True) +y_coords = transcripts.y.compute().to_numpy(dtype=np.int64) +x_coords = transcripts.x.compute().to_numpy(dtype=np.int64) + +if isinstance(sdata_pred["segmentation"], xr.DataTree): + label_image = sdata_pred["segmentation"]["scale0"].image.to_numpy() +else: + label_image = sdata_pred["segmentation"].to_numpy() + +# Clip coordinates to valid label image bounds +y_coords = np.clip(y_coords, 0, label_image.shape[0] - 1) +x_coords = np.clip(x_coords, 0, label_image.shape[1] - 1) + +cell_ids = label_image[y_coords, x_coords] + +# NOTE: Is it useful to build a cxg count matrix? Is this used downstream? 
+print(">> Building cell x gene count matrix", flush=True) +feature_names = transcripts["feature_name"].compute().to_numpy() + +transcript_df = pd.DataFrame({"cell_id": cell_ids, "feature_name": feature_names}) +# Remove background (cell_id == 0) +transcript_df = transcript_df[transcript_df["cell_id"] != 0] + +count_matrix = ( + transcript_df.groupby(["cell_id", "feature_name"]) + .size() + .unstack(fill_value=0) +) + +obs = pd.DataFrame( + {"cell_id": count_matrix.index.astype(str), "region": pd.Categorical(["segmentation"] * len(count_matrix))}, + index=count_matrix.index.astype(str), +) +var = pd.DataFrame(index=count_matrix.columns.astype(str)) +var.index.name = "feature_name" +var["feature_name"] = var.index +if "var" in dir(sdata_sp.tables["table"]) and "feature_id" in sdata_sp.tables["table"].var.columns: + id_map = sdata_sp.tables["table"].var["feature_id"] + var["feature_id"] = var.index.map(id_map) + +table = ad.AnnData(X=count_matrix.values.astype(np.float32), obs=obs, var=var) +table.layers["counts"] = table.X.copy() + +print(">> Normalizing counts", flush=True) +sc.pp.normalize_total(table, target_sum=1e4) +table.layers["normalized"] = table.X.copy() + +sc.pp.log1p(table) +table.layers["normalized_log"] = table.X.copy() + +sc.pp.scale(table) +table.layers["normalized_log_scaled"] = table.X.copy() + +print(">> Computing highly variable genes", flush=True) +# Reset X to counts for HVG computation +table.X = table.layers["counts"].copy() +try: + sc.pp.highly_variable_genes(table, flavor="seurat_v3", layer="counts", n_top_genes=min(3000, table.n_vars)) +except ValueError: + # seurat_v3 loess fitting can fail on small datasets; fall back to seurat flavor + sc.pp.normalize_total(table, target_sum=1e4) + sc.pp.log1p(table) + sc.pp.highly_variable_genes(table, flavor="seurat", n_top_genes=min(3000, table.n_vars)) +table.var.rename(columns={"highly_variable": "hvg"}, inplace=True) + +table.uns["dataset_id"] = dataset_id +table.uns["method_id"] = method_id 
+table.uns["spatialdata_attrs"] = { + "instance_key": "cell_id", + "region": ["segmentation"], + "region_key": "region", +} + +print(">> Writing output", flush=True) +output = sd.SpatialData( + labels={"segmentation": sdata_pred["segmentation"]}, + tables={"table": table}, +) +output.write(par["output"], overwrite=True) diff --git a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py index 6ebae72..3f0236b 100644 --- a/src/methods/cellpose/script.py +++ b/src/methods/cellpose/script.py @@ -10,7 +10,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr', + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', 'output': 'prediction.zarr' } meta = { @@ -47,24 +47,23 @@ def convert_to_lower_dtype(arr): print('Cellpose segmentation finished, post-processing results', flush=True) masks = convert_to_lower_dtype(masks) -print('Segmentation done, preparing output', flush=True) -sd_output = sd.SpatialData() -data_array = xr.DataArray(masks, name='segmentation', dims=('y', 'x')) -parsed = Labels2DModel.parse(data_array, transformations=transformation) -sd_output.labels['segmentation'] = parsed - -cell_ids = np.unique(masks)[1:] # exclude background (0) -table = ad.AnnData( - obs=pd.DataFrame( - {'cell_id': cell_ids.astype(str), 'region': 'segmentation'}, - index=cell_ids.astype(str), - ), - uns={ - 'dataset_id': sdata.tables['table'].uns['dataset_id'], - 'method_id': meta['name'] +print('Creating output data structure', flush=True) +sd_output = sd.SpatialData( + labels={ + 'segmentation': Labels2DModel.parse( + xr.DataArray(masks, name='segmentation', dims=('y', 'x')), + transformations=transformation + ) + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) } ) -sd_output.tables['table'] = table print('Saving output', flush=True) if os.path.exists(par["output"]): diff 
--git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/accuracy/config.vsh.yaml deleted file mode 100644 index ac197bd..0000000 --- a/src/metrics/accuracy/config.vsh.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_metric.yaml - - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: accuracy - -# Metadata for your component -info: - metrics: - # A unique identifier for your metric (required). - # Can contain only lowercase letters or underscores. - - name: accuracy - # A relatively short label, used when rendering visualisarions (required) - label: Accuracy - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "The percentage of correctly predicted labels." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - The percentage of correctly predicted labels. - # A reference key from the bibtex library at src/common/library.bib (required). - references: - doi: 10.48550/arXiv.2008.05756 - # The minimum possible value for this metric (required) - min: 0 - # The maximum possible value for this metric (required) - max: 1 - # Whether a higher value represents a 'better' solution (required) - maximize: true - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. 
- - type: docker - image: openproblems/base_python:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py deleted file mode 100644 index 054e809..0000000 --- a/src/metrics/accuracy/script.py +++ /dev/null @@ -1,47 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.preprocessing - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'input_prediction': 'resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'accuracy' -} -## VIASH END - -print('Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_prediction = ad.read_h5ad(par['input_prediction']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - - -print('Compute metrics', flush=True) -# metric_ids and metric_values can have length > 1 -# but should be of equal length -uns_metric_ids = [ 
'accuracy' ] -uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_prediction.uns['dataset_id'], - 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], - 'metric_ids': uns_metric_ids, - 'metric_values': uns_metric_values - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/ari/config.vsh.yaml b/src/metrics/ari/config.vsh.yaml new file mode 100644 index 0000000..8c1e3df --- /dev/null +++ b/src/metrics/ari/config.vsh.yaml @@ -0,0 +1,36 @@ +__merge__: ../../api/comp_metric.yaml + +name: ari + +info: + metrics: + - name: ari + label: "Adjusted Rand Index" + summary: "ARI between ground truth and predicted transcript-to-cell assignments." + description: | + For each transcript, its cell ID is looked up in both the ground truth + label image (cell_labels) and the predicted segmentation label image. + The Adjusted Rand Index (ARI) is then computed between the two resulting + assignment vectors. ARI = 1 is perfect agreement; values near 0 are random. + Background transcripts (cell_id = 0 in both) are excluded. 
+ # TODO: add ref and link + references: + doi: 10.1007/BF01908075 + min: -1 + max: 1 + maximize: true + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + __merge__: ../../base/setup_spatialdata_partial.yaml + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] \ No newline at end of file diff --git a/src/metrics/ari/script.py b/src/metrics/ari/script.py new file mode 100644 index 0000000..6c5a848 --- /dev/null +++ b/src/metrics/ari/script.py @@ -0,0 +1,70 @@ +import numpy as np +import xarray as xr +import anndata as ad +import spatialdata as sd +from sklearn.metrics import adjusted_rand_score + +## VIASH START +par = { + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.h5ad' +} +meta = { + 'name': 'ari' +} +## VIASH END + +print(">> Reading input files", flush=True) +sdata_pred = sd.read_zarr(par["input_prediction"]) +sdata_sol = sd.read_zarr(par["input_solution"]) + +dataset_id = sdata_sol.tables["table"].uns["dataset_id"] +method_id = sdata_pred.tables["table"].uns["method_id"] + +print(">> Loading transcripts in global coordinate system", flush=True) +transcripts = sd.transform(sdata_sol["transcripts"], to_coordinate_system="global") + +def lookup_labels(label_element, transcripts_global): + """Look up label image values at transcript coordinates.""" + trans = sd.transformations.get_transformation( + label_element, get_all=True + )["global"].inverse() + transcripts_local = sd.transform(transcripts_global, trans, "global") + y = transcripts_local.y.compute().to_numpy(dtype=np.int64) + x = transcripts_local.x.compute().to_numpy(dtype=np.int64) + if isinstance(label_element, xr.DataTree): + img = label_element["scale0"].image.to_numpy() + else: + img = 
label_element.to_numpy() + y = np.clip(y, 0, img.shape[0] - 1) + x = np.clip(x, 0, img.shape[1] - 1) + return img[y, x] + +print(">> Looking up ground truth cell IDs from cell_labels", flush=True) +gt_cell_ids = lookup_labels(sdata_sol["cell_labels"], transcripts) + +print(">> Looking up predicted cell IDs from segmentation", flush=True) +pred_cell_ids = lookup_labels(sdata_pred["segmentation"], transcripts) + +print(">> Computing ARI", flush=True) +# Exclude transcripts that are background in both +mask = (gt_cell_ids != 0) | (pred_cell_ids != 0) +print(f" Transcripts used: {mask.sum()} / {len(mask)}", flush=True) +print(f" GT unique cells: {len(np.unique(gt_cell_ids[mask]))}", flush=True) +print(f" Pred unique cells: {len(np.unique(pred_cell_ids[mask]))}", flush=True) + +ari_score = adjusted_rand_score(gt_cell_ids[mask], pred_cell_ids[mask]) +print(f" ARI = {ari_score:.4f}", flush=True) + +print(">> Writing output", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": dataset_id, + "normalization_id": "counts", + "method_id": method_id, + "metric_ids": ["ari"], + "metric_values": [float(ari_score)], + } +) +output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index c71286a..454c63f 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,8 +14,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml + - name: "--output_spatial_unlabelled" + __merge__: /src/api/file_spatial_unlabelled.yaml + direction: output + required: true + - name: "--output_spatial_solution" + __merge__: /src/api/file_spatial_solution.yaml direction: output required: true - name: "--output_scrnaseq_reference" diff --git a/src/workflows/process_datasets/main.nf 
b/src/workflows/process_datasets/main.nf index 947a8f1..2e0c2d4 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,13 +44,14 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - output_spatial_dataset: "output_spatial_dataset", + output_spatial_unlabelled: "output_spatial_unlabelled", + output_spatial_solution: "output_spatial_solution", output_scrnaseq_reference: "output_scrnaseq_reference" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) + | setState(["output_spatial_unlabelled", "output_spatial_solution", "output_scrnaseq_reference"]) emit: output_ch diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4ab5f83..67bf53c 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -4,13 +4,17 @@ namespace: workflows argument_groups: - name: Inputs arguments: - - name: "--input_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml - direction: output + - name: "--input_spatial_unlabelled" + __merge__: /src/api/file_spatial_unlabelled.yaml + direction: input + required: true + - name: "--input_spatial_solution" + __merge__: /src/api/file_spatial_solution.yaml + direction: input required: true - name: "--input_scrnaseq_reference" __merge__: /src/api/file_scrnaseq_reference.yaml - direction: output + direction: input required: true - name: Outputs arguments: @@ -58,8 +62,11 @@ dependencies: - name: utils/extract_uns_metadata repository: openproblems - name: control_methods/true_labels + - name: control_methods/empty_labels + - name: control_methods/random_voronoi - name: methods/cellpose - - name: metrics/accuracy + - name: metrics/ari + - name: data_processors/process_prediction runners: - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 
fe25140..d39386c 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -8,12 +8,14 @@ workflow auto { // construct list of methods and control methods methods = [ true_labels, + empty_labels, + random_voronoi, cellpose ] // construct list of metrics metrics = [ - accuracy + ari ] workflow run_wf { @@ -33,7 +35,7 @@ workflow run_wf { // extract the dataset metadata | extract_uns_metadata.run( - fromState: [input: "input_solution"], + fromState: [input: "input_spatial_unlabelled"], toState: { id, output, state -> state + [ dataset_uns: readYaml(output.output).uns @@ -52,14 +54,8 @@ workflow run_wf { // use the 'filter' argument to only run a method on the normalisation the component is asking for filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) - - method_check && norm_check + method_check }, // define a new 'id' by appending the method name to the dataset id @@ -70,11 +66,10 @@ workflow run_wf { // use 'fromState' to fetch the arguments the component requires from the overall state fromState: { id, state, comp -> def new_args = [ - input_train: state.input_train, - input_test: state.input_test + input: state.input_spatial_unlabelled ] if (comp.config.info.type == "control_method") { - new_args.input_solution = state.input_solution + new_args.input_solution = state.input_spatial_solution } new_args }, @@ -88,6 +83,15 @@ workflow run_wf { } ) + | process_prediction.run( + fromState: [input: "method_output"], + toState: { id, output, state -> + state + [ + input_prediction: output.output + ] + } + ) + // run all metrics | runEach( components: metrics, @@ -96,8 +100,8 @@ workflow run_wf { }, // use 
'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", - input_prediction: "method_output" + input_solution: "input_spatial_solution", + input_prediction: "input_prediction" ], // use 'toState' to publish that component's outputs to the overall state toState: { id, output, state, comp -> @@ -136,10 +140,8 @@ workflow run_wf { // extract the dataset metadata meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> state.dataset_uns.normalization_id == "log_cp10k" - } + // keep all entries (no normalization-based filtering in this task) + | filter{ id, state -> true } | joinStates { ids, states -> // store the dataset metadata in a file def dataset_uns = states.collect{state ->