Vitek-Lab · tonywu1999 · Jun 10, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 10, 2026
diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
@@ -9,17 +9,25 @@
 #' @param mzmine_annotations `data.frame` of MZMine spectral-library
 #'   annotations with columns `id`, `compound_name`, `score`. Required;
 #'   passing `NULL` raises an error. The highest-scoring `compound_name`
-#'   per feature is used as `ProteinName`, and features in the quant
-#'   table with no matching annotation row are dropped from the output.
-#'   These are MSI Level 2 annotations (putative identification via
-#'   MS/MS spectral matching). See the public `MZMinetoMSstatsFormat`
-#'   docstring for the full scope discussion.
+#'   per feature (MSI Level 2 putative identification via MS/MS spectral
+#'   matching) is used as `ProteinName`.
+#' @param sirius_annotations Optional `data.frame` of SIRIUS
+#'   `structure_identifications.tsv` output, or `NULL`. Only the
+#'   `mappingFeatureId` and `name` columns are read; score columns
+#'   (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`,
+#'   `SiriusScore`) are ignored in this release. When supplied, the
+#'   SIRIUS `name` (MSI Level 3, in-silico structure prediction) fills
+#'   `ProteinName` for features that received no MZMine compound name.
+#'   The schema is validated against SIRIUS 6 output; users on other
+#'   versions can rename columns to match. Pass `NULL` to disable.
 #' @return data.table
 #' @keywords internal
-.cleanRawMZMine <- function(msstats_object, mzmine_annotations) {
+.cleanRawMZMine <- function(msstats_object, mzmine_annotations,
+                            sirius_annotations = NULL) {
     ProteinName = PeptideSequence = Intensity = Run = NULL
     PrecursorCharge = FragmentIon = ProductCharge = NULL
     id = score = compound_name = i.compound_name = NULL
+    rowmz = rowretentiontime = mappingFeatureId = name = NULL
 
     mz_input = getInputFile(msstats_object, "input")
     mz_input = data.table::as.data.table(mz_input)
@@ -32,10 +40,13 @@
              "columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
     }
     id_col <- "rowID"
-    required_meta <- id_col
+    mz_col <- "rowmz"
+    rt_col <- "rowretentiontime"
+    required_meta <- c(id_col, mz_col, rt_col)
     missing_meta <- setdiff(required_meta, colnames(mz_input))
     if (length(missing_meta) > 0) {
-        stop("Missing required MZMine metadata column (expected 'row ID'). ",
+        stop("Missing required MZMine metadata column(s) ",
+             "(expected 'row ID', 'row m/z', 'row retention time'). ",
              "After standardization, looked for: ",
              paste(missing_meta, collapse = ", "), ".")
     }
@@ -57,20 +68,47 @@
     }
     data.table::setorder(feature_to_compound, id, -score)
     feature_to_compound <- unique(feature_to_compound, by = "id")
-    # Inner-join filter: drop quant rows with no matching annotation.
+    # MZMine compound name fill (left-join, no drop).
     mz_input[
         feature_to_compound,
         ProteinName := i.compound_name,
         on = setNames("id", id_col)
     ]
-    mz_input <- mz_input[!is.na(ProteinName)]
+    n_mzmine <- sum(!is.na(mz_input$ProteinName))
 
-    retained_ids <- feature_to_compound$id
-    retained_msg <- paste0("** MZMine: retained ", length(retained_ids),
-                           " feature(s) after annotation join: ",
-                           paste(retained_ids, collapse = ", "))
-    getOption("MSstatsLog")("INFO", retained_msg)
-    getOption("MSstatsMsg")("INFO", retained_msg)
+    # SIRIUS name fills features still NA after the MZMine compound fill.
+    n_sirius <- 0L
+    if (!is.null(sirius_annotations)) {
+        sirius_dt <- data.table::copy(data.table::as.data.table(sirius_annotations))
+        drop_cols <- setdiff(colnames(sirius_dt), c("mappingFeatureId", "name"))
+        for (col in drop_cols) data.table::set(sirius_dt, j = col, value = NULL)
+        sirius_dt[, name := ifelse(is.na(name) | name == "",
+                                   NA_character_, as.character(name))]
+        sirius_dt[, mappingFeatureId := as.character(mappingFeatureId)]
+        # setorder() puts NA first by default; na.last = TRUE keeps a blank
+        # candidate from shadowing a named one when unique() takes row one.
+        data.table::setorder(sirius_dt, mappingFeatureId, name, na.last = TRUE)
+        sirius_dt <- unique(sirius_dt, by = "mappingFeatureId")
+        mz_input[is.na(ProteinName), ProteinName :=
+            sirius_dt[.(as.character(get(id_col))), on = "mappingFeatureId", name]]
+        n_sirius <- sum(!is.na(mz_input$ProteinName)) - n_mzmine
+    }
+
+    # m/z-RT fallback for features still NA.
+    na_mask <- is.na(mz_input$ProteinName)
+    n_fallback <- sum(na_mask)
+    if (n_fallback > 0) {
+        mz_input[na_mask, ProteinName := paste0(
+            round(get(mz_col), 4), "_", round(get(rt_col), 2))]
+    }
+
+    assignment_msg <- paste0(
+        "** MZMine ProteinName assignment: ",
+        "MZMine compound: ", n_mzmine, " feature(s); ",
+        "SIRIUS name: ", n_sirius, " feature(s); ",
+        "m/z-RT fallback: ", n_fallback, " feature(s).")
+    getOption("MSstatsLog")("INFO", assignment_msg)
+    getOption("MSstatsMsg")("INFO", assignment_msg)
 
     mz_input[, PeptideSequence := as.character(get(id_col))]
 

diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
@@ -1,6 +1,7 @@
 #' Import MZMine files
 #'
 #' @inheritParams .sharedParametersAmongConverters
+#' @inheritParams .cleanRawMZMine
 #' @param input MZMine feature-quantification table (wide format; one row per
 #'   feature). Must include the metadata columns `row ID`, `row m/z`,
 #'   `row retention time`, and per-sample peak-area columns named
@@ -11,19 +12,39 @@
 #'   trailing `"Peakarea"` suffix removed. For example, a quant-file column
 #'   `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization,
 #'   so the corresponding `Run` value must be `sampleAmzML`.
-#' @param mzmine_annotations `data.frame` of MZMine spectral-library
-#'   annotations with columns `id`, `compound_name`, `score`. Required:
-#'   the highest-scoring `compound_name` per feature is used as
-#'   `ProteinName`, and features in the quant table with no matching
-#'   annotation row are dropped from the output.
 #'
-#'   These are MSI Level 2 annotations (putative identification via
-#'   MS/MS spectral matching against a reference library). Higher-
-#'   confidence Level 1 identifications require pure reference standards
-#'   and are out of scope here. Lower-confidence annotations such as
-#'   Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via
-#'   CANOPUS) are not currently supported -- features without a Level 2
-#'   annotation row are filtered out.
+#' @details
+#' `ProteinName` is assigned from one of three sources, in priority
+#' order: the MZMine compound name (mandatory), the SIRIUS name
+#' (optional), and an m/z-RT fallback (always available).
+#'
+#' The **MZMine compound name** is the highest-scoring `compound_name`
+#' from `mzmine_annotations` for each feature. This corresponds to MSI
+#' Level 2 (Sumner et al. 2007, PMID 27624161): a putative
+#' identification by MS/MS spectral matching to a reference library.
+#'
+#' The **SIRIUS name** comes from SIRIUS's
+#' `structure_identifications.tsv` and corresponds to MSI Level 3: an
+#' in-silico structure prediction. When `sirius_annotations` is
+#' non-NULL, the SIRIUS `name` fills `ProteinName` only for features
+#' the MZMine library missed -- the MZMine compound name takes
+#' precedence.
+#'
+#' The **m/z-RT fallback** is an identifier built from the feature's
+#' m/z and retention time (for example, `455.282_0.65`). Features that
+#' receive no MZMine or SIRIUS annotation are retained, not dropped,
+#' and assigned an m/z-RT identifier as their `ProteinName`.
+#'
+#' Retaining every feature is a deliberate trade-off. A fuller feature
+#' set gives more stable medians and a more reliable empirical
+#' distribution for global normalization. SIRIUS extends discovery
+#' coverage to features that Level 2 spectral matching misses. The cost
+#' is more hypotheses tested downstream (in `MSstats::groupComparison`),
+#' which makes the multiple-testing correction more stringent and lowers
+#' statistical power. Users running confirmatory analyses should restrict
+#' to the MZMine-annotated features post-conversion; users running
+#' discovery analyses benefit from the additional sources despite the
+#' multiple-testing burden.
 #'
 #' @return data.table in the MSstats required format.
 #'
@@ -43,11 +64,23 @@
 #'                                mzmine_annotations = lib,
 #'                                use_log_file = FALSE)
 #' head(output)
+#'
+#' # With SIRIUS annotations:
+#' sirius_path = system.file(
+#'   "tinytest/raw_data/MZMine/structure_identifications.tsv",
+#'   package = "MSstatsConvert")
+#' sirius = data.table::fread(sirius_path)
+#' output_with_sirius = MZMinetoMSstatsFormat(
+#'   input, annotation = annot,
+#'   mzmine_annotations = lib,
+#'   sirius_annotations = sirius,
+#'   use_log_file = FALSE)
+#' head(output_with_sirius)
 MZMinetoMSstatsFormat = function(
     input,
     annotation = NULL,
     mzmine_annotations,
-    removeProtein_with1Feature = FALSE,
+    sirius_annotations = NULL,
     summaryforMultipleRows = max,
     use_log_file = TRUE,
     append = FALSE,
@@ -62,10 +95,22 @@ MZMinetoMSstatsFormat = function(
              "columns 'id', 'compound_name', 'score'.")
     }
 
+    if (!is.null(sirius_annotations)) {
+        sirius_cols = colnames(sirius_annotations)
+        missing_sirius = setdiff(c("mappingFeatureId", "name"), sirius_cols)
+        if (length(missing_sirius) > 0) {
+            stop("sirius_annotations is missing required column(s): ",
+                 paste(missing_sirius, collapse = ", "),
+                 ". Required: 'mappingFeatureId' and 'name'.")
+        }
+    }
+
     input = MSstatsConvert::MSstatsImport(list(input = input),
                                           "MSstats", "MZMine", ...)
     input = MSstatsConvert::MSstatsClean(
-        input, mzmine_annotations = mzmine_annotations)
+        input,
+        mzmine_annotations = mzmine_annotations,
+        sirius_annotations = sirius_annotations)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
 
     feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge")
@@ -75,7 +120,7 @@ MZMinetoMSstatsFormat = function(
         annotation,
         feature_columns,
         remove_shared_peptides = FALSE,
-        remove_single_feature_proteins = removeProtein_with1Feature,
+        remove_single_feature_proteins = FALSE,
         exact_filtering = NULL,
         pattern_filtering = NULL,
         aggregate_isotopic = FALSE,

diff --git a/inst/tinytest/raw_data/MZMine/structure_identifications.tsv b/inst/tinytest/raw_data/MZMine/structure_identifications.tsv
@@ -0,0 +1,5 @@
+mappingFeatureId	name	ConfidenceScoreExact	ConfidenceScoreApproximate	SiriusScore
+1	DuplicateFromSirius	0.30	0.40	5.5
+4	Caffeic acid	0.85	0.88	22.1
+5		0.10	0.12	1.0
+99	Ghost	0.50	0.55	8.0
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -15,10 +15,10 @@ output = MZMinetoMSstatsFormat(input, annotation = annot,
                                use_log_file = FALSE)
 output_dt = data.table::as.data.table(output)
 
-# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns
-# Features 4 and 5 have no annotation row and are dropped by the inner join.
+# Basic structure: 6 features x 4 runs = 24 rows; all features retained.
+# Features 4 and 5 have no MZMine annotation and receive mz_rt fallback names.
 expect_equal(ncol(output), 11)
-expect_equal(nrow(output), 16)
+expect_equal(nrow(output), 24)
 expect_true("Run" %in% colnames(output))
 expect_true("ProteinName" %in% colnames(output))
 expect_true("PeptideSequence" %in% colnames(output))
@@ -54,11 +54,13 @@ expect_equal(as.character(feature3_proteins), "Lactate")
 feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName])
 expect_equal(as.character(feature6_proteins), "Caffeine")
 
-# Features absent from the annotations file are filtered out (no mz_rt fallback)
-expect_false("4" %in% as.character(output_dt$PeptideSequence))
-expect_false("5" %in% as.character(output_dt$PeptideSequence))
-expect_false(any(as.character(output_dt$ProteinName) %in%
-                 c("489.334_7.89", "555.447_9.1")))
+# Features absent from the MZMine annotations file get mz_rt fallback ProteinNames.
+expect_true("4" %in% as.character(output_dt$PeptideSequence))
+expect_true("5" %in% as.character(output_dt$PeptideSequence))
+feature4_protein = unique(output_dt[PeptideSequence == "4", ProteinName])
+expect_equal(as.character(feature4_protein), "489.334_7.89")
+feature5_protein = unique(output_dt[PeptideSequence == "5", ProteinName])
+expect_equal(as.character(feature5_protein), "555.447_9.1")
 
 # Zero-intensity input cells are converted to NA in output
 # Feature 3 sampleB = 0  ->  NA  (feature 3 is annotated as Lactate)
@@ -97,17 +99,43 @@ expect_error(
     "mzmine_annotations is required"
 )
 
-# removeProtein_with1Feature filters non-Caffeine proteins -------------------
-# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6);
-# Lactate and Glucose each have 1.
-output_filtered = MZMinetoMSstatsFormat(input, annotation = annot,
-                                        mzmine_annotations = mzmine_ann,
-                                        removeProtein_with1Feature = TRUE,
-                                        use_log_file = FALSE)
-output_filtered_dt = data.table::as.data.table(output_filtered)
-
-expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine")
-# 2 features x 4 runs = 8 rows
-expect_equal(nrow(output_filtered), 8)
-expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))),
-             c("1", "6"))
+# With sirius_annotations supplied ---------------------------------------------
+sirius_path = system.file("tinytest/raw_data/MZMine/structure_identifications.tsv",
+                          package = "MSstatsConvert")
+sirius = data.table::fread(sirius_path)
+
+output_sirius = MZMinetoMSstatsFormat(input, annotation = annot,
+                                      mzmine_annotations = mzmine_ann,
+                                      sirius_annotations = sirius,
+                                      use_log_file = FALSE)
+output_sirius_dt = data.table::as.data.table(output_sirius)
+
+# All 6 features still retained
+expect_equal(nrow(output_sirius), 24)
+
+# Precedence: feature 1 hit by both MZMine (Caffeine) and SIRIUS
+# (DuplicateFromSirius). MZMine wins.
+feature1_proteins = unique(output_sirius_dt[PeptideSequence == "1", ProteinName])
+expect_equal(as.character(feature1_proteins), "Caffeine")
+
+# SIRIUS fill: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid"
+feature4_proteins = unique(output_sirius_dt[PeptideSequence == "4", ProteinName])
+expect_equal(as.character(feature4_proteins), "Caffeic acid")
+
+# m/z-RT fallback: feature 5 has only an empty-name SIRIUS row; falls to m/z-RT
+feature5_proteins = unique(output_sirius_dt[PeptideSequence == "5", ProteinName])
+expect_equal(as.character(feature5_proteins), "555.447_9.1")
+
+# An irrelevant SIRIUS row (mappingFeatureId=99) must not introduce new features
+expect_false("99" %in% as.character(output_sirius_dt$PeptideSequence))
+expect_false("Ghost" %in% as.character(output_sirius_dt$ProteinName))
+
+# sirius_annotations missing required columns triggers stop() ------------------
+bad_sirius = data.frame(mappingFeatureId = 1, score = 0.9)  # no 'name'
+expect_error(
+    MZMinetoMSstatsFormat(input, annotation = annot,
+                          mzmine_annotations = mzmine_ann,
+                          sirius_annotations = bad_sirius,
+                          use_log_file = FALSE),
+    "missing required column"
+)
diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd