From 0096baacaf55d7e33a28c12010472b28d12e3410 Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Thu, 4 Jun 2026 11:50:32 -0400
Subject: [PATCH 1/4] Add SIRIUS annotation support and mz_rt fallback to
 MZMine converter

Retain features lacking MZMine compound names, assign ProteinName via
MZMine/SIRIUS/mz_rt tiers, restore required mz/RT metadata, and add
validation, tests, docs, and tier-level logging.

---
 R/clean_MZMine.R                              | 70 ++++++++++++++----
 R/converters_MZMinetoMSstatsFormat.R          | 74 ++++++++++++++++---
 .../MZMine/structure_identifications.tsv      |  5 ++
 .../test_converters_MZMinetoMSstatsFormat.R   | 59 +++++++++++++--
 man/MSstatsClean.Rd                           | 16 ++--
 man/MZMinetoMSstatsFormat.Rd                  | 66 ++++++++++++++---
 man/dot-cleanRawMZMine.Rd                     | 16 ++--
 vignettes/msstats_data_format.Rmd             | 68 ++++++++++++++---
 8 files changed, 306 insertions(+), 68 deletions(-)
 create mode 100644 inst/tinytest/raw_data/MZMine/structure_identifications.tsv
diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index cee0cf05..ec0f3d7f 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -9,17 +9,22 @@
 #' @param mzmine_annotations `data.frame` of MZMine spectral-library
 #'   annotations with columns `id`, `compound_name`, `score`. Required;
 #'   passing `NULL` raises an error. The highest-scoring `compound_name`
-#'   per feature is used as `ProteinName`, and features in the quant
-#'   table with no matching annotation row are dropped from the output.
-#'   These are MSI Level 2 annotations (putative identification via
-#'   MS/MS spectral matching). See the public `MZMinetoMSstatsFormat`
-#'   docstring for the full scope discussion.
+#'   per feature is used as `ProteinName` (tier 1, MSI Level 2 putative
+#'   identification via MS/MS spectral matching). See the public
+#'   `MZMinetoMSstatsFormat` docstring for the full tier discussion.
+#' @param sirius_annotations Optional `data.frame` of SIRIUS
+#'   `structure_identifications.tsv` output, or `NULL`. Only the
+#'   `mappingFeatureId` and `name` columns are read. When supplied,
+#'   SIRIUS `name` fills `ProteinName` for features that received no
+#'   MZMine compound (tier 2, MSI Level 3).
 #' @return data.table
 #' @keywords internal
-.cleanRawMZMine <- function(msstats_object, mzmine_annotations) {
+.cleanRawMZMine <- function(msstats_object, mzmine_annotations,
+                            sirius_annotations = NULL) {
     ProteinName = PeptideSequence = Intensity = Run = NULL
     PrecursorCharge = FragmentIon = ProductCharge = NULL
     id = score = compound_name = i.compound_name = NULL
+    rowmz = rowretentiontime = mappingFeatureId = name = NULL
 
     mz_input = getInputFile(msstats_object, "input")
     mz_input = data.table::as.data.table(mz_input)
@@ -32,10 +37,13 @@
              "columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
     }
     id_col <- "rowID"
-    required_meta <- id_col
+    mz_col <- "rowmz"
+    rt_col <- "rowretentiontime"
+    required_meta <- c(id_col, mz_col, rt_col)
     missing_meta <- setdiff(required_meta, colnames(mz_input))
     if (length(missing_meta) > 0) {
-        stop("Missing required MZMine metadata column (expected 'row ID'). ",
+        stop("Missing required MZMine metadata column(s) ",
+             "(expected 'row ID', 'row m/z', 'row retention time'). ",
              "After standardization, looked for: ",
              paste(missing_meta, collapse = ", "), ".")
     }
@@ -57,20 +65,50 @@
     }
     data.table::setorder(feature_to_compound, id, -score)
     feature_to_compound <- unique(feature_to_compound, by = "id")
-    # Inner-join filter: drop quant rows with no matching annotation.
+    # Tier 1: MZMine compound name (left-join, no drop).
     mz_input[
         feature_to_compound,
         ProteinName := i.compound_name,
         on = setNames("id", id_col)
     ]
-    mz_input <- mz_input[!is.na(ProteinName)]
+    n_tier1 <- sum(!is.na(mz_input$ProteinName))
 
-    retained_ids <- feature_to_compound$id
-    retained_msg <- paste0("** MZMine: retained ", length(retained_ids),
-                           " feature(s) after annotation join: ",
-                           paste(retained_ids, collapse = ", "))
-    getOption("MSstatsLog")("INFO", retained_msg)
-    getOption("MSstatsMsg")("INFO", retained_msg)
+    # Tier 2: SIRIUS name fills features still NA after tier 1.
+    n_tier2 <- 0L
+    if (!is.null(sirius_annotations)) {
+        sirius_dt <- data.table::as.data.table(sirius_annotations)
+        sirius_dt <- sirius_dt[, c("mappingFeatureId", "name"), with = FALSE]
+        sirius_dt[, name := ifelse(is.na(name) | name == "",
+                                   NA_character_, as.character(name))]
+        sirius_dt[, mappingFeatureId := as.character(mappingFeatureId)]
+        data.table::setorder(sirius_dt, mappingFeatureId)
+        sirius_dt <- unique(sirius_dt, by = "mappingFeatureId")
+        mz_input[, ProteinName := ifelse(
+            is.na(ProteinName),
+            sirius_dt[
+                .(as.character(get(id_col))),
+                on = "mappingFeatureId",
+                name
+            ],
+            ProteinName)]
+        n_tier2 <- sum(!is.na(mz_input$ProteinName)) - n_tier1
+    }
+
+    # Tier 3: mz_rt fallback for features still NA.
+    na_mask <- is.na(mz_input$ProteinName)
+    n_tier3 <- sum(na_mask)
+    if (n_tier3 > 0) {
+        mz_input[na_mask, ProteinName := paste0(
+            round(get(mz_col), 4), "_", round(get(rt_col), 2))]
+    }
+
+    tier_msg <- paste0(
+        "** MZMine ProteinName assignment: ",
+        "tier 1 (MZMine compound): ", n_tier1, " feature(s); ",
+        "tier 2 (SIRIUS name): ", n_tier2, " feature(s); ",
+        "tier 3 (mz_rt fallback): ", n_tier3, " feature(s).")
+    getOption("MSstatsLog")("INFO", tier_msg)
+    getOption("MSstatsMsg")("INFO", tier_msg)
 
     mz_input[, PeptideSequence := as.character(get(id_col))]
 
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
index 4e3df158..bc544285 100644
--- a/R/converters_MZMinetoMSstatsFormat.R
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -14,16 +14,45 @@
 #' @param mzmine_annotations `data.frame` of MZMine spectral-library
 #'   annotations with columns `id`, `compound_name`, `score`. Required:
 #'   the highest-scoring `compound_name` per feature is used as
-#'   `ProteinName`, and features in the quant table with no matching
-#'   annotation row are dropped from the output.
+#'   `ProteinName` (tier 1, MSI Level 2 putative identification via
+#'   MS/MS spectral matching).
+#' @param sirius_annotations Optional `data.frame` of SIRIUS
+#'   `structure_identifications.tsv` output, or `NULL`. Only the
+#'   `mappingFeatureId` and `name` columns are read; score columns
+#'   (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`,
+#'   `SiriusScore`) are ignored in this release. When supplied, SIRIUS
+#'   `name` fills `ProteinName` for features that received no MZMine
+#'   compound (tier 2, MSI Level 3 in-silico structure prediction).
+#'   The schema is validated against SIRIUS 6 output; users on other
+#'   versions can rename columns to match. Pass `NULL` to disable.
 #'
-#'   These are MSI Level 2 annotations (putative identification via
-#'   MS/MS spectral matching against a reference library). Higher-
-#'   confidence Level 1 identifications require pure reference standards
-#'   and are out of scope here. Lower-confidence annotations such as
-#'   Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via
-#'   CANOPUS) are not currently supported -- features without a Level 2
-#'   annotation row are filtered out.
+#' @details
+#' `ProteinName` is assigned in three tiers, in priority order:
+#'
+#' 1. **MZMine compound (mandatory)** -- the highest-scoring
+#'    `compound_name` from `mzmine_annotations`. Equivalent to MSI
+#'    Level 2 (Sumner et al. 2007, PMID 27624161): putative
+#'    identification by MS/MS spectral matching to a reference library.
+#'
+#' 2. **SIRIUS name (optional)** -- when `sirius_annotations` is
+#'    non-NULL, the `name` from SIRIUS `structure_identifications.tsv`
+#'    fills any `ProteinName` still NA after tier 1. Equivalent to MSI
+#'    Level 3: in-silico structure prediction. MZMine annotations take
+#'    precedence: SIRIUS only fills features that MZMine missed.
+#'
+#' 3. **mz_rt fallback (always)** -- features with no annotation from
+#'    either source are retained, not dropped, and assigned
+#'    `paste0(round(mz, 4), "_", round(rt, 2))` as their `ProteinName`.
+#'
+#' The tier-3 retain-all policy is a deliberate trade-off. A fuller
+#' feature set gives more stable medians and a more reliable empirical
+#' distribution for global normalization. SIRIUS extends discovery
+#' coverage to features that level-2 spectral matching misses. The
+#' cost is an increase in the number of hypotheses tested downstream
+#' (in `MSstats::groupComparison`), which weakens multiple-testing
+#' correction. Users running confirmatory analyses should restrict to
+#' tier-1 features post-conversion; users running discovery analyses
+#' benefit from the additional tiers despite the FDR burden.
 #'
 #' @return data.table in the MSstats required format.
 #'
@@ -43,10 +72,23 @@
 #'                                mzmine_annotations = lib,
 #'                                use_log_file = FALSE)
 #' head(output)
+#'
+#' # With SIRIUS annotations:
+#' sirius_path = system.file(
+#'   "tinytest/raw_data/MZMine/structure_identifications.tsv",
+#'   package = "MSstatsConvert")
+#' sirius = data.table::fread(sirius_path)
+#' output_with_sirius = MZMinetoMSstatsFormat(
+#'   input, annotation = annot,
+#'   mzmine_annotations = lib,
+#'   sirius_annotations = sirius,
+#'   use_log_file = FALSE)
+#' head(output_with_sirius)
 MZMinetoMSstatsFormat = function(
     input,
     annotation = NULL,
     mzmine_annotations,
+    sirius_annotations = NULL,
     removeProtein_with1Feature = FALSE,
     summaryforMultipleRows = max,
     use_log_file = TRUE,
@@ -62,10 +104,22 @@ MZMinetoMSstatsFormat = function(
              "columns 'id', 'compound_name', 'score'.")
     }
 
+    if (!is.null(sirius_annotations)) {
+        sirius_cols = colnames(sirius_annotations)
+        missing_sirius = setdiff(c("mappingFeatureId", "name"), sirius_cols)
+        if (length(missing_sirius) > 0) {
+            stop("sirius_annotations is missing required column(s): ",
+                 paste(missing_sirius, collapse = ", "),
+                 ". Required: 'mappingFeatureId' and 'name'.")
+        }
+    }
+
     input = MSstatsConvert::MSstatsImport(list(input = input),
                                           "MSstats", "MZMine", ...)
     input = MSstatsConvert::MSstatsClean(
-        input, mzmine_annotations = mzmine_annotations)
+        input,
+        mzmine_annotations = mzmine_annotations,
+        sirius_annotations = sirius_annotations)
     annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)
 
     feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge")
diff --git a/inst/tinytest/raw_data/MZMine/structure_identifications.tsv b/inst/tinytest/raw_data/MZMine/structure_identifications.tsv
new file mode 100644
index 00000000..a0e38e69
--- /dev/null
+++ b/inst/tinytest/raw_data/MZMine/structure_identifications.tsv
@@ -0,0 +1,5 @@
+mappingFeatureId	name	ConfidenceScoreExact	ConfidenceScoreApproximate	SiriusScore
+1	DuplicateFromSirius	0.30	0.40	5.5
+4	Caffeic acid	0.85	0.88	22.1
+5		0.10	0.12	1.0
+99	Ghost	0.50	0.55	8.0
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
index dcfccf90..6ac1c0fc 100644
--- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
+++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -15,10 +15,10 @@ output = MZMinetoMSstatsFormat(input, annotation = annot,
                                use_log_file = FALSE)
 output_dt = data.table::as.data.table(output)
 
-# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns
-# Features 4 and 5 have no annotation row and are dropped by the inner join.
+# Basic structure: 6 features x 4 runs = 24 rows; all features retained.
+# Features 4 and 5 have no MZMine annotation and receive mz_rt fallback names.
 expect_equal(ncol(output), 11)
-expect_equal(nrow(output), 16)
+expect_equal(nrow(output), 24)
 expect_true("Run" %in% colnames(output))
 expect_true("ProteinName" %in% colnames(output))
 expect_true("PeptideSequence" %in% colnames(output))
@@ -54,11 +54,13 @@ expect_equal(as.character(feature3_proteins), "Lactate")
 feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName])
 expect_equal(as.character(feature6_proteins), "Caffeine")
 
-# Features absent from the annotations file are filtered out (no mz_rt fallback)
-expect_false("4" %in% as.character(output_dt$PeptideSequence))
-expect_false("5" %in% as.character(output_dt$PeptideSequence))
-expect_false(any(as.character(output_dt$ProteinName) %in%
-                 c("489.334_7.89", "555.447_9.1")))
+# Features absent from the MZMine annotations file get mz_rt fallback ProteinNames.
+expect_true("4" %in% as.character(output_dt$PeptideSequence))
+expect_true("5" %in% as.character(output_dt$PeptideSequence))
+feature4_protein = unique(output_dt[PeptideSequence == "4", ProteinName])
+expect_equal(as.character(feature4_protein), "489.334_7.89")
+feature5_protein = unique(output_dt[PeptideSequence == "5", ProteinName])
+expect_equal(as.character(feature5_protein), "555.447_9.1")
 
 # Zero-intensity input cells are converted to NA in output
 # Feature 3 sampleB = 0  ->  NA  (feature 3 is annotated as Lactate)
@@ -111,3 +113,44 @@ expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine")
 expect_equal(nrow(output_filtered), 8)
 expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))),
              c("1", "6"))
+
+# With sirius_annotations supplied ---------------------------------------------
+sirius_path = system.file("tinytest/raw_data/MZMine/structure_identifications.tsv",
+                          package = "MSstatsConvert")
+sirius = data.table::fread(sirius_path)
+
+output_sirius = MZMinetoMSstatsFormat(input, annotation = annot,
+                                      mzmine_annotations = mzmine_ann,
+                                      sirius_annotations = sirius,
+                                      use_log_file = FALSE)
+output_sirius_dt = data.table::as.data.table(output_sirius)
+
+# All 6 features still retained
+expect_equal(nrow(output_sirius), 24)
+
+# Precedence: feature 1 hit by both MZMine (Caffeine) and SIRIUS
+# (DuplicateFromSirius). MZMine wins.
+feature1_proteins = unique(output_sirius_dt[PeptideSequence == "1", ProteinName])
+expect_equal(as.character(feature1_proteins), "Caffeine")
+
+# Tier 2: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid"
+feature4_proteins = unique(output_sirius_dt[PeptideSequence == "4", ProteinName])
+expect_equal(as.character(feature4_proteins), "Caffeic acid")
+
+# Tier 3: feature 5 has only an empty-name SIRIUS row; falls to mz_rt
+feature5_proteins = unique(output_sirius_dt[PeptideSequence == "5", ProteinName])
+expect_equal(as.character(feature5_proteins), "555.447_9.1")
+
+# An irrelevant SIRIUS row (mappingFeatureId=99) must not introduce new features
+expect_false("99" %in% as.character(output_sirius_dt$PeptideSequence))
+expect_false("Ghost" %in% as.character(output_sirius_dt$ProteinName))
+
+# sirius_annotations missing required columns triggers stop() ------------------
+bad_sirius = data.frame(mappingFeatureId = 1, score = 0.9)  # no 'name'
+expect_error(
+    MZMinetoMSstatsFormat(input, annotation = annot,
+                          mzmine_annotations = mzmine_ann,
+                          sirius_annotations = bad_sirius,
+                          use_log_file = FALSE),
+    "missing required column"
+)
diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd
index 8863b4ea..e10b0949 100644
--- a/man/MSstatsClean.Rd
+++ b/man/MSstatsClean.Rd
@@ -82,7 +82,7 @@ MSstatsClean(msstats_object, ...)
 
 \S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object)
 
-\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations)
+\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations, sirius_annotations = NULL)
 }
 \arguments{
 \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.}
@@ -204,11 +204,15 @@ peptides receive \code{IsotopeLabelType = "Light"}.}
 \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
 passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
-per feature is used as \code{ProteinName}, and features in the quant
-table with no matching annotation row are dropped from the output.
-These are MSI Level 2 annotations (putative identification via
-MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat}
-docstring for the full scope discussion.}
+per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative
+identification via MS/MS spectral matching). See the public
+\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.}
+
+\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
+\code{structure_identifications.tsv} output, or \code{NULL}. Only the
+\code{mappingFeatureId} and \code{name} columns are read. When supplied,
+SIRIUS \code{name} fills \code{ProteinName} for features that received no
+MZMine compound (tier 2, MSI Level 3).}
 }
 \value{
 data.table
diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd
index b3fdcfe1..9c187da6 100644
--- a/man/MZMinetoMSstatsFormat.Rd
+++ b/man/MZMinetoMSstatsFormat.Rd
@@ -8,6 +8,7 @@ MZMinetoMSstatsFormat(
   input,
   annotation = NULL,
   mzmine_annotations,
+  sirius_annotations = NULL,
   removeProtein_with1Feature = FALSE,
   summaryforMultipleRows = max,
   use_log_file = TRUE,
@@ -33,20 +34,26 @@ so the corresponding \code{Run} value must be \code{sampleAmzML}.}
 \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required:
 the highest-scoring \code{compound_name} per feature is used as
-\code{ProteinName}, and features in the quant table with no matching
-annotation row are dropped from the output.
+\code{ProteinName} (tier 1, MSI Level 2 putative identification via
+MS/MS spectral matching).}
 
-These are MSI Level 2 annotations (putative identification via
-MS/MS spectral matching against a reference library). Higher-
-confidence Level 1 identifications require pure reference standards
-and are out of scope here. Lower-confidence annotations such as
-Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via
-CANOPUS) are not currently supported -- features without a Level 2
-annotation row are filtered out.}
+\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
+\code{structure_identifications.tsv} output, or \code{NULL}. Only the
+\code{mappingFeatureId} and \code{name} columns are read; score columns
+(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate},
+\code{SiriusScore}) are ignored in this release. When supplied, SIRIUS
+\code{name} fills \code{ProteinName} for features that received no MZMine
+compound (tier 2, MSI Level 3 in-silico structure prediction).
+The schema is validated against SIRIUS 6 output; users on other
+versions can rename columns to match. Pass \code{NULL} to disable.}
 
 \item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.}
 
-\item{summaryforMultipleRows}{max or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. Default is max for label-free converters and sum for TMT converters.}
+\item{summaryforMultipleRows}{max or sum - when multiple PSMs identify
+the same feature within a single MS run (duplicate PSMs), use the
+highest (max) or sum of the duplicate intensities. Default is max for
+label-free converters and sum for TMT converters. Note that this parameter
+does NOT control collapsing across fractions of the same biological mixture.}
 
 \item{use_log_file}{logical. If TRUE, information about data processing
 will be saved to a file.}
@@ -70,6 +77,33 @@ data.table in the MSstats required format.
 \description{
 Import MZMine files
 }
+\details{
+\code{ProteinName} is assigned in three tiers, in priority order:
+\enumerate{
+\item \strong{MZMine compound (mandatory)} -- the highest-scoring
+\code{compound_name} from \code{mzmine_annotations}. Equivalent to MSI
+Level 2 (Sumner et al. 2007, PMID 27624161): putative
+identification by MS/MS spectral matching to a reference library.
+\item \strong{SIRIUS name (optional)} -- when \code{sirius_annotations} is
+non-NULL, the \code{name} from SIRIUS \code{structure_identifications.tsv}
+fills any \code{ProteinName} still NA after tier 1. Equivalent to MSI
+Level 3: in-silico structure prediction. MZMine annotations take
+precedence: SIRIUS only fills features that MZMine missed.
+\item \strong{mz_rt fallback (always)} -- features with no annotation from
+either source are retained, not dropped, and assigned
+\code{paste0(round(mz, 4), "_", round(rt, 2))} as their \code{ProteinName}.
+}
+
+The tier-3 retain-all policy is a deliberate trade-off. A fuller
+feature set gives more stable medians and a more reliable empirical
+distribution for global normalization. SIRIUS extends discovery
+coverage to features that level-2 spectral matching misses. The
+cost is an increase in the number of hypotheses tested downstream
+(in \code{MSstats::groupComparison}), which weakens multiple-testing
+correction. Users running confirmatory analyses should restrict to
+tier-1 features post-conversion; users running discovery analyses
+benefit from the additional tiers despite the FDR burden.
+}
 \examples{
 input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
                          package = "MSstatsConvert")
@@ -84,4 +118,16 @@ output = MZMinetoMSstatsFormat(input, annotation = annot,
                                mzmine_annotations = lib,
                                use_log_file = FALSE)
 head(output)
+
+# With SIRIUS annotations:
+sirius_path = system.file(
+  "tinytest/raw_data/MZMine/structure_identifications.tsv",
+  package = "MSstatsConvert")
+sirius = data.table::fread(sirius_path)
+output_with_sirius = MZMinetoMSstatsFormat(
+  input, annotation = annot,
+  mzmine_annotations = lib,
+  sirius_annotations = sirius,
+  use_log_file = FALSE)
+head(output_with_sirius)
 }
diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd
index 8c93db08..d7c1fc69 100644
--- a/man/dot-cleanRawMZMine.Rd
+++ b/man/dot-cleanRawMZMine.Rd
@@ -4,7 +4,7 @@
 \alias{.cleanRawMZMine}
 \title{Clean raw MZMine files}
 \usage{
-.cleanRawMZMine(msstats_object, mzmine_annotations)
+.cleanRawMZMine(msstats_object, mzmine_annotations, sirius_annotations = NULL)
 }
 \arguments{
 \item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.}
@@ -12,11 +12,15 @@
 \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
 passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
-per feature is used as \code{ProteinName}, and features in the quant
-table with no matching annotation row are dropped from the output.
-These are MSI Level 2 annotations (putative identification via
-MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat}
-docstring for the full scope discussion.}
+per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative
+identification via MS/MS spectral matching). See the public
+\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.}
+
+\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
+\code{structure_identifications.tsv} output, or \code{NULL}. Only the
+\code{mappingFeatureId} and \code{name} columns are read. When supplied,
+SIRIUS \code{name} fills \code{ProteinName} for features that received no
+MZMine compound (tier 2, MSI Level 3).}
 }
 \value{
 data.table
diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd
index 64a4d882..b52f757e 100644
--- a/vignettes/msstats_data_format.Rmd
+++ b/vignettes/msstats_data_format.Rmd
@@ -353,17 +353,47 @@ per feature, one `<sample> Peak area` column per sample) together with a standar
 MSstats annotation and produces an MSstats-ready long-format `data.table`.
 
 An MZMine spectral-library annotation table with `id`, `compound_name`, and
-`score` columns is **required**. The highest-scoring `compound_name` per feature
-is used as `ProteinName`. Features in the quant table with no matching annotation
-row are dropped from the output — there is no synthesized mz_rt fallback,
-because placeholder identifiers inflate the hypothesis count for downstream
-`groupComparison` without biological signal.
-
-These are [MSI Level 2 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/)
-(putative identification via MS/MS spectral matching against a reference library).
-Lower-confidence annotation sources — SIRIUS / MS2Query (Level 3) and CANOPUS
-(Level 4) — are out of scope for this iteration; features without a Level 2
-annotation row are filtered out.
+`score` columns is **required**. All features in the quant table are retained
+in the output: see "Annotation levels and the SIRIUS option" below for how
+`ProteinName` is assigned.
+
+## Annotation levels and the SIRIUS option
+
+`ProteinName` is filled in three tiers, in priority order:
+
+| Tier | Source | MSI Level | Status |
+|------|--------|-----------|--------|
+| 1 | MZMine `compound_name` (highest-scoring) | Level 2 — MS/MS spectral match | Mandatory |
+| 2 | SIRIUS `name` from `structure_identifications.tsv` | Level 3 — in-silico structure prediction | Optional (`sirius_annotations = NULL` by default) |
+| 3 | `paste0(round(mz, 4), "_", round(rt, 2))` | — | Always, for features still unannotated |
+
+MZMine annotations (level 2) take precedence over SIRIUS (level 3).
+The mz_rt fallback retains features rather than dropping them — a
+deliberate trade-off:
+
+- **Pro**: a fuller feature set gives more stable medians and a more
+  reliable empirical distribution for global normalization.
+- **Pro**: SIRIUS extends discovery coverage to features that level-2
+  spectral matching misses.
+- **Con**: retaining all features increases the number of hypotheses
+  tested downstream, which weakens multiple-testing correction. For
+  confirmatory analyses, restrict to tier-1 features after conversion.
+- **Note**: passing `removeProtein_with1Feature = TRUE` drops every
+  mz_rt feature (and any SIRIUS-only singleton) because each mz_rt
+  name is unique to one feature, so it is by construction a
+  single-feature "protein" and gets filtered out — quietly losing the
+  tier-3 retain-all benefit.
+
+The SIRIUS schema we depend on is `mappingFeatureId` (join key against
+MZMine `rowID`) and `name`. Score columns
+(`ConfidenceScoreExact`, `ConfidenceScoreApproximate`, `SiriusScore`)
+are present in the file but not used in this release. The schema is
+validated against SIRIUS 6 output; users on different SIRIUS versions
+can rename their columns to match.
+
+The reference background is
+[MSI Level 2 / Level 3 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/)
+(Sumner et al. 2007, PMID 27624161).
 
 ```{r mzmine}
 mzmine_input = data.table::fread(system.file(
@@ -379,7 +409,7 @@ mzmine_library = data.table::fread(system.file(
   package = "MSstatsConvert"
 ))
 
-# ProteinName comes from the matched compound_name; unannotated features are dropped
+# tier 1 fills from compound_name; tier 3 fills the remaining as mz_rt
 mzmine_converted = MZMinetoMSstatsFormat(
   mzmine_input,
   annotation = mzmine_annotation,
@@ -387,6 +417,20 @@ mzmine_converted = MZMinetoMSstatsFormat(
   use_log_file = FALSE
 )
 head(mzmine_converted)
+
+# Worked example with SIRIUS — fills tier 2 for features the MZMine library missed:
+sirius = data.table::fread(system.file(
+  "tinytest/raw_data/MZMine/structure_identifications.tsv",
+  package = "MSstatsConvert"
+))
+mzmine_with_sirius = MZMinetoMSstatsFormat(
+  mzmine_input,
+  annotation = mzmine_annotation,
+  mzmine_annotations = mzmine_library,
+  sirius_annotations = sirius,
+  use_log_file = FALSE
+)
+unique(mzmine_with_sirius[, c("PeptideSequence", "ProteinName")])
 ```
 
 Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence`

From dfeb79fe42298d8993fc636b96c451c59bdaf9c9 Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Thu, 4 Jun 2026 12:26:29 -0400
Subject: [PATCH 2/4] Resolve nitpicks raised by coderabbit-ai

---
 vignettes/msstats_data_format.Rmd | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd
index b52f757e..44c67989 100644
--- a/vignettes/msstats_data_format.Rmd
+++ b/vignettes/msstats_data_format.Rmd
@@ -378,11 +378,14 @@ deliberate trade-off:
 - **Con**: retaining all features increases the number of hypotheses
   tested downstream, which weakens multiple-testing correction. For
   confirmatory analyses, restrict to tier-1 features after conversion.
-- **Note**: passing `removeProtein_with1Feature = TRUE` drops every
-  mz_rt feature (and any SIRIUS-only singleton) because each mz_rt
-  name is unique to one feature, so it is by construction a
-  single-feature "protein" and gets filtered out — quietly losing the
-  tier-3 retain-all benefit.
+- **Note**: passing `removeProtein_with1Feature = TRUE` drops nearly
+  every mz_rt feature (and any SIRIUS-only singleton), because the
+  rounded `paste0(round(mz, 4), "_", round(rt, 2))` identifier is
+  almost always unique to one feature — so each mz_rt row is treated
+  as a single-feature "protein" and filtered out, quietly losing the
+  tier-3 retain-all benefit. In rare cases two features can share an
+  mz_rt name (when their rounded m/z and RT coincide); those collide
+  into a single multi-feature "protein" and survive the filter.
 
 The SIRIUS schema we depend on is `mappingFeatureId` (join key against
 MZMine `rowID`) and `name`. Score columns

From 66b378cd510b40e77a0fd028d9053ee6577cb726 Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Wed, 10 Jun 2026 00:03:48 -0400
Subject: [PATCH 3/4] Address review feedback on MZMine SIRIUS converter:
 Inherit sirius_annotations docs via @inheritParams, replace tier terminology
 with MSI levels and plain-language source descriptions, and remove the
 removeProtein_with1Feature parameter (hard-coded FALSE internally). Switch
 the SIRIUS fill to in-place data.table updates with a deterministic dedup
 tiebreaker.

---
 R/clean_MZMine.R                              | 64 ++++++++++---------
 R/converters_MZMinetoMSstatsFormat.R          | 58 ++++++++---------
 .../test_converters_MZMinetoMSstatsFormat.R   | 19 +-----
 man/MSstatsClean.Rd                           | 17 +++--
 man/MZMinetoMSstatsFormat.Rd                  | 57 +++++++++--------
 man/dot-cleanRawMZMine.Rd                     | 17 +++--
 vignettes/msstats_data_format.Rmd             | 29 ++++-----
 7 files changed, 125 insertions(+), 136 deletions(-)

diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index ec0f3d7f..1113ae7f 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -9,14 +9,19 @@
 #' @param mzmine_annotations `data.frame` of MZMine spectral-library
 #'   annotations with columns `id`, `compound_name`, `score`. Required;
 #'   passing `NULL` raises an error. The highest-scoring `compound_name`
-#'   per feature is used as `ProteinName` (tier 1, MSI Level 2 putative
-#'   identification via MS/MS spectral matching). See the public
-#'   `MZMinetoMSstatsFormat` docstring for the full tier discussion.
+#'   per feature (MSI Level 2 putative identification via MS/MS spectral
+#'   matching) is used as `ProteinName`. See the public
+#'   `MZMinetoMSstatsFormat` docstring for the full discussion of
+#'   identification sources and precedence.
 #' @param sirius_annotations Optional `data.frame` of SIRIUS
 #'   `structure_identifications.tsv` output, or `NULL`. Only the
-#'   `mappingFeatureId` and `name` columns are read. When supplied,
-#'   SIRIUS `name` fills `ProteinName` for features that received no
-#'   MZMine compound (tier 2, MSI Level 3).
+#'   `mappingFeatureId` and `name` columns are read; score columns
+#'   (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`,
+#'   `SiriusScore`) are ignored in this release. When supplied, the
+#'   SIRIUS `name` (MSI Level 3, in-silico structure prediction) fills
+#'   `ProteinName` for features that received no MZMine compound name.
+#'   The schema is validated against SIRIUS 6 output; users on other
+#'   versions can rename columns to match. Pass `NULL` to disable.
 #' @return data.table
 #' @keywords internal
 .cleanRawMZMine <- function(msstats_object, mzmine_annotations,
@@ -65,50 +70,47 @@
     }
     data.table::setorder(feature_to_compound, id, -score)
     feature_to_compound <- unique(feature_to_compound, by = "id")
-    # Tier 1: MZMine compound name (left-join, no drop).
+    # MZMine compound name fill (left-join, no drop).
     mz_input[
         feature_to_compound,
         ProteinName := i.compound_name,
         on = setNames("id", id_col)
     ]
-    n_tier1 <- sum(!is.na(mz_input$ProteinName))
+    n_mzmine <- sum(!is.na(mz_input$ProteinName))
 
-    # Tier 2: SIRIUS name fills features still NA after tier 1.
-    n_tier2 <- 0L
+    # SIRIUS name fills features still NA after the MZMine compound fill.
+    n_sirius <- 0L
     if (!is.null(sirius_annotations)) {
-        sirius_dt <- data.table::as.data.table(sirius_annotations)
-        sirius_dt <- sirius_dt[, c("mappingFeatureId", "name"), with = FALSE]
+        sirius_dt <- data.table::copy(data.table::as.data.table(sirius_annotations))
+        drop_cols <- setdiff(colnames(sirius_dt), c("mappingFeatureId", "name"))
+        for (col in drop_cols) data.table::set(sirius_dt, j = col, value = NULL)
         sirius_dt[, name := ifelse(is.na(name) | name == "",
                                    NA_character_, as.character(name))]
         sirius_dt[, mappingFeatureId := as.character(mappingFeatureId)]
-        data.table::setorder(sirius_dt, mappingFeatureId)
+        data.table::setorder(sirius_dt, mappingFeatureId, name)
+        # Keep one candidate row per feature so the join is 1:1. NOTE(review):
+        # setorder() sorts NA first, so an empty name can shadow a real one.
         sirius_dt <- unique(sirius_dt, by = "mappingFeatureId")
-        mz_input[, ProteinName := ifelse(
-            is.na(ProteinName),
-            sirius_dt[
-                .(as.character(get(id_col))),
-                on = "mappingFeatureId",
-                name
-            ],
-            ProteinName)]
-        n_tier2 <- sum(!is.na(mz_input$ProteinName)) - n_tier1
+        mz_input[is.na(ProteinName), ProteinName :=
+            sirius_dt[.(as.character(get(id_col))), on = "mappingFeatureId", name]]
+        n_sirius <- sum(!is.na(mz_input$ProteinName)) - n_mzmine
     }
 
-    # Tier 3: mz_rt fallback for features still NA.
+    # m/z-RT fallback for features still NA.
     na_mask <- is.na(mz_input$ProteinName)
-    n_tier3 <- sum(na_mask)
-    if (n_tier3 > 0) {
+    n_fallback <- sum(na_mask)
+    if (n_fallback > 0) {
         mz_input[na_mask, ProteinName := paste0(
             round(get(mz_col), 4), "_", round(get(rt_col), 2))]
     }
 
-    tier_msg <- paste0(
+    assignment_msg <- paste0(
         "** MZMine ProteinName assignment: ",
-        "tier 1 (MZMine compound): ", n_tier1, " feature(s); ",
-        "tier 2 (SIRIUS name): ", n_tier2, " feature(s); ",
-        "tier 3 (mz_rt fallback): ", n_tier3, " feature(s).")
-    getOption("MSstatsLog")("INFO", tier_msg)
-    getOption("MSstatsMsg")("INFO", tier_msg)
+        "MZMine compound: ", n_mzmine, " feature(s); ",
+        "SIRIUS name: ", n_sirius, " feature(s); ",
+        "m/z-RT fallback: ", n_fallback, " feature(s).")
+    getOption("MSstatsLog")("INFO", assignment_msg)
+    getOption("MSstatsMsg")("INFO", assignment_msg)
 
     mz_input[, PeptideSequence := as.character(get(id_col))]
 
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
index bc544285..3140c7f5 100644
--- a/R/converters_MZMinetoMSstatsFormat.R
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -1,6 +1,7 @@
 #' Import MZMine files
 #'
 #' @inheritParams .sharedParametersAmongConverters
+#' @inheritParams .cleanRawMZMine
 #' @param input MZMine feature-quantification table (wide format; one row per
 #'   feature). Must include the metadata columns `row ID`, `row m/z`,
 #'   `row retention time`, and per-sample peak-area columns named
@@ -13,46 +14,42 @@
 #'   so the corresponding `Run` value must be `sampleAmzML`.
 #' @param mzmine_annotations `data.frame` of MZMine spectral-library
 #'   annotations with columns `id`, `compound_name`, `score`. Required:
-#'   the highest-scoring `compound_name` per feature is used as
-#'   `ProteinName` (tier 1, MSI Level 2 putative identification via
-#'   MS/MS spectral matching).
-#' @param sirius_annotations Optional `data.frame` of SIRIUS
-#'   `structure_identifications.tsv` output, or `NULL`. Only the
-#'   `mappingFeatureId` and `name` columns are read; score columns
-#'   (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`,
-#'   `SiriusScore`) are ignored in this release. When supplied, SIRIUS
-#'   `name` fills `ProteinName` for features that received no MZMine
-#'   compound (tier 2, MSI Level 3 in-silico structure prediction).
-#'   The schema is validated against SIRIUS 6 output; users on other
-#'   versions can rename columns to match. Pass `NULL` to disable.
+#'   the highest-scoring `compound_name` per feature (MSI Level 2
+#'   putative identification via MS/MS spectral matching) is used as
+#'   `ProteinName`.
 #'
 #' @details
-#' `ProteinName` is assigned in three tiers, in priority order:
+#' `ProteinName` is assigned from one of three sources, in priority
+#' order: the MZMine compound name (mandatory), the SIRIUS name
+#' (optional), and an m/z-RT fallback (always available).
 #'
-#' 1. **MZMine compound (mandatory)** -- the highest-scoring
-#'    `compound_name` from `mzmine_annotations`. Equivalent to MSI
-#'    Level 2 (Sumner et al. 2007, PMID 27624161): putative
-#'    identification by MS/MS spectral matching to a reference library.
+#' The **MZMine compound name** is the highest-scoring `compound_name`
+#' from `mzmine_annotations` for each feature. This corresponds to MSI
+#' Level 2 (Sumner et al. 2007, PMID 24039616): a putative
+#' identification by MS/MS spectral matching to a reference library.
 #'
-#' 2. **SIRIUS name (optional)** -- when `sirius_annotations` is
-#'    non-NULL, the `name` from SIRIUS `structure_identifications.tsv`
-#'    fills any `ProteinName` still NA after tier 1. Equivalent to MSI
-#'    Level 3: in-silico structure prediction. MZMine annotations take
-#'    precedence: SIRIUS only fills features that MZMine missed.
+#' The **SIRIUS name** comes from SIRIUS's
+#' `structure_identifications.tsv` and corresponds to MSI Level 3: an
+#' in-silico structure prediction. When `sirius_annotations` is
+#' non-NULL, the SIRIUS `name` fills `ProteinName` only for features
+#' the MZMine library missed -- the MZMine compound name takes
+#' precedence.
 #'
-#' 3. **mz_rt fallback (always)** -- features with no annotation from
-#'    either source are retained, not dropped, and assigned
-#'    `paste0(round(mz, 4), "_", round(rt, 2))` as their `ProteinName`.
+#' The **m/z-RT fallback** is an identifier built from the feature's
+#' m/z and retention time (for example, `455.282_0.65`). Features that
+#' receive no MZMine or SIRIUS annotation are retained, not dropped,
+#' and assigned an m/z-RT identifier as their `ProteinName`.
 #'
-#' The tier-3 retain-all policy is a deliberate trade-off. A fuller
-#' feature set gives more stable medians and a more reliable empirical
+#' Retaining every feature is a deliberate trade-off. A fuller feature
+#' set gives more stable medians and a more reliable empirical
 #' distribution for global normalization. SIRIUS extends discovery
 #' coverage to features that level-2 spectral matching misses. The
 #' cost is an increase in the number of hypotheses tested downstream
 #' (in `MSstats::groupComparison`), which weakens multiple-testing
 #' correction. Users running confirmatory analyses should restrict to
-#' tier-1 features post-conversion; users running discovery analyses
-#' benefit from the additional tiers despite the FDR burden.
+#' the MZMine-annotated features post-conversion; users running
+#' discovery analyses benefit from the additional sources despite the
+#' FDR burden.
 #'
 #' @return data.table in the MSstats required format.
 #'
@@ -89,7 +86,6 @@ MZMinetoMSstatsFormat = function(
     annotation = NULL,
     mzmine_annotations,
     sirius_annotations = NULL,
-    removeProtein_with1Feature = FALSE,
     summaryforMultipleRows = max,
     use_log_file = TRUE,
     append = FALSE,
@@ -129,7 +125,7 @@ MZMinetoMSstatsFormat = function(
         annotation,
         feature_columns,
         remove_shared_peptides = FALSE,
-        remove_single_feature_proteins = removeProtein_with1Feature,
+        remove_single_feature_proteins = FALSE,
         exact_filtering = NULL,
         pattern_filtering = NULL,
         aggregate_isotopic = FALSE,
diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
index 6ac1c0fc..2589f1fc 100644
--- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
+++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
@@ -99,21 +99,6 @@ expect_error(
     "mzmine_annotations is required"
 )
 
-# removeProtein_with1Feature filters non-Caffeine proteins -------------------
-# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6);
-# Lactate and Glucose each have 1.
-output_filtered = MZMinetoMSstatsFormat(input, annotation = annot,
-                                        mzmine_annotations = mzmine_ann,
-                                        removeProtein_with1Feature = TRUE,
-                                        use_log_file = FALSE)
-output_filtered_dt = data.table::as.data.table(output_filtered)
-
-expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine")
-# 2 features x 4 runs = 8 rows
-expect_equal(nrow(output_filtered), 8)
-expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))),
-             c("1", "6"))
-
 # With sirius_annotations supplied ---------------------------------------------
 sirius_path = system.file("tinytest/raw_data/MZMine/structure_identifications.tsv",
                           package = "MSstatsConvert")
@@ -133,11 +118,11 @@ expect_equal(nrow(output_sirius), 24)
 feature1_proteins = unique(output_sirius_dt[PeptideSequence == "1", ProteinName])
 expect_equal(as.character(feature1_proteins), "Caffeine")
 
-# Tier 2: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid"
+# SIRIUS fill: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid"
 feature4_proteins = unique(output_sirius_dt[PeptideSequence == "4", ProteinName])
 expect_equal(as.character(feature4_proteins), "Caffeic acid")
 
-# Tier 3: feature 5 has only an empty-name SIRIUS row; falls to mz_rt
+# m/z-RT fallback: feature 5 has only an empty-name SIRIUS row; falls to m/z-RT
 feature5_proteins = unique(output_sirius_dt[PeptideSequence == "5", ProteinName])
 expect_equal(as.character(feature5_proteins), "555.447_9.1")
 
diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd
index e10b0949..6d4030e4 100644
--- a/man/MSstatsClean.Rd
+++ b/man/MSstatsClean.Rd
@@ -204,15 +204,20 @@ peptides receive \code{IsotopeLabelType = "Light"}.}
 \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
 passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
-per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative
-identification via MS/MS spectral matching). See the public
-\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.}
+per feature (MSI Level 2 putative identification via MS/MS spectral
+matching) is used as \code{ProteinName}. See the public
+\code{MZMinetoMSstatsFormat} docstring for the full discussion of
+identification sources and precedence.}
 
 \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
 \code{structure_identifications.tsv} output, or \code{NULL}. Only the
-\code{mappingFeatureId} and \code{name} columns are read. When supplied,
-SIRIUS \code{name} fills \code{ProteinName} for features that received no
-MZMine compound (tier 2, MSI Level 3).}
+\code{mappingFeatureId} and \code{name} columns are read; score columns
+(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate},
+\code{SiriusScore}) are ignored in this release. When supplied, the
+SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills
+\code{ProteinName} for features that received no MZMine compound name.
+The schema is validated against SIRIUS 6 output; users on other
+versions can rename columns to match. Pass \code{NULL} to disable.}
 }
 \value{
 data.table
diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd
index 9c187da6..70be5876 100644
--- a/man/MZMinetoMSstatsFormat.Rd
+++ b/man/MZMinetoMSstatsFormat.Rd
@@ -9,7 +9,6 @@ MZMinetoMSstatsFormat(
   annotation = NULL,
   mzmine_annotations,
   sirius_annotations = NULL,
-  removeProtein_with1Feature = FALSE,
   summaryforMultipleRows = max,
   use_log_file = TRUE,
   append = FALSE,
@@ -33,22 +32,20 @@ so the corresponding \code{Run} value must be \code{sampleAmzML}.}
 
 \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required:
-the highest-scoring \code{compound_name} per feature is used as
-\code{ProteinName} (tier 1, MSI Level 2 putative identification via
-MS/MS spectral matching).}
+the highest-scoring \code{compound_name} per feature (MSI Level 2
+putative identification via MS/MS spectral matching) is used as
+\code{ProteinName}.}
 
 \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
 \code{structure_identifications.tsv} output, or \code{NULL}. Only the
 \code{mappingFeatureId} and \code{name} columns are read; score columns
 (\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate},
-\code{SiriusScore}) are ignored in this release. When supplied, SIRIUS
-\code{name} fills \code{ProteinName} for features that received no MZMine
-compound (tier 2, MSI Level 3 in-silico structure prediction).
+\code{SiriusScore}) are ignored in this release. When supplied, the
+SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills
+\code{ProteinName} for features that received no MZMine compound name.
 The schema is validated against SIRIUS 6 output; users on other
 versions can rename columns to match. Pass \code{NULL} to disable.}
 
-\item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.}
-
 \item{summaryforMultipleRows}{max or sum - when multiple PSMs identify
 the same feature within a single MS run (duplicate PSMs), use the
 highest (max) or sum of the duplicate intensities. Default is max for
@@ -78,31 +75,37 @@ data.table in the MSstats required format.
 Import MZMine files
 }
 \details{
-\code{ProteinName} is assigned in three tiers, in priority order:
-\enumerate{
-\item \strong{MZMine compound (mandatory)} -- the highest-scoring
-\code{compound_name} from \code{mzmine_annotations}. Equivalent to MSI
-Level 2 (Sumner et al. 2007, PMID 27624161): putative
+\code{ProteinName} is assigned from one of three sources, in priority
+order: the MZMine compound name (mandatory), the SIRIUS name
+(optional), and an m/z-RT fallback (always available).
+
+The \strong{MZMine compound name} is the highest-scoring \code{compound_name}
+from \code{mzmine_annotations} for each feature. This corresponds to MSI
+Level 2 (Sumner et al. 2007, PMID 24039616): a putative
 identification by MS/MS spectral matching to a reference library.
-\item \strong{SIRIUS name (optional)} -- when \code{sirius_annotations} is
-non-NULL, the \code{name} from SIRIUS \code{structure_identifications.tsv}
-fills any \code{ProteinName} still NA after tier 1. Equivalent to MSI
-Level 3: in-silico structure prediction. MZMine annotations take
-precedence: SIRIUS only fills features that MZMine missed.
-\item \strong{mz_rt fallback (always)} -- features with no annotation from
-either source are retained, not dropped, and assigned
-\code{paste0(round(mz, 4), "_", round(rt, 2))} as their \code{ProteinName}.
-}
 
-The tier-3 retain-all policy is a deliberate trade-off. A fuller
-feature set gives more stable medians and a more reliable empirical
+The \strong{SIRIUS name} comes from SIRIUS's
+\code{structure_identifications.tsv} and corresponds to MSI Level 3: an
+in-silico structure prediction. When \code{sirius_annotations} is
+non-NULL, the SIRIUS \code{name} fills \code{ProteinName} only for features
+the MZMine library missed -- the MZMine compound name takes
+precedence.
+
+The \strong{m/z-RT fallback} is an identifier built from the feature's
+m/z and retention time (for example, \verb{455.282_0.65}). Features that
+receive no MZMine or SIRIUS annotation are retained, not dropped,
+and assigned an m/z-RT identifier as their \code{ProteinName}.
+
+Retaining every feature is a deliberate trade-off. A fuller feature
+set gives more stable medians and a more reliable empirical
 distribution for global normalization. SIRIUS extends discovery
 coverage to features that level-2 spectral matching misses. The
 cost is an increase in the number of hypotheses tested downstream
 (in \code{MSstats::groupComparison}), which weakens multiple-testing
 correction. Users running confirmatory analyses should restrict to
-tier-1 features post-conversion; users running discovery analyses
-benefit from the additional tiers despite the FDR burden.
+the MZMine-annotated features post-conversion; users running
+discovery analyses benefit from the additional sources despite the
+FDR burden.
 }
 \examples{
 input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv",
diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd
index d7c1fc69..2a2189bf 100644
--- a/man/dot-cleanRawMZMine.Rd
+++ b/man/dot-cleanRawMZMine.Rd
@@ -12,15 +12,20 @@
 \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
 passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
-per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative
-identification via MS/MS spectral matching). See the public
-\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.}
+per feature (MSI Level 2 putative identification via MS/MS spectral
+matching) is used as \code{ProteinName}. See the public
+\code{MZMinetoMSstatsFormat} docstring for the full discussion of
+identification sources and precedence.}
 
 \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
 \code{structure_identifications.tsv} output, or \code{NULL}. Only the
-\code{mappingFeatureId} and \code{name} columns are read. When supplied,
-SIRIUS \code{name} fills \code{ProteinName} for features that received no
-MZMine compound (tier 2, MSI Level 3).}
+\code{mappingFeatureId} and \code{name} columns are read; score columns
+(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate},
+\code{SiriusScore}) are ignored in this release. When supplied, the
+SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills
+\code{ProteinName} for features that received no MZMine compound name.
+The schema is validated against SIRIUS 6 output; users on other
+versions can rename columns to match. Pass \code{NULL} to disable.}
 }
 \value{
 data.table
diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd
index 44c67989..2115f1c5 100644
--- a/vignettes/msstats_data_format.Rmd
+++ b/vignettes/msstats_data_format.Rmd
@@ -359,16 +359,16 @@ in the output: see "Annotation levels and the SIRIUS option" below for how
 
 ## Annotation levels and the SIRIUS option
 
-`ProteinName` is filled in three tiers, in priority order:
+`ProteinName` is filled from one of three sources, in priority order:
 
-| Tier | Source | MSI Level | Status |
-|------|--------|-----------|--------|
-| 1 | MZMine `compound_name` (highest-scoring) | Level 2 — MS/MS spectral match | Mandatory |
-| 2 | SIRIUS `name` from `structure_identifications.tsv` | Level 3 — in-silico structure prediction | Optional (`sirius_annotations = NULL` by default) |
-| 3 | `paste0(round(mz, 4), "_", round(rt, 2))` | — | Always, for features still unannotated |
+| Source | MSI Level | Status |
+|--------|-----------|--------|
+| MZMine `compound_name` (highest-scoring) | Level 2 — MS/MS spectral match | Mandatory |
+| SIRIUS `name` from `structure_identifications.tsv` | Level 3 — in-silico structure prediction | Optional (`sirius_annotations = NULL` by default) |
+| m/z-RT identifier built from the feature's m/z and retention time (e.g. `455.282_0.65`) | — | Always, for features still unannotated |
 
 MZMine annotations (level 2) take precedence over SIRIUS (level 3).
-The mz_rt fallback retains features rather than dropping them — a
+The m/z-RT fallback retains features rather than dropping them — a
 deliberate trade-off:
 
 - **Pro**: a fuller feature set gives more stable medians and a more
@@ -377,15 +377,8 @@ deliberate trade-off:
   spectral matching misses.
 - **Con**: retaining all features increases the number of hypotheses
   tested downstream, which weakens multiple-testing correction. For
-  confirmatory analyses, restrict to tier-1 features after conversion.
-- **Note**: passing `removeProtein_with1Feature = TRUE` drops nearly
-  every mz_rt feature (and any SIRIUS-only singleton), because the
-  rounded `paste0(round(mz, 4), "_", round(rt, 2))` identifier is
-  almost always unique to one feature — so each mz_rt row is treated
-  as a single-feature "protein" and filtered out, quietly losing the
-  tier-3 retain-all benefit. In rare cases two features can share an
-  mz_rt name (when their rounded m/z and RT coincide); those collide
-  into a single multi-feature "protein" and survive the filter.
+  confirmatory analyses, restrict to the MZMine-annotated features
+  after conversion.
 
 The SIRIUS schema we depend on is `mappingFeatureId` (join key against
 MZMine `rowID`) and `name`. Score columns
@@ -412,7 +405,7 @@ mzmine_library = data.table::fread(system.file(
   package = "MSstatsConvert"
 ))
 
-# tier 1 fills from compound_name; tier 3 fills the remaining as mz_rt
+# MZMine compound name fills annotated features; the m/z-RT fallback fills the rest
 mzmine_converted = MZMinetoMSstatsFormat(
   mzmine_input,
   annotation = mzmine_annotation,
@@ -421,7 +414,7 @@ mzmine_converted = MZMinetoMSstatsFormat(
 )
 head(mzmine_converted)
 
-# Worked example with SIRIUS — fills tier 2 for features the MZMine library missed:
+# Worked example with SIRIUS — the SIRIUS name fills ProteinName for features the MZMine library missed:
 sirius = data.table::fread(system.file(
   "tinytest/raw_data/MZMine/structure_identifications.tsv",
   package = "MSstatsConvert"

From 7ddee25c3692dc7ac544fe70500ad82e776f1086 Mon Sep 17 00:00:00 2001
From: Swaraj Patil <patil.swaraj@northeastern.edu>
Date: Wed, 10 Jun 2026 12:27:30 -0400
Subject: [PATCH 4/4] Remove the explicit @param mzmine_annotations block;
 instead inherit from .cleanRawMZMine alongside sirius_annotations

---
 R/clean_MZMine.R                     | 4 +---
 R/converters_MZMinetoMSstatsFormat.R | 5 -----
 man/MSstatsClean.Rd                  | 4 +---
 man/MZMinetoMSstatsFormat.Rd         | 8 ++++----
 man/dot-cleanRawMZMine.Rd            | 4 +---
 5 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R
index 1113ae7f..35f0cf00 100644
--- a/R/clean_MZMine.R
+++ b/R/clean_MZMine.R
@@ -10,9 +10,7 @@
 #'   annotations with columns `id`, `compound_name`, `score`. Required;
 #'   passing `NULL` raises an error. The highest-scoring `compound_name`
 #'   per feature (MSI Level 2 putative identification via MS/MS spectral
-#'   matching) is used as `ProteinName`. See the public
-#'   `MZMinetoMSstatsFormat` docstring for the full discussion of
-#'   identification sources and precedence.
+#'   matching) is used as `ProteinName`.
 #' @param sirius_annotations Optional `data.frame` of SIRIUS
 #'   `structure_identifications.tsv` output, or `NULL`. Only the
 #'   `mappingFeatureId` and `name` columns are read; score columns
diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R
index 3140c7f5..ad872727 100644
--- a/R/converters_MZMinetoMSstatsFormat.R
+++ b/R/converters_MZMinetoMSstatsFormat.R
@@ -12,11 +12,6 @@
 #'   trailing `"Peakarea"` suffix removed. For example, a quant-file column
 #'   `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization,
 #'   so the corresponding `Run` value must be `sampleAmzML`.
-#' @param mzmine_annotations `data.frame` of MZMine spectral-library
-#'   annotations with columns `id`, `compound_name`, `score`. Required:
-#'   the highest-scoring `compound_name` per feature (MSI Level 2
-#'   putative identification via MS/MS spectral matching) is used as
-#'   `ProteinName`.
 #'
 #' @details
 #' `ProteinName` is assigned from one of three sources, in priority
diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd
index 6d4030e4..bfa79d6a 100644
--- a/man/MSstatsClean.Rd
+++ b/man/MSstatsClean.Rd
@@ -205,9 +205,7 @@ peptides receive \code{IsotopeLabelType = "Light"}.}
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
 passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
 per feature (MSI Level 2 putative identification via MS/MS spectral
-matching) is used as \code{ProteinName}. See the public
-\code{MZMinetoMSstatsFormat} docstring for the full discussion of
-identification sources and precedence.}
+matching) is used as \code{ProteinName}.}
 
 \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
 \code{structure_identifications.tsv} output, or \code{NULL}. Only the
diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd
index 70be5876..971d6974 100644
--- a/man/MZMinetoMSstatsFormat.Rd
+++ b/man/MZMinetoMSstatsFormat.Rd
@@ -31,10 +31,10 @@ trailing \code{"Peakarea"} suffix removed. For example, a quant-file column
 so the corresponding \code{Run} value must be \code{sampleAmzML}.}
 
 \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library
-annotations with columns \code{id}, \code{compound_name}, \code{score}. Required:
-the highest-scoring \code{compound_name} per feature (MSI Level 2
-putative identification via MS/MS spectral matching) is used as
-\code{ProteinName}.}
+annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
+passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
+per feature (MSI Level 2 putative identification via MS/MS spectral
+matching) is used as \code{ProteinName}.}
 
 \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
 \code{structure_identifications.tsv} output, or \code{NULL}. Only the
diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd
index 2a2189bf..0c936d52 100644
--- a/man/dot-cleanRawMZMine.Rd
+++ b/man/dot-cleanRawMZMine.Rd
@@ -13,9 +13,7 @@
 annotations with columns \code{id}, \code{compound_name}, \code{score}. Required;
 passing \code{NULL} raises an error. The highest-scoring \code{compound_name}
 per feature (MSI Level 2 putative identification via MS/MS spectral
-matching) is used as \code{ProteinName}. See the public
-\code{MZMinetoMSstatsFormat} docstring for the full discussion of
-identification sources and precedence.}
+matching) is used as \code{ProteinName}.}
 
 \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS
 \code{structure_identifications.tsv} output, or \code{NULL}. Only the