From 0096baacaf55d7e33a28c12010472b28d12e3410 Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Thu, 4 Jun 2026 11:50:32 -0400 Subject: [PATCH 1/4] - Add SIRIUS annotation support and mz_rt fallback to MZMine converter - Retain features lacking MZMine compound names, assign ProteinName via MZMine/SIRIUS/mz_rt tiers, restore required mz/RT metadata, and add validation, tests, docs, and tier-level logging. --- R/clean_MZMine.R | 70 ++++++++++++++---- R/converters_MZMinetoMSstatsFormat.R | 74 ++++++++++++++++--- .../MZMine/structure_identifications.tsv | 5 ++ .../test_converters_MZMinetoMSstatsFormat.R | 59 +++++++++++++-- man/MSstatsClean.Rd | 16 ++-- man/MZMinetoMSstatsFormat.Rd | 66 ++++++++++++++--- man/dot-cleanRawMZMine.Rd | 16 ++-- vignettes/msstats_data_format.Rmd | 68 ++++++++++++++--- 8 files changed, 306 insertions(+), 68 deletions(-) create mode 100644 inst/tinytest/raw_data/MZMine/structure_identifications.tsv diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index cee0cf05..ec0f3d7f 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -9,17 +9,22 @@ #' @param mzmine_annotations `data.frame` of MZMine spectral-library #' annotations with columns `id`, `compound_name`, `score`. Required; #' passing `NULL` raises an error. The highest-scoring `compound_name` -#' per feature is used as `ProteinName`, and features in the quant -#' table with no matching annotation row are dropped from the output. -#' These are MSI Level 2 annotations (putative identification via -#' MS/MS spectral matching). See the public `MZMinetoMSstatsFormat` -#' docstring for the full scope discussion. +#' per feature is used as `ProteinName` (tier 1, MSI Level 2 putative +#' identification via MS/MS spectral matching). See the public +#' `MZMinetoMSstatsFormat` docstring for the full tier discussion. +#' @param sirius_annotations Optional `data.frame` of SIRIUS +#' `structure_identifications.tsv` output, or `NULL`. Only the +#' `mappingFeatureId` and `name` columns are read. 
When supplied, +#' SIRIUS `name` fills `ProteinName` for features that received no +#' MZMine compound (tier 2, MSI Level 3). #' @return data.table #' @keywords internal -.cleanRawMZMine <- function(msstats_object, mzmine_annotations) { +.cleanRawMZMine <- function(msstats_object, mzmine_annotations, + sirius_annotations = NULL) { ProteinName = PeptideSequence = Intensity = Run = NULL PrecursorCharge = FragmentIon = ProductCharge = NULL id = score = compound_name = i.compound_name = NULL + rowmz = rowretentiontime = mappingFeatureId = name = NULL mz_input = getInputFile(msstats_object, "input") mz_input = data.table::as.data.table(mz_input) @@ -32,10 +37,13 @@ "columns named ' Peak area' (e.g. 'sampleA.mzML Peak area').") } id_col <- "rowID" - required_meta <- id_col + mz_col <- "rowmz" + rt_col <- "rowretentiontime" + required_meta <- c(id_col, mz_col, rt_col) missing_meta <- setdiff(required_meta, colnames(mz_input)) if (length(missing_meta) > 0) { - stop("Missing required MZMine metadata column (expected 'row ID'). ", + stop("Missing required MZMine metadata column(s) ", + "(expected 'row ID', 'row m/z', 'row retention time'). ", "After standardization, looked for: ", paste(missing_meta, collapse = ", "), ".") } @@ -57,20 +65,50 @@ } data.table::setorder(feature_to_compound, id, -score) feature_to_compound <- unique(feature_to_compound, by = "id") - # Inner-join filter: drop quant rows with no matching annotation. + # Tier 1: MZMine compound name (left-join, no drop). 
mz_input[ feature_to_compound, ProteinName := i.compound_name, on = setNames("id", id_col) ] - mz_input <- mz_input[!is.na(ProteinName)] + n_tier1 <- sum(!is.na(mz_input$ProteinName)) - retained_ids <- feature_to_compound$id - retained_msg <- paste0("** MZMine: retained ", length(retained_ids), - " feature(s) after annotation join: ", - paste(retained_ids, collapse = ", ")) - getOption("MSstatsLog")("INFO", retained_msg) - getOption("MSstatsMsg")("INFO", retained_msg) + # Tier 2: SIRIUS name fills features still NA after tier 1. + n_tier2 <- 0L + if (!is.null(sirius_annotations)) { + sirius_dt <- data.table::as.data.table(sirius_annotations) + sirius_dt <- sirius_dt[, c("mappingFeatureId", "name"), with = FALSE] + sirius_dt[, name := ifelse(is.na(name) | name == "", + NA_character_, as.character(name))] + sirius_dt[, mappingFeatureId := as.character(mappingFeatureId)] + data.table::setorder(sirius_dt, mappingFeatureId) + sirius_dt <- unique(sirius_dt, by = "mappingFeatureId") + mz_input[, ProteinName := ifelse( + is.na(ProteinName), + sirius_dt[ + .(as.character(get(id_col))), + on = "mappingFeatureId", + name + ], + ProteinName)] + n_tier2 <- sum(!is.na(mz_input$ProteinName)) - n_tier1 + } + + # Tier 3: mz_rt fallback for features still NA. 
+ na_mask <- is.na(mz_input$ProteinName) + n_tier3 <- sum(na_mask) + if (n_tier3 > 0) { + mz_input[na_mask, ProteinName := paste0( + round(get(mz_col), 4), "_", round(get(rt_col), 2))] + } + + tier_msg <- paste0( + "** MZMine ProteinName assignment: ", + "tier 1 (MZMine compound): ", n_tier1, " feature(s); ", + "tier 2 (SIRIUS name): ", n_tier2, " feature(s); ", + "tier 3 (mz_rt fallback): ", n_tier3, " feature(s).") + getOption("MSstatsLog")("INFO", tier_msg) + getOption("MSstatsMsg")("INFO", tier_msg) mz_input[, PeptideSequence := as.character(get(id_col))] diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R index 4e3df158..bc544285 100644 --- a/R/converters_MZMinetoMSstatsFormat.R +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -14,16 +14,45 @@ #' @param mzmine_annotations `data.frame` of MZMine spectral-library #' annotations with columns `id`, `compound_name`, `score`. Required: #' the highest-scoring `compound_name` per feature is used as -#' `ProteinName`, and features in the quant table with no matching -#' annotation row are dropped from the output. +#' `ProteinName` (tier 1, MSI Level 2 putative identification via +#' MS/MS spectral matching). +#' @param sirius_annotations Optional `data.frame` of SIRIUS +#' `structure_identifications.tsv` output, or `NULL`. Only the +#' `mappingFeatureId` and `name` columns are read; score columns +#' (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`, +#' `SiriusScore`) are ignored in this release. When supplied, SIRIUS +#' `name` fills `ProteinName` for features that received no MZMine +#' compound (tier 2, MSI Level 3 in-silico structure prediction). +#' The schema is validated against SIRIUS 6 output; users on other +#' versions can rename columns to match. Pass `NULL` to disable. #' -#' These are MSI Level 2 annotations (putative identification via -#' MS/MS spectral matching against a reference library). 
Higher- -#' confidence Level 1 identifications require pure reference standards -#' and are out of scope here. Lower-confidence annotations such as -#' Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via -#' CANOPUS) are not currently supported -- features without a Level 2 -#' annotation row are filtered out. +#' @details +#' `ProteinName` is assigned in three tiers, in priority order: +#' +#' 1. **MZMine compound (mandatory)** -- the highest-scoring +#' `compound_name` from `mzmine_annotations`. Equivalent to MSI +#' Level 2 (Sumner et al. 2007, PMID 27624161): putative +#' identification by MS/MS spectral matching to a reference library. +#' +#' 2. **SIRIUS name (optional)** -- when `sirius_annotations` is +#' non-NULL, the `name` from SIRIUS `structure_identifications.tsv` +#' fills any `ProteinName` still NA after tier 1. Equivalent to MSI +#' Level 3: in-silico structure prediction. MZMine annotations take +#' precedence: SIRIUS only fills features that MZMine missed. +#' +#' 3. **mz_rt fallback (always)** -- features with no annotation from +#' either source are retained, not dropped, and assigned +#' `paste0(round(mz, 4), "_", round(rt, 2))` as their `ProteinName`. +#' +#' The tier-3 retain-all policy is a deliberate trade-off. A fuller +#' feature set gives more stable medians and a more reliable empirical +#' distribution for global normalization. SIRIUS extends discovery +#' coverage to features that level-2 spectral matching misses. The +#' cost is an increase in the number of hypotheses tested downstream +#' (in `MSstats::groupComparison`), which weakens multiple-testing +#' correction. Users running confirmatory analyses should restrict to +#' tier-1 features post-conversion; users running discovery analyses +#' benefit from the additional tiers despite the FDR burden. #' #' @return data.table in the MSstats required format. 
#' @@ -43,10 +72,23 @@ #' mzmine_annotations = lib, #' use_log_file = FALSE) #' head(output) +#' +#' # With SIRIUS annotations: +#' sirius_path = system.file( +#' "tinytest/raw_data/MZMine/structure_identifications.tsv", +#' package = "MSstatsConvert") +#' sirius = data.table::fread(sirius_path) +#' output_with_sirius = MZMinetoMSstatsFormat( +#' input, annotation = annot, +#' mzmine_annotations = lib, +#' sirius_annotations = sirius, +#' use_log_file = FALSE) +#' head(output_with_sirius) MZMinetoMSstatsFormat = function( input, annotation = NULL, mzmine_annotations, + sirius_annotations = NULL, removeProtein_with1Feature = FALSE, summaryforMultipleRows = max, use_log_file = TRUE, @@ -62,10 +104,22 @@ MZMinetoMSstatsFormat = function( "columns 'id', 'compound_name', 'score'.") } + if (!is.null(sirius_annotations)) { + sirius_cols = colnames(sirius_annotations) + missing_sirius = setdiff(c("mappingFeatureId", "name"), sirius_cols) + if (length(missing_sirius) > 0) { + stop("sirius_annotations is missing required column(s): ", + paste(missing_sirius, collapse = ", "), + ". Required: 'mappingFeatureId' and 'name'.") + } + } + input = MSstatsConvert::MSstatsImport(list(input = input), "MSstats", "MZMine", ...) 
input = MSstatsConvert::MSstatsClean( - input, mzmine_annotations = mzmine_annotations) + input, + mzmine_annotations = mzmine_annotations, + sirius_annotations = sirius_annotations) annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation) feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge") diff --git a/inst/tinytest/raw_data/MZMine/structure_identifications.tsv b/inst/tinytest/raw_data/MZMine/structure_identifications.tsv new file mode 100644 index 00000000..a0e38e69 --- /dev/null +++ b/inst/tinytest/raw_data/MZMine/structure_identifications.tsv @@ -0,0 +1,5 @@ +mappingFeatureId name ConfidenceScoreExact ConfidenceScoreApproximate SiriusScore +1 DuplicateFromSirius 0.30 0.40 5.5 +4 Caffeic acid 0.85 0.88 22.1 +5 0.10 0.12 1.0 +99 Ghost 0.50 0.55 8.0 diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R index dcfccf90..6ac1c0fc 100644 --- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R +++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R @@ -15,10 +15,10 @@ output = MZMinetoMSstatsFormat(input, annotation = annot, use_log_file = FALSE) output_dt = data.table::as.data.table(output) -# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns -# Features 4 and 5 have no annotation row and are dropped by the inner join. +# Basic structure: 6 features x 4 runs = 24 rows; all features retained. +# Features 4 and 5 have no MZMine annotation and receive mz_rt fallback names. 
expect_equal(ncol(output), 11) -expect_equal(nrow(output), 16) +expect_equal(nrow(output), 24) expect_true("Run" %in% colnames(output)) expect_true("ProteinName" %in% colnames(output)) expect_true("PeptideSequence" %in% colnames(output)) @@ -54,11 +54,13 @@ expect_equal(as.character(feature3_proteins), "Lactate") feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName]) expect_equal(as.character(feature6_proteins), "Caffeine") -# Features absent from the annotations file are filtered out (no mz_rt fallback) -expect_false("4" %in% as.character(output_dt$PeptideSequence)) -expect_false("5" %in% as.character(output_dt$PeptideSequence)) -expect_false(any(as.character(output_dt$ProteinName) %in% - c("489.334_7.89", "555.447_9.1"))) +# Features absent from the MZMine annotations file get mz_rt fallback ProteinNames. +expect_true("4" %in% as.character(output_dt$PeptideSequence)) +expect_true("5" %in% as.character(output_dt$PeptideSequence)) +feature4_protein = unique(output_dt[PeptideSequence == "4", ProteinName]) +expect_equal(as.character(feature4_protein), "489.334_7.89") +feature5_protein = unique(output_dt[PeptideSequence == "5", ProteinName]) +expect_equal(as.character(feature5_protein), "555.447_9.1") # Zero-intensity input cells are converted to NA in output # Feature 3 sampleB = 0 -> NA (feature 3 is annotated as Lactate) @@ -111,3 +113,44 @@ expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine") expect_equal(nrow(output_filtered), 8) expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))), c("1", "6")) + +# With sirius_annotations supplied --------------------------------------------- +sirius_path = system.file("tinytest/raw_data/MZMine/structure_identifications.tsv", + package = "MSstatsConvert") +sirius = data.table::fread(sirius_path) + +output_sirius = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + sirius_annotations = sirius, + use_log_file = FALSE) 
+output_sirius_dt = data.table::as.data.table(output_sirius) + +# All 6 features still retained +expect_equal(nrow(output_sirius), 24) + +# Precedence: feature 1 hit by both MZMine (Caffeine) and SIRIUS +# (DuplicateFromSirius). MZMine wins. +feature1_proteins = unique(output_sirius_dt[PeptideSequence == "1", ProteinName]) +expect_equal(as.character(feature1_proteins), "Caffeine") + +# Tier 2: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid" +feature4_proteins = unique(output_sirius_dt[PeptideSequence == "4", ProteinName]) +expect_equal(as.character(feature4_proteins), "Caffeic acid") + +# Tier 3: feature 5 has only an empty-name SIRIUS row; falls to mz_rt +feature5_proteins = unique(output_sirius_dt[PeptideSequence == "5", ProteinName]) +expect_equal(as.character(feature5_proteins), "555.447_9.1") + +# An irrelevant SIRIUS row (mappingFeatureId=99) must not introduce new features +expect_false("99" %in% as.character(output_sirius_dt$PeptideSequence)) +expect_false("Ghost" %in% as.character(output_sirius_dt$ProteinName)) + +# sirius_annotations missing required columns triggers stop() ------------------ +bad_sirius = data.frame(mappingFeatureId = 1, score = 0.9) # no 'name' +expect_error( + MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + sirius_annotations = bad_sirius, + use_log_file = FALSE), + "missing required column" +) diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index 8863b4ea..e10b0949 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -82,7 +82,7 @@ MSstatsClean(msstats_object, ...) 
\S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object) -\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations) +\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations, sirius_annotations = NULL) } \arguments{ \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.} @@ -204,11 +204,15 @@ peptides receive \code{IsotopeLabelType = "Light"}.} \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. The highest-scoring \code{compound_name} -per feature is used as \code{ProteinName}, and features in the quant -table with no matching annotation row are dropped from the output. -These are MSI Level 2 annotations (putative identification via -MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat} -docstring for the full scope discussion.} +per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative +identification via MS/MS spectral matching). See the public +\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.} + +\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS +\code{structure_identifications.tsv} output, or \code{NULL}. Only the +\code{mappingFeatureId} and \code{name} columns are read. 
When supplied, +SIRIUS \code{name} fills \code{ProteinName} for features that received no +MZMine compound (tier 2, MSI Level 3).} } \value{ data.table diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd index b3fdcfe1..9c187da6 100644 --- a/man/MZMinetoMSstatsFormat.Rd +++ b/man/MZMinetoMSstatsFormat.Rd @@ -8,6 +8,7 @@ MZMinetoMSstatsFormat( input, annotation = NULL, mzmine_annotations, + sirius_annotations = NULL, removeProtein_with1Feature = FALSE, summaryforMultipleRows = max, use_log_file = TRUE, @@ -33,20 +34,26 @@ so the corresponding \code{Run} value must be \code{sampleAmzML}.} \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. Required: the highest-scoring \code{compound_name} per feature is used as -\code{ProteinName}, and features in the quant table with no matching -annotation row are dropped from the output. +\code{ProteinName} (tier 1, MSI Level 2 putative identification via +MS/MS spectral matching).} -These are MSI Level 2 annotations (putative identification via -MS/MS spectral matching against a reference library). Higher- -confidence Level 1 identifications require pure reference standards -and are out of scope here. Lower-confidence annotations such as -Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via -CANOPUS) are not currently supported -- features without a Level 2 -annotation row are filtered out.} +\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS +\code{structure_identifications.tsv} output, or \code{NULL}. Only the +\code{mappingFeatureId} and \code{name} columns are read; score columns +(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate}, +\code{SiriusScore}) are ignored in this release. When supplied, SIRIUS +\code{name} fills \code{ProteinName} for features that received no MZMine +compound (tier 2, MSI Level 3 in-silico structure prediction). 
+The schema is validated against SIRIUS 6 output; users on other +versions can rename columns to match. Pass \code{NULL} to disable.} \item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.} -\item{summaryforMultipleRows}{max or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. Default is max for label-free converters and sum for TMT converters.} +\item{summaryforMultipleRows}{max or sum - when multiple PSMs identify +the same feature within a single MS run (duplicate PSMs), use the +highest (max) or sum of the duplicate intensities. Default is max for +label-free converters and sum for TMT converters. Note that this parameter +does NOT control collapsing across fractions of the same biological mixture.} \item{use_log_file}{logical. If TRUE, information about data processing will be saved to a file.} @@ -70,6 +77,33 @@ data.table in the MSstats required format. \description{ Import MZMine files } +\details{ +\code{ProteinName} is assigned in three tiers, in priority order: +\enumerate{ +\item \strong{MZMine compound (mandatory)} -- the highest-scoring +\code{compound_name} from \code{mzmine_annotations}. Equivalent to MSI +Level 2 (Sumner et al. 2007, PMID 27624161): putative +identification by MS/MS spectral matching to a reference library. +\item \strong{SIRIUS name (optional)} -- when \code{sirius_annotations} is +non-NULL, the \code{name} from SIRIUS \code{structure_identifications.tsv} +fills any \code{ProteinName} still NA after tier 1. Equivalent to MSI +Level 3: in-silico structure prediction. MZMine annotations take +precedence: SIRIUS only fills features that MZMine missed. 
+\item \strong{mz_rt fallback (always)} -- features with no annotation from +either source are retained, not dropped, and assigned +\code{paste0(round(mz, 4), "_", round(rt, 2))} as their \code{ProteinName}. +} + +The tier-3 retain-all policy is a deliberate trade-off. A fuller +feature set gives more stable medians and a more reliable empirical +distribution for global normalization. SIRIUS extends discovery +coverage to features that level-2 spectral matching misses. The +cost is an increase in the number of hypotheses tested downstream +(in \code{MSstats::groupComparison}), which weakens multiple-testing +correction. Users running confirmatory analyses should restrict to +tier-1 features post-conversion; users running discovery analyses +benefit from the additional tiers despite the FDR burden. +} \examples{ input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv", package = "MSstatsConvert") @@ -84,4 +118,16 @@ output = MZMinetoMSstatsFormat(input, annotation = annot, mzmine_annotations = lib, use_log_file = FALSE) head(output) + +# With SIRIUS annotations: +sirius_path = system.file( + "tinytest/raw_data/MZMine/structure_identifications.tsv", + package = "MSstatsConvert") +sirius = data.table::fread(sirius_path) +output_with_sirius = MZMinetoMSstatsFormat( + input, annotation = annot, + mzmine_annotations = lib, + sirius_annotations = sirius, + use_log_file = FALSE) +head(output_with_sirius) } diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd index 8c93db08..d7c1fc69 100644 --- a/man/dot-cleanRawMZMine.Rd +++ b/man/dot-cleanRawMZMine.Rd @@ -4,7 +4,7 @@ \alias{.cleanRawMZMine} \title{Clean raw MZMine files} \usage{ -.cleanRawMZMine(msstats_object, mzmine_annotations) +.cleanRawMZMine(msstats_object, mzmine_annotations, sirius_annotations = NULL) } \arguments{ \item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.} @@ -12,11 +12,15 @@ \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with 
columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. The highest-scoring \code{compound_name} -per feature is used as \code{ProteinName}, and features in the quant -table with no matching annotation row are dropped from the output. -These are MSI Level 2 annotations (putative identification via -MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat} -docstring for the full scope discussion.} +per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative +identification via MS/MS spectral matching). See the public +\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.} + +\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS +\code{structure_identifications.tsv} output, or \code{NULL}. Only the +\code{mappingFeatureId} and \code{name} columns are read. When supplied, +SIRIUS \code{name} fills \code{ProteinName} for features that received no +MZMine compound (tier 2, MSI Level 3).} } \value{ data.table diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd index 64a4d882..b52f757e 100644 --- a/vignettes/msstats_data_format.Rmd +++ b/vignettes/msstats_data_format.Rmd @@ -353,17 +353,47 @@ per feature, one `<sample> Peak area` column per sample) together with a standar MSstats annotation and produces an MSstats-ready long-format `data.table`. An MZMine spectral-library annotation table with `id`, `compound_name`, and -`score` columns is **required**. The highest-scoring `compound_name` per feature -is used as `ProteinName`. Features in the quant table with no matching annotation -row are dropped from the output — there is no synthesized mz_rt fallback, -because placeholder identifiers inflate the hypothesis count for downstream -`groupComparison` without biological signal. - -These are [MSI Level 2 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/) -(putative identification via MS/MS spectral matching against a reference library). 
-Lower-confidence annotation sources — SIRIUS / MS2Query (Level 3) and CANOPUS -(Level 4) — are out of scope for this iteration; features without a Level 2 -annotation row are filtered out. +`score` columns is **required**. All features in the quant table are retained +in the output: see "Annotation levels and the SIRIUS option" below for how +`ProteinName` is assigned. + +## Annotation levels and the SIRIUS option + +`ProteinName` is filled in three tiers, in priority order: + +| Tier | Source | MSI Level | Status | +|------|--------|-----------|--------| +| 1 | MZMine `compound_name` (highest-scoring) | Level 2 — MS/MS spectral match | Mandatory | +| 2 | SIRIUS `name` from `structure_identifications.tsv` | Level 3 — in-silico structure prediction | Optional (`sirius_annotations = NULL` by default) | +| 3 | `paste0(round(mz, 4), "_", round(rt, 2))` | — | Always, for features still unannotated | + +MZMine annotations (level 2) take precedence over SIRIUS (level 3). +The mz_rt fallback retains features rather than dropping them — a +deliberate trade-off: + +- **Pro**: a fuller feature set gives more stable medians and a more + reliable empirical distribution for global normalization. +- **Pro**: SIRIUS extends discovery coverage to features that level-2 + spectral matching misses. +- **Con**: retaining all features increases the number of hypotheses + tested downstream, which weakens multiple-testing correction. For + confirmatory analyses, restrict to tier-1 features after conversion. +- **Note**: passing `removeProtein_with1Feature = TRUE` drops every + mz_rt feature (and any SIRIUS-only singleton) because each mz_rt + name is unique to one feature, so it is by construction a + single-feature "protein" and gets filtered out — quietly losing the + tier-3 retain-all benefit. + +The SIRIUS schema we depend on is `mappingFeatureId` (join key against +MZMine `rowID`) and `name`. 
Score columns +(`ConfidenceScoreExact`, `ConfidenceScoreApproximate`, `SiriusScore`) +are present in the file but not used in this release. The schema is +validated against SIRIUS 6 output; users on different SIRIUS versions +can rename their columns to match. + +The reference background is +[MSI Level 2 / Level 3 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/) +(Sumner et al. 2007, PMID 27624161). ```{r mzmine} mzmine_input = data.table::fread(system.file( @@ -379,7 +409,7 @@ mzmine_library = data.table::fread(system.file( package = "MSstatsConvert" )) -# ProteinName comes from the matched compound_name; unannotated features are dropped +# tier 1 fills from compound_name; tier 3 fills the remaining as mz_rt mzmine_converted = MZMinetoMSstatsFormat( mzmine_input, annotation = mzmine_annotation, @@ -387,6 +417,20 @@ mzmine_converted = MZMinetoMSstatsFormat( use_log_file = FALSE ) head(mzmine_converted) + +# Worked example with SIRIUS — fills tier 2 for features the MZMine library missed: +sirius = data.table::fread(system.file( + "tinytest/raw_data/MZMine/structure_identifications.tsv", + package = "MSstatsConvert" +)) +mzmine_with_sirius = MZMinetoMSstatsFormat( + mzmine_input, + annotation = mzmine_annotation, + mzmine_annotations = mzmine_library, + sirius_annotations = sirius, + use_log_file = FALSE +) +unique(mzmine_with_sirius[, c("PeptideSequence", "ProteinName")]) ``` Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence` From dfeb79fe42298d8993fc636b96c451c59bdaf9c9 Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Thu, 4 Jun 2026 12:26:29 -0400 Subject: [PATCH 2/4] Resolve nitpicks raised by coderabbit-ai --- vignettes/msstats_data_format.Rmd | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd index b52f757e..44c67989 100644 --- a/vignettes/msstats_data_format.Rmd +++ b/vignettes/msstats_data_format.Rmd @@ -378,11 
+378,14 @@ deliberate trade-off: - **Con**: retaining all features increases the number of hypotheses tested downstream, which weakens multiple-testing correction. For confirmatory analyses, restrict to tier-1 features after conversion. -- **Note**: passing `removeProtein_with1Feature = TRUE` drops every - mz_rt feature (and any SIRIUS-only singleton) because each mz_rt - name is unique to one feature, so it is by construction a - single-feature "protein" and gets filtered out — quietly losing the - tier-3 retain-all benefit. +- **Note**: passing `removeProtein_with1Feature = TRUE` drops nearly + every mz_rt feature (and any SIRIUS-only singleton), because the + rounded `paste0(round(mz, 4), "_", round(rt, 2))` identifier is + almost always unique to one feature — so each mz_rt row is treated + as a single-feature "protein" and filtered out, quietly losing the + tier-3 retain-all benefit. In rare cases two features can share an + mz_rt name (when their rounded m/z and RT coincide); those collide + into a single multi-feature "protein" and survive the filter. The SIRIUS schema we depend on is `mappingFeatureId` (join key against MZMine `rowID`) and `name`. Score columns From 66b378cd510b40e77a0fd028d9053ee6577cb726 Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Wed, 10 Jun 2026 00:03:48 -0400 Subject: [PATCH 3/4] Address review feedback on MZMine SIRIUS converter: Inherit sirius_annotations docs via @inheritParams, replace tier terminology with MSI levels and plain-language source descriptions, and remove the removeProtein_with1Feature parameter (hard-coded FALSE internally). Switch the SIRIUS fill to in-place data.table updates with a deterministic dedup tiebreaker. 
--- R/clean_MZMine.R | 64 ++++++++++--------- R/converters_MZMinetoMSstatsFormat.R | 58 ++++++++--------- .../test_converters_MZMinetoMSstatsFormat.R | 19 +----- man/MSstatsClean.Rd | 17 +++-- man/MZMinetoMSstatsFormat.Rd | 57 +++++++++-------- man/dot-cleanRawMZMine.Rd | 17 +++-- vignettes/msstats_data_format.Rmd | 29 ++++----- 7 files changed, 125 insertions(+), 136 deletions(-) diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index ec0f3d7f..1113ae7f 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -9,14 +9,19 @@ #' @param mzmine_annotations `data.frame` of MZMine spectral-library #' annotations with columns `id`, `compound_name`, `score`. Required; #' passing `NULL` raises an error. The highest-scoring `compound_name` -#' per feature is used as `ProteinName` (tier 1, MSI Level 2 putative -#' identification via MS/MS spectral matching). See the public -#' `MZMinetoMSstatsFormat` docstring for the full tier discussion. +#' per feature (MSI Level 2 putative identification via MS/MS spectral +#' matching) is used as `ProteinName`. See the public +#' `MZMinetoMSstatsFormat` docstring for the full discussion of +#' identification sources and precedence. #' @param sirius_annotations Optional `data.frame` of SIRIUS #' `structure_identifications.tsv` output, or `NULL`. Only the -#' `mappingFeatureId` and `name` columns are read. When supplied, -#' SIRIUS `name` fills `ProteinName` for features that received no -#' MZMine compound (tier 2, MSI Level 3). +#' `mappingFeatureId` and `name` columns are read; score columns +#' (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`, +#' `SiriusScore`) are ignored in this release. When supplied, the +#' SIRIUS `name` (MSI Level 3, in-silico structure prediction) fills +#' `ProteinName` for features that received no MZMine compound name. +#' The schema is validated against SIRIUS 6 output; users on other +#' versions can rename columns to match. Pass `NULL` to disable. 
#' @return data.table #' @keywords internal .cleanRawMZMine <- function(msstats_object, mzmine_annotations, @@ -65,50 +70,47 @@ } data.table::setorder(feature_to_compound, id, -score) feature_to_compound <- unique(feature_to_compound, by = "id") - # Tier 1: MZMine compound name (left-join, no drop). + # MZMine compound name fill (left-join, no drop). mz_input[ feature_to_compound, ProteinName := i.compound_name, on = setNames("id", id_col) ] - n_tier1 <- sum(!is.na(mz_input$ProteinName)) + n_mzmine <- sum(!is.na(mz_input$ProteinName)) - # Tier 2: SIRIUS name fills features still NA after tier 1. - n_tier2 <- 0L + # SIRIUS name fills features still NA after the MZMine compound fill. + n_sirius <- 0L if (!is.null(sirius_annotations)) { - sirius_dt <- data.table::as.data.table(sirius_annotations) - sirius_dt <- sirius_dt[, c("mappingFeatureId", "name"), with = FALSE] + sirius_dt <- data.table::copy(data.table::as.data.table(sirius_annotations)) + drop_cols <- setdiff(colnames(sirius_dt), c("mappingFeatureId", "name")) + for (col in drop_cols) data.table::set(sirius_dt, j = col, value = NULL) sirius_dt[, name := ifelse(is.na(name) | name == "", NA_character_, as.character(name))] sirius_dt[, mappingFeatureId := as.character(mappingFeatureId)] - data.table::setorder(sirius_dt, mappingFeatureId) + data.table::setorder(sirius_dt, mappingFeatureId, name) + # unique() keeps the dedup 1:1 for the join and handles + # multiple structure candidates per feature. sirius_dt <- unique(sirius_dt, by = "mappingFeatureId") - mz_input[, ProteinName := ifelse( - is.na(ProteinName), - sirius_dt[ - .(as.character(get(id_col))), - on = "mappingFeatureId", - name - ], - ProteinName)] - n_tier2 <- sum(!is.na(mz_input$ProteinName)) - n_tier1 + mz_input[is.na(ProteinName), ProteinName := + sirius_dt[.(as.character(get(id_col))), on = "mappingFeatureId", name]] + n_sirius <- sum(!is.na(mz_input$ProteinName)) - n_mzmine } - # Tier 3: mz_rt fallback for features still NA. 
+ # m/z-RT fallback for features still NA. na_mask <- is.na(mz_input$ProteinName) - n_tier3 <- sum(na_mask) - if (n_tier3 > 0) { + n_fallback <- sum(na_mask) + if (n_fallback > 0) { mz_input[na_mask, ProteinName := paste0( round(get(mz_col), 4), "_", round(get(rt_col), 2))] } - tier_msg <- paste0( + assignment_msg <- paste0( "** MZMine ProteinName assignment: ", - "tier 1 (MZMine compound): ", n_tier1, " feature(s); ", - "tier 2 (SIRIUS name): ", n_tier2, " feature(s); ", - "tier 3 (mz_rt fallback): ", n_tier3, " feature(s).") - getOption("MSstatsLog")("INFO", tier_msg) - getOption("MSstatsMsg")("INFO", tier_msg) + "MZMine compound: ", n_mzmine, " feature(s); ", + "SIRIUS name: ", n_sirius, " feature(s); ", + "m/z-RT fallback: ", n_fallback, " feature(s).") + getOption("MSstatsLog")("INFO", assignment_msg) + getOption("MSstatsMsg")("INFO", assignment_msg) mz_input[, PeptideSequence := as.character(get(id_col))] diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R index bc544285..3140c7f5 100644 --- a/R/converters_MZMinetoMSstatsFormat.R +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -1,6 +1,7 @@ #' Import MZMine files #' #' @inheritParams .sharedParametersAmongConverters +#' @inheritParams .cleanRawMZMine #' @param input MZMine feature-quantification table (wide format; one row per #' feature). Must include the metadata columns `row ID`, `row m/z`, #' `row retention time`, and per-sample peak-area columns named @@ -13,46 +14,42 @@ #' so the corresponding `Run` value must be `sampleAmzML`. #' @param mzmine_annotations `data.frame` of MZMine spectral-library #' annotations with columns `id`, `compound_name`, `score`. Required: -#' the highest-scoring `compound_name` per feature is used as -#' `ProteinName` (tier 1, MSI Level 2 putative identification via -#' MS/MS spectral matching). -#' @param sirius_annotations Optional `data.frame` of SIRIUS -#' `structure_identifications.tsv` output, or `NULL`. 
Only the -#' `mappingFeatureId` and `name` columns are read; score columns -#' (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`, -#' `SiriusScore`) are ignored in this release. When supplied, SIRIUS -#' `name` fills `ProteinName` for features that received no MZMine -#' compound (tier 2, MSI Level 3 in-silico structure prediction). -#' The schema is validated against SIRIUS 6 output; users on other -#' versions can rename columns to match. Pass `NULL` to disable. +#' the highest-scoring `compound_name` per feature (MSI Level 2 +#' putative identification via MS/MS spectral matching) is used as +#' `ProteinName`. #' #' @details -#' `ProteinName` is assigned in three tiers, in priority order: +#' `ProteinName` is assigned from one of three sources, in priority +#' order: the MZMine compound name (mandatory), the SIRIUS name +#' (optional), and an m/z-RT fallback (always available). #' -#' 1. **MZMine compound (mandatory)** -- the highest-scoring -#' `compound_name` from `mzmine_annotations`. Equivalent to MSI -#' Level 2 (Sumner et al. 2007, PMID 27624161): putative -#' identification by MS/MS spectral matching to a reference library. +#' The **MZMine compound name** is the highest-scoring `compound_name` +#' from `mzmine_annotations` for each feature. This corresponds to MSI +#' Level 2 (Sumner et al. 2007, PMID 27624161): a putative +#' identification by MS/MS spectral matching to a reference library. #' -#' 2. **SIRIUS name (optional)** -- when `sirius_annotations` is -#' non-NULL, the `name` from SIRIUS `structure_identifications.tsv` -#' fills any `ProteinName` still NA after tier 1. Equivalent to MSI -#' Level 3: in-silico structure prediction. MZMine annotations take -#' precedence: SIRIUS only fills features that MZMine missed. +#' The **SIRIUS name** comes from SIRIUS's +#' `structure_identifications.tsv` and corresponds to MSI Level 3: an +#' in-silico structure prediction. 
When `sirius_annotations` is +#' non-NULL, the SIRIUS `name` fills `ProteinName` only for features +#' the MZMine library missed -- the MZMine compound name takes +#' precedence. #' -#' 3. **mz_rt fallback (always)** -- features with no annotation from -#' either source are retained, not dropped, and assigned -#' `paste0(round(mz, 4), "_", round(rt, 2))` as their `ProteinName`. +#' The **m/z-RT fallback** is an identifier built from the feature's +#' m/z and retention time (for example, `455.282_0.65`). Features that +#' receive no MZMine or SIRIUS annotation are retained, not dropped, +#' and assigned an m/z-RT identifier as their `ProteinName`. #' -#' The tier-3 retain-all policy is a deliberate trade-off. A fuller -#' feature set gives more stable medians and a more reliable empirical +#' Retaining every feature is a deliberate trade-off. A fuller feature +#' set gives more stable medians and a more reliable empirical #' distribution for global normalization. SIRIUS extends discovery #' coverage to features that level-2 spectral matching misses. The #' cost is an increase in the number of hypotheses tested downstream #' (in `MSstats::groupComparison`), which weakens multiple-testing #' correction. Users running confirmatory analyses should restrict to -#' tier-1 features post-conversion; users running discovery analyses -#' benefit from the additional tiers despite the FDR burden. +#' the MZMine-annotated features post-conversion; users running +#' discovery analyses benefit from the additional sources despite the +#' FDR burden. #' #' @return data.table in the MSstats required format. 
#' @@ -89,7 +86,6 @@ MZMinetoMSstatsFormat = function( annotation = NULL, mzmine_annotations, sirius_annotations = NULL, - removeProtein_with1Feature = FALSE, summaryforMultipleRows = max, use_log_file = TRUE, append = FALSE, @@ -129,7 +125,7 @@ MZMinetoMSstatsFormat = function( annotation, feature_columns, remove_shared_peptides = FALSE, - remove_single_feature_proteins = removeProtein_with1Feature, + remove_single_feature_proteins = FALSE, exact_filtering = NULL, pattern_filtering = NULL, aggregate_isotopic = FALSE, diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R index 6ac1c0fc..2589f1fc 100644 --- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R +++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R @@ -99,21 +99,6 @@ expect_error( "mzmine_annotations is required" ) -# removeProtein_with1Feature filters non-Caffeine proteins ------------------- -# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6); -# Lactate and Glucose each have 1. 
-output_filtered = MZMinetoMSstatsFormat(input, annotation = annot, - mzmine_annotations = mzmine_ann, - removeProtein_with1Feature = TRUE, - use_log_file = FALSE) -output_filtered_dt = data.table::as.data.table(output_filtered) - -expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine") -# 2 features x 4 runs = 8 rows -expect_equal(nrow(output_filtered), 8) -expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))), - c("1", "6")) - # With sirius_annotations supplied --------------------------------------------- sirius_path = system.file("tinytest/raw_data/MZMine/structure_identifications.tsv", package = "MSstatsConvert") @@ -133,11 +118,11 @@ expect_equal(nrow(output_sirius), 24) feature1_proteins = unique(output_sirius_dt[PeptideSequence == "1", ProteinName]) expect_equal(as.character(feature1_proteins), "Caffeine") -# Tier 2: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid" +# SIRIUS fill: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid" feature4_proteins = unique(output_sirius_dt[PeptideSequence == "4", ProteinName]) expect_equal(as.character(feature4_proteins), "Caffeic acid") -# Tier 3: feature 5 has only an empty-name SIRIUS row; falls to mz_rt +# m/z-RT fallback: feature 5 has only an empty-name SIRIUS row; falls to m/z-RT feature5_proteins = unique(output_sirius_dt[PeptideSequence == "5", ProteinName]) expect_equal(as.character(feature5_proteins), "555.447_9.1") diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index e10b0949..6d4030e4 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -204,15 +204,20 @@ peptides receive \code{IsotopeLabelType = "Light"}.} \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. 
The highest-scoring \code{compound_name} -per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative -identification via MS/MS spectral matching). See the public -\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.} +per feature (MSI Level 2 putative identification via MS/MS spectral +matching) is used as \code{ProteinName}. See the public +\code{MZMinetoMSstatsFormat} docstring for the full discussion of +identification sources and precedence.} \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS \code{structure_identifications.tsv} output, or \code{NULL}. Only the -\code{mappingFeatureId} and \code{name} columns are read. When supplied, -SIRIUS \code{name} fills \code{ProteinName} for features that received no -MZMine compound (tier 2, MSI Level 3).} +\code{mappingFeatureId} and \code{name} columns are read; score columns +(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate}, +\code{SiriusScore}) are ignored in this release. When supplied, the +SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills +\code{ProteinName} for features that received no MZMine compound name. +The schema is validated against SIRIUS 6 output; users on other +versions can rename columns to match. Pass \code{NULL} to disable.} } \value{ data.table diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd index 9c187da6..70be5876 100644 --- a/man/MZMinetoMSstatsFormat.Rd +++ b/man/MZMinetoMSstatsFormat.Rd @@ -9,7 +9,6 @@ MZMinetoMSstatsFormat( annotation = NULL, mzmine_annotations, sirius_annotations = NULL, - removeProtein_with1Feature = FALSE, summaryforMultipleRows = max, use_log_file = TRUE, append = FALSE, @@ -33,22 +32,20 @@ so the corresponding \code{Run} value must be \code{sampleAmzML}.} \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. 
Required: -the highest-scoring \code{compound_name} per feature is used as -\code{ProteinName} (tier 1, MSI Level 2 putative identification via -MS/MS spectral matching).} +the highest-scoring \code{compound_name} per feature (MSI Level 2 +putative identification via MS/MS spectral matching) is used as +\code{ProteinName}.} \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS \code{structure_identifications.tsv} output, or \code{NULL}. Only the \code{mappingFeatureId} and \code{name} columns are read; score columns (\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate}, -\code{SiriusScore}) are ignored in this release. When supplied, SIRIUS -\code{name} fills \code{ProteinName} for features that received no MZMine -compound (tier 2, MSI Level 3 in-silico structure prediction). +\code{SiriusScore}) are ignored in this release. When supplied, the +SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills +\code{ProteinName} for features that received no MZMine compound name. The schema is validated against SIRIUS 6 output; users on other versions can rename columns to match. Pass \code{NULL} to disable.} -\item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.} - \item{summaryforMultipleRows}{max or sum - when multiple PSMs identify the same feature within a single MS run (duplicate PSMs), use the highest (max) or sum of the duplicate intensities. Default is max for @@ -78,31 +75,37 @@ data.table in the MSstats required format. Import MZMine files } \details{ -\code{ProteinName} is assigned in three tiers, in priority order: -\enumerate{ -\item \strong{MZMine compound (mandatory)} -- the highest-scoring -\code{compound_name} from \code{mzmine_annotations}. Equivalent to MSI -Level 2 (Sumner et al. 
2007, PMID 27624161): putative +\code{ProteinName} is assigned from one of three sources, in priority +order: the MZMine compound name (mandatory), the SIRIUS name +(optional), and an m/z-RT fallback (always available). + +The \strong{MZMine compound name} is the highest-scoring \code{compound_name} +from \code{mzmine_annotations} for each feature. This corresponds to MSI +Level 2 (Sumner et al. 2007, PMID 27624161): a putative identification by MS/MS spectral matching to a reference library. -\item \strong{SIRIUS name (optional)} -- when \code{sirius_annotations} is -non-NULL, the \code{name} from SIRIUS \code{structure_identifications.tsv} -fills any \code{ProteinName} still NA after tier 1. Equivalent to MSI -Level 3: in-silico structure prediction. MZMine annotations take -precedence: SIRIUS only fills features that MZMine missed. -\item \strong{mz_rt fallback (always)} -- features with no annotation from -either source are retained, not dropped, and assigned -\code{paste0(round(mz, 4), "_", round(rt, 2))} as their \code{ProteinName}. -} -The tier-3 retain-all policy is a deliberate trade-off. A fuller -feature set gives more stable medians and a more reliable empirical +The \strong{SIRIUS name} comes from SIRIUS's +\code{structure_identifications.tsv} and corresponds to MSI Level 3: an +in-silico structure prediction. When \code{sirius_annotations} is +non-NULL, the SIRIUS \code{name} fills \code{ProteinName} only for features +the MZMine library missed -- the MZMine compound name takes +precedence. + +The \strong{m/z-RT fallback} is an identifier built from the feature's +m/z and retention time (for example, \verb{455.282_0.65}). Features that +receive no MZMine or SIRIUS annotation are retained, not dropped, +and assigned an m/z-RT identifier as their \code{ProteinName}. + +Retaining every feature is a deliberate trade-off. A fuller feature +set gives more stable medians and a more reliable empirical distribution for global normalization. 
SIRIUS extends discovery coverage to features that level-2 spectral matching misses. The cost is an increase in the number of hypotheses tested downstream (in \code{MSstats::groupComparison}), which weakens multiple-testing correction. Users running confirmatory analyses should restrict to -tier-1 features post-conversion; users running discovery analyses -benefit from the additional tiers despite the FDR burden. +the MZMine-annotated features post-conversion; users running +discovery analyses benefit from the additional sources despite the +FDR burden. } \examples{ input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv", diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd index d7c1fc69..2a2189bf 100644 --- a/man/dot-cleanRawMZMine.Rd +++ b/man/dot-cleanRawMZMine.Rd @@ -12,15 +12,20 @@ \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. The highest-scoring \code{compound_name} -per feature is used as \code{ProteinName} (tier 1, MSI Level 2 putative -identification via MS/MS spectral matching). See the public -\code{MZMinetoMSstatsFormat} docstring for the full tier discussion.} +per feature (MSI Level 2 putative identification via MS/MS spectral +matching) is used as \code{ProteinName}. See the public +\code{MZMinetoMSstatsFormat} docstring for the full discussion of +identification sources and precedence.} \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS \code{structure_identifications.tsv} output, or \code{NULL}. Only the -\code{mappingFeatureId} and \code{name} columns are read. 
When supplied, -SIRIUS \code{name} fills \code{ProteinName} for features that received no -MZMine compound (tier 2, MSI Level 3).} +\code{mappingFeatureId} and \code{name} columns are read; score columns +(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate}, +\code{SiriusScore}) are ignored in this release. When supplied, the +SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills +\code{ProteinName} for features that received no MZMine compound name. +The schema is validated against SIRIUS 6 output; users on other +versions can rename columns to match. Pass \code{NULL} to disable.} } \value{ data.table diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd index 44c67989..2115f1c5 100644 --- a/vignettes/msstats_data_format.Rmd +++ b/vignettes/msstats_data_format.Rmd @@ -359,16 +359,16 @@ in the output: see "Annotation levels and the SIRIUS option" below for how ## Annotation levels and the SIRIUS option -`ProteinName` is filled in three tiers, in priority order: +`ProteinName` is filled from one of three sources, in priority order: -| Tier | Source | MSI Level | Status | -|------|--------|-----------|--------| -| 1 | MZMine `compound_name` (highest-scoring) | Level 2 — MS/MS spectral match | Mandatory | -| 2 | SIRIUS `name` from `structure_identifications.tsv` | Level 3 — in-silico structure prediction | Optional (`sirius_annotations = NULL` by default) | -| 3 | `paste0(round(mz, 4), "_", round(rt, 2))` | — | Always, for features still unannotated | +| Source | MSI Level | Status | +|--------|-----------|--------| +| MZMine `compound_name` (highest-scoring) | Level 2 — MS/MS spectral match | Mandatory | +| SIRIUS `name` from `structure_identifications.tsv` | Level 3 — in-silico structure prediction | Optional (`sirius_annotations = NULL` by default) | +| m/z-RT identifier built from the feature's m/z and retention time (e.g. 
`455.282_0.65`) | — | Always, for features still unannotated | MZMine annotations (level 2) take precedence over SIRIUS (level 3). -The mz_rt fallback retains features rather than dropping them — a +The m/z-RT fallback retains features rather than dropping them — a deliberate trade-off: - **Pro**: a fuller feature set gives more stable medians and a more @@ -377,15 +377,8 @@ deliberate trade-off: spectral matching misses. - **Con**: retaining all features increases the number of hypotheses tested downstream, which weakens multiple-testing correction. For - confirmatory analyses, restrict to tier-1 features after conversion. -- **Note**: passing `removeProtein_with1Feature = TRUE` drops nearly - every mz_rt feature (and any SIRIUS-only singleton), because the - rounded `paste0(round(mz, 4), "_", round(rt, 2))` identifier is - almost always unique to one feature — so each mz_rt row is treated - as a single-feature "protein" and filtered out, quietly losing the - tier-3 retain-all benefit. In rare cases two features can share an - mz_rt name (when their rounded m/z and RT coincide); those collide - into a single multi-feature "protein" and survive the filter. + confirmatory analyses, restrict to the MZMine-annotated features + after conversion. The SIRIUS schema we depend on is `mappingFeatureId` (join key against MZMine `rowID`) and `name`. 
Score columns @@ -412,7 +405,7 @@ mzmine_library = data.table::fread(system.file( package = "MSstatsConvert" )) -# tier 1 fills from compound_name; tier 3 fills the remaining as mz_rt +# MZMine compound name fills annotated features; the m/z-RT fallback fills the rest mzmine_converted = MZMinetoMSstatsFormat( mzmine_input, annotation = mzmine_annotation, @@ -421,7 +414,7 @@ mzmine_converted = MZMinetoMSstatsFormat( ) head(mzmine_converted) -# Worked example with SIRIUS — fills tier 2 for features the MZMine library missed: +# Worked example with SIRIUS — the SIRIUS name fills ProteinName for features the MZMine library missed: sirius = data.table::fread(system.file( "tinytest/raw_data/MZMine/structure_identifications.tsv", package = "MSstatsConvert" From 7ddee25c3692dc7ac544fe70500ad82e776f1086 Mon Sep 17 00:00:00 2001 From: Swaraj Patil Date: Wed, 10 Jun 2026 12:27:30 -0400 Subject: [PATCH 4/4] Remove the explicit @param mzmine_annotations block; instead inherit from .cleanRawMZMine alongside sirius_annotations --- R/clean_MZMine.R | 4 +--- R/converters_MZMinetoMSstatsFormat.R | 5 ----- man/MSstatsClean.Rd | 4 +--- man/MZMinetoMSstatsFormat.Rd | 8 ++++---- man/dot-cleanRawMZMine.Rd | 4 +--- 5 files changed, 7 insertions(+), 18 deletions(-) diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index 1113ae7f..35f0cf00 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -10,9 +10,7 @@ #' annotations with columns `id`, `compound_name`, `score`. Required; #' passing `NULL` raises an error. The highest-scoring `compound_name` #' per feature (MSI Level 2 putative identification via MS/MS spectral -#' matching) is used as `ProteinName`. See the public -#' `MZMinetoMSstatsFormat` docstring for the full discussion of -#' identification sources and precedence. +#' matching) is used as `ProteinName`. #' @param sirius_annotations Optional `data.frame` of SIRIUS #' `structure_identifications.tsv` output, or `NULL`.
Only the #' `mappingFeatureId` and `name` columns are read; score columns diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R index 3140c7f5..ad872727 100644 --- a/R/converters_MZMinetoMSstatsFormat.R +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -12,11 +12,6 @@ #' trailing `"Peakarea"` suffix removed. For example, a quant-file column #' `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization, #' so the corresponding `Run` value must be `sampleAmzML`. -#' @param mzmine_annotations `data.frame` of MZMine spectral-library -#' annotations with columns `id`, `compound_name`, `score`. Required: -#' the highest-scoring `compound_name` per feature (MSI Level 2 -#' putative identification via MS/MS spectral matching) is used as -#' `ProteinName`. #' #' @details #' `ProteinName` is assigned from one of three sources, in priority diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index 6d4030e4..bfa79d6a 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -205,9 +205,7 @@ peptides receive \code{IsotopeLabelType = "Light"}.} annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. The highest-scoring \code{compound_name} per feature (MSI Level 2 putative identification via MS/MS spectral -matching) is used as \code{ProteinName}. See the public -\code{MZMinetoMSstatsFormat} docstring for the full discussion of -identification sources and precedence.} +matching) is used as \code{ProteinName}.} \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS \code{structure_identifications.tsv} output, or \code{NULL}. Only the diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd index 70be5876..971d6974 100644 --- a/man/MZMinetoMSstatsFormat.Rd +++ b/man/MZMinetoMSstatsFormat.Rd @@ -31,10 +31,10 @@ trailing \code{"Peakarea"} suffix removed. 
For example, a quant-file column so the corresponding \code{Run} value must be \code{sampleAmzML}.} \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library -annotations with columns \code{id}, \code{compound_name}, \code{score}. Required: -the highest-scoring \code{compound_name} per feature (MSI Level 2 -putative identification via MS/MS spectral matching) is used as -\code{ProteinName}.} +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; +passing \code{NULL} raises an error. The highest-scoring \code{compound_name} +per feature (MSI Level 2 putative identification via MS/MS spectral +matching) is used as \code{ProteinName}.} \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS \code{structure_identifications.tsv} output, or \code{NULL}. Only the diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd index 2a2189bf..0c936d52 100644 --- a/man/dot-cleanRawMZMine.Rd +++ b/man/dot-cleanRawMZMine.Rd @@ -13,9 +13,7 @@ annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. The highest-scoring \code{compound_name} per feature (MSI Level 2 putative identification via MS/MS spectral -matching) is used as \code{ProteinName}. See the public -\code{MZMinetoMSstatsFormat} docstring for the full discussion of -identification sources and precedence.} +matching) is used as \code{ProteinName}.} \item{sirius_annotations}{Optional \code{data.frame} of SIRIUS \code{structure_identifications.tsv} output, or \code{NULL}. Only the