diff --git a/R/clean_MZMine.R b/R/clean_MZMine.R index cee0cf05..35f0cf00 100644 --- a/R/clean_MZMine.R +++ b/R/clean_MZMine.R @@ -9,17 +9,25 @@ #' @param mzmine_annotations `data.frame` of MZMine spectral-library #' annotations with columns `id`, `compound_name`, `score`. Required; #' passing `NULL` raises an error. The highest-scoring `compound_name` -#' per feature is used as `ProteinName`, and features in the quant -#' table with no matching annotation row are dropped from the output. -#' These are MSI Level 2 annotations (putative identification via -#' MS/MS spectral matching). See the public `MZMinetoMSstatsFormat` -#' docstring for the full scope discussion. +#' per feature (MSI Level 2 putative identification via MS/MS spectral +#' matching) is used as `ProteinName`. +#' @param sirius_annotations Optional `data.frame` of SIRIUS +#' `structure_identifications.tsv` output, or `NULL`. Only the +#' `mappingFeatureId` and `name` columns are read; score columns +#' (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`, +#' `SiriusScore`) are ignored in this release. When supplied, the +#' SIRIUS `name` (MSI Level 3, in-silico structure prediction) fills +#' `ProteinName` for features that received no MZMine compound name. +#' The schema is validated against SIRIUS 6 output; users on other +#' versions can rename columns to match. Pass `NULL` to disable. #' @return data.table #' @keywords internal -.cleanRawMZMine <- function(msstats_object, mzmine_annotations) { +.cleanRawMZMine <- function(msstats_object, mzmine_annotations, + sirius_annotations = NULL) { ProteinName = PeptideSequence = Intensity = Run = NULL PrecursorCharge = FragmentIon = ProductCharge = NULL id = score = compound_name = i.compound_name = NULL + rowmz = rowretentiontime = mappingFeatureId = name = NULL mz_input = getInputFile(msstats_object, "input") mz_input = data.table::as.data.table(mz_input) @@ -32,10 +40,13 @@ "columns named ' Peak area' (e.g. 'sampleA.mzML Peak area').") } id_col <- "rowID" - required_meta <- id_col + mz_col <- "rowmz" + rt_col <- "rowretentiontime" + required_meta <- c(id_col, mz_col, rt_col) missing_meta <- setdiff(required_meta, colnames(mz_input)) if (length(missing_meta) > 0) { - stop("Missing required MZMine metadata column (expected 'row ID'). ", + stop("Missing required MZMine metadata column(s) ", + "(expected 'row ID', 'row m/z', 'row retention time'). ", "After standardization, looked for: ", paste(missing_meta, collapse = ", "), ".") } @@ -57,20 +68,47 @@ } data.table::setorder(feature_to_compound, id, -score) feature_to_compound <- unique(feature_to_compound, by = "id") - # Inner-join filter: drop quant rows with no matching annotation. + # MZMine compound name fill (left-join, no drop). mz_input[ feature_to_compound, ProteinName := i.compound_name, on = setNames("id", id_col) ] - mz_input <- mz_input[!is.na(ProteinName)] + n_mzmine <- sum(!is.na(mz_input$ProteinName)) - retained_ids <- feature_to_compound$id - retained_msg <- paste0("** MZMine: retained ", length(retained_ids), - " feature(s) after annotation join: ", - paste(retained_ids, collapse = ", ")) - getOption("MSstatsLog")("INFO", retained_msg) - getOption("MSstatsMsg")("INFO", retained_msg) + # SIRIUS name fills features still NA after the MZMine compound fill. + n_sirius <- 0L + if (!is.null(sirius_annotations)) { + sirius_dt <- data.table::copy(data.table::as.data.table(sirius_annotations)) + drop_cols <- setdiff(colnames(sirius_dt), c("mappingFeatureId", "name")) + for (col in drop_cols) data.table::set(sirius_dt, j = col, value = NULL) + sirius_dt[, name := ifelse(is.na(name) | name == "", + NA_character_, as.character(name))] + sirius_dt[, mappingFeatureId := as.character(mappingFeatureId)] + data.table::setorder(sirius_dt, mappingFeatureId, name) + # unique() keeps the dedup 1:1 for the join and handles + # multiple structure candidates per feature. + sirius_dt <- unique(sirius_dt, by = "mappingFeatureId") + mz_input[is.na(ProteinName), ProteinName := + sirius_dt[.(as.character(get(id_col))), on = "mappingFeatureId", name]] + n_sirius <- sum(!is.na(mz_input$ProteinName)) - n_mzmine + } + + # m/z-RT fallback for features still NA. + na_mask <- is.na(mz_input$ProteinName) + n_fallback <- sum(na_mask) + if (n_fallback > 0) { + mz_input[na_mask, ProteinName := paste0( + round(get(mz_col), 4), "_", round(get(rt_col), 2))] + } + + assignment_msg <- paste0( + "** MZMine ProteinName assignment: ", + "MZMine compound: ", n_mzmine, " feature(s); ", + "SIRIUS name: ", n_sirius, " feature(s); ", + "m/z-RT fallback: ", n_fallback, " feature(s).") + getOption("MSstatsLog")("INFO", assignment_msg) + getOption("MSstatsMsg")("INFO", assignment_msg) mz_input[, PeptideSequence := as.character(get(id_col))] diff --git a/R/converters_MZMinetoMSstatsFormat.R b/R/converters_MZMinetoMSstatsFormat.R index 4e3df158..ad872727 100644 --- a/R/converters_MZMinetoMSstatsFormat.R +++ b/R/converters_MZMinetoMSstatsFormat.R @@ -1,6 +1,7 @@ #' Import MZMine files #' #' @inheritParams .sharedParametersAmongConverters +#' @inheritParams .cleanRawMZMine #' @param input MZMine feature-quantification table (wide format; one row per #' feature). Must include the metadata columns `row ID`, `row m/z`, #' `row retention time`, and per-sample peak-area columns named @@ -11,19 +12,39 @@ #' trailing `"Peakarea"` suffix removed. For example, a quant-file column #' `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization, #' so the corresponding `Run` value must be `sampleAmzML`. -#' @param mzmine_annotations `data.frame` of MZMine spectral-library -#' annotations with columns `id`, `compound_name`, `score`. Required: -#' the highest-scoring `compound_name` per feature is used as -#' `ProteinName`, and features in the quant table with no matching -#' annotation row are dropped from the output. #' -#' These are MSI Level 2 annotations (putative identification via -#' MS/MS spectral matching against a reference library). Higher- -#' confidence Level 1 identifications require pure reference standards -#' and are out of scope here. Lower-confidence annotations such as -#' Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via -#' CANOPUS) are not currently supported -- features without a Level 2 -#' annotation row are filtered out. +#' @details +#' `ProteinName` is assigned from one of three sources, in priority +#' order: the MZMine compound name (mandatory), the SIRIUS name +#' (optional), and an m/z-RT fallback (always available). +#' +#' The **MZMine compound name** is the highest-scoring `compound_name` +#' from `mzmine_annotations` for each feature. This corresponds to MSI +#' Level 2 (Sumner et al. 2007, PMID 27624161): a putative +#' identification by MS/MS spectral matching to a reference library. +#' +#' The **SIRIUS name** comes from SIRIUS's +#' `structure_identifications.tsv` and corresponds to MSI Level 3: an +#' in-silico structure prediction. When `sirius_annotations` is +#' non-NULL, the SIRIUS `name` fills `ProteinName` only for features +#' the MZMine library missed -- the MZMine compound name takes +#' precedence. +#' +#' The **m/z-RT fallback** is an identifier built from the feature's +#' m/z and retention time (for example, `455.282_0.65`). Features that +#' receive no MZMine or SIRIUS annotation are retained, not dropped, +#' and assigned an m/z-RT identifier as their `ProteinName`. +#' +#' Retaining every feature is a deliberate trade-off. A fuller feature +#' set gives more stable medians and a more reliable empirical +#' distribution for global normalization. SIRIUS extends discovery +#' coverage to features that level-2 spectral matching misses. The +#' cost is an increase in the number of hypotheses tested downstream +#' (in `MSstats::groupComparison`), which weakens multiple-testing +#' correction. Users running confirmatory analyses should restrict to +#' the MZMine-annotated features post-conversion; users running +#' discovery analyses benefit from the additional sources despite the +#' FDR burden. #' #' @return data.table in the MSstats required format. #' @@ -43,11 +64,23 @@ #' mzmine_annotations = lib, #' use_log_file = FALSE) #' head(output) +#' +#' # With SIRIUS annotations: +#' sirius_path = system.file( +#' "tinytest/raw_data/MZMine/structure_identifications.tsv", +#' package = "MSstatsConvert") +#' sirius = data.table::fread(sirius_path) +#' output_with_sirius = MZMinetoMSstatsFormat( +#' input, annotation = annot, +#' mzmine_annotations = lib, +#' sirius_annotations = sirius, +#' use_log_file = FALSE) +#' head(output_with_sirius) MZMinetoMSstatsFormat = function( input, annotation = NULL, mzmine_annotations, - removeProtein_with1Feature = FALSE, + sirius_annotations = NULL, summaryforMultipleRows = max, use_log_file = TRUE, append = FALSE, @@ -62,10 +95,22 @@ MZMinetoMSstatsFormat = function( "columns 'id', 'compound_name', 'score'.") } + if (!is.null(sirius_annotations)) { + sirius_cols = colnames(sirius_annotations) + missing_sirius = setdiff(c("mappingFeatureId", "name"), sirius_cols) + if (length(missing_sirius) > 0) { + stop("sirius_annotations is missing required column(s): ", + paste(missing_sirius, collapse = ", "), + ". Required: 'mappingFeatureId' and 'name'.") + } + } + input = MSstatsConvert::MSstatsImport(list(input = input), "MSstats", "MZMine", ...) input = MSstatsConvert::MSstatsClean( - input, mzmine_annotations = mzmine_annotations) + input, + mzmine_annotations = mzmine_annotations, + sirius_annotations = sirius_annotations) annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation) feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge") @@ -75,7 +120,7 @@ MZMinetoMSstatsFormat = function( annotation, feature_columns, remove_shared_peptides = FALSE, - remove_single_feature_proteins = removeProtein_with1Feature, + remove_single_feature_proteins = FALSE, exact_filtering = NULL, pattern_filtering = NULL, aggregate_isotopic = FALSE, diff --git a/inst/tinytest/raw_data/MZMine/structure_identifications.tsv b/inst/tinytest/raw_data/MZMine/structure_identifications.tsv new file mode 100644 index 00000000..a0e38e69 --- /dev/null +++ b/inst/tinytest/raw_data/MZMine/structure_identifications.tsv @@ -0,0 +1,5 @@ +mappingFeatureId name ConfidenceScoreExact ConfidenceScoreApproximate SiriusScore +1 DuplicateFromSirius 0.30 0.40 5.5 +4 Caffeic acid 0.85 0.88 22.1 +5 0.10 0.12 1.0 +99 Ghost 0.50 0.55 8.0 diff --git a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R index dcfccf90..2589f1fc 100644 --- a/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R +++ b/inst/tinytest/test_converters_MZMinetoMSstatsFormat.R @@ -15,10 +15,10 @@ output = MZMinetoMSstatsFormat(input, annotation = annot, use_log_file = FALSE) output_dt = data.table::as.data.table(output) -# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns -# Features 4 and 5 have no annotation row and are dropped by the inner join. +# Basic structure: 6 features x 4 runs = 24 rows; all features retained. +# Features 4 and 5 have no MZMine annotation and receive mz_rt fallback names. expect_equal(ncol(output), 11) -expect_equal(nrow(output), 16) +expect_equal(nrow(output), 24) expect_true("Run" %in% colnames(output)) expect_true("ProteinName" %in% colnames(output)) expect_true("PeptideSequence" %in% colnames(output)) @@ -54,11 +54,13 @@ expect_equal(as.character(feature3_proteins), "Lactate") feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName]) expect_equal(as.character(feature6_proteins), "Caffeine") -# Features absent from the annotations file are filtered out (no mz_rt fallback) -expect_false("4" %in% as.character(output_dt$PeptideSequence)) -expect_false("5" %in% as.character(output_dt$PeptideSequence)) -expect_false(any(as.character(output_dt$ProteinName) %in% - c("489.334_7.89", "555.447_9.1"))) +# Features absent from the MZMine annotations file get mz_rt fallback ProteinNames. +expect_true("4" %in% as.character(output_dt$PeptideSequence)) +expect_true("5" %in% as.character(output_dt$PeptideSequence)) +feature4_protein = unique(output_dt[PeptideSequence == "4", ProteinName]) +expect_equal(as.character(feature4_protein), "489.334_7.89") +feature5_protein = unique(output_dt[PeptideSequence == "5", ProteinName]) +expect_equal(as.character(feature5_protein), "555.447_9.1") # Zero-intensity input cells are converted to NA in output # Feature 3 sampleB = 0 -> NA (feature 3 is annotated as Lactate) @@ -97,17 +99,43 @@ expect_error( "mzmine_annotations is required" ) -# removeProtein_with1Feature filters non-Caffeine proteins ------------------- -# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6); -# Lactate and Glucose each have 1. -output_filtered = MZMinetoMSstatsFormat(input, annotation = annot, - mzmine_annotations = mzmine_ann, - removeProtein_with1Feature = TRUE, - use_log_file = FALSE) -output_filtered_dt = data.table::as.data.table(output_filtered) - -expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine") -# 2 features x 4 runs = 8 rows -expect_equal(nrow(output_filtered), 8) -expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))), - c("1", "6")) +# With sirius_annotations supplied --------------------------------------------- +sirius_path = system.file("tinytest/raw_data/MZMine/structure_identifications.tsv", + package = "MSstatsConvert") +sirius = data.table::fread(sirius_path) + +output_sirius = MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + sirius_annotations = sirius, + use_log_file = FALSE) +output_sirius_dt = data.table::as.data.table(output_sirius) + +# All 6 features still retained +expect_equal(nrow(output_sirius), 24) + +# Precedence: feature 1 hit by both MZMine (Caffeine) and SIRIUS +# (DuplicateFromSirius). MZMine wins. +feature1_proteins = unique(output_sirius_dt[PeptideSequence == "1", ProteinName]) +expect_equal(as.character(feature1_proteins), "Caffeine") + +# SIRIUS fill: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid" +feature4_proteins = unique(output_sirius_dt[PeptideSequence == "4", ProteinName]) +expect_equal(as.character(feature4_proteins), "Caffeic acid") + +# m/z-RT fallback: feature 5 has only an empty-name SIRIUS row; falls to m/z-RT +feature5_proteins = unique(output_sirius_dt[PeptideSequence == "5", ProteinName]) +expect_equal(as.character(feature5_proteins), "555.447_9.1") + +# An irrelevant SIRIUS row (mappingFeatureId=99) must not introduce new features +expect_false("99" %in% as.character(output_sirius_dt$PeptideSequence)) +expect_false("Ghost" %in% as.character(output_sirius_dt$ProteinName)) + +# sirius_annotations missing required columns triggers stop() ------------------ +bad_sirius = data.frame(mappingFeatureId = 1, score = 0.9) # no 'name' +expect_error( + MZMinetoMSstatsFormat(input, annotation = annot, + mzmine_annotations = mzmine_ann, + sirius_annotations = bad_sirius, + use_log_file = FALSE), + "missing required column" +) diff --git a/man/MSstatsClean.Rd b/man/MSstatsClean.Rd index 8863b4ea..bfa79d6a 100644 --- a/man/MSstatsClean.Rd +++ b/man/MSstatsClean.Rd @@ -82,7 +82,7 @@ MSstatsClean(msstats_object, ...) \S4method{MSstatsClean}{MSstatsProteinProspectorFiles}(msstats_object) -\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations) +\S4method{MSstatsClean}{MSstatsMZMineFiles}(msstats_object, mzmine_annotations, sirius_annotations = NULL) } \arguments{ \item{msstats_object}{object that inherits from \code{MSstatsInputFiles} class.} @@ -204,11 +204,18 @@ peptides receive \code{IsotopeLabelType = "Light"}.} \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. The highest-scoring \code{compound_name} -per feature is used as \code{ProteinName}, and features in the quant -table with no matching annotation row are dropped from the output. -These are MSI Level 2 annotations (putative identification via -MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat} -docstring for the full scope discussion.} +per feature (MSI Level 2 putative identification via MS/MS spectral +matching) is used as \code{ProteinName}.} + +\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS +\code{structure_identifications.tsv} output, or \code{NULL}. Only the +\code{mappingFeatureId} and \code{name} columns are read; score columns +(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate}, +\code{SiriusScore}) are ignored in this release. When supplied, the +SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills +\code{ProteinName} for features that received no MZMine compound name. +The schema is validated against SIRIUS 6 output; users on other +versions can rename columns to match. Pass \code{NULL} to disable.} } \value{ data.table diff --git a/man/MZMinetoMSstatsFormat.Rd b/man/MZMinetoMSstatsFormat.Rd index b3fdcfe1..971d6974 100644 --- a/man/MZMinetoMSstatsFormat.Rd +++ b/man/MZMinetoMSstatsFormat.Rd @@ -8,7 +8,7 @@ MZMinetoMSstatsFormat( input, annotation = NULL, mzmine_annotations, - removeProtein_with1Feature = FALSE, + sirius_annotations = NULL, summaryforMultipleRows = max, use_log_file = TRUE, append = FALSE, @@ -31,22 +31,26 @@ trailing \code{"Peakarea"} suffix removed. For example, a quant-file column so the corresponding \code{Run} value must be \code{sampleAmzML}.} \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library -annotations with columns \code{id}, \code{compound_name}, \code{score}. Required: -the highest-scoring \code{compound_name} per feature is used as -\code{ProteinName}, and features in the quant table with no matching -annotation row are dropped from the output. +annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; +passing \code{NULL} raises an error. The highest-scoring \code{compound_name} +per feature (MSI Level 2 putative identification via MS/MS spectral +matching) is used as \code{ProteinName}.} -These are MSI Level 2 annotations (putative identification via -MS/MS spectral matching against a reference library). Higher- -confidence Level 1 identifications require pure reference standards -and are out of scope here. Lower-confidence annotations such as -Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via -CANOPUS) are not currently supported -- features without a Level 2 -annotation row are filtered out.} +\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS +\code{structure_identifications.tsv} output, or \code{NULL}. Only the +\code{mappingFeatureId} and \code{name} columns are read; score columns +(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate}, +\code{SiriusScore}) are ignored in this release. When supplied, the +SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills +\code{ProteinName} for features that received no MZMine compound name. +The schema is validated against SIRIUS 6 output; users on other +versions can rename columns to match. Pass \code{NULL} to disable.} -\item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.} - -\item{summaryforMultipleRows}{max or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities. Default is max for label-free converters and sum for TMT converters.} +\item{summaryforMultipleRows}{max or sum - when multiple PSMs identify +the same feature within a single MS run (duplicate PSMs), use the +highest (max) or sum of the duplicate intensities. Default is max for +label-free converters and sum for TMT converters. Note that this parameter +does NOT control collapsing across fractions of the same biological mixture.} \item{use_log_file}{logical. If TRUE, information about data processing will be saved to a file.} @@ -70,6 +74,39 @@ data.table in the MSstats required format. \description{ Import MZMine files } +\details{ +\code{ProteinName} is assigned from one of three sources, in priority +order: the MZMine compound name (mandatory), the SIRIUS name +(optional), and an m/z-RT fallback (always available). + +The \strong{MZMine compound name} is the highest-scoring \code{compound_name} +from \code{mzmine_annotations} for each feature. This corresponds to MSI +Level 2 (Sumner et al. 2007, PMID 27624161): a putative +identification by MS/MS spectral matching to a reference library. + +The \strong{SIRIUS name} comes from SIRIUS's +\code{structure_identifications.tsv} and corresponds to MSI Level 3: an +in-silico structure prediction. When \code{sirius_annotations} is +non-NULL, the SIRIUS \code{name} fills \code{ProteinName} only for features +the MZMine library missed -- the MZMine compound name takes +precedence. + +The \strong{m/z-RT fallback} is an identifier built from the feature's +m/z and retention time (for example, \verb{455.282_0.65}). Features that +receive no MZMine or SIRIUS annotation are retained, not dropped, +and assigned an m/z-RT identifier as their \code{ProteinName}. + +Retaining every feature is a deliberate trade-off. A fuller feature +set gives more stable medians and a more reliable empirical +distribution for global normalization. SIRIUS extends discovery +coverage to features that level-2 spectral matching misses. The +cost is an increase in the number of hypotheses tested downstream +(in \code{MSstats::groupComparison}), which weakens multiple-testing +correction. Users running confirmatory analyses should restrict to +the MZMine-annotated features post-conversion; users running +discovery analyses benefit from the additional sources despite the +FDR burden. +} \examples{ input_path = system.file("tinytest/raw_data/MZMine/mzmine_input.csv", package = "MSstatsConvert") @@ -84,4 +121,16 @@ output = MZMinetoMSstatsFormat(input, annotation = annot, mzmine_annotations = lib, use_log_file = FALSE) head(output) + +# With SIRIUS annotations: +sirius_path = system.file( + "tinytest/raw_data/MZMine/structure_identifications.tsv", + package = "MSstatsConvert") +sirius = data.table::fread(sirius_path) +output_with_sirius = MZMinetoMSstatsFormat( + input, annotation = annot, + mzmine_annotations = lib, + sirius_annotations = sirius, + use_log_file = FALSE) +head(output_with_sirius) } diff --git a/man/dot-cleanRawMZMine.Rd b/man/dot-cleanRawMZMine.Rd index 8c93db08..0c936d52 100644 --- a/man/dot-cleanRawMZMine.Rd +++ b/man/dot-cleanRawMZMine.Rd @@ -4,7 +4,7 @@ \alias{.cleanRawMZMine} \title{Clean raw MZMine files} \usage{ -.cleanRawMZMine(msstats_object, mzmine_annotations) +.cleanRawMZMine(msstats_object, mzmine_annotations, sirius_annotations = NULL) } \arguments{ \item{msstats_object}{an object of class \code{MSstatsMZMineFiles}.} @@ -12,11 +12,18 @@ \item{mzmine_annotations}{\code{data.frame} of MZMine spectral-library annotations with columns \code{id}, \code{compound_name}, \code{score}. Required; passing \code{NULL} raises an error. The highest-scoring \code{compound_name} -per feature is used as \code{ProteinName}, and features in the quant -table with no matching annotation row are dropped from the output. -These are MSI Level 2 annotations (putative identification via -MS/MS spectral matching). See the public \code{MZMinetoMSstatsFormat} -docstring for the full scope discussion.} +per feature (MSI Level 2 putative identification via MS/MS spectral +matching) is used as \code{ProteinName}.} + +\item{sirius_annotations}{Optional \code{data.frame} of SIRIUS +\code{structure_identifications.tsv} output, or \code{NULL}. Only the +\code{mappingFeatureId} and \code{name} columns are read; score columns +(\code{ConfidenceScoreExact}, \code{ConfidenceScoreApproximate}, +\code{SiriusScore}) are ignored in this release. When supplied, the +SIRIUS \code{name} (MSI Level 3, in-silico structure prediction) fills +\code{ProteinName} for features that received no MZMine compound name. +The schema is validated against SIRIUS 6 output; users on other +versions can rename columns to match. Pass \code{NULL} to disable.} } \value{ data.table diff --git a/vignettes/msstats_data_format.Rmd b/vignettes/msstats_data_format.Rmd index 64a4d882..2115f1c5 100644 --- a/vignettes/msstats_data_format.Rmd +++ b/vignettes/msstats_data_format.Rmd @@ -353,17 +353,43 @@ per feature, one ` Peak area` column per sample) together with a standar MSstats annotation and produces an MSstats-ready long-format `data.table`. An MZMine spectral-library annotation table with `id`, `compound_name`, and -`score` columns is **required**. The highest-scoring `compound_name` per feature -is used as `ProteinName`. Features in the quant table with no matching annotation -row are dropped from the output — there is no synthesized mz_rt fallback, -because placeholder identifiers inflate the hypothesis count for downstream -`groupComparison` without biological signal. - -These are [MSI Level 2 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/) -(putative identification via MS/MS spectral matching against a reference library). -Lower-confidence annotation sources — SIRIUS / MS2Query (Level 3) and CANOPUS -(Level 4) — are out of scope for this iteration; features without a Level 2 -annotation row are filtered out. +`score` columns is **required**. All features in the quant table are retained +in the output: see "Annotation levels and the SIRIUS option" below for how +`ProteinName` is assigned. + +## Annotation levels and the SIRIUS option + +`ProteinName` is filled from one of three sources, in priority order: + +| Source | MSI Level | Status | +|--------|-----------|--------| +| MZMine `compound_name` (highest-scoring) | Level 2 — MS/MS spectral match | Mandatory | +| SIRIUS `name` from `structure_identifications.tsv` | Level 3 — in-silico structure prediction | Optional (`sirius_annotations = NULL` by default) | +| m/z-RT identifier built from the feature's m/z and retention time (e.g. `455.282_0.65`) | — | Always, for features still unannotated | + +MZMine annotations (level 2) take precedence over SIRIUS (level 3). +The m/z-RT fallback retains features rather than dropping them — a +deliberate trade-off: + +- **Pro**: a fuller feature set gives more stable medians and a more + reliable empirical distribution for global normalization. +- **Pro**: SIRIUS extends discovery coverage to features that level-2 + spectral matching misses. +- **Con**: retaining all features increases the number of hypotheses + tested downstream, which weakens multiple-testing correction. For + confirmatory analyses, restrict to the MZMine-annotated features + after conversion. + +The SIRIUS schema we depend on is `mappingFeatureId` (join key against +MZMine `rowID`) and `name`. Score columns +(`ConfidenceScoreExact`, `ConfidenceScoreApproximate`, `SiriusScore`) +are present in the file but not used in this release. The schema is +validated against SIRIUS 6 output; users on different SIRIUS versions +can rename their columns to match. + +The reference background is +[MSI Level 2 / Level 3 annotations](https://pmc.ncbi.nlm.nih.gov/articles/PMC5110944/) +(Sumner et al. 2007, PMID 27624161). ```{r mzmine} mzmine_input = data.table::fread(system.file( @@ -379,7 +405,7 @@ mzmine_library = data.table::fread(system.file( package = "MSstatsConvert" )) -# ProteinName comes from the matched compound_name; unannotated features are dropped +# MZMine compound name fills annotated features; the m/z-RT fallback fills the rest mzmine_converted = MZMinetoMSstatsFormat( mzmine_input, annotation = mzmine_annotation, @@ -387,6 +413,20 @@ mzmine_converted = MZMinetoMSstatsFormat( use_log_file = FALSE ) head(mzmine_converted) + +# Worked example with SIRIUS — fills the SIRIUS name for features the MZMine library missed: +sirius = data.table::fread(system.file( + "tinytest/raw_data/MZMine/structure_identifications.tsv", + package = "MSstatsConvert" +)) +mzmine_with_sirius = MZMinetoMSstatsFormat( + mzmine_input, + annotation = mzmine_annotation, + mzmine_annotations = mzmine_library, + sirius_annotations = sirius, + use_log_file = FALSE +) +unique(mzmine_with_sirius[, c("PeptideSequence", "ProteinName")]) ``` Since metabolomics features do not carry peptide-level identifiers, `PeptideSequence`