Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 54 additions & 16 deletions R/clean_MZMine.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,25 @@
#' @param mzmine_annotations `data.frame` of MZMine spectral-library
#' annotations with columns `id`, `compound_name`, `score`. Required;
#' passing `NULL` raises an error. The highest-scoring `compound_name`
#' per feature is used as `ProteinName`, and features in the quant
#' table with no matching annotation row are dropped from the output.
#' These are MSI Level 2 annotations (putative identification via
#' MS/MS spectral matching). See the public `MZMinetoMSstatsFormat`
#' docstring for the full scope discussion.
#' per feature (MSI Level 2 putative identification via MS/MS spectral
#' matching) is used as `ProteinName`.
#' @param sirius_annotations Optional `data.frame` of SIRIUS
#' `structure_identifications.tsv` output, or `NULL`. Only the
#' `mappingFeatureId` and `name` columns are read; score columns
#' (`ConfidenceScoreExact`, `ConfidenceScoreApproximate`,
#' `SiriusScore`) are ignored in this release. When supplied, the
#' SIRIUS `name` (MSI Level 3, in-silico structure prediction) fills
#' `ProteinName` for features that received no MZMine compound name.
#' The schema is validated against SIRIUS 6 output; users on other
#' versions can rename columns to match. Pass `NULL` to disable.
#' @return data.table
#' @keywords internal
.cleanRawMZMine <- function(msstats_object, mzmine_annotations) {
.cleanRawMZMine <- function(msstats_object, mzmine_annotations,
sirius_annotations = NULL) {
ProteinName = PeptideSequence = Intensity = Run = NULL
PrecursorCharge = FragmentIon = ProductCharge = NULL
id = score = compound_name = i.compound_name = NULL
rowmz = rowretentiontime = mappingFeatureId = name = NULL

mz_input = getInputFile(msstats_object, "input")
mz_input = data.table::as.data.table(mz_input)
Expand All @@ -32,10 +40,13 @@
"columns named '<run> Peak area' (e.g. 'sampleA.mzML Peak area').")
}
id_col <- "rowID"
required_meta <- id_col
mz_col <- "rowmz"
rt_col <- "rowretentiontime"
required_meta <- c(id_col, mz_col, rt_col)
missing_meta <- setdiff(required_meta, colnames(mz_input))
if (length(missing_meta) > 0) {
stop("Missing required MZMine metadata column (expected 'row ID'). ",
stop("Missing required MZMine metadata column(s) ",
"(expected 'row ID', 'row m/z', 'row retention time'). ",
"After standardization, looked for: ",
paste(missing_meta, collapse = ", "), ".")
}
Expand All @@ -57,20 +68,47 @@
}
data.table::setorder(feature_to_compound, id, -score)
feature_to_compound <- unique(feature_to_compound, by = "id")
# Inner-join filter: drop quant rows with no matching annotation.
# MZMine compound name fill (left-join, no drop).
mz_input[
feature_to_compound,
ProteinName := i.compound_name,
on = setNames("id", id_col)
]
mz_input <- mz_input[!is.na(ProteinName)]
n_mzmine <- sum(!is.na(mz_input$ProteinName))

retained_ids <- feature_to_compound$id
retained_msg <- paste0("** MZMine: retained ", length(retained_ids),
" feature(s) after annotation join: ",
paste(retained_ids, collapse = ", "))
getOption("MSstatsLog")("INFO", retained_msg)
getOption("MSstatsMsg")("INFO", retained_msg)
# SIRIUS name fills features still NA after the MZMine compound fill.
n_sirius <- 0L
if (!is.null(sirius_annotations)) {
sirius_dt <- data.table::copy(data.table::as.data.table(sirius_annotations))
drop_cols <- setdiff(colnames(sirius_dt), c("mappingFeatureId", "name"))
for (col in drop_cols) data.table::set(sirius_dt, j = col, value = NULL)
sirius_dt[, name := ifelse(is.na(name) | name == "",
NA_character_, as.character(name))]
sirius_dt[, mappingFeatureId := as.character(mappingFeatureId)]
data.table::setorder(sirius_dt, mappingFeatureId, name)
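# Keep one row per feature so the join below stays 1:1. After the
# setorder() above, unique() retains the first row per mappingFeatureId,
# i.e. the alphabetically first `name`, not the top-ranked candidate.
# NOTE(review): data.table::setorder() defaults to na.last = FALSE, so a
# feature with both a blank and a non-blank name keeps the blank (NA)
# row and falls through to the m/z-RT fallback -- confirm this is intended.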
sirius_dt <- unique(sirius_dt, by = "mappingFeatureId")
Comment thread
tonywu1999 marked this conversation as resolved.
mz_input[is.na(ProteinName), ProteinName :=
sirius_dt[.(as.character(get(id_col))), on = "mappingFeatureId", name]]
n_sirius <- sum(!is.na(mz_input$ProteinName)) - n_mzmine
}

# m/z-RT fallback for features still NA.
na_mask <- is.na(mz_input$ProteinName)
n_fallback <- sum(na_mask)
if (n_fallback > 0) {
mz_input[na_mask, ProteinName := paste0(
round(get(mz_col), 4), "_", round(get(rt_col), 2))]
}

assignment_msg <- paste0(
"** MZMine ProteinName assignment: ",
"MZMine compound: ", n_mzmine, " feature(s); ",
"SIRIUS name: ", n_sirius, " feature(s); ",
"m/z-RT fallback: ", n_fallback, " feature(s).")
getOption("MSstatsLog")("INFO", assignment_msg)
getOption("MSstatsMsg")("INFO", assignment_msg)

mz_input[, PeptideSequence := as.character(get(id_col))]

Expand Down
75 changes: 60 additions & 15 deletions R/converters_MZMinetoMSstatsFormat.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#' Import MZMine files
#'
#' @inheritParams .sharedParametersAmongConverters
#' @inheritParams .cleanRawMZMine
#' @param input MZMine feature-quantification table (wide format; one row per
#' feature). Must include the metadata columns `row ID`, `row m/z`,
#' `row retention time`, and per-sample peak-area columns named
Expand All @@ -11,19 +12,39 @@
#' trailing `"Peakarea"` suffix removed. For example, a quant-file column
#' `"sampleA.mzML Peak area"` becomes `"sampleAmzML"` after standardization,
#' so the corresponding `Run` value must be `sampleAmzML`.
#' @param mzmine_annotations `data.frame` of MZMine spectral-library
#' annotations with columns `id`, `compound_name`, `score`. Required:
#' the highest-scoring `compound_name` per feature is used as
#' `ProteinName`, and features in the quant table with no matching
#' annotation row are dropped from the output.
#'
#' These are MSI Level 2 annotations (putative identification via
#' MS/MS spectral matching against a reference library). Higher-
#' confidence Level 1 identifications require pure reference standards
#' and are out of scope here. Lower-confidence annotations such as
#' Level 3 (SIRIUS, MS2Query) or Level 4 (molecular formula via
#' CANOPUS) are not currently supported -- features without a Level 2
#' annotation row are filtered out.
#' @details
#' `ProteinName` is assigned from one of three sources, in priority
#' order: the MZMine compound name (from the required
#' `mzmine_annotations`), the SIRIUS name (from the optional
#' `sirius_annotations`), and an m/z-RT fallback (always available).
#'
#' The **MZMine compound name** is the highest-scoring `compound_name`
#' from `mzmine_annotations` for each feature. This corresponds to MSI
#' Level 2 (Sumner et al. 2007, PMID 27624161): a putative
#' identification by MS/MS spectral matching to a reference library.
#'
#' The **SIRIUS name** comes from SIRIUS's
#' `structure_identifications.tsv` and corresponds to MSI Level 3: an
#' in-silico structure prediction. When `sirius_annotations` is
#' non-NULL, the SIRIUS `name` fills `ProteinName` only for features
#' the MZMine library missed -- the MZMine compound name takes
#' precedence.
#'
#' The **m/z-RT fallback** is an identifier built from the feature's
#' m/z and retention time (for example, `455.282_0.65`). Features that
#' receive no MZMine or SIRIUS annotation are retained, not dropped,
#' and assigned an m/z-RT identifier as their `ProteinName`.
#'
#' Retaining every feature is a deliberate trade-off. A fuller feature
#' set gives more stable medians and a more reliable empirical
#' distribution for global normalization. SIRIUS extends discovery
#' coverage to features that Level 2 spectral matching misses. The
#' cost is an increase in the number of hypotheses tested downstream
#' (in `MSstats::groupComparison`), which makes multiple-testing
#' correction more stringent and reduces statistical power. Users
#' running confirmatory analyses should restrict to
#' the MZMine-annotated features post-conversion; users running
#' discovery analyses benefit from the additional sources despite the
#' FDR burden.
#'
#' @return data.table in the MSstats required format.
#'
Expand All @@ -43,11 +64,23 @@
#' mzmine_annotations = lib,
#' use_log_file = FALSE)
#' head(output)
#'
#' # With SIRIUS annotations:
#' sirius_path = system.file(
#' "tinytest/raw_data/MZMine/structure_identifications.tsv",
#' package = "MSstatsConvert")
#' sirius = data.table::fread(sirius_path)
#' output_with_sirius = MZMinetoMSstatsFormat(
#' input, annotation = annot,
#' mzmine_annotations = lib,
#' sirius_annotations = sirius,
#' use_log_file = FALSE)
#' head(output_with_sirius)
MZMinetoMSstatsFormat = function(
input,
annotation = NULL,
mzmine_annotations,
removeProtein_with1Feature = FALSE,
sirius_annotations = NULL,
summaryforMultipleRows = max,
use_log_file = TRUE,
append = FALSE,
Expand All @@ -62,10 +95,22 @@ MZMinetoMSstatsFormat = function(
"columns 'id', 'compound_name', 'score'.")
}

if (!is.null(sirius_annotations)) {
sirius_cols = colnames(sirius_annotations)
missing_sirius = setdiff(c("mappingFeatureId", "name"), sirius_cols)
if (length(missing_sirius) > 0) {
stop("sirius_annotations is missing required column(s): ",
paste(missing_sirius, collapse = ", "),
". Required: 'mappingFeatureId' and 'name'.")
}
}

input = MSstatsConvert::MSstatsImport(list(input = input),
"MSstats", "MZMine", ...)
input = MSstatsConvert::MSstatsClean(
input, mzmine_annotations = mzmine_annotations)
input,
mzmine_annotations = mzmine_annotations,
sirius_annotations = sirius_annotations)
annotation = MSstatsConvert::MSstatsMakeAnnotation(input, annotation)

feature_columns = c("PeptideSequence", "PrecursorCharge", "FragmentIon", "ProductCharge")
Expand All @@ -75,7 +120,7 @@ MZMinetoMSstatsFormat = function(
annotation,
feature_columns,
remove_shared_peptides = FALSE,
remove_single_feature_proteins = removeProtein_with1Feature,
remove_single_feature_proteins = FALSE,
exact_filtering = NULL,
pattern_filtering = NULL,
aggregate_isotopic = FALSE,
Expand Down
5 changes: 5 additions & 0 deletions inst/tinytest/raw_data/MZMine/structure_identifications.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mappingFeatureId name ConfidenceScoreExact ConfidenceScoreApproximate SiriusScore
1 DuplicateFromSirius 0.30 0.40 5.5
4 Caffeic acid 0.85 0.88 22.1
5 0.10 0.12 1.0
99 Ghost 0.50 0.55 8.0
72 changes: 50 additions & 22 deletions inst/tinytest/test_converters_MZMinetoMSstatsFormat.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ output = MZMinetoMSstatsFormat(input, annotation = annot,
use_log_file = FALSE)
output_dt = data.table::as.data.table(output)

# Basic structure: 4 annotated features x 4 runs = 16 rows, 11 standard columns
# Features 4 and 5 have no annotation row and are dropped by the inner join.
# Basic structure: 6 features x 4 runs = 24 rows; all features retained.
# Features 4 and 5 have no MZMine annotation and receive mz_rt fallback names.
expect_equal(ncol(output), 11)
expect_equal(nrow(output), 16)
expect_equal(nrow(output), 24)
expect_true("Run" %in% colnames(output))
expect_true("ProteinName" %in% colnames(output))
expect_true("PeptideSequence" %in% colnames(output))
Expand Down Expand Up @@ -54,11 +54,13 @@ expect_equal(as.character(feature3_proteins), "Lactate")
feature6_proteins = unique(output_dt[PeptideSequence == "6", ProteinName])
expect_equal(as.character(feature6_proteins), "Caffeine")

# Features absent from the annotations file are filtered out (no mz_rt fallback)
expect_false("4" %in% as.character(output_dt$PeptideSequence))
expect_false("5" %in% as.character(output_dt$PeptideSequence))
expect_false(any(as.character(output_dt$ProteinName) %in%
c("489.334_7.89", "555.447_9.1")))
# Features absent from the MZMine annotations file get mz_rt fallback ProteinNames.
expect_true("4" %in% as.character(output_dt$PeptideSequence))
expect_true("5" %in% as.character(output_dt$PeptideSequence))
feature4_protein = unique(output_dt[PeptideSequence == "4", ProteinName])
expect_equal(as.character(feature4_protein), "489.334_7.89")
feature5_protein = unique(output_dt[PeptideSequence == "5", ProteinName])
expect_equal(as.character(feature5_protein), "555.447_9.1")

# Zero-intensity input cells are converted to NA in output
# Feature 3 sampleB = 0 -> NA (feature 3 is annotated as Lactate)
Expand Down Expand Up @@ -97,17 +99,43 @@ expect_error(
"mzmine_annotations is required"
)

# removeProtein_with1Feature filters non-Caffeine proteins -------------------
# Of the annotated features (1, 2, 3, 6), Caffeine has 2 (IDs 1 and 6);
# Lactate and Glucose each have 1.
output_filtered = MZMinetoMSstatsFormat(input, annotation = annot,
mzmine_annotations = mzmine_ann,
removeProtein_with1Feature = TRUE,
use_log_file = FALSE)
output_filtered_dt = data.table::as.data.table(output_filtered)

expect_equal(unique(as.character(output_filtered_dt$ProteinName)), "Caffeine")
# 2 features x 4 runs = 8 rows
expect_equal(nrow(output_filtered), 8)
expect_equal(sort(unique(as.character(output_filtered_dt$PeptideSequence))),
c("1", "6"))
# With sirius_annotations supplied ---------------------------------------------
sirius_path = system.file("tinytest/raw_data/MZMine/structure_identifications.tsv",
package = "MSstatsConvert")
sirius = data.table::fread(sirius_path)

output_sirius = MZMinetoMSstatsFormat(input, annotation = annot,
mzmine_annotations = mzmine_ann,
sirius_annotations = sirius,
use_log_file = FALSE)
output_sirius_dt = data.table::as.data.table(output_sirius)

# All 6 features still retained
expect_equal(nrow(output_sirius), 24)

# Precedence: feature 1 hit by both MZMine (Caffeine) and SIRIUS
# (DuplicateFromSirius). MZMine wins.
feature1_proteins = unique(output_sirius_dt[PeptideSequence == "1", ProteinName])
expect_equal(as.character(feature1_proteins), "Caffeine")

# SIRIUS fill: feature 4 has no MZMine annotation; SIRIUS fills "Caffeic acid"
feature4_proteins = unique(output_sirius_dt[PeptideSequence == "4", ProteinName])
expect_equal(as.character(feature4_proteins), "Caffeic acid")

# m/z-RT fallback: feature 5 has only an empty-name SIRIUS row; falls back to m/z-RT
feature5_proteins = unique(output_sirius_dt[PeptideSequence == "5", ProteinName])
expect_equal(as.character(feature5_proteins), "555.447_9.1")

# An irrelevant SIRIUS row (mappingFeatureId=99) must not introduce new features
expect_false("99" %in% as.character(output_sirius_dt$PeptideSequence))
expect_false("Ghost" %in% as.character(output_sirius_dt$ProteinName))

# sirius_annotations missing required columns triggers stop() ------------------
bad_sirius = data.frame(mappingFeatureId = 1, score = 0.9) # no 'name'
expect_error(
MZMinetoMSstatsFormat(input, annotation = annot,
mzmine_annotations = mzmine_ann,
sirius_annotations = bad_sirius,
use_log_file = FALSE),
"missing required column"
)
19 changes: 13 additions & 6 deletions man/MSstatsClean.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading