Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 126 additions & 83 deletions R/annotateProteinInfoFromIndra.R

Large diffs are not rendered by default.

9 changes: 5 additions & 4 deletions R/cytoscapeNetwork.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@
#' overlaps are surfaced as hover tooltips.
#'
#' @param nodes Data frame with at minimum an \code{id} column. Optional
#' columns: \code{logFC} (numeric), \code{hgncName}
#' (character), \code{Site} (character, underscore-separated
#' PTM site list).
#' columns: \code{logFC} (numeric), \code{entityName}
#' (character; may be semicolon-joined for multi-grounded
#' rows), \code{entityId} (character), \code{Site}
#' (character, underscore-separated PTM site list).
#' @param edges Data frame with columns \code{source}, \code{target},
#' \code{interaction}. Optional: \code{site},
#' \code{evidenceLink}.
#' @param displayLabelType \code{"id"} (default) or \code{"hgncName"} –
#' @param displayLabelType \code{"id"} (default) or \code{"entityName"} –
#' controls which column is used as the visible node label.
#' @param nodeFontSize Font size (px) for node labels. Default \code{12}.
#' @param layoutOptions Named list of dagre layout options to override the
Expand Down
13 changes: 8 additions & 5 deletions R/getSubnetworkFromIndra.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
#' Using differential abundance results from MSstats, this function retrieves
#' a subnetwork of protein interactions from INDRA database.
#'
#' @param input output of \code{\link[MSstats]{groupComparison}} function's
#' comparisionResult table, which contains a list of proteins and their
#' corresponding p-values, logFCs, along with additional HGNC ID and HGNC
#' name columns
#' @param input output of \code{\link[MSstats]{groupComparison}} function's
#' ComparisonResult table, annotated by
#' \code{\link{annotateProteinInfoFromIndra}}. Must contain \code{Protein},
#' \code{EntityNamespace}, and \code{EntityId} columns (and typically also
#' \code{EntityName}, \code{log2FC}, \code{adj.pvalue}). When an analyte
#' grounds to multiple candidates the three \code{Entity*} columns are
#' semicolon-joined and positionally aligned.
#' @param protein_level_data output of the \code{\link[MSstats]{dataProcess}}
#' function's ProteinLevelData table, which contains a list of proteins and
#' their corresponding abundances. Used for annotating correlation information
Expand Down Expand Up @@ -72,7 +75,7 @@ getSubnetworkFromIndra <- function(input,
direction = match.arg(direction)
input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_other, include_infinite_fc, direction)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a differential abundance analysis results table here. It's labeled as data-2026-06-10.csv

I noticed that getSubnetworkFromIndra fails with this dataset, but after I filter out all of the rows that have NA in the EntityName/EntityId/EntityNamespace columns, the function works fine. Could you look into the root cause? One solution I thought of was to filter out NA EntityId rows in .filterGetSubnetworkFromIndraInput, but that only makes sense if the NAs are truly what is causing the problem.

.validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter, force_include_other)
res <- .callIndraCogexApi(input$HgncId, force_include_other)
res <- .callIndraCogexApi(input$EntityNamespace, input$EntityId, force_include_other)
res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter)
edges <- .constructEdgesDataFrame(res, input, protein_level_data)
edges <- .filterEdgesDataFrame(edges, paper_count_cutoff, correlation_cutoff)
Expand Down
89 changes: 59 additions & 30 deletions R/utils_annotateProteinInfoFromIndra.R
Original file line number Diff line number Diff line change
Expand Up @@ -254,32 +254,43 @@ INDRA_API_URL = "https://discovery.indra.bio"
return(res)
}

#' Call gilda API to get HGNC IDs from HGNC names
#' @param hgncNames list of hgnc names
#' @return named character vector mapping HGNC names to HGNC IDs
#' Call Gilda API to ground entity text against any namespace
#'
#' Posts each input text to Gilda's `ground_multi` endpoint and returns
#' every grounding candidate per input (in Gilda's ranking order). When
#' `keep_only` is set, candidates whose `term$db` does not match are
#' filtered out. The canonical entity name is taken from `term$entry_name`
#' when present, falling back to `term$text` (the input string).
#' @param textInputs list of character strings to ground
#' @param keep_only optional character; if non-NULL, only candidates whose
#' `term$db == keep_only` are retained
#' @return Named list keyed by input text. Each value is a list with
#' three equal-length character vectors: `ns`, `id`, `name`,
#' positionally aligned across Gilda's returned candidates.
#' Texts with no surviving grounding are omitted from the result.
#' @importFrom jsonlite toJSON
#' @importFrom httr POST add_headers content
#' @keywords internal
#' @noRd
.callGetHgncIdsFromGildaApi <- function(hgncNames) {
if (!is.list(hgncNames)) {
.callGroundEntitiesFromGildaApi <- function(textInputs, keep_only = NULL) {

if (!is.list(textInputs)) {
stop("Input must be a list.")
}
if (any(!sapply(hgncNames, is.character))) {
stop("All elements in the list must be character strings representing hgnc names.")

if (any(!sapply(textInputs, is.character))) {
stop("All elements in the list must be character strings.")
}
if (length(hgncNames) == 0) {

if (length(textInputs) == 0) {
stop("Input list must not be empty.")
}

apiUrl <- file.path("https://grounding.indra.bio/", "ground_multi")
requestBody <- lapply(hgncNames, function(hgnc_name) {

requestBody <- lapply(textInputs, function(text_input) {
list(
text = hgnc_name,
text = text_input,
organisms = list("9606")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you check whether the results change if we remove this organisms parameter, using the dataset linked in the Google Drive? Counting the number of rows with NA entityName should be sufficient. My thinking is that we might accidentally be losing out on chemicals from other organisms (e.g. bacteria).

)
})
Expand All @@ -296,27 +307,45 @@ INDRA_API_URL = "https://discovery.indra.bio"
message("Error in API call: ", e)
NULL
})

if (is.null(res)) {
return(NULL)
}

hgnc_mapping <- character(0)

for (item in res) {
# Find the term where db == "HGNC"
hgnc_term <- NULL

grounding_map <- list()

for (i in seq_along(res)) {
item <- res[[i]]
input_text <- as.character(textInputs[[i]])

ns_vec <- character(0)
id_vec <- character(0)
name_vec <- character(0)

for (entry in item) {
if (!is.null(entry$term$db) && entry$term$db == "HGNC") {
hgnc_term <- entry$term
break
term <- entry$term
if (is.null(term) || is.null(term$db) || is.null(term$id)) next
if (!is.null(keep_only) && term$db != keep_only) next

entry_name <- if (!is.null(term$entry_name) && nzchar(term$entry_name)) {
term$entry_name
} else {
term$text
}

ns_vec <- c(ns_vec, term$db)
id_vec <- c(id_vec, term$id)
name_vec <- c(name_vec, entry_name)
}

# Only add to mapping if HGNC term was found
if (!is.null(hgnc_term)) {
hgnc_mapping[hgnc_term$text] <- hgnc_term$id

if (length(ns_vec) > 0) {
grounding_map[[input_text]] <- list(
ns = ns_vec,
id = id_vec,
name = name_vec
)
}
}
return(hgnc_mapping)

return(grounding_map)
}
28 changes: 16 additions & 12 deletions R/utils_cytoscapeNetwork.R
Original file line number Diff line number Diff line change
Expand Up @@ -238,35 +238,39 @@
rep("#D3D3D3", nrow(nodes))
}

label_col <- if (display_label_type == "hgncName" &&
"hgncName" %in% names(nodes)) "hgncName" else "id"
label_col <- if (display_label_type == "entityName" &&
"entityName" %in% names(nodes)) "entityName" else "id"

has_ptm_sites <- if ("Site" %in% names(nodes)) {
unique(nodes$id[!is.na(nodes$Site) & trimws(nodes$Site) != ""])
} else {
character(0)
}

elements <- list()
emitted_prots <- character(0)
# `emitted_cpds` and `node_type = "compound"` below refer to Cytoscape
# grouping containers used to parent PTM satellite nodes around a protein.
# This Cytoscape "compound" concept is UNRELATED to the chemical
# `proteinIdType = "Compound"` analyte type in annotateProteinInfoFromIndra.
Comment on lines +252 to +255

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now, let's use metabolite instead of compound as an enum for proteinIdType. Could you make this change? And then this comment could get removed.

emitted_cpds <- character(0)
emitted_ptm_n <- character(0)
emitted_ptm_e <- character(0)

for (i in seq_len(nrow(nodes))) {
row <- nodes[i, , drop = FALSE]
color <- node_colors[i]
has_site <- "Site" %in% names(nodes) &&
!is.na(row$Site) && trimws(row$Site) != ""
display_label <- if (label_col == "hgncName" &&
!is.na(row$hgncName) && row$hgncName != "")
row$hgncName else row$id

display_label <- if (label_col == "entityName" &&
!is.na(row$entityName) && row$entityName != "")
row$entityName else row$id

needs_compound <- row$id %in% has_ptm_sites
compound_id <- paste0(row$id, "__compound__")
# Compound container

# Cytoscape compound container (PTM grouping parent — not a chemical compound)
if (needs_compound && !(compound_id %in% emitted_cpds)) {
elements <- c(elements, list(
list(data = list(id = compound_id,
Expand Down
Loading
Loading