-
Notifications
You must be signed in to change notification settings - Fork 0
Add Compound proteinIdType and entity-agnostic grounding for metabolite networks #105
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: devel
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -254,32 +254,43 @@ INDRA_API_URL = "https://discovery.indra.bio" | |
| return(res) | ||
| } | ||
|
|
||
| #' Call gilda API to get HGNC IDs from HGNC names | ||
| #' @param hgncNames list of hgnc names | ||
| #' @return named character vector mapping HGNC names to HGNC IDs | ||
| #' Call Gilda API to ground entity text against any namespace | ||
| #' | ||
| #' Posts each input text to Gilda's `ground_multi` endpoint and returns | ||
| #' every grounding candidate per input (in Gilda's ranking order). When | ||
| #' `keep_only` is set, candidates whose `term$db` does not match are | ||
| #' filtered out. The canonical entity name is taken from `term$entry_name` | ||
| #' when present, falling back to `term$text` (the input string). | ||
| #' @param textInputs list of character strings to ground | ||
| #' @param keep_only optional character; if non-NULL, only candidates whose | ||
| #' `term$db == keep_only` are retained | ||
| #' @return Named list keyed by input text. Each value is a list with | ||
| #' three equal-length character vectors: `ns`, `id`, `name`, | ||
| #' positionally aligned across Gilda's returned candidates. | ||
| #' Texts with no surviving grounding are omitted from the result. | ||
| #' @importFrom jsonlite toJSON | ||
| #' @importFrom httr POST add_headers content | ||
| #' @keywords internal | ||
| #' @noRd | ||
| .callGetHgncIdsFromGildaApi <- function(hgncNames) { | ||
| if (!is.list(hgncNames)) { | ||
| .callGroundEntitiesFromGildaApi <- function(textInputs, keep_only = NULL) { | ||
|
|
||
| if (!is.list(textInputs)) { | ||
| stop("Input must be a list.") | ||
| } | ||
| if (any(!sapply(hgncNames, is.character))) { | ||
| stop("All elements in the list must be character strings representing hgnc names.") | ||
|
|
||
| if (any(!sapply(textInputs, is.character))) { | ||
| stop("All elements in the list must be character strings.") | ||
| } | ||
| if (length(hgncNames) == 0) { | ||
|
|
||
| if (length(textInputs) == 0) { | ||
| stop("Input list must not be empty.") | ||
| } | ||
|
|
||
| apiUrl <- file.path("https://grounding.indra.bio/", "ground_multi") | ||
| requestBody <- lapply(hgncNames, function(hgnc_name) { | ||
|
|
||
| requestBody <- lapply(textInputs, function(text_input) { | ||
| list( | ||
| text = hgnc_name, | ||
| text = text_input, | ||
| organisms = list("9606") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you check if the results change (i.e. counting number of rows with NA entityName should be sufficient) if we remove this parameter for organisms (i.e. with the dataset linked in the Google Drive)? My thinking is that we might accidentally be losing out on chemicals from other organisms (e.g. bacteria). |
||
| ) | ||
| }) | ||
|
|
@@ -296,27 +307,45 @@ INDRA_API_URL = "https://discovery.indra.bio" | |
| message("Error in API call: ", e) | ||
| NULL | ||
| }) | ||
|
|
||
| if (is.null(res)) { | ||
| return(NULL) | ||
| } | ||
|
|
||
| hgnc_mapping <- character(0) | ||
|
|
||
| for (item in res) { | ||
| # Find the term where db == "HGNC" | ||
| hgnc_term <- NULL | ||
|
|
||
| grounding_map <- list() | ||
|
|
||
| for (i in seq_along(res)) { | ||
| item <- res[[i]] | ||
| input_text <- as.character(textInputs[[i]]) | ||
|
|
||
| ns_vec <- character(0) | ||
| id_vec <- character(0) | ||
| name_vec <- character(0) | ||
|
|
||
| for (entry in item) { | ||
| if (!is.null(entry$term$db) && entry$term$db == "HGNC") { | ||
| hgnc_term <- entry$term | ||
| break | ||
| term <- entry$term | ||
| if (is.null(term) || is.null(term$db) || is.null(term$id)) next | ||
| if (!is.null(keep_only) && term$db != keep_only) next | ||
|
|
||
| entry_name <- if (!is.null(term$entry_name) && nzchar(term$entry_name)) { | ||
| term$entry_name | ||
| } else { | ||
| term$text | ||
| } | ||
|
|
||
| ns_vec <- c(ns_vec, term$db) | ||
| id_vec <- c(id_vec, term$id) | ||
| name_vec <- c(name_vec, entry_name) | ||
| } | ||
|
|
||
| # Only add to mapping if HGNC term was found | ||
| if (!is.null(hgnc_term)) { | ||
| hgnc_mapping[hgnc_term$text] <- hgnc_term$id | ||
|
|
||
| if (length(ns_vec) > 0) { | ||
| grounding_map[[input_text]] <- list( | ||
| ns = ns_vec, | ||
| id = id_vec, | ||
| name = name_vec | ||
| ) | ||
| } | ||
| } | ||
| return(hgnc_mapping) | ||
|
|
||
| return(grounding_map) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -238,35 +238,39 @@ | |
| rep("#D3D3D3", nrow(nodes)) | ||
| } | ||
|
|
||
| label_col <- if (display_label_type == "hgncName" && | ||
| "hgncName" %in% names(nodes)) "hgncName" else "id" | ||
| label_col <- if (display_label_type == "entityName" && | ||
| "entityName" %in% names(nodes)) "entityName" else "id" | ||
|
|
||
| has_ptm_sites <- if ("Site" %in% names(nodes)) { | ||
| unique(nodes$id[!is.na(nodes$Site) & trimws(nodes$Site) != ""]) | ||
| } else { | ||
| character(0) | ||
| } | ||
|
|
||
| elements <- list() | ||
| emitted_prots <- character(0) | ||
| # `emitted_cpds` and `node_type = "compound"` below refer to Cytoscape | ||
| # grouping containers used to parent PTM satellite nodes around a protein. | ||
| # This Cytoscape "compound" concept is UNRELATED to the chemical | ||
| # `proteinIdType = "Compound"` analyte type in annotateProteinInfoFromIndra. | ||
|
Comment on lines
+252
to
+255
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For now, let's use |
||
| emitted_cpds <- character(0) | ||
| emitted_ptm_n <- character(0) | ||
| emitted_ptm_e <- character(0) | ||
|
|
||
| for (i in seq_len(nrow(nodes))) { | ||
| row <- nodes[i, , drop = FALSE] | ||
| color <- node_colors[i] | ||
| has_site <- "Site" %in% names(nodes) && | ||
| !is.na(row$Site) && trimws(row$Site) != "" | ||
| display_label <- if (label_col == "hgncName" && | ||
| !is.na(row$hgncName) && row$hgncName != "") | ||
| row$hgncName else row$id | ||
|
|
||
| display_label <- if (label_col == "entityName" && | ||
| !is.na(row$entityName) && row$entityName != "") | ||
| row$entityName else row$id | ||
|
|
||
| needs_compound <- row$id %in% has_ptm_sites | ||
| compound_id <- paste0(row$id, "__compound__") | ||
| # Compound container | ||
|
|
||
| # Cytoscape compound container (PTM grouping parent — not a chemical compound) | ||
| if (needs_compound && !(compound_id %in% emitted_cpds)) { | ||
| elements <- c(elements, list( | ||
| list(data = list(id = compound_id, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added a differential abundance analysis results table here. It's labeled as
`data-2026-06-10.csv`. I noticed that
`getSubnetworkFromIndra` fails with this dataset, but after I filter out all of the rows that have NA in the `EntityName`/`EntityId`/`EntityNamespace` columns, the function works fine. Could you look into the root cause? One solution I thought of was to filter out NA `EntityId` rows in `.filterGetSubnetworkFromIndraInput`, but that would only be appropriate if the NAs are truly causing the problems.