From 20d2764dae5d3f33861cb1f62b44ff96c2da56f9 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 22 Apr 2026 13:39:20 -0400 Subject: [PATCH 01/16] corrected importFrom statement in parentage functions and ped_check --- NAMESPACE | 1 + R/check_ped.R | 5 +++-- R/validate_pedigree.R | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c919736..4b5cb05 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(validate_pedigree) import(dplyr) import(janitor) import(parallel) diff --git a/R/check_ped.R b/R/check_ped.R index 0ebbd5a..e71e79b 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -38,10 +38,11 @@ #' messy_ids <- ped_errors$messy_parents$id #' print(messy_ids) #' -#' @import dplyr -#' @import janitor +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows +#' @importFrom janitor clean_names #' @importFrom stats setNames #' @importFrom utils read.table +#' @importFrom tools file_path_sans_ext #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 8b2ecbc..5b366c9 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -51,6 +51,7 @@ #' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} #' } #' @export +#' @importFrom data.table fread fwrite copy data.table set rbindlist validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, trio_error_threshold = 5.0, From f2d3f408c2793bffe3cd25f8b726327343be2e73 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 22 Apr 2026 13:43:10 -0400 Subject: [PATCH 02/16] updated NAMESPACE to reflect updated importFrom statements --- NAMESPACE | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 4b5cb05..1fde3fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,7 +25,6 @@ export(thinSNP) export(updog2vcf) export(validate_pedigree) import(dplyr) -import(janitor) import(parallel) import(quadprog) import(stringr) @@ -47,15 +46,20 @@ importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) +importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,distinct) importFrom(dplyr,filter) +importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) +importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) +importFrom(janitor,clean_names) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) @@ -66,6 +70,7 @@ importFrom(stats,cor) importFrom(stats,reorder) importFrom(stats,setNames) importFrom(tibble,as_tibble) +importFrom(tools,file_path_sans_ext) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) From c1fe54372792b4eb3519fd03475c444304f82d6b Mon Sep 17 00:00:00 2001 From: Josue Chinchilla-Vargas Date: Wed, 22 Apr 2026 13:58:21 -0400 Subject: [PATCH 03/16] Update R/check_ped.R Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/check_ped.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_ped.R b/R/check_ped.R index e71e79b..0065f78 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -38,7 +38,7 @@ #' messy_ids <- ped_errors$messy_parents$id #' print(messy_ids) #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n #' @importFrom janitor clean_names #' @importFrom stats setNames #' @importFrom utils read.table From 0a13483db94177fc605d9e8130058301e7ea38af Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Fri, 24 Apr 2026 13:52:51 -0400 Subject: [PATCH 04/16] updated error handling and reporting for check_ped, Also major improvement for documentation on check_ped --- .Rbuildignore | 2 + .gitignore | 1 + NAMESPACE | 7 - R/check_ped.R | 331 ++++++++++-------- man/check_ped.Rd | 30 +- tests/testthat/corrected_pedigree.txt | 4 - tests/testthat/test-check_ped.R | 463 +++++++++++++++++++++++++- 7 files changed, 645 insertions(+), 193 deletions(-) delete mode 100644 tests/testthat/corrected_pedigree.txt diff --git a/.Rbuildignore b/.Rbuildignore index 307d5d0..50c7896 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,3 +10,5 @@ ^package-code\.Rmd$ ^docs$ ^CRAN-SUBMISSION$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index d3ffaad..e056686 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata revdep/ .DS_Store +.positai diff --git a/NAMESPACE b/NAMESPACE index 1fde3fd..ba1c1d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,20 +46,15 @@ importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) -importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) -importFrom(dplyr,distinct) importFrom(dplyr,filter) -importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) importFrom(dplyr,select) importFrom(dplyr,summarise) -importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) -importFrom(janitor,clean_names) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) @@ -68,9 +63,7 @@ importFrom(reshape2,melt) importFrom(rlang,sym) importFrom(stats,cor) importFrom(stats,reorder) -importFrom(stats,setNames) importFrom(tibble,as_tibble) -importFrom(tools,file_path_sans_ext) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/R/check_ped.R b/R/check_ped.R index 0065f78..71a889e 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,13 +1,13 @@ #' Check a pedigree file for accuracy and report/correct common errors #' -#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `male_parent`, `female_parent` in any order) #' and performs quality checks, optionally correcting or flagging errors. #' #' The function checks for: #' * Exact duplicate rows and removes them (keeping one copy) -#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") -#' * IDs that appear in both sire and dam columns -#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * IDs that appear more than once with conflicting male_parent/female_parent assignments (sets both to "0") +#' * IDs that appear in both male_parent and female_parent columns +#' * Missing parents (IDs referenced as male_parent/female_parent but not in `id` column), adds them with both = "0" #' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' #' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. @@ -18,124 +18,92 @@ #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional seed for reproducibility. -#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' @param verbose Logical. If TRUE (default), prints findings and prompts for saving. +#' @param correct Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues. +#' @param save_zip Logical. If TRUE, writes each report component as a .txt file and bundles them into a .zip archive in the working directory. +#' @param save_corrected_zip Logical. If TRUE (and save_zip = TRUE and correct = TRUE), also writes the corrected pedigree into the .zip archive. #' -#' @return A list of data.frames containing detected issues: -#' * `exact_duplicates`: rows that were exact duplicates -#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam -#' * `messy_parents`: IDs appearing as both sire and dam -#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam -#' * `dependencies`: detected cycles in the pedigree +#' @return A named list of data.frames with offending rows (including row numbers): +#' * `exact_duplicates` +#' * `repeated_ids_diff` +#' * `inconsistent_sex_roles` +#' * `missing_parents` +#' * `dependencies` #' -#' @examples -#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) -#' -#' # Access messy parents -#' ped_errors$messy_parents -#' -#' # IDs with messy parents -#' messy_ids <- ped_errors$messy_parents$id -#' print(messy_ids) -#' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select #' @importFrom janitor clean_names #' @importFrom stats setNames -#' @importFrom utils read.table +#' @importFrom utils read.table write.table zip #' @importFrom tools file_path_sans_ext #' @export -check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { +check_ped <- function(ped.file, + seed = NULL, + verbose = TRUE, + correct = TRUE, + save_zip = FALSE, + save_corrected_zip = FALSE) { #### setup #### if (!is.null(seed)) set.seed(seed) - - # Read and clean data data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( - id = as.character(id), - sire = as.character(sire), - dam = as.character(dam) + id = as.character(id), + male_parent = as.character(male_parent), + female_parent = as.character(female_parent) ) - - original_data <- data + # Add row numbers before any processing + data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) #### check 1: exact duplicates #### - exact_duplicates <- data[duplicated(data), ] - if (nrow(exact_duplicates) > 0) { - data <- distinct(data) # remove exact duplicates - } + exact_duplicates <- data[ + duplicated(data %>% select(-row_number)) | + duplicated(data %>% select(-row_number), fromLast = TRUE), + ] - #### check 2: repeated IDs with conflicting sire/dam #### - repeated_ids <- data %>% + #### check 2: repeated IDs with conflicting male_parent/female_parent #### + repeated_ids_diff <- data %>% group_by(id) %>% filter(n() > 1) %>% + filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% ungroup() - # Only IDs with actual conflicting sire/dam - conflicting_ids <- repeated_ids %>% - group_by(id) %>% - filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% - ungroup() - - if (nrow(conflicting_ids) > 0) { - # Keep one row per ID, set sire/dam to "0" - data <- data %>% - group_by(id) %>% - summarize( - sire = if(n_distinct(sire) > 1) "0" else first(sire), - dam = if(n_distinct(dam) > 1) "0" else first(dam), - .groups = "drop" - ) - } - - repeated_ids_report <- conflicting_ids - - #### check 3: missing parents #### - for (i in seq_len(nrow(data))) { - id <- data$id[i] - sire <- data$sire[i] - dam <- data$dam[i] - - if (sire != "0" && sire != id && !sire %in% data$id) { - missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) - } - if (dam != "0" && dam != id && !dam %in% data$id) { - missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) - } - - if (sire == id || dam == id) { - errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) - } - } - - missing_parents <- distinct(missing_parents) - if (nrow(missing_parents) > 0) { - data <- bind_rows(data, missing_parents) - } - - #### check 4: messy parents #### - sire_ids <- unique(data$sire[data$sire != "0"]) - dam_ids <- unique(data$dam[data$dam != "0"]) - messy_ids <- intersect(sire_ids, dam_ids) - messy_parents <- data %>% filter(id %in% messy_ids) + #### check 3: inconsistent parent sex roles #### + male_ids <- unique(data$male_parent[data$male_parent != "0"]) + female_ids <- unique(data$female_parent[data$female_parent != "0"]) + messy_ids <- intersect(male_ids, female_ids) + inconsistent_sex_roles <- data %>% + filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + + #### check 4: missing parents #### + all_ids <- unique(data$id) + ref_ids <- unique(c(data$male_parent, data$female_parent)) + ref_ids <- ref_ids[ref_ids != "0"] + missing_ids <- setdiff(ref_ids, all_ids) + missing_parents <- data %>% + filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + # For the corrected pedigree, build founder rows for truly absent parents + missing_founders <- data.frame( + row_number = NA_integer_, + id = missing_ids, + male_parent = "0", + female_parent = "0", + stringsAsFactors = FALSE + ) #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { row <- data[data$id == x, ] - c(row$sire, row$dam) + c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id - dfs <- function(node, visited, rec_stack, path) { - visited[node] <- TRUE + visited[node] <- TRUE rec_stack[node] <- TRUE - path <- append(path, node) + path <- append(path, node) cycles <- list() - for (neighbor in adj_list[[node]]) { if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { @@ -146,94 +114,163 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } - rec_stack[node] <- FALSE return(cycles) } - - visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) - if (length(node_cycles) > 0) { - all_cycles <- append(all_cycles, node_cycles) - } + if (length(node_cycles) > 0) all_cycles <- append(all_cycles, node_cycles) } } return(all_cycles) } - cycles <- detect_all_cycles(data) + cycles <- detect_all_cycles(data) + cycle_ids <- character(0) if (length(cycles) > 0) { for (cycle_group in cycles) { - cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) + ids <- unique(unlist(cycle_group)) + cycle_ids <- unique(c(cycle_ids, ids)) + errors <- append(errors, + paste("Cycle detected involving IDs:", + paste(ids, collapse = " -> "))) } } + dependencies <- data %>% filter(id %in% cycle_ids) + if (nrow(dependencies) == 0 && length(errors) > 0) { + dependencies <- data.frame(Dependency = unique(unlist(errors)), + stringsAsFactors = FALSE) + } - #### compile findings #### + #### compile report #### input_ped_report <- list( - exact_duplicates = exact_duplicates, - repeated_ids_diff = repeated_ids_report, - messy_parents = messy_parents, - missing_parents = missing_parents, - dependencies = data.frame(Dependency = unique(unlist(errors))) + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_diff, + inconsistent_sex_roles = inconsistent_sex_roles, + missing_parents = missing_parents, + dependencies = dependencies ) + #### build corrected pedigree (no row_number column) #### + if (correct) { + corrected <- data %>% select(-row_number) + # Remove exact duplicates + corrected <- distinct(corrected) + # Resolve conflicting IDs + if (nrow(repeated_ids_diff) > 0) { + corrected <- corrected %>% + group_by(id) %>% + summarize( + male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), + female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), + .groups = "drop" + ) + } + # Add missing founders + if (length(missing_ids) > 0) { + corrected <- bind_rows( + corrected, + missing_founders %>% select(-row_number) + ) + } + } + #### file names #### - file_base <- tools::file_path_sans_ext(basename(ped.file)) + file_base <- tools::file_path_sans_ext(basename(ped.file)) corrected_name <- paste0(file_base, "_corrected") - report_name <- paste0(file_base, "_report") + report_name <- paste0(file_base, "_report") + zip_name <- paste0(file_base, "_report.zip") #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") - - if (nrow(exact_duplicates) > 0) { - cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") - print(exact_duplicates) - } else cat("\nNo exact duplicate trios found.\n") - - if (nrow(repeated_ids_report) > 0) { - cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") - print(repeated_ids_report) - } else cat("\nNo conflicting repeated trios found.\n") - - if (nrow(missing_parents) > 0) { - cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") - print(missing_parents) - } else cat("\nNo missing parents found.\n") - - if (nrow(messy_parents) > 0) { - cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") - print(messy_parents) - } else cat("\nNo IDs found as both sire and dam.\n") - - - if (nrow(input_ped_report$dependencies) > 0) { - cat("\nDependencies detected:\n") - print(input_ped_report$dependencies) - } else cat("\nNo dependencies detected.\n") - - #### interactive save #### - cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) - ans <- tolower(trimws(readline())) - if (ans == "y") { - assign(corrected_name, data, envir = .GlobalEnv) - assign("input_ped_report", input_ped_report, envir = .GlobalEnv) - cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) + cat("\n--- Exact Duplicates ---\n") + if (nrow(exact_duplicates) > 0) print(exact_duplicates) else cat("None found.\n") + cat("\n--- Conflicting IDs ---\n") + if (nrow(repeated_ids_diff) > 0) print(repeated_ids_diff) else cat("None found.\n") + cat("\n--- Inconsistent Parent Sex Roles ---\n") + if (nrow(inconsistent_sex_roles) > 0) print(inconsistent_sex_roles) else cat("None found.\n") + cat("\n--- Missing Parents (rows that reference them) ---\n") + if (nrow(missing_parents) > 0) print(missing_parents) else cat("None found.\n") + cat("\n--- Cycles / Dependencies ---\n") + if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") + if (correct) { + cat(paste0("\nSave corrected pedigree as `", corrected_name, "`? (y/n): ")) + ans <- tolower(trimws(readline())) + if (ans == "y") { + assign(corrected_name, corrected, envir = .GlobalEnv) + assign(report_name, input_ped_report, envir = .GlobalEnv) + cat(paste0("Saved `", corrected_name, "` and `", report_name, "`.\n")) + } else { + cat("No corrected pedigree saved.\n") + assign(report_name, input_ped_report, envir = .GlobalEnv) + } } else { - cat("No corrected pedigree was saved.\n") + cat("\n`correct = FALSE`: no changes made to the pedigree.\n") + assign(report_name, input_ped_report, envir = .GlobalEnv) } - } else { - # Silent automatic mode - assign(corrected_name, data, envir = .GlobalEnv) + # Silent mode: always save report; save corrected only if correct = TRUE + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) assign(report_name, input_ped_report, envir = .GlobalEnv) } + #### zip export #### + if (save_zip) { + tmp_dir <- tempfile() + dir.create(tmp_dir) + zip_files <- character(0) + + # Section headers matching the console output labels + section_labels <- c( + exact_duplicates = "Exact Duplicates", + repeated_ids_diff = "Conflicting IDs", + inconsistent_sex_roles = "Inconsistent Parent Sex Roles", + missing_parents = "Missing Parents (rows that reference them)", + dependencies = "Cycles / Dependencies" + ) + + for (component in names(section_labels)) { + df <- input_ped_report[[component]] + file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) + + if (nrow(df) == 0) { + # Write header + "None found." for clean empty reports + writeLines( + c(paste0("--- ", section_labels[[component]], " ---"), "None found."), + con = file_path + ) + } else { + # Write header, then column names, then data — avoids col.names warning + file_con <- file(file_path, open = "wt") + writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) + writeLines(paste(colnames(df), collapse = "\t"), con = file_con) + close(file_con) + utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, + quote = FALSE, append = TRUE, col.names = FALSE) + } + + zip_files <- c(zip_files, file_path) + } + + # Optionally include corrected pedigree in zip + if (save_corrected_zip && correct) { + corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) + utils::write.table(corrected, file = corrected_path, sep = "\t", + row.names = FALSE, quote = FALSE) + zip_files <- c(zip_files, corrected_path) + } + + # Bundle all files into zip in the working directory + zip_path <- file.path(getwd(), zip_name) + utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") + if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) + } + invisible(input_ped_report) } + diff --git a/man/check_ped.Rd b/man/check_ped.Rd index ea63de7..0d1df79 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -4,23 +4,25 @@ \alias{check_ped} \title{Check a pedigree file for accuracy and report/correct common errors} \usage{ -check_ped(ped.file, seed = NULL, verbose = TRUE) +check_ped(ped.file, seed = NULL, verbose = TRUE, correct = TRUE) } \arguments{ \item{ped.file}{Path to the pedigree text file.} \item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} +\item{verbose}{Logical. If TRUE (default), prints findings and prompts for saving.} + +\item{correct}{Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues.} } \value{ -A list of data.frames containing detected issues: +A named list of data.frames with offending rows (including row numbers): \itemize{ -\item \code{exact_duplicates}: rows that were exact duplicates -\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam -\item \code{messy_parents}: IDs appearing as both sire and dam -\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam -\item \code{dependencies}: detected cycles in the pedigree +\item \code{exact_duplicates} +\item \code{repeated_ids_diff} +\item \code{inconsistent_sex_roles} +\item \code{missing_parents} +\item \code{dependencies} } } \description{ @@ -45,15 +47,3 @@ The function does \strong{not} overwrite the input file. Instead, it prints find \item A report listing all detected issues } } -\examples{ -ped_file <- system.file("check_ped_test.txt", package = "BIGr") -ped_errors <- check_ped(ped.file = ped_file, seed = 101919) - -# Access messy parents -ped_errors$messy_parents - -# IDs with messy parents -messy_ids <- ped_errors$messy_parents$id -print(messy_ids) - -} diff --git a/tests/testthat/corrected_pedigree.txt b/tests/testthat/corrected_pedigree.txt deleted file mode 100644 index 0434b0c..0000000 --- a/tests/testthat/corrected_pedigree.txt +++ /dev/null @@ -1,4 +0,0 @@ -ID Male_Parent Female_Parent -IND_C IND_A IND_B -IND_D 0 IND_A -GHOST IND_A IND_B diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index 706143f..64d13ca 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -1,22 +1,455 @@ -context("Imputation Concordance") +context("check_ped – Pedigree Quality Checks") +# --------------------------------------------------------------------------- +# Helper: write a data.frame to a temp tab-separated file and return the path +# --------------------------------------------------------------------------- +write_ped <- function(df) { + f <- tempfile(fileext = ".txt") + utils::write.table(df, f, sep = "\t", row.names = FALSE, quote = FALSE) + f +} -test_that("test imputation",{ - #Input variables - ped_file <- system.file("check_ped_test.txt", package="BIGr") +# =========================================================================== +# 1. Return structure +# Tests that check_ped returns a named list of exactly 5 data frame +# components covering all five issue categories. +# =========================================================================== +test_that("check_ped returns a named list of length 5", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE, correct = FALSE) - #Calculations - output.list <- check_ped(ped_file, - seed = 101919, - verbose = FALSE) + expect_type(out, "list") + expect_length(out, 5) + expect_named(out, c( + "exact_duplicates", + "repeated_ids_diff", + "inconsistent_sex_roles", + "missing_parents", + "dependencies" + )) +}) + +# =========================================================================== +# 2. Component types +# Tests that every element in the returned list is a data.frame, ensuring +# downstream code can safely call nrow(), colnames(), etc. on any component. +# =========================================================================== +test_that("check_ped report components are data.frames", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_true(is.data.frame(out$exact_duplicates)) + expect_true(is.data.frame(out$repeated_ids_diff)) + expect_true(is.data.frame(out$inconsistent_sex_roles)) + expect_true(is.data.frame(out$missing_parents)) + expect_true(is.data.frame(out$dependencies)) +}) + +# =========================================================================== +# 3. Clean pedigree → empty report +# Tests that a structurally valid pedigree with no issues produces zero rows +# in every report component (no false positives). +# =========================================================================== +test_that("clean pedigree produces no issues", { + ped <- data.frame( + id = c("G1", "G2", "P1"), + male_parent = c("0", "0", "G1"), + female_parent = c("0", "0", "G2") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$exact_duplicates), 0) + expect_equal(nrow(out$repeated_ids_diff), 0) + expect_equal(nrow(out$inconsistent_sex_roles), 0) + expect_equal(nrow(out$missing_parents), 0) + expect_equal(nrow(out$dependencies), 0) +}) + +# =========================================================================== +# 4. Exact duplicate detection and correction +# Tests that rows sharing the same (id, male_parent, female_parent) triplet +# are flagged, and that correct = TRUE retains only one copy of each. +# =========================================================================== +test_that("check_ped detects exact duplicates", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$exact_duplicates), 2) + expect_true(all(out$exact_duplicates$id == "A")) +}) + +test_that("correct = TRUE removes exact duplicates", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # Only one "A" row should remain + expect_equal(sum(corrected$id == "A"), 1) +}) + +# =========================================================================== +# 5. Repeated IDs with conflicting parent assignments +# Tests that the same ID appearing on multiple rows with different parent +# values is flagged, and that correct = TRUE blanks only the conflicting +# parent field while preserving consistent values. +# =========================================================================== +test_that("check_ped detects repeated IDs with conflicting parents", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$repeated_ids_diff), 2) + expect_true(all(out$repeated_ids_diff$id == "A")) +}) + +test_that("correct = TRUE resolves conflicting IDs: conflicting field -> '0', consistent field kept", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), # conflicting -> should become "0" + female_parent = c("M", "M", "0") # consistent -> should stay "M" + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + a_row <- corrected[corrected$id == "A", ] + + expect_equal(nrow(a_row), 1) + expect_equal(a_row$male_parent, "0") + expect_equal(a_row$female_parent, "M") +}) + +# =========================================================================== +# 6. Inconsistent parent sex roles +# Tests that an individual ID appearing in both male_parent and female_parent +# columns is flagged. Also confirms that correct = TRUE does NOT auto-resolve +# this issue since the correct fix requires manual review. +# =========================================================================== +test_that("check_ped detects inconsistent parent sex roles", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_true("inconsistent_sex_roles" %in% names(out)) + expect_gt(nrow(out$inconsistent_sex_roles), 0) + + # Both rows that reference P should be flagged + expect_true(any(out$inconsistent_sex_roles$male_parent == "P" | + out$inconsistent_sex_roles$female_parent == "P")) +}) + +test_that("correct = TRUE does NOT auto-correct inconsistent sex roles", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # P should still appear with its original (inconsistent) parent assignments + expect_true(any(corrected$male_parent == "P" | + corrected$female_parent == "P")) +}) + +# =========================================================================== +# 7. Missing parent detection and founder row injection +# Tests that parent IDs referenced in male_parent/female_parent but absent +# from the id column are flagged. Confirms that correct = TRUE appends a +# founder row (both parents = "0") for each missing parent. +# =========================================================================== +test_that("check_ped detects missing parents", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_equal(nrow(out$missing_parents), 1) + expect_true(all(out$missing_parents$id == "B")) +}) + +test_that("correct = TRUE adds missing parents as founder rows", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # X and Y should now appear as founder rows + expect_true("X" %in% corrected$id) + expect_true("Y" %in% corrected$id) + x_row <- corrected[corrected$id == "X", ] + y_row <- corrected[corrected$id == "Y", ] + expect_equal(x_row$male_parent, "0") + expect_equal(x_row$female_parent, "0") + expect_equal(y_row$male_parent, "0") + expect_equal(y_row$female_parent, "0") +}) + +# =========================================================================== +# 8. Pedigree cycle / dependency detection +# Tests that circular ancestry chains (e.g. A is parent of B and B is parent +# of A) are detected and reported. Also confirms that correct = TRUE does NOT +# silently remove cycle-involved individuals, as cycles require manual review. +# =========================================================================== +test_that("check_ped detects a direct two-node cycle", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + + expect_gt(nrow(out$dependencies), 0) + expect_true(all(c("A", "B") %in% out$dependencies$id)) +}) + +test_that("check_ped does NOT auto-correct cycles", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + corrected <- get(corr_name, envir = .GlobalEnv) + + # Cycle-involved IDs should still be present (no silent removal) + expect_true("A" %in% corrected$id) + expect_true("B" %in% corrected$id) +}) + +# =========================================================================== +# 9. Global environment assignments +# Tests that the report list is always written to the global environment, +# that the corrected pedigree is written only when correct = TRUE, and that +# the internal row_number column does not leak into the corrected output. +# =========================================================================== +test_that("report is always assigned to global environment", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + rep_name <- paste0(file_base, "_report") + on.exit(if (exists(rep_name, envir = .GlobalEnv)) + rm(list = rep_name, envir = .GlobalEnv)) + + check_ped(f, verbose = FALSE, correct = FALSE) + + expect_true(exists(rep_name, envir = .GlobalEnv)) + expect_type(get(rep_name, envir = .GlobalEnv), "list") +}) + +test_that("corrected pedigree is assigned to global env when correct = TRUE", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = TRUE) + + expect_true(exists(corr_name, envir = .GlobalEnv)) + corrected <- get(corr_name, envir = .GlobalEnv) + expect_true(is.data.frame(corrected)) + + # row_number must not leak into corrected output + expect_false("row_number" %in% names(corrected)) + expect_true(all(c("id", "male_parent", "female_parent") %in% names(corrected))) +}) + +test_that("corrected pedigree is NOT assigned to global env when correct = FALSE", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + on.exit({ + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = FALSE) + + expect_false(exists(corr_name, envir = .GlobalEnv)) + expect_true(exists(rep_name, envir = .GlobalEnv)) +}) + +# =========================================================================== +# 10. Missing required columns → hard stop +# Tests that a file lacking the canonical column names (after alias remapping) +# causes an immediate error rather than silently producing empty results. +# Column names like animal_id/parent1/parent2 are not in the alias list and +# therefore cannot be remapped, triggering the validation stop. +# =========================================================================== +test_that("check_ped errors when required columns are missing", { + bad_df <- data.frame( + animal_id = c("a", "b"), + parent1 = c("0", "a"), + parent2 = c("0", "0") + ) + expect_error( + check_ped(write_ped(bad_df), verbose = FALSE), + "Input file is missing required column" + ) +}) + +# =========================================================================== +# 11. Integration test using the bundled check_ped_test.txt fixture +# End-to-end test against a real fixture file shipped with the package. +# Validates the overall row counts and that known problematic individuals +# (grandfather2, grandfather3) are flagged for inconsistent sex roles. +# =========================================================================== +test_that("integration test with bundled fixture file", { + ped_file <- system.file("check_ped_test.txt", package = "BIGr") + skip_if(ped_file == "", "Bundled fixture file not found; skipping integration test.") + + out <- check_ped(ped_file, seed = 101919, verbose = FALSE) + + expect_length(out, 5) + + # Inconsistent sex roles should flag grandfather2 & grandfather3 + expect_true(all( + c("grandfather2", "grandfather3") %in% out$inconsistent_sex_roles$male_parent | + c("grandfather2", "grandfather3") %in% out$inconsistent_sex_roles$female_parent + )) + expect_equal(nrow(out$missing_parents), 8) +}) + +# =========================================================================== +# 12. Zip export (save_zip = TRUE / save_corrected_zip = TRUE) +# Tests that save_zip = TRUE produces a .zip archive in the working directory, +# and that save_corrected_zip = TRUE includes the corrected pedigree file +# inside that archive when correct = TRUE. +# =========================================================================== +test_that("save_zip = TRUE creates a zip archive in the working directory", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + zip_name <- paste0(file_base, "_report.zip") + rep_name <- paste0(file_base, "_report") + zip_path <- file.path(getwd(), zip_name) + + on.exit({ + if (file.exists(zip_path)) file.remove(zip_path) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) + + check_ped(f, verbose = FALSE, correct = FALSE, save_zip = TRUE) + + expect_true(file.exists(zip_path)) +}) + +test_that("save_zip + save_corrected_zip + correct = TRUE includes corrected file in zip", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + f <- write_ped(ped) + file_base <- tools::file_path_sans_ext(basename(f)) + zip_name <- paste0(file_base, "_report.zip") + corr_name <- paste0(file_base, "_corrected") + rep_name <- paste0(file_base, "_report") + zip_path <- file.path(getwd(), zip_name) - #Check - df_length <- length(output.list) - messy_parents <- output.list$messy_parents - missing_parents <- output.list$missing_parents + on.exit({ + if (file.exists(zip_path)) file.remove(zip_path) + if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) + if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) + }) - expect_true(df_length == 5) # Before was 4 - expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) - expect_true(nrow(missing_parents) == 13) + check_ped(f, verbose = FALSE, correct = TRUE, + save_zip = TRUE, save_corrected_zip = TRUE) + expect_true(file.exists(zip_path)) + zip_contents <- utils::unzip(zip_path, list = TRUE)$Name + expect_true(any(grepl("_corrected", zip_contents))) }) From 9bbc5077612b73e356b0498ab88036ead189b88b Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Fri, 24 Apr 2026 13:52:51 -0400 Subject: [PATCH 05/16] updated error handling and reporting for check_ped --- NAMESPACE | 11 ++++ R/check_ped.R | 164 ++++++++++++++++++++++++----------------------- man/check_ped.Rd | 64 +++++++++--------- 3 files changed, 129 insertions(+), 110 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ba1c1d1..4e518d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,13 +46,20 @@ importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) +importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,distinct) importFrom(dplyr,filter) +importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) importFrom(dplyr,mutate) +importFrom(dplyr,n) +importFrom(dplyr,n_distinct) +importFrom(dplyr,row_number) importFrom(dplyr,select) importFrom(dplyr,summarise) +importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) importFrom(pwalign,nucleotideSubstitutionMatrix) @@ -63,13 +70,17 @@ importFrom(reshape2,melt) importFrom(rlang,sym) importFrom(stats,cor) importFrom(stats,reorder) +importFrom(stats,setNames) importFrom(tibble,as_tibble) +importFrom(tools,file_path_sans_ext) +importFrom(utils,capture.output) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) importFrom(utils,tail) importFrom(utils,write.csv) importFrom(utils,write.table) +importFrom(utils,zip) importFrom(vcfR,extract.gt) importFrom(vcfR,maf) importFrom(vcfR,read.vcfR) diff --git a/R/check_ped.R b/R/check_ped.R index 71a889e..e1d9407 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,39 +1,37 @@ -#' Check a pedigree file for accuracy and report/correct common errors +#' Check a pedigree file for structural issues and optionally apply corrections #' -#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `male_parent`, `female_parent` in any order) -#' and performs quality checks, optionally correcting or flagging errors. +#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and +#' `female_parent` and checks for five classes of issues: missing required +#' columns, exact duplicate rows, repeated IDs with conflicting parent +#' assignments, inconsistent parent sex roles, and pedigree cycles. The input +#' file is never modified. A report list is always assigned to the global +#' environment; when `correct = TRUE`, a corrected pedigree is assigned as well. #' -#' The function checks for: -#' * Exact duplicate rows and removes them (keeping one copy) -#' * IDs that appear more than once with conflicting male_parent/female_parent assignments (sets both to "0") -#' * IDs that appear in both male_parent and female_parent columns -#' * Missing parents (IDs referenced as male_parent/female_parent but not in `id` column), adds them with both = "0" -#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant -#' -#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -#' -#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: -#' * Corrected pedigree as a dataframe in the global environment -#' * A report listing all detected issues +#' When `correct = TRUE`, the function removes exact duplicates, collapses +#' conflicting repeated IDs (replacing the inconsistent parent field with `"0"`), +#' and appends missing parents as founder rows. Inconsistent sex roles and +#' pedigree cycles are reported but not corrected and must be resolved manually. #' #' @param ped.file Path to the pedigree text file. -#' @param seed Optional seed for reproducibility. -#' @param verbose Logical. If TRUE (default), prints findings and prompts for saving. -#' @param correct Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues. -#' @param save_zip Logical. If TRUE, writes each report component as a .txt file and bundles them into a .zip archive in the working directory. -#' @param save_corrected_zip Logical. If TRUE (and save_zip = TRUE and correct = TRUE), also writes the corrected pedigree into the .zip archive. +#' @param seed Optional integer seed for reproducibility. +#' @param verbose Logical. If `TRUE` (default), prints findings to the console. +#' @param correct Logical. If `TRUE` (default), builds and assigns a corrected +#' pedigree to the global environment. If `FALSE`, issues are only reported. +#' @param save_zip Logical. If `TRUE`, bundles report components into a `.zip` +#' archive in the working directory. +#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = TRUE` and +#' `correct = TRUE`), includes the corrected pedigree in the `.zip` archive. #' -#' @return A named list of data.frames with offending rows (including row numbers): -#' * `exact_duplicates` -#' * `repeated_ids_diff` -#' * `inconsistent_sex_roles` -#' * `missing_parents` -#' * `dependencies` +#' @return An invisible named list of data frames describing detected issues: +#' * `exact_duplicates`: exact duplicate rows. +#' * `repeated_ids_diff`: rows where the same `id` has conflicting parents. +#' * `inconsistent_sex_roles`: rows involving IDs used as both parents. +#' * `missing_parents`: rows referencing parent IDs absent from `id`. +#' * `dependencies`: rows whose `id` is involved in a pedigree cycle. #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select -#' @importFrom janitor clean_names +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number #' @importFrom stats setNames -#' @importFrom utils read.table write.table zip +#' @importFrom utils read.table write.table zip capture.output #' @importFrom tools file_path_sans_ext #' @export check_ped <- function(ped.file, @@ -42,40 +40,61 @@ check_ped <- function(ped.file, correct = TRUE, save_zip = FALSE, save_corrected_zip = FALSE) { - #### setup #### if (!is.null(seed)) set.seed(seed) - data <- utils::read.table(ped.file, header = TRUE) %>% - janitor::clean_names() %>% + data <- utils::read.table(ped.file, header = TRUE) + # ── Column alias remapping ────────────────────────────────────────────────── + # Accept common alternative column names and silently remap them so the rest + # of the function always works with the canonical names. + col_aliases <- list( + id = c("id", "ID", "animal", "Animal", "ind", "Ind"), + male_parent = c("male_parent", "sire", "Sire", "father", "Father", "pat", "Pat"), + female_parent = c("female_parent", "dam", "Dam", "mother", "Mother", "mat", "Mat") + ) + for (canonical in names(col_aliases)) { + if (!canonical %in% colnames(data)) { + match <- intersect(col_aliases[[canonical]], colnames(data)) + if (length(match) > 0) { + colnames(data)[colnames(data) == match[1]] <- canonical + } + } + } + # Validate required column names before any processing + required_cols <- c("id", "male_parent", "female_parent") + missing_cols <- setdiff(required_cols, colnames(data)) + if (length(missing_cols) > 0) { + stop( + "Input file is missing required column(s): ", + paste(missing_cols, collapse = ", "), + ".\nExpected columns: id, male_parent, female_parent." + ) + } + data <- data %>% mutate( id = as.character(id), male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) + data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() - #### check 1: exact duplicates #### exact_duplicates <- data[ duplicated(data %>% select(-row_number)) | duplicated(data %>% select(-row_number), fromLast = TRUE), ] - #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% group_by(id) %>% filter(n() > 1) %>% filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% ungroup() - #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% filter(male_parent %in% messy_ids | female_parent %in% messy_ids) - #### check 4: missing parents #### all_ids <- unique(data$id) ref_ids <- unique(c(data$male_parent, data$female_parent)) @@ -83,15 +102,24 @@ check_ped <- function(ped.file, missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% filter(male_parent %in% missing_ids | female_parent %in% missing_ids) - # For the corrected pedigree, build founder rows for truly absent parents - missing_founders <- data.frame( - row_number = NA_integer_, - id = missing_ids, - male_parent = "0", - female_parent = "0", - stringsAsFactors = FALSE - ) - + # Only build founder rows if there are actually missing parents to add + if (length(missing_ids) > 0) { + missing_founders <- data.frame( + row_number = NA_integer_, + id = missing_ids, + male_parent = "0", + female_parent = "0", + stringsAsFactors = FALSE + ) + } else { + missing_founders <- data.frame( + row_number = integer(0), + id = character(0), + male_parent = character(0), + female_parent = character(0), + stringsAsFactors = FALSE + ) + } #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { @@ -128,7 +156,6 @@ check_ped <- function(ped.file, } return(all_cycles) } - cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -145,7 +172,6 @@ check_ped <- function(ped.file, dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) } - #### compile report #### input_ped_report <- list( exact_duplicates = exact_duplicates, @@ -154,7 +180,6 @@ check_ped <- function(ped.file, missing_parents = missing_parents, dependencies = dependencies ) - #### build corrected pedigree (no row_number column) #### if (correct) { corrected <- data %>% select(-row_number) @@ -178,13 +203,11 @@ check_ped <- function(ped.file, ) } } - #### file names #### file_base <- tools::file_path_sans_ext(basename(ped.file)) corrected_name <- paste0(file_base, "_corrected") report_name <- paste0(file_base, "_report") zip_name <- paste0(file_base, "_report.zip") - #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") @@ -199,32 +222,20 @@ check_ped <- function(ped.file, cat("\n--- Cycles / Dependencies ---\n") if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") if (correct) { - cat(paste0("\nSave corrected pedigree as `", corrected_name, "`? (y/n): ")) - ans <- tolower(trimws(readline())) - if (ans == "y") { - assign(corrected_name, corrected, envir = .GlobalEnv) - assign(report_name, input_ped_report, envir = .GlobalEnv) - cat(paste0("Saved `", corrected_name, "` and `", report_name, "`.\n")) - } else { - cat("No corrected pedigree saved.\n") - assign(report_name, input_ped_report, envir = .GlobalEnv) - } + cat(paste0("\`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) } else { - cat("\n`correct = FALSE`: no changes made to the pedigree.\n") - assign(report_name, input_ped_report, envir = .GlobalEnv) + cat("\`correct = FALSE`: no changes made to the pedigree.\n") } - } else { - # Silent mode: always save report; save corrected only if correct = TRUE - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) - assign(report_name, input_ped_report, envir = .GlobalEnv) } - + # Always save report to global environment + assign(report_name, input_ped_report, envir = .GlobalEnv) + # Save corrected pedigree if correct = TRUE + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) #### zip export #### if (save_zip) { tmp_dir <- tempfile() dir.create(tmp_dir) zip_files <- character(0) - # Section headers matching the console output labels section_labels <- c( exact_duplicates = "Exact Duplicates", @@ -233,19 +244,15 @@ check_ped <- function(ped.file, missing_parents = "Missing Parents (rows that reference them)", dependencies = "Cycles / Dependencies" ) - for (component in names(section_labels)) { df <- input_ped_report[[component]] file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) - if (nrow(df) == 0) { - # Write header + "None found." for clean empty reports writeLines( c(paste0("--- ", section_labels[[component]], " ---"), "None found."), con = file_path ) } else { - # Write header, then column names, then data — avoids col.names warning file_con <- file(file_path, open = "wt") writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) writeLines(paste(colnames(df), collapse = "\t"), con = file_con) @@ -253,24 +260,19 @@ check_ped <- function(ped.file, utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, quote = FALSE, append = TRUE, col.names = FALSE) } - zip_files <- c(zip_files, file_path) } - - # Optionally include corrected pedigree in zip if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", row.names = FALSE, quote = FALSE) zip_files <- c(zip_files, corrected_path) } - - # Bundle all files into zip in the working directory zip_path <- file.path(getwd(), zip_name) - utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") + invisible(capture.output( + utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") + )) if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } - invisible(input_ped_report) } - diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 0d1df79..bbae7c3 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,54 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check a pedigree file for accuracy and report/correct common errors} +\title{Check a pedigree file for structural issues and optionally apply corrections} \usage{ -check_ped(ped.file, seed = NULL, verbose = TRUE, correct = TRUE) +check_ped( + ped.file, + seed = NULL, + verbose = TRUE, + correct = TRUE, + save_zip = FALSE, + save_corrected_zip = FALSE +) } \arguments{ \item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility.} +\item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. If TRUE (default), prints findings and prompts for saving.} +\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the console.} -\item{correct}{Logical. If TRUE (default), saves a corrected pedigree. If FALSE, only reports issues.} +\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a corrected +pedigree to the global environment. If \code{FALSE}, issues are only reported.} + +\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a \code{.zip} +archive in the working directory.} + +\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and +\code{correct = TRUE}), includes the corrected pedigree in the \code{.zip} archive.} } \value{ -A named list of data.frames with offending rows (including row numbers): +An invisible named list of data frames describing detected issues: \itemize{ -\item \code{exact_duplicates} -\item \code{repeated_ids_diff} -\item \code{inconsistent_sex_roles} -\item \code{missing_parents} -\item \code{dependencies} +\item \code{exact_duplicates}: exact duplicate rows. +\item \code{repeated_ids_diff}: rows where the same \code{id} has conflicting parents. +\item \code{inconsistent_sex_roles}: rows involving IDs used as both parents. +\item \code{missing_parents}: rows referencing parent IDs absent from \code{id}. +\item \code{dependencies}: rows whose \code{id} is involved in a pedigree cycle. } } \description{ -\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) -and performs quality checks, optionally correcting or flagging errors. +Reads a tab-separated pedigree file with columns \code{id}, \code{male_parent}, and +\code{female_parent} and checks for five classes of issues: missing required +columns, exact duplicate rows, repeated IDs with conflicting parent +assignments, inconsistent parent sex roles, and pedigree cycles. The input +file is never modified. A report list is always assigned to the global +environment; when \code{correct = TRUE}, a corrected pedigree is assigned as well. } \details{ -The function checks for: -\itemize{ -\item Exact duplicate rows and removes them (keeping one copy) -\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") -\item IDs that appear in both sire and dam columns -\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" -\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant -} - -After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. - -The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: -\itemize{ -\item Corrected pedigree as a dataframe in the global environment -\item A report listing all detected issues -} +When \code{correct = TRUE}, the function removes exact duplicates, collapses +conflicting repeated IDs (replacing the inconsistent parent field with \code{"0"}), +and appends missing parents as founder rows. Inconsistent sex roles and +pedigree cycles are reported but not corrected and must be resolved manually. } From f798362faad1ccc1df8c3630b5d3387c65d3f74e Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 27 Apr 2026 14:07:03 -0400 Subject: [PATCH 06/16] updated parentage functions to work allow user to display NA or not, and to plot results. --- R/find_parentage.R | 320 +++++++++------- R/validate_pedigree.R | 254 ++++++------ man/find_parentage.Rd | 155 +++----- man/validate_pedigree.Rd | 40 +- tests/testthat/test-find_parentage.R | 196 +++++++--- tests/testthat/test-validate_pedigree.R | 490 ++++++++++++++---------- 6 files changed, 850 insertions(+), 605 deletions(-) diff --git a/R/find_parentage.R b/R/find_parentage.R index 728822a..f53a323 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -7,104 +7,62 @@ #' Must include an 'ID' column followed by marker columns coded as 0, 1, 2 #' (allele dosage). #' @param parents_file Path to a TSV/CSV/TXT file listing candidate parent IDs. -#' Must include an 'ID' column. An optional 'Sex' column with values -#' 'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines -#' which parents are tested for each role. If absent, all parents are treated -#' as ambiguous. -#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs to assign. +#' Must include an 'ID' column. An optional 'Sex' column ('M', 'F', or 'A') +#' determines parent roles. If absent, all parents are treated as ambiguous. +#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs. #' Must include an 'ID' column. -#' @param method Character. Parentage assignment method. One of: -#' \itemize{ -#' \item \code{"best_male_parent"} — finds the best male parent for each -#' progeny using homozygous mismatch rate. -#' \item \code{"best_female_parent"} — finds the best female parent for each -#' progeny using homozygous mismatch rate. -#' \item \code{"best_match"} — finds the single best parent (either sex) -#' using homozygous mismatch rate. -#' \item \code{"best_pair"} — finds the best male-female parent pair for -#' each progeny using full Mendelian error rate (default). -#' } -#' @param min_markers Integer. Minimum number of non-missing markers required -#' to report a parentage assignment. Progeny-parent comparisons with fewer -#' markers are flagged as \code{LOW_MARKERS} and no assignment is made -#' (default: \code{10}). -#' @param error_threshold Numeric. Maximum mismatch percentage to report a -#' parentage assignment as confident. Assignments above this threshold are -#' flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column -#' (default: \code{5.0}). Must be between 0 and 100. -#' @param show_ties Logical. If \code{TRUE}, all tied best pairs (after -#' tie-breaking by maximum markers tested) are reported as additional columns -#' (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when -#' \code{method = "best_pair"}. The base columns (\code{Male_Parent}, -#' \code{Female_Parent}, etc.) are always populated with the top result. -#' If \code{FALSE}, only one tied pair is reported with a warning. -#' Default is \code{TRUE}. -#' @param allow_selfing Logical. If \code{FALSE}, male-female parent pairs where -#' both IDs are identical are excluded when \code{method = "best_pair"}. -#' Default is \code{TRUE}. -#' @param verbose Logical. If \code{TRUE}, prints progress messages, summary -#' statistics, and the results table to the console. Default is \code{TRUE}. +#' @param method Character. One of \code{"best_male_parent"}, +#' \code{"best_female_parent"}, \code{"best_match"}, or +#' \code{"best_pair"} (default). See Details. +#' @param min_markers Integer. Minimum markers required for a valid assignment. +#' Fewer markers flags \code{LOW_MARKERS} (default: \code{10}). +#' @param error_threshold Numeric. Maximum mismatch percentage for a confident +#' assignment. Exceeded values flag \code{HIGH_ERROR} (default: \code{5.0}). +#' Must be between 0 and 100. +#' @param show_ties Logical. If \code{TRUE}, tied best pairs are reported as +#' additional suffix columns (e.g. \code{Male_Parent_2}) when +#' \code{method = "best_pair"}. Default is \code{TRUE}. +#' @param allow_selfing Logical. If \code{FALSE}, pairs with identical male and +#' female parent IDs are excluded. Default is \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints progress, summary, and +#' results to the console. Default is \code{TRUE}. #' @param write_txt Logical. If \code{TRUE}, writes results to -#' \code{parentage_results_dt.txt} in the working directory. Default is -#' \code{TRUE}. +#' \code{parentage_testing_results.txt}. Default is \code{TRUE}. +#' @param na_string Character. String for missing values in output file. +#' Use \code{"NA"} or \code{""} (default: \code{"NA"}). +#' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error +#' distribution colored by assignment status. Requires \code{ggplot2}. +#' Plot is saved as \code{.jpg} when \code{write_txt = TRUE}. +#' Default is \code{TRUE}. #' -#' @return A \code{data.table} with one row per progeny. Columns depend on the -#' method used: -#' \itemize{ -#' \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: -#' \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -#' \code{Markers_Tested}, \code{Assignment_Status}. -#' \item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, -#' \code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, -#' \code{Markers_Tested}, \code{Assignment_Status}. -#' \item \code{best_pair} (ties remain after tie-breaking, -#' \code{show_ties = TRUE}): base columns are always populated with the -#' top result, plus suffix columns \code{Male_Parent_1}, -#' \code{Female_Parent_1}, etc. for each tied pair. -#' } -#' \code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or -#' \code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. +#' @return A \code{data.table} with one row per progeny. Base columns are +#' \code{Progeny}, \code{Male_Parent} (or \code{Best_Match}), +#' \code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, +#' and \code{Assignment_Status} (\code{PASS}, \code{HIGH_ERROR}, or +#' \code{LOW_MARKERS}). Tie suffix columns (\code{Male_Parent_2}, etc.) are +#' appended when \code{show_ties = TRUE} and ties remain after tie-breaking. +#' Returned invisibly when \code{verbose = TRUE}. #' #' @details -#' A homozygous-only genotype matrix is pre-computed once at startup and shared -#' across all methods that require it, avoiding redundant computation. -#' -#' For \code{"best_male_parent"}, \code{"best_female_parent"}, and -#' \code{"best_match"}, only homozygous markers (coded 0 or 2) are used for -#' comparison; heterozygous markers (coded 1) are set to \code{NA}. This -#' reduces false mismatches caused by phase ambiguity. -#' -#' For \code{"best_pair"}, all markers are used and full Mendelian inheritance -#' rules are applied. Mismatch rates and comparison counts are computed across -#' all progeny simultaneously using vectorised \code{vapply} calls, producing -#' \code{n_pairs x n_progeny} matrices and giving substantial speed gains for -#' large datasets. Both matrices are explicitly coerced to matrix form to -#' handle the edge case of a single parent pair correctly. -#' -#' When multiple pairs share the lowest Mendelian error rate, ties are broken -#' by selecting the pair(s) with the greatest number of markers tested. If ties -#' still remain after this step, all remaining tied pairs are reported when -#' \code{show_ties = TRUE}. +#' A homozygous-only matrix is pre-computed once and shared across +#' \code{"best_male_parent"}, \code{"best_female_parent"}, and +#' \code{"best_match"}, where heterozygous markers (coded 1) are set to +#' \code{NA} to reduce false mismatches from phase ambiguity. #' -#' The base columns (\code{Male_Parent}, \code{Female_Parent}, -#' \code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with -#' the top result, ensuring no missing values in these columns regardless of -#' tie status. +#' For \code{"best_pair"}, full Mendelian inheritance rules are applied across +#' all marker and pair combinations using vectorised \code{vapply} calls, +#' producing \code{n_pairs x n_progeny} mismatch and comparison matrices. #' -#' Output rows are pre-allocated as a \code{data.table} and filled by reference -#' using \code{data.table::set()}, avoiding repeated memory allocation during -#' the results-building step. +#' Ties on mismatch rate are broken by maximum markers tested. Remaining ties +#' are reported as suffix columns when \code{show_ties = TRUE}. Base columns +#' are always populated with the top result. #' -#' Individuals in \code{parents_file} or \code{progeny_file} that are absent -#' from \code{genotypes_file} are removed with a warning. -#' -#' Progeny with fewer non-missing markers than \code{min_markers} are flagged -#' \code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best -#' match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +#' Parents or progeny absent from \code{genotypes_file} are removed with a +#' warning. Progeny below \code{min_markers} are flagged \code{LOW_MARKERS}; +#' those exceeding \code{error_threshold} are flagged \code{HIGH_ERROR}. #' #' @examples #' \dontrun{ -#' # Assign best male-female parent pair to each progeny #' results <- find_parentage( #' genotypes_file = "genotypes.txt", #' parents_file = "parents.txt", @@ -113,10 +71,11 @@ #' min_markers = 50, #' error_threshold = 5.0, #' show_ties = TRUE, -#' allow_selfing = FALSE +#' allow_selfing = FALSE, +#' na_string = "NA", +#' plot_results = TRUE #' ) #' -#' # Find best individual parent match (ignoring sex) #' results <- find_parentage( #' genotypes_file = "genotypes.txt", #' parents_file = "parents.txt", @@ -136,8 +95,10 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE) { - + write_txt = TRUE, + na_string = "NA", + plot_results = TRUE) { + #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") if (!method %in% allowed_methods) @@ -146,7 +107,9 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("min_markers must be a positive integer.") if (error_threshold < 0 || error_threshold > 100) stop("error_threshold must be between 0 and 100.") - + if (!na_string %in% c("NA", "")) + stop("na_string must be either 'NA' or ''.") + tryCatch({ genos <- data.table::fread(genotypes_file, sep = "auto") all_parents <- data.table::fread(parents_file, sep = "auto") @@ -154,7 +117,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + valid_ids <- genos$ID removed_parents <- base::setdiff(all_parents$ID, valid_ids) if (base::length(removed_parents) > 0) { @@ -162,48 +125,50 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, paste(removed_parents, collapse = ", "), call. = FALSE) all_parents <- all_parents[ID %in% valid_ids] } - + removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) if (base::length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) progeny_candidates <- progeny_candidates[ID %in% valid_ids] } - + if (!"Sex" %in% base::colnames(all_parents)) { warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") all_parents[, Sex := "A"] } - + all_parents[, Sex := base::toupper(Sex)] male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] - + if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) warning("No valid female parent candidates remain after filtering.", call. = FALSE) if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") - + #### Pre-compute genotype matrices once (shared across all methods) #### + # Full genotype matrix used by best_pair genos_mat <- base::as.matrix(genos, rownames = "ID") - + # Homozygous-only matrix (het markers set to NA) used by hom methods genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - + #### Assignment_Status from markers and error rate #### + # Returns LOW_MARKERS, HIGH_ERROR, or PASS assign_status <- function(markers, error_pct) { base::ifelse(markers < min_markers, "LOW_MARKERS", base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) } - + #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { parent_ids <- base::switch(method, @@ -211,11 +176,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - + # Subset pre-computed homozygous matrix for relevant parents and progeny - parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] - progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] - + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + # Pre-allocate results data.table; fill by reference with set() n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( @@ -225,26 +190,26 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + for (i in base::seq_len(n_progeny)) { progeny_vec <- progeny_genos[i, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA - + best_idx <- base::which.min(percent_mismatch) - + # No candidate found — flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { data.table::set(results_dt, i, "Markers_Tested", 0L) data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") next } - + best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - + data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) @@ -252,7 +217,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } final_df <- results_dt } - + #### Logic for Best Pair Method #### if (method == "best_pair") { parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, @@ -262,18 +227,17 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - + # Pre-extract parent genotype row blocks for vectorised operations male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - + # Subset full genotype matrix to progeny only progeny_ids <- progeny_candidates$ID progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) - - # Vectorised mismatch computation across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case mismatch_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -292,8 +256,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - - # Vectorised comparison count across ALL progeny at once + # Wrapped in matrix() to handle the n_pairs = 1 edge case comparison_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -304,11 +267,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - + # Percent mismatch matrix: n_pairs x n_progeny pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA - + # Pre-allocate base results data.table; tie columns added dynamically results_dt <- data.table::data.table( Progeny = progeny_ids, @@ -318,7 +281,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + # Per-progeny result extraction from pre-computed matrices results_list <- base::vector("list", n_progeny) for (j in base::seq_len(n_progeny)) { @@ -326,48 +289,48 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, percent_mismatch <- pct_mismatch_mat[, j] comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - + # No markers overlap at all — flag LOW_MARKERS if (base::is.infinite(min_mismatch_val)) { data.table::set(results_dt, j, "Markers_Tested", 0L) data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") next } - + best_indices <- base::which(percent_mismatch == min_mismatch_val) - + # Tie-break: prefer pair(s) with the most markers tested if (base::length(best_indices) > 1) { best_markers_per_pair <- comparisons[best_indices] max_markers <- base::max(best_markers_per_pair) best_indices <- best_indices[best_markers_per_pair == max_markers] } - + best_pairs <- parent_pairs[best_indices] best_markers <- comparisons[best_indices[1]] best_error <- base::round(min_mismatch_val, 2) a_status <- assign_status(best_markers, best_error) - + if (!show_ties && base::nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) } - + num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - + # Always populate base columns with the top result data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) data.table::set(results_dt, j, "Assignment_Status", a_status) - + # If ties remain after tie-breaking and show_ties is TRUE, # store tie details for later column binding if (show_ties && num_to_report > 1) { tie_row <- base::list(Progeny = prog_id) - for (k in base::seq_len(num_to_report)) { + for (k in base::seq(2, num_to_report)) { tie_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] tie_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] tie_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val @@ -376,19 +339,24 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, results_list[[j]] <- data.table::as.data.table(tie_row) } } - + # Merge any tie suffix columns onto the pre-allocated base table tie_rows <- data.table::rbindlist( base::Filter(Negate(base::is.null), results_list), - fill = TRUE + fill = TRUE, + use.names = TRUE ) if (base::nrow(tie_rows) > 0) { final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) + # Replace any empty strings with NA + for (col in base::names(final_df)) { + data.table::set(final_df, which(final_df[[col]] == ""), col, NA_character_) + } } else { final_df <- results_dt } } - + #### Summary #### if (verbose) { total <- base::nrow(final_df) @@ -401,18 +369,98 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::cat("Min markers threshold :", min_markers, "\n") base::cat("Error threshold :", error_threshold, "%\n\n") } - + #### Output #### if (write_txt) { - output_filename <- "parentage_results_dt.txt" + output_filename <- "parentage_testing_results.txt" tryCatch({ - data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) + data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE, + na = na_string) if (verbose) base::cat("Results successfully written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } - + + #### Plot Results #### + if (plot_results) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) + } else { + plot_df <- final_df[!is.na(final_df$Mendelian_Error_Pct)] + plot_df$Mendelian_Error_Pct <- base::as.numeric(plot_df$Mendelian_Error_Pct) + + plot_df$Plot_Status <- dplyr::case_when( + plot_df$Assignment_Status == "PASS" ~ "PASS", + plot_df$Assignment_Status == "HIGH_ERROR" ~ "HIGH_ERROR", + plot_df$Assignment_Status == "LOW_MARKERS" ~ "LOW_MARKERS", + TRUE ~ "OTHER" + ) + + n_total <- base::nrow(plot_df) + n_pass <- base::sum(plot_df$Assignment_Status == "PASS", na.rm = TRUE) + n_high <- base::sum(plot_df$Assignment_Status == "HIGH_ERROR", na.rm = TRUE) + n_low <- base::sum(plot_df$Assignment_Status == "LOW_MARKERS", na.rm = TRUE) + + threshold_label <- base::paste0( + "Error Threshold: ", error_threshold, "% | ", + "Pass: ", n_pass, " | ", + "High Error: ", n_high, " | ", + "Low Markers: ", n_low + ) + + p <- ggplot2::ggplot( + plot_df, + ggplot2::aes(x = Mendelian_Error_Pct, fill = Plot_Status) + ) + + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + + ggplot2::geom_vline( + xintercept = error_threshold, + linetype = "dashed", + color = "black", + linewidth = 1 + ) + + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_y_continuous(breaks = seq(0, 10000, by = 5)) + + ggplot2::scale_fill_manual( + values = c( + "PASS" = "#339900", + "HIGH_ERROR" = "#cc3333", + "LOW_MARKERS" = "#F1C40F", + "OTHER" = "#BDC3C7" + ), + labels = c( + "PASS" = "Pass", + "HIGH_ERROR" = "High Error", + "LOW_MARKERS" = "Low Markers", + "OTHER" = "Other" + ) + ) + + ggplot2::labs( + title = "Parentage Mendelian Error Distribution", + subtitle = base::paste0("Progeny Tested: ", n_total, "\n \n", threshold_label), + x = "Mendelian Error (%)", + y = "Number of Progeny", + fill = "Status" + ) + + ggplot2::theme_classic(base_size = 13) + + ggplot2::theme(legend.position = "top") + + base::print(p) + + if (write_txt) { + plot_filename <- base::sub("\\.[^.]+$", ".jpg", "parentage_testing_results.txt") + tryCatch({ + ggplot2::ggsave(plot_filename, plot = p, + device = "jpeg", width = 10, height = 6, dpi = 300) + if (verbose) base::cat("Plot saved to:", plot_filename, "\n") + }, error = function(e) { + warning("Could not save plot. Error: ", e$message, call. = FALSE) + }) + } + } + } + if (verbose) { base::cat("\n--- Parentage Assignment Results ---\n") base::print(final_df) diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 5b366c9..a3526ce 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -23,57 +23,67 @@ #' parent-level evaluation (default: 2.0). Must be between 0 and 100. #' @param verbose Logical. If TRUE, prints progress messages, a summary #' table, and results to the console (default: TRUE). -#' @param write_txt Logical. If TRUE, writes validation results to -#' output_filename (default: TRUE). -#' @param output_filename Character. Path/name of the output file -#' (default: "pedigree_validation_results.txt"). +#' @param write_results Logical. If TRUE, writes the validation results to +#' output_filename and saves the plot as a .jpg file (default: TRUE). +#' @param output_filename Character. Path/name of the output results file +#' (default: "__validation_report.txt"). The plot will be saved using the +#' same base name with a .jpg extension. +#' @param plot_results Logical. If TRUE, prints a histogram of Trio Mendelian +#' Error percentages with a threshold line (default: TRUE). If write_results +#' is also TRUE, the plot is additionally saved as a .jpg file. +#' @param na_string Character. String for missing values in the output file. +#' Use \code{"NA"} or \code{""} (default: \code{"NA"}). #' #' @return A data.table (returned invisibly) with one row per trio and #' the following columns: #' \describe{ #' \item{ID}{Individual ID.} -#' \item{Male_Parent}{Declared male parent ID.} -#' \item{Female_Parent}{Declared female parent ID.} -#' \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -#' \item{Markers_Tested}{Number of markers with non-missing genotypes.} +#' \item{Orig_Male_Parent}{Declared male parent ID.} +#' \item{Orig_Female_Parent}{Declared female parent ID.} +#' \item{Trio_Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +#' \item{Trio_Markers_Tested}{Number of markers with non-missing genotypes.} #' \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, #' MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, #' or NO_GENOTYPE_DATA.} -#' \item{Correction_Decision}{One of NONE, KEEP_BOTH, +#' \item{Recommended_Correction}{One of NONE, KEEP_BOTH, #' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, #' LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, #' LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} #' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} #' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -#' \item{Best_Male_Parent}{Best-matching male parent candidate ID.} -#' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} -#' \item{Best_Female_Parent}{Best-matching female parent candidate ID.} -#' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} +#' \item{Best_Male_Candidate}{Best-matching male parent candidate ID.} +#' \item{Best_Male_Candidate_Error_Pct}{Homozygous mismatch percentage for the best male candidate.} +#' \item{Best_Female_Candidate}{Best-matching female parent candidate ID.} +#' \item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} #' } #' @export -#' @importFrom data.table fread fwrite copy data.table set rbindlist +#' @importFrom data.table fread fwrite copy data.table set rbindlist validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, trio_error_threshold = 5.0, min_markers = 10, single_parent_error_threshold = 2.0, verbose = TRUE, - write_txt = TRUE, - output_filename = "pedigree_validation_results.txt") { - + write_results = TRUE, + output_filename = "__validation_report.txt", + plot_results = TRUE, + na_string = "NA") { + #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) stop("trio_error_threshold must be between 0 and 100") if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) stop("single_parent_error_threshold must be between 0 and 100") - + if (!na_string %in% c("NA", "")) + stop("na_string must be either 'NA' or ''.") + tryCatch({ pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") genos <- data.table::fread(genotypes_file, sep = "auto") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + #### Check required columns #### required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) @@ -82,12 +92,11 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::paste(missing_cols, collapse = ", ")) if (!"ID" %in% base::names(genos)) stop("Genotypes file must have an 'ID' column") - - # Ensure parent columns are character for consistent "0" comparisons + pedigree[, Male_Parent := as.character(Male_Parent)] pedigree[, Female_Parent := as.character(Female_Parent)] original_pedigree <- data.table::copy(pedigree) - + #### Read founders list #### if (!is.null(founders_file)) { founders_raw <- tryCatch({ @@ -99,42 +108,36 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { founder_ids <- character(0) } - + #### Identify trios missing from the genotype file #### valid_ids <- as.character(genos$ID) has_geno <- pedigree[ID %in% valid_ids & (Male_Parent %in% valid_ids | Male_Parent == "0") & (Female_Parent %in% valid_ids | Female_Parent == "0")] - no_geno_rows <- pedigree[!(ID %in% valid_ids) | (!(Male_Parent %in% valid_ids) & Male_Parent != "0") | (!(Female_Parent %in% valid_ids) & Female_Parent != "0")] - if (base::nrow(no_geno_rows) > 0 && verbose) base::cat("Found", base::nrow(no_geno_rows), "trios with missing genotype data; flagged as NO_GENOTYPE_DATA.\n") - pedigree <- has_geno if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") - + #### Build genotype matrices #### genos_mat <- base::as.matrix(genos, rownames = "ID") - - # Homozygous-only matrix (het markers set to NA) genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - - #### Helper: find best matching parent via homozygous mismatch #### + + #### Find best matching parent via homozygous mismatch #### find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { candidates <- base::setdiff(base::rownames(genos_hom_mat), c(prog_id, exclude_ids)) if (base::length(candidates) == 0) return(base::list(id = NA_character_, error_pct = NA_real_)) - prog_hom <- genos_hom_mat[prog_id, ] errors <- base::sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] @@ -142,20 +145,16 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (comparisons == 0) return(NA_real_) (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) - best_idx <- base::which.min(errors) base::list(id = candidates[best_idx], error_pct = base::round(errors[best_idx], 2)) } - + #### Main trio evaluation loop #### results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { - prog_id <- pedigree$ID[i] male_parent_id <- pedigree$Male_Parent[i] female_parent_id <- pedigree$Female_Parent[i] - - # Default values correction_decision <- "NONE" error_pct <- NA_real_ status <- "NO_DATA" @@ -166,51 +165,37 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_male_parent_pct <- NA_real_ best_female_parent <- NA_character_ best_female_parent_pct <- NA_real_ - - ## Founder check — both parents "0" and ID in founders list + if (male_parent_id == "0" && female_parent_id == "0" && prog_id %in% founder_ids) { status <- "FOUNDERS" correction_decision <- "NONE" - } else { - - ## Missing parent(s) — recommendations only, "0"s preserved in pedigree if (male_parent_id == "0" && female_parent_id == "0") { status <- "MISSING_BOTH_PARENTS" correction_decision <- "NONE" - best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - best_f <- find_best_parent(prog_id, exclude_ids = c(best_m$id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct - } else if (male_parent_id == "0" && female_parent_id != "0") { status <- "MISSING_MALE_PARENT" correction_decision <- "NONE" - best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - } else if (male_parent_id != "0" && female_parent_id == "0") { status <- "MISSING_FEMALE_PARENT" correction_decision <- "NONE" - best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct - } else { - - ## Both parents present — Mendelian error calculation progeny_vec <- genos_mat[prog_id, ] male_parent_vec <- genos_mat[male_parent_id, ] female_parent_vec <- genos_mat[female_parent_id, ] - mismatches <- base::sum( (male_parent_vec == 0 & female_parent_vec == 0 & progeny_vec > 0) | (male_parent_vec == 2 & female_parent_vec == 2 & progeny_vec < 2) | @@ -222,19 +207,14 @@ validate_pedigree <- function(pedigree_file, genotypes_file, (male_parent_vec == 2 & female_parent_vec == 0)) & (progeny_vec != 1), na.rm = TRUE ) - markers_tested <- base::sum(!base::is.na(male_parent_vec) & !base::is.na(female_parent_vec) & !base::is.na(progeny_vec)) - if (markers_tested == 0) { status <- "NO_DATA" correction_decision <- "NONE" - } else { error_pct <- (mismatches / markers_tested) * 100 - - # LOW_MARKERS still computes parent mismatch/recommendations if (markers_tested < min_markers) { status <- "LOW_MARKERS" } else if (error_pct <= trio_error_threshold) { @@ -243,32 +223,24 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { status <- "FAIL" } - - # Run parent-level evaluation for both FAIL and LOW_MARKERS if (status %in% c("FAIL", "LOW_MARKERS")) { - - # Homozygous mismatch per parent progeny_hom <- genos_hom_mat[prog_id, ] male_parent_hom <- genos_hom_mat[male_parent_id, ] female_parent_hom <- genos_hom_mat[female_parent_id, ] - male_comparisons <- base::sum(!base::is.na(male_parent_hom) & !base::is.na(progeny_hom)) male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else base::round((base::sum(male_parent_hom != progeny_hom, na.rm = TRUE) / male_comparisons) * 100, 2) - female_comparisons <- base::sum(!base::is.na(female_parent_hom) & !base::is.na(progeny_hom)) female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else base::round((base::sum(female_parent_hom != progeny_hom, na.rm = TRUE) / female_comparisons) * 100, 2) - male_acceptable <- !is.na(male_parent_error_pct) && male_parent_error_pct <= single_parent_error_threshold female_acceptable <- !is.na(female_parent_error_pct) && female_parent_error_pct <= single_parent_error_threshold - if (male_acceptable && female_acceptable) { correction_decision <- "KEEP_BOTH" } else if (male_acceptable && !female_acceptable) { @@ -290,61 +262,58 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } - - # Do not alter corrected pedigree for LOW_MARKERS rows if (status == "LOW_MARKERS") correction_decision <- paste0("LOW_MARKERS_", correction_decision) } } } } - + data.table::data.table( - ID = prog_id, - Male_Parent = male_parent_id, - Female_Parent = female_parent_id, - Mendelian_Error_Pct = base::round(error_pct, 2), - Markers_Tested = markers_tested, - Status = status, - Correction_Decision = correction_decision, - Male_Parent_Hom_Error_Pct = male_parent_error_pct, - Female_Parent_Hom_Error_Pct = female_parent_error_pct, - Best_Male_Parent = best_male_parent, - Best_Male_Parent_Error_Pct = best_male_parent_pct, - Best_Female_Parent = best_female_parent, - Best_Female_Parent_Error_Pct = best_female_parent_pct + ID = prog_id, + Orig_Male_Parent = male_parent_id, + Orig_Female_Parent = female_parent_id, + Trio_Mendelian_Error_Pct = base::round(error_pct, 2), + Trio_Markers_Tested = markers_tested, + Status = status, + Recommended_Correction = correction_decision, + Male_Parent_Hom_Error_Pct = male_parent_error_pct, + Female_Parent_Hom_Error_Pct = female_parent_error_pct, + Best_Male_Candidate = best_male_parent, + Best_Male_Candidate_Error_Pct = best_male_parent_pct, + Best_Female_Candidate = best_female_parent, + Best_Female_Candidate_Error_Pct = best_female_parent_pct ) }) - + final_df <- data.table::rbindlist(results_list) - - #### Append NO_GENOTYPE_DATA rows to the final report #### + + #### Append NO_GENOTYPE_DATA rows #### if (base::nrow(no_geno_rows) > 0) { no_geno_df <- data.table::data.table( - ID = no_geno_rows$ID, - Male_Parent = no_geno_rows$Male_Parent, - Female_Parent = no_geno_rows$Female_Parent, - Mendelian_Error_Pct = NA_real_, - Markers_Tested = 0L, - Status = "NO_GENOTYPE_DATA", - Correction_Decision = "NONE", - Male_Parent_Hom_Error_Pct = NA_real_, - Female_Parent_Hom_Error_Pct = NA_real_, - Best_Male_Parent = NA_character_, - Best_Male_Parent_Error_Pct = NA_real_, - Best_Female_Parent = NA_character_, - Best_Female_Parent_Error_Pct = NA_real_ + ID = no_geno_rows$ID, + Orig_Male_Parent = no_geno_rows$Male_Parent, + Orig_Female_Parent = no_geno_rows$Female_Parent, + Trio_Mendelian_Error_Pct = NA_real_, + Trio_Markers_Tested = 0L, + Status = "NO_GENOTYPE_DATA", + Recommended_Correction = "NONE", + Male_Parent_Hom_Error_Pct = NA_real_, + Female_Parent_Hom_Error_Pct = NA_real_, + Best_Male_Candidate = NA_character_, + Best_Male_Candidate_Error_Pct = NA_real_, + Best_Female_Candidate = NA_character_, + Best_Female_Candidate_Error_Pct = NA_real_ ) final_df <- data.table::rbindlist(list(final_df, no_geno_df)) } - + #### Write corrected pedigree #### corrected_pedigree <- data.table::copy(original_pedigree) for (i in base::seq_len(base::nrow(final_df))) { prog_id <- final_df$ID[i] - decision <- final_df$Correction_Decision[i] + decision <- final_df$Recommended_Correction[i] row_idx <- base::which(corrected_pedigree$ID == prog_id) - if (decision == "REMOVE_MALE_PARENT") { data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") } else if (decision == "REMOVE_FEMALE_PARENT") { @@ -354,7 +323,6 @@ validate_pedigree <- function(pedigree_file, genotypes_file, data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") } } - tryCatch({ data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", sep = "\t", quote = FALSE) @@ -362,7 +330,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, }, error = function(e) { warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) }) - + #### Summary output #### if (verbose) { total_trios <- base::nrow(final_df) @@ -376,8 +344,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::cat("Error threshold:", trio_error_threshold, "%\n") base::cat("Homozygous threshold:", single_parent_error_threshold, "%\n") base::cat("Minimum markers required:", min_markers, "\n\n") - - corrections <- base::table(final_df$Correction_Decision) + corrections <- base::table(final_df$Recommended_Correction) base::cat("Correction summary:\n") for (decision in base::names(corrections)) if (decision != "NONE") @@ -385,17 +352,86 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::cat("\n") base::print(final_df) } - + #### Write results #### - if (write_txt) { + if (write_results) { tryCatch({ data.table::fwrite(final_df, file = output_filename, - sep = "\t", quote = FALSE) + sep = "\t", quote = FALSE, na = na_string) # <-- na_string used here if (verbose) base::cat("Results written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results. Error: ", e$message, call. = FALSE) }) } - + + #### Plot results #### + if (plot_results) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) + } else { + plot_df <- final_df[!is.na(final_df$Trio_Mendelian_Error_Pct)] + plot_df$Plot_Status <- dplyr::case_when( + plot_df$Recommended_Correction %in% c("NONE", + "KEEP_BOTH", + "LOW_MARKERS_KEEP_BOTH") ~ "PASS", + plot_df$Recommended_Correction %in% c("REMOVE_MALE_PARENT", + "REMOVE_FEMALE_PARENT", + "LOW_MARKERS_REMOVE_MALE_PARENT", + "LOW_MARKERS_REMOVE_FEMALE_PARENT") ~ "FAIL_ONE_PARENT", + plot_df$Recommended_Correction %in% c("REMOVE_BOTH", + "LOW_MARKERS_REMOVE_BOTH") ~ "FAIL_BOTH_PARENTS", + TRUE ~ "OTHER" + ) + n_total <- nrow(plot_df) + n_fail <- sum(plot_df$Trio_Mendelian_Error_Pct > trio_error_threshold) + n_pass <- sum(plot_df$Trio_Mendelian_Error_Pct <= trio_error_threshold) + pct_lost <- round((n_fail / n_total) * 100, 1) + threshold_label <- paste0("Mendelian Error Threshold: ", trio_error_threshold, "% | ", + "Lost: ", n_fail, " trios | ", + "Kept: ", n_pass, " trios") + p <- ggplot2::ggplot(plot_df, + ggplot2::aes(x = Trio_Mendelian_Error_Pct, fill = Plot_Status)) + + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + + ggplot2::geom_vline(xintercept = trio_error_threshold, + linetype = "dashed", color = "black", linewidth = 1) + + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_y_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_fill_manual( + values = c( + "PASS" = "#339900", + "FAIL_ONE_PARENT" = "#F1C40F", + "FAIL_BOTH_PARENTS" = "#cc3333", + "OTHER" = "#BDC3C7" + ), + labels = c( + "PASS" = "Pass", + "FAIL_ONE_PARENT" = "Fail - One Parent", + "FAIL_BOTH_PARENTS" = "Fail - Both Parents", + "OTHER" = "Other" + ) + ) + + ggplot2::labs( + title = "Trio Mendelian Error Distribution", + subtitle = paste0("Trios with Genotype Data Tested: ", n_total, "\n \n", threshold_label), + x = "Mendelian Error (%)", + y = "Number of Trios", + fill = "Status" + ) + + ggplot2::theme_classic(base_size = 13) + + ggplot2::theme(legend.position = "top") + print(p) + if (write_results) { + plot_filename <- base::sub("\\.[^.]+$", ".jpg", output_filename) + tryCatch({ + ggplot2::ggsave(plot_filename, plot = p, + device = "jpeg", width = 10, height = 6, dpi = 300) + if (verbose) base::cat("Plot saved to:", plot_filename, "\n") + }, error = function(e) { + warning("Could not save plot. Error: ", e$message, call. = FALSE) + }) + } + } + } + return(base::invisible(final_df)) -} +} \ No newline at end of file diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 706a15f..44b9772 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -14,7 +14,9 @@ find_parentage( show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE + write_txt = TRUE, + na_string = "NA", + plot_results = TRUE ) } \arguments{ @@ -23,117 +25,77 @@ Must include an 'ID' column followed by marker columns coded as 0, 1, 2 (allele dosage).} \item{parents_file}{Path to a TSV/CSV/TXT file listing candidate parent IDs. -Must include an 'ID' column. An optional 'Sex' column with values -'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines -which parents are tested for each role. If absent, all parents are treated -as ambiguous.} +Must include an 'ID' column. An optional 'Sex' column ('M', 'F', or 'A') +determines parent roles. If absent, all parents are treated as ambiguous.} -\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs to assign. +\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs. Must include an 'ID' column.} -\item{method}{Character. Parentage assignment method. One of: -\itemize{ -\item \code{"best_male_parent"} — finds the best male parent for each -progeny using homozygous mismatch rate. -\item \code{"best_female_parent"} — finds the best female parent for each -progeny using homozygous mismatch rate. -\item \code{"best_match"} — finds the single best parent (either sex) -using homozygous mismatch rate. -\item \code{"best_pair"} — finds the best male-female parent pair for -each progeny using full Mendelian error rate (default). -}} - -\item{min_markers}{Integer. Minimum number of non-missing markers required -to report a parentage assignment. Progeny-parent comparisons with fewer -markers are flagged as \code{LOW_MARKERS} and no assignment is made -(default: \code{10}).} - -\item{error_threshold}{Numeric. Maximum mismatch percentage to report a -parentage assignment as confident. Assignments above this threshold are -flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column -(default: \code{5.0}). Must be between 0 and 100.} - -\item{show_ties}{Logical. If \code{TRUE}, all tied best pairs (after -tie-breaking by maximum markers tested) are reported as additional columns -(\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when -\code{method = "best_pair"}. The base columns (\code{Male_Parent}, -\code{Female_Parent}, etc.) are always populated with the top result. -If \code{FALSE}, only one tied pair is reported with a warning. -Default is \code{TRUE}.} +\item{method}{Character. One of \code{"best_male_parent"}, +\code{"best_female_parent"}, \code{"best_match"}, or +\code{"best_pair"} (default). See Details.} -\item{allow_selfing}{Logical. If \code{FALSE}, male-female parent pairs where -both IDs are identical are excluded when \code{method = "best_pair"}. -Default is \code{TRUE}.} +\item{min_markers}{Integer. Minimum markers required for a valid assignment. +Fewer markers flags \code{LOW_MARKERS} (default: \code{10}).} + +\item{error_threshold}{Numeric. Maximum mismatch percentage for a confident +assignment. Exceeded values flag \code{HIGH_ERROR} (default: \code{5.0}). +Must be between 0 and 100.} + +\item{show_ties}{Logical. If \code{TRUE}, tied best pairs are reported as +additional suffix columns (e.g. \code{Male_Parent_2}) when +\code{method = "best_pair"}. Default is \code{TRUE}.} + +\item{allow_selfing}{Logical. If \code{FALSE}, pairs with identical male and +female parent IDs are excluded. Default is \code{TRUE}.} -\item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary -statistics, and the results table to the console. Default is \code{TRUE}.} +\item{verbose}{Logical. If \code{TRUE}, prints progress, summary, and +results to the console. Default is \code{TRUE}.} \item{write_txt}{Logical. If \code{TRUE}, writes results to -\code{parentage_results_dt.txt} in the working directory. Default is -\code{TRUE}.} +\code{parentage_testing_results.txt}. Default is \code{TRUE}.} + +\item{na_string}{Character. String for missing values in output file. +Use \code{"NA"} or \code{""} (default: \code{"NA"}).} + +\item{plot_results}{Logical. If \code{TRUE}, plots the Mendelian error +distribution colored by assignment status. Requires \code{ggplot2}. +Plot is saved as \code{.jpg} when \code{write_txt = TRUE}. +Default is \code{TRUE}.} } \value{ -A \code{data.table} with one row per progeny. Columns depend on the -method used: -\itemize{ -\item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: -\code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -\code{Markers_Tested}, \code{Assignment_Status}. -\item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, -\code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, -\code{Markers_Tested}, \code{Assignment_Status}. -\item \code{best_pair} (ties remain after tie-breaking, -\code{show_ties = TRUE}): base columns are always populated with the -top result, plus suffix columns \code{Male_Parent_1}, -\code{Female_Parent_1}, etc. for each tied pair. -} -\code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or -\code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. +A \code{data.table} with one row per progeny. Base columns are +\code{Progeny}, \code{Male_Parent} (or \code{Best_Match}), +\code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, +and \code{Assignment_Status} (\code{PASS}, \code{HIGH_ERROR}, or +\code{LOW_MARKERS}). Tie suffix columns (\code{Male_Parent_2}, etc.) are +appended when \code{show_ties = TRUE} and ties remain after tie-breaking. +Returned invisibly when \code{verbose = TRUE}. } \description{ Assigns the most likely parent(s) to each progeny individual based on genotypic data using Mendelian error rates or homozygous mismatch rates. } \details{ -A homozygous-only genotype matrix is pre-computed once at startup and shared -across all methods that require it, avoiding redundant computation. - -For \code{"best_male_parent"}, \code{"best_female_parent"}, and -\code{"best_match"}, only homozygous markers (coded 0 or 2) are used for -comparison; heterozygous markers (coded 1) are set to \code{NA}. This -reduces false mismatches caused by phase ambiguity. - -For \code{"best_pair"}, all markers are used and full Mendelian inheritance -rules are applied. Mismatch rates and comparison counts are computed across -all progeny simultaneously using vectorised \code{vapply} calls, producing -\code{n_pairs x n_progeny} matrices and giving substantial speed gains for -large datasets. Both matrices are explicitly coerced to matrix form to -handle the edge case of a single parent pair correctly. - -When multiple pairs share the lowest Mendelian error rate, ties are broken -by selecting the pair(s) with the greatest number of markers tested. If ties -still remain after this step, all remaining tied pairs are reported when -\code{show_ties = TRUE}. - -The base columns (\code{Male_Parent}, \code{Female_Parent}, -\code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with -the top result, ensuring no missing values in these columns regardless of -tie status. - -Output rows are pre-allocated as a \code{data.table} and filled by reference -using \code{data.table::set()}, avoiding repeated memory allocation during -the results-building step. - -Individuals in \code{parents_file} or \code{progeny_file} that are absent -from \code{genotypes_file} are removed with a warning. - -Progeny with fewer non-missing markers than \code{min_markers} are flagged -\code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best -match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +A homozygous-only matrix is pre-computed once and shared across +\code{"best_male_parent"}, \code{"best_female_parent"}, and +\code{"best_match"}, where heterozygous markers (coded 1) are set to +\code{NA} to reduce false mismatches from phase ambiguity. + +For \code{"best_pair"}, full Mendelian inheritance rules are applied across +all marker and pair combinations using vectorised \code{vapply} calls, +producing \code{n_pairs x n_progeny} mismatch and comparison matrices. + +Ties on mismatch rate are broken by maximum markers tested. Remaining ties +are reported as suffix columns when \code{show_ties = TRUE}. Base columns +are always populated with the top result. + +Parents or progeny absent from \code{genotypes_file} are removed with a +warning. Progeny below \code{min_markers} are flagged \code{LOW_MARKERS}; +those exceeding \code{error_threshold} are flagged \code{HIGH_ERROR}. } \examples{ \dontrun{ -# Assign best male-female parent pair to each progeny results <- find_parentage( genotypes_file = "genotypes.txt", parents_file = "parents.txt", @@ -142,10 +104,11 @@ results <- find_parentage( min_markers = 50, error_threshold = 5.0, show_ties = TRUE, - allow_selfing = FALSE + allow_selfing = FALSE, + na_string = "NA", + plot_results = TRUE ) -# Find best individual parent match (ignoring sex) results <- find_parentage( genotypes_file = "genotypes.txt", parents_file = "parents.txt", diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 8f47a7d..8ec975b 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -12,8 +12,10 @@ validate_pedigree( min_markers = 10, single_parent_error_threshold = 2, verbose = TRUE, - write_txt = TRUE, - output_filename = "pedigree_validation_results.txt" + write_results = TRUE, + output_filename = "__validation_report.txt", + plot_results = TRUE, + na_string = "NA" ) } \arguments{ @@ -40,34 +42,42 @@ parent-level evaluation (default: 2.0). Must be between 0 and 100.} \item{verbose}{Logical. If TRUE, prints progress messages, a summary table, and results to the console (default: TRUE).} -\item{write_txt}{Logical. If TRUE, writes validation results to -output_filename (default: TRUE).} +\item{write_results}{Logical. If TRUE, writes the validation results to +output_filename and saves the plot as a .jpg file (default: TRUE).} -\item{output_filename}{Character. Path/name of the output file -(default: "pedigree_validation_results.txt").} +\item{output_filename}{Character. Path/name of the output results file +(default: "__validation_report.txt"). The plot will be saved using the +same base name with a .jpg extension.} + +\item{plot_results}{Logical. If TRUE, prints a histogram of Trio Mendelian +Error percentages with a threshold line (default: TRUE). If write_results +is also TRUE, the plot is additionally saved as a .jpg file.} + +\item{na_string}{Character. String for missing values in the output file. +Use \code{"NA"} or \code{""} (default: \code{"NA"}).} } \value{ A data.table (returned invisibly) with one row per trio and the following columns: \describe{ \item{ID}{Individual ID.} -\item{Male_Parent}{Declared male parent ID.} -\item{Female_Parent}{Declared female parent ID.} -\item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -\item{Markers_Tested}{Number of markers with non-missing genotypes.} +\item{Orig_Male_Parent}{Declared male parent ID.} +\item{Orig_Female_Parent}{Declared female parent ID.} +\item{Trio_Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} +\item{Trio_Markers_Tested}{Number of markers with non-missing genotypes.} \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, or NO_GENOTYPE_DATA.} -\item{Correction_Decision}{One of NONE, KEEP_BOTH, +\item{Recommended_Correction}{One of NONE, KEEP_BOTH, REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -\item{Best_Male_Parent}{Best-matching male parent candidate ID.} -\item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} -\item{Best_Female_Parent}{Best-matching female parent candidate ID.} -\item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} +\item{Best_Male_Candidate}{Best-matching male parent candidate ID.} +\item{Best_Male_Candidate_Error_Pct}{Homozygous mismatch percentage for the best male candidate.} +\item{Best_Female_Candidate}{Best-matching female parent candidate ID.} +\item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} } } \description{ diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R index 55e6913..d1dfb3c 100644 --- a/tests/testthat/test-find_parentage.R +++ b/tests/testthat/test-find_parentage.R @@ -17,11 +17,12 @@ make_files <- function(genos, parents, progeny, dir = tempdir()) { list(g = geno_file, p = parent_file, pr = progeny_file) } -# ------------------------------------------------------------------------------ +# ============================================================================== # Base toy data # S1 / D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 # S2 / D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 -# ------------------------------------------------------------------------------ +# ============================================================================== + base_genos <- data.table::data.table( ID = c("S1","S2","D1","D2","child1","child2"), M1 = c(0L, 2L, 0L, 2L, 0L, 2L), @@ -54,7 +55,7 @@ test_that("invalid method throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, method = "bad_method", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "Method must be one of" ) }) @@ -63,7 +64,7 @@ test_that("min_markers < 1 throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, min_markers = 0, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "min_markers" ) }) @@ -72,21 +73,31 @@ test_that("error_threshold out of range throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = 150, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "error_threshold" ) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = -1, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "error_threshold" ) }) +test_that("invalid na_string throws an error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_error( + find_parentage(f$g, f$p, f$pr, na_string = "NULL", + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + regexp = "na_string" + ) +}) + test_that("missing genotype file throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage("nonexistent.txt", f$p, f$pr, - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + regexp = "Error reading input files" ) }) @@ -96,7 +107,7 @@ test_that("parent IDs absent from genotype file raise a warning and are dropped" f <- make_files(base_genos, extra_parents, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "GHOST" ) }) @@ -107,7 +118,7 @@ test_that("progeny IDs absent from genotype file raise a warning and are dropped f <- make_files(base_genos, base_parents, extra_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "GHOST_KID" ) }) @@ -118,7 +129,7 @@ test_that("no valid progeny candidates after filtering stops with an error", { expect_warning( expect_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "No valid progeny" ) ) @@ -129,7 +140,7 @@ test_that("missing Sex column raises a warning and defaults to ambiguous", { f <- make_files(base_genos, parents_no_sex, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, write_txt = FALSE, plot_results = FALSE), regexp = "Sex" ) }) @@ -141,7 +152,8 @@ test_that("missing Sex column raises a warning and defaults to ambiguous", { test_that("best_pair returns a data.table with expected columns (no ties)", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Male_Parent","Female_Parent", "Mendelian_Error_Pct","Markers_Tested", @@ -152,17 +164,18 @@ test_that("best_pair returns a data.table with expected columns (no ties)", { test_that("best_male_parent returns expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", "Markers_Tested","Assignment_Status") %in% names(res))) + expect_false("Male_Parent" %in% names(res)) expect_equal(nrow(res), 1L) }) test_that("best_female_parent returns expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", "Markers_Tested","Assignment_Status") %in% names(res))) @@ -172,7 +185,7 @@ test_that("best_female_parent returns expected columns", { test_that("best_match returns expected columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", "Markers_Tested","Assignment_Status") %in% names(res))) @@ -183,7 +196,7 @@ test_that("one row returned per progeny for single-parent methods", { f <- make_files(base_genos, base_parents, child1_progeny) for (m in c("best_male_parent","best_female_parent","best_match")) { res <- find_parentage(f$g, f$p, f$pr, method = m, - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_equal(nrow(res), 1L, label = paste("row count for method", m)) } }) @@ -191,14 +204,16 @@ test_that("one row returned per progeny for single-parent methods", { test_that("Markers_Tested equals the number of non-NA marker columns", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) }) test_that("Mendelian_Error_Pct is between 0 and 100", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) pct <- as.numeric(res$Mendelian_Error_Pct) expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) }) @@ -210,7 +225,8 @@ test_that("Mendelian_Error_Pct is between 0 and 100", { test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Male_Parent, "S1") expect_equal(res$Female_Parent, "D1") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) @@ -219,7 +235,8 @@ test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { f <- make_files(base_genos, base_parents, child2_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Male_Parent, "S2") expect_equal(res$Female_Parent, "D2") expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) @@ -228,24 +245,39 @@ test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { test_that("best_male_parent identifies S1 as best male for child1", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_equal(res$Best_Match, "S1") }) test_that("best_female_parent identifies D1 as best female for child1", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_equal(res$Best_Match, "D1") }) test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) }) +test_that("both child1 and child2 are correctly assigned when run together", { + f <- make_files(base_genos, base_parents, base_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + expect_equal(nrow(res), 2L) + child1_row <- res[Progeny == "child1"] + child2_row <- res[Progeny == "child2"] + expect_equal(child1_row$Male_Parent, "S1") + expect_equal(child1_row$Female_Parent, "D1") + expect_equal(child2_row$Male_Parent, "S2") + expect_equal(child2_row$Female_Parent, "D2") +}) + # ============================================================================== # 4. Assignment_Status — min_markers and error_threshold # ============================================================================== @@ -254,7 +286,8 @@ test_that("Assignment_Status = PASS for perfect trio within thresholds", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Assignment_Status, "PASS") }) @@ -262,7 +295,8 @@ test_that("Assignment_Status = LOW_MARKERS when min_markers exceeds available ma f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 99999, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Assignment_Status, "LOW_MARKERS") }) @@ -281,7 +315,8 @@ test_that("Assignment_Status = HIGH_ERROR when error rate exceeds threshold", { f <- make_files(high_error_genos, parents, progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(res$Assignment_Status, "HIGH_ERROR") }) @@ -289,12 +324,24 @@ test_that("Assignment_Status column is present in all methods", { f <- make_files(base_genos, base_parents, child1_progeny) for (m in c("best_pair","best_male_parent","best_female_parent","best_match")) { res <- find_parentage(f$g, f$p, f$pr, method = m, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_true("Assignment_Status" %in% names(res), label = paste("Assignment_Status present for method", m)) } }) +test_that("LOW_MARKERS is flagged for single-parent methods too", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_male_parent","best_female_parent","best_match")) { + res <- find_parentage(f$g, f$p, f$pr, method = m, + min_markers = 99999, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + expect_equal(res$Assignment_Status, "LOW_MARKERS", + label = paste("LOW_MARKERS for method", m)) + } +}) + # ============================================================================== # 5. allow_selfing # ============================================================================== @@ -302,11 +349,10 @@ test_that("Assignment_Status column is present in all methods", { test_that("allow_selfing = FALSE removes self-pairs from candidates", { ambig_parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("A","A")) f <- make_files(base_genos, ambig_parents, child1_progeny) - expect_warning( - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - allow_selfing = FALSE, show_ties = FALSE, - verbose = FALSE, write_txt = FALSE), - regexp = "tied" + res <- suppressWarnings( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + allow_selfing = FALSE, show_ties = FALSE, + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) ) if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) expect_false(res$Male_Parent == res$Female_Parent) @@ -316,10 +362,11 @@ test_that("allow_selfing = FALSE removes self-pairs from candidates", { # 6. show_ties # ============================================================================== -test_that("show_ties = TRUE produces _1/_2 suffixed columns when ties exist", { +test_that("show_ties = TRUE produces suffixed columns when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = TRUE, verbose = FALSE, write_txt = FALSE) + show_ties = TRUE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_true(any(grepl("^Male_Parent_", names(res)))) expect_true(any(grepl("^Female_Parent_", names(res)))) }) @@ -328,7 +375,8 @@ test_that("show_ties = FALSE warns about ties and returns single-result columns" f <- make_files(tied_genos, tied_parents, tied_progeny) expect_warning( res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE), + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE), regexp = "tied" ) expect_true("Male_Parent" %in% names(res)) @@ -337,33 +385,42 @@ test_that("show_ties = FALSE warns about ties and returns single-result columns" expect_false(any(grepl("^Female_Parent_\\d", names(res)))) }) +test_that("base columns are always populated even when ties exist", { + f <- make_files(tied_genos, tied_parents, tied_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = TRUE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + expect_false(is.na(res$Male_Parent[1])) + expect_false(is.na(res$Female_Parent[1])) +}) + # ============================================================================== # 7. verbose / write_txt # ============================================================================== -test_that("verbose = TRUE returns the result invisibly as data.table", { +test_that("verbose = TRUE returns the result as data.table", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = TRUE, write_txt = FALSE) + verbose = TRUE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") }) test_that("verbose = FALSE returns the result as data.table", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_s3_class(res, "data.table") }) -test_that("write_txt = TRUE creates the output file", { +test_that("write_txt = TRUE creates parentage_testing_results.txt", { old_wd <- getwd() tmp <- tempdir() setwd(tmp) on.exit(setwd(old_wd), add = TRUE) f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = TRUE) - expect_true(file.exists(file.path(tmp, "parentage_results_dt.txt"))) + verbose = FALSE, write_txt = TRUE, plot_results = FALSE) + expect_true(file.exists(file.path(tmp, "parentage_testing_results.txt"))) }) test_that("write_txt = FALSE does not create the output file", { @@ -371,14 +428,23 @@ test_that("write_txt = FALSE does not create the output file", { tmp <- tempdir() setwd(tmp) on.exit(setwd(old_wd), add = TRUE) - out_file <- file.path(tmp, "parentage_results_dt.txt") + out_file <- file.path(tmp, "parentage_testing_results.txt") if (file.exists(out_file)) file.remove(out_file) f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) expect_false(file.exists(out_file)) }) +test_that("na_string = '' is accepted without error", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_no_error( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + na_string = "", verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + ) +}) + # ============================================================================== # 8. Sex-based candidate filtering # ============================================================================== @@ -386,7 +452,7 @@ test_that("write_txt = FALSE does not create the output file", { test_that("best_male_parent only assigns M or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) valid_male_parents <- base_parents[Sex %in% c("M","A")]$ID expect_true(res$Best_Match %in% valid_male_parents) }) @@ -394,11 +460,29 @@ test_that("best_male_parent only assigns M or A parents", { test_that("best_female_parent only assigns F or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) valid_female_parents <- base_parents[Sex %in% c("F","A")]$ID expect_true(res$Best_Match %in% valid_female_parents) }) +test_that("best_pair male slot contains only M or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + valid_males <- base_parents[Sex %in% c("M","A")]$ID + expect_true(res$Male_Parent %in% valid_males) +}) + +test_that("best_pair female slot contains only F or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + valid_females <- base_parents[Sex %in% c("F","A")]$ID + expect_true(res$Female_Parent %in% valid_females) +}) + # ============================================================================== # 9. Edge cases # ============================================================================== @@ -406,7 +490,8 @@ test_that("best_female_parent only assigns F or A parents", { test_that("single progeny individual is handled correctly", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_equal(nrow(res), 1L) }) @@ -416,20 +501,33 @@ test_that("all-NA marker column does not cause an error", { f <- make_files(na_genos, base_parents, child1_progeny) expect_no_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, write_txt = FALSE, plot_results = FALSE) ) }) test_that("Progeny column contains the correct progeny IDs", { f <- make_files(base_genos, base_parents, child1_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_setequal(res$Progeny, child1_progeny$ID) }) test_that("multiple progeny are all represented in output", { f <- make_files(base_genos, base_parents, base_progeny) res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) expect_setequal(res$Progeny, base_progeny$ID) }) + +test_that("single parent pair (n_pairs = 1) does not cause dimension errors", { + single_pair_parents <- data.table::data.table(ID = c("S1","D1"), + Sex = c("M","F")) + f <- make_files(base_genos, single_pair_parents, child1_progeny) + expect_no_error( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + write_txt = FALSE, plot_results = FALSE) + ) +}) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index ef4e493..02acc22 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -43,56 +43,84 @@ write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { list(ped = ped_file, genos = genos_file) } +# Helper to run validate_pedigree in a temporary working directory +# so corrected_pedigree.txt doesn't pollute the real wd +run_in_tmpdir <- function(expr) { + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + force(expr) +} + # ============================================================================== # 1. Input validation # ============================================================================== test_that("trio_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = 150, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = -1, - verbose = FALSE, write_txt = FALSE)) + run_in_tmpdir({ + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = 150, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = -1, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + }) }) test_that("single_parent_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = 101, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = -5, - verbose = FALSE, write_txt = FALSE)) + run_in_tmpdir({ + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = 101, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = -5, + verbose = FALSE, write_results = FALSE, + plot_results = FALSE)) + }) }) test_that("missing required pedigree column raises an error", { bad_ped <- data.table(ID = "IND_C", Parent1 = "IND_A", Female_Parent = "IND_B") f <- write_temp_files(ped = bad_ped) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - regexp = "missing required columns" - ) + run_in_tmpdir({ + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, + plot_results = FALSE), + regexp = "missing required columns" + ) + }) }) test_that("missing ID column in genotypes raises an error", { bad_genos <- copy(make_genos()) setnames(bad_genos, "ID", "SampleID") f <- write_temp_files(genos = bad_genos) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - regexp = "ID" - ) + run_in_tmpdir({ + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, + plot_results = FALSE), + regexp = "ID" + ) + }) }) test_that("all trios with no genotype data stop with an error", { ped <- data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") f <- write_temp_files(ped = ped) - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - regexp = "No valid trios remain" - ) + run_in_tmpdir({ + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, + plot_results = FALSE), + regexp = "No valid trios remain" + ) + }) }) # ============================================================================== @@ -100,29 +128,38 @@ test_that("all trios with no genotype data stop with an error", { # ============================================================================== test_that("returns a data.table", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_s3_class(res, "data.table") + }) }) test_that("result has one row per pedigree entry", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(nrow(res), 2L) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(nrow(res), 2L) + }) }) test_that("result has all expected columns", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expected_cols <- c( - "ID", "Male_Parent", "Female_Parent", - "Mendelian_Error_Pct", "Markers_Tested", "Status", - "Correction_Decision", - "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", - "Best_Male_Parent", "Best_Male_Parent_Error_Pct", - "Best_Female_Parent", "Best_Female_Parent_Error_Pct" - ) - expect_true(all(expected_cols %in% names(res))) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expected_cols <- c( + "ID", "Orig_Male_Parent", "Orig_Female_Parent", + "Trio_Mendelian_Error_Pct", "Trio_Markers_Tested", "Status", + "Recommended_Correction", + "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", + "Best_Male_Candidate", "Best_Male_Candidate_Error_Pct", + "Best_Female_Candidate", "Best_Female_Candidate_Error_Pct" + ) + expect_true(all(expected_cols %in% names(res))) + }) }) # ============================================================================== @@ -130,68 +167,87 @@ test_that("result has all expected columns", { # ============================================================================== test_that("PASS trio is correctly identified", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - pass_row <- res[ID == "IND_C"] - expect_equal(nrow(pass_row), 1L) - expect_equal(pass_row$Status, "PASS") - expect_equal(pass_row$Mendelian_Error_Pct, 0) - expect_equal(pass_row$Correction_Decision, "NONE") + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + pass_row <- res[ID == "IND_C"] + expect_equal(nrow(pass_row), 1L) + expect_equal(pass_row$Status, "PASS") + expect_equal(pass_row$Trio_Mendelian_Error_Pct, 0) + expect_equal(pass_row$Recommended_Correction, "NONE") + }) }) test_that("FAIL trio is correctly identified", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(nrow(fail_row), 1L) - expect_equal(fail_row$Status, "FAIL") - expect_gt(fail_row$Mendelian_Error_Pct, 5.0) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + fail_row <- res[ID == "IND_D"] + expect_equal(nrow(fail_row), 1L) + expect_equal(fail_row$Status, "FAIL") + expect_gt(fail_row$Trio_Mendelian_Error_Pct, 5.0) + }) }) test_that("FAIL trio has REMOVE_MALE_PARENT decision with best match populated", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(fail_row$Correction_Decision, "REMOVE_MALE_PARENT") - expect_false(is.na(fail_row$Best_Male_Parent)) - expect_true(is.na(fail_row$Best_Female_Parent)) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + fail_row <- res[ID == "IND_D"] + expect_equal(fail_row$Recommended_Correction, "REMOVE_MALE_PARENT") + expect_false(is.na(fail_row$Best_Male_Candidate)) + expect_true(is.na(fail_row$Best_Female_Candidate)) + }) }) -test_that("Mendelian_Error_Pct is 0 for a perfect trio", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Mendelian_Error_Pct, 0) +test_that("Trio_Mendelian_Error_Pct is 0 for a perfect trio", { + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Trio_Mendelian_Error_Pct, 0) + }) }) -test_that("Markers_Tested equals number of markers for complete data", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 20L) +test_that("Trio_Markers_Tested equals number of markers for complete data", { + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Trio_Markers_Tested, 20L) + }) }) test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { - f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, min_markers = 25L) - expect_true(all(res$Status == "LOW_MARKERS")) - expect_true(all(grepl("^LOW_MARKERS_", res$Correction_Decision))) + f <- write_temp_files() + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE, + min_markers = 25L) + expect_true(all(res$Status == "LOW_MARKERS")) + expect_true(all(grepl("^LOW_MARKERS_", res$Recommended_Correction))) + }) }) -test_that("NA markers reduce Markers_Tested and do not cause errors", { +test_that("NA markers reduce Trio_Markers_Tested and do not cause errors", { genos <- make_genos() genos[ID == "IND_C", M1 := NA_integer_] genos[ID == "IND_C", M2 := NA_integer_] - f <- write_temp_files(genos = genos) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 18L) - expect_equal(res[ID == "IND_C"]$Status, "PASS") + f <- write_temp_files(genos = genos) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Trio_Markers_Tested, 18L) + expect_equal(res[ID == "IND_C"]$Status, "PASS") + }) }) # ============================================================================== # 4. Missing parent statuses (MISSING_MALE_PARENT / MISSING_FEMALE_PARENT / # MISSING_BOTH_PARENTS) -# Note: each test includes make_pedigree() rows so has_geno is never empty, -# and filters res by the specific ID under test [3][4] # ============================================================================== test_that("MISSING_MALE_PARENT status and recommendation are correct", { @@ -199,13 +255,16 @@ test_that("MISSING_MALE_PARENT status and recommendation are correct", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_MALE_PARENT") - expect_equal(r$Correction_Decision, "NONE") - expect_false(is.na(r$Best_Male_Parent)) - expect_true(is.na(r$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_MALE_PARENT") + expect_equal(r$Recommended_Correction, "NONE") + expect_false(is.na(r$Best_Male_Candidate)) + expect_true(is.na(r$Best_Female_Candidate)) + }) }) test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { @@ -213,13 +272,16 @@ test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "IND_A", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_FEMALE_PARENT") - expect_equal(r$Correction_Decision, "NONE") - expect_true(is.na(r$Best_Male_Parent)) - expect_false(is.na(r$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_FEMALE_PARENT") + expect_equal(r$Recommended_Correction, "NONE") + expect_true(is.na(r$Best_Male_Candidate)) + expect_false(is.na(r$Best_Female_Candidate)) + }) }) test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { @@ -227,16 +289,19 @@ test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_BOTH_PARENTS") - expect_equal(r$Correction_Decision, "NONE") - expect_false(is.na(r$Best_Male_Parent)) - expect_false(is.na(r$Best_Female_Parent)) -}) - -test_that("MISSING_* rows preserve 0s in corrected pedigree", { + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + expect_equal(r$Status, "MISSING_BOTH_PARENTS") + expect_equal(r$Recommended_Correction, "NONE") + expect_false(is.na(r$Best_Male_Candidate)) + expect_false(is.na(r$Best_Female_Candidate)) + }) +}) + +test_that("MISSING_* rows preserve 0s in corrected_pedigree.txt", { ped <- rbind( make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") @@ -247,22 +312,29 @@ test_that("MISSING_* rows preserve 0s in corrected pedigree", { old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), colClasses = "character") - expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") + # MISSING_MALE_PARENT — male stays "0", female unchanged + expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") + expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") }) -test_that("Best_Male_Parent for MISSING_MALE_PARENT is excluded from being the known female", { +test_that("Best_Male_Candidate for MISSING_MALE_PARENT is not the known female parent", { ped <- rbind( make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - # The known female parent should not be suggested as the best male parent - expect_false(r$Best_Male_Parent == "IND_B") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + r <- res[ID == "IND_E"] + # The known female parent should not be suggested as the best male candidate + expect_false(r$Best_Male_Candidate == "IND_B") + }) }) # ============================================================================== @@ -277,16 +349,19 @@ test_that("FOUNDERS status is assigned when ID in founders list with 0 0 parents founders_file <- tempfile(fileext = ".txt") fwrite(data.table(ID = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_txt = FALSE) - founder_row <- res[ID == "IND_A"] - expect_equal(founder_row$Status, "FOUNDERS") - expect_equal(founder_row$Correction_Decision, "NONE") - expect_true(is.na(founder_row$Best_Male_Parent)) - expect_true(is.na(founder_row$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_results = FALSE, + plot_results = FALSE) + founder_row <- res[ID == "IND_A"] + expect_equal(founder_row$Status, "FOUNDERS") + expect_equal(founder_row$Recommended_Correction, "NONE") + expect_true(is.na(founder_row$Best_Male_Candidate)) + expect_true(is.na(founder_row$Best_Female_Candidate)) + }) }) test_that("non-founder rows are still evaluated normally when founders file is supplied", { @@ -297,13 +372,16 @@ test_that("non-founder rows are still evaluated normally when founders file is s founders_file <- tempfile(fileext = ".txt") fwrite(data.table(ID = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_txt = FALSE) - # IND_C has real parents — should still get PASS - expect_equal(res[ID == "IND_C"]$Status, "PASS") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, + founders_file = founders_file, + verbose = FALSE, + write_results = FALSE, + plot_results = FALSE) + # IND_C has real parents — should still get PASS + expect_equal(res[ID == "IND_C"]$Status, "PASS") + }) }) test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { @@ -311,9 +389,12 @@ test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + }) }) test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUNDERS", { @@ -321,12 +402,15 @@ test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUND make_pedigree(), data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = NULL, - verbose = FALSE, - write_txt = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, + founders_file = NULL, + verbose = FALSE, + write_results = FALSE, + plot_results = FALSE) + expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + }) }) # ============================================================================== @@ -338,36 +422,45 @@ test_that("NO_GENOTYPE_DATA is flagged for progeny absent from genotype file", { make_pedigree(), data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_equal(nrow(ghost_row), 1L) - expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") - expect_equal(ghost_row$Correction_Decision, "NONE") -}) - -test_that("NO_GENOTYPE_DATA rows have NA for all analysis columns", { + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_equal(nrow(ghost_row), 1L) + expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") + expect_equal(ghost_row$Recommended_Correction, "NONE") + }) +}) + +test_that("NO_GENOTYPE_DATA rows have NA/0 for all analysis columns", { ped <- rbind( make_pedigree(), data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_true(is.na(ghost_row$Mendelian_Error_Pct)) - expect_equal(ghost_row$Markers_Tested, 0L) - expect_true(is.na(ghost_row$Best_Male_Parent)) - expect_true(is.na(ghost_row$Best_Female_Parent)) + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + ghost_row <- res[ID == "GHOST"] + expect_true(is.na(ghost_row$Trio_Mendelian_Error_Pct)) + expect_equal(ghost_row$Trio_Markers_Tested, 0L) + expect_true(is.na(ghost_row$Best_Male_Candidate)) + expect_true(is.na(ghost_row$Best_Female_Candidate)) + }) }) test_that("NO_GENOTYPE_DATA flagged when declared parent is absent from genotype file", { ped <- rbind( - make_pedigree(), # ensures has_geno is not empty + make_pedigree(), data.table(ID = "IND_C_GHOST", Male_Parent = "GHOST_DAD", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") + }) }) test_that("valid trios still evaluated correctly when ghost rows are present", { @@ -375,24 +468,29 @@ test_that("valid trios still evaluated correctly when ghost rows are present", { make_pedigree(), data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") ) - f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Status, "PASS") - expect_equal(res[ID == "IND_D"]$Status, "FAIL") + f <- write_temp_files(ped = ped) + run_in_tmpdir({ + res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) + expect_equal(res[ID == "IND_C"]$Status, "PASS") + expect_equal(res[ID == "IND_D"]$Status, "FAIL") + }) }) # ============================================================================== # 7. Corrected pedigree output # ============================================================================== -test_that("corrected_pedigree.txt is written and PASS parents are unchanged", { +test_that("corrected_pedigree.txt is always written and PASS parents are unchanged", { f <- write_temp_files() tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) expect_true(file.exists(file.path(tmp_dir, "corrected_pedigree.txt"))) corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) # IND_C passes — parents must be unchanged @@ -407,7 +505,9 @@ test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, plot_results = FALSE) corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), colClasses = "character") # IND_D fails with REMOVE_MALE_PARENT — male should become "0" @@ -415,45 +515,32 @@ test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") }) -test_that("corrected_pedigree.txt preserves 0s for MISSING_* rows", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), - colClasses = "character") - # MISSING_MALE_PARENT — male stays "0", female unchanged - expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") - expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") -}) - # ============================================================================== -# 8. write_txt / output file +# 8. write_results / output file # ============================================================================== -test_that("write_txt = TRUE writes output file with correct row count", { +test_that("write_results = TRUE writes output file with correct row count", { f <- write_temp_files() out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = TRUE, output_filename = out_file) - expect_true(file.exists(out_file)) - written <- fread(out_file) - expect_equal(nrow(written), 2L) + run_in_tmpdir({ + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = TRUE, output_filename = out_file, + plot_results = FALSE) + expect_true(file.exists(out_file)) + written <- fread(out_file) + expect_equal(nrow(written), 2L) + }) }) -test_that("write_txt = FALSE does not create output file", { +test_that("write_results = FALSE does not create output file", { f <- write_temp_files() out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, output_filename = out_file) - expect_false(file.exists(out_file)) + run_in_tmpdir({ + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = FALSE, output_filename = out_file, + plot_results = FALSE) + expect_false(file.exists(out_file)) + }) }) test_that("output file contains correct number of rows when ghost rows present", { @@ -463,9 +550,12 @@ test_that("output file contains correct number of rows when ghost rows present", ) f <- write_temp_files(ped = ped) out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = TRUE, output_filename = out_file) - written <- fread(out_file) - # 2 valid trios + 1 ghost = 3 rows total - expect_equal(nrow(written), 3L) + run_in_tmpdir({ + validate_pedigree(f$ped, f$genos, verbose = FALSE, + write_results = TRUE, output_filename = out_file, + plot_results = FALSE) + written <- fread(out_file) + # 2 valid trios + 1 ghost = 3 rows total + expect_equal(nrow(written), 3L) + }) }) From 03725d8b5567cef3749e170806f6dfe0c6394a8a Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 27 Apr 2026 15:09:40 -0400 Subject: [PATCH 07/16] Made edits suggested by autopilot --- R/check_ped.R | 50 ++++++++++++--- R/find_parentage.R | 127 +++++++++++++++++++++------------------ R/validate_pedigree.R | 111 +++++++++++++++++++--------------- man/find_parentage.Rd | 8 ++- man/validate_pedigree.Rd | 17 ++++-- 5 files changed, 193 insertions(+), 120 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index e1d9407..0ec5cf2 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -40,9 +40,11 @@ check_ped <- function(ped.file, correct = TRUE, save_zip = FALSE, save_corrected_zip = FALSE) { + #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) + # ── Column alias remapping ────────────────────────────────────────────────── # Accept common alternative column names and silently remap them so the rest # of the function always works with the canonical names. @@ -59,6 +61,7 @@ check_ped <- function(ped.file, } } } + # Validate required column names before any processing required_cols <- c("id", "male_parent", "female_parent") missing_cols <- setdiff(required_cols, colnames(data)) @@ -75,33 +78,44 @@ check_ped <- function(ped.file, male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) + # Add row numbers before any processing data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() + #### check 1: exact duplicates #### exact_duplicates <- data[ duplicated(data %>% select(-row_number)) | duplicated(data %>% select(-row_number), fromLast = TRUE), ] + #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% group_by(id) %>% filter(n() > 1) %>% filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% ungroup() + #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + #### check 4: missing parents #### - all_ids <- unique(data$id) - ref_ids <- unique(c(data$male_parent, data$female_parent)) - ref_ids <- ref_ids[ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) + all_ids <- unique(data$id) + ref_ids <- unique(c(data$male_parent, data$female_parent)) + # Fix #1: trimws() strips whitespace-only strings; the combined filter then + # drops NA (which NA != "0" would not catch), empty strings, and "0" + # to prevent spurious founder rows with id = NA in the corrected pedigree. + ref_ids <- trimws(ref_ids) + ref_ids <- ref_ids[!is.na(ref_ids) & ref_ids != "" & ref_ids != "0"] + + missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + # Only build founder rows if there are actually missing parents to add if (length(missing_ids) > 0) { missing_founders <- data.frame( @@ -120,6 +134,7 @@ check_ped <- function(ped.file, stringsAsFactors = FALSE ) } + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { @@ -127,6 +142,7 @@ check_ped <- function(ped.file, c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE @@ -145,6 +161,7 @@ check_ped <- function(ped.file, rec_stack[node] <- FALSE return(cycles) } + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() @@ -156,6 +173,7 @@ check_ped <- function(ped.file, } return(all_cycles) } + cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -172,6 +190,7 @@ check_ped <- function(ped.file, dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) } + #### compile report #### input_ped_report <- list( exact_duplicates = exact_duplicates, @@ -180,11 +199,14 @@ check_ped <- function(ped.file, missing_parents = missing_parents, dependencies = dependencies ) + #### build corrected pedigree (no row_number column) #### if (correct) { corrected <- data %>% select(-row_number) + # Remove exact duplicates corrected <- distinct(corrected) + # Resolve conflicting IDs if (nrow(repeated_ids_diff) > 0) { corrected <- corrected %>% @@ -195,6 +217,7 @@ check_ped <- function(ped.file, .groups = "drop" ) } + # Add missing founders if (length(missing_ids) > 0) { corrected <- bind_rows( @@ -203,11 +226,13 @@ check_ped <- function(ped.file, ) } } + #### file names #### file_base <- tools::file_path_sans_ext(basename(ped.file)) corrected_name <- paste0(file_base, "_corrected") report_name <- paste0(file_base, "_report") zip_name <- paste0(file_base, "_report.zip") + #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") @@ -222,20 +247,28 @@ check_ped <- function(ped.file, cat("\n--- Cycles / Dependencies ---\n") if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") if (correct) { - cat(paste0("\`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) + cat(paste0("`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) } else { - cat("\`correct = FALSE`: no changes made to the pedigree.\n") + cat("`correct = FALSE`: no changes made to the pedigree.\n") } } + # Always save report to global environment assign(report_name, input_ped_report, envir = .GlobalEnv) + # Save corrected pedigree if correct = TRUE if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + #### zip export #### if (save_zip) { - tmp_dir <- tempfile() + tmp_dir <- tempfile() dir.create(tmp_dir) + # Fix #2: register cleanup so the temp directory is always removed on + # function exit, whether normal or due to an error. + on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE) + zip_files <- character(0) + # Section headers matching the console output labels section_labels <- c( exact_duplicates = "Exact Duplicates", @@ -262,17 +295,20 @@ check_ped <- function(ped.file, } zip_files <- c(zip_files, file_path) } + if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", row.names = FALSE, quote = FALSE) zip_files <- c(zip_files, corrected_path) } + zip_path <- file.path(getwd(), zip_name) invisible(capture.output( utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") )) if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } + invisible(input_ped_report) } diff --git a/R/find_parentage.R b/R/find_parentage.R index f53a323..410981b 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -27,7 +27,11 @@ #' @param verbose Logical. If \code{TRUE}, prints progress, summary, and #' results to the console. Default is \code{TRUE}. #' @param write_txt Logical. If \code{TRUE}, writes results to -#' \code{parentage_testing_results.txt}. Default is \code{TRUE}. +#' \code{output_filename}. The plot will be saved using the same base name +#' with a .jpg extension. Default is \code{TRUE}. +#' @param output_filename Character. Path/name of the output results file +#' (default: \code{"parentage_testing_results.txt"}). The plot will be saved +#' using the same base name with a .jpg extension. #' @param na_string Character. String for missing values in output file. #' Use \code{"NA"} or \code{""} (default: \code{"NA"}). #' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error @@ -96,9 +100,10 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE, + output_filename = "parentage_testing_results.txt", na_string = "NA", plot_results = TRUE) { - + #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") if (!method %in% allowed_methods) @@ -109,7 +114,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("error_threshold must be between 0 and 100.") if (!na_string %in% c("NA", "")) stop("na_string must be either 'NA' or ''.") - + tryCatch({ genos <- data.table::fread(genotypes_file, sep = "auto") all_parents <- data.table::fread(parents_file, sep = "auto") @@ -117,7 +122,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + valid_ids <- genos$ID removed_parents <- base::setdiff(all_parents$ID, valid_ids) if (base::length(removed_parents) > 0) { @@ -125,50 +130,50 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, paste(removed_parents, collapse = ", "), call. = FALSE) all_parents <- all_parents[ID %in% valid_ids] } - + removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) if (base::length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) progeny_candidates <- progeny_candidates[ID %in% valid_ids] } - + if (!"Sex" %in% base::colnames(all_parents)) { warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") all_parents[, Sex := "A"] } - + all_parents[, Sex := base::toupper(Sex)] male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] - + if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) warning("No valid female parent candidates remain after filtering.", call. = FALSE) if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") - + #### Pre-compute genotype matrices once (shared across all methods) #### - + # Full genotype matrix used by best_pair genos_mat <- base::as.matrix(genos, rownames = "ID") - + # Homozygous-only matrix (het markers set to NA) used by hom methods genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - + #### Assignment_Status from markers and error rate #### - + # Returns LOW_MARKERS, HIGH_ERROR, or PASS assign_status <- function(markers, error_pct) { base::ifelse(markers < min_markers, "LOW_MARKERS", base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) } - + #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { parent_ids <- base::switch(method, @@ -176,11 +181,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - + # Subset pre-computed homozygous matrix for relevant parents and progeny - parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] - progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] - + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + # Pre-allocate results data.table; fill by reference with set() n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( @@ -190,26 +195,26 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + for (i in base::seq_len(n_progeny)) { progeny_vec <- progeny_genos[i, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA - + best_idx <- base::which.min(percent_mismatch) - + # No candidate found — flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { data.table::set(results_dt, i, "Markers_Tested", 0L) data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") next } - + best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - + data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) @@ -217,7 +222,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } final_df <- results_dt } - + #### Logic for Best Pair Method #### if (method == "best_pair") { parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, @@ -227,17 +232,17 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - + # Pre-extract parent genotype row blocks for vectorised operations male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - + # Subset full genotype matrix to progeny only progeny_ids <- progeny_candidates$ID progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) - + # Wrapped in matrix() to handle the n_pairs = 1 edge case mismatch_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -256,7 +261,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - + # Wrapped in matrix() to handle the n_pairs = 1 edge case comparison_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { @@ -267,11 +272,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, }, numeric(n_pairs)), nrow = n_pairs, ncol = n_progeny ) - + # Percent mismatch matrix: n_pairs x n_progeny pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA - + # Pre-allocate base results data.table; tie columns added dynamically results_dt <- data.table::data.table( Progeny = progeny_ids, @@ -281,7 +286,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - + # Per-progeny result extraction from pre-computed matrices results_list <- base::vector("list", n_progeny) for (j in base::seq_len(n_progeny)) { @@ -289,43 +294,43 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, percent_mismatch <- pct_mismatch_mat[, j] comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - - # No markers overlap at all — flag LOW_MARKERS + + # No markers overlap at all — flag LOW_MARKERS and continue if (base::is.infinite(min_mismatch_val)) { data.table::set(results_dt, j, "Markers_Tested", 0L) data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") next } - + best_indices <- base::which(percent_mismatch == min_mismatch_val) - + # Tie-break: prefer pair(s) with the most markers tested if (base::length(best_indices) > 1) { best_markers_per_pair <- comparisons[best_indices] max_markers <- base::max(best_markers_per_pair) best_indices <- best_indices[best_markers_per_pair == max_markers] } - + best_pairs <- parent_pairs[best_indices] best_markers <- comparisons[best_indices[1]] best_error <- base::round(min_mismatch_val, 2) a_status <- assign_status(best_markers, best_error) - + if (!show_ties && base::nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) } - + num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - + # Always populate base columns with the top result data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) data.table::set(results_dt, j, "Assignment_Status", a_status) - + # If ties remain after tie-breaking and show_ties is TRUE, # store tie details for later column binding if (show_ties && num_to_report > 1) { @@ -339,7 +344,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, results_list[[j]] <- data.table::as.data.table(tie_row) } } - + # Merge any tie suffix columns onto the pre-allocated base table tie_rows <- data.table::rbindlist( base::Filter(Negate(base::is.null), results_list), @@ -348,15 +353,21 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ) if (base::nrow(tie_rows) > 0) { final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) - # Replace any empty strings with NA - for (col in base::names(final_df)) { - data.table::set(final_df, which(final_df[[col]] == ""), col, NA_character_) + + char_cols <- base::names(final_df)[ + base::vapply(final_df, base::is.character, logical(1)) + ] + for (col in char_cols) { + empty_idx <- which(final_df[[col]] == "") + if (base::length(empty_idx) > 0) { + data.table::set(final_df, empty_idx, col, NA_character_) + } } } else { final_df <- results_dt } } - + #### Summary #### if (verbose) { total <- base::nrow(final_df) @@ -369,10 +380,10 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::cat("Min markers threshold :", min_markers, "\n") base::cat("Error threshold :", error_threshold, "%\n\n") } - + #### Output #### + if (write_txt) { - output_filename <- "parentage_testing_results.txt" tryCatch({ data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE, na = na_string) @@ -381,7 +392,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) } - + #### Plot Results #### if (plot_results) { if (!requireNamespace("ggplot2", quietly = TRUE)) { @@ -389,26 +400,26 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } else { plot_df <- final_df[!is.na(final_df$Mendelian_Error_Pct)] plot_df$Mendelian_Error_Pct <- base::as.numeric(plot_df$Mendelian_Error_Pct) - + plot_df$Plot_Status <- dplyr::case_when( plot_df$Assignment_Status == "PASS" ~ "PASS", plot_df$Assignment_Status == "HIGH_ERROR" ~ "HIGH_ERROR", plot_df$Assignment_Status == "LOW_MARKERS" ~ "LOW_MARKERS", TRUE ~ "OTHER" ) - + n_total <- base::nrow(plot_df) n_pass <- base::sum(plot_df$Assignment_Status == "PASS", na.rm = TRUE) n_high <- base::sum(plot_df$Assignment_Status == "HIGH_ERROR", na.rm = TRUE) n_low <- base::sum(plot_df$Assignment_Status == "LOW_MARKERS", na.rm = TRUE) - + threshold_label <- base::paste0( "Error Threshold: ", error_threshold, "% | ", "Pass: ", n_pass, " | ", "High Error: ", n_high, " | ", "Low Markers: ", n_low ) - + p <- ggplot2::ggplot( plot_df, ggplot2::aes(x = Mendelian_Error_Pct, fill = Plot_Status) @@ -421,7 +432,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, linewidth = 1 ) + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + - ggplot2::scale_y_continuous(breaks = seq(0, 10000, by = 5)) + + + ggplot2::scale_y_continuous(breaks = scales::pretty_breaks()) + ggplot2::scale_fill_manual( values = c( "PASS" = "#339900", @@ -445,11 +457,12 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ) + ggplot2::theme_classic(base_size = 13) + ggplot2::theme(legend.position = "top") - + base::print(p) - + if (write_txt) { - plot_filename <- base::sub("\\.[^.]+$", ".jpg", "parentage_testing_results.txt") + + plot_filename <- paste0(tools::file_path_sans_ext(output_filename), ".jpg") tryCatch({ ggplot2::ggsave(plot_filename, plot = p, device = "jpeg", width = 10, height = 6, dpi = 300) @@ -460,7 +473,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } } } - + if (verbose) { base::cat("\n--- Parentage Assignment Results ---\n") base::print(final_df) diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index a3526ce..8784914 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -33,6 +33,9 @@ #' is also TRUE, the plot is additionally saved as a .jpg file. #' @param na_string Character. String for missing values in the output file. #' Use \code{"NA"} or \code{""} (default: \code{"NA"}). +#' @param corrected_pedigree_filename Character. Path/name of the output file +#' for the corrected pedigree (default: "corrected_pedigree.txt"). Set to +#' NULL to suppress writing the corrected pedigree. #' #' @return A data.table (returned invisibly) with one row per trio and #' the following columns: @@ -65,10 +68,11 @@ validate_pedigree <- function(pedigree_file, genotypes_file, single_parent_error_threshold = 2.0, verbose = TRUE, write_results = TRUE, - output_filename = "__validation_report.txt", plot_results = TRUE, - na_string = "NA") { - + na_string = "NA", + output_filename = "__validation_report.txt", + corrected_pedigree_filename = "corrected_pedigree.txt") { + #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) stop("trio_error_threshold must be between 0 and 100") @@ -76,14 +80,14 @@ validate_pedigree <- function(pedigree_file, genotypes_file, stop("single_parent_error_threshold must be between 0 and 100") if (!na_string %in% c("NA", "")) stop("na_string must be either 'NA' or ''.") - + tryCatch({ pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") genos <- data.table::fread(genotypes_file, sep = "auto") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - + #### Check required columns #### required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) @@ -92,11 +96,11 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::paste(missing_cols, collapse = ", ")) if (!"ID" %in% base::names(genos)) stop("Genotypes file must have an 'ID' column") - + pedigree[, Male_Parent := as.character(Male_Parent)] pedigree[, Female_Parent := as.character(Female_Parent)] original_pedigree <- data.table::copy(pedigree) - + #### Read founders list #### if (!is.null(founders_file)) { founders_raw <- tryCatch({ @@ -108,7 +112,15 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { founder_ids <- character(0) } - + + #### Build genotype matrices #### + genos_mat <- base::as.matrix(genos, rownames = "ID") + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "ID") + for (col in marker_cols) + genos_hom[base::get(col) == 1, (col) := NA_integer_] + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + #### Identify trios missing from the genotype file #### valid_ids <- as.character(genos$ID) has_geno <- pedigree[ID %in% valid_ids & @@ -123,15 +135,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, pedigree <- has_geno if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") - - #### Build genotype matrices #### - genos_mat <- base::as.matrix(genos, rownames = "ID") - genos_hom <- data.table::copy(genos) - marker_cols <- base::setdiff(base::names(genos_hom), "ID") - for (col in marker_cols) - genos_hom[base::get(col) == 1, (col) := NA_integer_] - genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - + #### Find best matching parent via homozygous mismatch #### find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { candidates <- base::setdiff(base::rownames(genos_hom_mat), @@ -145,11 +149,14 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (comparisons == 0) return(NA_real_) (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) + + if (base::all(base::is.na(errors))) + return(base::list(id = NA_character_, error_pct = NA_real_)) best_idx <- base::which.min(errors) base::list(id = candidates[best_idx], error_pct = base::round(errors[best_idx], 2)) } - + #### Main trio evaluation loop #### results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { prog_id <- pedigree$ID[i] @@ -165,7 +172,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_male_parent_pct <- NA_real_ best_female_parent <- NA_character_ best_female_parent_pct <- NA_real_ - + if (male_parent_id == "0" && female_parent_id == "0" && prog_id %in% founder_ids) { status <- "FOUNDERS" @@ -268,7 +275,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } } } - + data.table::data.table( ID = prog_id, Orig_Male_Parent = male_parent_id, @@ -285,9 +292,9 @@ validate_pedigree <- function(pedigree_file, genotypes_file, Best_Female_Candidate_Error_Pct = best_female_parent_pct ) }) - + final_df <- data.table::rbindlist(results_list) - + #### Append NO_GENOTYPE_DATA rows #### if (base::nrow(no_geno_rows) > 0) { no_geno_df <- data.table::data.table( @@ -307,30 +314,34 @@ validate_pedigree <- function(pedigree_file, genotypes_file, ) final_df <- data.table::rbindlist(list(final_df, no_geno_df)) } - + #### Write corrected pedigree #### - corrected_pedigree <- data.table::copy(original_pedigree) - for (i in base::seq_len(base::nrow(final_df))) { - prog_id <- final_df$ID[i] - decision <- final_df$Recommended_Correction[i] - row_idx <- base::which(corrected_pedigree$ID == prog_id) - if (decision == "REMOVE_MALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - } else if (decision == "REMOVE_FEMALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") - } else if (decision == "REMOVE_BOTH") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + + # pass NULL to suppress writing. + if (!is.null(corrected_pedigree_filename)) { + corrected_pedigree <- data.table::copy(original_pedigree) + for (i in base::seq_len(base::nrow(final_df))) { + prog_id <- final_df$ID[i] + decision <- final_df$Recommended_Correction[i] + row_idx <- base::which(corrected_pedigree$ID == prog_id) + if (decision == "REMOVE_MALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + } else if (decision == "REMOVE_FEMALE_PARENT") { + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + } else if (decision == "REMOVE_BOTH") { + data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") + data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + } } + tryCatch({ + data.table::fwrite(corrected_pedigree, file = corrected_pedigree_filename, + sep = "\t", quote = FALSE) + if (verbose) base::cat("Corrected pedigree written to:", corrected_pedigree_filename, "\n") + }, error = function(e) { + warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) + }) } - tryCatch({ - data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", - sep = "\t", quote = FALSE) - if (verbose) base::cat("Corrected pedigree written to: corrected_pedigree.txt\n") - }, error = function(e) { - warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) - }) - + #### Summary output #### if (verbose) { total_trios <- base::nrow(final_df) @@ -352,18 +363,18 @@ validate_pedigree <- function(pedigree_file, genotypes_file, base::cat("\n") base::print(final_df) } - + #### Write results #### if (write_results) { tryCatch({ data.table::fwrite(final_df, file = output_filename, - sep = "\t", quote = FALSE, na = na_string) # <-- na_string used here + sep = "\t", quote = FALSE, na = na_string) if (verbose) base::cat("Results written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results. Error: ", e$message, call. = FALSE) }) } - + #### Plot results #### if (plot_results) { if (!requireNamespace("ggplot2", quietly = TRUE)) { @@ -421,7 +432,9 @@ validate_pedigree <- function(pedigree_file, genotypes_file, ggplot2::theme(legend.position = "top") print(p) if (write_results) { - plot_filename <- base::sub("\\.[^.]+$", ".jpg", output_filename) + # Fix #3: use tools::file_path_sans_ext() to reliably build the .jpg + # filename regardless of whether output_filename has an extension or not. + plot_filename <- paste0(tools::file_path_sans_ext(output_filename), ".jpg") tryCatch({ ggplot2::ggsave(plot_filename, plot = p, device = "jpeg", width = 10, height = 6, dpi = 300) @@ -432,6 +445,6 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } } } - + return(base::invisible(final_df)) -} \ No newline at end of file +} diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 44b9772..5a148af 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -15,6 +15,7 @@ find_parentage( allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE, + output_filename = "parentage_testing_results.txt", na_string = "NA", plot_results = TRUE ) @@ -53,7 +54,12 @@ female parent IDs are excluded. Default is \code{TRUE}.} results to the console. Default is \code{TRUE}.} \item{write_txt}{Logical. If \code{TRUE}, writes results to -\code{parentage_testing_results.txt}. Default is \code{TRUE}.} +\code{output_filename}. The plot will be saved using the same base name +with a .jpg extension. Default is \code{TRUE}.} + +\item{output_filename}{Character. Path/name of the output results file +(default: \code{"parentage_testing_results.txt"}). The plot will be saved +using the same base name with a .jpg extension.} \item{na_string}{Character. String for missing values in output file. Use \code{"NA"} or \code{""} (default: \code{"NA"}).} diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 8ec975b..b4987de 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -13,9 +13,10 @@ validate_pedigree( single_parent_error_threshold = 2, verbose = TRUE, write_results = TRUE, - output_filename = "__validation_report.txt", plot_results = TRUE, - na_string = "NA" + na_string = "NA", + output_filename = "__validation_report.txt", + corrected_pedigree_filename = "corrected_pedigree.txt" ) } \arguments{ @@ -45,16 +46,20 @@ table, and results to the console (default: TRUE).} \item{write_results}{Logical. If TRUE, writes the validation results to output_filename and saves the plot as a .jpg file (default: TRUE).} -\item{output_filename}{Character. Path/name of the output results file -(default: "__validation_report.txt"). The plot will be saved using the -same base name with a .jpg extension.} - \item{plot_results}{Logical. If TRUE, prints a histogram of Trio Mendelian Error percentages with a threshold line (default: TRUE). If write_results is also TRUE, the plot is additionally saved as a .jpg file.} \item{na_string}{Character. String for missing values in the output file. Use \code{"NA"} or \code{""} (default: \code{"NA"}).} + +\item{output_filename}{Character. Path/name of the output results file +(default: "__validation_report.txt"). The plot will be saved using the +same base name with a .jpg extension.} + +\item{corrected_pedigree_filename}{Character. Path/name of the output file +for the corrected pedigree (default: "corrected_pedigree.txt"). Set to +NULL to suppress writing the corrected pedigree.} } \value{ A data.table (returned invisibly) with one row per trio and From 506a47ff1dba98752d1e7910a251b6e5fdd66b8b Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Mon, 27 Apr 2026 15:44:13 -0400 Subject: [PATCH 08/16] added edits suggested by copilot --- R/check_ped.R | 152 +++++++++++++++++++++++------------------------ man/check_ped.Rd | 53 ++++++++--------- 2 files changed, 101 insertions(+), 104 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 0ec5cf2..26e0d8a 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,35 +1,38 @@ -#' Check a pedigree file for structural issues and optionally apply corrections +#' @title Check a pedigree file for structural issues #' -#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and -#' `female_parent` and checks for five classes of issues: missing required -#' columns, exact duplicate rows, repeated IDs with conflicting parent -#' assignments, inconsistent parent sex roles, and pedigree cycles. The input -#' file is never modified. A report list is always assigned to the global -#' environment; when `correct = TRUE`, a corrected pedigree is assigned as well. -#' -#' When `correct = TRUE`, the function removes exact duplicates, collapses -#' conflicting repeated IDs (replacing the inconsistent parent field with `"0"`), -#' and appends missing parents as founder rows. Inconsistent sex roles and -#' pedigree cycles are reported but not corrected and must be resolved manually. +#' @description +#' Reads a tab‑separated pedigree file with columns `id`, +#' `male_parent`, and `female_parent` and checks for five classes of +#' errors: missing required columns, exact duplicate rows, +#' repeated IDs with conflicting parents, inconsistent parent +#' sex roles, and pedigree cycles. The input file is never +#' modified. A report list is always assigned to the global +#' environment; when `correct = TRUE` a corrected pedigree is +#' assigned as well. #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional integer seed for reproducibility. -#' @param verbose Logical. If `TRUE` (default), prints findings to the console. -#' @param correct Logical. If `TRUE` (default), builds and assigns a corrected -#' pedigree to the global environment. If `FALSE`, issues are only reported. -#' @param save_zip Logical. If `TRUE`, bundles report components into a `.zip` -#' archive in the working directory. -#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = TRUE` and -#' `correct = TRUE`), includes the corrected pedigree in the `.zip` archive. +#' @param verbose Logical. If `TRUE` (default), prints findings to the +#' console. +#' @param correct Logical. If `TRUE` (default), builds and assigns a +#' corrected pedigree to the global environment. If `FALSE`, issues +#' are only reported. +#' @param save_zip Logical. If `TRUE`, bundles report components into a +#' `.zip` archive in the working directory. +#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = +#' TRUE` and `correct = TRUE`), includes the corrected pedigree in +#' the `.zip` archive. #' -#' @return An invisible named list of data frames describing detected issues: -#' * `exact_duplicates`: exact duplicate rows. -#' * `repeated_ids_diff`: rows where the same `id` has conflicting parents. -#' * `inconsistent_sex_roles`: rows involving IDs used as both parents. -#' * `missing_parents`: rows referencing parent IDs absent from `id`. -#' * `dependencies`: rows whose `id` is involved in a pedigree cycle. +#' @return An invisible named list of data frames describing detected +#' issues: +#' \item{exact_duplicates}{exact duplicate rows} +#' \item{repeated_ids_diff}{rows where the same `id` has conflicting parents} +#' \item{inconsistent_sex_roles}{rows involving IDs used as both parents} +#' \item{missing_parents}{rows referencing parent IDs absent from `id`} +#' \item{dependencies}{rows whose `id` is involved in a pedigree cycle} #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate +#' summarize first bind_rows n_distinct n select row_number #' @importFrom stats setNames #' @importFrom utils read.table write.table zip capture.output #' @importFrom tools file_path_sans_ext @@ -40,14 +43,11 @@ check_ped <- function(ped.file, correct = TRUE, save_zip = FALSE, save_corrected_zip = FALSE) { - #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) # ── Column alias remapping ────────────────────────────────────────────────── - # Accept common alternative column names and silently remap them so the rest - # of the function always works with the canonical names. col_aliases <- list( id = c("id", "ID", "animal", "Animal", "ind", "Ind"), male_parent = c("male_parent", "sire", "Sire", "father", "Father", "pat", "Pat"), @@ -73,48 +73,47 @@ check_ped <- function(ped.file, ) } data <- data %>% - mutate( + dplyr::mutate( id = as.character(id), male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) + data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) errors <- list() #### check 1: exact duplicates #### exact_duplicates <- data[ - duplicated(data %>% select(-row_number)) | - duplicated(data %>% select(-row_number), fromLast = TRUE), + duplicated(data %>% dplyr::select(-row_number)) | + duplicated(data %>% dplyr::select(-row_number), fromLast = TRUE), ] #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% - group_by(id) %>% - filter(n() > 1) %>% - filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% - ungroup() + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::filter(dplyr::n_distinct(male_parent) > 1 | + dplyr::n_distinct(female_parent) > 1) %>% + dplyr::ungroup() #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% - filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + dplyr::filter(male_parent %in% messy_ids | female_parent %in% messy_ids) #### check 4: missing parents #### all_ids <- unique(data$id) ref_ids <- unique(c(data$male_parent, data$female_parent)) - # Fix #1: trimws() strips whitespace-only strings; the combined filter then - # drops NA (which NA != "0" would not catch), empty strings, and "0" - # to prevent spurious founder rows with id = NA in the corrected pedigree. + + # Trim, remove NAs and zeros ref_ids <- trimws(ref_ids) ref_ids <- ref_ids[!is.na(ref_ids) & ref_ids != "" & ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% - filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + dplyr::filter(male_parent %in% missing_ids | female_parent %in% missing_ids) # Only build founder rows if there are actually missing parents to add if (length(missing_ids) > 0) { @@ -173,7 +172,6 @@ check_ped <- function(ped.file, } return(all_cycles) } - cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -185,7 +183,7 @@ check_ped <- function(ped.file, paste(ids, collapse = " -> "))) } } - dependencies <- data %>% filter(id %in% cycle_ids) + dependencies <- data %>% dplyr::filter(id %in% cycle_ids) if (nrow(dependencies) == 0 && length(errors) > 0) { dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) @@ -202,27 +200,27 @@ check_ped <- function(ped.file, #### build corrected pedigree (no row_number column) #### if (correct) { - corrected <- data %>% select(-row_number) + corrected <- data %>% dplyr::select(-row_number) # Remove exact duplicates - corrected <- distinct(corrected) + corrected <- dplyr::distinct(corrected) # Resolve conflicting IDs if (nrow(repeated_ids_diff) > 0) { corrected <- corrected %>% - group_by(id) %>% - summarize( - male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), - female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), + dplyr::group_by(id) %>% + dplyr::summarize( + male_parent = if (dplyr::n_distinct(male_parent) > 1) "0" else dplyr::first(male_parent), + female_parent = if (dplyr::n_distinct(female_parent) > 1) "0" else dplyr::first(female_parent), .groups = "drop" ) } # Add missing founders if (length(missing_ids) > 0) { - corrected <- bind_rows( + corrected <- dplyr::bind_rows( corrected, - missing_founders %>% select(-row_number) + missing_founders %>% dplyr::select(-row_number) ) } } @@ -253,45 +251,42 @@ check_ped <- function(ped.file, } } - # Always save report to global environment - assign(report_name, input_ped_report, envir = .GlobalEnv) - - # Save corrected pedigree if correct = TRUE - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + # Section headers matching the console output labels + section_labels <- c( + exact_duplicates = "Exact Duplicates", + repeated_ids_diff = "Conflicting IDs", + inconsistent_sex_roles = "Inconsistent Parent Sex Roles", + missing_parents = "Missing Parents (rows that reference them)", + dependencies = "Cycles / Dependencies" + ) - #### zip export #### if (save_zip) { tmp_dir <- tempfile() dir.create(tmp_dir) - # Fix #2: register cleanup so the temp directory is always removed on - # function exit, whether normal or due to an error. on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE) - zip_files <- character(0) - # Section headers matching the console output labels - section_labels <- c( - exact_duplicates = "Exact Duplicates", - repeated_ids_diff = "Conflicting IDs", - inconsistent_sex_roles = "Inconsistent Parent Sex Roles", - missing_parents = "Missing Parents (rows that reference them)", - dependencies = "Cycles / Dependencies" - ) for (component in names(section_labels)) { df <- input_ped_report[[component]] file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) if (nrow(df) == 0) { writeLines( - c(paste0("--- ", section_labels[[component]], " ---"), "None found."), + c(paste0("--- ", section_labels[[component]], " ---"), + "None found."), con = file_path ) } else { file_con <- file(file_path, open = "wt") - writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) - writeLines(paste(colnames(df), collapse = "\t"), con = file_con) + writeLines(paste0("--- ", section_labels[[component]], " ---"), + con = file_con) + writeLines(paste(colnames(df), collapse = "\t"), + con = file_con) close(file_con) - utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, - quote = FALSE, append = TRUE, col.names = FALSE) + utils::write.table(df, file = file_path, sep = "\t", + row.names = FALSE, + quote = FALSE, + append = TRUE, + col.names = FALSE) } zip_files <- c(zip_files, file_path) } @@ -299,7 +294,8 @@ check_ped <- function(ped.file, if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", - row.names = FALSE, quote = FALSE) + row.names = FALSE, + quote = FALSE) zip_files <- c(zip_files, corrected_path) } @@ -310,5 +306,9 @@ check_ped <- function(ped.file, if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } + # Assign objects to the global environment + assign(report_name, input_ped_report, envir = .GlobalEnv) + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + invisible(input_ped_report) } diff --git a/man/check_ped.Rd b/man/check_ped.Rd index bbae7c3..0cf3840 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check a pedigree file for structural issues and optionally apply corrections} +\title{Check a pedigree file for structural issues} \usage{ check_ped( ped.file, @@ -18,38 +18,35 @@ check_ped( \item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the console.} +\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the +console.} -\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a corrected -pedigree to the global environment. If \code{FALSE}, issues are only reported.} +\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a +corrected pedigree to the global environment. If \code{FALSE}, issues +are only reported.} -\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a \code{.zip} -archive in the working directory.} +\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a +\code{.zip} archive in the working directory.} -\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and -\code{correct = TRUE}), includes the corrected pedigree in the \code{.zip} archive.} +\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and \code{correct = TRUE}), includes the corrected pedigree in +the \code{.zip} archive.} } \value{ -An invisible named list of data frames describing detected issues: -\itemize{ -\item \code{exact_duplicates}: exact duplicate rows. -\item \code{repeated_ids_diff}: rows where the same \code{id} has conflicting parents. -\item \code{inconsistent_sex_roles}: rows involving IDs used as both parents. -\item \code{missing_parents}: rows referencing parent IDs absent from \code{id}. -\item \code{dependencies}: rows whose \code{id} is involved in a pedigree cycle. -} +An invisible named list of data frames describing detected +issues: +\item{exact_duplicates}{exact duplicate rows} +\item{repeated_ids_diff}{rows where the same \code{id} has conflicting parents} +\item{inconsistent_sex_roles}{rows involving IDs used as both parents} +\item{missing_parents}{rows referencing parent IDs absent from \code{id}} +\item{dependencies}{rows whose \code{id} is involved in a pedigree cycle} } \description{ -Reads a tab-separated pedigree file with columns \code{id}, \code{male_parent}, and -\code{female_parent} and checks for five classes of issues: missing required -columns, exact duplicate rows, repeated IDs with conflicting parent -assignments, inconsistent parent sex roles, and pedigree cycles. The input -file is never modified. A report list is always assigned to the global -environment; when \code{correct = TRUE}, a corrected pedigree is assigned as well. -} -\details{ -When \code{correct = TRUE}, the function removes exact duplicates, collapses -conflicting repeated IDs (replacing the inconsistent parent field with \code{"0"}), -and appends missing parents as founder rows. Inconsistent sex roles and -pedigree cycles are reported but not corrected and must be resolved manually. +Reads a tab‑separated pedigree file with columns \code{id}, +\code{male_parent}, and \code{female_parent} and checks for five classes of +errors: missing required columns, exact duplicate rows, +repeated IDs with conflicting parents, inconsistent parent +sex roles, and pedigree cycles. The input file is never +modified. A report list is always assigned to the global +environment; when \code{correct = TRUE} a corrected pedigree is +assigned as well. } From 977e0702fb52e430d2147333c352dbfe97cd0101 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 28 Apr 2026 11:15:05 -0400 Subject: [PATCH 09/16] modified default params --- R/check_ped.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 26e0d8a..6fb77ef 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -41,8 +41,8 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE, correct = TRUE, - save_zip = FALSE, - save_corrected_zip = FALSE) { + save_zip = TRUE, + save_corrected_zip = TRUE) { #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) From 5db5fa29477c69d830873ca6434671cf00b60d55 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 28 Apr 2026 13:13:50 -0400 Subject: [PATCH 10/16] Added authors fror functions and made headers concise --- NAMESPACE | 8 +- R/breedtools_functions.R | 282 +++++++++++++++------------------- R/check_ped.R | 225 ++++++++++++++------------- R/find_parentage.R | 2 + R/imputation_concordance.R | 95 +++++------- R/validate_pedigree.R | 3 + man/allele_freq_poly.Rd | 49 +++--- man/check_ped.Rd | 78 +++++++--- man/find_parentage.Rd | 3 + man/imputation_concordance.Rd | 82 +++++----- man/solve_composition_poly.Rd | 61 ++++---- man/validate_pedigree.Rd | 3 + 12 files changed, 433 insertions(+), 458 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4e518d1..c460be1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,6 +62,13 @@ importFrom(dplyr,summarise) importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) +importFrom(ggplot2,aes) +importFrom(ggplot2,element_text) +importFrom(ggplot2,geom_bar) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,labs) +importFrom(ggplot2,theme) +importFrom(ggplot2,theme_minimal) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) @@ -73,7 +80,6 @@ importFrom(stats,reorder) importFrom(stats,setNames) importFrom(tibble,as_tibble) importFrom(tools,file_path_sans_ext) -importFrom(utils,capture.output) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/R/breedtools_functions.R b/R/breedtools_functions.R index 5f21836..52a2d85 100644 --- a/R/breedtools_functions.R +++ b/R/breedtools_functions.R @@ -1,149 +1,129 @@ -#' Compute Allele Frequencies for Populations +#' Compute allele frequencies for populations #' -#' Computes allele frequencies for specified populations given SNP array data +#' Computes allele frequencies for specified populations from SNP array data +#' coded as dosage of allele B. #' -#' @param geno matrix of genotypes coded as the dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' with individuals in rows (named) and SNPs in columns (named) -#' @param populations list of named populations. Each population has a vector of IDs -#' that belong to the population. Allele frequencies will be derived from all animals -#' @param ploidy integer indicating the ploidy level (default is 2 for diploid) -#' @return data.frame consisting of allele_frequencies for populations (columns) for -#' each SNP (rows) -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +#' @param geno Numeric matrix of genotypes coded as dosage of allele B +#' \code{{0, 1, 2, ..., ploidy}}, with individuals in rows (named) and +#' SNPs in columns (named). +#' @param populations Named list of populations, each containing a character +#' vector of individual IDs belonging to that population. +#' @param ploidy Integer. Ploidy level of the species. Default is `2`. +#' +#' @return A data frame of allele frequencies with SNPs as rows and populations +#' as columns. +#' +#' @author Josué Chinchilla-Vargas +#' +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. #' #' @examples -#' # Example inputs #' geno_matrix <- matrix( -#' c(4, 1, 4, 0, # S1 -#' 2, 2, 1, 3, # S2 -#' 0, 4, 0, 4, # S3 -#' 3, 3, 2, 2, # S4 -#' 1, 4, 2, 3),# S5 -#' nrow = 4, ncol = 5, byrow = FALSE, # individuals=rows, SNPs=cols -#' dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) +#' c(4, 1, 4, 0, +#' 2, 2, 1, 3, +#' 0, 4, 0, 4, +#' 3, 3, 2, 2, +#' 1, 4, 2, 3), +#' nrow = 4, ncol = 5, byrow = FALSE, +#' dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) #' ) -#' -#'pop_list <- list( -#' PopA = c("Ind1", "Ind2"), -#' PopB = c("Ind3", "Ind4") +#' pop_list <- list( +#' PopA = c("Ind1", "Ind2"), +#' PopB = c("Ind3", "Ind4") #' ) -#' #' allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) #' print(allele_freqs) #' #' @export allele_freq_poly <- function(geno, populations, ploidy = 2) { - - # Initialize returned df + # Initialize result matrix X <- matrix(NA, nrow = ncol(geno), ncol = length(populations)) - - # Subset geno into different populations for (i in 1:length(populations)) { - - # Get name of ith item in the list (population name) pop_name <- names(populations[i]) - - # Subset geno to only include genotypes of IDs in pop - pop_geno <- geno[rownames(geno) %in% populations[[i]], ] - - # Calculate allele frequencies - al_freq <- colMeans(pop_geno, na.rm = TRUE) / ploidy - - # Add to X - X[, i] <- al_freq + pop_geno <- geno[base::rownames(geno) %in% populations[[i]], ] + al_freq <- base::colMeans(pop_geno, na.rm = TRUE) / ploidy + X[, i] <- al_freq } - - # Label X with populations and SNPs - colnames(X) <- names(populations) - rownames(X) <- colnames(geno) - + base::colnames(X) <- base::names(populations) + base::rownames(X) <- base::colnames(geno) return(X) } +#' Solve breed composition for a single animal via quadratic programming +#' +#' Internal helper that solves the constrained OLS problem for one animal, +#' returning breed proportion estimates and the R² of the fit. #' -#' Performs whole genome breed composition prediction. +#' @param Y Numeric vector of genotypes (named by SNP) for a single animal, +#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}. +#' @param X Numeric matrix of allele frequencies with SNPs as rows and +#' breeds/populations as columns. +#' +#' @return A named numeric vector of breed proportions plus `R2`. +#' +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. #' -#' @param Y numeric vector of genotypes (with names as SNPs) from a single animal. -#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' @param X numeric matrix of allele frequencies from reference animals -#' @param p numeric indicating number of breeds represented in X -#' @param names character names of breeds -#' @return data.frame of breed composition estimates #' @import quadprog #' @importFrom stats cor -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. #' #' @noRd QPsolve <- function(Y, X) { - - # Remove NAs from Y and remove corresponding - # SNPs from X. Ensure Y is numeric - Ymod <- Y[!is.na(Y)] - Xmod <- X[names(Ymod), ] - - # Determine properties from X matrix - the number of parameters (breeds) p - # and the names of those parameters. - p <- ncol(X) - names <- colnames(X) - - # perfom steps needed to solve OLS by framing - # as a QP problem - # Rinv - matrix should be of dimensions px(p+1) where p is the number of variables in X - Rinv <- solve(chol(t(Xmod) %*% Xmod)) - - # C - the first column is a sum restriction (all equal to 1) and the rest of the columns an identity matrix - C <- cbind(rep(1, p), diag(p)) - - # b2 - This should be a vector of length p+1 the first element is the value of the sum (1) - # the other elements are the restriction of individual coefficients (>) - # so a value 0 produces positive coefficients - b2 <- c(1, rep(0, p)) - - # dd - this should be a matrix NOT a vector - dd <- (t(Ymod) %*% Xmod) - - qp <- solve.QP(Dmat = Rinv, factorized = TRUE, dvec = dd, Amat = C, bvec = b2, meq = 1) - beta <- qp$solution - rr <- cor(Ymod, Xmod %*% beta)^2 + Ymod <- Y[!base::is.na(Y)] + Xmod <- X[base::names(Ymod), ] + p <- base::ncol(X) + nms <- base::colnames(X) + Rinv <- base::solve(base::chol(base::t(Xmod) %*% Xmod)) + C <- base::cbind(base::rep(1, p), base::diag(p)) + b2 <- c(1, base::rep(0, p)) + dd <- base::t(Ymod) %*% Xmod + qp <- quadprog::solve.QP(Dmat = Rinv, factorized = TRUE, dvec = dd, + Amat = C, bvec = b2, meq = 1) + beta <- qp$solution + rr <- stats::cor(Ymod, Xmod %*% beta)^2 result <- c(beta, rr) - names(result) <- c(names, "R2") + base::names(result) <- c(nms, "R2") return(result) } -#' Compute Genome-Wide Breed Composition -#' -#' Computes genome-wide breed/ancestry composition using quadratic programming on a -#' batch of animals. -#' -#' @param Y numeric matrix of genotypes (columns) from all animals (rows) in population -#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' @param X numeric matrix of allele frequencies (rows) from each reference panel (columns). Frequencies are -#' relative to allele B. -#' @param ped data.frame giving pedigree information. Must be formatted "ID", "Sire", "Dam" -#' @param groups list of IDs categorized by breed/population. If specified, output will be a list -#' of results categorized by breed/population. -#' @param mia logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing the inferred maternally inherited allele for each locus for each animal instead -#' of breed composition results. -#' @param sire logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing sire genotypes for each locus for each animal instead of breed composition results. -#' @param dam logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing dam genotypes for each locus for each animal instead of breed composition results. -#' @param ploidy integer. The ploidy level of the species (e.g., 2 for diploid, 3 for triploid, etc.). -#' @return A data.frame or list of data.frames (if groups is !NULL) with breed/ancestry composition -#' results -#' @import quadprog -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +#' Compute genome-wide breed composition #' -#' @examples -#' # Example inputs for solve_composition_poly (ploidy = 4) +#' Estimates genome-wide breed/ancestry composition for a batch of animals +#' using quadratic programming, with optional pedigree-assisted and +#' grouped-output modes. +#' +#' @param Y Numeric matrix of genotypes with individuals as rows and SNPs as +#' columns, coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}. +#' @param X Numeric matrix of allele frequencies with SNPs as rows and +#' breeds/populations as columns. +#' @param ped Optional data frame with pedigree information formatted with +#' columns `ID`, `Sire`, and `Dam`. When supplied, `QPsolve_par` is used +#' and only animals with genotyped parents are included. +#' @param groups Optional named list of IDs grouped by breed/population. +#' When supplied, results are returned as a named list partitioned by group. +#' @param mia Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' the inferred maternally inherited allele per locus per animal. Default `FALSE`. +#' @param sire Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' sire genotypes per locus per animal. Default `FALSE`. +#' @param dam Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' dam genotypes per locus per animal. Default `FALSE`. +#' @param ploidy Integer. Ploidy level of the species. Default is `2`. +#' +#' @return A data frame, or a named list of data frames when `groups` is +#' supplied, containing breed/ancestry composition estimates. +#' +#' @author Josué Chinchilla-Vargas #' -#' # (This would typically be the output from allele_freq_poly) +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. +#' +#' @examples #' allele_freqs_matrix <- matrix( #' c(0.625, 0.500, #' 0.500, 0.500, @@ -153,70 +133,50 @@ QPsolve <- function(Y, X) { #' nrow = 5, ncol = 2, byrow = TRUE, #' dimnames = list(paste0("SNP", 1:5), c("VarA", "VarB")) #' ) -#' -#' # Validation Genotypes (individuals x SNPs) #' val_geno_matrix <- matrix( -#' c(2, 1, 2, 3, 4, # Test1 dosages for SNP1-5 -#' 3, 4, 2, 3, 0), # Test2 dosages for SNP1-5 +#' c(2, 1, 2, 3, 4, +#' 3, 4, 2, 3, 0), #' nrow = 2, ncol = 5, byrow = TRUE, #' dimnames = list(paste0("Test", 1:2), paste0("SNP", 1:5)) #' ) -#' -#' # Calculate Breed Composition #' composition <- solve_composition_poly(Y = val_geno_matrix, #' X = allele_freqs_matrix, #' ploidy = 4) #' print(composition) #' +#' @import quadprog +#' #' @export solve_composition_poly <- function(Y, X, - ped = NULL, + ped = NULL, groups = NULL, - mia = FALSE, - sire = FALSE, - dam = FALSE, + mia = FALSE, + sire = FALSE, + dam = FALSE, ploidy = 2) { + # Transpose so Y is SNPs x animals, as required internally + Y <- base::t(Y) + # Retain only SNPs present in X + Y <- Y[base::rownames(Y) %in% base::rownames(X), ] + + if (!base::is.null(ped)) { + # Pedigree-assisted mode: use QPsolve_par for animals with genotyped parents + mat_results <- base::lapply(base::colnames(Y), QPsolve_par, Y, X, ped, + mia = mia, sire = sire, dam = dam) + mat_results_tab <- base::do.call(rbind, mat_results) + return(mat_results_tab) + + } else if (!base::is.null(groups)) { + # Grouped mode: standard computation, results partitioned by group + Y <- Y / ploidy + grouped_results <- base::lapply(groups, QPseparate, Y, X) + return(grouped_results) - # Functions require Y to be animals x SNPs. Transpose - Y <- t(Y) - - # SNPs in Y should only be the ones present in X - Y <- Y[rownames(Y) %in% rownames(X), ] - - # If ped is supplied, use QPsolve_par to compute genomic composition using - # only animals who have genotyped parents (by incorporating Sire genotype). - if (!is.null(ped)) { - mat_results <- lapply(colnames(Y), - QPsolve_par, - Y, - X, - ped, - mia = mia, - sire = sire, - dam = dam) - - mat_results_tab <- do.call(rbind, mat_results) - return (mat_results_tab) - - # Else if groups supplied - perform regular genomic computation - # and list results by groups - } else if (!is.null(groups)) { - - # When using regular genomic computation - adjust dosage based on ploidy - Y <- Y / ploidy #(default is 2) - - grouped_results <- lapply(groups, QPseparate, Y, X) - return (grouped_results) - - # If neither using the ped or grouping option - just perform normal, unsegregated - # calculation } else { - - # Adjust dosage based on ploidy (default is 2) - Y <- Y / ploidy - - results <- t(apply(Y, 2, QPsolve, X)) - return (results) + # Standard mode: unsegregated computation across all animals + Y <- Y / ploidy + results <- base::t(base::apply(Y, 2, QPsolve, X)) + return(results) } } diff --git a/R/check_ped.R b/R/check_ped.R index 6fb77ef..847ebe1 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,67 +1,74 @@ -#' @title Check a pedigree file for structural issues +#' Check and optionally correct a pedigree file #' -#' @description -#' Reads a tab‑separated pedigree file with columns `id`, -#' `male_parent`, and `female_parent` and checks for five classes of -#' errors: missing required columns, exact duplicate rows, -#' repeated IDs with conflicting parents, inconsistent parent -#' sex roles, and pedigree cycles. The input file is never -#' modified. A report list is always assigned to the global -#' environment; when `correct = TRUE` a corrected pedigree is -#' assigned as well. +#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and +#' `female_parent` (any column order), performs structural and consistency checks, +#' reports findings, and optionally builds a partially corrected pedigree object. +#' +#' The input file is never modified. Findings are printed to the console when +#' `verbose = TRUE`, a report list is assigned to the global environment, and when +#' `correct = TRUE`, a corrected pedigree data frame is also assigned there. +#' +#' @section Checks performed: +#' \describe{ +#' \item{Missing required columns}{Stops immediately if `id`, `male_parent`, or +#' `female_parent` are absent.} +#' \item{Exact duplicate rows}{Rows identical across all three fields. Removed +#' when `correct = TRUE`.} +#' \item{Repeated IDs with conflicting parents}{Same `id` with differing parent +#' assignments. Collapsed to one row when `correct = TRUE`; conflicting fields +#' are set to `"0"`.} +#' \item{Inconsistent sex roles}{IDs appearing as both `male_parent` and +#' `female_parent`. Reported only; no automatic correction.} +#' \item{Missing parents}{Parent IDs not listed as individuals. Reported, and +#' added as founder rows (`"0"/"0"`) when `correct = TRUE`.} +#' \item{Cycles}{Direct or indirect ancestry loops. Reported only; must be +#' resolved manually.} +#' } +#' +#' @section Correction behavior: +#' When `correct = TRUE`, the function removes exact duplicates, collapses +#' conflicting repeated IDs (replacing inconsistent parent fields with `"0"`), +#' and appends missing parents as founders. Inconsistent sex roles and cycles are +#' not automatically corrected. Running the function again on the corrected +#' pedigree is recommended to reassess downstream issues. #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional integer seed for reproducibility. -#' @param verbose Logical. If `TRUE` (default), prints findings to the -#' console. -#' @param correct Logical. If `TRUE` (default), builds and assigns a -#' corrected pedigree to the global environment. If `FALSE`, issues -#' are only reported. -#' @param save_zip Logical. If `TRUE`, bundles report components into a -#' `.zip` archive in the working directory. -#' @param save_corrected_zip Logical. If `TRUE` (requires `save_zip = -#' TRUE` and `correct = TRUE`), includes the corrected pedigree in -#' the `.zip` archive. +#' @param verbose Logical. Print findings to the console? Default `TRUE`. +#' @param correct Logical. Build and save a corrected pedigree? Default `TRUE`. +#' @param save_zip Logical. Export report components as a `.zip` archive? Default `FALSE`. +#' @param save_corrected_zip Logical. Include corrected pedigree in the `.zip`? +#' Only applies when `save_zip = TRUE` and `correct = TRUE`. Default `FALSE`. #' -#' @return An invisible named list of data frames describing detected -#' issues: -#' \item{exact_duplicates}{exact duplicate rows} -#' \item{repeated_ids_diff}{rows where the same `id` has conflicting parents} -#' \item{inconsistent_sex_roles}{rows involving IDs used as both parents} -#' \item{missing_parents}{rows referencing parent IDs absent from `id`} -#' \item{dependencies}{rows whose `id` is involved in a pedigree cycle} +#' @return An invisible named list of data frames: +#' \describe{ +#' \item{exact_duplicates}{Exact duplicate rows.} +#' \item{repeated_ids_diff}{Rows with the same `id` but conflicting parent assignments.} +#' \item{inconsistent_sex_roles}{Rows involving IDs used as both male and female parent.} +#' \item{missing_parents}{Rows referencing parent IDs absent from `id`.} +#' \item{dependencies}{Rows whose `id` is involved in a detected cycle.} +#' } #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate -#' summarize first bind_rows n_distinct n select row_number +#' @author Josué Chinchilla-Vargas +#' +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first +#' bind_rows n_distinct n select row_number #' @importFrom stats setNames -#' @importFrom utils read.table write.table zip capture.output +#' @importFrom utils read.table write.table zip #' @importFrom tools file_path_sans_ext #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE, correct = TRUE, - save_zip = TRUE, - save_corrected_zip = TRUE) { + save_zip = FALSE, + save_corrected_zip = FALSE) { + + #### setup #### #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) - # ── Column alias remapping ────────────────────────────────────────────────── - col_aliases <- list( - id = c("id", "ID", "animal", "Animal", "ind", "Ind"), - male_parent = c("male_parent", "sire", "Sire", "father", "Father", "pat", "Pat"), - female_parent = c("female_parent", "dam", "Dam", "mother", "Mother", "mat", "Mat") - ) - for (canonical in names(col_aliases)) { - if (!canonical %in% colnames(data)) { - match <- intersect(col_aliases[[canonical]], colnames(data)) - if (length(match) > 0) { - colnames(data)[colnames(data) == match[1]] <- canonical - } - } - } - # Validate required column names before any processing required_cols <- c("id", "male_parent", "female_parent") missing_cols <- setdiff(required_cols, colnames(data)) @@ -72,48 +79,47 @@ check_ped <- function(ped.file, ".\nExpected columns: id, male_parent, female_parent." ) } + data <- data %>% - dplyr::mutate( + mutate( id = as.character(id), male_parent = as.character(male_parent), female_parent = as.character(female_parent) ) - # Add row numbers before any processing - data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) + data <- data %>% mutate(row_number = row_number(), .before = id) + errors <- list() + # Add row numbers before any processing + data <- data %>% mutate(row_number = row_number(), .before = id) errors <- list() #### check 1: exact duplicates #### exact_duplicates <- data[ - duplicated(data %>% dplyr::select(-row_number)) | - duplicated(data %>% dplyr::select(-row_number), fromLast = TRUE), + duplicated(data %>% select(-row_number)) | + duplicated(data %>% select(-row_number), fromLast = TRUE), ] #### check 2: repeated IDs with conflicting male_parent/female_parent #### repeated_ids_diff <- data %>% - dplyr::group_by(id) %>% - dplyr::filter(dplyr::n() > 1) %>% - dplyr::filter(dplyr::n_distinct(male_parent) > 1 | - dplyr::n_distinct(female_parent) > 1) %>% - dplyr::ungroup() + group_by(id) %>% + filter(n() > 1) %>% + filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% + ungroup() #### check 3: inconsistent parent sex roles #### male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) messy_ids <- intersect(male_ids, female_ids) inconsistent_sex_roles <- data %>% - dplyr::filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + filter(male_parent %in% messy_ids | female_parent %in% messy_ids) #### check 4: missing parents #### - all_ids <- unique(data$id) - ref_ids <- unique(c(data$male_parent, data$female_parent)) - - # Trim, remove NAs and zeros - ref_ids <- trimws(ref_ids) - ref_ids <- ref_ids[!is.na(ref_ids) & ref_ids != "" & ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) + all_ids <- unique(data$id) + ref_ids <- unique(c(data$male_parent, data$female_parent)) + ref_ids <- ref_ids[ref_ids != "0"] + missing_ids <- setdiff(ref_ids, all_ids) missing_parents <- data %>% - dplyr::filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + filter(male_parent %in% missing_ids | female_parent %in% missing_ids) # Only build founder rows if there are actually missing parents to add if (length(missing_ids) > 0) { @@ -141,7 +147,6 @@ check_ped <- function(ped.file, c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id - dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE @@ -160,7 +165,6 @@ check_ped <- function(ped.file, rec_stack[node] <- FALSE return(cycles) } - visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() @@ -172,6 +176,7 @@ check_ped <- function(ped.file, } return(all_cycles) } + cycles <- detect_all_cycles(data) cycle_ids <- character(0) if (length(cycles) > 0) { @@ -183,7 +188,7 @@ check_ped <- function(ped.file, paste(ids, collapse = " -> "))) } } - dependencies <- data %>% dplyr::filter(id %in% cycle_ids) + dependencies <- data %>% filter(id %in% cycle_ids) if (nrow(dependencies) == 0 && length(errors) > 0) { dependencies <- data.frame(Dependency = unique(unlist(errors)), stringsAsFactors = FALSE) @@ -200,27 +205,24 @@ check_ped <- function(ped.file, #### build corrected pedigree (no row_number column) #### if (correct) { - corrected <- data %>% dplyr::select(-row_number) - + corrected <- data %>% select(-row_number) # Remove exact duplicates - corrected <- dplyr::distinct(corrected) - + corrected <- distinct(corrected) # Resolve conflicting IDs if (nrow(repeated_ids_diff) > 0) { corrected <- corrected %>% - dplyr::group_by(id) %>% - dplyr::summarize( - male_parent = if (dplyr::n_distinct(male_parent) > 1) "0" else dplyr::first(male_parent), - female_parent = if (dplyr::n_distinct(female_parent) > 1) "0" else dplyr::first(female_parent), + group_by(id) %>% + summarize( + male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), + female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), .groups = "drop" ) } - # Add missing founders if (length(missing_ids) > 0) { - corrected <- dplyr::bind_rows( + corrected <- bind_rows( corrected, - missing_founders %>% dplyr::select(-row_number) + missing_founders %>% select(-row_number) ) } } @@ -230,7 +232,6 @@ check_ped <- function(ped.file, corrected_name <- paste0(file_base, "_corrected") report_name <- paste0(file_base, "_report") zip_name <- paste0(file_base, "_report.zip") - #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") @@ -245,70 +246,68 @@ check_ped <- function(ped.file, cat("\n--- Cycles / Dependencies ---\n") if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") if (correct) { - cat(paste0("`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) + cat(paste0("\n`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) } else { - cat("`correct = FALSE`: no changes made to the pedigree.\n") + cat("\n`correct = FALSE`: no changes made to the pedigree.\n") } } - # Section headers matching the console output labels - section_labels <- c( - exact_duplicates = "Exact Duplicates", - repeated_ids_diff = "Conflicting IDs", - inconsistent_sex_roles = "Inconsistent Parent Sex Roles", - missing_parents = "Missing Parents (rows that reference them)", - dependencies = "Cycles / Dependencies" - ) + # Always save report to global environment + assign(report_name, input_ped_report, envir = .GlobalEnv) + # Save corrected pedigree if correct = TRUE + if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) + #### zip export #### if (save_zip) { - tmp_dir <- tempfile() + tmp_dir <- tempfile() dir.create(tmp_dir) - on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE) zip_files <- character(0) + # Section headers matching the console output labels + section_labels <- c( + exact_duplicates = "Exact Duplicates", + repeated_ids_diff = "Conflicting IDs", + inconsistent_sex_roles = "Inconsistent Parent Sex Roles", + missing_parents = "Missing Parents (rows that reference them)", + dependencies = "Cycles / Dependencies" + ) + for (component in names(section_labels)) { df <- input_ped_report[[component]] file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) + if (nrow(df) == 0) { + # Write header + "None found." for clean empty reports writeLines( - c(paste0("--- ", section_labels[[component]], " ---"), - "None found."), + c(paste0("--- ", section_labels[[component]], " ---"), "None found."), con = file_path ) } else { + # Write header, then column names, then data — avoids col.names warning file_con <- file(file_path, open = "wt") - writeLines(paste0("--- ", section_labels[[component]], " ---"), - con = file_con) - writeLines(paste(colnames(df), collapse = "\t"), - con = file_con) + writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) + writeLines(paste(colnames(df), collapse = "\t"), con = file_con) close(file_con) - utils::write.table(df, file = file_path, sep = "\t", - row.names = FALSE, - quote = FALSE, - append = TRUE, - col.names = FALSE) + utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, + quote = FALSE, append = TRUE, col.names = FALSE) } + zip_files <- c(zip_files, file_path) } + # Optionally include corrected pedigree in zip if (save_corrected_zip && correct) { corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) utils::write.table(corrected, file = corrected_path, sep = "\t", - row.names = FALSE, - quote = FALSE) + row.names = FALSE, quote = FALSE) zip_files <- c(zip_files, corrected_path) } + # Bundle all files into zip in the working directory zip_path <- file.path(getwd(), zip_name) - invisible(capture.output( - utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") - )) + utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) } - # Assign objects to the global environment - assign(report_name, input_ped_report, envir = .GlobalEnv) - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) - invisible(input_ped_report) } diff --git a/R/find_parentage.R b/R/find_parentage.R index 410981b..b02477c 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -90,6 +90,8 @@ #' ) #' } #' +#' @author Josué Chinchilla-Vargas +#' #' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 918071a..7ebbdcc 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -1,79 +1,60 @@ -#' Calculate Concordance between Imputed and Reference Genotypes -#' -#' This function calculates the concordance between imputed and reference -#' genotypes. It assumes that samples are rows and markers are columns. -#' Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. -#' Missing data in either dataset can be excluded from the concordance calculation -#' using the `missing_code` argument. Specific markers can be excluded using -#' the `snps_2_exclude` argument. -#' -#' @param reference_genos A data frame containing reference genotype data, -#' with rows as samples and columns as markers. Must include a column named `ID`. -#' -#' @param imputed_genos A data frame containing imputed genotype data, -#' with rows as samples and columns as markers. Must include a column named `ID`. -#' -#' @param missing_code Optional value specifying missing data. If provided, -#' loci with this value in either dataset will be excluded from the concordance calculation. -#' -#' @param snps_2_exclude Optional vector of marker IDs to exclude from the concordance calculation. -#' -#' @param verbose Logical. If `TRUE`, prints summary statistics (minimum, quartiles, -#' median, mean, maximum) of concordance percentages. -#' -#' @param plot Logical. If `TRUE`, produces a bar plot of concordance percentage -#' by sample. -#' -#' @param print_result Logical. If `TRUE` (default), prints the concordance -#' results data frame to the console. If `FALSE`, results are returned invisibly. -#' -#' @return A data frame with: -#' \itemize{ -#' \item \code{ID}: Sample identifiers shared between the datasets. -#' \item \code{Concordance}: Percentage of matching genotypes per sample. +#' Calculate concordance between imputed and reference genotypes +#' +#' Compares imputed and reference genotype datasets sample by sample, returning +#' the percentage of matching genotypes per sample. Expects samples as rows and +#' markers as columns, with allele dosages (0, 1, 2) recommended. Other numeric +#' formats are supported. +#' +#' @param reference_genos A data frame of reference genotypes (samples × markers) +#' with a column named `ID`. +#' @param imputed_genos A data frame of imputed genotypes (samples × markers) +#' with a column named `ID`. +#' @param missing_code Optional value indicating missing data. Loci carrying this +#' value in either dataset are excluded from the concordance calculation. +#' @param snps_2_exclude Optional character vector of marker names to exclude. +#' @param verbose Logical. Print a five-number summary of concordance? Default `FALSE`. +#' @param plot Logical. Produce a bar plot of concordance by sample? Default `FALSE`. +#' @param print_result Logical. Print the results data frame to the console? +#' Default `TRUE`. If `FALSE`, results are returned invisibly. +#' +#' @return A data frame with columns: +#' \describe{ +#' \item{ID}{Sample identifiers shared between the two datasets.} +#' \item{Concordance}{Percentage of matching genotypes per sample.} #' } -#' If \code{print_result = FALSE}, the data frame is returned invisibly. #' #' @details -#' The function: -#' \enumerate{ -#' \item Identifies common samples and markers between the datasets. -#' \item Optionally excludes specified SNPs. -#' \item Removes loci with missing data (if \code{missing_code} is provided). -#' \item Computes per-sample concordance as the percentage of matching genotypes. -#' } -#' -#' When \code{plot = TRUE}, a bar plot showing concordance percentage per sample -#' is generated using \pkg{ggplot2}. -#' -#' @import dplyr +#' The function identifies common samples and markers between the two datasets, +#' optionally removes specified SNPs and loci with missing data, then computes +#' per-sample concordance as the percentage of matching genotypes over valid loci. +#' When `plot = TRUE`, a bar plot is produced with \pkg{ggplot2}. #' #' @examples #' ref <- data.frame( -#' ID = c("S1", "S2", "S3"), +#' ID = c("S1", "S2", "S3"), #' SNP1 = c(0, 1, 2), #' SNP2 = c(1, 1, 0), #' SNP3 = c(2, 5, 1) #' ) -#' -#' test <- data.frame( -#' ID = c("S1", "S2", "S3"), +#' imp <- data.frame( +#' ID = c("S1", "S2", "S3"), #' SNP1 = c(0, 0, 2), #' SNP2 = c(1, 1, 1), #' SNP3 = c(2, 5, 0) #' ) -#' -#' result <- imputation_concordance( +#' imputation_concordance( #' reference_genos = ref, -#' imputed_genos = test, -#' snps_2_exclude = "SNP2", -#' missing_code = 5, -#' print_result = FALSE +#' imputed_genos = imp, +#' snps_2_exclude = "SNP2", +#' missing_code = 5, +#' print_result = FALSE #' ) #' -#' result +#' @author Josué Chinchilla-Vargas #' +#' @importFrom dplyr %>% filter arrange #' @importFrom stats reorder +#' @importFrom ggplot2 ggplot aes geom_bar labs theme_minimal theme element_text #' @export imputation_concordance <- function(reference_genos, imputed_genos, diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 8784914..c30491f 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -60,6 +60,9 @@ #' \item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} #' } #' @export +#' +#' @author Josué Chinchilla-Vargas +#' #' @importFrom data.table fread fwrite copy data.table set rbindlist validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, diff --git a/man/allele_freq_poly.Rd b/man/allele_freq_poly.Rd index 407d158..6d56f79 100644 --- a/man/allele_freq_poly.Rd +++ b/man/allele_freq_poly.Rd @@ -2,48 +2,51 @@ % Please edit documentation in R/breedtools_functions.R \name{allele_freq_poly} \alias{allele_freq_poly} -\title{Compute Allele Frequencies for Populations} +\title{Compute allele frequencies for populations} \usage{ allele_freq_poly(geno, populations, ploidy = 2) } \arguments{ -\item{geno}{matrix of genotypes coded as the dosage of allele B \code{{0, 1, 2, ..., ploidy}} -with individuals in rows (named) and SNPs in columns (named)} +\item{geno}{Numeric matrix of genotypes coded as dosage of allele B +\code{{0, 1, 2, ..., ploidy}}, with individuals in rows (named) and +SNPs in columns (named).} -\item{populations}{list of named populations. Each population has a vector of IDs -that belong to the population. Allele frequencies will be derived from all animals} +\item{populations}{Named list of populations, each containing a character +vector of individual IDs belonging to that population.} -\item{ploidy}{integer indicating the ploidy level (default is 2 for diploid)} +\item{ploidy}{Integer. Ploidy level of the species. Default is \code{2}.} } \value{ -data.frame consisting of allele_frequencies for populations (columns) for -each SNP (rows) +A data frame of allele frequencies with SNPs as rows and populations +as columns. } \description{ -Computes allele frequencies for specified populations given SNP array data +Computes allele frequencies for specified populations from SNP array data +coded as dosage of allele B. } \examples{ -# Example inputs geno_matrix <- matrix( -c(4, 1, 4, 0, # S1 - 2, 2, 1, 3, # S2 - 0, 4, 0, 4, # S3 - 3, 3, 2, 2, # S4 - 1, 4, 2, 3),# S5 -nrow = 4, ncol = 5, byrow = FALSE, # individuals=rows, SNPs=cols -dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) + c(4, 1, 4, 0, + 2, 2, 1, 3, + 0, 4, 0, 4, + 3, 3, 2, 2, + 1, 4, 2, 3), + nrow = 4, ncol = 5, byrow = FALSE, + dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) ) - pop_list <- list( -PopA = c("Ind1", "Ind2"), -PopB = c("Ind3", "Ind4") + PopA = c("Ind1", "Ind2"), + PopB = c("Ind3", "Ind4") ) - allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) print(allele_freqs) } \references{ -Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +Estimation of genome-wide and locus-specific breed composition in pigs. +\emph{Transl Anim Sci.} 2017;1(1):36–44. +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 0cf3840..8df46e8 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check a pedigree file for structural issues} +\title{Check and optionally correct a pedigree file} \usage{ check_ped( ped.file, @@ -18,35 +18,63 @@ check_ped( \item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. If \code{TRUE} (default), prints findings to the -console.} +\item{verbose}{Logical. Print findings to the console? Default \code{TRUE}.} -\item{correct}{Logical. If \code{TRUE} (default), builds and assigns a -corrected pedigree to the global environment. If \code{FALSE}, issues -are only reported.} +\item{correct}{Logical. Build and save a corrected pedigree? Default \code{TRUE}.} -\item{save_zip}{Logical. If \code{TRUE}, bundles report components into a -\code{.zip} archive in the working directory.} +\item{save_zip}{Logical. Export report components as a \code{.zip} archive? Default \code{FALSE}.} -\item{save_corrected_zip}{Logical. If \code{TRUE} (requires \code{save_zip = TRUE} and \code{correct = TRUE}), includes the corrected pedigree in -the \code{.zip} archive.} +\item{save_corrected_zip}{Logical. Include corrected pedigree in the \code{.zip}? +Only applies when \code{save_zip = TRUE} and \code{correct = TRUE}. Default \code{FALSE}.} } \value{ -An invisible named list of data frames describing detected -issues: -\item{exact_duplicates}{exact duplicate rows} -\item{repeated_ids_diff}{rows where the same \code{id} has conflicting parents} -\item{inconsistent_sex_roles}{rows involving IDs used as both parents} -\item{missing_parents}{rows referencing parent IDs absent from \code{id}} -\item{dependencies}{rows whose \code{id} is involved in a pedigree cycle} +An invisible named list of data frames: +\describe{ +\item{exact_duplicates}{Exact duplicate rows.} +\item{repeated_ids_diff}{Rows with the same \code{id} but conflicting parent assignments.} +\item{inconsistent_sex_roles}{Rows involving IDs used as both male and female parent.} +\item{missing_parents}{Rows referencing parent IDs absent from \code{id}.} +\item{dependencies}{Rows whose \code{id} is involved in a detected cycle.} +} } \description{ -Reads a tab‑separated pedigree file with columns \code{id}, -\code{male_parent}, and \code{female_parent} and checks for five classes of -errors: missing required columns, exact duplicate rows, -repeated IDs with conflicting parents, inconsistent parent -sex roles, and pedigree cycles. The input file is never -modified. A report list is always assigned to the global -environment; when \code{correct = TRUE} a corrected pedigree is -assigned as well. +Reads a tab-separated pedigree file with columns \code{id}, \code{male_parent}, and +\code{female_parent} (any column order), performs structural and consistency checks, +reports findings, and optionally builds a partially corrected pedigree object. +} +\details{ +The input file is never modified. Findings are printed to the console when +\code{verbose = TRUE}, a report list is assigned to the global environment, and when +\code{correct = TRUE}, a corrected pedigree data frame is also assigned there. +} +\section{Checks performed}{ + +\describe{ +\item{Missing required columns}{Stops immediately if \code{id}, \code{male_parent}, or +\code{female_parent} are absent.} +\item{Exact duplicate rows}{Rows identical across all three fields. Removed +when \code{correct = TRUE}.} +\item{Repeated IDs with conflicting parents}{Same \code{id} with differing parent +assignments. Collapsed to one row when \code{correct = TRUE}; conflicting fields +are set to \code{"0"}.} +\item{Inconsistent sex roles}{IDs appearing as both \code{male_parent} and +\code{female_parent}. Reported only; no automatic correction.} +\item{Missing parents}{Parent IDs not listed as individuals. Reported, and +added as founder rows (\code{"0"/"0"}) when \code{correct = TRUE}.} +\item{Cycles}{Direct or indirect ancestry loops. Reported only; must be +resolved manually.} +} +} + +\section{Correction behavior}{ + +When \code{correct = TRUE}, the function removes exact duplicates, collapses +conflicting repeated IDs (replacing inconsistent parent fields with \code{"0"}), +and appends missing parents as founders. Inconsistent sex roles and cycles are +not automatically corrected. Running the function again on the corrected +pedigree is recommended to reassess downstream issues. +} + +\author{ +Josué Chinchilla-Vargas } diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 5a148af..55b26e8 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -126,3 +126,6 @@ results <- find_parentage( } } +\author{ +Josué Chinchilla-Vargas +} diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 22e9462..f96493f 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/imputation_concordance.R \name{imputation_concordance} \alias{imputation_concordance} -\title{Calculate Concordance between Imputed and Reference Genotypes} +\title{Calculate concordance between imputed and reference genotypes} \usage{ imputation_concordance( reference_genos, @@ -15,77 +15,65 @@ imputation_concordance( ) } \arguments{ -\item{reference_genos}{A data frame containing reference genotype data, -with rows as samples and columns as markers. Must include a column named \code{ID}.} +\item{reference_genos}{A data frame of reference genotypes (samples × markers) +with a column named \code{ID}.} -\item{imputed_genos}{A data frame containing imputed genotype data, -with rows as samples and columns as markers. Must include a column named \code{ID}.} +\item{imputed_genos}{A data frame of imputed genotypes (samples × markers) +with a column named \code{ID}.} -\item{missing_code}{Optional value specifying missing data. If provided, -loci with this value in either dataset will be excluded from the concordance calculation.} +\item{missing_code}{Optional value indicating missing data. Loci carrying this +value in either dataset are excluded from the concordance calculation.} -\item{snps_2_exclude}{Optional vector of marker IDs to exclude from the concordance calculation.} +\item{snps_2_exclude}{Optional character vector of marker names to exclude.} -\item{verbose}{Logical. If \code{TRUE}, prints summary statistics (minimum, quartiles, -median, mean, maximum) of concordance percentages.} +\item{verbose}{Logical. Print a five-number summary of concordance? Default \code{FALSE}.} -\item{plot}{Logical. If \code{TRUE}, produces a bar plot of concordance percentage -by sample.} +\item{plot}{Logical. Produce a bar plot of concordance by sample? Default \code{FALSE}.} -\item{print_result}{Logical. If \code{TRUE} (default), prints the concordance -results data frame to the console. If \code{FALSE}, results are returned invisibly.} +\item{print_result}{Logical. Print the results data frame to the console? +Default \code{TRUE}. If \code{FALSE}, results are returned invisibly.} } \value{ -A data frame with: -\itemize{ -\item \code{ID}: Sample identifiers shared between the datasets. -\item \code{Concordance}: Percentage of matching genotypes per sample. +A data frame with columns: +\describe{ +\item{ID}{Sample identifiers shared between the two datasets.} +\item{Concordance}{Percentage of matching genotypes per sample.} } -If \code{print_result = FALSE}, the data frame is returned invisibly. } \description{ -This function calculates the concordance between imputed and reference -genotypes. It assumes that samples are rows and markers are columns. -Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. -Missing data in either dataset can be excluded from the concordance calculation -using the \code{missing_code} argument. Specific markers can be excluded using -the \code{snps_2_exclude} argument. +Compares imputed and reference genotype datasets sample by sample, returning +the percentage of matching genotypes per sample. Expects samples as rows and +markers as columns, with allele dosages (0, 1, 2) recommended. Other numeric +formats are supported. } \details{ -The function: -\enumerate{ -\item Identifies common samples and markers between the datasets. -\item Optionally excludes specified SNPs. -\item Removes loci with missing data (if \code{missing_code} is provided). -\item Computes per-sample concordance as the percentage of matching genotypes. -} - -When \code{plot = TRUE}, a bar plot showing concordance percentage per sample -is generated using \pkg{ggplot2}. +The function identifies common samples and markers between the two datasets, +optionally removes specified SNPs and loci with missing data, then computes +per-sample concordance as the percentage of matching genotypes over valid loci. +When \code{plot = TRUE}, a bar plot is produced with \pkg{ggplot2}. } \examples{ ref <- data.frame( - ID = c("S1", "S2", "S3"), + ID = c("S1", "S2", "S3"), SNP1 = c(0, 1, 2), SNP2 = c(1, 1, 0), SNP3 = c(2, 5, 1) ) - -test <- data.frame( - ID = c("S1", "S2", "S3"), +imp <- data.frame( + ID = c("S1", "S2", "S3"), SNP1 = c(0, 0, 2), SNP2 = c(1, 1, 1), SNP3 = c(2, 5, 0) ) - -result <- imputation_concordance( +imputation_concordance( reference_genos = ref, - imputed_genos = test, - snps_2_exclude = "SNP2", - missing_code = 5, - print_result = FALSE + imputed_genos = imp, + snps_2_exclude = "SNP2", + missing_code = 5, + print_result = FALSE ) -result - +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/solve_composition_poly.Rd b/man/solve_composition_poly.Rd index bd92739..128f7ad 100644 --- a/man/solve_composition_poly.Rd +++ b/man/solve_composition_poly.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/breedtools_functions.R \name{solve_composition_poly} \alias{solve_composition_poly} -\title{Compute Genome-Wide Breed Composition} +\title{Compute genome-wide breed composition} \usage{ solve_composition_poly( Y, @@ -16,41 +16,40 @@ solve_composition_poly( ) } \arguments{ -\item{Y}{numeric matrix of genotypes (columns) from all animals (rows) in population -coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}} +\item{Y}{Numeric matrix of genotypes with individuals as rows and SNPs as +columns, coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}.} -\item{X}{numeric matrix of allele frequencies (rows) from each reference panel (columns). Frequencies are -relative to allele B.} +\item{X}{Numeric matrix of allele frequencies with SNPs as rows and +breeds/populations as columns.} -\item{ped}{data.frame giving pedigree information. Must be formatted "ID", "Sire", "Dam"} +\item{ped}{Optional data frame with pedigree information formatted with +columns \code{ID}, \code{Sire}, and \code{Dam}. When supplied, \code{QPsolve_par} is used +and only animals with genotyped parents are included.} -\item{groups}{list of IDs categorized by breed/population. If specified, output will be a list -of results categorized by breed/population.} +\item{groups}{Optional named list of IDs grouped by breed/population. +When supplied, results are returned as a named list partitioned by group.} -\item{mia}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing the inferred maternally inherited allele for each locus for each animal instead -of breed composition results.} +\item{mia}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +the inferred maternally inherited allele per locus per animal. Default \code{FALSE}.} -\item{sire}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing sire genotypes for each locus for each animal instead of breed composition results.} +\item{sire}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +sire genotypes per locus per animal. Default \code{FALSE}.} -\item{dam}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing dam genotypes for each locus for each animal instead of breed composition results.} +\item{dam}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +dam genotypes per locus per animal. Default \code{FALSE}.} -\item{ploidy}{integer. The ploidy level of the species (e.g., 2 for diploid, 3 for triploid, etc.).} +\item{ploidy}{Integer. Ploidy level of the species. Default is \code{2}.} } \value{ -A data.frame or list of data.frames (if groups is !NULL) with breed/ancestry composition -results +A data frame, or a named list of data frames when \code{groups} is +supplied, containing breed/ancestry composition estimates. } \description{ -Computes genome-wide breed/ancestry composition using quadratic programming on a -batch of animals. +Estimates genome-wide breed/ancestry composition for a batch of animals +using quadratic programming, with optional pedigree-assisted and +grouped-output modes. } \examples{ -# Example inputs for solve_composition_poly (ploidy = 4) - -# (This would typically be the output from allele_freq_poly) allele_freqs_matrix <- matrix( c(0.625, 0.500, 0.500, 0.500, @@ -60,16 +59,12 @@ allele_freqs_matrix <- matrix( nrow = 5, ncol = 2, byrow = TRUE, dimnames = list(paste0("SNP", 1:5), c("VarA", "VarB")) ) - -# Validation Genotypes (individuals x SNPs) val_geno_matrix <- matrix( - c(2, 1, 2, 3, 4, # Test1 dosages for SNP1-5 - 3, 4, 2, 3, 0), # Test2 dosages for SNP1-5 + c(2, 1, 2, 3, 4, + 3, 4, 2, 3, 0), nrow = 2, ncol = 5, byrow = TRUE, dimnames = list(paste0("Test", 1:2), paste0("SNP", 1:5)) ) - -# Calculate Breed Composition composition <- solve_composition_poly(Y = val_geno_matrix, X = allele_freqs_matrix, ploidy = 4) @@ -77,6 +72,10 @@ print(composition) } \references{ -Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +Estimation of genome-wide and locus-specific breed composition in pigs. +\emph{Transl Anim Sci.} 2017;1(1):36–44. +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index b4987de..08c7329 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -93,3 +93,6 @@ are declared founders (both parents coded as 0) are preserved unchanged with no recommendations. Trios removed due to missing genotype data are retained in the output with a NO_GENOTYPE_DATA status. } +\author{ +Josué Chinchilla-Vargas +} From 6b84e18653c7072657e942aea90b27253a7bbff1 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 20 May 2026 09:53:16 -0400 Subject: [PATCH 11/16] staged comments for next push --- DESCRIPTION | 2 +- R/check_ped.R | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0524f2f..b71abd1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,6 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: @@ -74,3 +73,4 @@ Suggests: polyRAD, testthat (>= 3.0.0) RdMacros: Rdpack +Config/roxygen2/version: 8.0.0 diff --git a/R/check_ped.R b/R/check_ped.R index 847ebe1..a948138 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -51,8 +51,7 @@ #' #' @author Josué Chinchilla-Vargas #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first -#' bind_rows n_distinct n select row_number +#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number #' @importFrom stats setNames #' @importFrom utils read.table write.table zip #' @importFrom tools file_path_sans_ext @@ -60,11 +59,10 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE, - correct = TRUE, - save_zip = FALSE, + correct = TRUE, #add arguments for removing/filtering conflicting ids, repeated ids and founders are always fixed, dependencies are always manual. + save_zip = FALSE, #remove all save zips, export each report and corrected_pedigree as a list save_corrected_zip = FALSE) { - #### setup #### #### setup #### if (!is.null(seed)) set.seed(seed) data <- utils::read.table(ped.file, header = TRUE) From 4eb79bb186b83c6a52d0e8620419501719597474 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 20 May 2026 11:09:30 -0400 Subject: [PATCH 12/16] removed zip option and added specific correction arguments --- R/check_ped.R | 386 +++++++++++++++++++++++--------------------------- 1 file changed, 177 insertions(+), 209 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index a948138..6f74732 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,71 +1,62 @@ -#' Check and optionally correct a pedigree file +#' Check a pedigree file for accuracy and report and correct common errors #' -#' Reads a tab-separated pedigree file with columns `id`, `male_parent`, and -#' `female_parent` (any column order), performs structural and consistency checks, -#' reports findings, and optionally builds a partially corrected pedigree object. +#' Reads a 3-column pedigree file (tab-separated, columns id, male_parent, +#' female_parent in any order) and performs quality checks, optionally +#' correcting detected errors. #' -#' The input file is never modified. Findings are printed to the console when -#' `verbose = TRUE`, a report list is assigned to the global environment, and when -#' `correct = TRUE`, a corrected pedigree data frame is also assigned there. -#' -#' @section Checks performed: -#' \describe{ -#' \item{Missing required columns}{Stops immediately if `id`, `male_parent`, or -#' `female_parent` are absent.} -#' \item{Exact duplicate rows}{Rows identical across all three fields. Removed -#' when `correct = TRUE`.} -#' \item{Repeated IDs with conflicting parents}{Same `id` with differing parent -#' assignments. Collapsed to one row when `correct = TRUE`; conflicting fields -#' are set to `"0"`.} -#' \item{Inconsistent sex roles}{IDs appearing as both `male_parent` and -#' `female_parent`. Reported only; no automatic correction.} -#' \item{Missing parents}{Parent IDs not listed as individuals. Reported, and -#' added as founder rows (`"0"/"0"`) when `correct = TRUE`.} -#' \item{Cycles}{Direct or indirect ancestry loops. Reported only; must be -#' resolved manually.} -#' } -#' -#' @section Correction behavior: -#' When `correct = TRUE`, the function removes exact duplicates, collapses -#' conflicting repeated IDs (replacing inconsistent parent fields with `"0"`), -#' and appends missing parents as founders. Inconsistent sex roles and cycles are -#' not automatically corrected. Running the function again on the corrected -#' pedigree is recommended to reassess downstream issues. +#' Exact duplicate rows and missing parents are always corrected. +#' Conflicting trios and inconsistent sex roles are corrected when their +#' respective arguments are TRUE. Cycles and dependencies are always +#' reported only and must be resolved manually. #' #' @param ped.file Path to the pedigree text file. #' @param seed Optional integer seed for reproducibility. -#' @param verbose Logical. Print findings to the console? Default `TRUE`. -#' @param correct Logical. Build and save a corrected pedigree? Default `TRUE`. -#' @param save_zip Logical. Export report components as a `.zip` archive? Default `FALSE`. -#' @param save_corrected_zip Logical. Include corrected pedigree in the `.zip`? -#' Only applies when `save_zip = TRUE` and `correct = TRUE`. Default `FALSE`. +#' @param verbose Logical. If TRUE (default), prints the report to the console. +#' @param correct_conflicting_trios Logical. If TRUE (default), sets conflicting +#' male_parent and female_parent to 0 and collapses to one row per ID. +#' @param correct_inconsistent_sex_roles Logical. If TRUE (default), sets +#' male_parent and female_parent to 0 for rows involving IDs found as both, +#' then removes any resulting exact duplicates. #' #' @return An invisible named list of data frames: #' \describe{ -#' \item{exact_duplicates}{Exact duplicate rows.} -#' \item{repeated_ids_diff}{Rows with the same `id` but conflicting parent assignments.} -#' \item{inconsistent_sex_roles}{Rows involving IDs used as both male and female parent.} -#' \item{missing_parents}{Rows referencing parent IDs absent from `id`.} -#' \item{dependencies}{Rows whose `id` is involved in a detected cycle.} +#' \item{exact_duplicates}{Exact duplicate rows found in the input.} +#' \item{conflicting_trios}{IDs with conflicting male_parent or female_parent assignments.} +#' \item{inconsistent_sex_roles}{IDs appearing as both male_parent and female_parent.} +#' \item{missing_parents}{Parent IDs absent from id, added as founders.} +#' \item{dependencies}{Cycles detected in the pedigree. Must be resolved manually.} +#' \item{corrected_pedigree}{Corrected pedigree table.} #' } #' -#' @author Josué Chinchilla-Vargas +#' @examples +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) #' -#' @importFrom dplyr %>% group_by filter ungroup distinct mutate summarize first bind_rows n_distinct n select row_number +#' ped_errors$inconsistent_sex_roles +#' ped_errors$corrected_pedigree +#' +#' conflicting_sex_ids <- ped_errors$inconsistent_sex_roles$id +#' print(conflicting_sex_ids) +#' +#' @importFrom dplyr %>% mutate filter group_by ungroup summarize distinct bind_rows select first n n_distinct if_else row_number #' @importFrom stats setNames -#' @importFrom utils read.table write.table zip -#' @importFrom tools file_path_sans_ext +#' @importFrom utils read.table #' @export check_ped <- function(ped.file, - seed = NULL, - verbose = TRUE, - correct = TRUE, #add arguments for removing/filtering conflicting ids, repeated ids and founders are always fixed, dependencies are always manual. - save_zip = FALSE, #remove all save zips, export each report and corrected_pedigree as a list - save_corrected_zip = FALSE) { + seed = NULL, + verbose = TRUE, + correct_conflicting_trios = TRUE, + correct_inconsistent_sex_roles = TRUE) { #### setup #### if (!is.null(seed)) set.seed(seed) - data <- utils::read.table(ped.file, header = TRUE) + + data <- utils::read.table(ped.file, header = TRUE) %>% + dplyr::mutate( + id = as.character(id), + male_parent = as.character(male_parent), + female_parent = as.character(female_parent) + ) # Validate required column names before any processing required_cols <- c("id", "male_parent", "female_parent") @@ -78,78 +69,120 @@ check_ped <- function(ped.file, ) } - data <- data %>% - mutate( - id = as.character(id), - male_parent = as.character(male_parent), - female_parent = as.character(female_parent) - ) - # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) - errors <- list() - # Add row numbers before any processing - data <- data %>% mutate(row_number = row_number(), .before = id) - errors <- list() + # Add row numbers before any processing so all reports reference original rows [1] + data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) + + errors <- list() + missing_parents <- data.frame( + row_number = integer(), + id = character(), + male_parent = character(), + female_parent = character(), + stringsAsFactors = FALSE + ) - #### check 1: exact duplicates #### + #### check 1: exact duplicates (always fixed) #### exact_duplicates <- data[ - duplicated(data %>% select(-row_number)) | - duplicated(data %>% select(-row_number), fromLast = TRUE), + duplicated(data %>% dplyr::select(-row_number)) | + duplicated(data %>% dplyr::select(-row_number), fromLast = TRUE), ] + if (nrow(exact_duplicates) > 0) { + data <- data %>% + dplyr::select(-row_number) %>% + dplyr::distinct() %>% + dplyr::mutate(row_number = dplyr::row_number(), .before = id) + } #### check 2: repeated IDs with conflicting male_parent/female_parent #### - repeated_ids_diff <- data %>% - group_by(id) %>% - filter(n() > 1) %>% - filter(n_distinct(male_parent) > 1 | n_distinct(female_parent) > 1) %>% - ungroup() + repeated_ids <- data %>% + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::ungroup() - #### check 3: inconsistent parent sex roles #### - male_ids <- unique(data$male_parent[data$male_parent != "0"]) - female_ids <- unique(data$female_parent[data$female_parent != "0"]) - messy_ids <- intersect(male_ids, female_ids) - inconsistent_sex_roles <- data %>% - filter(male_parent %in% messy_ids | female_parent %in% messy_ids) + conflicting_ids <- repeated_ids %>% + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n_distinct(male_parent) > 1 | dplyr::n_distinct(female_parent) > 1) %>% + dplyr::ungroup() - #### check 4: missing parents #### - all_ids <- unique(data$id) - ref_ids <- unique(c(data$male_parent, data$female_parent)) - ref_ids <- ref_ids[ref_ids != "0"] - missing_ids <- setdiff(ref_ids, all_ids) - missing_parents <- data %>% - filter(male_parent %in% missing_ids | female_parent %in% missing_ids) + if (correct_conflicting_trios && nrow(conflicting_ids) > 0) { + # Set conflicting parents to "0" -- rows become exact duplicates, summarize collapses to one [1] + data <- data %>% + dplyr::group_by(id) %>% + dplyr::summarize( + row_number = dplyr::first(row_number), + male_parent = if (dplyr::n_distinct(male_parent) > 1) "0" else dplyr::first(male_parent), + female_parent = if (dplyr::n_distinct(female_parent) > 1) "0" else dplyr::first(female_parent), + .groups = "drop" + ) %>% + dplyr::select(row_number, id, male_parent, female_parent) + } - # Only build founder rows if there are actually missing parents to add - if (length(missing_ids) > 0) { - missing_founders <- data.frame( - row_number = NA_integer_, - id = missing_ids, - male_parent = "0", - female_parent = "0", - stringsAsFactors = FALSE - ) - } else { - missing_founders <- data.frame( - row_number = integer(0), - id = character(0), - male_parent = character(0), - female_parent = character(0), - stringsAsFactors = FALSE - ) + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents (always fixed) #### + for (i in seq_len(nrow(data))) { + id <- data$id[i] + male_parent <- data$male_parent[i] + female_parent <- data$female_parent[i] + + if (male_parent != "0" && male_parent != id && !male_parent %in% data$id) { + missing_parents <- rbind( + missing_parents, + data.frame(row_number = data$row_number[i], id = male_parent, + male_parent = "0", female_parent = "0", + stringsAsFactors = FALSE) + ) + } + if (female_parent != "0" && female_parent != id && !female_parent %in% data$id) { + missing_parents <- rbind( + missing_parents, + data.frame(row_number = data$row_number[i], id = female_parent, + male_parent = "0", female_parent = "0", + stringsAsFactors = FALSE) + ) + } + + if (male_parent == id || female_parent == id) { + errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) + } + } + + missing_parents <- dplyr::distinct(missing_parents) + if (nrow(missing_parents) > 0) { + data <- dplyr::bind_rows(data, missing_parents) + } + + #### check 4: inconsistent sex roles #### + male_ids <- unique(data$male_parent[data$male_parent != "0"]) + female_ids <- unique(data$female_parent[data$female_parent != "0"]) + conflicting_sex_ids <- intersect(male_ids, female_ids) + inconsistent_sex_roles <- data %>% dplyr::filter(id %in% conflicting_sex_ids) + + if (correct_inconsistent_sex_roles && length(conflicting_sex_ids) > 0) { + # Zero out male_parent/female_parent wherever a conflicting ID appears, + # then distinct() removes any rows that became exact duplicates [1] + data <- data %>% + dplyr::mutate( + male_parent = dplyr::if_else(male_parent %in% conflicting_sex_ids, "0", male_parent), + female_parent = dplyr::if_else(female_parent %in% conflicting_sex_ids, "0", female_parent) + ) %>% + dplyr::distinct(id, male_parent, female_parent, .keep_all = TRUE) } - #### check 5: dependencies (cycles) #### + #### check 5: dependencies (cycles) -- reported only, never modified #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { row <- data[data$id == x, ] c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { @@ -163,148 +196,83 @@ check_ped <- function(ped.file, rec_stack[node] <- FALSE return(cycles) } + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) - if (length(node_cycles) > 0) all_cycles <- append(all_cycles, node_cycles) + if (length(node_cycles) > 0) { + all_cycles <- append(all_cycles, node_cycles) + } } } return(all_cycles) } - cycles <- detect_all_cycles(data) - cycle_ids <- character(0) + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { for (cycle_group in cycles) { - ids <- unique(unlist(cycle_group)) - cycle_ids <- unique(c(cycle_ids, ids)) + cycle_ids <- unique(unlist(cycle_group)) errors <- append(errors, paste("Cycle detected involving IDs:", - paste(ids, collapse = " -> "))) + paste(cycle_ids, collapse = " -> "))) } } - dependencies <- data %>% filter(id %in% cycle_ids) - if (nrow(dependencies) == 0 && length(errors) > 0) { - dependencies <- data.frame(Dependency = unique(unlist(errors)), - stringsAsFactors = FALSE) - } - #### compile report #### + #### compile findings #### input_ped_report <- list( exact_duplicates = exact_duplicates, - repeated_ids_diff = repeated_ids_diff, + conflicting_trios = repeated_ids_report, inconsistent_sex_roles = inconsistent_sex_roles, missing_parents = missing_parents, - dependencies = dependencies + dependencies = data.frame(Dependency = unique(unlist(errors)), + stringsAsFactors = FALSE), + corrected_pedigree = data %>% dplyr::select(-row_number) ) - #### build corrected pedigree (no row_number column) #### - if (correct) { - corrected <- data %>% select(-row_number) - # Remove exact duplicates - corrected <- distinct(corrected) - # Resolve conflicting IDs - if (nrow(repeated_ids_diff) > 0) { - corrected <- corrected %>% - group_by(id) %>% - summarize( - male_parent = if (n_distinct(male_parent) > 1) "0" else first(male_parent), - female_parent = if (n_distinct(female_parent) > 1) "0" else first(female_parent), - .groups = "drop" - ) - } - # Add missing founders - if (length(missing_ids) > 0) { - corrected <- bind_rows( - corrected, - missing_founders %>% select(-row_number) - ) - } - } - - #### file names #### - file_base <- tools::file_path_sans_ext(basename(ped.file)) - corrected_name <- paste0(file_base, "_corrected") - report_name <- paste0(file_base, "_report") - zip_name <- paste0(file_base, "_report.zip") #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") - cat("\n--- Exact Duplicates ---\n") - if (nrow(exact_duplicates) > 0) print(exact_duplicates) else cat("None found.\n") - cat("\n--- Conflicting IDs ---\n") - if (nrow(repeated_ids_diff) > 0) print(repeated_ids_diff) else cat("None found.\n") - cat("\n--- Inconsistent Parent Sex Roles ---\n") - if (nrow(inconsistent_sex_roles) > 0) print(inconsistent_sex_roles) else cat("None found.\n") - cat("\n--- Missing Parents (rows that reference them) ---\n") - if (nrow(missing_parents) > 0) print(missing_parents) else cat("None found.\n") - cat("\n--- Cycles / Dependencies ---\n") - if (nrow(dependencies) > 0) print(dependencies) else cat("None found.\n") - if (correct) { - cat(paste0("\n`correct = TRUE`: saving corrected pedigree as `", corrected_name, "`.\n")) - } else { - cat("\n`correct = FALSE`: no changes made to the pedigree.\n") - } - } - - # Always save report to global environment - assign(report_name, input_ped_report, envir = .GlobalEnv) - # Save corrected pedigree if correct = TRUE - if (correct) assign(corrected_name, corrected, envir = .GlobalEnv) - #### zip export #### - if (save_zip) { - tmp_dir <- tempfile() - dir.create(tmp_dir) - zip_files <- character(0) + if (nrow(exact_duplicates) > 0) { + cat("\nExact duplicate trios detected (removed in corrected pedigree):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate trios found.\n") - # Section headers matching the console output labels - section_labels <- c( - exact_duplicates = "Exact Duplicates", - repeated_ids_diff = "Conflicting IDs", - inconsistent_sex_roles = "Inconsistent Parent Sex Roles", - missing_parents = "Missing Parents (rows that reference them)", - dependencies = "Cycles / Dependencies" - ) + if (nrow(repeated_ids_report) > 0) { + cat("\nConflicting trios detected:\n") + print(repeated_ids_report) + if (correct_conflicting_trios) { + cat(" -> parents set to 0 and collapsed to one row in corrected pedigree.\n") + } else { + cat(" -> correct_conflicting_trios = FALSE: left as-is in corrected pedigree.\n") + } + } else cat("\nNo conflicting trios found.\n") - for (component in names(section_labels)) { - df <- input_ped_report[[component]] - file_path <- file.path(tmp_dir, paste0(file_base, "_", component, ".txt")) + if (nrow(missing_parents) > 0) { + cat("\nParents missing as IDs found in the pedigree (added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") - if (nrow(df) == 0) { - # Write header + "None found." for clean empty reports - writeLines( - c(paste0("--- ", section_labels[[component]], " ---"), "None found."), - con = file_path - ) + if (nrow(inconsistent_sex_roles) > 0) { + cat("\nIDs found as both male_parent and female_parent (is selfing or hermaphrodytism possible?):\n") + print(inconsistent_sex_roles) + if (correct_inconsistent_sex_roles) { + cat(" -> parent fields set to 0 for conflicting IDs; exact duplicates removed in corrected pedigree.\n") } else { - # Write header, then column names, then data — avoids col.names warning - file_con <- file(file_path, open = "wt") - writeLines(paste0("--- ", section_labels[[component]], " ---"), con = file_con) - writeLines(paste(colnames(df), collapse = "\t"), con = file_con) - close(file_con) - utils::write.table(df, file = file_path, sep = "\t", row.names = FALSE, - quote = FALSE, append = TRUE, col.names = FALSE) + cat(" -> correct_inconsistent_sex_roles = FALSE: left as-is in corrected pedigree.\n") } + } else cat("\nNo IDs found as both male_parent and female_parent.\n") - zip_files <- c(zip_files, file_path) - } - - # Optionally include corrected pedigree in zip - if (save_corrected_zip && correct) { - corrected_path <- file.path(tmp_dir, paste0(corrected_name, ".txt")) - utils::write.table(corrected, file = corrected_path, sep = "\t", - row.names = FALSE, quote = FALSE) - zip_files <- c(zip_files, corrected_path) - } + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected (must be resolved manually):\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") - # Bundle all files into zip in the working directory - zip_path <- file.path(getwd(), zip_name) - utils::zip(zipfile = zip_path, files = zip_files, flags = "-j") - if (verbose) cat(paste0("\nZip archive saved to: ", zip_path, "\n")) + cat("\nThe corrected pedigree is included in the returned list as corrected_pedigree.\n") } invisible(input_ped_report) From 1fb19d08d64f0c68863515ab54e1331e49e69da9 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Wed, 20 May 2026 11:15:43 -0400 Subject: [PATCH 13/16] finalzied refactoring according to CRAN standards --- R/check_ped.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 6f74732..d2c75c0 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -93,18 +93,18 @@ check_ped <- function(ped.file, dplyr::mutate(row_number = dplyr::row_number(), .before = id) } - #### check 2: repeated IDs with conflicting male_parent/female_parent #### + #### check 2: conflicting trios #### repeated_ids <- data %>% dplyr::group_by(id) %>% dplyr::filter(dplyr::n() > 1) %>% dplyr::ungroup() - conflicting_ids <- repeated_ids %>% + conflicting_trios_ids <- repeated_ids %>% dplyr::group_by(id) %>% dplyr::filter(dplyr::n_distinct(male_parent) > 1 | dplyr::n_distinct(female_parent) > 1) %>% dplyr::ungroup() - if (correct_conflicting_trios && nrow(conflicting_ids) > 0) { + if (correct_conflicting_trios && nrow(conflicting_trios_ids) > 0) { # Set conflicting parents to "0" -- rows become exact duplicates, summarize collapses to one [1] data <- data %>% dplyr::group_by(id) %>% @@ -117,7 +117,7 @@ check_ped <- function(ped.file, dplyr::select(row_number, id, male_parent, female_parent) } - repeated_ids_report <- conflicting_ids + conflicting_trios <- conflicting_trios_ids #### check 3: missing parents (always fixed) #### for (i in seq_len(nrow(data))) { @@ -225,7 +225,7 @@ check_ped <- function(ped.file, #### compile findings #### input_ped_report <- list( exact_duplicates = exact_duplicates, - conflicting_trios = repeated_ids_report, + conflicting_trios = conflicting_trios, inconsistent_sex_roles = inconsistent_sex_roles, missing_parents = missing_parents, dependencies = data.frame(Dependency = unique(unlist(errors)), @@ -242,9 +242,9 @@ check_ped <- function(ped.file, print(exact_duplicates) } else cat("\nNo exact duplicate trios found.\n") - if (nrow(repeated_ids_report) > 0) { + if (nrow(conflicting_trios) > 0) { cat("\nConflicting trios detected:\n") - print(repeated_ids_report) + print(conflicting_trios) if (correct_conflicting_trios) { cat(" -> parents set to 0 and collapsed to one row in corrected pedigree.\n") } else { From 65d48fd453e5e2e52593fde37d59594f53c8d8a9 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Thu, 21 May 2026 13:54:44 -0400 Subject: [PATCH 14/16] Updated functions and descriptions to be CRAN compliant and updatede documentations --- NAMESPACE | 2 +- R/check_ped.R | 15 ++- R/find_parentage.R | 250 +++++++++++++++++---------------------- R/validate_pedigree.R | 91 +++++++------- man/check_ped.Rd | 76 +++++------- man/find_parentage.Rd | 79 +++++-------- man/validate_pedigree.Rd | 88 ++++++-------- 7 files changed, 259 insertions(+), 342 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c460be1..027d02e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ importFrom(dplyr,filter) importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) +importFrom(dplyr,if_else) importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(dplyr,n_distinct) @@ -86,7 +87,6 @@ importFrom(utils,read.table) importFrom(utils,tail) importFrom(utils,write.csv) importFrom(utils,write.table) -importFrom(utils,zip) importFrom(vcfR,extract.gt) importFrom(vcfR,maf) importFrom(vcfR,read.vcfR) diff --git a/R/check_ped.R b/R/check_ped.R index d2c75c0..ca82e14 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,12 +1,9 @@ -#' Check a pedigree file for accuracy and report and correct common errors +#' Check and Correct Common Pedigree Errors #' -#' Reads a 3-column pedigree file (tab-separated, columns id, male_parent, -#' female_parent in any order) and performs quality checks, optionally -#' correcting detected errors. -#' -#' Exact duplicate rows and missing parents are always corrected. -#' Conflicting trios and inconsistent sex roles are corrected when their -#' respective arguments are TRUE. Cycles and dependencies are always +#' Reads a 3-column pedigree file (id, male_parent, female_parent) and performs +#' quality checks, optionally correcting detected errors. Exact duplicates and +#' missing parents are always corrected. Conflicting trios and inconsistent sex +#' roles are corrected when their respective arguments are TRUE. Cycles are #' reported only and must be resolved manually. #' #' @param ped.file Path to the pedigree text file. @@ -38,6 +35,8 @@ #' conflicting_sex_ids <- ped_errors$inconsistent_sex_roles$id #' print(conflicting_sex_ids) #' +#' @author Josue Chinchilla-Vargas +#' #' @importFrom dplyr %>% mutate filter group_by ungroup summarize distinct bind_rows select first n n_distinct if_else row_number #' @importFrom stats setNames #' @importFrom utils read.table diff --git a/R/find_parentage.R b/R/find_parentage.R index b02477c..c4ff140 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -1,51 +1,44 @@ #' Find Parentage Assignments for Progeny #' -#' Assigns the most likely parent(s) to each progeny individual based on -#' genotypic data using Mendelian error rates or homozygous mismatch rates. +#' Assigns the most likely parent(s) to each progeny from SNP genotype data +#' using Mendelian error rates or homozygous mismatch rates. Parents or progeny +#' absent from the genotype file are removed with a warning. #' -#' @param genotypes_file Path to a TSV/CSV/TXT file containing genotype data. -#' Must include an 'ID' column followed by marker columns coded as 0, 1, 2 -#' (allele dosage). -#' @param parents_file Path to a TSV/CSV/TXT file listing candidate parent IDs. -#' Must include an 'ID' column. An optional 'Sex' column ('M', 'F', or 'A') -#' determines parent roles. If absent, all parents are treated as ambiguous. -#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs. -#' Must include an 'ID' column. +#' @param genotypes_file Path to a TSV/CSV/TXT file with an 'ID' column +#' followed by marker columns coded as 0, 1, 2 (allele dosage). +#' @param parents_file Path to a TSV/CSV/TXT file with an 'ID' column and an +#' optional 'Sex' column ('M', 'F', or 'A'). If absent, all parents are +#' treated as ambiguous. +#' @param progeny_file Path to a TSV/CSV/TXT file with an 'ID' column. #' @param method Character. One of \code{"best_male_parent"}, #' \code{"best_female_parent"}, \code{"best_match"}, or #' \code{"best_pair"} (default). See Details. -#' @param min_markers Integer. Minimum markers required for a valid assignment. -#' Fewer markers flags \code{LOW_MARKERS} (default: \code{10}). -#' @param error_threshold Numeric. Maximum mismatch percentage for a confident -#' assignment. Exceeded values flag \code{HIGH_ERROR} (default: \code{5.0}). -#' Must be between 0 and 100. -#' @param show_ties Logical. If \code{TRUE}, tied best pairs are reported as -#' additional suffix columns (e.g. \code{Male_Parent_2}) when -#' \code{method = "best_pair"}. Default is \code{TRUE}. +#' @param min_markers Integer. Minimum markers required; fewer flags +#' \code{LOW_MARKERS} (default: \code{10}). +#' @param error_threshold Numeric. Maximum mismatch percentage; exceeded values +#' flag \code{HIGH_ERROR} (default: \code{5.0}). Must be between 0 and 100. +#' @param show_ties Logical. If \code{TRUE}, tied best pairs are appended as +#' suffix columns (e.g. \code{Male_Parent_2}) for \code{"best_pair"}. +#' Default is \code{TRUE}. #' @param allow_selfing Logical. If \code{FALSE}, pairs with identical male and #' female parent IDs are excluded. Default is \code{TRUE}. -#' @param verbose Logical. If \code{TRUE}, prints progress, summary, and -#' results to the console. Default is \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints progress and summary to the +#' console. Default is \code{TRUE}. #' @param write_txt Logical. If \code{TRUE}, writes results to -#' \code{output_filename}. The plot will be saved using the same base name -#' with a .jpg extension. Default is \code{TRUE}. -#' @param output_filename Character. Path/name of the output results file -#' (default: \code{"parentage_testing_results.txt"}). The plot will be saved -#' using the same base name with a .jpg extension. -#' @param na_string Character. String for missing values in output file. +#' \code{parentage_testing_results.txt}. Default is \code{TRUE}. +#' @param na_string Character. Missing value string in output. #' Use \code{"NA"} or \code{""} (default: \code{"NA"}). #' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error -#' distribution colored by assignment status. Requires \code{ggplot2}. -#' Plot is saved as \code{.jpg} when \code{write_txt = TRUE}. -#' Default is \code{TRUE}. +#' distribution colored by status. Requires \code{ggplot2}. Saved as +#' \code{.jpg} when \code{write_txt = TRUE}. Default is \code{TRUE}. #' -#' @return A \code{data.table} with one row per progeny. Base columns are -#' \code{Progeny}, \code{Male_Parent} (or \code{Best_Match}), -#' \code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, -#' and \code{Assignment_Status} (\code{PASS}, \code{HIGH_ERROR}, or -#' \code{LOW_MARKERS}). Tie suffix columns (\code{Male_Parent_2}, etc.) are -#' appended when \code{show_ties = TRUE} and ties remain after tie-breaking. -#' Returned invisibly when \code{verbose = TRUE}. +#' @return A named list (returned invisibly) with the following elements: +#' \describe{ +#' \item{pass}{Progeny with a confident parentage assignment.} +#' \item{high_error}{Progeny whose best assignment exceeds the error threshold.} +#' \item{low_markers}{Progeny with insufficient markers for a valid assignment.} +#' \item{full_results}{Complete data.table with all progeny and all output columns.} +#' } #' #' @details #' A homozygous-only matrix is pre-computed once and shared across @@ -56,14 +49,8 @@ #' For \code{"best_pair"}, full Mendelian inheritance rules are applied across #' all marker and pair combinations using vectorised \code{vapply} calls, #' producing \code{n_pairs x n_progeny} mismatch and comparison matrices. -#' -#' Ties on mismatch rate are broken by maximum markers tested. Remaining ties -#' are reported as suffix columns when \code{show_ties = TRUE}. Base columns -#' are always populated with the top result. -#' -#' Parents or progeny absent from \code{genotypes_file} are removed with a -#' warning. Progeny below \code{min_markers} are flagged \code{LOW_MARKERS}; -#' those exceeding \code{error_threshold} are flagged \code{HIGH_ERROR}. +#' Ties are broken by maximum markers tested; remaining ties are appended as +#' suffix columns when \code{show_ties = TRUE}. #' #' @examples #' \dontrun{ @@ -90,9 +77,10 @@ #' ) #' } #' -#' @author Josué Chinchilla-Vargas +#' @author Josue Chinchilla-Vargas #' #' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table +#' @importFrom tools file_path_sans_ext #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best_pair", @@ -102,10 +90,13 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE, - output_filename = "parentage_testing_results.txt", na_string = "NA", plot_results = TRUE) { + ## ---- silence R CMD check NOTEs for data.table / ggplot2 NSE columns ---- + ID <- Sex <- Male_Parent <- Female_Parent <- NULL + Mendelian_Error_Pct <- Plot_Status <- Assignment_Status <- NULL + #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") if (!method %in% allowed_methods) @@ -157,20 +148,14 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("No valid progeny candidates remain after filtering.") #### Pre-compute genotype matrices once (shared across all methods) #### - - # Full genotype matrix used by best_pair - genos_mat <- base::as.matrix(genos, rownames = "ID") - - # Homozygous-only matrix (het markers set to NA) used by hom methods + genos_mat <- base::as.matrix(genos, rownames = "ID") genos_hom <- data.table::copy(genos) marker_cols <- base::setdiff(base::names(genos_hom), "ID") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - #### Assignment_Status from markers and error rate #### - - # Returns LOW_MARKERS, HIGH_ERROR, or PASS + #### Assignment_Status helper #### assign_status <- function(markers, error_pct) { base::ifelse(markers < min_markers, "LOW_MARKERS", base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) @@ -183,12 +168,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, "best_female_parent" = female_parent_candidates$ID, "best_match" = base::union(male_parent_candidates$ID, female_parent_candidates$ID)) - - # Subset pre-computed homozygous matrix for relevant parents and progeny parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] - - # Pre-allocate results data.table; fill by reference with set() n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( Progeny = base::rownames(progeny_genos), @@ -197,26 +178,20 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Markers_Tested = NA_integer_, Assignment_Status = NA_character_ ) - for (i in base::seq_len(n_progeny)) { progeny_vec <- progeny_genos[i, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA - best_idx <- base::which.min(percent_mismatch) - - # No candidate found — flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { data.table::set(results_dt, i, "Markers_Tested", 0L) data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") next } - best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) @@ -235,17 +210,13 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - # Pre-extract parent genotype row blocks for vectorised operations male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - - # Subset full genotype matrix to progeny only progeny_ids <- progeny_candidates$ID progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) - # Wrapped in matrix() to handle the n_pairs = 1 edge case mismatch_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { progeny_vec <- progeny_mat[j, ] @@ -264,7 +235,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, nrow = n_pairs, ncol = n_progeny ) - # Wrapped in matrix() to handle the n_pairs = 1 edge case comparison_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { progeny_vec <- progeny_mat[j, ] @@ -275,11 +245,9 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, nrow = n_pairs, ncol = n_progeny ) - # Percent mismatch matrix: n_pairs x n_progeny pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA - # Pre-allocate base results data.table; tie columns added dynamically results_dt <- data.table::data.table( Progeny = progeny_ids, Male_Parent = NA_character_, @@ -289,7 +257,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, Assignment_Status = NA_character_ ) - # Per-progeny result extraction from pre-computed matrices results_list <- base::vector("list", n_progeny) for (j in base::seq_len(n_progeny)) { prog_id <- progeny_ids[j] @@ -297,7 +264,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - # No markers overlap at all — flag LOW_MARKERS and continue if (base::is.infinite(min_mismatch_val)) { data.table::set(results_dt, j, "Markers_Tested", 0L) data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") @@ -305,8 +271,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } best_indices <- base::which(percent_mismatch == min_mismatch_val) - - # Tie-break: prefer pair(s) with the most markers tested if (base::length(best_indices) > 1) { best_markers_per_pair <- comparisons[best_indices] max_markers <- base::max(best_markers_per_pair) @@ -326,15 +290,12 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - # Always populate base columns with the top result data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) data.table::set(results_dt, j, "Assignment_Status", a_status) - # If ties remain after tie-breaking and show_ties is TRUE, - # store tie details for later column binding if (show_ties && num_to_report > 1) { tie_row <- base::list(Progeny = prog_id) for (k in base::seq(2, num_to_report)) { @@ -347,7 +308,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } } - # Merge any tie suffix columns onto the pre-allocated base table tie_rows <- data.table::rbindlist( base::Filter(Negate(base::is.null), results_list), fill = TRUE, @@ -355,41 +315,66 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ) if (base::nrow(tie_rows) > 0) { final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) - - char_cols <- base::names(final_df)[ - base::vapply(final_df, base::is.character, logical(1)) - ] - for (col in char_cols) { - empty_idx <- which(final_df[[col]] == "") - if (base::length(empty_idx) > 0) { - data.table::set(final_df, empty_idx, col, NA_character_) - } - } + for (col in base::names(final_df)) + data.table::set(final_df, which(final_df[[col]] == ""), col, NA_character_) } else { final_df <- results_dt } } - #### Summary #### + #### Compile named list (mirrors check_ped structure) #### + output_list <- list( + pass = final_df[Assignment_Status == "PASS"], + high_error = final_df[Assignment_Status == "HIGH_ERROR"], + low_markers = final_df[Assignment_Status == "LOW_MARKERS"], + full_results = final_df + ) + + #### Verbose output (mirrors check_ped reporting style) #### if (verbose) { - total <- base::nrow(final_df) - a_counts <- base::table(final_df$Assignment_Status) - base::cat("\n--- Parentage Assignment Summary ---\n") - base::cat("Total progeny evaluated:", total, "\n") - for (s in base::names(a_counts)) - base::cat(base::sprintf(" %-14s: %d (%.1f%%)\n", s, - a_counts[s], (a_counts[s] / total) * 100)) - base::cat("Min markers threshold :", min_markers, "\n") - base::cat("Error threshold :", error_threshold, "%\n\n") - } + total_progeny <- base::nrow(final_df) + base::cat("\n=== Parentage Assignment Report ===\n") + base::cat("\nTotal progeny evaluated:", total_progeny, "\n") + base::cat("Method:", method, " | ", + "Error threshold:", error_threshold, "% | ", + "Minimum markers:", min_markers, "\n") + + n_pass <- base::nrow(output_list$pass) + if (n_pass > 0) { + base::cat(base::sprintf("\n%d progeny PASSED (%.1f%%).\n", + n_pass, (n_pass / total_progeny) * 100)) + } else { + base::cat("\nNo progeny passed.\n") + } + + n_high <- base::nrow(output_list$high_error) + if (n_high > 0) { + base::cat(base::sprintf("\n%d progeny flagged HIGH_ERROR (%.1f%%):\n", + n_high, (n_high / total_progeny) * 100)) + base::print(output_list$high_error) + } else { + base::cat("\nNo progeny flagged for high error.\n") + } + + n_low <- base::nrow(output_list$low_markers) + if (n_low > 0) { + base::cat(base::sprintf("\n%d progeny flagged LOW_MARKERS (%.1f%%):\n", + n_low, (n_low / total_progeny) * 100)) + base::print(output_list$low_markers) + } else { + base::cat("\nNo progeny flagged for low marker count.\n") + } - #### Output #### + base::cat("\nFull results are included in the returned list as $full_results.\n") + } + #### Write results #### if (write_txt) { + output_filename <- "parentage_testing_results.txt" tryCatch({ - data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE, - na = na_string) - if (verbose) base::cat("Results successfully written to:", output_filename, "\n") + data.table::fwrite(final_df, file = output_filename, sep = "\t", + quote = FALSE, na = na_string) + if (verbose) base::cat("Results written to:", output_filename, "\n") }, error = function(e) { warning("Could not write results to file. Error: ", e$message, call. = FALSE) }) @@ -402,12 +387,15 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } else { plot_df <- final_df[!is.na(final_df$Mendelian_Error_Pct)] plot_df$Mendelian_Error_Pct <- base::as.numeric(plot_df$Mendelian_Error_Pct) - - plot_df$Plot_Status <- dplyr::case_when( - plot_df$Assignment_Status == "PASS" ~ "PASS", - plot_df$Assignment_Status == "HIGH_ERROR" ~ "HIGH_ERROR", - plot_df$Assignment_Status == "LOW_MARKERS" ~ "LOW_MARKERS", - TRUE ~ "OTHER" + plot_df$Plot_Status <- base::ifelse( + plot_df$Assignment_Status == "PASS", "PASS", + base::ifelse( + plot_df$Assignment_Status == "HIGH_ERROR", "HIGH_ERROR", + base::ifelse( + plot_df$Assignment_Status == "LOW_MARKERS", "LOW_MARKERS", + "OTHER" + ) + ) ) n_total <- base::nrow(plot_df) @@ -427,28 +415,19 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ggplot2::aes(x = Mendelian_Error_Pct, fill = Plot_Status) ) + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + - ggplot2::geom_vline( - xintercept = error_threshold, - linetype = "dashed", - color = "black", - linewidth = 1 - ) + + ggplot2::geom_vline(xintercept = error_threshold, + linetype = "dashed", color = "black", linewidth = 1) + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + - - ggplot2::scale_y_continuous(breaks = scales::pretty_breaks()) + + ggplot2::scale_y_continuous(breaks = seq(0, 10000, by = 5)) + ggplot2::scale_fill_manual( - values = c( - "PASS" = "#339900", - "HIGH_ERROR" = "#cc3333", - "LOW_MARKERS" = "#F1C40F", - "OTHER" = "#BDC3C7" - ), - labels = c( - "PASS" = "Pass", - "HIGH_ERROR" = "High Error", - "LOW_MARKERS" = "Low Markers", - "OTHER" = "Other" - ) + values = c("PASS" = "#339900", + "HIGH_ERROR" = "#cc3333", + "LOW_MARKERS" = "#F1C40F", + "OTHER" = "#BDC3C7"), + labels = c("PASS" = "Pass", + "HIGH_ERROR" = "High Error", + "LOW_MARKERS" = "Low Markers", + "OTHER" = "Other") ) + ggplot2::labs( title = "Parentage Mendelian Error Distribution", @@ -463,8 +442,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::print(p) if (write_txt) { - - plot_filename <- paste0(tools::file_path_sans_ext(output_filename), ".jpg") + plot_filename <- paste0(tools::file_path_sans_ext("parentage_testing_results.txt"), ".jpg") tryCatch({ ggplot2::ggsave(plot_filename, plot = p, device = "jpeg", width = 10, height = 6, dpi = 300) @@ -476,11 +454,5 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } } - if (verbose) { - base::cat("\n--- Parentage Assignment Results ---\n") - base::print(final_df) - return(base::invisible(final_df)) - } else { - return(final_df) - } + return(base::invisible(output_list)) } diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index c30491f..ebeecb0 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -1,69 +1,60 @@ #' Validate Pedigree Trios Using Mendelian Error Analysis #' -#' Validates parent-offspring trios by calculating Mendelian error rates from -#' SNP genotype data. Identifies incorrect parentage assignments and suggests -#' best-matching replacements. If a list of founders is supplied, trios that -#' are declared founders (both parents coded as 0) are preserved unchanged -#' with no recommendations. Trios removed due to missing genotype data are -#' retained in the output with a NO_GENOTYPE_DATA status. +#' Validates parent-offspring trios against SNP genotype data using Mendelian +#' error rates. Identifies incorrect parentage assignments, suggests +#' best-matching replacements, and outputs a corrected pedigree. Founder trios +#' (both parents coded as 0) are preserved unchanged if a founders file is +#' supplied. Trios absent from the genotype file are retained as +#' \code{NO_GENOTYPE_DATA}. #' #' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) #' with columns: ID, Male_Parent, Female_Parent. #' @param genotypes_file Character. Path to the genotypes file (TSV/CSV/TXT) #' with an ID column followed by marker columns coded as 0, 1, 2. -#' @param founders_file Character, optional. Path to a one-column file -#' listing the IDs of founder individuals. Founders with both parents -#' coded as 0 are left unchanged with no recommendations. Defaults to NULL. -#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage -#' to classify a trio as PASS (default: 5.0). Must be between 0 and 100. -#' @param min_markers Integer. Minimum number of non-missing markers -#' required to evaluate a trio (default: 10). +#' @param founders_file Character, optional. Path to a one-column file listing +#' founder IDs. Founders with both parents coded as 0 are left unchanged. +#' Defaults to NULL. +#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage to +#' classify a trio as PASS (default: 5.0). Must be between 0 and 100. +#' @param min_markers Integer. Minimum non-missing markers required to evaluate +#' a trio (default: 10). #' @param single_parent_error_threshold Numeric. Maximum homozygous-marker -#' mismatch percentage for a parent to be considered acceptable during -#' parent-level evaluation (default: 2.0). Must be between 0 and 100. -#' @param verbose Logical. If TRUE, prints progress messages, a summary -#' table, and results to the console (default: TRUE). -#' @param write_results Logical. If TRUE, writes the validation results to -#' output_filename and saves the plot as a .jpg file (default: TRUE). +#' mismatch percentage for a parent to be considered acceptable +#' (default: 2.0). Must be between 0 and 100. +#' @param verbose Logical. If TRUE, prints progress, summary, and results to +#' the console (default: TRUE). +#' @param write_results Logical. If TRUE, writes results to +#' \code{output_filename} and saves the plot as a .jpg (default: TRUE). #' @param output_filename Character. Path/name of the output results file -#' (default: "__validation_report.txt"). The plot will be saved using the -#' same base name with a .jpg extension. -#' @param plot_results Logical. If TRUE, prints a histogram of Trio Mendelian -#' Error percentages with a threshold line (default: TRUE). If write_results -#' is also TRUE, the plot is additionally saved as a .jpg file. -#' @param na_string Character. String for missing values in the output file. +#' (default: \code{"__validation_report.txt"}). The plot uses the same base +#' name with a .jpg extension. +#' @param plot_results Logical. If TRUE, prints a histogram of trio Mendelian +#' error percentages with a threshold line. Saved as .jpg when +#' \code{write_results = TRUE} (default: TRUE). +#' @param na_string Character. Missing value string in the output file. #' Use \code{"NA"} or \code{""} (default: \code{"NA"}). -#' @param corrected_pedigree_filename Character. Path/name of the output file -#' for the corrected pedigree (default: "corrected_pedigree.txt"). Set to -#' NULL to suppress writing the corrected pedigree. +#' @param corrected_pedigree_filename Character. Path/name for the corrected +#' pedigree output (default: \code{"corrected_pedigree.txt"}). Set to NULL +#' to suppress writing. #' -#' @return A data.table (returned invisibly) with one row per trio and -#' the following columns: +#' @return A named list (returned invisibly) with the following elements: #' \describe{ -#' \item{ID}{Individual ID.} -#' \item{Orig_Male_Parent}{Declared male parent ID.} -#' \item{Orig_Female_Parent}{Declared female parent ID.} -#' \item{Trio_Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -#' \item{Trio_Markers_Tested}{Number of markers with non-missing genotypes.} -#' \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, -#' MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, -#' or NO_GENOTYPE_DATA.} -#' \item{Recommended_Correction}{One of NONE, KEEP_BOTH, -#' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, -#' LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, -#' LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} -#' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} -#' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -#' \item{Best_Male_Candidate}{Best-matching male parent candidate ID.} -#' \item{Best_Male_Candidate_Error_Pct}{Homozygous mismatch percentage for the best male candidate.} -#' \item{Best_Female_Candidate}{Best-matching female parent candidate ID.} -#' \item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} +#' \item{pass}{Trios that passed the Mendelian error threshold.} +#' \item{fail}{Trios that failed the Mendelian error threshold.} +#' \item{low_markers}{Trios with insufficient markers for evaluation.} +#' \item{no_data}{Trios with no overlapping genotype data.} +#' \item{no_genotype_data}{Trios absent from the genotype file.} +#' \item{founders}{Trios identified as founders.} +#' \item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} +#' \item{full_results}{Complete data.table with all trios and all output columns.} +#' \item{corrected_pedigree}{Pedigree table after applying recommended corrections.} #' } -#' @export #' -#' @author Josué Chinchilla-Vargas +#' @author Josue Chinchilla-Vargas #' #' @importFrom data.table fread fwrite copy data.table set rbindlist +#' @importFrom tools file_path_sans_ext +#' @export validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, trio_error_threshold = 5.0, diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 8df46e8..32ea364 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,15 +2,14 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check and optionally correct a pedigree file} +\title{Check and Correct Common Pedigree Errors} \usage{ check_ped( ped.file, seed = NULL, verbose = TRUE, - correct = TRUE, - save_zip = FALSE, - save_corrected_zip = FALSE + correct_conflicting_trios = TRUE, + correct_inconsistent_sex_roles = TRUE ) } \arguments{ @@ -18,63 +17,44 @@ check_ped( \item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. Print findings to the console? Default \code{TRUE}.} +\item{verbose}{Logical. If TRUE (default), prints the report to the console.} -\item{correct}{Logical. Build and save a corrected pedigree? Default \code{TRUE}.} +\item{correct_conflicting_trios}{Logical. If TRUE (default), sets conflicting +male_parent and female_parent to 0 and collapses to one row per ID.} -\item{save_zip}{Logical. Export report components as a \code{.zip} archive? Default \code{FALSE}.} - -\item{save_corrected_zip}{Logical. Include corrected pedigree in the \code{.zip}? -Only applies when \code{save_zip = TRUE} and \code{correct = TRUE}. Default \code{FALSE}.} +\item{correct_inconsistent_sex_roles}{Logical. If TRUE (default), sets +male_parent and female_parent to 0 for rows involving IDs found as both, +then removes any resulting exact duplicates.} } \value{ An invisible named list of data frames: \describe{ -\item{exact_duplicates}{Exact duplicate rows.} -\item{repeated_ids_diff}{Rows with the same \code{id} but conflicting parent assignments.} -\item{inconsistent_sex_roles}{Rows involving IDs used as both male and female parent.} -\item{missing_parents}{Rows referencing parent IDs absent from \code{id}.} -\item{dependencies}{Rows whose \code{id} is involved in a detected cycle.} +\item{exact_duplicates}{Exact duplicate rows found in the input.} +\item{conflicting_trios}{IDs with conflicting male_parent or female_parent assignments.} +\item{inconsistent_sex_roles}{IDs appearing as both male_parent and female_parent.} +\item{missing_parents}{Parent IDs absent from id, added as founders.} +\item{dependencies}{Cycles detected in the pedigree. Must be resolved manually.} +\item{corrected_pedigree}{Corrected pedigree table.} } } \description{ -Reads a tab-separated pedigree file with columns \code{id}, \code{male_parent}, and -\code{female_parent} (any column order), performs structural and consistency checks, -reports findings, and optionally builds a partially corrected pedigree object. -} -\details{ -The input file is never modified. Findings are printed to the console when -\code{verbose = TRUE}, a report list is assigned to the global environment, and when -\code{correct = TRUE}, a corrected pedigree data frame is also assigned there. +Reads a 3-column pedigree file (id, male_parent, female_parent) and performs +quality checks, optionally correcting detected errors. Exact duplicates and +missing parents are always corrected. Conflicting trios and inconsistent sex +roles are corrected when their respective arguments are TRUE. Cycles are +reported only and must be resolved manually. } -\section{Checks performed}{ +\examples{ +ped_file <- system.file("check_ped_test.txt", package = "BIGr") +ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) -\describe{ -\item{Missing required columns}{Stops immediately if \code{id}, \code{male_parent}, or -\code{female_parent} are absent.} -\item{Exact duplicate rows}{Rows identical across all three fields. Removed -when \code{correct = TRUE}.} -\item{Repeated IDs with conflicting parents}{Same \code{id} with differing parent -assignments. Collapsed to one row when \code{correct = TRUE}; conflicting fields -are set to \code{"0"}.} -\item{Inconsistent sex roles}{IDs appearing as both \code{male_parent} and -\code{female_parent}. Reported only; no automatic correction.} -\item{Missing parents}{Parent IDs not listed as individuals. Reported, and -added as founder rows (\code{"0"/"0"}) when \code{correct = TRUE}.} -\item{Cycles}{Direct or indirect ancestry loops. Reported only; must be -resolved manually.} -} -} +ped_errors$inconsistent_sex_roles +ped_errors$corrected_pedigree -\section{Correction behavior}{ +conflicting_sex_ids <- ped_errors$inconsistent_sex_roles$id +print(conflicting_sex_ids) -When \code{correct = TRUE}, the function removes exact duplicates, collapses -conflicting repeated IDs (replacing inconsistent parent fields with \code{"0"}), -and appends missing parents as founders. Inconsistent sex roles and cycles are -not automatically corrected. Running the function again on the corrected -pedigree is recommended to reassess downstream issues. } - \author{ -Josué Chinchilla-Vargas +Josue Chinchilla-Vargas } diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 55b26e8..d41555a 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -15,72 +15,63 @@ find_parentage( allow_selfing = TRUE, verbose = TRUE, write_txt = TRUE, - output_filename = "parentage_testing_results.txt", na_string = "NA", plot_results = TRUE ) } \arguments{ -\item{genotypes_file}{Path to a TSV/CSV/TXT file containing genotype data. -Must include an 'ID' column followed by marker columns coded as 0, 1, 2 -(allele dosage).} +\item{genotypes_file}{Path to a TSV/CSV/TXT file with an 'ID' column +followed by marker columns coded as 0, 1, 2 (allele dosage).} -\item{parents_file}{Path to a TSV/CSV/TXT file listing candidate parent IDs. -Must include an 'ID' column. An optional 'Sex' column ('M', 'F', or 'A') -determines parent roles. If absent, all parents are treated as ambiguous.} +\item{parents_file}{Path to a TSV/CSV/TXT file with an 'ID' column and an +optional 'Sex' column ('M', 'F', or 'A'). If absent, all parents are +treated as ambiguous.} -\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs. -Must include an 'ID' column.} +\item{progeny_file}{Path to a TSV/CSV/TXT file with an 'ID' column.} \item{method}{Character. One of \code{"best_male_parent"}, \code{"best_female_parent"}, \code{"best_match"}, or \code{"best_pair"} (default). See Details.} -\item{min_markers}{Integer. Minimum markers required for a valid assignment. -Fewer markers flags \code{LOW_MARKERS} (default: \code{10}).} +\item{min_markers}{Integer. Minimum markers required; fewer flags +\code{LOW_MARKERS} (default: \code{10}).} -\item{error_threshold}{Numeric. Maximum mismatch percentage for a confident -assignment. Exceeded values flag \code{HIGH_ERROR} (default: \code{5.0}). -Must be between 0 and 100.} +\item{error_threshold}{Numeric. Maximum mismatch percentage; exceeded values +flag \code{HIGH_ERROR} (default: \code{5.0}). Must be between 0 and 100.} -\item{show_ties}{Logical. If \code{TRUE}, tied best pairs are reported as -additional suffix columns (e.g. \code{Male_Parent_2}) when -\code{method = "best_pair"}. Default is \code{TRUE}.} +\item{show_ties}{Logical. If \code{TRUE}, tied best pairs are appended as +suffix columns (e.g. \code{Male_Parent_2}) for \code{"best_pair"}. +Default is \code{TRUE}.} \item{allow_selfing}{Logical. If \code{FALSE}, pairs with identical male and female parent IDs are excluded. Default is \code{TRUE}.} -\item{verbose}{Logical. If \code{TRUE}, prints progress, summary, and -results to the console. Default is \code{TRUE}.} +\item{verbose}{Logical. If \code{TRUE}, prints progress and summary to the +console. Default is \code{TRUE}.} \item{write_txt}{Logical. If \code{TRUE}, writes results to -\code{output_filename}. The plot will be saved using the same base name -with a .jpg extension. Default is \code{TRUE}.} - -\item{output_filename}{Character. Path/name of the output results file -(default: \code{"parentage_testing_results.txt"}). The plot will be saved -using the same base name with a .jpg extension.} +\code{parentage_testing_results.txt}. Default is \code{TRUE}.} -\item{na_string}{Character. String for missing values in output file. +\item{na_string}{Character. Missing value string in output. Use \code{"NA"} or \code{""} (default: \code{"NA"}).} \item{plot_results}{Logical. If \code{TRUE}, plots the Mendelian error -distribution colored by assignment status. Requires \code{ggplot2}. -Plot is saved as \code{.jpg} when \code{write_txt = TRUE}. -Default is \code{TRUE}.} +distribution colored by status. Requires \code{ggplot2}. Saved as +\code{.jpg} when \code{write_txt = TRUE}. Default is \code{TRUE}.} } \value{ -A \code{data.table} with one row per progeny. Base columns are -\code{Progeny}, \code{Male_Parent} (or \code{Best_Match}), -\code{Female_Parent}, \code{Mendelian_Error_Pct}, \code{Markers_Tested}, -and \code{Assignment_Status} (\code{PASS}, \code{HIGH_ERROR}, or -\code{LOW_MARKERS}). Tie suffix columns (\code{Male_Parent_2}, etc.) are -appended when \code{show_ties = TRUE} and ties remain after tie-breaking. -Returned invisibly when \code{verbose = TRUE}. +A named list (returned invisibly) with the following elements: +\describe{ +\item{pass}{Progeny with a confident parentage assignment.} +\item{high_error}{Progeny whose best assignment exceeds the error threshold.} +\item{low_markers}{Progeny with insufficient markers for a valid assignment.} +\item{full_results}{Complete data.table with all progeny and all output columns.} +} } \description{ -Assigns the most likely parent(s) to each progeny individual based on -genotypic data using Mendelian error rates or homozygous mismatch rates. +Assigns the most likely parent(s) to each progeny from SNP genotype data +using Mendelian error rates or homozygous mismatch rates. Parents or progeny +absent from the genotype file are removed with a warning. } \details{ A homozygous-only matrix is pre-computed once and shared across @@ -91,14 +82,8 @@ A homozygous-only matrix is pre-computed once and shared across For \code{"best_pair"}, full Mendelian inheritance rules are applied across all marker and pair combinations using vectorised \code{vapply} calls, producing \code{n_pairs x n_progeny} mismatch and comparison matrices. - -Ties on mismatch rate are broken by maximum markers tested. Remaining ties -are reported as suffix columns when \code{show_ties = TRUE}. Base columns -are always populated with the top result. - -Parents or progeny absent from \code{genotypes_file} are removed with a -warning. Progeny below \code{min_markers} are flagged \code{LOW_MARKERS}; -those exceeding \code{error_threshold} are flagged \code{HIGH_ERROR}. +Ties are broken by maximum markers tested; remaining ties are appended as +suffix columns when \code{show_ties = TRUE}. } \examples{ \dontrun{ @@ -127,5 +112,5 @@ results <- find_parentage( } \author{ -Josué Chinchilla-Vargas +Josue Chinchilla-Vargas } diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 08c7329..78938d4 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -26,73 +26,63 @@ with columns: ID, Male_Parent, Female_Parent.} \item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV/TXT) with an ID column followed by marker columns coded as 0, 1, 2.} -\item{founders_file}{Character, optional. Path to a one-column file -listing the IDs of founder individuals. Founders with both parents -coded as 0 are left unchanged with no recommendations. Defaults to NULL.} +\item{founders_file}{Character, optional. Path to a one-column file listing +founder IDs. Founders with both parents coded as 0 are left unchanged. +Defaults to NULL.} -\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage -to classify a trio as PASS (default: 5.0). Must be between 0 and 100.} +\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage to +classify a trio as PASS (default: 5.0). Must be between 0 and 100.} -\item{min_markers}{Integer. Minimum number of non-missing markers -required to evaluate a trio (default: 10).} +\item{min_markers}{Integer. Minimum non-missing markers required to evaluate +a trio (default: 10).} \item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker -mismatch percentage for a parent to be considered acceptable during -parent-level evaluation (default: 2.0). Must be between 0 and 100.} +mismatch percentage for a parent to be considered acceptable +(default: 2.0). Must be between 0 and 100.} -\item{verbose}{Logical. If TRUE, prints progress messages, a summary -table, and results to the console (default: TRUE).} +\item{verbose}{Logical. If TRUE, prints progress, summary, and results to +the console (default: TRUE).} -\item{write_results}{Logical. If TRUE, writes the validation results to -output_filename and saves the plot as a .jpg file (default: TRUE).} +\item{write_results}{Logical. If TRUE, writes results to +\code{output_filename} and saves the plot as a .jpg (default: TRUE).} -\item{plot_results}{Logical. If TRUE, prints a histogram of Trio Mendelian -Error percentages with a threshold line (default: TRUE). If write_results -is also TRUE, the plot is additionally saved as a .jpg file.} +\item{plot_results}{Logical. If TRUE, prints a histogram of trio Mendelian +error percentages with a threshold line. Saved as .jpg when +\code{write_results = TRUE} (default: TRUE).} -\item{na_string}{Character. String for missing values in the output file. +\item{na_string}{Character. Missing value string in the output file. Use \code{"NA"} or \code{""} (default: \code{"NA"}).} \item{output_filename}{Character. Path/name of the output results file -(default: "__validation_report.txt"). The plot will be saved using the -same base name with a .jpg extension.} +(default: \code{"__validation_report.txt"}). The plot uses the same base +name with a .jpg extension.} -\item{corrected_pedigree_filename}{Character. Path/name of the output file -for the corrected pedigree (default: "corrected_pedigree.txt"). Set to -NULL to suppress writing the corrected pedigree.} +\item{corrected_pedigree_filename}{Character. Path/name for the corrected +pedigree output (default: \code{"corrected_pedigree.txt"}). Set to NULL +to suppress writing.} } \value{ -A data.table (returned invisibly) with one row per trio and -the following columns: +A named list (returned invisibly) with the following elements: \describe{ -\item{ID}{Individual ID.} -\item{Orig_Male_Parent}{Declared male parent ID.} -\item{Orig_Female_Parent}{Declared female parent ID.} -\item{Trio_Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -\item{Trio_Markers_Tested}{Number of markers with non-missing genotypes.} -\item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, -MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, -or NO_GENOTYPE_DATA.} -\item{Recommended_Correction}{One of NONE, KEEP_BOTH, -REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, -LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, -LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} -\item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} -\item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -\item{Best_Male_Candidate}{Best-matching male parent candidate ID.} -\item{Best_Male_Candidate_Error_Pct}{Homozygous mismatch percentage for the best male candidate.} -\item{Best_Female_Candidate}{Best-matching female parent candidate ID.} -\item{Best_Female_Candidate_Error_Pct}{Homozygous mismatch percentage for the best female candidate.} +\item{pass}{Trios that passed the Mendelian error threshold.} +\item{fail}{Trios that failed the Mendelian error threshold.} +\item{low_markers}{Trios with insufficient markers for evaluation.} +\item{no_data}{Trios with no overlapping genotype data.} +\item{no_genotype_data}{Trios absent from the genotype file.} +\item{founders}{Trios identified as founders.} +\item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} +\item{full_results}{Complete data.table with all trios and all output columns.} +\item{corrected_pedigree}{Pedigree table after applying recommended corrections.} } } \description{ -Validates parent-offspring trios by calculating Mendelian error rates from -SNP genotype data. Identifies incorrect parentage assignments and suggests -best-matching replacements. If a list of founders is supplied, trios that -are declared founders (both parents coded as 0) are preserved unchanged -with no recommendations. Trios removed due to missing genotype data are -retained in the output with a NO_GENOTYPE_DATA status. +Validates parent-offspring trios against SNP genotype data using Mendelian +error rates. Identifies incorrect parentage assignments, suggests +best-matching replacements, and outputs a corrected pedigree. Founder trios +(both parents coded as 0) are preserved unchanged if a founders file is +supplied. Trios absent from the genotype file are retained as +\code{NO_GENOTYPE_DATA}. } \author{ -Josué Chinchilla-Vargas +Josue Chinchilla-Vargas } From 9b16038f5f304b8eb85a445176cd36679c289154 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 26 May 2026 11:39:50 -0400 Subject: [PATCH 15/16] updated parentage functions to standardize inputs and outputs --- NAMESPACE | 2 - R/check_ped.R | 43 +- R/find_parentage.R | 281 +++---- R/validate_pedigree.R | 362 +++++---- man/check_ped.Rd | 8 +- man/find_parentage.Rd | 64 +- man/validate_pedigree.Rd | 35 +- tests/testthat/test-check_ped.R | 926 +++++++++++++++++------- tests/testthat/test-find_parentage.R | 587 ++++++++------- tests/testthat/test-validate_pedigree.R | 802 ++++++++++---------- 10 files changed, 1662 insertions(+), 1448 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 027d02e..f94124f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,7 +40,6 @@ importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,fread) -importFrom(data.table,fwrite) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(dplyr,"%>%") @@ -80,7 +79,6 @@ importFrom(stats,cor) importFrom(stats,reorder) importFrom(stats,setNames) importFrom(tibble,as_tibble) -importFrom(tools,file_path_sans_ext) importFrom(utils,packageVersion) importFrom(utils,read.csv) importFrom(utils,read.table) diff --git a/R/check_ped.R b/R/check_ped.R index ca82e14..78c53ed 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -19,7 +19,7 @@ #' \describe{ #' \item{exact_duplicates}{Exact duplicate rows found in the input.} #' \item{conflicting_trios}{IDs with conflicting male_parent or female_parent assignments.} -#' \item{inconsistent_sex_roles}{IDs appearing as both male_parent and female_parent.} +#' \item{inconsistent_sex_roles}{Rows where a conflicting ID appears as male_parent or female_parent.} #' \item{missing_parents}{Parent IDs absent from id, added as founders.} #' \item{dependencies}{Cycles detected in the pedigree. Must be resolved manually.} #' \item{corrected_pedigree}{Corrected pedigree table.} @@ -32,7 +32,11 @@ #' ped_errors$inconsistent_sex_roles #' ped_errors$corrected_pedigree #' -#' conflicting_sex_ids <- ped_errors$inconsistent_sex_roles$id +#' # Identify the conflicting IDs from the parent columns +#' conflicting_sex_ids <- unique(c( +#' ped_errors$inconsistent_sex_roles$male_parent, +#' ped_errors$inconsistent_sex_roles$female_parent +#' )) #' print(conflicting_sex_ids) #' #' @author Josue Chinchilla-Vargas @@ -50,14 +54,11 @@ check_ped <- function(ped.file, #### setup #### if (!is.null(seed)) set.seed(seed) - data <- utils::read.table(ped.file, header = TRUE) %>% - dplyr::mutate( - id = as.character(id), - male_parent = as.character(male_parent), - female_parent = as.character(female_parent) - ) + # read inputs first, then normalize column names to lowercase + data <- utils::read.table(ped.file, header = TRUE) + names(data) <- tolower(names(data)) - # Validate required column names before any processing + # validate required columns after normalization required_cols <- c("id", "male_parent", "female_parent") missing_cols <- setdiff(required_cols, colnames(data)) if (length(missing_cols) > 0) { @@ -68,7 +69,17 @@ check_ped <- function(ped.file, ) } - # Add row numbers before any processing so all reports reference original rows [1] + # reorder so required columns always come first regardless of column order in the input file + extra_cols <- setdiff(names(data), required_cols) + data <- data[, c(required_cols, extra_cols)] + + data <- data %>% + dplyr::mutate( + id = as.character(id), + male_parent = as.character(male_parent), + female_parent = as.character(female_parent) + ) + data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) errors <- list() @@ -104,7 +115,6 @@ check_ped <- function(ped.file, dplyr::ungroup() if (correct_conflicting_trios && nrow(conflicting_trios_ids) > 0) { - # Set conflicting parents to "0" -- rows become exact duplicates, summarize collapses to one [1] data <- data %>% dplyr::group_by(id) %>% dplyr::summarize( @@ -155,11 +165,14 @@ check_ped <- function(ped.file, male_ids <- unique(data$male_parent[data$male_parent != "0"]) female_ids <- unique(data$female_parent[data$female_parent != "0"]) conflicting_sex_ids <- intersect(male_ids, female_ids) - inconsistent_sex_roles <- data %>% dplyr::filter(id %in% conflicting_sex_ids) + + # FIX 4: return the rows where the conflicting ID appears AS A PARENT + # (not the row where it is the subject) — this shows the actual contradiction + inconsistent_sex_roles <- data %>% + dplyr::filter(male_parent %in% conflicting_sex_ids | + female_parent %in% conflicting_sex_ids) if (correct_inconsistent_sex_roles && length(conflicting_sex_ids) > 0) { - # Zero out male_parent/female_parent wherever a conflicting ID appears, - # then distinct() removes any rows that became exact duplicates [1] data <- data %>% dplyr::mutate( male_parent = dplyr::if_else(male_parent %in% conflicting_sex_ids, "0", male_parent), @@ -227,7 +240,7 @@ check_ped <- function(ped.file, conflicting_trios = conflicting_trios, inconsistent_sex_roles = inconsistent_sex_roles, missing_parents = missing_parents, - dependencies = data.frame(Dependency = unique(unlist(errors)), + dependencies = data.frame(dependency = unique(unlist(errors)), stringsAsFactors = FALSE), corrected_pedigree = data %>% dplyr::select(-row_number) ) diff --git a/R/find_parentage.R b/R/find_parentage.R index c4ff140..56cd41b 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -4,83 +4,41 @@ #' using Mendelian error rates or homozygous mismatch rates. Parents or progeny #' absent from the genotype file are removed with a warning. #' -#' @param genotypes_file Path to a TSV/CSV/TXT file with an 'ID' column +#' @param genotypes_file Path to a TSV/CSV/TXT file with an 'id' column #' followed by marker columns coded as 0, 1, 2 (allele dosage). -#' @param parents_file Path to a TSV/CSV/TXT file with an 'ID' column and an -#' optional 'Sex' column ('M', 'F', or 'A'). If absent, all parents are +#' @param parents_file Path to a TSV/CSV/TXT file with an 'id' column and an +#' optional 'sex' column ('M', 'F', or 'A'). If absent, all parents are #' treated as ambiguous. -#' @param progeny_file Path to a TSV/CSV/TXT file with an 'ID' column. +#' @param progeny_file Path to a TSV/CSV/TXT file with an 'id' column. #' @param method Character. One of \code{"best_male_parent"}, #' \code{"best_female_parent"}, \code{"best_match"}, or #' \code{"best_pair"} (default). See Details. #' @param min_markers Integer. Minimum markers required; fewer flags -#' \code{LOW_MARKERS} (default: \code{10}). +#' \code{low_markers} (default: \code{10}). #' @param error_threshold Numeric. Maximum mismatch percentage; exceeded values -#' flag \code{HIGH_ERROR} (default: \code{5.0}). Must be between 0 and 100. +#' flag \code{high_error} (default: \code{5.0}). Must be between 0 and 100. #' @param show_ties Logical. If \code{TRUE}, tied best pairs are appended as -#' suffix columns (e.g. \code{Male_Parent_2}) for \code{"best_pair"}. +#' suffix columns (e.g. \code{male_parent_2}) for \code{"best_pair"}. #' Default is \code{TRUE}. #' @param allow_selfing Logical. If \code{FALSE}, pairs with identical male and #' female parent IDs are excluded. Default is \code{TRUE}. #' @param verbose Logical. If \code{TRUE}, prints progress and summary to the #' console. Default is \code{TRUE}. -#' @param write_txt Logical. If \code{TRUE}, writes results to -#' \code{parentage_testing_results.txt}. Default is \code{TRUE}. -#' @param na_string Character. Missing value string in output. -#' Use \code{"NA"} or \code{""} (default: \code{"NA"}). #' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error -#' distribution colored by status. Requires \code{ggplot2}. Saved as -#' \code{.jpg} when \code{write_txt = TRUE}. Default is \code{TRUE}. +#' distribution colored by status. Requires \code{ggplot2}. Default is \code{TRUE}. #' #' @return A named list (returned invisibly) with the following elements: -#' \describe{ -#' \item{pass}{Progeny with a confident parentage assignment.} -#' \item{high_error}{Progeny whose best assignment exceeds the error threshold.} -#' \item{low_markers}{Progeny with insufficient markers for a valid assignment.} -#' \item{full_results}{Complete data.table with all progeny and all output columns.} -#' } -#' -#' @details -#' A homozygous-only matrix is pre-computed once and shared across -#' \code{"best_male_parent"}, \code{"best_female_parent"}, and -#' \code{"best_match"}, where heterozygous markers (coded 1) are set to -#' \code{NA} to reduce false mismatches from phase ambiguity. -#' -#' For \code{"best_pair"}, full Mendelian inheritance rules are applied across -#' all marker and pair combinations using vectorised \code{vapply} calls, -#' producing \code{n_pairs x n_progeny} mismatch and comparison matrices. -#' Ties are broken by maximum markers tested; remaining ties are appended as -#' suffix columns when \code{show_ties = TRUE}. -#' -#' @examples -#' \dontrun{ -#' results <- find_parentage( -#' genotypes_file = "genotypes.txt", -#' parents_file = "parents.txt", -#' progeny_file = "progeny.txt", -#' method = "best_pair", -#' min_markers = 50, -#' error_threshold = 5.0, -#' show_ties = TRUE, -#' allow_selfing = FALSE, -#' na_string = "NA", -#' plot_results = TRUE -#' ) -#' -#' results <- find_parentage( -#' genotypes_file = "genotypes.txt", -#' parents_file = "parents.txt", -#' progeny_file = "progeny.txt", -#' method = "best_match", -#' min_markers = 30, -#' error_threshold = 3.0 -#' ) +#' \describe{ +#' \item{pass}{Progeny with a confident parentage assignment.} +#' \item{high_error}{Progeny whose best assignment exceeds the error threshold.} +#' \item{low_markers}{Progeny with insufficient markers for a valid assignment.} +#' \item{full_results}{Complete data.table with all progeny and all output columns.} +#' \item{plot}{ggplot object. Use ggsave() to save if desired.} #' } #' #' @author Josue Chinchilla-Vargas #' -#' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table -#' @importFrom tools file_path_sans_ext +#' @importFrom data.table fread copy CJ rbindlist set data.table as.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best_pair", @@ -89,13 +47,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE, - na_string = "NA", plot_results = TRUE) { - ## ---- silence R CMD check NOTEs for data.table / ggplot2 NSE columns ---- - ID <- Sex <- Male_Parent <- Female_Parent <- NULL - Mendelian_Error_Pct <- Plot_Status <- Assignment_Status <- NULL + ## silence R CMD check NOTEs + id <- sex <- male_parent <- female_parent <- NULL + mendelian_error_pct <- plot_status <- status <- NULL #### Input Validation and Data Loading #### allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") @@ -105,8 +61,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("min_markers must be a positive integer.") if (error_threshold < 0 || error_threshold > 100) stop("error_threshold must be between 0 and 100.") - if (!na_string %in% c("NA", "")) - stop("na_string must be either 'NA' or ''.") tryCatch({ genos <- data.table::fread(genotypes_file, sep = "auto") @@ -116,29 +70,29 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - valid_ids <- genos$ID - removed_parents <- base::setdiff(all_parents$ID, valid_ids) + valid_ids <- genos$id + removed_parents <- base::setdiff(all_parents$id, valid_ids) if (base::length(removed_parents) > 0) { warning("The following parent IDs were not in the genotype file and will not be analyzed: ", paste(removed_parents, collapse = ", "), call. = FALSE) - all_parents <- all_parents[ID %in% valid_ids] + all_parents <- all_parents[id %in% valid_ids] } - removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) + removed_progeny <- base::setdiff(progeny_candidates$id, valid_ids) if (base::length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) - progeny_candidates <- progeny_candidates[ID %in% valid_ids] + progeny_candidates <- progeny_candidates[id %in% valid_ids] } - if (!"Sex" %in% base::colnames(all_parents)) { - warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") - all_parents[, Sex := "A"] + if (!"sex" %in% base::colnames(all_parents)) { + warning("No 'sex' column in parents file. All parents treated as ambiguous ('A').") + all_parents[, sex := "A"] } - all_parents[, Sex := base::toupper(Sex)] - male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] - female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] + all_parents[, sex := base::toupper(sex)] + male_parent_candidates <- all_parents[sex %in% c("M", "A", "NA"), .SD] + female_parent_candidates <- all_parents[sex %in% c("F", "A", "NA")] if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) @@ -147,36 +101,36 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") - #### Pre-compute genotype matrices once (shared across all methods) #### - genos_mat <- base::as.matrix(genos, rownames = "ID") + #### Pre-compute genotype matrices once #### + genos_mat <- base::as.matrix(genos, rownames = "id") genos_hom <- data.table::copy(genos) - marker_cols <- base::setdiff(base::names(genos_hom), "ID") + marker_cols <- base::setdiff(base::names(genos_hom), "id") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] - genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "id") - #### Assignment_Status helper #### + #### Status helper #### assign_status <- function(markers, error_pct) { - base::ifelse(markers < min_markers, "LOW_MARKERS", - base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) + base::ifelse(markers < min_markers, "low_markers", + base::ifelse(error_pct > error_threshold, "high_error", "pass")) } #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { parent_ids <- base::switch(method, - "best_male_parent" = male_parent_candidates$ID, - "best_female_parent" = female_parent_candidates$ID, - "best_match" = base::union(male_parent_candidates$ID, - female_parent_candidates$ID)) - parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] - progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + "best_male_parent" = male_parent_candidates$id, + "best_female_parent" = female_parent_candidates$id, + "best_match" = base::union(male_parent_candidates$id, + female_parent_candidates$id)) + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$id, , drop = FALSE] n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( - Progeny = base::rownames(progeny_genos), - Best_Match = NA_character_, - Mendelian_Error_Pct = NA_real_, - Markers_Tested = NA_integer_, - Assignment_Status = NA_character_ + id = base::rownames(progeny_genos), + best_match = NA_character_, + mendelian_error_pct = NA_real_, + markers_tested = NA_integer_, + status = NA_character_ ) for (i in base::seq_len(n_progeny)) { progeny_vec <- progeny_genos[i, ] @@ -186,33 +140,33 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, percent_mismatch[base::is.nan(percent_mismatch)] <- NA best_idx <- base::which.min(percent_mismatch) if (base::length(best_idx) == 0) { - data.table::set(results_dt, i, "Markers_Tested", 0L) - data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") + data.table::set(results_dt, i, "markers_tested", 0L) + data.table::set(results_dt, i, "status", "low_markers") next } best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) - data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) - data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) - data.table::set(results_dt, i, "Assignment_Status", assign_status(best_markers, best_error)) + data.table::set(results_dt, i, "best_match", base::rownames(parent_genos)[best_idx]) + data.table::set(results_dt, i, "mendelian_error_pct", best_error) + data.table::set(results_dt, i, "markers_tested", base::as.integer(best_markers)) + data.table::set(results_dt, i, "status", assign_status(best_markers, best_error)) } final_df <- results_dt } #### Logic for Best Pair Method #### if (method == "best_pair") { - parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, - Female_Parent = female_parent_candidates$ID) + parent_pairs <- data.table::CJ(male_parent = male_parent_candidates$id, + female_parent = female_parent_candidates$id) if (!allow_selfing) { - parent_pairs <- parent_pairs[Male_Parent != Female_Parent] + parent_pairs <- parent_pairs[male_parent != female_parent] if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] - female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - progeny_ids <- progeny_candidates$ID + male_parent_genos_mat <- genos_mat[parent_pairs$male_parent, , drop = FALSE] + female_parent_genos_mat <- genos_mat[parent_pairs$female_parent, , drop = FALSE] + progeny_ids <- progeny_candidates$id progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) @@ -249,12 +203,12 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA results_dt <- data.table::data.table( - Progeny = progeny_ids, - Male_Parent = NA_character_, - Female_Parent = NA_character_, - Mendelian_Error_Pct = NA_character_, - Markers_Tested = NA_integer_, - Assignment_Status = NA_character_ + id = progeny_ids, + male_parent = NA_character_, + female_parent = NA_character_, + mendelian_error_pct = NA_character_, + markers_tested = NA_integer_, + status = NA_character_ ) results_list <- base::vector("list", n_progeny) @@ -265,8 +219,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) if (base::is.infinite(min_mismatch_val)) { - data.table::set(results_dt, j, "Markers_Tested", 0L) - data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") + data.table::set(results_dt, j, "markers_tested", 0L) + data.table::set(results_dt, j, "status", "low_markers") next } @@ -290,19 +244,19 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) - data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) - data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) - data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) - data.table::set(results_dt, j, "Assignment_Status", a_status) + data.table::set(results_dt, j, "male_parent", best_pairs$male_parent[1]) + data.table::set(results_dt, j, "female_parent", best_pairs$female_parent[1]) + data.table::set(results_dt, j, "mendelian_error_pct", base::sprintf("%.2f", min_mismatch_val)) + data.table::set(results_dt, j, "markers_tested", base::as.integer(best_markers)) + data.table::set(results_dt, j, "status", a_status) if (show_ties && num_to_report > 1) { - tie_row <- base::list(Progeny = prog_id) + tie_row <- base::list(id = prog_id) for (k in base::seq(2, num_to_report)) { - tie_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] - tie_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] - tie_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val - tie_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] + tie_row[[base::paste0("male_parent_", k)]] <- best_pairs$male_parent[k] + tie_row[[base::paste0("female_parent_", k)]] <- best_pairs$female_parent[k] + tie_row[[base::paste0("mendelian_error_pct_", k)]] <- min_mismatch_val + tie_row[[base::paste0("markers_tested_", k)]] <- comparisons[best_indices[k]] } results_list[[j]] <- data.table::as.data.table(tie_row) } @@ -314,7 +268,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, use.names = TRUE ) if (base::nrow(tie_rows) > 0) { - final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) + final_df <- merge(results_dt, tie_rows, by = "id", all.x = TRUE) for (col in base::names(final_df)) data.table::set(final_df, which(final_df[[col]] == ""), col, NA_character_) } else { @@ -322,15 +276,15 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } } - #### Compile named list (mirrors check_ped structure) #### + #### Compile named list #### output_list <- list( - pass = final_df[Assignment_Status == "PASS"], - high_error = final_df[Assignment_Status == "HIGH_ERROR"], - low_markers = final_df[Assignment_Status == "LOW_MARKERS"], + pass = final_df[status == "pass"], + high_error = final_df[status == "high_error"], + low_markers = final_df[status == "low_markers"], full_results = final_df ) - #### Verbose output (mirrors check_ped reporting style) #### + #### Verbose output #### if (verbose) { total_progeny <- base::nrow(final_df) base::cat("\n=== Parentage Assignment Report ===\n") @@ -341,7 +295,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, n_pass <- base::nrow(output_list$pass) if (n_pass > 0) { - base::cat(base::sprintf("\n%d progeny PASSED (%.1f%%).\n", + base::cat(base::sprintf("\n%d progeny passed (%.1f%%).\n", n_pass, (n_pass / total_progeny) * 100)) } else { base::cat("\nNo progeny passed.\n") @@ -349,7 +303,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, n_high <- base::nrow(output_list$high_error) if (n_high > 0) { - base::cat(base::sprintf("\n%d progeny flagged HIGH_ERROR (%.1f%%):\n", + base::cat(base::sprintf("\n%d progeny flagged high_error (%.1f%%):\n", n_high, (n_high / total_progeny) * 100)) base::print(output_list$high_error) } else { @@ -358,7 +312,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, n_low <- base::nrow(output_list$low_markers) if (n_low > 0) { - base::cat(base::sprintf("\n%d progeny flagged LOW_MARKERS (%.1f%%):\n", + base::cat(base::sprintf("\n%d progeny flagged low_markers (%.1f%%):\n", n_low, (n_low / total_progeny) * 100)) base::print(output_list$low_markers) } else { @@ -368,40 +322,29 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::cat("\nFull results are included in the returned list as $full_results.\n") } - #### Write results #### - if (write_txt) { - output_filename <- "parentage_testing_results.txt" - tryCatch({ - data.table::fwrite(final_df, file = output_filename, sep = "\t", - quote = FALSE, na = na_string) - if (verbose) base::cat("Results written to:", output_filename, "\n") - }, error = function(e) { - warning("Could not write results to file. Error: ", e$message, call. = FALSE) - }) - } - #### Plot Results #### + p <- NULL if (plot_results) { if (!requireNamespace("ggplot2", quietly = TRUE)) { warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) } else { - plot_df <- final_df[!is.na(final_df$Mendelian_Error_Pct)] - plot_df$Mendelian_Error_Pct <- base::as.numeric(plot_df$Mendelian_Error_Pct) - plot_df$Plot_Status <- base::ifelse( - plot_df$Assignment_Status == "PASS", "PASS", + plot_df <- final_df[!is.na(final_df$mendelian_error_pct)] + plot_df$mendelian_error_pct <- base::as.numeric(plot_df$mendelian_error_pct) + plot_df$plot_status <- base::ifelse( + plot_df$status == "pass", "pass", base::ifelse( - plot_df$Assignment_Status == "HIGH_ERROR", "HIGH_ERROR", + plot_df$status == "high_error", "high_error", base::ifelse( - plot_df$Assignment_Status == "LOW_MARKERS", "LOW_MARKERS", - "OTHER" + plot_df$status == "low_markers", "low_markers", + "other" ) ) ) n_total <- base::nrow(plot_df) - n_pass <- base::sum(plot_df$Assignment_Status == "PASS", na.rm = TRUE) - n_high <- base::sum(plot_df$Assignment_Status == "HIGH_ERROR", na.rm = TRUE) - n_low <- base::sum(plot_df$Assignment_Status == "LOW_MARKERS", na.rm = TRUE) + n_pass <- base::sum(plot_df$status == "pass", na.rm = TRUE) + n_high <- base::sum(plot_df$status == "high_error", na.rm = TRUE) + n_low <- base::sum(plot_df$status == "low_markers", na.rm = TRUE) threshold_label <- base::paste0( "Error Threshold: ", error_threshold, "% | ", @@ -412,7 +355,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, p <- ggplot2::ggplot( plot_df, - ggplot2::aes(x = Mendelian_Error_Pct, fill = Plot_Status) + ggplot2::aes(x = mendelian_error_pct, fill = plot_status) ) + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + ggplot2::geom_vline(xintercept = error_threshold, @@ -420,14 +363,14 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + ggplot2::scale_y_continuous(breaks = seq(0, 10000, by = 5)) + ggplot2::scale_fill_manual( - values = c("PASS" = "#339900", - "HIGH_ERROR" = "#cc3333", - "LOW_MARKERS" = "#F1C40F", - "OTHER" = "#BDC3C7"), - labels = c("PASS" = "Pass", - "HIGH_ERROR" = "High Error", - "LOW_MARKERS" = "Low Markers", - "OTHER" = "Other") + values = c("pass" = "#339900", + "high_error" = "#cc3333", + "low_markers" = "#F1C40F", + "other" = "#BDC3C7"), + labels = c("pass" = "Pass", + "high_error" = "High Error", + "low_markers" = "Low Markers", + "other" = "Other") ) + ggplot2::labs( title = "Parentage Mendelian Error Distribution", @@ -440,19 +383,9 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ggplot2::theme(legend.position = "top") base::print(p) - - if (write_txt) { - plot_filename <- paste0(tools::file_path_sans_ext("parentage_testing_results.txt"), ".jpg") - tryCatch({ - ggplot2::ggsave(plot_filename, plot = p, - device = "jpeg", width = 10, height = 6, dpi = 300) - if (verbose) base::cat("Plot saved to:", plot_filename, "\n") - }, error = function(e) { - warning("Could not save plot. Error: ", e$message, call. = FALSE) - }) - } } } + output_list$plot <- p return(base::invisible(output_list)) } diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index ebeecb0..030d80f 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -5,17 +5,17 @@ #' best-matching replacements, and outputs a corrected pedigree. Founder trios #' (both parents coded as 0) are preserved unchanged if a founders file is #' supplied. Trios absent from the genotype file are retained as -#' \code{NO_GENOTYPE_DATA}. +#' no_genotype_data. #' #' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) -#' with columns: ID, Male_Parent, Female_Parent. +#' with columns: id, male_parent, female_parent. #' @param genotypes_file Character. Path to the genotypes file (TSV/CSV/TXT) -#' with an ID column followed by marker columns coded as 0, 1, 2. +#' with an id column followed by marker columns coded as 0, 1, 2. #' @param founders_file Character, optional. Path to a one-column file listing #' founder IDs. Founders with both parents coded as 0 are left unchanged. #' Defaults to NULL. #' @param trio_error_threshold Numeric. Maximum Mendelian error percentage to -#' classify a trio as PASS (default: 5.0). Must be between 0 and 100. +#' classify a trio as pass (default: 5.0). Must be between 0 and 100. #' @param min_markers Integer. Minimum non-missing markers required to evaluate #' a trio (default: 10). #' @param single_parent_error_threshold Numeric. Maximum homozygous-marker @@ -23,57 +23,39 @@ #' (default: 2.0). Must be between 0 and 100. #' @param verbose Logical. If TRUE, prints progress, summary, and results to #' the console (default: TRUE). -#' @param write_results Logical. If TRUE, writes results to -#' \code{output_filename} and saves the plot as a .jpg (default: TRUE). -#' @param output_filename Character. Path/name of the output results file -#' (default: \code{"__validation_report.txt"}). The plot uses the same base -#' name with a .jpg extension. #' @param plot_results Logical. If TRUE, prints a histogram of trio Mendelian -#' error percentages with a threshold line. Saved as .jpg when -#' \code{write_results = TRUE} (default: TRUE). -#' @param na_string Character. Missing value string in the output file. -#' Use \code{"NA"} or \code{""} (default: \code{"NA"}). -#' @param corrected_pedigree_filename Character. Path/name for the corrected -#' pedigree output (default: \code{"corrected_pedigree.txt"}). Set to NULL -#' to suppress writing. +#' error percentages with a threshold line (default: TRUE). #' -#' @return A named list (returned invisibly) with the following elements: -#' \describe{ -#' \item{pass}{Trios that passed the Mendelian error threshold.} -#' \item{fail}{Trios that failed the Mendelian error threshold.} -#' \item{low_markers}{Trios with insufficient markers for evaluation.} -#' \item{no_data}{Trios with no overlapping genotype data.} -#' \item{no_genotype_data}{Trios absent from the genotype file.} -#' \item{founders}{Trios identified as founders.} -#' \item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} -#' \item{full_results}{Complete data.table with all trios and all output columns.} -#' \item{corrected_pedigree}{Pedigree table after applying recommended corrections.} -#' } +#' @return An invisible named list with the following elements: +#' \describe{ +#' \item{pass}{Trios that passed the Mendelian error threshold.} +#' \item{fail}{Trios that failed the Mendelian error threshold.} +#' \item{low_markers}{Trios with insufficient markers for evaluation.} +#' \item{no_genotype_data}{Trios absent from the genotype file.} +#' \item{founders}{Trios identified as founders.} +#' \item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} +#' \item{full_results}{Complete data.table with all trios and all output columns.} +#' \item{corrected_pedigree}{Pedigree table after applying recommended corrections.} +#' \item{plot}{ggplot object. Use ggsave() to save if desired.} +#' } #' #' @author Josue Chinchilla-Vargas #' -#' @importFrom data.table fread fwrite copy data.table set rbindlist -#' @importFrom tools file_path_sans_ext +#' @importFrom data.table fread copy data.table set rbindlist #' @export validate_pedigree <- function(pedigree_file, genotypes_file, - founders_file = NULL, - trio_error_threshold = 5.0, - min_markers = 10, + founders_file = NULL, + trio_error_threshold = 5.0, + min_markers = 10, single_parent_error_threshold = 2.0, - verbose = TRUE, - write_results = TRUE, - plot_results = TRUE, - na_string = "NA", - output_filename = "__validation_report.txt", - corrected_pedigree_filename = "corrected_pedigree.txt") { + verbose = TRUE, + plot_results = TRUE) { #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) stop("trio_error_threshold must be between 0 and 100") if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) stop("single_parent_error_threshold must be between 0 and 100") - if (!na_string %in% c("NA", "")) - stop("na_string must be either 'NA' or ''.") tryCatch({ pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") @@ -83,16 +65,16 @@ validate_pedigree <- function(pedigree_file, genotypes_file, }) #### Check required columns #### - required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") + required_ped_cols <- c("id", "male_parent", "female_parent") missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) if (base::length(missing_cols) > 0) stop("Pedigree file missing required columns: ", base::paste(missing_cols, collapse = ", ")) - if (!"ID" %in% base::names(genos)) - stop("Genotypes file must have an 'ID' column") + if (!"id" %in% base::names(genos)) + stop("Genotypes file must have an 'id' column") - pedigree[, Male_Parent := as.character(Male_Parent)] - pedigree[, Female_Parent := as.character(Female_Parent)] + pedigree[, male_parent := as.character(male_parent)] + pedigree[, female_parent := as.character(female_parent)] original_pedigree <- data.table::copy(pedigree) #### Read founders list #### @@ -108,24 +90,24 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } #### Build genotype matrices #### - genos_mat <- base::as.matrix(genos, rownames = "ID") + genos_mat <- base::as.matrix(genos, rownames = "id") genos_hom <- data.table::copy(genos) - marker_cols <- base::setdiff(base::names(genos_hom), "ID") + marker_cols <- base::setdiff(base::names(genos_hom), "id") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] - genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "id") #### Identify trios missing from the genotype file #### - valid_ids <- as.character(genos$ID) - has_geno <- pedigree[ID %in% valid_ids & - (Male_Parent %in% valid_ids | Male_Parent == "0") & - (Female_Parent %in% valid_ids | Female_Parent == "0")] - no_geno_rows <- pedigree[!(ID %in% valid_ids) | - (!(Male_Parent %in% valid_ids) & Male_Parent != "0") | - (!(Female_Parent %in% valid_ids) & Female_Parent != "0")] + valid_ids <- as.character(genos$id) + has_geno <- pedigree[id %in% valid_ids & + (male_parent %in% valid_ids | male_parent == "0") & + (female_parent %in% valid_ids | female_parent == "0")] + no_geno_rows <- pedigree[!(id %in% valid_ids) | + (!(male_parent %in% valid_ids) & male_parent != "0") | + (!(female_parent %in% valid_ids) & female_parent != "0")] if (base::nrow(no_geno_rows) > 0 && verbose) base::cat("Found", base::nrow(no_geno_rows), - "trios with missing genotype data; flagged as NO_GENOTYPE_DATA.\n") + "trios with missing genotype data; flagged as no_genotype_data.\n") pedigree <- has_geno if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") @@ -143,8 +125,7 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (comparisons == 0) return(NA_real_) (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) - - if (base::all(base::is.na(errors))) + if (base::all(base::is.na(errors))) return(base::list(id = NA_character_, error_pct = NA_real_)) best_idx <- base::which.min(errors) base::list(id = candidates[best_idx], @@ -153,12 +134,12 @@ validate_pedigree <- function(pedigree_file, genotypes_file, #### Main trio evaluation loop #### results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { - prog_id <- pedigree$ID[i] - male_parent_id <- pedigree$Male_Parent[i] - female_parent_id <- pedigree$Female_Parent[i] - correction_decision <- "NONE" + prog_id <- pedigree$id[i] + male_parent_id <- pedigree$male_parent[i] + female_parent_id <- pedigree$female_parent[i] + correction_decision <- "none" error_pct <- NA_real_ - status <- "NO_DATA" + status <- "no_data" markers_tested <- 0L male_parent_error_pct <- NA_real_ female_parent_error_pct <- NA_real_ @@ -169,12 +150,12 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (male_parent_id == "0" && female_parent_id == "0" && prog_id %in% founder_ids) { - status <- "FOUNDERS" - correction_decision <- "NONE" + status <- "founders" + correction_decision <- "none" } else { if (male_parent_id == "0" && female_parent_id == "0") { - status <- "MISSING_BOTH_PARENTS" - correction_decision <- "NONE" + status <- "missing_both_parents" + correction_decision <- "none" best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct @@ -182,14 +163,14 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } else if (male_parent_id == "0" && female_parent_id != "0") { - status <- "MISSING_MALE_PARENT" - correction_decision <- "NONE" + status <- "missing_male_parent" + correction_decision <- "none" best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct } else if (male_parent_id != "0" && female_parent_id == "0") { - status <- "MISSING_FEMALE_PARENT" - correction_decision <- "NONE" + status <- "missing_female_parent" + correction_decision <- "none" best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct @@ -212,19 +193,19 @@ validate_pedigree <- function(pedigree_file, genotypes_file, !base::is.na(female_parent_vec) & !base::is.na(progeny_vec)) if (markers_tested == 0) { - status <- "NO_DATA" - correction_decision <- "NONE" + status <- "no_data" + correction_decision <- "none" } else { error_pct <- (mismatches / markers_tested) * 100 if (markers_tested < min_markers) { - status <- "LOW_MARKERS" + status <- "low_markers" } else if (error_pct <= trio_error_threshold) { - status <- "PASS" - correction_decision <- "NONE" + status <- "pass" + correction_decision <- "none" } else { - status <- "FAIL" + status <- "fail" } - if (status %in% c("FAIL", "LOW_MARKERS")) { + if (status %in% c("fail", "low_markers")) { progeny_hom <- genos_hom_mat[prog_id, ] male_parent_hom <- genos_hom_mat[male_parent_id, ] female_parent_hom <- genos_hom_mat[female_parent_id, ] @@ -238,24 +219,24 @@ validate_pedigree <- function(pedigree_file, genotypes_file, female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else base::round((base::sum(female_parent_hom != progeny_hom, na.rm = TRUE) / female_comparisons) * 100, 2) - male_acceptable <- !is.na(male_parent_error_pct) && + male_acceptable <- !is.na(male_parent_error_pct) && male_parent_error_pct <= single_parent_error_threshold female_acceptable <- !is.na(female_parent_error_pct) && female_parent_error_pct <= single_parent_error_threshold if (male_acceptable && female_acceptable) { - correction_decision <- "KEEP_BOTH" + correction_decision <- "keep_both" } else if (male_acceptable && !female_acceptable) { - correction_decision <- "REMOVE_FEMALE_PARENT" + correction_decision <- "remove_female_parent" best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } else if (!male_acceptable && female_acceptable) { - correction_decision <- "REMOVE_MALE_PARENT" + correction_decision <- "remove_male_parent" best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct } else { - correction_decision <- "REMOVE_BOTH" + correction_decision <- "remove_both" best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct @@ -263,157 +244,144 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } - if (status == "LOW_MARKERS") - correction_decision <- paste0("LOW_MARKERS_", correction_decision) + if (status == "low_markers") + correction_decision <- paste0("low_markers_", correction_decision) } } } } data.table::data.table( - ID = prog_id, - Orig_Male_Parent = male_parent_id, - Orig_Female_Parent = female_parent_id, - Trio_Mendelian_Error_Pct = base::round(error_pct, 2), - Trio_Markers_Tested = markers_tested, - Status = status, - Recommended_Correction = correction_decision, - Male_Parent_Hom_Error_Pct = male_parent_error_pct, - Female_Parent_Hom_Error_Pct = female_parent_error_pct, - Best_Male_Candidate = best_male_parent, - Best_Male_Candidate_Error_Pct = best_male_parent_pct, - Best_Female_Candidate = best_female_parent, - Best_Female_Candidate_Error_Pct = best_female_parent_pct + id = prog_id, + orig_male_parent = male_parent_id, + orig_female_parent = female_parent_id, + trio_mendelian_error_pct = base::round(error_pct, 2), + trio_markers_tested = markers_tested, + status = status, + recommended_correction = correction_decision, + male_parent_hom_error_pct = male_parent_error_pct, + female_parent_hom_error_pct = female_parent_error_pct, + best_male_candidate = best_male_parent, + best_male_candidate_error_pct = best_male_parent_pct, + best_female_candidate = best_female_parent, + best_female_candidate_error_pct = best_female_parent_pct ) }) final_df <- data.table::rbindlist(results_list) - #### Append NO_GENOTYPE_DATA rows #### + #### Append no_genotype_data rows #### if (base::nrow(no_geno_rows) > 0) { no_geno_df <- data.table::data.table( - ID = no_geno_rows$ID, - Orig_Male_Parent = no_geno_rows$Male_Parent, - Orig_Female_Parent = no_geno_rows$Female_Parent, - Trio_Mendelian_Error_Pct = NA_real_, - Trio_Markers_Tested = 0L, - Status = "NO_GENOTYPE_DATA", - Recommended_Correction = "NONE", - Male_Parent_Hom_Error_Pct = NA_real_, - Female_Parent_Hom_Error_Pct = NA_real_, - Best_Male_Candidate = NA_character_, - Best_Male_Candidate_Error_Pct = NA_real_, - Best_Female_Candidate = NA_character_, - Best_Female_Candidate_Error_Pct = NA_real_ + id = no_geno_rows$id, + orig_male_parent = no_geno_rows$male_parent, + orig_female_parent = no_geno_rows$female_parent, + trio_mendelian_error_pct = NA_real_, + trio_markers_tested = 0L, + status = "no_genotype_data", + recommended_correction = "none", + male_parent_hom_error_pct = NA_real_, + female_parent_hom_error_pct = NA_real_, + best_male_candidate = NA_character_, + best_male_candidate_error_pct = NA_real_, + best_female_candidate = NA_character_, + best_female_candidate_error_pct = NA_real_ ) final_df <- data.table::rbindlist(list(final_df, no_geno_df)) } - #### Write corrected pedigree #### - - # pass NULL to suppress writing. - if (!is.null(corrected_pedigree_filename)) { - corrected_pedigree <- data.table::copy(original_pedigree) - for (i in base::seq_len(base::nrow(final_df))) { - prog_id <- final_df$ID[i] - decision <- final_df$Recommended_Correction[i] - row_idx <- base::which(corrected_pedigree$ID == prog_id) - if (decision == "REMOVE_MALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - } else if (decision == "REMOVE_FEMALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") - } else if (decision == "REMOVE_BOTH") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + #### Build corrected pedigree #### + corrected_pedigree <- data.table::copy(original_pedigree) + for (i in base::seq_len(base::nrow(final_df))) { + prog_id <- final_df$id[i] + decision <- final_df$recommended_correction[i] + row_idx <- base::which(corrected_pedigree$id == prog_id) + if (decision == "remove_male_parent") { + data.table::set(corrected_pedigree, row_idx, "male_parent", "0") + } else if (decision == "remove_female_parent") { + data.table::set(corrected_pedigree, row_idx, "female_parent", "0") + } else if (decision %in% c("remove_both", + "low_markers_remove_both", + "low_markers_remove_male_parent", + "low_markers_remove_female_parent")) { + if (grepl("male", decision)) { + data.table::set(corrected_pedigree, row_idx, "male_parent", "0") + } + if (grepl("female", decision)) { + data.table::set(corrected_pedigree, row_idx, "female_parent", "0") + } + if (decision == "low_markers_remove_both" || decision == "remove_both") { + data.table::set(corrected_pedigree, row_idx, "male_parent", "0") + data.table::set(corrected_pedigree, row_idx, "female_parent", "0") } } - tryCatch({ - data.table::fwrite(corrected_pedigree, file = corrected_pedigree_filename, - sep = "\t", quote = FALSE) - if (verbose) base::cat("Corrected pedigree written to:", corrected_pedigree_filename, "\n") - }, error = function(e) { - warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) - }) } #### Summary output #### if (verbose) { total_trios <- base::nrow(final_df) - status_counts <- base::table(final_df$Status) + status_counts <- base::table(final_df$status) base::cat("\n--- Trio Validation Summary ---\n") base::cat("Total trios in pedigree:", total_trios, "\n") for (s in base::names(status_counts)) - base::cat(base::sprintf("%-24s: %d (%.1f%%)\n", s, + base::cat(base::sprintf("%-28s: %d (%.1f%%)\n", s, status_counts[s], (status_counts[s] / total_trios) * 100)) base::cat("Error threshold:", trio_error_threshold, "%\n") base::cat("Homozygous threshold:", single_parent_error_threshold, "%\n") base::cat("Minimum markers required:", min_markers, "\n\n") - corrections <- base::table(final_df$Recommended_Correction) + corrections <- base::table(final_df$recommended_correction) base::cat("Correction summary:\n") for (decision in base::names(corrections)) - if (decision != "NONE") + if (decision != "none") base::cat(" ", decision, ":", corrections[decision], "\n") base::cat("\n") base::print(final_df) } - #### Write results #### - if (write_results) { - tryCatch({ - data.table::fwrite(final_df, file = output_filename, - sep = "\t", quote = FALSE, na = na_string) - if (verbose) base::cat("Results written to:", output_filename, "\n") - }, error = function(e) { - warning("Could not write results. Error: ", e$message, call. = FALSE) - }) - } - #### Plot results #### + p <- NULL if (plot_results) { if (!requireNamespace("ggplot2", quietly = TRUE)) { warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) } else { - plot_df <- final_df[!is.na(final_df$Trio_Mendelian_Error_Pct)] - plot_df$Plot_Status <- dplyr::case_when( - plot_df$Recommended_Correction %in% c("NONE", - "KEEP_BOTH", - "LOW_MARKERS_KEEP_BOTH") ~ "PASS", - plot_df$Recommended_Correction %in% c("REMOVE_MALE_PARENT", - "REMOVE_FEMALE_PARENT", - "LOW_MARKERS_REMOVE_MALE_PARENT", - "LOW_MARKERS_REMOVE_FEMALE_PARENT") ~ "FAIL_ONE_PARENT", - plot_df$Recommended_Correction %in% c("REMOVE_BOTH", - "LOW_MARKERS_REMOVE_BOTH") ~ "FAIL_BOTH_PARENTS", - TRUE ~ "OTHER" + plot_df <- final_df[!is.na(final_df$trio_mendelian_error_pct)] + plot_df$plot_status <- dplyr::case_when( + plot_df$recommended_correction %in% c("none", + "keep_both", + "low_markers_keep_both") ~ "pass", + plot_df$recommended_correction %in% c("remove_male_parent", + "remove_female_parent", + "low_markers_remove_male_parent", + "low_markers_remove_female_parent") ~ "fail_one_parent", + plot_df$recommended_correction %in% c("remove_both", + "low_markers_remove_both") ~ "fail_both_parents", + TRUE ~ "other" + ) + n_total <- nrow(plot_df) + n_fail <- sum(plot_df$trio_mendelian_error_pct > trio_error_threshold) + n_pass <- sum(plot_df$trio_mendelian_error_pct <= trio_error_threshold) + threshold_label <- paste0( + "Mendelian Error Threshold: ", trio_error_threshold, "% | ", + "Lost: ", n_fail, " trios | ", + "Kept: ", n_pass, " trios" ) - n_total <- nrow(plot_df) - n_fail <- sum(plot_df$Trio_Mendelian_Error_Pct > trio_error_threshold) - n_pass <- sum(plot_df$Trio_Mendelian_Error_Pct <= trio_error_threshold) - pct_lost <- round((n_fail / n_total) * 100, 1) - threshold_label <- paste0("Mendelian Error Threshold: ", trio_error_threshold, "% | ", - "Lost: ", n_fail, " trios | ", - "Kept: ", n_pass, " trios") p <- ggplot2::ggplot(plot_df, - ggplot2::aes(x = Trio_Mendelian_Error_Pct, fill = Plot_Status)) + + ggplot2::aes(x = trio_mendelian_error_pct, fill = plot_status)) + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + ggplot2::geom_vline(xintercept = trio_error_threshold, linetype = "dashed", color = "black", linewidth = 1) + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + ggplot2::scale_y_continuous(breaks = seq(0, 100, by = 5)) + ggplot2::scale_fill_manual( - values = c( - "PASS" = "#339900", - "FAIL_ONE_PARENT" = "#F1C40F", - "FAIL_BOTH_PARENTS" = "#cc3333", - "OTHER" = "#BDC3C7" - ), - labels = c( - "PASS" = "Pass", - "FAIL_ONE_PARENT" = "Fail - One Parent", - "FAIL_BOTH_PARENTS" = "Fail - Both Parents", - "OTHER" = "Other" - ) + values = c("pass" = "#339900", + "fail_one_parent" = "#F1C40F", + "fail_both_parents"= "#cc3333", + "other" = "#BDC3C7"), + labels = c("pass" = "Pass", + "fail_one_parent" = "Fail - One Parent", + "fail_both_parents"= "Fail - Both Parents", + "other" = "Other") ) + ggplot2::labs( title = "Trio Mendelian Error Distribution", @@ -425,20 +393,22 @@ validate_pedigree <- function(pedigree_file, genotypes_file, ggplot2::theme_classic(base_size = 13) + ggplot2::theme(legend.position = "top") print(p) - if (write_results) { - # Fix #3: use tools::file_path_sans_ext() to reliably build the .jpg - # filename regardless of whether output_filename has an extension or not. - plot_filename <- paste0(tools::file_path_sans_ext(output_filename), ".jpg") - tryCatch({ - ggplot2::ggsave(plot_filename, plot = p, - device = "jpeg", width = 10, height = 6, dpi = 300) - if (verbose) base::cat("Plot saved to:", plot_filename, "\n") - }, error = function(e) { - warning("Could not save plot. Error: ", e$message, call. = FALSE) - }) - } } } - return(base::invisible(final_df)) + #### Compile and return named list #### + output_list <- base::list( + pass = final_df[status == "pass"], + fail = final_df[status == "fail"], + low_markers = final_df[status == "low_markers"], + no_genotype_data = final_df[status == "no_genotype_data"], + founders = final_df[status == "founders"], + missing_parents = final_df[status %in% c("missing_both_parents", + "missing_male_parent", + "missing_female_parent")], + full_results = final_df, + corrected_pedigree = corrected_pedigree, + plot = p + ) + return(base::invisible(output_list)) } diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 32ea364..5f29767 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -31,7 +31,7 @@ An invisible named list of data frames: \describe{ \item{exact_duplicates}{Exact duplicate rows found in the input.} \item{conflicting_trios}{IDs with conflicting male_parent or female_parent assignments.} -\item{inconsistent_sex_roles}{IDs appearing as both male_parent and female_parent.} +\item{inconsistent_sex_roles}{Rows where a conflicting ID appears as male_parent or female_parent.} \item{missing_parents}{Parent IDs absent from id, added as founders.} \item{dependencies}{Cycles detected in the pedigree. Must be resolved manually.} \item{corrected_pedigree}{Corrected pedigree table.} @@ -51,7 +51,11 @@ ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) ped_errors$inconsistent_sex_roles ped_errors$corrected_pedigree -conflicting_sex_ids <- ped_errors$inconsistent_sex_roles$id +# Identify the conflicting IDs from the parent columns +conflicting_sex_ids <- unique(c( + ped_errors$inconsistent_sex_roles$male_parent, + ped_errors$inconsistent_sex_roles$female_parent +)) print(conflicting_sex_ids) } diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index d41555a..a414919 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -14,33 +14,31 @@ find_parentage( show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE, - na_string = "NA", plot_results = TRUE ) } \arguments{ -\item{genotypes_file}{Path to a TSV/CSV/TXT file with an 'ID' column +\item{genotypes_file}{Path to a TSV/CSV/TXT file with an 'id' column followed by marker columns coded as 0, 1, 2 (allele dosage).} -\item{parents_file}{Path to a TSV/CSV/TXT file with an 'ID' column and an -optional 'Sex' column ('M', 'F', or 'A'). If absent, all parents are +\item{parents_file}{Path to a TSV/CSV/TXT file with an 'id' column and an +optional 'sex' column ('M', 'F', or 'A'). If absent, all parents are treated as ambiguous.} -\item{progeny_file}{Path to a TSV/CSV/TXT file with an 'ID' column.} +\item{progeny_file}{Path to a TSV/CSV/TXT file with an 'id' column.} \item{method}{Character. One of \code{"best_male_parent"}, \code{"best_female_parent"}, \code{"best_match"}, or \code{"best_pair"} (default). See Details.} \item{min_markers}{Integer. Minimum markers required; fewer flags -\code{LOW_MARKERS} (default: \code{10}).} +\code{low_markers} (default: \code{10}).} \item{error_threshold}{Numeric. Maximum mismatch percentage; exceeded values -flag \code{HIGH_ERROR} (default: \code{5.0}). Must be between 0 and 100.} +flag \code{high_error} (default: \code{5.0}). Must be between 0 and 100.} \item{show_ties}{Logical. If \code{TRUE}, tied best pairs are appended as -suffix columns (e.g. \code{Male_Parent_2}) for \code{"best_pair"}. +suffix columns (e.g. \code{male_parent_2}) for \code{"best_pair"}. Default is \code{TRUE}.} \item{allow_selfing}{Logical. If \code{FALSE}, pairs with identical male and @@ -49,15 +47,8 @@ female parent IDs are excluded. Default is \code{TRUE}.} \item{verbose}{Logical. If \code{TRUE}, prints progress and summary to the console. Default is \code{TRUE}.} -\item{write_txt}{Logical. If \code{TRUE}, writes results to -\code{parentage_testing_results.txt}. Default is \code{TRUE}.} - -\item{na_string}{Character. Missing value string in output. -Use \code{"NA"} or \code{""} (default: \code{"NA"}).} - \item{plot_results}{Logical. If \code{TRUE}, plots the Mendelian error -distribution colored by status. Requires \code{ggplot2}. Saved as -\code{.jpg} when \code{write_txt = TRUE}. Default is \code{TRUE}.} +distribution colored by status. Requires \code{ggplot2}. Default is \code{TRUE}.} } \value{ A named list (returned invisibly) with the following elements: @@ -66,50 +57,13 @@ A named list (returned invisibly) with the following elements: \item{high_error}{Progeny whose best assignment exceeds the error threshold.} \item{low_markers}{Progeny with insufficient markers for a valid assignment.} \item{full_results}{Complete data.table with all progeny and all output columns.} +\item{plot}{ggplot object. Use ggsave() to save if desired.} } } \description{ Assigns the most likely parent(s) to each progeny from SNP genotype data using Mendelian error rates or homozygous mismatch rates. Parents or progeny absent from the genotype file are removed with a warning. -} -\details{ -A homozygous-only matrix is pre-computed once and shared across -\code{"best_male_parent"}, \code{"best_female_parent"}, and -\code{"best_match"}, where heterozygous markers (coded 1) are set to -\code{NA} to reduce false mismatches from phase ambiguity. - -For \code{"best_pair"}, full Mendelian inheritance rules are applied across -all marker and pair combinations using vectorised \code{vapply} calls, -producing \code{n_pairs x n_progeny} mismatch and comparison matrices. -Ties are broken by maximum markers tested; remaining ties are appended as -suffix columns when \code{show_ties = TRUE}. -} -\examples{ -\dontrun{ -results <- find_parentage( - genotypes_file = "genotypes.txt", - parents_file = "parents.txt", - progeny_file = "progeny.txt", - method = "best_pair", - min_markers = 50, - error_threshold = 5.0, - show_ties = TRUE, - allow_selfing = FALSE, - na_string = "NA", - plot_results = TRUE -) - -results <- find_parentage( - genotypes_file = "genotypes.txt", - parents_file = "parents.txt", - progeny_file = "progeny.txt", - method = "best_match", - min_markers = 30, - error_threshold = 3.0 -) -} - } \author{ Josue Chinchilla-Vargas diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 78938d4..662e91c 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -12,26 +12,22 @@ validate_pedigree( min_markers = 10, single_parent_error_threshold = 2, verbose = TRUE, - write_results = TRUE, - plot_results = TRUE, - na_string = "NA", - output_filename = "__validation_report.txt", - corrected_pedigree_filename = "corrected_pedigree.txt" + plot_results = TRUE ) } \arguments{ \item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV/TXT) -with columns: ID, Male_Parent, Female_Parent.} +with columns: id, male_parent, female_parent.} \item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV/TXT) -with an ID column followed by marker columns coded as 0, 1, 2.} +with an id column followed by marker columns coded as 0, 1, 2.} \item{founders_file}{Character, optional. Path to a one-column file listing founder IDs. Founders with both parents coded as 0 are left unchanged. Defaults to NULL.} \item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage to -classify a trio as PASS (default: 5.0). Must be between 0 and 100.} +classify a trio as pass (default: 5.0). Must be between 0 and 100.} \item{min_markers}{Integer. Minimum non-missing markers required to evaluate a trio (default: 10).} @@ -43,36 +39,21 @@ mismatch percentage for a parent to be considered acceptable \item{verbose}{Logical. If TRUE, prints progress, summary, and results to the console (default: TRUE).} -\item{write_results}{Logical. If TRUE, writes results to -\code{output_filename} and saves the plot as a .jpg (default: TRUE).} - \item{plot_results}{Logical. If TRUE, prints a histogram of trio Mendelian -error percentages with a threshold line. Saved as .jpg when -\code{write_results = TRUE} (default: TRUE).} - -\item{na_string}{Character. Missing value string in the output file. -Use \code{"NA"} or \code{""} (default: \code{"NA"}).} - -\item{output_filename}{Character. Path/name of the output results file -(default: \code{"__validation_report.txt"}). The plot uses the same base -name with a .jpg extension.} - -\item{corrected_pedigree_filename}{Character. Path/name for the corrected -pedigree output (default: \code{"corrected_pedigree.txt"}). Set to NULL -to suppress writing.} +error percentages with a threshold line (default: TRUE).} } \value{ -A named list (returned invisibly) with the following elements: +An invisible named list with the following elements: \describe{ \item{pass}{Trios that passed the Mendelian error threshold.} \item{fail}{Trios that failed the Mendelian error threshold.} \item{low_markers}{Trios with insufficient markers for evaluation.} -\item{no_data}{Trios with no overlapping genotype data.} \item{no_genotype_data}{Trios absent from the genotype file.} \item{founders}{Trios identified as founders.} \item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} \item{full_results}{Complete data.table with all trios and all output columns.} \item{corrected_pedigree}{Pedigree table after applying recommended corrections.} +\item{plot}{ggplot object. Use ggsave() to save if desired.} } } \description{ @@ -81,7 +62,7 @@ error rates. Identifies incorrect parentage assignments, suggests best-matching replacements, and outputs a corrected pedigree. Founder trios (both parents coded as 0) are preserved unchanged if a founders file is supplied. Trios absent from the genotype file are retained as -\code{NO_GENOTYPE_DATA}. +no_genotype_data. } \author{ Josue Chinchilla-Vargas diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index 64d13ca..57d462c 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -1,375 +1,709 @@ -context("check_ped – Pedigree Quality Checks") +# tests/testthat/test-check_ped.R +library(testthat) -# --------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ # Helper: write a data.frame to a temp tab-separated file and return the path -# --------------------------------------------------------------------------- +# ------------------------------------------------------------------------------ write_ped <- function(df) { f <- tempfile(fileext = ".txt") utils::write.table(df, f, sep = "\t", row.names = FALSE, quote = FALSE) f } -# =========================================================================== +context("check_ped – Pedigree Quality Checks") + +# ============================================================================== # 1. Return structure -# Tests that check_ped returns a named list of exactly 5 data frame -# components covering all five issue categories. -# =========================================================================== -test_that("check_ped returns a named list of length 5", { +# ============================================================================== +test_that("check_ped returns a named list of length 6", { ped <- data.frame( id = c("A", "B", "C"), male_parent = c("0", "A", "A"), female_parent = c("0", "0", "0") ) - out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE, correct = FALSE) + out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE) expect_type(out, "list") - expect_length(out, 5) + expect_length(out, 6) expect_named(out, c( "exact_duplicates", - "repeated_ids_diff", + "conflicting_trios", "inconsistent_sex_roles", "missing_parents", - "dependencies" + "dependencies", + "corrected_pedigree" )) }) -# =========================================================================== +# ============================================================================== # 2. Component types -# Tests that every element in the returned list is a data.frame, ensuring -# downstream code can safely call nrow(), colnames(), etc. on any component. -# =========================================================================== +# ============================================================================== test_that("check_ped report components are data.frames", { ped <- data.frame( id = c("A", "B", "C"), male_parent = c("0", "A", "A"), female_parent = c("0", "0", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + out <- check_ped(write_ped(ped), verbose = FALSE) expect_true(is.data.frame(out$exact_duplicates)) - expect_true(is.data.frame(out$repeated_ids_diff)) + expect_true(is.data.frame(out$conflicting_trios)) expect_true(is.data.frame(out$inconsistent_sex_roles)) expect_true(is.data.frame(out$missing_parents)) expect_true(is.data.frame(out$dependencies)) + expect_true(is.data.frame(out$corrected_pedigree)) +}) + +# ============================================================================== +# 3. corrected_pedigree column names are lowercase, no row_number leak +# ============================================================================== +test_that("corrected_pedigree has lowercase column names and no row_number", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) + expect_false("row_number" %in% names(out$corrected_pedigree)) }) -# =========================================================================== -# 3. Clean pedigree → empty report -# Tests that a structurally valid pedigree with no issues produces zero rows -# in every report component (no false positives). -# =========================================================================== +# ============================================================================== +# 4. Clean pedigree → empty report (no false positives) +# ============================================================================== test_that("clean pedigree produces no issues", { ped <- data.frame( id = c("G1", "G2", "P1"), male_parent = c("0", "0", "G1"), female_parent = c("0", "0", "G2") ) - out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) - - expect_equal(nrow(out$exact_duplicates), 0) - expect_equal(nrow(out$repeated_ids_diff), 0) - expect_equal(nrow(out$inconsistent_sex_roles), 0) - expect_equal(nrow(out$missing_parents), 0) - expect_equal(nrow(out$dependencies), 0) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_equal(nrow(out$exact_duplicates), 0) + expect_equal(nrow(out$conflicting_trios), 0) + expect_equal(nrow(out$inconsistent_sex_roles), 0) + expect_equal(nrow(out$missing_parents), 0) + expect_equal(nrow(out$dependencies), 0) + expect_equal(nrow(out$corrected_pedigree), 3) }) -# =========================================================================== -# 4. Exact duplicate detection and correction -# Tests that rows sharing the same (id, male_parent, female_parent) triplet -# are flagged, and that correct = TRUE retains only one copy of each. -# =========================================================================== +# ============================================================================== +# 5. Exact duplicate detection +# ============================================================================== test_that("check_ped detects exact duplicates", { ped <- data.frame( id = c("A", "A", "B"), male_parent = c("0", "0", "A"), female_parent = c("0", "0", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + out <- check_ped(write_ped(ped), verbose = FALSE) expect_equal(nrow(out$exact_duplicates), 2) expect_true(all(out$exact_duplicates$id == "A")) }) -test_that("correct = TRUE removes exact duplicates", { +test_that("exact duplicates are collapsed to one row in corrected_pedigree", { ped <- data.frame( id = c("A", "A", "B"), male_parent = c("0", "0", "A"), female_parent = c("0", "0", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - on.exit({ - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) - - check_ped(f, verbose = FALSE, correct = TRUE) - corrected <- get(corr_name, envir = .GlobalEnv) + out <- check_ped(write_ped(ped), verbose = FALSE) - # Only one "A" row should remain - expect_equal(sum(corrected$id == "A"), 1) + expect_equal(sum(out$corrected_pedigree$id == "A"), 1) }) -# =========================================================================== -# 5. Repeated IDs with conflicting parent assignments -# Tests that the same ID appearing on multiple rows with different parent -# values is flagged, and that correct = TRUE blanks only the conflicting -# parent field while preserving consistent values. -# =========================================================================== -test_that("check_ped detects repeated IDs with conflicting parents", { +# ============================================================================== +# 6. Conflicting trios +# ============================================================================== +test_that("check_ped detects conflicting trios", { ped <- data.frame( id = c("A", "A", "B"), male_parent = c("X", "Y", "A"), female_parent = c("M", "M", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + out <- check_ped(write_ped(ped), verbose = FALSE) - expect_equal(nrow(out$repeated_ids_diff), 2) - expect_true(all(out$repeated_ids_diff$id == "A")) + expect_equal(nrow(out$conflicting_trios), 2) + expect_true(all(out$conflicting_trios$id == "A")) }) -test_that("correct = TRUE resolves conflicting IDs: conflicting field -> '0', consistent field kept", { +test_that("correct_conflicting_trios = TRUE: conflicting field -> '0', consistent kept", { ped <- data.frame( id = c("A", "A", "B"), - male_parent = c("X", "Y", "A"), # conflicting -> should become "0" - female_parent = c("M", "M", "0") # consistent -> should stay "M" - ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - on.exit({ - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) - - check_ped(f, verbose = FALSE, correct = TRUE) - corrected <- get(corr_name, envir = .GlobalEnv) - a_row <- corrected[corrected$id == "A", ] + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_conflicting_trios = TRUE) + a_row <- out$corrected_pedigree[out$corrected_pedigree$id == "A", ] expect_equal(nrow(a_row), 1) expect_equal(a_row$male_parent, "0") expect_equal(a_row$female_parent, "M") }) -# =========================================================================== -# 6. Inconsistent parent sex roles -# Tests that an individual ID appearing in both male_parent and female_parent -# columns is flagged. Also confirms that correct = TRUE does NOT auto-resolve -# this issue since the correct fix requires manual review. -# =========================================================================== -test_that("check_ped detects inconsistent parent sex roles", { +test_that("correct_conflicting_trios = FALSE leaves conflicting rows as-is", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_conflicting_trios = FALSE) + + expect_equal(sum(out$corrected_pedigree$id == "A"), 2) +}) + +# ============================================================================== +# 7. Missing parent detection and founder row injection +# ============================================================================== +test_that("check_ped detects missing parents", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_equal(nrow(out$missing_parents), 2) + expect_true("X" %in% out$missing_parents$id) + expect_true("Y" %in% out$missing_parents$id) +}) + +test_that("missing parents are added as founder rows in corrected_pedigree", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_true("X" %in% out$corrected_pedigree$id) + expect_true("Y" %in% out$corrected_pedigree$id) + + x_row <- out$corrected_pedigree[out$corrected_pedigree$id == "X", ] + y_row <- out$corrected_pedigree[out$corrected_pedigree$id == "Y", ] + + expect_equal(x_row$male_parent, "0") + expect_equal(x_row$female_parent, "0") + expect_equal(y_row$male_parent, "0") + expect_equal(y_row$female_parent, "0") +}) + +test_that("individual that is its own parent is logged as a dependency", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("A", "0"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_gt(nrow(out$dependencies), 0) +}) + +# ============================================================================== +# 8. Inconsistent sex roles +# inconsistent_sex_roles returns the rows where the conflicting ID appears +# AS A PARENT (not the row where it is the subject) [1] +# ============================================================================== +test_that("check_ped detects inconsistent sex roles", { ped <- data.frame( id = c("child1", "child2", "P"), male_parent = c("P", "0", "0"), female_parent = c("0", "P", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + out <- check_ped(write_ped(ped), verbose = FALSE) expect_true("inconsistent_sex_roles" %in% names(out)) expect_gt(nrow(out$inconsistent_sex_roles), 0) - - # Both rows that reference P should be flagged expect_true(any(out$inconsistent_sex_roles$male_parent == "P" | out$inconsistent_sex_roles$female_parent == "P")) }) -test_that("correct = TRUE does NOT auto-correct inconsistent sex roles", { +test_that("correct_inconsistent_sex_roles = TRUE zeros out conflicting parent references", { ped <- data.frame( id = c("child1", "child2", "P"), male_parent = c("P", "0", "0"), female_parent = c("0", "P", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - on.exit({ - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_inconsistent_sex_roles = TRUE) + corr <- out$corrected_pedigree + + expect_false(any(corr$male_parent == "P")) + expect_false(any(corr$female_parent == "P")) +}) - check_ped(f, verbose = FALSE, correct = TRUE) - corrected <- get(corr_name, envir = .GlobalEnv) +test_that("correct_inconsistent_sex_roles = FALSE leaves conflicting references", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_inconsistent_sex_roles = FALSE) + corr <- out$corrected_pedigree - # P should still appear with its original (inconsistent) parent assignments - expect_true(any(corrected$male_parent == "P" | - corrected$female_parent == "P")) + expect_true(any(corr$male_parent == "P" | corr$female_parent == "P")) }) -# =========================================================================== -# 7. Missing parent detection and founder row injection -# Tests that parent IDs referenced in male_parent/female_parent but absent -# from the id column are flagged. Confirms that correct = TRUE appends a -# founder row (both parents = "0") for each missing parent. -# =========================================================================== -test_that("check_ped detects missing parents", { +# ============================================================================== +# 9. Cycle / dependency detection +# ============================================================================== +test_that("check_ped detects a direct two-node cycle", { ped <- data.frame( id = c("A", "B"), - male_parent = c("0", "X"), - female_parent = c("0", "Y") + male_parent = c("B", "A"), + female_parent = c("0", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + out <- check_ped(write_ped(ped), verbose = FALSE) - expect_equal(nrow(out$missing_parents), 1) - expect_true(all(out$missing_parents$id == "B")) + expect_gt(nrow(out$dependencies), 0) }) -test_that("correct = TRUE adds missing parents as founder rows", { +test_that("cycle-involved IDs are still present in corrected_pedigree", { ped <- data.frame( id = c("A", "B"), - male_parent = c("0", "X"), - female_parent = c("0", "Y") + male_parent = c("B", "A"), + female_parent = c("0", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - on.exit({ - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) - - check_ped(f, verbose = FALSE, correct = TRUE) - corrected <- get(corr_name, envir = .GlobalEnv) - - # X and Y should now appear as founder rows - expect_true("X" %in% corrected$id) - expect_true("Y" %in% corrected$id) - x_row <- corrected[corrected$id == "X", ] - y_row <- corrected[corrected$id == "Y", ] - expect_equal(x_row$male_parent, "0") - expect_equal(x_row$female_parent, "0") - expect_equal(y_row$male_parent, "0") - expect_equal(y_row$female_parent, "0") + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_true("A" %in% out$corrected_pedigree$id) + expect_true("B" %in% out$corrected_pedigree$id) }) -# =========================================================================== -# 8. Pedigree cycle / dependency detection -# Tests that circular ancestry chains (e.g. A is parent of B and B is parent -# of A) are detected and reported. Also confirms that correct = TRUE does NOT -# silently remove cycle-involved individuals, as cycles require manual review. -# =========================================================================== -test_that("check_ped detects a direct two-node cycle", { +# ============================================================================== +# 10. Missing required columns → hard stop +# tolower(names()) in check_ped.R normalizes caps before validation [1] +# so only truly unrecognizable columns trigger this error +# ============================================================================== +test_that("check_ped errors when required columns are missing", { + bad_df <- data.frame( + animal_id = c("a", "b"), + parent1 = c("0", "a"), + parent2 = c("0", "0") + ) + expect_error( + check_ped(write_ped(bad_df), verbose = FALSE), + "Input file is missing required column" + ) +}) + +# ============================================================================== +# 11. Capitalization tolerance — mixed caps should be accepted [1] +# ============================================================================== +test_that("check_ped accepts mixed-case column names (ID, Male_Parent, Female_Parent)", { + ped <- data.frame( + ID = c("A", "B", "C"), + Male_Parent = c("0", "A", "A"), + Female_Parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_length(out, 6) + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) +}) + +test_that("check_ped accepts all-uppercase column names", { + ped <- data.frame( + ID = c("A", "B"), + MALE_PARENT = c("0", "A"), + FEMALE_PARENT = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_length(out, 6) +}) + +# ============================================================================== +# 12. Column order tolerance — columns in any order should be accepted [1] +# ============================================================================== +test_that("check_ped accepts columns in any order", { + ped <- data.frame( + female_parent = c("0", "0"), + male_parent = c("0", "A"), + id = c("A", "B") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_length(out, 6) + expect_equal(nrow(out$corrected_pedigree), 2) +}) + +# ============================================================================== +# 13. Seed reproducibility +# ============================================================================== +test_that("seed produces reproducible results", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + f <- write_ped(ped) + out1 <- check_ped(f, seed = 42, verbose = FALSE) + out2 <- check_ped(f, seed = 42, verbose = FALSE) + + expect_identical(out1$corrected_pedigree, out2$corrected_pedigree) +}) + +# ============================================================================== +# 14. verbose = FALSE suppresses console output +# ============================================================================== +test_that("verbose = FALSE suppresses console output", { ped <- data.frame( id = c("A", "B"), - male_parent = c("B", "A"), + male_parent = c("0", "A"), female_parent = c("0", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE, correct = FALSE) + expect_silent(check_ped(write_ped(ped), verbose = FALSE)) +}) - expect_gt(nrow(out$dependencies), 0) - expect_true(all(c("A", "B") %in% out$dependencies$id)) +# ============================================================================== +# 15. Return value is invisible +# ============================================================================== +test_that("check_ped returns invisibly", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + expect_invisible(check_ped(write_ped(ped), verbose = FALSE)) }) -test_that("check_ped does NOT auto-correct cycles", { +# ============================================================================== +# 16. No write logic — check_ped does not write files to disk +# ============================================================================== +test_that("no output files are written to disk", { + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + ped <- data.frame( id = c("A", "B"), - male_parent = c("B", "A"), + male_parent = c("0", "A"), female_parent = c("0", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - on.exit({ - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) + check_ped(write_ped(ped), verbose = FALSE) + + expect_length(list.files(tmp_dir), 0) +}) + +# ============================================================================== +# 17. Integration test with bundled fixture file +# Fixture has mixed-case headers (ID, Male_parent, Female_Parent) [1] +# tolower() in check_ped.R normalizes these automatically +# ============================================================================== +test_that("integration test with bundled fixture file", { + ped_file <- system.file("check_ped_test.txt", package = "BIGr") + skip_if(ped_file == "", "Bundled fixture file not found; skipping.") + + out <- check_ped(ped_file, seed = 101919, verbose = FALSE) + + expect_length(out, 6) + expect_gt(nrow(out$inconsistent_sex_roles), 0) + + # inconsistent_sex_roles stores rows where conflicting ID appears AS A PARENT [1] + conflicting_ids <- unique(c( + out$inconsistent_sex_roles$male_parent, + out$inconsistent_sex_roles$female_parent + )) + expect_true(any(c("grandfather2", "grandfather3") %in% conflicting_ids)) + expect_equal(nrow(out$missing_parents), 8) +})# tests/testthat/test-check_ped.R +library(testthat) + +# ------------------------------------------------------------------------------ +# Helper: write a data.frame to a temp tab-separated file and return the path +# ------------------------------------------------------------------------------ +write_ped <- function(df) { + f <- tempfile(fileext = ".txt") + utils::write.table(df, f, sep = "\t", row.names = FALSE, quote = FALSE) + f +} - check_ped(f, verbose = FALSE, correct = TRUE) - corrected <- get(corr_name, envir = .GlobalEnv) +context("check_ped – Pedigree Quality Checks") + +# ============================================================================== +# 1. Return structure +# ============================================================================== +test_that("check_ped returns a named list of length 6", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE) + + expect_type(out, "list") + expect_length(out, 6) + expect_named(out, c( + "exact_duplicates", + "conflicting_trios", + "inconsistent_sex_roles", + "missing_parents", + "dependencies", + "corrected_pedigree" + )) +}) + +# ============================================================================== +# 2. Component types +# ============================================================================== +test_that("check_ped report components are data.frames", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) - # Cycle-involved IDs should still be present (no silent removal) - expect_true("A" %in% corrected$id) - expect_true("B" %in% corrected$id) + expect_true(is.data.frame(out$exact_duplicates)) + expect_true(is.data.frame(out$conflicting_trios)) + expect_true(is.data.frame(out$inconsistent_sex_roles)) + expect_true(is.data.frame(out$missing_parents)) + expect_true(is.data.frame(out$dependencies)) + expect_true(is.data.frame(out$corrected_pedigree)) }) -# =========================================================================== -# 9. Global environment assignments -# Tests that the report list is always written to the global environment, -# that the corrected pedigree is written only when correct = TRUE, and that -# the internal row_number column does not leak into the corrected output. -# =========================================================================== -test_that("report is always assigned to global environment", { +# ============================================================================== +# 3. corrected_pedigree column names are lowercase, no row_number leak +# ============================================================================== +test_that("corrected_pedigree has lowercase column names and no row_number", { ped <- data.frame( id = c("A", "B"), male_parent = c("0", "A"), female_parent = c("0", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - rep_name <- paste0(file_base, "_report") - on.exit(if (exists(rep_name, envir = .GlobalEnv)) - rm(list = rep_name, envir = .GlobalEnv)) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) + expect_false("row_number" %in% names(out$corrected_pedigree)) +}) + +# ============================================================================== +# 4. Clean pedigree → empty report (no false positives) +# ============================================================================== +test_that("clean pedigree produces no issues", { + ped <- data.frame( + id = c("G1", "G2", "P1"), + male_parent = c("0", "0", "G1"), + female_parent = c("0", "0", "G2") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_equal(nrow(out$exact_duplicates), 0) + expect_equal(nrow(out$conflicting_trios), 0) + expect_equal(nrow(out$inconsistent_sex_roles), 0) + expect_equal(nrow(out$missing_parents), 0) + expect_equal(nrow(out$dependencies), 0) + expect_equal(nrow(out$corrected_pedigree), 3) +}) + +# ============================================================================== +# 5. Exact duplicate detection +# ============================================================================== +test_that("check_ped detects exact duplicates", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_equal(nrow(out$exact_duplicates), 2) + expect_true(all(out$exact_duplicates$id == "A")) +}) + +test_that("exact duplicates are collapsed to one row in corrected_pedigree", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_equal(sum(out$corrected_pedigree$id == "A"), 1) +}) + +# ============================================================================== +# 6. Conflicting trios +# ============================================================================== +test_that("check_ped detects conflicting trios", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) - check_ped(f, verbose = FALSE, correct = FALSE) + expect_equal(nrow(out$conflicting_trios), 2) + expect_true(all(out$conflicting_trios$id == "A")) +}) - expect_true(exists(rep_name, envir = .GlobalEnv)) - expect_type(get(rep_name, envir = .GlobalEnv), "list") +test_that("correct_conflicting_trios = TRUE: conflicting field -> '0', consistent kept", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_conflicting_trios = TRUE) + a_row <- out$corrected_pedigree[out$corrected_pedigree$id == "A", ] + + expect_equal(nrow(a_row), 1) + expect_equal(a_row$male_parent, "0") + expect_equal(a_row$female_parent, "M") }) -test_that("corrected pedigree is assigned to global env when correct = TRUE", { +test_that("correct_conflicting_trios = FALSE leaves conflicting rows as-is", { ped <- data.frame( id = c("A", "A", "B"), male_parent = c("X", "Y", "A"), female_parent = c("M", "M", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - on.exit({ - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_conflicting_trios = FALSE) - check_ped(f, verbose = FALSE, correct = TRUE) + expect_equal(sum(out$corrected_pedigree$id == "A"), 2) +}) - expect_true(exists(corr_name, envir = .GlobalEnv)) - corrected <- get(corr_name, envir = .GlobalEnv) - expect_true(is.data.frame(corrected)) +# ============================================================================== +# 7. Missing parent detection and founder row injection +# ============================================================================== +test_that("check_ped detects missing parents", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) - # row_number must not leak into corrected output - expect_false("row_number" %in% names(corrected)) - expect_true(all(c("id", "male_parent", "female_parent") %in% names(corrected))) + expect_equal(nrow(out$missing_parents), 2) + expect_true("X" %in% out$missing_parents$id) + expect_true("Y" %in% out$missing_parents$id) }) -test_that("corrected pedigree is NOT assigned to global env when correct = FALSE", { +test_that("missing parents are added as founder rows in corrected_pedigree", { ped <- data.frame( id = c("A", "B"), - male_parent = c("0", "A"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_true("X" %in% out$corrected_pedigree$id) + expect_true("Y" %in% out$corrected_pedigree$id) + + x_row <- out$corrected_pedigree[out$corrected_pedigree$id == "X", ] + y_row <- out$corrected_pedigree[out$corrected_pedigree$id == "Y", ] + + expect_equal(x_row$male_parent, "0") + expect_equal(x_row$female_parent, "0") + expect_equal(y_row$male_parent, "0") + expect_equal(y_row$female_parent, "0") +}) + +test_that("individual that is its own parent is logged as a dependency", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("A", "0"), female_parent = c("0", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - on.exit({ - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_gt(nrow(out$dependencies), 0) +}) - check_ped(f, verbose = FALSE, correct = FALSE) +# ============================================================================== +# 8. Inconsistent sex roles +# inconsistent_sex_roles returns the rows where the conflicting ID appears +# AS A PARENT (not the row where it is the subject) [1] +# ============================================================================== +test_that("check_ped detects inconsistent sex roles", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) - expect_false(exists(corr_name, envir = .GlobalEnv)) - expect_true(exists(rep_name, envir = .GlobalEnv)) + expect_true("inconsistent_sex_roles" %in% names(out)) + expect_gt(nrow(out$inconsistent_sex_roles), 0) + expect_true(any(out$inconsistent_sex_roles$male_parent == "P" | + out$inconsistent_sex_roles$female_parent == "P")) +}) + +test_that("correct_inconsistent_sex_roles = TRUE zeros out conflicting parent references", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_inconsistent_sex_roles = TRUE) + corr <- out$corrected_pedigree + + expect_false(any(corr$male_parent == "P")) + expect_false(any(corr$female_parent == "P")) +}) + +test_that("correct_inconsistent_sex_roles = FALSE leaves conflicting references", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_inconsistent_sex_roles = FALSE) + corr <- out$corrected_pedigree + + expect_true(any(corr$male_parent == "P" | corr$female_parent == "P")) +}) + +# ============================================================================== +# 9. Cycle / dependency detection +# ============================================================================== +test_that("check_ped detects a direct two-node cycle", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_gt(nrow(out$dependencies), 0) }) -# =========================================================================== +test_that("cycle-involved IDs are still present in corrected_pedigree", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_true("A" %in% out$corrected_pedigree$id) + expect_true("B" %in% out$corrected_pedigree$id) +}) + +# ============================================================================== # 10. Missing required columns → hard stop -# Tests that a file lacking the canonical column names (after alias remapping) -# causes an immediate error rather than silently producing empty results. -# Column names like animal_id/parent1/parent2 are not in the alias list and -# therefore cannot be remapped, triggering the validation stop. -# =========================================================================== +# tolower(names()) in check_ped.R normalizes caps before validation [1] +# so only truly unrecognizable columns trigger this error +# ============================================================================== test_that("check_ped errors when required columns are missing", { bad_df <- data.frame( - animal_id = c("a", "b"), - parent1 = c("0", "a"), - parent2 = c("0", "0") + animal_id = c("a", "b"), + parent1 = c("0", "a"), + parent2 = c("0", "0") ) expect_error( check_ped(write_ped(bad_df), verbose = FALSE), @@ -377,79 +711,127 @@ test_that("check_ped errors when required columns are missing", { ) }) -# =========================================================================== -# 11. Integration test using the bundled check_ped_test.txt fixture -# End-to-end test against a real fixture file shipped with the package. -# Validates the overall row counts and that known problematic individuals -# (grandfather2, grandfather3) are flagged for inconsistent sex roles. -# =========================================================================== -test_that("integration test with bundled fixture file", { - ped_file <- system.file("check_ped_test.txt", package = "BIGr") - skip_if(ped_file == "", "Bundled fixture file not found; skipping integration test.") +# ============================================================================== +# 11. Capitalization tolerance — mixed caps should be accepted [1] +# ============================================================================== +test_that("check_ped accepts mixed-case column names (ID, Male_Parent, Female_Parent)", { + ped <- data.frame( + ID = c("A", "B", "C"), + Male_Parent = c("0", "A", "A"), + Female_Parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) - out <- check_ped(ped_file, seed = 101919, verbose = FALSE) + expect_length(out, 6) + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) +}) - expect_length(out, 5) +test_that("check_ped accepts all-uppercase column names", { + ped <- data.frame( + ID = c("A", "B"), + MALE_PARENT = c("0", "A"), + FEMALE_PARENT = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) - # Inconsistent sex roles should flag grandfather2 & grandfather3 - expect_true(all( - c("grandfather2", "grandfather3") %in% out$inconsistent_sex_roles$male_parent | - c("grandfather2", "grandfather3") %in% out$inconsistent_sex_roles$female_parent - )) - expect_equal(nrow(out$missing_parents), 8) + expect_length(out, 6) }) -# =========================================================================== -# 12. Zip export (save_zip = TRUE / save_corrected_zip = TRUE) -# Tests that save_zip = TRUE produces a .zip archive in the working directory, -# and that save_corrected_zip = TRUE includes the corrected pedigree file -# inside that archive when correct = TRUE. -# =========================================================================== -test_that("save_zip = TRUE creates a zip archive in the working directory", { +# ============================================================================== +# 12. Column order tolerance — columns in any order should be accepted [1] +# ============================================================================== +test_that("check_ped accepts columns in any order", { + ped <- data.frame( + female_parent = c("0", "0"), + male_parent = c("0", "A"), + id = c("A", "B") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + + expect_length(out, 6) + expect_equal(nrow(out$corrected_pedigree), 2) +}) + +# ============================================================================== +# 13. Seed reproducibility +# ============================================================================== +test_that("seed produces reproducible results", { ped <- data.frame( id = c("A", "B", "C"), male_parent = c("0", "A", "A"), female_parent = c("0", "0", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - zip_name <- paste0(file_base, "_report.zip") - rep_name <- paste0(file_base, "_report") - zip_path <- file.path(getwd(), zip_name) + f <- write_ped(ped) + out1 <- check_ped(f, seed = 42, verbose = FALSE) + out2 <- check_ped(f, seed = 42, verbose = FALSE) - on.exit({ - if (file.exists(zip_path)) file.remove(zip_path) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) + expect_identical(out1$corrected_pedigree, out2$corrected_pedigree) +}) - check_ped(f, verbose = FALSE, correct = FALSE, save_zip = TRUE) +# ============================================================================== +# 14. verbose = FALSE suppresses console output +# ============================================================================== +test_that("verbose = FALSE suppresses console output", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + expect_silent(check_ped(write_ped(ped), verbose = FALSE)) +}) - expect_true(file.exists(zip_path)) +# ============================================================================== +# 15. Return value is invisible +# ============================================================================== +test_that("check_ped returns invisibly", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + expect_invisible(check_ped(write_ped(ped), verbose = FALSE)) }) -test_that("save_zip + save_corrected_zip + correct = TRUE includes corrected file in zip", { +# ============================================================================== +# 16. No write logic — check_ped does not write files to disk +# ============================================================================== +test_that("no output files are written to disk", { + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + ped <- data.frame( id = c("A", "B"), - male_parent = c("0", "X"), - female_parent = c("0", "Y") + male_parent = c("0", "A"), + female_parent = c("0", "0") ) - f <- write_ped(ped) - file_base <- tools::file_path_sans_ext(basename(f)) - zip_name <- paste0(file_base, "_report.zip") - corr_name <- paste0(file_base, "_corrected") - rep_name <- paste0(file_base, "_report") - zip_path <- file.path(getwd(), zip_name) - - on.exit({ - if (file.exists(zip_path)) file.remove(zip_path) - if (exists(corr_name, envir = .GlobalEnv)) rm(list = corr_name, envir = .GlobalEnv) - if (exists(rep_name, envir = .GlobalEnv)) rm(list = rep_name, envir = .GlobalEnv) - }) - - check_ped(f, verbose = FALSE, correct = TRUE, - save_zip = TRUE, save_corrected_zip = TRUE) - - expect_true(file.exists(zip_path)) - zip_contents <- utils::unzip(zip_path, list = TRUE)$Name - expect_true(any(grepl("_corrected", zip_contents))) + check_ped(write_ped(ped), verbose = FALSE) + + expect_length(list.files(tmp_dir), 0) +}) + +# ============================================================================== +# 17. Integration test with bundled fixture file +# Fixture has mixed-case headers (ID, Male_parent, Female_Parent) [1] +# tolower() in check_ped.R normalizes these automatically +# ============================================================================== +test_that("integration test with bundled fixture file", { + ped_file <- system.file("check_ped_test.txt", package = "BIGr") + skip_if(ped_file == "", "Bundled fixture file not found; skipping.") + + out <- check_ped(ped_file, seed = 101919, verbose = FALSE) + + expect_length(out, 6) + expect_gt(nrow(out$inconsistent_sex_roles), 0) + + # inconsistent_sex_roles stores rows where conflicting ID appears AS A PARENT [1] + conflicting_ids <- unique(c( + out$inconsistent_sex_roles$male_parent, + out$inconsistent_sex_roles$female_parent + )) + expect_true(any(c("grandfather2", "grandfather3") %in% conflicting_ids)) + expect_equal(nrow(out$missing_parents), 8) }) diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R index d1dfb3c..bf62952 100644 --- a/tests/testthat/test-find_parentage.R +++ b/tests/testthat/test-find_parentage.R @@ -6,56 +6,51 @@ library(data.table) # ============================================================================== # Helpers # ============================================================================== - make_files <- function(genos, parents, progeny, dir = tempdir()) { - geno_file <- file.path(dir, paste0("genos_", sample(1e6,1), ".txt")) - parent_file <- file.path(dir, paste0("parents_", sample(1e6,1), ".txt")) - progeny_file <- file.path(dir, paste0("progeny_", sample(1e6,1), ".txt")) + geno_file <- file.path(dir, paste0("genos_", sample(1e6, 1), ".txt")) + parent_file <- file.path(dir, paste0("parents_", sample(1e6, 1), ".txt")) + progeny_file <- file.path(dir, paste0("progeny_", sample(1e6, 1), ".txt")) data.table::fwrite(genos, geno_file, sep = "\t") data.table::fwrite(parents, parent_file, sep = "\t") data.table::fwrite(progeny, progeny_file, sep = "\t") list(g = geno_file, p = parent_file, pr = progeny_file) } -# ============================================================================== -# Base toy data +# Base toy data — lowercase id column # S1 / D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 # S2 / D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 -# ============================================================================== - base_genos <- data.table::data.table( - ID = c("S1","S2","D1","D2","child1","child2"), + id = c("S1", "S2", "D1", "D2", "child1", "child2"), M1 = c(0L, 2L, 0L, 2L, 0L, 2L), M2 = c(0L, 2L, 0L, 2L, 0L, 2L), M3 = c(0L, 2L, 0L, 2L, 0L, 2L), M4 = c(0L, 2L, 0L, 2L, 0L, 2L), M5 = c(0L, 2L, 0L, 2L, 0L, 2L) ) -base_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M","M","F","F")) -child1_progeny <- data.table::data.table(ID = "child1") -child2_progeny <- data.table::data.table(ID = "child2") -base_progeny <- data.table::data.table(ID = c("child1","child2")) +base_parents <- data.table::data.table(id = c("S1", "S2", "D1", "D2"), + sex = c("M", "M", "F", "F")) +child1_progeny <- data.table::data.table(id = "child1") +child2_progeny <- data.table::data.table(id = "child2") +base_progeny <- data.table::data.table(id = c("child1", "child2")) # All-zero genotypes — every pair ties at 0% error tied_genos <- data.table::data.table( - ID = c("S1","S2","D1","D2","child_tie"), + id = c("S1", "S2", "D1", "D2", "child_tie"), M1 = c(0L, 0L, 0L, 0L, 0L), M2 = c(0L, 0L, 0L, 0L, 0L) ) -tied_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M","M","F","F")) -tied_progeny <- data.table::data.table(ID = "child_tie") +tied_parents <- data.table::data.table(id = c("S1", "S2", "D1", "D2"), + sex = c("M", "M", "F", "F")) +tied_progeny <- data.table::data.table(id = "child_tie") # ============================================================================== # 1. Input validation # ============================================================================== - test_that("invalid method throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, method = "bad_method", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "Method must be one of" ) }) @@ -64,7 +59,7 @@ test_that("min_markers < 1 throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, min_markers = 0, - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "min_markers" ) }) @@ -73,426 +68,445 @@ test_that("error_threshold out of range throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = 150, - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "error_threshold" ) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = -1, - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "error_threshold" ) }) -test_that("invalid na_string throws an error", { - f <- make_files(base_genos, base_parents, child1_progeny) - expect_error( - find_parentage(f$g, f$p, f$pr, na_string = "NULL", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), - regexp = "na_string" - ) -}) - test_that("missing genotype file throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage("nonexistent.txt", f$p, f$pr, - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "Error reading input files" ) }) test_that("parent IDs absent from genotype file raise a warning and are dropped", { extra_parents <- rbind(base_parents, - data.table::data.table(ID = "GHOST", Sex = "M")) + data.table::data.table(id = "GHOST", sex = "M")) f <- make_files(base_genos, extra_parents, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "GHOST" ) }) test_that("progeny IDs absent from genotype file raise a warning and are dropped", { extra_progeny <- rbind(child1_progeny, - data.table::data.table(ID = "GHOST_KID")) + data.table::data.table(id = "GHOST_KID")) f <- make_files(base_genos, base_parents, extra_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "GHOST_KID" ) }) test_that("no valid progeny candidates after filtering stops with an error", { - ghost_progeny <- data.table::data.table(ID = "NOBODY") + ghost_progeny <- data.table::data.table(id = "NOBODY") f <- make_files(base_genos, base_parents, ghost_progeny) expect_warning( expect_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "No valid progeny" ) ) }) -test_that("missing Sex column raises a warning and defaults to ambiguous", { - parents_no_sex <- data.table::data.table(ID = c("S1","D1")) +test_that("missing sex column raises a warning and defaults to ambiguous", { + parents_no_sex <- data.table::data.table(id = c("S1", "D1")) f <- make_files(base_genos, parents_no_sex, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE), - regexp = "Sex" + verbose = FALSE, plot_results = FALSE), + regexp = "sex" ) }) # ============================================================================== -# 2. Return structure +# 2. Return structure — named list # ============================================================================== +test_that("find_parentage returns an invisible named list with all required elements", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_type(out, "list") + expect_named(out, c("pass", "high_error", "low_markers", "full_results", "plot"), + ignore.order = TRUE) +}) -test_that("best_pair returns a data.table with expected columns (no ties)", { +test_that("full_results is a data.table", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Male_Parent","Female_Parent", - "Mendelian_Error_Pct","Markers_Tested", - "Assignment_Status") %in% names(res))) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_s3_class(out$full_results, "data.table") }) -test_that("best_male_parent returns expected columns", { +test_that("full_results has one row per progeny", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", - "Markers_Tested","Assignment_Status") %in% names(res))) - expect_false("Male_Parent" %in% names(res)) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(nrow(out$full_results), 1L) +}) + +test_that("named list subsets together cover all full_results rows", { + f <- make_files(base_genos, base_parents, base_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + subset_total <- nrow(out$pass) + nrow(out$high_error) + nrow(out$low_markers) + expect_equal(subset_total, nrow(out$full_results)) }) -test_that("best_female_parent returns expected columns", { +test_that("plot element is NULL when plot_results = FALSE", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", - "Markers_Tested","Assignment_Status") %in% names(res))) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = FALSE) + + expect_null(out$plot) }) -test_that("best_match returns expected columns", { +test_that("best_pair full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", - "Markers_Tested","Assignment_Status") %in% names(res))) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_true(all(c("id", "male_parent", "female_parent", + "mendelian_error_pct", "markers_tested", + "status") %in% names(out$full_results))) }) -test_that("one row returned per progeny for single-parent methods", { - f <- make_files(base_genos, base_parents, child1_progeny) - for (m in c("best_male_parent","best_female_parent","best_match")) { - res <- find_parentage(f$g, f$p, f$pr, method = m, - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_equal(nrow(res), 1L, label = paste("row count for method", m)) - } +test_that("best_male_parent full_results has expected lowercase columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, plot_results = FALSE) + + expect_true(all(c("id", "best_match", "mendelian_error_pct", + "markers_tested", "status") %in% names(out$full_results))) + expect_false("male_parent" %in% names(out$full_results)) }) -test_that("Markers_Tested equals the number of non-NA marker columns", { +test_that("best_female_parent full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, plot_results = FALSE) + + expect_true(all(c("id", "best_match", "mendelian_error_pct", + "markers_tested", "status") %in% names(out$full_results))) }) -test_that("Mendelian_Error_Pct is between 0 and 100", { +test_that("best_match full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - pct <- as.numeric(res$Mendelian_Error_Pct) - expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) + out <- find_parentage(f$g, f$p, f$pr, method = "best_match", + verbose = FALSE, plot_results = FALSE) + + expect_true(all(c("id", "best_match", "mendelian_error_pct", + "markers_tested", "status") %in% names(out$full_results))) }) # ============================================================================== # 3. Biological correctness # ============================================================================== - test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Male_Parent, "S1") - expect_equal(res$Female_Parent, "D1") - expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$male_parent, "S1") + expect_equal(out$full_results$female_parent, "D1") + expect_equal(as.numeric(out$full_results$mendelian_error_pct), 0) }) test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { f <- make_files(base_genos, base_parents, child2_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Male_Parent, "S2") - expect_equal(res$Female_Parent, "D2") - expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$male_parent, "S2") + expect_equal(out$full_results$female_parent, "D2") + expect_equal(as.numeric(out$full_results$mendelian_error_pct), 0) }) test_that("best_male_parent identifies S1 as best male for child1", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Best_Match, "S1") + out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$best_match, "S1") }) test_that("best_female_parent identifies D1 as best female for child1", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Best_Match, "D1") + out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$best_match, "D1") +}) + +test_that("both child1 and child2 correctly assigned when run together", { + f <- make_files(base_genos, base_parents, base_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(nrow(out$full_results), 2L) + expect_equal(out$full_results[id == "child1"]$male_parent, "S1") + expect_equal(out$full_results[id == "child1"]$female_parent, "D1") + expect_equal(out$full_results[id == "child2"]$male_parent, "S2") + expect_equal(out$full_results[id == "child2"]$female_parent, "D2") }) -test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { +test_that("markers_tested equals number of non-NA marker columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$markers_tested, ncol(base_genos) - 1L) }) -test_that("both child1 and child2 are correctly assigned when run together", { - f <- make_files(base_genos, base_parents, base_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(nrow(res), 2L) - child1_row <- res[Progeny == "child1"] - child2_row <- res[Progeny == "child2"] - expect_equal(child1_row$Male_Parent, "S1") - expect_equal(child1_row$Female_Parent, "D1") - expect_equal(child2_row$Male_Parent, "S2") - expect_equal(child2_row$Female_Parent, "D2") +test_that("mendelian_error_pct is between 0 and 100", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + pct <- as.numeric(out$full_results$mendelian_error_pct) + expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) }) # ============================================================================== -# 4. Assignment_Status — min_markers and error_threshold +# 4. status — lowercase values # ============================================================================== - -test_that("Assignment_Status = PASS for perfect trio within thresholds", { +test_that("status = pass for perfect trio within thresholds", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Assignment_Status, "PASS") + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$status, "pass") }) -test_that("Assignment_Status = LOW_MARKERS when min_markers exceeds available markers", { +test_that("status = low_markers when min_markers exceeds available markers", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 99999, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Assignment_Status, "LOW_MARKERS") + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$status, "low_markers") }) -test_that("Assignment_Status = HIGH_ERROR when error rate exceeds threshold", { - # Use wrong parents so error rate is high +test_that("status = high_error when error rate exceeds threshold", { high_error_genos <- data.table::data.table( - ID = c("S1","D1","bad_child"), + id = c("S1", "D1", "bad_child"), M1 = c(0L, 0L, 2L), M2 = c(0L, 0L, 2L), M3 = c(0L, 0L, 2L), M4 = c(0L, 0L, 2L), M5 = c(0L, 0L, 2L) ) - parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("M","F")) - progeny <- data.table::data.table(ID = "bad_child") - f <- make_files(high_error_genos, parents, progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Assignment_Status, "HIGH_ERROR") + parents <- data.table::data.table(id = c("S1", "D1"), sex = c("M", "F")) + progeny <- data.table::data.table(id = "bad_child") + f <- make_files(high_error_genos, parents, progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results$status, "high_error") }) -test_that("Assignment_Status column is present in all methods", { +test_that("status column is present and lowercase in all methods", { f <- make_files(base_genos, base_parents, child1_progeny) - for (m in c("best_pair","best_male_parent","best_female_parent","best_match")) { - res <- find_parentage(f$g, f$p, f$pr, method = m, - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_true("Assignment_Status" %in% names(res), - label = paste("Assignment_Status present for method", m)) + for (m in c("best_pair", "best_male_parent", "best_female_parent", "best_match")) { + out <- find_parentage(f$g, f$p, f$pr, method = m, + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_true("status" %in% names(out$full_results), + label = paste("status column present for method", m)) + expect_true(all(out$full_results$status %in% + c("pass", "high_error", "low_markers", NA)), + label = paste("lowercase status values for method", m)) } }) -test_that("LOW_MARKERS is flagged for single-parent methods too", { +test_that("low_markers is flagged for single-parent methods too", { f <- make_files(base_genos, base_parents, child1_progeny) - for (m in c("best_male_parent","best_female_parent","best_match")) { - res <- find_parentage(f$g, f$p, f$pr, method = m, + for (m in c("best_male_parent", "best_female_parent", "best_match")) { + out <- find_parentage(f$g, f$p, f$pr, method = m, min_markers = 99999, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(res$Assignment_Status, "LOW_MARKERS", - label = paste("LOW_MARKERS for method", m)) + plot_results = FALSE) + expect_equal(out$full_results$status, "low_markers", + label = paste("low_markers for method", m)) } }) +test_that("pass list element contains only pass rows", { + f <- make_files(base_genos, base_parents, base_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + if (nrow(out$pass) > 0) + expect_true(all(out$pass$status == "pass")) +}) + +test_that("high_error list element contains only high_error rows", { + high_error_genos <- data.table::data.table( + id = c("S1", "D1", "bad_child"), + M1 = c(0L, 0L, 2L), M2 = c(0L, 0L, 2L), M3 = c(0L, 0L, 2L), + M4 = c(0L, 0L, 2L), M5 = c(0L, 0L, 2L) + ) + parents <- data.table::data.table(id = c("S1", "D1"), sex = c("M", "F")) + progeny <- data.table::data.table(id = "bad_child") + f <- make_files(high_error_genos, parents, progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + if (nrow(out$high_error) > 0) + expect_true(all(out$high_error$status == "high_error")) +}) + # ============================================================================== # 5. allow_selfing # ============================================================================== - test_that("allow_selfing = FALSE removes self-pairs from candidates", { - ambig_parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("A","A")) + ambig_parents <- data.table::data.table(id = c("S1", "D1"), sex = c("A", "A")) f <- make_files(base_genos, ambig_parents, child1_progeny) - res <- suppressWarnings( + out <- suppressWarnings( find_parentage(f$g, f$p, f$pr, method = "best_pair", allow_selfing = FALSE, show_ties = FALSE, - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) + verbose = FALSE, plot_results = FALSE) ) - if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) - expect_false(res$Male_Parent == res$Female_Parent) + r <- out$full_results + if (!is.na(r$male_parent) && !is.na(r$female_parent)) + expect_false(r$male_parent == r$female_parent) }) # ============================================================================== # 6. show_ties # ============================================================================== - -test_that("show_ties = TRUE produces suffixed columns when ties exist", { +test_that("show_ties = TRUE produces lowercase suffixed columns when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = TRUE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_true(any(grepl("^Male_Parent_", names(res)))) - expect_true(any(grepl("^Female_Parent_", names(res)))) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = TRUE, verbose = FALSE, plot_results = FALSE) + + expect_true(any(grepl("^male_parent_", names(out$full_results)))) + expect_true(any(grepl("^female_parent_", names(out$full_results)))) }) test_that("show_ties = FALSE warns about ties and returns single-result columns", { f <- make_files(tied_genos, tied_parents, tied_progeny) expect_warning( - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE), + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE), regexp = "tied" ) - expect_true("Male_Parent" %in% names(res)) - expect_true("Female_Parent" %in% names(res)) - expect_false(any(grepl("^Male_Parent_\\d", names(res)))) - expect_false(any(grepl("^Female_Parent_\\d", names(res)))) + expect_true("male_parent" %in% names(out$full_results)) + expect_true("female_parent" %in% names(out$full_results)) + expect_false(any(grepl("^male_parent_\\d", names(out$full_results)))) + expect_false(any(grepl("^female_parent_\\d", names(out$full_results)))) }) test_that("base columns are always populated even when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = TRUE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_false(is.na(res$Male_Parent[1])) - expect_false(is.na(res$Female_Parent[1])) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = TRUE, verbose = FALSE, plot_results = FALSE) + + expect_false(is.na(out$full_results$male_parent[1])) + expect_false(is.na(out$full_results$female_parent[1])) }) # ============================================================================== -# 7. verbose / write_txt +# 7. verbose # ============================================================================== - -test_that("verbose = TRUE returns the result as data.table", { +test_that("verbose = TRUE returns the result as a named list", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = TRUE, write_txt = FALSE, plot_results = FALSE) - expect_s3_class(res, "data.table") + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = TRUE, plot_results = FALSE) + expect_type(out, "list") }) -test_that("verbose = FALSE returns the result as data.table", { - f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_s3_class(res, "data.table") +test_that("verbose = FALSE suppresses console output", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_silent( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = FALSE) + ) }) -test_that("write_txt = TRUE creates parentage_testing_results.txt", { +# ============================================================================== +# 8. No write logic — find_parentage does not write files +# ============================================================================== +test_that("no output files are written to disk", { + tmp_dir <- tempfile() + dir.create(tmp_dir) old_wd <- getwd() - tmp <- tempdir() - setwd(tmp) - on.exit(setwd(old_wd), add = TRUE) - f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) - find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = TRUE, plot_results = FALSE) - expect_true(file.exists(file.path(tmp, "parentage_testing_results.txt"))) -}) - -test_that("write_txt = FALSE does not create the output file", { - old_wd <- getwd() - tmp <- tempdir() - setwd(tmp) - on.exit(setwd(old_wd), add = TRUE) - out_file <- file.path(tmp, "parentage_testing_results.txt") - if (file.exists(out_file)) file.remove(out_file) - f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + + f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp_dir) find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - expect_false(file.exists(out_file)) -}) + verbose = FALSE, plot_results = FALSE) -test_that("na_string = '' is accepted without error", { - f <- make_files(base_genos, base_parents, child1_progeny) - expect_no_error( - find_parentage(f$g, f$p, f$pr, method = "best_pair", - na_string = "", verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - ) + written_files <- list.files(tmp_dir, pattern = "\\.txt$|\\.jpg$|\\.csv$") + # Only the input files we created should be there + expect_equal(length(written_files), 3L) }) # ============================================================================== -# 8. Sex-based candidate filtering +# 9. Sex-based candidate filtering # ============================================================================== - test_that("best_male_parent only assigns M or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - valid_male_parents <- base_parents[Sex %in% c("M","A")]$ID - expect_true(res$Best_Match %in% valid_male_parents) + out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, plot_results = FALSE) + + valid_male_parents <- base_parents[sex %in% c("M", "A")]$id + expect_true(out$full_results$best_match %in% valid_male_parents) }) test_that("best_female_parent only assigns F or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) - valid_female_parents <- base_parents[Sex %in% c("F","A")]$ID - expect_true(res$Best_Match %in% valid_female_parents) + out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, plot_results = FALSE) + + valid_female_parents <- base_parents[sex %in% c("F", "A")]$id + expect_true(out$full_results$best_match %in% valid_female_parents) }) test_that("best_pair male slot contains only M or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - valid_males <- base_parents[Sex %in% c("M","A")]$ID - expect_true(res$Male_Parent %in% valid_males) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + valid_males <- base_parents[sex %in% c("M", "A")]$id + expect_true(out$full_results$male_parent %in% valid_males) }) test_that("best_pair female slot contains only F or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - valid_females <- base_parents[Sex %in% c("F","A")]$ID - expect_true(res$Female_Parent %in% valid_females) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + + valid_females <- base_parents[sex %in% c("F", "A")]$id + expect_true(out$full_results$female_parent %in% valid_females) }) # ============================================================================== -# 9. Edge cases +# 10. Edge cases # ============================================================================== - test_that("single progeny individual is handled correctly", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$full_results), 1L) }) test_that("all-NA marker column does not cause an error", { @@ -501,33 +515,64 @@ test_that("all-NA marker column does not cause an error", { f <- make_files(na_genos, base_parents, child1_progeny) expect_no_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE, plot_results = FALSE) + verbose = FALSE, plot_results = FALSE) ) }) -test_that("Progeny column contains the correct progeny IDs", { +test_that("id column in full_results contains the correct progeny IDs", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_setequal(res$Progeny, child1_progeny$ID) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_setequal(out$full_results$id, child1_progeny$id) }) -test_that("multiple progeny are all represented in output", { +test_that("multiple progeny are all represented in full_results", { f <- make_files(base_genos, base_parents, base_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) - expect_setequal(res$Progeny, base_progeny$ID) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_setequal(out$full_results$id, base_progeny$id) }) test_that("single parent pair (n_pairs = 1) does not cause dimension errors", { - single_pair_parents <- data.table::data.table(ID = c("S1","D1"), - Sex = c("M","F")) - f <- make_files(base_genos, single_pair_parents, child1_progeny) + single_pair_parents <- data.table::data.table(id = c("S1", "D1"), + sex = c("M", "F")) + f <- make_files(base_genos, single_pair_parents, child1_progeny) expect_no_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, - write_txt = FALSE, plot_results = FALSE) + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + ) +}) + +test_that("one row returned per progeny for single-parent methods", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_male_parent", "best_female_parent", "best_match")) { + out <- find_parentage(f$g, f$p, f$pr, method = m, + verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$full_results), 1L, + label = paste("row count for method", m)) + } +}) + +# ============================================================================== +# 11. plot element +# ============================================================================== +test_that("plot element is a ggplot when plot_results = TRUE", { + skip_if_not_installed("ggplot2") + f <- make_files(base_genos, base_parents, child1_progeny) + out <- suppressWarnings( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = TRUE) + ) + expect_s3_class(out$plot, "ggplot") +}) + +# ============================================================================== +# 12. Return value is invisible +# ============================================================================== +test_that("find_parentage returns invisibly", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_invisible( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = FALSE) ) }) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index 02acc22..dabaaad 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -6,32 +6,30 @@ library(data.table) # ============================================================================== # Helpers # ============================================================================== - make_genos <- function() { - # IND_A: all 0, IND_B: all 2, IND_C: all 1 (het), IND_D: all 0, IND_E: all 0 n_markers <- 20 marker_names <- paste0("M", seq_len(n_markers)) dt <- data.table( - ID = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), + id = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), rbind( rep(0L, n_markers), # IND_A — all ref homozygous rep(2L, n_markers), # IND_B — all alt homozygous rep(1L, n_markers), # IND_C — all het (valid child of IND_A x IND_B) - rep(0L, n_markers), # IND_D — all ref (impossible child of IND_B x IND_A) + rep(0L, n_markers), # IND_D — impossible child of IND_B x IND_A rep(0L, n_markers) # IND_E — all ref ) ) - setnames(dt, c("ID", marker_names)) + setnames(dt, c("id", marker_names)) dt } make_pedigree <- function() { - # IND_C: perfect Mendelian child of IND_A x IND_B -> PASS - # IND_D: declared parents swapped -> FAIL + # IND_C: perfect Mendelian child of IND_A x IND_B -> pass + # IND_D: declared parents swapped -> fail data.table( - ID = c("IND_C", "IND_D"), - Male_Parent = c("IND_A", "IND_B"), - Female_Parent = c("IND_B", "IND_A") + id = c("IND_C", "IND_D"), + male_parent = c("IND_A", "IND_B"), + female_parent = c("IND_B", "IND_A") ) } @@ -43,462 +41,419 @@ write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { list(ped = ped_file, genos = genos_file) } -# Helper to run validate_pedigree in a temporary working directory -# so corrected_pedigree.txt doesn't pollute the real wd -run_in_tmpdir <- function(expr) { - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - force(expr) -} - # ============================================================================== # 1. Input validation # ============================================================================== - test_that("trio_error_threshold out of range raises an error", { f <- write_temp_files() - run_in_tmpdir({ - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = 150, - verbose = FALSE, write_results = FALSE, - plot_results = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = -1, - verbose = FALSE, write_results = FALSE, - plot_results = FALSE)) - }) + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = 150, + verbose = FALSE, plot_results = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + trio_error_threshold = -1, + verbose = FALSE, plot_results = FALSE)) }) test_that("single_parent_error_threshold out of range raises an error", { f <- write_temp_files() - run_in_tmpdir({ - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = 101, - verbose = FALSE, write_results = FALSE, - plot_results = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = -5, - verbose = FALSE, write_results = FALSE, - plot_results = FALSE)) - }) + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = 101, + verbose = FALSE, plot_results = FALSE)) + expect_error(validate_pedigree(f$ped, f$genos, + single_parent_error_threshold = -5, + verbose = FALSE, plot_results = FALSE)) }) test_that("missing required pedigree column raises an error", { - bad_ped <- data.table(ID = "IND_C", Parent1 = "IND_A", Female_Parent = "IND_B") + bad_ped <- data.table(id = "IND_C", parent1 = "IND_A", female_parent = "IND_B") f <- write_temp_files(ped = bad_ped) - run_in_tmpdir({ - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, - plot_results = FALSE), - regexp = "missing required columns" - ) - }) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE), + regexp = "missing required columns" + ) }) -test_that("missing ID column in genotypes raises an error", { +test_that("missing id column in genotypes raises an error", { bad_genos <- copy(make_genos()) - setnames(bad_genos, "ID", "SampleID") + setnames(bad_genos, "id", "SampleID") f <- write_temp_files(genos = bad_genos) - run_in_tmpdir({ - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, - plot_results = FALSE), - regexp = "ID" - ) - }) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE), + regexp = "id" + ) }) -test_that("all trios with no genotype data stop with an error", { - ped <- data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") +test_that("all trios with no genotype data stops with an error", { + ped <- data.table(id = "GHOST", male_parent = "IND_A", female_parent = "IND_B") f <- write_temp_files(ped = ped) - run_in_tmpdir({ - expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_results = FALSE, - plot_results = FALSE), - regexp = "No valid trios remain" - ) - }) + expect_error( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE), + regexp = "No valid trios remain" + ) }) # ============================================================================== -# 2. Return structure +# 2. Return structure — named list # ============================================================================== +test_that("returns an invisible named list with all required elements", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) -test_that("returns a data.table", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_s3_class(res, "data.table") - }) + expect_type(out, "list") + expect_named(out, c("pass", "fail", "low_markers", "no_genotype_data", + "founders", "missing_parents", "full_results", + "corrected_pedigree", "plot"), + ignore.order = TRUE) }) -test_that("result has one row per pedigree entry", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_equal(nrow(res), 2L) - }) +test_that("full_results has one row per pedigree entry", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(nrow(out$full_results), 2L) }) -test_that("result has all expected columns", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expected_cols <- c( - "ID", "Orig_Male_Parent", "Orig_Female_Parent", - "Trio_Mendelian_Error_Pct", "Trio_Markers_Tested", "Status", - "Recommended_Correction", - "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", - "Best_Male_Candidate", "Best_Male_Candidate_Error_Pct", - "Best_Female_Candidate", "Best_Female_Candidate_Error_Pct" - ) - expect_true(all(expected_cols %in% names(res))) - }) +test_that("full_results has all expected lowercase columns", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expected_cols <- c( + "id", "orig_male_parent", "orig_female_parent", + "trio_mendelian_error_pct", "trio_markers_tested", "status", + "recommended_correction", + "male_parent_hom_error_pct", "female_parent_hom_error_pct", + "best_male_candidate", "best_male_candidate_error_pct", + "best_female_candidate", "best_female_candidate_error_pct" + ) + expect_true(all(expected_cols %in% names(out$full_results))) +}) + +test_that("full_results is a data.table", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_s3_class(out$full_results, "data.table") +}) + +test_that("corrected_pedigree is a data.table with lowercase columns", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_s3_class(out$corrected_pedigree, "data.table") + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) +}) + +test_that("plot element is NULL when plot_results = FALSE", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_null(out$plot) +}) + +test_that("named list subsets sum correctly to full_results row count", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + subset_total <- nrow(out$pass) + nrow(out$fail) + nrow(out$low_markers) + + nrow(out$no_genotype_data) + nrow(out$founders) + nrow(out$missing_parents) + expect_equal(subset_total, nrow(out$full_results)) }) # ============================================================================== -# 3. PASS / FAIL / LOW_MARKERS / NO_DATA statuses +# 3. pass / fail / low_markers / no_data statuses — lowercase # ============================================================================== +test_that("pass trio is correctly identified", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) -test_that("PASS trio is correctly identified", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - pass_row <- res[ID == "IND_C"] - expect_equal(nrow(pass_row), 1L) - expect_equal(pass_row$Status, "PASS") - expect_equal(pass_row$Trio_Mendelian_Error_Pct, 0) - expect_equal(pass_row$Recommended_Correction, "NONE") - }) -}) - -test_that("FAIL trio is correctly identified", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(nrow(fail_row), 1L) - expect_equal(fail_row$Status, "FAIL") - expect_gt(fail_row$Trio_Mendelian_Error_Pct, 5.0) - }) + pass_row <- out$full_results[id == "IND_C"] + expect_equal(nrow(pass_row), 1L) + expect_equal(pass_row$status, "pass") + expect_equal(pass_row$trio_mendelian_error_pct, 0) + expect_equal(pass_row$recommended_correction, "none") }) -test_that("FAIL trio has REMOVE_MALE_PARENT decision with best match populated", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(fail_row$Recommended_Correction, "REMOVE_MALE_PARENT") - expect_false(is.na(fail_row$Best_Male_Candidate)) - expect_true(is.na(fail_row$Best_Female_Candidate)) - }) +test_that("fail trio is correctly identified", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + fail_row <- out$full_results[id == "IND_D"] + expect_equal(nrow(fail_row), 1L) + expect_equal(fail_row$status, "fail") + expect_gt(fail_row$trio_mendelian_error_pct, 5.0) }) -test_that("Trio_Mendelian_Error_Pct is 0 for a perfect trio", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_equal(res[ID == "IND_C"]$Trio_Mendelian_Error_Pct, 0) - }) +test_that("fail trio has remove_male_parent decision and best candidate populated", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + fail_row <- out$full_results[id == "IND_D"] + expect_equal(fail_row$recommended_correction, "remove_male_parent") + expect_false(is.na(fail_row$best_male_candidate)) + expect_true(is.na(fail_row$best_female_candidate)) }) -test_that("Trio_Markers_Tested equals number of markers for complete data", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_equal(res[ID == "IND_C"]$Trio_Markers_Tested, 20L) - }) +test_that("trio_mendelian_error_pct is 0 for a perfect trio", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results[id == "IND_C"]$trio_mendelian_error_pct, 0) }) -test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { - f <- write_temp_files() - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE, - min_markers = 25L) - expect_true(all(res$Status == "LOW_MARKERS")) - expect_true(all(grepl("^LOW_MARKERS_", res$Recommended_Correction))) - }) +test_that("trio_markers_tested equals number of markers for complete data", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results[id == "IND_C"]$trio_markers_tested, 20L) }) -test_that("NA markers reduce Trio_Markers_Tested and do not cause errors", { +test_that("low_markers status assigned when markers_tested < min_markers", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + plot_results = FALSE, min_markers = 25L) + + expect_true(all(out$full_results$status == "low_markers")) + expect_true(all(grepl("^low_markers_", out$full_results$recommended_correction))) +}) + +test_that("NA markers reduce trio_markers_tested and do not cause errors", { genos <- make_genos() - genos[ID == "IND_C", M1 := NA_integer_] - genos[ID == "IND_C", M2 := NA_integer_] - f <- write_temp_files(genos = genos) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_equal(res[ID == "IND_C"]$Trio_Markers_Tested, 18L) - expect_equal(res[ID == "IND_C"]$Status, "PASS") - }) + genos[id == "IND_C", M1 := NA_integer_] + genos[id == "IND_C", M2 := NA_integer_] + f <- write_temp_files(genos = genos) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results[id == "IND_C"]$trio_markers_tested, 18L) + expect_equal(out$full_results[id == "IND_C"]$status, "pass") +}) + +test_that("pass list element contains only pass rows", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_true(all(out$pass$status == "pass")) +}) + +test_that("fail list element contains only fail rows", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_true(all(out$fail$status == "fail")) }) # ============================================================================== -# 4. Missing parent statuses (MISSING_MALE_PARENT / MISSING_FEMALE_PARENT / -# MISSING_BOTH_PARENTS) +# 4. missing parent statuses # ============================================================================== +test_that("missing_male_parent status and recommendation are correct", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] -test_that("MISSING_MALE_PARENT status and recommendation are correct", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_MALE_PARENT") - expect_equal(r$Recommended_Correction, "NONE") - expect_false(is.na(r$Best_Male_Candidate)) - expect_true(is.na(r$Best_Female_Candidate)) - }) -}) - -test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "IND_A", Female_Parent = "0") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_FEMALE_PARENT") - expect_equal(r$Recommended_Correction, "NONE") - expect_true(is.na(r$Best_Male_Candidate)) - expect_false(is.na(r$Best_Female_Candidate)) - }) -}) - -test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_BOTH_PARENTS") - expect_equal(r$Recommended_Correction, "NONE") - expect_false(is.na(r$Best_Male_Candidate)) - expect_false(is.na(r$Best_Female_Candidate)) - }) -}) - -test_that("MISSING_* rows preserve 0s in corrected_pedigree.txt", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + expect_equal(r$status, "missing_male_parent") + expect_equal(r$recommended_correction, "none") + expect_false(is.na(r$best_male_candidate)) + expect_true(is.na(r$best_female_candidate)) +}) - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), - colClasses = "character") - # MISSING_MALE_PARENT — male stays "0", female unchanged - expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") - expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") +test_that("missing_female_parent status and recommendation are correct", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "IND_A", + female_parent = "0")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + + expect_equal(r$status, "missing_female_parent") + expect_equal(r$recommended_correction, "none") + expect_true(is.na(r$best_male_candidate)) + expect_false(is.na(r$best_female_candidate)) }) -test_that("Best_Male_Candidate for MISSING_MALE_PARENT is not the known female parent", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - r <- res[ID == "IND_E"] - # The known female parent should not be suggested as the best male candidate - expect_false(r$Best_Male_Candidate == "IND_B") - }) +test_that("missing_both_parents status and recommendations are correct", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", female_parent = "0")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + + expect_equal(r$status, "missing_both_parents") + expect_equal(r$recommended_correction, "none") + expect_false(is.na(r$best_male_candidate)) + expect_false(is.na(r$best_female_candidate)) +}) + +test_that("best_male_candidate for missing_male_parent is not the known female parent", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + + expect_false(r$best_male_candidate == "IND_B") +}) + +test_that("missing_parents list element contains only missing_* rows", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", female_parent = "0")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_true(all(grepl("^missing_", out$missing_parents$status))) }) # ============================================================================== -# 5. FOUNDERS status +# 5. founders status # ============================================================================== +test_that("founders status is assigned when ID in founders list with 0 0 parents", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", female_parent = "0")) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(id = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_A"] -test_that("FOUNDERS status is assigned when ID in founders list with 0 0 parents", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") - ) + expect_equal(r$status, "founders") + expect_equal(r$recommended_correction, "none") + expect_true(is.na(r$best_male_candidate)) + expect_true(is.na(r$best_female_candidate)) +}) + +test_that("founders list element contains only founders rows", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", female_parent = "0")) founders_file <- tempfile(fileext = ".txt") - fwrite(data.table(ID = "IND_A"), founders_file, + fwrite(data.table(id = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_results = FALSE, - plot_results = FALSE) - founder_row <- res[ID == "IND_A"] - expect_equal(founder_row$Status, "FOUNDERS") - expect_equal(founder_row$Recommended_Correction, "NONE") - expect_true(is.na(founder_row$Best_Male_Candidate)) - expect_true(is.na(founder_row$Best_Female_Candidate)) - }) -}) - -test_that("non-founder rows are still evaluated normally when founders file is supplied", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") - ) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + + expect_true(all(out$founders$status == "founders")) +}) + +test_that("non-founder rows still evaluated normally when founders file is supplied", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", female_parent = "0")) founders_file <- tempfile(fileext = ".txt") - fwrite(data.table(ID = "IND_A"), founders_file, + fwrite(data.table(id = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_results = FALSE, - plot_results = FALSE) - # IND_C has real parents — should still get PASS - expect_equal(res[ID == "IND_C"]$Status, "PASS") - }) -}) - -test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") - }) -}) - -test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUNDERS", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, - founders_file = NULL, - verbose = FALSE, - write_results = FALSE, - plot_results = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") - }) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results[id == "IND_C"]$status, "pass") +}) + +test_that("0 0 parents NOT in founders list get missing_both_parents", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", female_parent = "0")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results[id == "IND_E"]$status, "missing_both_parents") }) # ============================================================================== -# 6. NO_GENOTYPE_DATA status +# 6. no_genotype_data status # ============================================================================== +test_that("no_genotype_data is flagged for progeny absent from genotype file", { + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "GHOST"] -test_that("NO_GENOTYPE_DATA is flagged for progeny absent from genotype file", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_equal(nrow(ghost_row), 1L) - expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") - expect_equal(ghost_row$Recommended_Correction, "NONE") - }) -}) - -test_that("NO_GENOTYPE_DATA rows have NA/0 for all analysis columns", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_true(is.na(ghost_row$Trio_Mendelian_Error_Pct)) - expect_equal(ghost_row$Trio_Markers_Tested, 0L) - expect_true(is.na(ghost_row$Best_Male_Candidate)) - expect_true(is.na(ghost_row$Best_Female_Candidate)) - }) -}) - -test_that("NO_GENOTYPE_DATA flagged when declared parent is absent from genotype file", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_C_GHOST", Male_Parent = "GHOST_DAD", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") - }) + expect_equal(nrow(r), 1L) + expect_equal(r$status, "no_genotype_data") + expect_equal(r$recommended_correction, "none") +}) + +test_that("no_genotype_data rows have NA/0 for all analysis columns", { + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "GHOST"] + + expect_true(is.na(r$trio_mendelian_error_pct)) + expect_equal(r$trio_markers_tested, 0L) + expect_true(is.na(r$best_male_candidate)) + expect_true(is.na(r$best_female_candidate)) +}) + +test_that("no_genotype_data flagged when declared parent absent from genotype file", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_C_GHOST", male_parent = "GHOST_DAD", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results[id == "IND_C_GHOST"]$status, "no_genotype_data") +}) + +test_that("no_genotype_data list element contains only no_genotype_data rows", { + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_true(all(out$no_genotype_data$status == "no_genotype_data")) }) test_that("valid trios still evaluated correctly when ghost rows are present", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - run_in_tmpdir({ - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_equal(res[ID == "IND_C"]$Status, "PASS") - expect_equal(res[ID == "IND_D"]$Status, "FAIL") - }) + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(out$full_results[id == "IND_C"]$status, "pass") + expect_equal(out$full_results[id == "IND_D"]$status, "fail") }) # ============================================================================== -# 7. Corrected pedigree output +# 7. corrected_pedigree contents # ============================================================================== +test_that("corrected_pedigree: pass parents are unchanged", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + corr <- out$corrected_pedigree -test_that("corrected_pedigree.txt is always written and PASS parents are unchanged", { - f <- write_temp_files() - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + expect_equal(as.character(corr[id == "IND_C"]$male_parent), "IND_A") + expect_equal(as.character(corr[id == "IND_C"]$female_parent), "IND_B") +}) - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - expect_true(file.exists(file.path(tmp_dir, "corrected_pedigree.txt"))) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) - # IND_C passes — parents must be unchanged - expect_equal(as.character(corr[ID == "IND_C"]$Male_Parent), "IND_A") - expect_equal(as.character(corr[ID == "IND_C"]$Female_Parent), "IND_B") +test_that("corrected_pedigree: bad parent set to 0 for fail trio with remove_male_parent", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + corr <- out$corrected_pedigree + + expect_equal(corr[id == "IND_D"]$male_parent, "0") + expect_equal(corr[id == "IND_D"]$female_parent, "IND_A") }) -test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { +test_that("corrected_pedigree has same number of rows as original pedigree", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + expect_equal(nrow(out$corrected_pedigree), nrow(make_pedigree())) +}) + +# ============================================================================== +# 8. No write logic — functions do not write files +# ============================================================================== +test_that("no output files are written to disk", { f <- write_temp_files() tmp_dir <- tempfile() dir.create(tmp_dir) @@ -506,56 +461,35 @@ test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, plot_results = FALSE) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), - colClasses = "character") - # IND_D fails with REMOVE_MALE_PARENT — male should become "0" - expect_equal(corr[ID == "IND_D"]$Male_Parent, "0") - expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + + written_files <- list.files(tmp_dir) + expect_length(written_files, 0) }) # ============================================================================== -# 8. write_results / output file +# 9. verbose and plot_results # ============================================================================== +test_that("verbose = FALSE suppresses console output", { + f <- write_temp_files() + expect_silent(validate_pedigree(f$ped, f$genos, verbose = FALSE, + plot_results = FALSE)) +}) -test_that("write_results = TRUE writes output file with correct row count", { - f <- write_temp_files() - out_file <- tempfile(fileext = ".txt") - run_in_tmpdir({ - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = TRUE, output_filename = out_file, - plot_results = FALSE) - expect_true(file.exists(out_file)) - written <- fread(out_file) - expect_equal(nrow(written), 2L) - }) -}) - -test_that("write_results = FALSE does not create output file", { - f <- write_temp_files() - out_file <- tempfile(fileext = ".txt") - run_in_tmpdir({ - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = FALSE, output_filename = out_file, - plot_results = FALSE) - expect_false(file.exists(out_file)) - }) -}) - -test_that("output file contains correct number of rows when ghost rows present", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") +test_that("plot element is a ggplot when plot_results = TRUE", { + skip_if_not_installed("ggplot2") + f <- write_temp_files() + out <- suppressWarnings( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = TRUE) ) - f <- write_temp_files(ped = ped) - out_file <- tempfile(fileext = ".txt") - run_in_tmpdir({ - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_results = TRUE, output_filename = out_file, - plot_results = FALSE) - written <- fread(out_file) - # 2 valid trios + 1 ghost = 3 rows total - expect_equal(nrow(written), 3L) - }) + expect_s3_class(out$plot, "ggplot") +}) + +# ============================================================================== +# 10. Return value is invisible +# ============================================================================== +test_that("validate_pedigree returns invisibly", { + f <- write_temp_files() + expect_invisible(validate_pedigree(f$ped, f$genos, verbose = FALSE, + plot_results = FALSE)) }) From 26bae5f365ec82befc2c68c90d53cffbc18d7541 Mon Sep 17 00:00:00 2001 From: "josue.chinchilla" Date: Tue, 26 May 2026 16:35:39 -0400 Subject: [PATCH 16/16] updated functions to work as a workflow accepting inputs from objects or file paths, updated documentation and test files --- NAMESPACE | 2 + R/check_ped.R | 58 +-- R/find_parentage.R | 89 +++-- R/validate_pedigree.R | 77 ++-- inst/check_ped_test.txt | 9 +- man/check_ped.Rd | 18 +- man/find_parentage.Rd | 28 +- man/validate_pedigree.Rd | 11 +- tests/testthat/test-check_ped.R | 503 ++---------------------- tests/testthat/test-find_parentage.R | 136 ++++--- tests/testthat/test-validate_pedigree.R | 470 +++++++++++++++++----- 11 files changed, 638 insertions(+), 763 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f94124f..4d25dfe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,fread) +importFrom(data.table,is.data.table) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(dplyr,"%>%") @@ -69,6 +70,7 @@ importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) importFrom(ggplot2,theme) importFrom(ggplot2,theme_minimal) +importFrom(janitor,clean_names) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) diff --git a/R/check_ped.R b/R/check_ped.R index 78c53ed..259b696 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -6,7 +6,8 @@ #' roles are corrected when their respective arguments are TRUE. Cycles are #' reported only and must be resolved manually. #' -#' @param ped.file Path to the pedigree text file. +#' @param ped.file Path to the pedigree text file (TSV/CSV/TXT), OR a +#' data.frame / data.table with columns: id, male_parent, female_parent. #' @param seed Optional integer seed for reproducibility. #' @param verbose Logical. If TRUE (default), prints the report to the console. #' @param correct_conflicting_trios Logical. If TRUE (default), sets conflicting @@ -29,21 +30,19 @@ #' ped_file <- system.file("check_ped_test.txt", package = "BIGr") #' ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) #' -#' ped_errors$inconsistent_sex_roles -#' ped_errors$corrected_pedigree -#' -#' # Identify the conflicting IDs from the parent columns -#' conflicting_sex_ids <- unique(c( -#' ped_errors$inconsistent_sex_roles$male_parent, -#' ped_errors$inconsistent_sex_roles$female_parent -#' )) -#' print(conflicting_sex_ids) +#' # Also accepts a data.table directly +#' library(data.table) +#' ped_dt <- data.table(id = c("A","B","C"), +#' male_parent = c("0","0","A"), +#' female_parent = c("0","0","B")) +#' ped_errors <- check_ped(ped.file = ped_dt, verbose = FALSE) #' #' @author Josue Chinchilla-Vargas #' #' @importFrom dplyr %>% mutate filter group_by ungroup summarize distinct bind_rows select first n n_distinct if_else row_number #' @importFrom stats setNames #' @importFrom utils read.table +#' @importFrom janitor clean_names #' @export check_ped <- function(ped.file, seed = NULL, @@ -54,22 +53,26 @@ check_ped <- function(ped.file, #### setup #### if (!is.null(seed)) set.seed(seed) - # read inputs first, then normalize column names to lowercase - data <- utils::read.table(ped.file, header = TRUE) - names(data) <- tolower(names(data)) + # Accept file path OR in-memory data.frame / data.table + if (is.character(ped.file) && length(ped.file) == 1 && file.exists(ped.file)) { + data <- utils::read.table(ped.file, header = TRUE) + data <- janitor::clean_names(data) + } else if (is.data.frame(ped.file) || data.table::is.data.table(ped.file)) { + data <- as.data.frame(ped.file) + data <- janitor::clean_names(data) + } else { + stop("ped.file must be a valid file path (character) or a data.frame / data.table.") + } - # validate required columns after normalization required_cols <- c("id", "male_parent", "female_parent") missing_cols <- setdiff(required_cols, colnames(data)) if (length(missing_cols) > 0) { stop( - "Input file is missing required column(s): ", + "Input is missing required column(s): ", paste(missing_cols, collapse = ", "), ".\nExpected columns: id, male_parent, female_parent." ) } - - # reorder so required columns always come first regardless of column order in the input file extra_cols <- setdiff(names(data), required_cols) data <- data[, c(required_cols, extra_cols)] @@ -111,7 +114,8 @@ check_ped <- function(ped.file, conflicting_trios_ids <- repeated_ids %>% dplyr::group_by(id) %>% - dplyr::filter(dplyr::n_distinct(male_parent) > 1 | dplyr::n_distinct(female_parent) > 1) %>% + dplyr::filter(dplyr::n_distinct(male_parent) > 1 | + dplyr::n_distinct(female_parent) > 1) %>% dplyr::ungroup() if (correct_conflicting_trios && nrow(conflicting_trios_ids) > 0) { @@ -152,7 +156,8 @@ check_ped <- function(ped.file, } if (male_parent == id || female_parent == id) { - errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) + errors <- append(errors, paste("Dependency: Individual", id, + "cannot be its own parent")) } } @@ -166,8 +171,6 @@ check_ped <- function(ped.file, female_ids <- unique(data$female_parent[data$female_parent != "0"]) conflicting_sex_ids <- intersect(male_ids, female_ids) - # FIX 4: return the rows where the conflicting ID appears AS A PARENT - # (not the row where it is the subject) — this shows the actual contradiction inconsistent_sex_roles <- data %>% dplyr::filter(male_parent %in% conflicting_sex_ids | female_parent %in% conflicting_sex_ids) @@ -181,7 +184,7 @@ check_ped <- function(ped.file, dplyr::distinct(id, male_parent, female_parent, .keep_all = TRUE) } - #### check 5: dependencies (cycles) -- reported only, never modified #### + #### check 5: dependencies (cycles) -- reported only #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { row <- data[data$id == x, ] @@ -216,9 +219,8 @@ check_ped <- function(ped.file, for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) - if (length(node_cycles) > 0) { + if (length(node_cycles) > 0) all_cycles <- append(all_cycles, node_cycles) - } } } return(all_cycles) @@ -265,17 +267,17 @@ check_ped <- function(ped.file, } else cat("\nNo conflicting trios found.\n") if (nrow(missing_parents) > 0) { - cat("\nParents missing as IDs found in the pedigree (added as founders in corrected pedigree):\n") + cat("\nParents missing as IDs (added as founders in corrected pedigree):\n") print(missing_parents) } else cat("\nNo missing parents found.\n") if (nrow(inconsistent_sex_roles) > 0) { - cat("\nIDs found as both male_parent and female_parent (is selfing or hermaphrodytism possible?):\n") + cat("\nIDs found as both male_parent and female_parent:\n") print(inconsistent_sex_roles) if (correct_inconsistent_sex_roles) { - cat(" -> parent fields set to 0 for conflicting IDs; exact duplicates removed in corrected pedigree.\n") + cat(" -> parent fields set to 0 for conflicting IDs in corrected pedigree.\n") } else { - cat(" -> correct_inconsistent_sex_roles = FALSE: left as-is in corrected pedigree.\n") + cat(" -> correct_inconsistent_sex_roles = FALSE: left as-is.\n") } } else cat("\nNo IDs found as both male_parent and female_parent.\n") diff --git a/R/find_parentage.R b/R/find_parentage.R index 56cd41b..e53e66d 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -4,41 +4,41 @@ #' using Mendelian error rates or homozygous mismatch rates. Parents or progeny #' absent from the genotype file are removed with a warning. #' -#' @param genotypes_file Path to a TSV/CSV/TXT file with an 'id' column -#' followed by marker columns coded as 0, 1, 2 (allele dosage). -#' @param parents_file Path to a TSV/CSV/TXT file with an 'id' column and an -#' optional 'sex' column ('M', 'F', or 'A'). If absent, all parents are -#' treated as ambiguous. -#' @param progeny_file Path to a TSV/CSV/TXT file with an 'id' column. +#' @param genotypes_file Path to a TSV/CSV/TXT file, OR a data.frame / +#' data.table with an 'id' column followed by marker columns coded as 0, 1, 2. +#' @param parents_file Path to a TSV/CSV/TXT file, OR a data.frame / +#' data.table with an 'id' column and an optional 'sex' column +#' ('M', 'F', or 'A'). If absent, all parents are treated as ambiguous. +#' @param progeny_file Path to a TSV/CSV/TXT file, OR a data.frame / +#' data.table with an 'id' column. #' @param method Character. One of \code{"best_male_parent"}, #' \code{"best_female_parent"}, \code{"best_match"}, or -#' \code{"best_pair"} (default). See Details. +#' \code{"best_pair"} (default). #' @param min_markers Integer. Minimum markers required; fewer flags #' \code{low_markers} (default: \code{10}). #' @param error_threshold Numeric. Maximum mismatch percentage; exceeded values #' flag \code{high_error} (default: \code{5.0}). Must be between 0 and 100. #' @param show_ties Logical. If \code{TRUE}, tied best pairs are appended as -#' suffix columns (e.g. \code{male_parent_2}) for \code{"best_pair"}. -#' Default is \code{TRUE}. +#' suffix columns. Default is \code{TRUE}. #' @param allow_selfing Logical. If \code{FALSE}, pairs with identical male and #' female parent IDs are excluded. Default is \code{TRUE}. -#' @param verbose Logical. If \code{TRUE}, prints progress and summary to the -#' console. Default is \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints progress and summary. +#' Default is \code{TRUE}. #' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error -#' distribution colored by status. Requires \code{ggplot2}. Default is \code{TRUE}. +#' distribution. Requires \code{ggplot2}. Default is \code{TRUE}. #' -#' @return A named list (returned invisibly) with the following elements: +#' @return A named list (returned invisibly) with elements: #' \describe{ #' \item{pass}{Progeny with a confident parentage assignment.} #' \item{high_error}{Progeny whose best assignment exceeds the error threshold.} #' \item{low_markers}{Progeny with insufficient markers for a valid assignment.} #' \item{full_results}{Complete data.table with all progeny and all output columns.} -#' \item{plot}{ggplot object. Use ggsave() to save if desired.} +#' \item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} #' } #' #' @author Josue Chinchilla-Vargas #' -#' @importFrom data.table fread copy CJ rbindlist set data.table as.data.table +#' @importFrom data.table fread copy CJ rbindlist set data.table as.data.table is.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best_pair", @@ -53,8 +53,9 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, id <- sex <- male_parent <- female_parent <- NULL mendelian_error_pct <- plot_status <- status <- NULL - #### Input Validation and Data Loading #### - allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") + #### Input Validation #### + allowed_methods <- c("best_male_parent", "best_female_parent", + "best_match", "best_pair") if (!method %in% allowed_methods) stop("Method must be one of: ", paste(allowed_methods, collapse = ", ")) if (min_markers < 1) @@ -62,10 +63,23 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (error_threshold < 0 || error_threshold > 100) stop("error_threshold must be between 0 and 100.") + # Accept file path OR in-memory data.frame / data.table + read_flex <- function(x, label, ...) { + if (is.character(x) && length(x) == 1) { + if (!file.exists(x)) + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") + data.table::fread(x, sep = "auto", ...) + } else if (is.data.frame(x) || data.table::is.data.table(x)) { + data.table::as.data.table(x) + } else { + stop(label, " must be a file path (character) or a data.frame / data.table.") + } + } + tryCatch({ - genos <- data.table::fread(genotypes_file, sep = "auto") - all_parents <- data.table::fread(parents_file, sep = "auto") - progeny_candidates <- data.table::fread(progeny_file, sep = "auto") + genos <- read_flex(genotypes_file, "genotypes_file") + all_parents <- read_flex(parents_file, "parents_file") + progeny_candidates <- read_flex(progeny_file, "progeny_file") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) @@ -94,9 +108,11 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, male_parent_candidates <- all_parents[sex %in% c("M", "A", "NA"), .SD] female_parent_candidates <- all_parents[sex %in% c("F", "A", "NA")] - if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) + if (base::nrow(male_parent_candidates) == 0 && + method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) - if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) + if (base::nrow(female_parent_candidates) == 0 && + method %in% c("best_female_parent", "best_pair")) warning("No valid female parent candidates remain after filtering.", call. = FALSE) if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") @@ -124,6 +140,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, female_parent_candidates$id)) parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$id, , drop = FALSE] + n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( id = base::rownames(progeny_genos), @@ -132,18 +149,21 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, markers_tested = NA_integer_, status = NA_character_ ) + for (i in base::seq_len(n_progeny)) { progeny_vec <- progeny_genos[i, ] mismatches <- base::rowSums(parent_genos != progeny_vec, na.rm = TRUE) comparisons <- base::rowSums(!base::is.na(parent_genos) & !base::is.na(progeny_vec)) percent_mismatch <- (mismatches / comparisons) * 100 percent_mismatch[base::is.nan(percent_mismatch)] <- NA + best_idx <- base::which.min(percent_mismatch) if (base::length(best_idx) == 0) { data.table::set(results_dt, i, "markers_tested", 0L) data.table::set(results_dt, i, "status", "low_markers") next } + best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) data.table::set(results_dt, i, "best_match", base::rownames(parent_genos)[best_idx]) @@ -238,7 +258,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (!show_ties && base::nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), - " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) + " tied best pairs. Only one is reported as show_ties=FALSE.", + call. = FALSE) } num_to_report <- base::min(base::nrow(best_pairs), @@ -281,7 +302,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, pass = final_df[status == "pass"], high_error = final_df[status == "high_error"], low_markers = final_df[status == "low_markers"], - full_results = final_df + full_results = final_df, + plot = NULL ) #### Verbose output #### @@ -323,10 +345,10 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, } #### Plot Results #### - p <- NULL if (plot_results) { if (!requireNamespace("ggplot2", quietly = TRUE)) { - warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) + warning("ggplot2 is required for plot_results = TRUE. Please install it.", + call. = FALSE) } else { plot_df <- final_df[!is.na(final_df$mendelian_error_pct)] plot_df$mendelian_error_pct <- base::as.numeric(plot_df$mendelian_error_pct) @@ -335,11 +357,7 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, base::ifelse( plot_df$status == "high_error", "high_error", base::ifelse( - plot_df$status == "low_markers", "low_markers", - "other" - ) - ) - ) + plot_df$status == "low_markers", "low_markers", "other"))) n_total <- base::nrow(plot_df) n_pass <- base::sum(plot_df$status == "pass", na.rm = TRUE) @@ -348,8 +366,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, threshold_label <- base::paste0( "Error Threshold: ", error_threshold, "% | ", - "Pass: ", n_pass, " | ", - "High Error: ", n_high, " | ", + "Pass: ", n_pass, " | ", + "High Error: ", n_high, " | ", "Low Markers: ", n_low ) @@ -374,7 +392,8 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ) + ggplot2::labs( title = "Parentage Mendelian Error Distribution", - subtitle = base::paste0("Progeny Tested: ", n_total, "\n \n", threshold_label), + subtitle = base::paste0("Progeny Tested: ", n_total, + "\n \n", threshold_label), x = "Mendelian Error (%)", y = "Number of Progeny", fill = "Status" @@ -383,9 +402,9 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, ggplot2::theme(legend.position = "top") base::print(p) + output_list$plot <- p } } - output_list$plot <- p return(base::invisible(output_list)) } diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 030d80f..fc4cd0f 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -7,10 +7,11 @@ #' supplied. Trios absent from the genotype file are retained as #' no_genotype_data. #' -#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) -#' with columns: id, male_parent, female_parent. -#' @param genotypes_file Character. Path to the genotypes file (TSV/CSV/TXT) -#' with an id column followed by marker columns coded as 0, 1, 2. +#' @param pedigree_file Path to the pedigree file (TSV/CSV/TXT), OR a +#' data.frame / data.table with columns: id, male_parent, female_parent. +#' @param genotypes_file Path to the genotypes file (TSV/CSV/TXT), OR a +#' data.frame / data.table with an id column followed by marker columns +#' coded as 0, 1, 2. #' @param founders_file Character, optional. Path to a one-column file listing #' founder IDs. Founders with both parents coded as 0 are left unchanged. #' Defaults to NULL. @@ -36,12 +37,12 @@ #' \item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} #' \item{full_results}{Complete data.table with all trios and all output columns.} #' \item{corrected_pedigree}{Pedigree table after applying recommended corrections.} -#' \item{plot}{ggplot object. Use ggsave() to save if desired.} +#' \item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} #' } #' #' @author Josue Chinchilla-Vargas #' -#' @importFrom data.table fread copy data.table set rbindlist +#' @importFrom data.table fread copy data.table set rbindlist as.data.table is.data.table #' @export validate_pedigree <- function(pedigree_file, genotypes_file, founders_file = NULL, @@ -51,15 +52,32 @@ validate_pedigree <- function(pedigree_file, genotypes_file, verbose = TRUE, plot_results = TRUE) { + ## silence R CMD check NOTEs + id <- male_parent <- female_parent <- status <- trio_mendelian_error_pct <- NULL + plot_status <- recommended_correction <- NULL + #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) stop("trio_error_threshold must be between 0 and 100") if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) stop("single_parent_error_threshold must be between 0 and 100") + # Accept file path OR in-memory data.frame / data.table + read_flex <- function(x, label, ...) { + if (is.character(x) && length(x) == 1) { + if (!file.exists(x)) + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") + data.table::fread(x, sep = "auto", ...) + } else if (is.data.frame(x) || data.table::is.data.table(x)) { + data.table::as.data.table(x) + } else { + stop(label, " must be a file path (character) or a data.frame / data.table.") + } + } + tryCatch({ - pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") - genos <- data.table::fread(genotypes_file, sep = "auto") + pedigree <- read_flex(pedigree_file, "pedigree_file", colClasses = "character") + genos <- read_flex(genotypes_file, "genotypes_file") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) @@ -304,13 +322,11 @@ validate_pedigree <- function(pedigree_file, genotypes_file, "low_markers_remove_both", "low_markers_remove_male_parent", "low_markers_remove_female_parent")) { - if (grepl("male", decision)) { - data.table::set(corrected_pedigree, row_idx, "male_parent", "0") - } - if (grepl("female", decision)) { + if (grepl("male", decision)) + data.table::set(corrected_pedigree, row_idx, "male_parent", "0") + if (grepl("female", decision)) data.table::set(corrected_pedigree, row_idx, "female_parent", "0") - } - if (decision == "low_markers_remove_both" || decision == "remove_both") { + if (decision %in% c("low_markers_remove_both", "remove_both")) { data.table::set(corrected_pedigree, row_idx, "male_parent", "0") data.table::set(corrected_pedigree, row_idx, "female_parent", "0") } @@ -347,16 +363,15 @@ validate_pedigree <- function(pedigree_file, genotypes_file, } else { plot_df <- final_df[!is.na(final_df$trio_mendelian_error_pct)] plot_df$plot_status <- dplyr::case_when( - plot_df$recommended_correction %in% c("none", - "keep_both", - "low_markers_keep_both") ~ "pass", + plot_df$recommended_correction %in% c("none", "keep_both", + "low_markers_keep_both") ~ "pass", plot_df$recommended_correction %in% c("remove_male_parent", "remove_female_parent", "low_markers_remove_male_parent", - "low_markers_remove_female_parent") ~ "fail_one_parent", + "low_markers_remove_female_parent") ~ "fail_one_parent", plot_df$recommended_correction %in% c("remove_both", - "low_markers_remove_both") ~ "fail_both_parents", - TRUE ~ "other" + "low_markers_remove_both") ~ "fail_both_parents", + TRUE ~ "other" ) n_total <- nrow(plot_df) n_fail <- sum(plot_df$trio_mendelian_error_pct > trio_error_threshold) @@ -367,25 +382,27 @@ validate_pedigree <- function(pedigree_file, genotypes_file, "Kept: ", n_pass, " trios" ) p <- ggplot2::ggplot(plot_df, - ggplot2::aes(x = trio_mendelian_error_pct, fill = plot_status)) + + ggplot2::aes(x = trio_mendelian_error_pct, + fill = plot_status)) + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + ggplot2::geom_vline(xintercept = trio_error_threshold, linetype = "dashed", color = "black", linewidth = 1) + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + ggplot2::scale_y_continuous(breaks = seq(0, 100, by = 5)) + ggplot2::scale_fill_manual( - values = c("pass" = "#339900", - "fail_one_parent" = "#F1C40F", - "fail_both_parents"= "#cc3333", - "other" = "#BDC3C7"), - labels = c("pass" = "Pass", - "fail_one_parent" = "Fail - One Parent", - "fail_both_parents"= "Fail - Both Parents", - "other" = "Other") + values = c("pass" = "#339900", + "fail_one_parent" = "#F1C40F", + "fail_both_parents" = "#cc3333", + "other" = "#BDC3C7"), + labels = c("pass" = "Pass", + "fail_one_parent" = "Fail - One Parent", + "fail_both_parents" = "Fail - Both Parents", + "other" = "Other") ) + ggplot2::labs( title = "Trio Mendelian Error Distribution", - subtitle = paste0("Trios with Genotype Data Tested: ", n_total, "\n \n", threshold_label), + subtitle = paste0("Trios with Genotype Data Tested: ", n_total, + "\n \n", threshold_label), x = "Mendelian Error (%)", y = "Number of Trios", fill = "Status" diff --git a/inst/check_ped_test.txt b/inst/check_ped_test.txt index ae7bc08..27eba9d 100644 --- a/inst/check_ped_test.txt +++ b/inst/check_ped_test.txt @@ -1,4 +1,4 @@ -id sire dam +id male_parent female_parent off1 sire1 dam1 off2 sire2 dam2 off3 sire3 dam3 @@ -13,4 +13,9 @@ dam1 grandfather6 grandmother6 dam2 grandfather7 grandmother7 dam3 grandfather8 grandmother8 dam4 0 0 -dam5 0 0 \ No newline at end of file +dam5 0 0 +grandmother4 0 0 +grandmother5 0 0 +grandfather6 0 0 +grandmother6 0 0 +grandfather7 0 0 \ No newline at end of file diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 5f29767..85d7ac3 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -13,7 +13,8 @@ check_ped( ) } \arguments{ -\item{ped.file}{Path to the pedigree text file.} +\item{ped.file}{Path to the pedigree text file (TSV/CSV/TXT), OR a +data.frame / data.table with columns: id, male_parent, female_parent.} \item{seed}{Optional integer seed for reproducibility.} @@ -48,15 +49,12 @@ reported only and must be resolved manually. ped_file <- system.file("check_ped_test.txt", package = "BIGr") ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) -ped_errors$inconsistent_sex_roles -ped_errors$corrected_pedigree - -# Identify the conflicting IDs from the parent columns -conflicting_sex_ids <- unique(c( - ped_errors$inconsistent_sex_roles$male_parent, - ped_errors$inconsistent_sex_roles$female_parent -)) -print(conflicting_sex_ids) +# Also accepts a data.table directly +library(data.table) +ped_dt <- data.table(id = c("A","B","C"), + male_parent = c("0","0","A"), + female_parent = c("0","0","B")) +ped_errors <- check_ped(ped.file = ped_dt, verbose = FALSE) } \author{ diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index a414919..e4c1953 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -18,18 +18,19 @@ find_parentage( ) } \arguments{ -\item{genotypes_file}{Path to a TSV/CSV/TXT file with an 'id' column -followed by marker columns coded as 0, 1, 2 (allele dosage).} +\item{genotypes_file}{Path to a TSV/CSV/TXT file, OR a data.frame / +data.table with an 'id' column followed by marker columns coded as 0, 1, 2.} -\item{parents_file}{Path to a TSV/CSV/TXT file with an 'id' column and an -optional 'sex' column ('M', 'F', or 'A'). If absent, all parents are -treated as ambiguous.} +\item{parents_file}{Path to a TSV/CSV/TXT file, OR a data.frame / +data.table with an 'id' column and an optional 'sex' column +('M', 'F', or 'A'). If absent, all parents are treated as ambiguous.} -\item{progeny_file}{Path to a TSV/CSV/TXT file with an 'id' column.} +\item{progeny_file}{Path to a TSV/CSV/TXT file, OR a data.frame / +data.table with an 'id' column.} \item{method}{Character. One of \code{"best_male_parent"}, \code{"best_female_parent"}, \code{"best_match"}, or -\code{"best_pair"} (default). See Details.} +\code{"best_pair"} (default).} \item{min_markers}{Integer. Minimum markers required; fewer flags \code{low_markers} (default: \code{10}).} @@ -38,26 +39,25 @@ treated as ambiguous.} flag \code{high_error} (default: \code{5.0}). Must be between 0 and 100.} \item{show_ties}{Logical. If \code{TRUE}, tied best pairs are appended as -suffix columns (e.g. \code{male_parent_2}) for \code{"best_pair"}. -Default is \code{TRUE}.} +suffix columns. Default is \code{TRUE}.} \item{allow_selfing}{Logical. If \code{FALSE}, pairs with identical male and female parent IDs are excluded. Default is \code{TRUE}.} -\item{verbose}{Logical. If \code{TRUE}, prints progress and summary to the -console. Default is \code{TRUE}.} +\item{verbose}{Logical. If \code{TRUE}, prints progress and summary. +Default is \code{TRUE}.} \item{plot_results}{Logical. If \code{TRUE}, plots the Mendelian error -distribution colored by status. Requires \code{ggplot2}. Default is \code{TRUE}.} +distribution. Requires \code{ggplot2}. Default is \code{TRUE}.} } \value{ -A named list (returned invisibly) with the following elements: +A named list (returned invisibly) with elements: \describe{ \item{pass}{Progeny with a confident parentage assignment.} \item{high_error}{Progeny whose best assignment exceeds the error threshold.} \item{low_markers}{Progeny with insufficient markers for a valid assignment.} \item{full_results}{Complete data.table with all progeny and all output columns.} -\item{plot}{ggplot object. Use ggsave() to save if desired.} +\item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} } } \description{ diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 662e91c..3daa9d1 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -16,11 +16,12 @@ validate_pedigree( ) } \arguments{ -\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV/TXT) -with columns: id, male_parent, female_parent.} +\item{pedigree_file}{Path to the pedigree file (TSV/CSV/TXT), OR a +data.frame / data.table with columns: id, male_parent, female_parent.} -\item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV/TXT) -with an id column followed by marker columns coded as 0, 1, 2.} +\item{genotypes_file}{Path to the genotypes file (TSV/CSV/TXT), OR a +data.frame / data.table with an id column followed by marker columns +coded as 0, 1, 2.} \item{founders_file}{Character, optional. Path to a one-column file listing founder IDs. Founders with both parents coded as 0 are left unchanged. @@ -53,7 +54,7 @@ An invisible named list with the following elements: \item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} \item{full_results}{Complete data.table with all trios and all output columns.} \item{corrected_pedigree}{Pedigree table after applying recommended corrections.} -\item{plot}{ggplot object. Use ggsave() to save if desired.} +\item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} } } \description{ diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index 57d462c..ff150c3 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -1,9 +1,6 @@ # tests/testthat/test-check_ped.R library(testthat) -# ------------------------------------------------------------------------------ -# Helper: write a data.frame to a temp tab-separated file and return the path -# ------------------------------------------------------------------------------ write_ped <- function(df) { f <- tempfile(fileext = ".txt") utils::write.table(df, f, sep = "\t", row.names = FALSE, quote = FALSE) @@ -12,9 +9,6 @@ write_ped <- function(df) { context("check_ped – Pedigree Quality Checks") -# ============================================================================== -# 1. Return structure -# ============================================================================== test_that("check_ped returns a named list of length 6", { ped <- data.frame( id = c("A", "B", "C"), @@ -22,7 +16,6 @@ test_that("check_ped returns a named list of length 6", { female_parent = c("0", "0", "0") ) out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE) - expect_type(out, "list") expect_length(out, 6) expect_named(out, c( @@ -35,9 +28,6 @@ test_that("check_ped returns a named list of length 6", { )) }) -# ============================================================================== -# 2. Component types -# ============================================================================== test_that("check_ped report components are data.frames", { ped <- data.frame( id = c("A", "B", "C"), @@ -45,7 +35,6 @@ test_that("check_ped report components are data.frames", { female_parent = c("0", "0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_true(is.data.frame(out$exact_duplicates)) expect_true(is.data.frame(out$conflicting_trios)) expect_true(is.data.frame(out$inconsistent_sex_roles)) @@ -54,9 +43,6 @@ test_that("check_ped report components are data.frames", { expect_true(is.data.frame(out$corrected_pedigree)) }) -# ============================================================================== -# 3. corrected_pedigree column names are lowercase, no row_number leak -# ============================================================================== test_that("corrected_pedigree has lowercase column names and no row_number", { ped <- data.frame( id = c("A", "B"), @@ -64,15 +50,11 @@ test_that("corrected_pedigree has lowercase column names and no row_number", { female_parent = c("0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_true(all(c("id", "male_parent", "female_parent") %in% names(out$corrected_pedigree))) expect_false("row_number" %in% names(out$corrected_pedigree)) }) -# ============================================================================== -# 4. Clean pedigree → empty report (no false positives) -# ============================================================================== test_that("clean pedigree produces no issues", { ped <- data.frame( id = c("G1", "G2", "P1"), @@ -80,7 +62,6 @@ test_that("clean pedigree produces no issues", { female_parent = c("0", "0", "G2") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_equal(nrow(out$exact_duplicates), 0) expect_equal(nrow(out$conflicting_trios), 0) expect_equal(nrow(out$inconsistent_sex_roles), 0) @@ -89,9 +70,6 @@ test_that("clean pedigree produces no issues", { expect_equal(nrow(out$corrected_pedigree), 3) }) -# ============================================================================== -# 5. Exact duplicate detection -# ============================================================================== test_that("check_ped detects exact duplicates", { ped <- data.frame( id = c("A", "A", "B"), @@ -99,7 +77,6 @@ test_that("check_ped detects exact duplicates", { female_parent = c("0", "0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_equal(nrow(out$exact_duplicates), 2) expect_true(all(out$exact_duplicates$id == "A")) }) @@ -111,13 +88,9 @@ test_that("exact duplicates are collapsed to one row in corrected_pedigree", { female_parent = c("0", "0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_equal(sum(out$corrected_pedigree$id == "A"), 1) }) -# ============================================================================== -# 6. Conflicting trios -# ============================================================================== test_that("check_ped detects conflicting trios", { ped <- data.frame( id = c("A", "A", "B"), @@ -125,7 +98,6 @@ test_that("check_ped detects conflicting trios", { female_parent = c("M", "M", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_equal(nrow(out$conflicting_trios), 2) expect_true(all(out$conflicting_trios$id == "A")) }) @@ -139,7 +111,6 @@ test_that("correct_conflicting_trios = TRUE: conflicting field -> '0', consisten out <- check_ped(write_ped(ped), verbose = FALSE, correct_conflicting_trios = TRUE) a_row <- out$corrected_pedigree[out$corrected_pedigree$id == "A", ] - expect_equal(nrow(a_row), 1) expect_equal(a_row$male_parent, "0") expect_equal(a_row$female_parent, "M") @@ -153,13 +124,9 @@ test_that("correct_conflicting_trios = FALSE leaves conflicting rows as-is", { ) out <- check_ped(write_ped(ped), verbose = FALSE, correct_conflicting_trios = FALSE) - expect_equal(sum(out$corrected_pedigree$id == "A"), 2) }) -# ============================================================================== -# 7. Missing parent detection and founder row injection -# ============================================================================== test_that("check_ped detects missing parents", { ped <- data.frame( id = c("A", "B"), @@ -167,7 +134,6 @@ test_that("check_ped detects missing parents", { female_parent = c("0", "Y") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_equal(nrow(out$missing_parents), 2) expect_true("X" %in% out$missing_parents$id) expect_true("Y" %in% out$missing_parents$id) @@ -180,13 +146,10 @@ test_that("missing parents are added as founder rows in corrected_pedigree", { female_parent = c("0", "Y") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_true("X" %in% out$corrected_pedigree$id) expect_true("Y" %in% out$corrected_pedigree$id) - x_row <- out$corrected_pedigree[out$corrected_pedigree$id == "X", ] y_row <- out$corrected_pedigree[out$corrected_pedigree$id == "Y", ] - expect_equal(x_row$male_parent, "0") expect_equal(x_row$female_parent, "0") expect_equal(y_row$male_parent, "0") @@ -200,15 +163,11 @@ test_that("individual that is its own parent is logged as a dependency", { female_parent = c("0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_gt(nrow(out$dependencies), 0) }) -# ============================================================================== -# 8. Inconsistent sex roles -# inconsistent_sex_roles returns the rows where the conflicting ID appears -# AS A PARENT (not the row where it is the subject) [1] -# ============================================================================== +# inconsistent_sex_roles returns rows where the conflicting ID appears +# AS A PARENT — check male_parent and female_parent columns [1] test_that("check_ped detects inconsistent sex roles", { ped <- data.frame( id = c("child1", "child2", "P"), @@ -216,7 +175,6 @@ test_that("check_ped detects inconsistent sex roles", { female_parent = c("0", "P", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_true("inconsistent_sex_roles" %in% names(out)) expect_gt(nrow(out$inconsistent_sex_roles), 0) expect_true(any(out$inconsistent_sex_roles$male_parent == "P" | @@ -232,7 +190,6 @@ test_that("correct_inconsistent_sex_roles = TRUE zeros out conflicting parent re out <- check_ped(write_ped(ped), verbose = FALSE, correct_inconsistent_sex_roles = TRUE) corr <- out$corrected_pedigree - expect_false(any(corr$male_parent == "P")) expect_false(any(corr$female_parent == "P")) }) @@ -246,13 +203,9 @@ test_that("correct_inconsistent_sex_roles = FALSE leaves conflicting references" out <- check_ped(write_ped(ped), verbose = FALSE, correct_inconsistent_sex_roles = FALSE) corr <- out$corrected_pedigree - expect_true(any(corr$male_parent == "P" | corr$female_parent == "P")) }) -# ============================================================================== -# 9. Cycle / dependency detection -# ============================================================================== test_that("check_ped detects a direct two-node cycle", { ped <- data.frame( id = c("A", "B"), @@ -260,7 +213,6 @@ test_that("check_ped detects a direct two-node cycle", { female_parent = c("0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_gt(nrow(out$dependencies), 0) }) @@ -271,16 +223,10 @@ test_that("cycle-involved IDs are still present in corrected_pedigree", { female_parent = c("0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_true("A" %in% out$corrected_pedigree$id) expect_true("B" %in% out$corrected_pedigree$id) }) -# ============================================================================== -# 10. Missing required columns → hard stop -# tolower(names()) in check_ped.R normalizes caps before validation [1] -# so only truly unrecognizable columns trigger this error -# ============================================================================== test_that("check_ped errors when required columns are missing", { bad_df <- data.frame( animal_id = c("a", "b"), @@ -289,13 +235,10 @@ test_that("check_ped errors when required columns are missing", { ) expect_error( check_ped(write_ped(bad_df), verbose = FALSE), - "Input file is missing required column" + regexp = "missing required column" ) }) -# ============================================================================== -# 11. Capitalization tolerance — mixed caps should be accepted [1] -# ============================================================================== test_that("check_ped accepts mixed-case column names (ID, Male_Parent, Female_Parent)", { ped <- data.frame( ID = c("A", "B", "C"), @@ -303,7 +246,6 @@ test_that("check_ped accepts mixed-case column names (ID, Male_Parent, Female_Pa Female_Parent = c("0", "0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_length(out, 6) expect_true(all(c("id", "male_parent", "female_parent") %in% names(out$corrected_pedigree))) @@ -316,13 +258,9 @@ test_that("check_ped accepts all-uppercase column names", { FEMALE_PARENT = c("0", "0") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_length(out, 6) }) -# ============================================================================== -# 12. Column order tolerance — columns in any order should be accepted [1] -# ============================================================================== test_that("check_ped accepts columns in any order", { ped <- data.frame( female_parent = c("0", "0"), @@ -330,14 +268,10 @@ test_that("check_ped accepts columns in any order", { id = c("A", "B") ) out <- check_ped(write_ped(ped), verbose = FALSE) - expect_length(out, 6) expect_equal(nrow(out$corrected_pedigree), 2) }) -# ============================================================================== -# 13. Seed reproducibility -# ============================================================================== test_that("seed produces reproducible results", { ped <- data.frame( id = c("A", "B", "C"), @@ -347,13 +281,9 @@ test_that("seed produces reproducible results", { f <- write_ped(ped) out1 <- check_ped(f, seed = 42, verbose = FALSE) out2 <- check_ped(f, seed = 42, verbose = FALSE) - expect_identical(out1$corrected_pedigree, out2$corrected_pedigree) }) -# ============================================================================== -# 14. verbose = FALSE suppresses console output -# ============================================================================== test_that("verbose = FALSE suppresses console output", { ped <- data.frame( id = c("A", "B"), @@ -363,9 +293,6 @@ test_that("verbose = FALSE suppresses console output", { expect_silent(check_ped(write_ped(ped), verbose = FALSE)) }) -# ============================================================================== -# 15. Return value is invisible -# ============================================================================== test_that("check_ped returns invisibly", { ped <- data.frame( id = c("A", "B"), @@ -375,30 +302,25 @@ test_that("check_ped returns invisibly", { expect_invisible(check_ped(write_ped(ped), verbose = FALSE)) }) -# ============================================================================== -# 16. No write logic — check_ped does not write files to disk -# ============================================================================== test_that("no output files are written to disk", { tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - ped <- data.frame( id = c("A", "B"), male_parent = c("0", "A"), female_parent = c("0", "0") ) check_ped(write_ped(ped), verbose = FALSE) - expect_length(list.files(tmp_dir), 0) }) # ============================================================================== -# 17. Integration test with bundled fixture file -# Fixture has mixed-case headers (ID, Male_parent, Female_Parent) [1] -# tolower() in check_ped.R normalizes these automatically +# Integration test +# Fixture has sire/dam columns renamed to male_parent/female_parent [1] +# janitor::clean_names() handles any remaining capitalization variants # ============================================================================== test_that("integration test with bundled fixture file", { ped_file <- system.file("check_ped_test.txt", package = "BIGr") @@ -409,429 +331,58 @@ test_that("integration test with bundled fixture file", { expect_length(out, 6) expect_gt(nrow(out$inconsistent_sex_roles), 0) - # inconsistent_sex_roles stores rows where conflicting ID appears AS A PARENT [1] + # inconsistent_sex_roles stores rows where the conflicting ID appears + # AS A PARENT in male_parent or female_parent columns [1] conflicting_ids <- unique(c( out$inconsistent_sex_roles$male_parent, out$inconsistent_sex_roles$female_parent )) expect_true(any(c("grandfather2", "grandfather3") %in% conflicting_ids)) expect_equal(nrow(out$missing_parents), 8) -})# tests/testthat/test-check_ped.R -library(testthat) - -# ------------------------------------------------------------------------------ -# Helper: write a data.frame to a temp tab-separated file and return the path -# ------------------------------------------------------------------------------ -write_ped <- function(df) { - f <- tempfile(fileext = ".txt") - utils::write.table(df, f, sep = "\t", row.names = FALSE, quote = FALSE) - f -} - -context("check_ped – Pedigree Quality Checks") - -# ============================================================================== -# 1. Return structure -# ============================================================================== -test_that("check_ped returns a named list of length 6", { - ped <- data.frame( - id = c("A", "B", "C"), - male_parent = c("0", "A", "A"), - female_parent = c("0", "0", "0") - ) - out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE) - - expect_type(out, "list") - expect_length(out, 6) - expect_named(out, c( - "exact_duplicates", - "conflicting_trios", - "inconsistent_sex_roles", - "missing_parents", - "dependencies", - "corrected_pedigree" - )) }) - # ============================================================================== -# 2. Component types +# In-memory input — data.frame / data.table accepted directly # ============================================================================== -test_that("check_ped report components are data.frames", { + +test_that("check_ped accepts a data.frame directly", { ped <- data.frame( id = c("A", "B", "C"), male_parent = c("0", "A", "A"), female_parent = c("0", "0", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_true(is.data.frame(out$exact_duplicates)) - expect_true(is.data.frame(out$conflicting_trios)) - expect_true(is.data.frame(out$inconsistent_sex_roles)) - expect_true(is.data.frame(out$missing_parents)) - expect_true(is.data.frame(out$dependencies)) - expect_true(is.data.frame(out$corrected_pedigree)) -}) - -# ============================================================================== -# 3. corrected_pedigree column names are lowercase, no row_number leak -# ============================================================================== -test_that("corrected_pedigree has lowercase column names and no row_number", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("0", "A"), - female_parent = c("0", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - + out <- check_ped(ped, verbose = FALSE) + expect_length(out, 6) expect_true(all(c("id", "male_parent", "female_parent") %in% names(out$corrected_pedigree))) - expect_false("row_number" %in% names(out$corrected_pedigree)) }) -# ============================================================================== -# 4. Clean pedigree → empty report (no false positives) -# ============================================================================== -test_that("clean pedigree produces no issues", { - ped <- data.frame( - id = c("G1", "G2", "P1"), - male_parent = c("0", "0", "G1"), - female_parent = c("0", "0", "G2") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_equal(nrow(out$exact_duplicates), 0) - expect_equal(nrow(out$conflicting_trios), 0) - expect_equal(nrow(out$inconsistent_sex_roles), 0) - expect_equal(nrow(out$missing_parents), 0) - expect_equal(nrow(out$dependencies), 0) - expect_equal(nrow(out$corrected_pedigree), 3) -}) - -# ============================================================================== -# 5. Exact duplicate detection -# ============================================================================== -test_that("check_ped detects exact duplicates", { - ped <- data.frame( - id = c("A", "A", "B"), - male_parent = c("0", "0", "A"), - female_parent = c("0", "0", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_equal(nrow(out$exact_duplicates), 2) - expect_true(all(out$exact_duplicates$id == "A")) -}) - -test_that("exact duplicates are collapsed to one row in corrected_pedigree", { - ped <- data.frame( - id = c("A", "A", "B"), - male_parent = c("0", "0", "A"), +test_that("check_ped accepts a data.table directly", { + ped <- data.table::data.table( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), female_parent = c("0", "0", "0") ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_equal(sum(out$corrected_pedigree$id == "A"), 1) -}) - -# ============================================================================== -# 6. Conflicting trios -# ============================================================================== -test_that("check_ped detects conflicting trios", { - ped <- data.frame( - id = c("A", "A", "B"), - male_parent = c("X", "Y", "A"), - female_parent = c("M", "M", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_equal(nrow(out$conflicting_trios), 2) - expect_true(all(out$conflicting_trios$id == "A")) -}) - -test_that("correct_conflicting_trios = TRUE: conflicting field -> '0', consistent kept", { - ped <- data.frame( - id = c("A", "A", "B"), - male_parent = c("X", "Y", "A"), - female_parent = c("M", "M", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE, - correct_conflicting_trios = TRUE) - a_row <- out$corrected_pedigree[out$corrected_pedigree$id == "A", ] - - expect_equal(nrow(a_row), 1) - expect_equal(a_row$male_parent, "0") - expect_equal(a_row$female_parent, "M") -}) - -test_that("correct_conflicting_trios = FALSE leaves conflicting rows as-is", { - ped <- data.frame( - id = c("A", "A", "B"), - male_parent = c("X", "Y", "A"), - female_parent = c("M", "M", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE, - correct_conflicting_trios = FALSE) - - expect_equal(sum(out$corrected_pedigree$id == "A"), 2) -}) - -# ============================================================================== -# 7. Missing parent detection and founder row injection -# ============================================================================== -test_that("check_ped detects missing parents", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("0", "X"), - female_parent = c("0", "Y") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_equal(nrow(out$missing_parents), 2) - expect_true("X" %in% out$missing_parents$id) - expect_true("Y" %in% out$missing_parents$id) -}) - -test_that("missing parents are added as founder rows in corrected_pedigree", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("0", "X"), - female_parent = c("0", "Y") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_true("X" %in% out$corrected_pedigree$id) - expect_true("Y" %in% out$corrected_pedigree$id) - - x_row <- out$corrected_pedigree[out$corrected_pedigree$id == "X", ] - y_row <- out$corrected_pedigree[out$corrected_pedigree$id == "Y", ] - - expect_equal(x_row$male_parent, "0") - expect_equal(x_row$female_parent, "0") - expect_equal(y_row$male_parent, "0") - expect_equal(y_row$female_parent, "0") -}) - -test_that("individual that is its own parent is logged as a dependency", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("A", "0"), - female_parent = c("0", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_gt(nrow(out$dependencies), 0) -}) - -# ============================================================================== -# 8. Inconsistent sex roles -# inconsistent_sex_roles returns the rows where the conflicting ID appears -# AS A PARENT (not the row where it is the subject) [1] -# ============================================================================== -test_that("check_ped detects inconsistent sex roles", { - ped <- data.frame( - id = c("child1", "child2", "P"), - male_parent = c("P", "0", "0"), - female_parent = c("0", "P", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_true("inconsistent_sex_roles" %in% names(out)) - expect_gt(nrow(out$inconsistent_sex_roles), 0) - expect_true(any(out$inconsistent_sex_roles$male_parent == "P" | - out$inconsistent_sex_roles$female_parent == "P")) -}) - -test_that("correct_inconsistent_sex_roles = TRUE zeros out conflicting parent references", { - ped <- data.frame( - id = c("child1", "child2", "P"), - male_parent = c("P", "0", "0"), - female_parent = c("0", "P", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE, - correct_inconsistent_sex_roles = TRUE) - corr <- out$corrected_pedigree - - expect_false(any(corr$male_parent == "P")) - expect_false(any(corr$female_parent == "P")) -}) - -test_that("correct_inconsistent_sex_roles = FALSE leaves conflicting references", { - ped <- data.frame( - id = c("child1", "child2", "P"), - male_parent = c("P", "0", "0"), - female_parent = c("0", "P", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE, - correct_inconsistent_sex_roles = FALSE) - corr <- out$corrected_pedigree - - expect_true(any(corr$male_parent == "P" | corr$female_parent == "P")) -}) - -# ============================================================================== -# 9. Cycle / dependency detection -# ============================================================================== -test_that("check_ped detects a direct two-node cycle", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("B", "A"), - female_parent = c("0", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_gt(nrow(out$dependencies), 0) -}) - -test_that("cycle-involved IDs are still present in corrected_pedigree", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("B", "A"), - female_parent = c("0", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_true("A" %in% out$corrected_pedigree$id) - expect_true("B" %in% out$corrected_pedigree$id) -}) - -# ============================================================================== -# 10. Missing required columns → hard stop -# tolower(names()) in check_ped.R normalizes caps before validation [1] -# so only truly unrecognizable columns trigger this error -# ============================================================================== -test_that("check_ped errors when required columns are missing", { - bad_df <- data.frame( - animal_id = c("a", "b"), - parent1 = c("0", "a"), - parent2 = c("0", "0") - ) - expect_error( - check_ped(write_ped(bad_df), verbose = FALSE), - "Input file is missing required column" - ) -}) - -# ============================================================================== -# 11. Capitalization tolerance — mixed caps should be accepted [1] -# ============================================================================== -test_that("check_ped accepts mixed-case column names (ID, Male_Parent, Female_Parent)", { - ped <- data.frame( - ID = c("A", "B", "C"), - Male_Parent = c("0", "A", "A"), - Female_Parent = c("0", "0", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - + out <- check_ped(ped, verbose = FALSE) expect_length(out, 6) expect_true(all(c("id", "male_parent", "female_parent") %in% names(out$corrected_pedigree))) }) -test_that("check_ped accepts all-uppercase column names", { - ped <- data.frame( - ID = c("A", "B"), - MALE_PARENT = c("0", "A"), - FEMALE_PARENT = c("0", "0") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_length(out, 6) -}) - -# ============================================================================== -# 12. Column order tolerance — columns in any order should be accepted [1] -# ============================================================================== -test_that("check_ped accepts columns in any order", { - ped <- data.frame( - female_parent = c("0", "0"), - male_parent = c("0", "A"), - id = c("A", "B") - ) - out <- check_ped(write_ped(ped), verbose = FALSE) - - expect_length(out, 6) - expect_equal(nrow(out$corrected_pedigree), 2) -}) - -# ============================================================================== -# 13. Seed reproducibility -# ============================================================================== -test_that("seed produces reproducible results", { +test_that("in-memory and file-path inputs produce identical corrected_pedigree", { ped <- data.frame( id = c("A", "B", "C"), male_parent = c("0", "A", "A"), female_parent = c("0", "0", "0") ) - f <- write_ped(ped) - out1 <- check_ped(f, seed = 42, verbose = FALSE) - out2 <- check_ped(f, seed = 42, verbose = FALSE) - - expect_identical(out1$corrected_pedigree, out2$corrected_pedigree) -}) - -# ============================================================================== -# 14. verbose = FALSE suppresses console output -# ============================================================================== -test_that("verbose = FALSE suppresses console output", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("0", "A"), - female_parent = c("0", "0") - ) - expect_silent(check_ped(write_ped(ped), verbose = FALSE)) + out_file <- check_ped(write_ped(ped), verbose = FALSE) + out_mem <- check_ped(ped, verbose = FALSE) + expect_identical(out_file$corrected_pedigree, + out_mem$corrected_pedigree) }) -# ============================================================================== -# 15. Return value is invisible -# ============================================================================== -test_that("check_ped returns invisibly", { - ped <- data.frame( - id = c("A", "B"), - male_parent = c("0", "A"), - female_parent = c("0", "0") - ) - expect_invisible(check_ped(write_ped(ped), verbose = FALSE)) -}) - -# ============================================================================== -# 16. No write logic — check_ped does not write files to disk -# ============================================================================== -test_that("no output files are written to disk", { - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - - ped <- data.frame( - id = c("A", "B"), - male_parent = c("0", "A"), - female_parent = c("0", "0") +test_that("invalid input type raises a descriptive error for check_ped", { + expect_error( + check_ped(list(id = "A"), verbose = FALSE), + regexp = "file path" ) - check_ped(write_ped(ped), verbose = FALSE) - - expect_length(list.files(tmp_dir), 0) -}) - -# ============================================================================== -# 17. Integration test with bundled fixture file -# Fixture has mixed-case headers (ID, Male_parent, Female_Parent) [1] -# tolower() in check_ped.R normalizes these automatically -# ============================================================================== -test_that("integration test with bundled fixture file", { - ped_file <- system.file("check_ped_test.txt", package = "BIGr") - skip_if(ped_file == "", "Bundled fixture file not found; skipping.") - - out <- check_ped(ped_file, seed = 101919, verbose = FALSE) - - expect_length(out, 6) - expect_gt(nrow(out$inconsistent_sex_roles), 0) - - # inconsistent_sex_roles stores rows where conflicting ID appears AS A PARENT [1] - conflicting_ids <- unique(c( - out$inconsistent_sex_roles$male_parent, - out$inconsistent_sex_roles$female_parent - )) - expect_true(any(c("grandfather2", "grandfather3") %in% conflicting_ids)) - expect_equal(nrow(out$missing_parents), 8) }) diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R index bf62952..3fae013 100644 --- a/tests/testthat/test-find_parentage.R +++ b/tests/testthat/test-find_parentage.R @@ -1,11 +1,13 @@ # tests/testthat/test-find_parentage.R # Run with: testthat::test_file("tests/testthat/test-find_parentage.R") + library(testthat) library(data.table) # ============================================================================== # Helpers # ============================================================================== + make_files <- function(genos, parents, progeny, dir = tempdir()) { geno_file <- file.path(dir, paste0("genos_", sample(1e6, 1), ".txt")) parent_file <- file.path(dir, paste0("parents_", sample(1e6, 1), ".txt")) @@ -16,9 +18,12 @@ make_files <- function(genos, parents, progeny, dir = tempdir()) { list(g = geno_file, p = parent_file, pr = progeny_file) } -# Base toy data — lowercase id column -# S1 / D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 -# S2 / D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 +# ============================================================================== +# Base fixtures +# ============================================================================== + +# S1/D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 +# S2/D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 base_genos <- data.table::data.table( id = c("S1", "S2", "D1", "D2", "child1", "child2"), M1 = c(0L, 2L, 0L, 2L, 0L, 2L), @@ -27,8 +32,9 @@ base_genos <- data.table::data.table( M4 = c(0L, 2L, 0L, 2L, 0L, 2L), M5 = c(0L, 2L, 0L, 2L, 0L, 2L) ) + base_parents <- data.table::data.table(id = c("S1", "S2", "D1", "D2"), - sex = c("M", "M", "F", "F")) + sex = c("M", "M", "F", "F")) child1_progeny <- data.table::data.table(id = "child1") child2_progeny <- data.table::data.table(id = "child2") base_progeny <- data.table::data.table(id = c("child1", "child2")) @@ -40,12 +46,13 @@ tied_genos <- data.table::data.table( M2 = c(0L, 0L, 0L, 0L, 0L) ) tied_parents <- data.table::data.table(id = c("S1", "S2", "D1", "D2"), - sex = c("M", "M", "F", "F")) + sex = c("M", "M", "F", "F")) tied_progeny <- data.table::data.table(id = "child_tie") # ============================================================================== # 1. Input validation # ============================================================================== + test_that("invalid method throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( @@ -134,11 +141,11 @@ test_that("missing sex column raises a warning and defaults to ambiguous", { # ============================================================================== # 2. Return structure — named list # ============================================================================== + test_that("find_parentage returns an invisible named list with all required elements", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_type(out, "list") expect_named(out, c("pass", "high_error", "low_markers", "full_results", "plot"), ignore.order = TRUE) @@ -148,7 +155,6 @@ test_that("full_results is a data.table", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_s3_class(out$full_results, "data.table") }) @@ -156,7 +162,6 @@ test_that("full_results has one row per progeny", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(nrow(out$full_results), 1L) }) @@ -164,7 +169,6 @@ test_that("named list subsets together cover all full_results rows", { f <- make_files(base_genos, base_parents, base_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - subset_total <- nrow(out$pass) + nrow(out$high_error) + nrow(out$low_markers) expect_equal(subset_total, nrow(out$full_results)) }) @@ -173,7 +177,6 @@ test_that("plot element is NULL when plot_results = FALSE", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", verbose = FALSE, plot_results = FALSE) - expect_null(out$plot) }) @@ -181,7 +184,6 @@ test_that("best_pair full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_true(all(c("id", "male_parent", "female_parent", "mendelian_error_pct", "markers_tested", "status") %in% names(out$full_results))) @@ -191,7 +193,6 @@ test_that("best_male_parent full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, plot_results = FALSE) - expect_true(all(c("id", "best_match", "mendelian_error_pct", "markers_tested", "status") %in% names(out$full_results))) expect_false("male_parent" %in% names(out$full_results)) @@ -201,7 +202,6 @@ test_that("best_female_parent full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, plot_results = FALSE) - expect_true(all(c("id", "best_match", "mendelian_error_pct", "markers_tested", "status") %in% names(out$full_results))) }) @@ -210,7 +210,6 @@ test_that("best_match full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_match", verbose = FALSE, plot_results = FALSE) - expect_true(all(c("id", "best_match", "mendelian_error_pct", "markers_tested", "status") %in% names(out$full_results))) }) @@ -218,11 +217,11 @@ test_that("best_match full_results has expected lowercase columns", { # ============================================================================== # 3. Biological correctness # ============================================================================== + test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$male_parent, "S1") expect_equal(out$full_results$female_parent, "D1") expect_equal(as.numeric(out$full_results$mendelian_error_pct), 0) @@ -232,7 +231,6 @@ test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { f <- make_files(base_genos, base_parents, child2_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$male_parent, "S2") expect_equal(out$full_results$female_parent, "D2") expect_equal(as.numeric(out$full_results$mendelian_error_pct), 0) @@ -242,7 +240,6 @@ test_that("best_male_parent identifies S1 as best male for child1", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$best_match, "S1") }) @@ -250,7 +247,6 @@ test_that("best_female_parent identifies D1 as best female for child1", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$best_match, "D1") }) @@ -258,7 +254,6 @@ test_that("both child1 and child2 correctly assigned when run together", { f <- make_files(base_genos, base_parents, base_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(nrow(out$full_results), 2L) expect_equal(out$full_results[id == "child1"]$male_parent, "S1") expect_equal(out$full_results[id == "child1"]$female_parent, "D1") @@ -270,7 +265,6 @@ test_that("markers_tested equals number of non-NA marker columns", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$markers_tested, ncol(base_genos) - 1L) }) @@ -278,7 +272,6 @@ test_that("mendelian_error_pct is between 0 and 100", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - pct <- as.numeric(out$full_results$mendelian_error_pct) expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) }) @@ -286,12 +279,12 @@ test_that("mendelian_error_pct is between 0 and 100", { # ============================================================================== # 4. status — lowercase values # ============================================================================== + test_that("status = pass for perfect trio within thresholds", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$status, "pass") }) @@ -300,7 +293,6 @@ test_that("status = low_markers when min_markers exceeds available markers", { out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 99999, error_threshold = 5.0, show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$status, "low_markers") }) @@ -319,7 +311,6 @@ test_that("status = high_error when error rate exceeds threshold", { out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$status, "high_error") }) @@ -328,11 +319,8 @@ test_that("status column is present and lowercase in all methods", { for (m in c("best_pair", "best_male_parent", "best_female_parent", "best_match")) { out <- find_parentage(f$g, f$p, f$pr, method = m, show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - expect_true("status" %in% names(out$full_results), - label = paste("status column present for method", m)) - expect_true(all(out$full_results$status %in% - c("pass", "high_error", "low_markers", NA)), - label = paste("lowercase status values for method", m)) + expect_true("status" %in% names(out$full_results)) + expect_true(all(out$full_results$status %in% c("pass", "high_error", "low_markers", NA))) } }) @@ -342,20 +330,10 @@ test_that("low_markers is flagged for single-parent methods too", { out <- find_parentage(f$g, f$p, f$pr, method = m, min_markers = 99999, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results$status, "low_markers", - label = paste("low_markers for method", m)) + expect_equal(out$full_results$status, "low_markers") } }) -test_that("pass list element contains only pass rows", { - f <- make_files(base_genos, base_parents, base_progeny) - out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - - if (nrow(out$pass) > 0) - expect_true(all(out$pass$status == "pass")) -}) - test_that("high_error list element contains only high_error rows", { high_error_genos <- data.table::data.table( id = c("S1", "D1", "bad_child"), @@ -368,7 +346,6 @@ test_that("high_error list element contains only high_error rows", { out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - if (nrow(out$high_error) > 0) expect_true(all(out$high_error$status == "high_error")) }) @@ -376,6 +353,7 @@ test_that("high_error list element contains only high_error rows", { # ============================================================================== # 5. allow_selfing # ============================================================================== + test_that("allow_selfing = FALSE removes self-pairs from candidates", { ambig_parents <- data.table::data.table(id = c("S1", "D1"), sex = c("A", "A")) f <- make_files(base_genos, ambig_parents, child1_progeny) @@ -392,11 +370,11 @@ test_that("allow_selfing = FALSE removes self-pairs from candidates", { # ============================================================================== # 6. show_ties # ============================================================================== + test_that("show_ties = TRUE produces lowercase suffixed columns when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = TRUE, verbose = FALSE, plot_results = FALSE) - expect_true(any(grepl("^male_parent_", names(out$full_results)))) expect_true(any(grepl("^female_parent_", names(out$full_results)))) }) @@ -418,7 +396,6 @@ test_that("base columns are always populated even when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = TRUE, verbose = FALSE, plot_results = FALSE) - expect_false(is.na(out$full_results$male_parent[1])) expect_false(is.na(out$full_results$female_parent[1])) }) @@ -426,11 +403,16 @@ test_that("base columns are always populated even when ties exist", { # ============================================================================== # 7. verbose # ============================================================================== + test_that("verbose = TRUE returns the result as a named list", { - f <- make_files(base_genos, base_parents, child1_progeny) - out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = TRUE, plot_results = FALSE) + f <- make_files(base_genos, base_parents, child1_progeny) + invisible(capture.output( + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = TRUE, plot_results = FALSE) + )) expect_type(out, "list") + expect_named(out, c("pass", "high_error", "low_markers", "full_results", "plot"), + ignore.order = TRUE) }) test_that("verbose = FALSE suppresses console output", { @@ -444,6 +426,7 @@ test_that("verbose = FALSE suppresses console output", { # ============================================================================== # 8. No write logic — find_parentage does not write files # ============================================================================== + test_that("no output files are written to disk", { tmp_dir <- tempfile() dir.create(tmp_dir) @@ -456,18 +439,17 @@ test_that("no output files are written to disk", { verbose = FALSE, plot_results = FALSE) written_files <- list.files(tmp_dir, pattern = "\\.txt$|\\.jpg$|\\.csv$") - # Only the input files we created should be there expect_equal(length(written_files), 3L) }) # ============================================================================== # 9. Sex-based candidate filtering # ============================================================================== + test_that("best_male_parent only assigns M or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", verbose = FALSE, plot_results = FALSE) - valid_male_parents <- base_parents[sex %in% c("M", "A")]$id expect_true(out$full_results$best_match %in% valid_male_parents) }) @@ -476,7 +458,6 @@ test_that("best_female_parent only assigns F or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", verbose = FALSE, plot_results = FALSE) - valid_female_parents <- base_parents[sex %in% c("F", "A")]$id expect_true(out$full_results$best_match %in% valid_female_parents) }) @@ -485,7 +466,6 @@ test_that("best_pair male slot contains only M or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - valid_males <- base_parents[sex %in% c("M", "A")]$id expect_true(out$full_results$male_parent %in% valid_males) }) @@ -494,7 +474,6 @@ test_that("best_pair female slot contains only F or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", show_ties = FALSE, verbose = FALSE, plot_results = FALSE) - valid_females <- base_parents[sex %in% c("F", "A")]$id expect_true(out$full_results$female_parent %in% valid_females) }) @@ -502,6 +481,7 @@ test_that("best_pair female slot contains only F or A parents", { # ============================================================================== # 10. Edge cases # ============================================================================== + test_that("single progeny individual is handled correctly", { f <- make_files(base_genos, base_parents, child1_progeny) out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", @@ -533,9 +513,9 @@ test_that("multiple progeny are all represented in full_results", { expect_setequal(out$full_results$id, base_progeny$id) }) -test_that("single parent pair (n_pairs = 1) does not cause dimension errors", { +test_that("single parent pair does not cause dimension errors", { single_pair_parents <- data.table::data.table(id = c("S1", "D1"), - sex = c("M", "F")) + sex = c("M", "F")) f <- make_files(base_genos, single_pair_parents, child1_progeny) expect_no_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", @@ -548,14 +528,14 @@ test_that("one row returned per progeny for single-parent methods", { for (m in c("best_male_parent", "best_female_parent", "best_match")) { out <- find_parentage(f$g, f$p, f$pr, method = m, verbose = FALSE, plot_results = FALSE) - expect_equal(nrow(out$full_results), 1L, - label = paste("row count for method", m)) + expect_equal(nrow(out$full_results), 1L) } }) # ============================================================================== # 11. plot element # ============================================================================== + test_that("plot element is a ggplot when plot_results = TRUE", { skip_if_not_installed("ggplot2") f <- make_files(base_genos, base_parents, child1_progeny) @@ -569,6 +549,7 @@ test_that("plot element is a ggplot when plot_results = TRUE", { # ============================================================================== # 12. Return value is invisible # ============================================================================== + test_that("find_parentage returns invisibly", { f <- make_files(base_genos, base_parents, child1_progeny) expect_invisible( @@ -576,3 +557,48 @@ test_that("find_parentage returns invisibly", { verbose = FALSE, plot_results = FALSE) ) }) +# ============================================================================== +# 13. In-memory input — data.frame / data.table accepted directly +# ============================================================================== + +test_that("find_parentage accepts data.table inputs directly", { + out <- find_parentage(base_genos, base_parents, child1_progeny, + method = "best_pair", show_ties = FALSE, + verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 1L) +}) + +test_that("find_parentage accepts data.frame inputs directly", { + out <- find_parentage(as.data.frame(base_genos), + as.data.frame(base_parents), + as.data.frame(child1_progeny), + method = "best_pair", show_ties = FALSE, + verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 1L) +}) + +test_that("in-memory and file-path inputs produce identical results for find_parentage", { + f <- make_files(base_genos, base_parents, child1_progeny) + out_file <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + plot_results = FALSE) + out_mem <- find_parentage(base_genos, base_parents, child1_progeny, + method = "best_pair", show_ties = FALSE, + verbose = FALSE, plot_results = FALSE) + expect_equal(out_file$full_results$male_parent, + out_mem$full_results$male_parent) + expect_equal(out_file$full_results$mendelian_error_pct, + out_mem$full_results$mendelian_error_pct) + expect_equal(out_file$full_results$status, + out_mem$full_results$status) +}) + +test_that("invalid input type raises a descriptive error for find_parentage", { + expect_error( + find_parentage(list(id = "S1"), base_parents, child1_progeny, + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" + ) +}) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index dabaaad..7930ae2 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -1,22 +1,24 @@ # tests/testthat/test-validate_pedigree.R # Run with: testthat::test_file("tests/testthat/test-validate_pedigree.R") + library(testthat) library(data.table) # ============================================================================== # Helpers # ============================================================================== + make_genos <- function() { n_markers <- 20 marker_names <- paste0("M", seq_len(n_markers)) dt <- data.table( id = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), rbind( - rep(0L, n_markers), # IND_A — all ref homozygous - rep(2L, n_markers), # IND_B — all alt homozygous - rep(1L, n_markers), # IND_C — all het (valid child of IND_A x IND_B) - rep(0L, n_markers), # IND_D — impossible child of IND_B x IND_A - rep(0L, n_markers) # IND_E — all ref + rep(0L, n_markers), # IND_A — all ref homozygous + rep(2L, n_markers), # IND_B — all alt homozygous + rep(1L, n_markers), # IND_C — all het: valid child of IND_A x IND_B + rep(0L, n_markers), # IND_D — impossible child of IND_B x IND_A + rep(0L, n_markers) # IND_E — all ref ) ) setnames(dt, c("id", marker_names)) @@ -25,7 +27,7 @@ make_genos <- function() { make_pedigree <- function() { # IND_C: perfect Mendelian child of IND_A x IND_B -> pass - # IND_D: declared parents swapped -> fail + # IND_D: declared parents swapped -> fail data.table( id = c("IND_C", "IND_D"), male_parent = c("IND_A", "IND_B"), @@ -44,24 +46,63 @@ write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { # ============================================================================== # 1. Input validation # ============================================================================== + test_that("trio_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = 150, - verbose = FALSE, plot_results = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = -1, - verbose = FALSE, plot_results = FALSE)) + expect_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = 150, + verbose = FALSE, plot_results = FALSE), + regexp = "trio_error_threshold" + ) + expect_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = -1, + verbose = FALSE, plot_results = FALSE), + regexp = "trio_error_threshold" + ) }) test_that("single_parent_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = 101, - verbose = FALSE, plot_results = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = -5, - verbose = FALSE, plot_results = FALSE)) + expect_error( + validate_pedigree(f$ped, f$genos, single_parent_error_threshold = 101, + verbose = FALSE, plot_results = FALSE), + regexp = "single_parent_error_threshold" + ) + expect_error( + validate_pedigree(f$ped, f$genos, single_parent_error_threshold = -5, + verbose = FALSE, plot_results = FALSE), + regexp = "single_parent_error_threshold" + ) +}) + +test_that("boundary values 0 and 100 are accepted for trio_error_threshold", { + f <- write_temp_files() + expect_no_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = 0, + verbose = FALSE, plot_results = FALSE) + ) + expect_no_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = 100, + verbose = FALSE, plot_results = FALSE) + ) +}) + +test_that("nonexistent pedigree file throws 'Error reading input files'", { + f <- write_temp_files() + expect_error( + validate_pedigree("nonexistent.txt", f$genos, + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" + ) +}) + +test_that("nonexistent genotypes file throws 'Error reading input files'", { + f <- write_temp_files() + expect_error( + validate_pedigree(f$ped, "nonexistent.txt", + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" + ) }) test_that("missing required pedigree column raises an error", { @@ -92,13 +133,21 @@ test_that("all trios with no genotype data stops with an error", { ) }) +test_that("unreadable founders file raises an error", { + f <- write_temp_files() + expect_error( + validate_pedigree(f$ped, f$genos, founders_file = "nonexistent_founders.txt", + verbose = FALSE, plot_results = FALSE) + ) +}) + # ============================================================================== # 2. Return structure — named list # ============================================================================== + test_that("returns an invisible named list with all required elements", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - expect_type(out, "list") expect_named(out, c("pass", "fail", "low_markers", "no_genotype_data", "founders", "missing_parents", "full_results", @@ -106,17 +155,28 @@ test_that("returns an invisible named list with all required elements", { ignore.order = TRUE) }) -test_that("full_results has one row per pedigree entry", { +test_that("validate_pedigree returns invisibly", { + f <- write_temp_files() + expect_invisible( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + ) +}) + +test_that("full_results is a data.table", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") +}) +test_that("full_results has one row per pedigree entry", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) expect_equal(nrow(out$full_results), 2L) }) test_that("full_results has all expected lowercase columns", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - expected_cols <- c( "id", "orig_male_parent", "orig_female_parent", "trio_mendelian_error_pct", "trio_markers_tested", "status", @@ -128,83 +188,99 @@ test_that("full_results has all expected lowercase columns", { expect_true(all(expected_cols %in% names(out$full_results))) }) -test_that("full_results is a data.table", { - f <- write_temp_files() - out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - - expect_s3_class(out$full_results, "data.table") -}) - test_that("corrected_pedigree is a data.table with lowercase columns", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - expect_s3_class(out$corrected_pedigree, "data.table") expect_true(all(c("id", "male_parent", "female_parent") %in% names(out$corrected_pedigree))) }) -test_that("plot element is NULL when plot_results = FALSE", { +test_that("corrected_pedigree has same number of rows as original pedigree", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$corrected_pedigree), nrow(make_pedigree())) +}) +test_that("plot element is NULL when plot_results = FALSE", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) expect_null(out$plot) }) +test_that("plot element is a ggplot when plot_results = TRUE", { + skip_if_not_installed("ggplot2") + f <- write_temp_files() + out <- suppressWarnings( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = TRUE) + ) + expect_s3_class(out$plot, "ggplot") +}) + test_that("named list subsets sum correctly to full_results row count", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - subset_total <- nrow(out$pass) + nrow(out$fail) + nrow(out$low_markers) + nrow(out$no_genotype_data) + nrow(out$founders) + nrow(out$missing_parents) expect_equal(subset_total, nrow(out$full_results)) }) # ============================================================================== -# 3. pass / fail / low_markers / no_data statuses — lowercase +# 3. pass / fail / low_markers statuses # ============================================================================== -test_that("pass trio is correctly identified", { + +test_that("pass trio is correctly identified with 0% error", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - - pass_row <- out$full_results[id == "IND_C"] - expect_equal(nrow(pass_row), 1L) - expect_equal(pass_row$status, "pass") - expect_equal(pass_row$trio_mendelian_error_pct, 0) - expect_equal(pass_row$recommended_correction, "none") + r <- out$full_results[id == "IND_C"] + expect_equal(nrow(r), 1L) + expect_equal(r$status, "pass") + expect_equal(r$trio_mendelian_error_pct, 0) + expect_equal(r$recommended_correction, "none") }) -test_that("fail trio is correctly identified", { +test_that("fail trio is correctly identified with error above threshold", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - - fail_row <- out$full_results[id == "IND_D"] - expect_equal(nrow(fail_row), 1L) - expect_equal(fail_row$status, "fail") - expect_gt(fail_row$trio_mendelian_error_pct, 5.0) + r <- out$full_results[id == "IND_D"] + expect_equal(nrow(r), 1L) + expect_equal(r$status, "fail") + expect_gt(r$trio_mendelian_error_pct, 5.0) }) -test_that("fail trio has remove_male_parent decision and best candidate populated", { +test_that("fail trio has a non-NA recommended_correction", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - - fail_row <- out$full_results[id == "IND_D"] - expect_equal(fail_row$recommended_correction, "remove_male_parent") - expect_false(is.na(fail_row$best_male_candidate)) - expect_true(is.na(fail_row$best_female_candidate)) + r <- out$full_results[id == "IND_D"] + expect_false(is.na(r$recommended_correction)) + expect_false(r$recommended_correction == "none") }) -test_that("trio_mendelian_error_pct is 0 for a perfect trio", { +test_that("fail trio with one acceptable parent gets remove_* correction", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_D"] + expect_true(r$recommended_correction %in% + c("remove_male_parent", "remove_female_parent", "remove_both", + "keep_both")) +}) +test_that("trio_mendelian_error_pct is 0 for a perfect Mendelian trio", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) expect_equal(out$full_results[id == "IND_C"]$trio_mendelian_error_pct, 0) }) -test_that("trio_markers_tested equals number of markers for complete data", { +test_that("trio_mendelian_error_pct is between 0 and 100 for all trios", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + pct <- out$full_results$trio_mendelian_error_pct + expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) +}) +test_that("trio_markers_tested equals number of markers for complete data", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) expect_equal(out$full_results[id == "IND_C"]$trio_markers_tested, 20L) }) @@ -212,7 +288,6 @@ test_that("low_markers status assigned when markers_tested < min_markers", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE, min_markers = 25L) - expect_true(all(out$full_results$status == "low_markers")) expect_true(all(grepl("^low_markers_", out$full_results$recommended_correction))) }) @@ -223,7 +298,6 @@ test_that("NA markers reduce trio_markers_tested and do not cause errors", { genos[id == "IND_C", M2 := NA_integer_] f <- write_temp_files(genos = genos) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results[id == "IND_C"]$trio_markers_tested, 18L) expect_equal(out$full_results[id == "IND_C"]$status, "pass") }) @@ -231,20 +305,38 @@ test_that("NA markers reduce trio_markers_tested and do not cause errors", { test_that("pass list element contains only pass rows", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - - expect_true(all(out$pass$status == "pass")) + if (nrow(out$pass) > 0) + expect_true(all(out$pass$status == "pass")) }) test_that("fail list element contains only fail rows", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + if (nrow(out$fail) > 0) + expect_true(all(out$fail$status == "fail")) +}) - expect_true(all(out$fail$status == "fail")) +test_that("low_markers list element contains only low_markers rows", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + plot_results = FALSE, min_markers = 25L) + if (nrow(out$low_markers) > 0) + expect_true(all(out$low_markers$status == "low_markers")) +}) + +test_that("raising trio_error_threshold turns fail rows into pass rows", { + f <- write_temp_files() + strict <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + verbose = FALSE, plot_results = FALSE) + lenient <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 100.0, + verbose = FALSE, plot_results = FALSE) + expect_gte(nrow(lenient$pass), nrow(strict$pass)) }) # ============================================================================== # 4. missing parent statuses # ============================================================================== + test_that("missing_male_parent status and recommendation are correct", { ped <- rbind(make_pedigree(), data.table(id = "IND_E", male_parent = "0", @@ -252,7 +344,6 @@ test_that("missing_male_parent status and recommendation are correct", { f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) r <- out$full_results[id == "IND_E"] - expect_equal(r$status, "missing_male_parent") expect_equal(r$recommended_correction, "none") expect_false(is.na(r$best_male_candidate)) @@ -266,7 +357,6 @@ test_that("missing_female_parent status and recommendation are correct", { f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) r <- out$full_results[id == "IND_E"] - expect_equal(r$status, "missing_female_parent") expect_equal(r$recommended_correction, "none") expect_true(is.na(r$best_male_candidate)) @@ -275,43 +365,55 @@ test_that("missing_female_parent status and recommendation are correct", { test_that("missing_both_parents status and recommendations are correct", { ped <- rbind(make_pedigree(), - data.table(id = "IND_E", male_parent = "0", female_parent = "0")) + data.table(id = "IND_E", male_parent = "0", + female_parent = "0")) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) r <- out$full_results[id == "IND_E"] - expect_equal(r$status, "missing_both_parents") expect_equal(r$recommended_correction, "none") expect_false(is.na(r$best_male_candidate)) expect_false(is.na(r$best_female_candidate)) }) -test_that("best_male_candidate for missing_male_parent is not the known female parent", { +test_that("best_male_candidate for missing_male_parent excludes the known female parent", { ped <- rbind(make_pedigree(), data.table(id = "IND_E", male_parent = "0", female_parent = "IND_B")) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) r <- out$full_results[id == "IND_E"] - expect_false(r$best_male_candidate == "IND_B") }) -test_that("missing_parents list element contains only missing_* rows", { +test_that("best_female_candidate for missing_female_parent excludes the known male parent", { ped <- rbind(make_pedigree(), - data.table(id = "IND_E", male_parent = "0", female_parent = "0")) + data.table(id = "IND_E", male_parent = "IND_A", + female_parent = "0")) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + expect_false(r$best_female_candidate == "IND_A") +}) - expect_true(all(grepl("^missing_", out$missing_parents$status))) +test_that("missing_parents list element contains only missing_* rows", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "0")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + if (nrow(out$missing_parents) > 0) + expect_true(all(grepl("^missing_", out$missing_parents$status))) }) # ============================================================================== # 5. founders status # ============================================================================== -test_that("founders status is assigned when ID in founders list with 0 0 parents", { + +test_that("founders status is assigned when ID is in founders list with 0 0 parents", { ped <- rbind(make_pedigree(), - data.table(id = "IND_A", male_parent = "0", female_parent = "0")) + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) founders_file <- tempfile(fileext = ".txt") fwrite(data.table(id = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) @@ -319,7 +421,6 @@ test_that("founders status is assigned when ID in founders list with 0 0 parents out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, verbose = FALSE, plot_results = FALSE) r <- out$full_results[id == "IND_A"] - expect_equal(r$status, "founders") expect_equal(r$recommended_correction, "none") expect_true(is.na(r$best_male_candidate)) @@ -328,42 +429,60 @@ test_that("founders status is assigned when ID in founders list with 0 0 parents test_that("founders list element contains only founders rows", { ped <- rbind(make_pedigree(), - data.table(id = "IND_A", male_parent = "0", female_parent = "0")) + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) founders_file <- tempfile(fileext = ".txt") fwrite(data.table(id = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, verbose = FALSE, plot_results = FALSE) - - expect_true(all(out$founders$status == "founders")) + if (nrow(out$founders) > 0) + expect_true(all(out$founders$status == "founders")) }) test_that("non-founder rows still evaluated normally when founders file is supplied", { ped <- rbind(make_pedigree(), - data.table(id = "IND_A", male_parent = "0", female_parent = "0")) + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) founders_file <- tempfile(fileext = ".txt") fwrite(data.table(id = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results[id == "IND_C"]$status, "pass") + expect_equal(out$full_results[id == "IND_D"]$status, "fail") }) test_that("0 0 parents NOT in founders list get missing_both_parents", { ped <- rbind(make_pedigree(), - data.table(id = "IND_E", male_parent = "0", female_parent = "0")) + data.table(id = "IND_E", male_parent = "0", + female_parent = "0")) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results[id == "IND_E"]$status, "missing_both_parents") }) +test_that("founder row does not appear in pass, fail, or missing_parents", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(id = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + expect_false("IND_A" %in% out$pass$id) + expect_false("IND_A" %in% out$fail$id) + expect_false("IND_A" %in% out$missing_parents$id) +}) + # ============================================================================== # 6. no_genotype_data status # ============================================================================== + test_that("no_genotype_data is flagged for progeny absent from genotype file", { ped <- rbind(make_pedigree(), data.table(id = "GHOST", male_parent = "IND_A", @@ -371,7 +490,6 @@ test_that("no_genotype_data is flagged for progeny absent from genotype file", { f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) r <- out$full_results[id == "GHOST"] - expect_equal(nrow(r), 1L) expect_equal(r$status, "no_genotype_data") expect_equal(r$recommended_correction, "none") @@ -384,20 +502,18 @@ test_that("no_genotype_data rows have NA/0 for all analysis columns", { f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) r <- out$full_results[id == "GHOST"] - expect_true(is.na(r$trio_mendelian_error_pct)) expect_equal(r$trio_markers_tested, 0L) expect_true(is.na(r$best_male_candidate)) expect_true(is.na(r$best_female_candidate)) }) -test_that("no_genotype_data flagged when declared parent absent from genotype file", { +test_that("no_genotype_data flagged when a declared parent is absent from genotype file", { ped <- rbind(make_pedigree(), data.table(id = "IND_C_GHOST", male_parent = "GHOST_DAD", female_parent = "IND_B")) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results[id == "IND_C_GHOST"]$status, "no_genotype_data") }) @@ -407,8 +523,8 @@ test_that("no_genotype_data list element contains only no_genotype_data rows", { female_parent = "IND_B")) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - - expect_true(all(out$no_genotype_data$status == "no_genotype_data")) + if (nrow(out$no_genotype_data) > 0) + expect_true(all(out$no_genotype_data$status == "no_genotype_data")) }) test_that("valid trios still evaluated correctly when ghost rows are present", { @@ -417,7 +533,6 @@ test_that("valid trios still evaluated correctly when ghost rows are present", { female_parent = "IND_B")) f <- write_temp_files(ped = ped) out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - expect_equal(out$full_results[id == "IND_C"]$status, "pass") expect_equal(out$full_results[id == "IND_D"]$status, "fail") }) @@ -425,34 +540,52 @@ test_that("valid trios still evaluated correctly when ghost rows are present", { # ============================================================================== # 7. corrected_pedigree contents # ============================================================================== + test_that("corrected_pedigree: pass parents are unchanged", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) corr <- out$corrected_pedigree - expect_equal(as.character(corr[id == "IND_C"]$male_parent), "IND_A") expect_equal(as.character(corr[id == "IND_C"]$female_parent), "IND_B") }) -test_that("corrected_pedigree: bad parent set to 0 for fail trio with remove_male_parent", { +test_that("corrected_pedigree: removed parent set to 0 for remove_male_parent", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_D"] corr <- out$corrected_pedigree + if (r$recommended_correction == "remove_male_parent") { + expect_equal(corr[id == "IND_D"]$male_parent, "0") + } +}) - expect_equal(corr[id == "IND_D"]$male_parent, "0") - expect_equal(corr[id == "IND_D"]$female_parent, "IND_A") +test_that("corrected_pedigree: removed parent set to 0 for remove_female_parent", { + # construct a trio where IND_A (all 0) is correct male and female is wrong + genos <- make_genos() + ped <- data.table(id = "IND_E", male_parent = "IND_A", + female_parent = "IND_B") + # IND_E is all ref (0); IND_A is all ref (0); IND_B is all alt (2) + # IND_E as child of IND_A x IND_B is impossible → remove_female_parent + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + corr <- out$corrected_pedigree + r <- out$full_results[id == "IND_E"] + if (r$recommended_correction == "remove_female_parent") { + expect_equal(corr[id == "IND_E"]$female_parent, "0") + expect_false(corr[id == "IND_E"]$male_parent == "0") + } }) -test_that("corrected_pedigree has same number of rows as original pedigree", { +test_that("corrected_pedigree preserves id column values", { f <- write_temp_files() out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - - expect_equal(nrow(out$corrected_pedigree), nrow(make_pedigree())) + expect_setequal(out$corrected_pedigree$id, make_pedigree()$id) }) # ============================================================================== -# 8. No write logic — functions do not write files +# 8. No write logic — function does not write files # ============================================================================== + test_that("no output files are written to disk", { f <- write_temp_files() tmp_dir <- tempfile() @@ -460,36 +593,157 @@ test_that("no output files are written to disk", { old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) - written_files <- list.files(tmp_dir) expect_length(written_files, 0) }) # ============================================================================== -# 9. verbose and plot_results +# 9. verbose # ============================================================================== + test_that("verbose = FALSE suppresses console output", { f <- write_temp_files() - expect_silent(validate_pedigree(f$ped, f$genos, verbose = FALSE, - plot_results = FALSE)) + expect_silent( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + ) }) -test_that("plot element is a ggplot when plot_results = TRUE", { - skip_if_not_installed("ggplot2") - f <- write_temp_files() - out <- suppressWarnings( - validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = TRUE) - ) - expect_s3_class(out$plot, "ggplot") +test_that("verbose = TRUE returns valid named list without error", { + f <- write_temp_files() + invisible(capture.output( + out <- validate_pedigree(f$ped, f$genos, verbose = TRUE, plot_results = FALSE) + )) + expect_type(out, "list") + expect_named(out, c("pass", "fail", "low_markers", "no_genotype_data", + "founders", "missing_parents", "full_results", + "corrected_pedigree", "plot"), + ignore.order = TRUE) }) # ============================================================================== -# 10. Return value is invisible +# 10. Mendelian error correctness # ============================================================================== -test_that("validate_pedigree returns invisibly", { + +test_that("0x0 parents produce 0% error for dosage-0 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 0L, 0L), M2 = c(0L, 0L, 0L), M3 = c(0L, 0L, 0L), + M4 = c(0L, 0L, 0L), M5 = c(0L, 0L, 0L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 0) +}) + +test_that("2x2 parents produce 0% error for dosage-2 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(2L, 2L, 2L), M2 = c(2L, 2L, 2L), M3 = c(2L, 2L, 2L), + M4 = c(2L, 2L, 2L), M5 = c(2L, 2L, 2L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 0) +}) + +test_that("0x0 parents produce 100% error for dosage-2 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 0L, 2L), M2 = c(0L, 0L, 2L), M3 = c(0L, 0L, 2L), + M4 = c(0L, 0L, 2L), M5 = c(0L, 0L, 2L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 100) +}) + +test_that("0x2 parents produce 0% error for dosage-1 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 2L, 1L), M2 = c(0L, 2L, 1L), M3 = c(0L, 2L, 1L), + M4 = c(0L, 2L, 1L), M5 = c(0L, 2L, 1L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 0) +}) + +test_that("0x2 parents produce 100% error for dosage-0 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 2L, 0L), M2 = c(0L, 2L, 0L), M3 = c(0L, 2L, 0L), + M4 = c(0L, 2L, 0L), M5 = c(0L, 2L, 0L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 100) +}) +# ============================================================================== +# 11. In-memory input — data.frame / data.table accepted directly +# ============================================================================== + +test_that("validate_pedigree accepts a data.table pedigree directly", { + f <- write_temp_files() + ped <- make_pedigree() + out <- validate_pedigree(ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) +}) + +test_that("validate_pedigree accepts a data.table genotypes object directly", { + f <- write_temp_files() + genos <- make_genos() + out <- validate_pedigree(f$ped, genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) +}) + +test_that("validate_pedigree accepts both inputs as data.tables directly", { + ped <- make_pedigree() + genos <- make_genos() + out <- validate_pedigree(ped, genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) +}) + +test_that("validate_pedigree accepts a data.frame pedigree directly", { + f <- write_temp_files() + ped <- as.data.frame(make_pedigree()) + out <- validate_pedigree(ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) +}) + +test_that("in-memory and file-path inputs produce identical results for validate_pedigree", { + f <- write_temp_files() + ped <- make_pedigree() + genos <- make_genos() + out_file <- validate_pedigree(f$ped, f$genos, + verbose = FALSE, plot_results = FALSE) + out_memory <- validate_pedigree(ped, genos, + verbose = FALSE, plot_results = FALSE) + expect_equal(out_file$full_results$status, + out_memory$full_results$status) + expect_equal(out_file$full_results$trio_mendelian_error_pct, + out_memory$full_results$trio_mendelian_error_pct) +}) + +test_that("invalid input type raises an error for validate_pedigree", { f <- write_temp_files() - expect_invisible(validate_pedigree(f$ped, f$genos, verbose = FALSE, - plot_results = FALSE)) + expect_error( + validate_pedigree(list(id = "IND_C"), f$genos, + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" + ) })