diff --git a/.Rbuildignore b/.Rbuildignore index 307d5d0..50c7896 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,3 +10,5 @@ ^package-code\.Rmd$ ^docs$ ^CRAN-SUBMISSION$ +^\.positai$ +^\.claude$ diff --git a/.gitignore b/.gitignore index d3ffaad..e056686 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata revdep/ .DS_Store +.positai diff --git a/DESCRIPTION b/DESCRIPTION index 0524f2f..b71abd1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,6 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: @@ -74,3 +73,4 @@ Suggests: polyRAD, testthat (>= 3.0.0) RdMacros: Rdpack +Config/roxygen2/version: 8.0.0 diff --git a/NAMESPACE b/NAMESPACE index c919736..4d25dfe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,8 +23,8 @@ export(merge_MADCs) export(solve_composition_poly) export(thinSNP) export(updog2vcf) +export(validate_pedigree) import(dplyr) -import(janitor) import(parallel) import(quadprog) import(stringr) @@ -40,21 +40,37 @@ importFrom(data.table,as.data.table) importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,fread) -importFrom(data.table,fwrite) +importFrom(data.table,is.data.table) importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) +importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) +importFrom(dplyr,distinct) importFrom(dplyr,filter) +importFrom(dplyr,first) importFrom(dplyr,group_by) importFrom(dplyr,group_modify) +importFrom(dplyr,if_else) importFrom(dplyr,mutate) +importFrom(dplyr,n) +importFrom(dplyr,n_distinct) +importFrom(dplyr,row_number) importFrom(dplyr,select) importFrom(dplyr,summarise) +importFrom(dplyr,summarize) importFrom(dplyr,ungroup) importFrom(dplyr,where) +importFrom(ggplot2,aes) +importFrom(ggplot2,element_text) +importFrom(ggplot2,geom_bar) +importFrom(ggplot2,ggplot) +importFrom(ggplot2,labs) +importFrom(ggplot2,theme) +importFrom(ggplot2,theme_minimal) +importFrom(janitor,clean_names) importFrom(pwalign,nucleotideSubstitutionMatrix) importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) diff --git a/R/breedtools_functions.R b/R/breedtools_functions.R index 5f21836..52a2d85 100644 --- a/R/breedtools_functions.R +++ b/R/breedtools_functions.R @@ -1,149 +1,129 @@ -#' Compute Allele Frequencies for Populations +#' Compute allele frequencies for populations #' -#' Computes allele frequencies for specified populations given SNP array data +#' Computes allele frequencies for specified populations from SNP array data +#' coded as dosage of allele B. #' -#' @param geno matrix of genotypes coded as the dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' with individuals in rows (named) and SNPs in columns (named) -#' @param populations list of named populations. Each population has a vector of IDs -#' that belong to the population. Allele frequencies will be derived from all animals -#' @param ploidy integer indicating the ploidy level (default is 2 for diploid) -#' @return data.frame consisting of allele_frequencies for populations (columns) for -#' each SNP (rows) -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +#' @param geno Numeric matrix of genotypes coded as dosage of allele B +#' \code{{0, 1, 2, ..., ploidy}}, with individuals in rows (named) and +#' SNPs in columns (named). +#' @param populations Named list of populations, each containing a character +#' vector of individual IDs belonging to that population. +#' @param ploidy Integer. Ploidy level of the species. Default is `2`. +#' +#' @return A data frame of allele frequencies with SNPs as rows and populations +#' as columns. +#' +#' @author Josué Chinchilla-Vargas +#' +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. #' #' @examples -#' # Example inputs #' geno_matrix <- matrix( -#' c(4, 1, 4, 0, # S1 -#' 2, 2, 1, 3, # S2 -#' 0, 4, 0, 4, # S3 -#' 3, 3, 2, 2, # S4 -#' 1, 4, 2, 3),# S5 -#' nrow = 4, ncol = 5, byrow = FALSE, # individuals=rows, SNPs=cols -#' dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) +#' c(4, 1, 4, 0, +#' 2, 2, 1, 3, +#' 0, 4, 0, 4, +#' 3, 3, 2, 2, +#' 1, 4, 2, 3), +#' nrow = 4, ncol = 5, byrow = FALSE, +#' dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) #' ) -#' -#'pop_list <- list( -#' PopA = c("Ind1", "Ind2"), -#' PopB = c("Ind3", "Ind4") +#' pop_list <- list( +#' PopA = c("Ind1", "Ind2"), +#' PopB = c("Ind3", "Ind4") #' ) -#' #' allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) #' print(allele_freqs) #' #' @export allele_freq_poly <- function(geno, populations, ploidy = 2) { - - # Initialize returned df + # Initialize result matrix X <- matrix(NA, nrow = ncol(geno), ncol = length(populations)) - - # Subset geno into different populations for (i in 1:length(populations)) { - - # Get name of ith item in the list (population name) pop_name <- names(populations[i]) - - # Subset geno to only include genotypes of IDs in pop - pop_geno <- geno[rownames(geno) %in% populations[[i]], ] - - # Calculate allele frequencies - al_freq <- colMeans(pop_geno, na.rm = TRUE) / ploidy - - # Add to X - X[, i] <- al_freq + pop_geno <- geno[base::rownames(geno) %in% populations[[i]], ] + al_freq <- base::colMeans(pop_geno, na.rm = TRUE) / ploidy + X[, i] <- al_freq } - - # Label X with populations and SNPs - colnames(X) <- names(populations) - rownames(X) <- colnames(geno) - + base::colnames(X) <- base::names(populations) + base::rownames(X) <- base::colnames(geno) return(X) } +#' Solve breed composition for a single animal via quadratic programming +#' +#' Internal helper that solves the constrained OLS problem for one animal, +#' returning breed proportion estimates and the R² of the fit. #' -#' Performs whole genome breed composition prediction. +#' @param Y Numeric vector of genotypes (named by SNP) for a single animal, +#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}. +#' @param X Numeric matrix of allele frequencies with SNPs as rows and +#' breeds/populations as columns. +#' +#' @return A named numeric vector of breed proportions plus `R2`. +#' +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. #' -#' @param Y numeric vector of genotypes (with names as SNPs) from a single animal. -#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' @param X numeric matrix of allele frequencies from reference animals -#' @param p numeric indicating number of breeds represented in X -#' @param names character names of breeds -#' @return data.frame of breed composition estimates #' @import quadprog #' @importFrom stats cor -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. #' #' @noRd QPsolve <- function(Y, X) { - - # Remove NAs from Y and remove corresponding - # SNPs from X. Ensure Y is numeric - Ymod <- Y[!is.na(Y)] - Xmod <- X[names(Ymod), ] - - # Determine properties from X matrix - the number of parameters (breeds) p - # and the names of those parameters. - p <- ncol(X) - names <- colnames(X) - - # perfom steps needed to solve OLS by framing - # as a QP problem - # Rinv - matrix should be of dimensions px(p+1) where p is the number of variables in X - Rinv <- solve(chol(t(Xmod) %*% Xmod)) - - # C - the first column is a sum restriction (all equal to 1) and the rest of the columns an identity matrix - C <- cbind(rep(1, p), diag(p)) - - # b2 - This should be a vector of length p+1 the first element is the value of the sum (1) - # the other elements are the restriction of individual coefficients (>) - # so a value 0 produces positive coefficients - b2 <- c(1, rep(0, p)) - - # dd - this should be a matrix NOT a vector - dd <- (t(Ymod) %*% Xmod) - - qp <- solve.QP(Dmat = Rinv, factorized = TRUE, dvec = dd, Amat = C, bvec = b2, meq = 1) - beta <- qp$solution - rr <- cor(Ymod, Xmod %*% beta)^2 + Ymod <- Y[!base::is.na(Y)] + Xmod <- X[base::names(Ymod), ] + p <- base::ncol(X) + nms <- base::colnames(X) + Rinv <- base::solve(base::chol(base::t(Xmod) %*% Xmod)) + C <- base::cbind(base::rep(1, p), base::diag(p)) + b2 <- c(1, base::rep(0, p)) + dd <- base::t(Ymod) %*% Xmod + qp <- quadprog::solve.QP(Dmat = Rinv, factorized = TRUE, dvec = dd, + Amat = C, bvec = b2, meq = 1) + beta <- qp$solution + rr <- stats::cor(Ymod, Xmod %*% beta)^2 result <- c(beta, rr) - names(result) <- c(names, "R2") + base::names(result) <- c(nms, "R2") return(result) } -#' Compute Genome-Wide Breed Composition -#' -#' Computes genome-wide breed/ancestry composition using quadratic programming on a -#' batch of animals. -#' -#' @param Y numeric matrix of genotypes (columns) from all animals (rows) in population -#' coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}} -#' @param X numeric matrix of allele frequencies (rows) from each reference panel (columns). Frequencies are -#' relative to allele B. -#' @param ped data.frame giving pedigree information. Must be formatted "ID", "Sire", "Dam" -#' @param groups list of IDs categorized by breed/population. If specified, output will be a list -#' of results categorized by breed/population. -#' @param mia logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing the inferred maternally inherited allele for each locus for each animal instead -#' of breed composition results. -#' @param sire logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing sire genotypes for each locus for each animal instead of breed composition results. -#' @param dam logical. Only applies if ped argument is supplied. If true, returns a data.frame -#' containing dam genotypes for each locus for each animal instead of breed composition results. -#' @param ploidy integer. The ploidy level of the species (e.g., 2 for diploid, 3 for triploid, etc.). -#' @return A data.frame or list of data.frames (if groups is !NULL) with breed/ancestry composition -#' results -#' @import quadprog -#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -#' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +#' Compute genome-wide breed composition #' -#' @examples -#' # Example inputs for solve_composition_poly (ploidy = 4) +#' Estimates genome-wide breed/ancestry composition for a batch of animals +#' using quadratic programming, with optional pedigree-assisted and +#' grouped-output modes. +#' +#' @param Y Numeric matrix of genotypes with individuals as rows and SNPs as +#' columns, coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}. +#' @param X Numeric matrix of allele frequencies with SNPs as rows and +#' breeds/populations as columns. +#' @param ped Optional data frame with pedigree information formatted with +#' columns `ID`, `Sire`, and `Dam`. When supplied, `QPsolve_par` is used +#' and only animals with genotyped parents are included. +#' @param groups Optional named list of IDs grouped by breed/population. +#' When supplied, results are returned as a named list partitioned by group. +#' @param mia Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' the inferred maternally inherited allele per locus per animal. Default `FALSE`. +#' @param sire Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' sire genotypes per locus per animal. Default `FALSE`. +#' @param dam Logical. Only applies when `ped` is supplied. If `TRUE`, returns +#' dam genotypes per locus per animal. Default `FALSE`. +#' @param ploidy Integer. Ploidy level of the species. Default is `2`. +#' +#' @return A data frame, or a named list of data frames when `groups` is +#' supplied, containing breed/ancestry composition estimates. +#' +#' @author Josué Chinchilla-Vargas #' -#' # (This would typically be the output from allele_freq_poly) +#' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +#' Estimation of genome-wide and locus-specific breed composition in pigs. +#' _Transl Anim Sci._ 2017;1(1):36–44. +#' +#' @examples #' allele_freqs_matrix <- matrix( #' c(0.625, 0.500, #' 0.500, 0.500, @@ -153,70 +133,50 @@ QPsolve <- function(Y, X) { #' nrow = 5, ncol = 2, byrow = TRUE, #' dimnames = list(paste0("SNP", 1:5), c("VarA", "VarB")) #' ) -#' -#' # Validation Genotypes (individuals x SNPs) #' val_geno_matrix <- matrix( -#' c(2, 1, 2, 3, 4, # Test1 dosages for SNP1-5 -#' 3, 4, 2, 3, 0), # Test2 dosages for SNP1-5 +#' c(2, 1, 2, 3, 4, +#' 3, 4, 2, 3, 0), #' nrow = 2, ncol = 5, byrow = TRUE, #' dimnames = list(paste0("Test", 1:2), paste0("SNP", 1:5)) #' ) -#' -#' # Calculate Breed Composition #' composition <- solve_composition_poly(Y = val_geno_matrix, #' X = allele_freqs_matrix, #' ploidy = 4) #' print(composition) #' +#' @import quadprog +#' #' @export solve_composition_poly <- function(Y, X, - ped = NULL, + ped = NULL, groups = NULL, - mia = FALSE, - sire = FALSE, - dam = FALSE, + mia = FALSE, + sire = FALSE, + dam = FALSE, ploidy = 2) { + # Transpose so Y is SNPs x animals, as required internally + Y <- base::t(Y) + # Retain only SNPs present in X + Y <- Y[base::rownames(Y) %in% base::rownames(X), ] + + if (!base::is.null(ped)) { + # Pedigree-assisted mode: use QPsolve_par for animals with genotyped parents + mat_results <- base::lapply(base::colnames(Y), QPsolve_par, Y, X, ped, + mia = mia, sire = sire, dam = dam) + mat_results_tab <- base::do.call(rbind, mat_results) + return(mat_results_tab) + + } else if (!base::is.null(groups)) { + # Grouped mode: standard computation, results partitioned by group + Y <- Y / ploidy + grouped_results <- base::lapply(groups, QPseparate, Y, X) + return(grouped_results) - # Functions require Y to be animals x SNPs. Transpose - Y <- t(Y) - - # SNPs in Y should only be the ones present in X - Y <- Y[rownames(Y) %in% rownames(X), ] - - # If ped is supplied, use QPsolve_par to compute genomic composition using - # only animals who have genotyped parents (by incorporating Sire genotype). - if (!is.null(ped)) { - mat_results <- lapply(colnames(Y), - QPsolve_par, - Y, - X, - ped, - mia = mia, - sire = sire, - dam = dam) - - mat_results_tab <- do.call(rbind, mat_results) - return (mat_results_tab) - - # Else if groups supplied - perform regular genomic computation - # and list results by groups - } else if (!is.null(groups)) { - - # When using regular genomic computation - adjust dosage based on ploidy - Y <- Y / ploidy #(default is 2) - - grouped_results <- lapply(groups, QPseparate, Y, X) - return (grouped_results) - - # If neither using the ped or grouping option - just perform normal, unsegregated - # calculation } else { - - # Adjust dosage based on ploidy (default is 2) - Y <- Y / ploidy - - results <- t(apply(Y, 2, QPsolve, X)) - return (results) + # Standard mode: unsegregated computation across all animals + Y <- Y / ploidy + results <- base::t(base::apply(Y, 2, QPsolve, X)) + return(results) } } diff --git a/R/check_ped.R b/R/check_ped.R index 0ebbd5a..259b696 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,138 +1,201 @@ -#' Check a pedigree file for accuracy and report/correct common errors +#' Check and Correct Common Pedigree Errors #' -#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) -#' and performs quality checks, optionally correcting or flagging errors. +#' Reads a 3-column pedigree file (id, male_parent, female_parent) and performs +#' quality checks, optionally correcting detected errors. Exact duplicates and +#' missing parents are always corrected. Conflicting trios and inconsistent sex +#' roles are corrected when their respective arguments are TRUE. Cycles are +#' reported only and must be resolved manually. #' -#' The function checks for: -#' * Exact duplicate rows and removes them (keeping one copy) -#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") -#' * IDs that appear in both sire and dam columns -#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" -#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant +#' @param ped.file Path to the pedigree text file (TSV/CSV/TXT), OR a +#' data.frame / data.table with columns: id, male_parent, female_parent. +#' @param seed Optional integer seed for reproducibility. +#' @param verbose Logical. If TRUE (default), prints the report to the console. +#' @param correct_conflicting_trios Logical. If TRUE (default), sets conflicting +#' male_parent and female_parent to 0 and collapses to one row per ID. +#' @param correct_inconsistent_sex_roles Logical. If TRUE (default), sets +#' male_parent and female_parent to 0 for rows involving IDs found as both, +#' then removes any resulting exact duplicates. #' -#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -#' -#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: -#' * Corrected pedigree as a dataframe in the global environment -#' * A report listing all detected issues -#' -#' @param ped.file Path to the pedigree text file. -#' @param seed Optional seed for reproducibility. -#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. -#' -#' @return A list of data.frames containing detected issues: -#' * `exact_duplicates`: rows that were exact duplicates -#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam -#' * `messy_parents`: IDs appearing as both sire and dam -#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam -#' * `dependencies`: detected cycles in the pedigree +#' @return An invisible named list of data frames: +#' \describe{ +#' \item{exact_duplicates}{Exact duplicate rows found in the input.} +#' \item{conflicting_trios}{IDs with conflicting male_parent or female_parent assignments.} +#' \item{inconsistent_sex_roles}{Rows where a conflicting ID appears as male_parent or female_parent.} +#' \item{missing_parents}{Parent IDs absent from id, added as founders.} +#' \item{dependencies}{Cycles detected in the pedigree. Must be resolved manually.} +#' \item{corrected_pedigree}{Corrected pedigree table.} +#' } #' #' @examples #' ped_file <- system.file("check_ped_test.txt", package = "BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) #' -#' # Access messy parents -#' ped_errors$messy_parents +#' # Also accepts a data.table directly +#' library(data.table) +#' ped_dt <- data.table(id = c("A","B","C"), +#' male_parent = c("0","0","A"), +#' female_parent = c("0","0","B")) +#' ped_errors <- check_ped(ped.file = ped_dt, verbose = FALSE) #' -#' # IDs with messy parents -#' messy_ids <- ped_errors$messy_parents$id -#' print(messy_ids) +#' @author Josue Chinchilla-Vargas #' -#' @import dplyr -#' @import janitor +#' @importFrom dplyr %>% mutate filter group_by ungroup summarize distinct bind_rows select first n n_distinct if_else row_number #' @importFrom stats setNames #' @importFrom utils read.table +#' @importFrom janitor clean_names #' @export -check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { +check_ped <- function(ped.file, + seed = NULL, + verbose = TRUE, + correct_conflicting_trios = TRUE, + correct_inconsistent_sex_roles = TRUE) { #### setup #### if (!is.null(seed)) set.seed(seed) - # Read and clean data - data <- utils::read.table(ped.file, header = TRUE) %>% - janitor::clean_names() %>% - mutate( - id = as.character(id), - sire = as.character(sire), - dam = as.character(dam) + # Accept file path OR in-memory data.frame / data.table + if (is.character(ped.file) && length(ped.file) == 1 && file.exists(ped.file)) { + data <- utils::read.table(ped.file, header = TRUE) + data <- janitor::clean_names(data) + } else if (is.data.frame(ped.file) || data.table::is.data.table(ped.file)) { + data <- as.data.frame(ped.file) + data <- janitor::clean_names(data) + } else { + stop("ped.file must be a valid file path (character) or a data.frame / data.table.") + } + + required_cols <- c("id", "male_parent", "female_parent") + missing_cols <- setdiff(required_cols, colnames(data)) + if (length(missing_cols) > 0) { + stop( + "Input is missing required column(s): ", + paste(missing_cols, collapse = ", "), + ".\nExpected columns: id, male_parent, female_parent." + ) + } + extra_cols <- setdiff(names(data), required_cols) + data <- data[, c(required_cols, extra_cols)] + + data <- data %>% + dplyr::mutate( + id = as.character(id), + male_parent = as.character(male_parent), + female_parent = as.character(female_parent) ) - original_data <- data - errors <- list() - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + data <- data %>% dplyr::mutate(row_number = dplyr::row_number(), .before = id) - #### check 1: exact duplicates #### - exact_duplicates <- data[duplicated(data), ] + errors <- list() + missing_parents <- data.frame( + row_number = integer(), + id = character(), + male_parent = character(), + female_parent = character(), + stringsAsFactors = FALSE + ) + + #### check 1: exact duplicates (always fixed) #### + exact_duplicates <- data[ + duplicated(data %>% dplyr::select(-row_number)) | + duplicated(data %>% dplyr::select(-row_number), fromLast = TRUE), + ] if (nrow(exact_duplicates) > 0) { - data <- distinct(data) # remove exact duplicates + data <- data %>% + dplyr::select(-row_number) %>% + dplyr::distinct() %>% + dplyr::mutate(row_number = dplyr::row_number(), .before = id) } - #### check 2: repeated IDs with conflicting sire/dam #### + #### check 2: conflicting trios #### repeated_ids <- data %>% - group_by(id) %>% - filter(n() > 1) %>% - ungroup() - - # Only IDs with actual conflicting sire/dam - conflicting_ids <- repeated_ids %>% - group_by(id) %>% - filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% - ungroup() - - if (nrow(conflicting_ids) > 0) { - # Keep one row per ID, set sire/dam to "0" + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::ungroup() + + conflicting_trios_ids <- repeated_ids %>% + dplyr::group_by(id) %>% + dplyr::filter(dplyr::n_distinct(male_parent) > 1 | + dplyr::n_distinct(female_parent) > 1) %>% + dplyr::ungroup() + + if (correct_conflicting_trios && nrow(conflicting_trios_ids) > 0) { data <- data %>% - group_by(id) %>% - summarize( - sire = if(n_distinct(sire) > 1) "0" else first(sire), - dam = if(n_distinct(dam) > 1) "0" else first(dam), + dplyr::group_by(id) %>% + dplyr::summarize( + row_number = dplyr::first(row_number), + male_parent = if (dplyr::n_distinct(male_parent) > 1) "0" else dplyr::first(male_parent), + female_parent = if (dplyr::n_distinct(female_parent) > 1) "0" else dplyr::first(female_parent), .groups = "drop" - ) + ) %>% + dplyr::select(row_number, id, male_parent, female_parent) } - repeated_ids_report <- conflicting_ids + conflicting_trios <- conflicting_trios_ids - #### check 3: missing parents #### + #### check 3: missing parents (always fixed) #### for (i in seq_len(nrow(data))) { - id <- data$id[i] - sire <- data$sire[i] - dam <- data$dam[i] - - if (sire != "0" && sire != id && !sire %in% data$id) { - missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) + id <- data$id[i] + male_parent <- data$male_parent[i] + female_parent <- data$female_parent[i] + + if (male_parent != "0" && male_parent != id && !male_parent %in% data$id) { + missing_parents <- rbind( + missing_parents, + data.frame(row_number = data$row_number[i], id = male_parent, + male_parent = "0", female_parent = "0", + stringsAsFactors = FALSE) + ) } - if (dam != "0" && dam != id && !dam %in% data$id) { - missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) + if (female_parent != "0" && female_parent != id && !female_parent %in% data$id) { + missing_parents <- rbind( + missing_parents, + data.frame(row_number = data$row_number[i], id = female_parent, + male_parent = "0", female_parent = "0", + stringsAsFactors = FALSE) + ) } - if (sire == id || dam == id) { - errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) + if (male_parent == id || female_parent == id) { + errors <- append(errors, paste("Dependency: Individual", id, + "cannot be its own parent")) } } - missing_parents <- distinct(missing_parents) + missing_parents <- dplyr::distinct(missing_parents) if (nrow(missing_parents) > 0) { - data <- bind_rows(data, missing_parents) + data <- dplyr::bind_rows(data, missing_parents) } - #### check 4: messy parents #### - sire_ids <- unique(data$sire[data$sire != "0"]) - dam_ids <- unique(data$dam[data$dam != "0"]) - messy_ids <- intersect(sire_ids, dam_ids) - messy_parents <- data %>% filter(id %in% messy_ids) + #### check 4: inconsistent sex roles #### + male_ids <- unique(data$male_parent[data$male_parent != "0"]) + female_ids <- unique(data$female_parent[data$female_parent != "0"]) + conflicting_sex_ids <- intersect(male_ids, female_ids) + + inconsistent_sex_roles <- data %>% + dplyr::filter(male_parent %in% conflicting_sex_ids | + female_parent %in% conflicting_sex_ids) + + if (correct_inconsistent_sex_roles && length(conflicting_sex_ids) > 0) { + data <- data %>% + dplyr::mutate( + male_parent = dplyr::if_else(male_parent %in% conflicting_sex_ids, "0", male_parent), + female_parent = dplyr::if_else(female_parent %in% conflicting_sex_ids, "0", female_parent) + ) %>% + dplyr::distinct(id, male_parent, female_parent, .keep_all = TRUE) + } - #### check 5: dependencies (cycles) #### + #### check 5: dependencies (cycles) -- reported only #### detect_all_cycles <- function(data) { adj_list <- lapply(data$id, function(x) { row <- data[data$id == x, ] - c(row$sire, row$dam) + c(row$male_parent, row$female_parent) }) names(adj_list) <- data$id dfs <- function(node, visited, rec_stack, path) { - visited[node] <- TRUE + visited[node] <- TRUE rec_stack[node] <- TRUE - path <- append(path, node) + path <- append(path, node) cycles <- list() for (neighbor in adj_list[[node]]) { @@ -145,21 +208,19 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } - rec_stack[node] <- FALSE return(cycles) } - visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) - if (length(node_cycles) > 0) { + if (length(node_cycles) > 0) all_cycles <- append(all_cycles, node_cycles) - } } } return(all_cycles) @@ -169,69 +230,63 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { if (length(cycles) > 0) { for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) + errors <- append(errors, + paste("Cycle detected involving IDs:", + paste(cycle_ids, collapse = " -> "))) } } #### compile findings #### input_ped_report <- list( - exact_duplicates = exact_duplicates, - repeated_ids_diff = repeated_ids_report, - messy_parents = messy_parents, - missing_parents = missing_parents, - dependencies = data.frame(Dependency = unique(unlist(errors))) + exact_duplicates = exact_duplicates, + conflicting_trios = conflicting_trios, + inconsistent_sex_roles = inconsistent_sex_roles, + missing_parents = missing_parents, + dependencies = data.frame(dependency = unique(unlist(errors)), + stringsAsFactors = FALSE), + corrected_pedigree = data %>% dplyr::select(-row_number) ) - #### file names #### - file_base <- tools::file_path_sans_ext(basename(ped.file)) - corrected_name <- paste0(file_base, "_corrected") - report_name <- paste0(file_base, "_report") - #### output #### if (verbose) { cat("\n=== Pedigree Quality Check Report ===\n") if (nrow(exact_duplicates) > 0) { - cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") + cat("\nExact duplicate trios detected (removed in corrected pedigree):\n") print(exact_duplicates) } else cat("\nNo exact duplicate trios found.\n") - if (nrow(repeated_ids_report) > 0) { - cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") - print(repeated_ids_report) - } else cat("\nNo conflicting repeated trios found.\n") + if (nrow(conflicting_trios) > 0) { + cat("\nConflicting trios detected:\n") + print(conflicting_trios) + if (correct_conflicting_trios) { + cat(" -> parents set to 0 and collapsed to one row in corrected pedigree.\n") + } else { + cat(" -> correct_conflicting_trios = FALSE: left as-is in corrected pedigree.\n") + } + } else cat("\nNo conflicting trios found.\n") if (nrow(missing_parents) > 0) { - cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") + cat("\nParents missing as IDs (added as founders in corrected pedigree):\n") print(missing_parents) } else cat("\nNo missing parents found.\n") - if (nrow(messy_parents) > 0) { - cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") - print(messy_parents) - } else cat("\nNo IDs found as both sire and dam.\n") - + if (nrow(inconsistent_sex_roles) > 0) { + cat("\nIDs found as both male_parent and female_parent:\n") + print(inconsistent_sex_roles) + if (correct_inconsistent_sex_roles) { + cat(" -> parent fields set to 0 for conflicting IDs in corrected pedigree.\n") + } else { + cat(" -> correct_inconsistent_sex_roles = FALSE: left as-is.\n") + } + } else cat("\nNo IDs found as both male_parent and female_parent.\n") if (nrow(input_ped_report$dependencies) > 0) { - cat("\nDependencies detected:\n") + cat("\nDependencies detected (must be resolved manually):\n") print(input_ped_report$dependencies) } else cat("\nNo dependencies detected.\n") - #### interactive save #### - cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) - ans <- tolower(trimws(readline())) - if (ans == "y") { - assign(corrected_name, data, envir = .GlobalEnv) - assign("input_ped_report", input_ped_report, envir = .GlobalEnv) - cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) - } else { - cat("No corrected pedigree was saved.\n") - } - - } else { - # Silent automatic mode - assign(corrected_name, data, envir = .GlobalEnv) - assign(report_name, input_ped_report, envir = .GlobalEnv) + cat("\nThe corrected pedigree is included in the returned list as corrected_pedigree.\n") } invisible(input_ped_report) diff --git a/R/find_parentage.R b/R/find_parentage.R index 728822a..e53e66d 100644 --- a/R/find_parentage.R +++ b/R/find_parentage.R @@ -1,133 +1,44 @@ #' Find Parentage Assignments for Progeny #' -#' Assigns the most likely parent(s) to each progeny individual based on -#' genotypic data using Mendelian error rates or homozygous mismatch rates. +#' Assigns the most likely parent(s) to each progeny from SNP genotype data +#' using Mendelian error rates or homozygous mismatch rates. Parents or progeny +#' absent from the genotype file are removed with a warning. #' -#' @param genotypes_file Path to a TSV/CSV/TXT file containing genotype data. -#' Must include an 'ID' column followed by marker columns coded as 0, 1, 2 -#' (allele dosage). -#' @param parents_file Path to a TSV/CSV/TXT file listing candidate parent IDs. -#' Must include an 'ID' column. An optional 'Sex' column with values -#' 'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines -#' which parents are tested for each role. If absent, all parents are treated -#' as ambiguous. -#' @param progeny_file Path to a TSV/CSV/TXT file listing progeny IDs to assign. -#' Must include an 'ID' column. -#' @param method Character. Parentage assignment method. One of: -#' \itemize{ -#' \item \code{"best_male_parent"} — finds the best male parent for each -#' progeny using homozygous mismatch rate. -#' \item \code{"best_female_parent"} — finds the best female parent for each -#' progeny using homozygous mismatch rate. -#' \item \code{"best_match"} — finds the single best parent (either sex) -#' using homozygous mismatch rate. -#' \item \code{"best_pair"} — finds the best male-female parent pair for -#' each progeny using full Mendelian error rate (default). -#' } -#' @param min_markers Integer. Minimum number of non-missing markers required -#' to report a parentage assignment. Progeny-parent comparisons with fewer -#' markers are flagged as \code{LOW_MARKERS} and no assignment is made -#' (default: \code{10}). -#' @param error_threshold Numeric. Maximum mismatch percentage to report a -#' parentage assignment as confident. Assignments above this threshold are -#' flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column -#' (default: \code{5.0}). Must be between 0 and 100. -#' @param show_ties Logical. If \code{TRUE}, all tied best pairs (after -#' tie-breaking by maximum markers tested) are reported as additional columns -#' (\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when -#' \code{method = "best_pair"}. The base columns (\code{Male_Parent}, -#' \code{Female_Parent}, etc.) are always populated with the top result. -#' If \code{FALSE}, only one tied pair is reported with a warning. +#' @param genotypes_file Path to a TSV/CSV/TXT file, OR a data.frame / +#' data.table with an 'id' column followed by marker columns coded as 0, 1, 2. +#' @param parents_file Path to a TSV/CSV/TXT file, OR a data.frame / +#' data.table with an 'id' column and an optional 'sex' column +#' ('M', 'F', or 'A'). If absent, all parents are treated as ambiguous. +#' @param progeny_file Path to a TSV/CSV/TXT file, OR a data.frame / +#' data.table with an 'id' column. +#' @param method Character. One of \code{"best_male_parent"}, +#' \code{"best_female_parent"}, \code{"best_match"}, or +#' \code{"best_pair"} (default). +#' @param min_markers Integer. Minimum markers required; fewer flags +#' \code{low_markers} (default: \code{10}). +#' @param error_threshold Numeric. Maximum mismatch percentage; exceeded values +#' flag \code{high_error} (default: \code{5.0}). Must be between 0 and 100. +#' @param show_ties Logical. If \code{TRUE}, tied best pairs are appended as +#' suffix columns. Default is \code{TRUE}. +#' @param allow_selfing Logical. If \code{FALSE}, pairs with identical male and +#' female parent IDs are excluded. Default is \code{TRUE}. +#' @param verbose Logical. If \code{TRUE}, prints progress and summary. #' Default is \code{TRUE}. -#' @param allow_selfing Logical. If \code{FALSE}, male-female parent pairs where -#' both IDs are identical are excluded when \code{method = "best_pair"}. -#' Default is \code{TRUE}. -#' @param verbose Logical. If \code{TRUE}, prints progress messages, summary -#' statistics, and the results table to the console. Default is \code{TRUE}. -#' @param write_txt Logical. If \code{TRUE}, writes results to -#' \code{parentage_results_dt.txt} in the working directory. Default is -#' \code{TRUE}. -#' -#' @return A \code{data.table} with one row per progeny. Columns depend on the -#' method used: -#' \itemize{ -#' \item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: -#' \code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -#' \code{Markers_Tested}, \code{Assignment_Status}. -#' \item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, -#' \code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, -#' \code{Markers_Tested}, \code{Assignment_Status}. -#' \item \code{best_pair} (ties remain after tie-breaking, -#' \code{show_ties = TRUE}): base columns are always populated with the -#' top result, plus suffix columns \code{Male_Parent_1}, -#' \code{Female_Parent_1}, etc. for each tied pair. -#' } -#' \code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or -#' \code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. -#' -#' @details -#' A homozygous-only genotype matrix is pre-computed once at startup and shared -#' across all methods that require it, avoiding redundant computation. -#' -#' For \code{"best_male_parent"}, \code{"best_female_parent"}, and -#' \code{"best_match"}, only homozygous markers (coded 0 or 2) are used for -#' comparison; heterozygous markers (coded 1) are set to \code{NA}. This -#' reduces false mismatches caused by phase ambiguity. -#' -#' For \code{"best_pair"}, all markers are used and full Mendelian inheritance -#' rules are applied. Mismatch rates and comparison counts are computed across -#' all progeny simultaneously using vectorised \code{vapply} calls, producing -#' \code{n_pairs x n_progeny} matrices and giving substantial speed gains for -#' large datasets. Both matrices are explicitly coerced to matrix form to -#' handle the edge case of a single parent pair correctly. -#' -#' When multiple pairs share the lowest Mendelian error rate, ties are broken -#' by selecting the pair(s) with the greatest number of markers tested. If ties -#' still remain after this step, all remaining tied pairs are reported when -#' \code{show_ties = TRUE}. -#' -#' The base columns (\code{Male_Parent}, \code{Female_Parent}, -#' \code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with -#' the top result, ensuring no missing values in these columns regardless of -#' tie status. -#' -#' Output rows are pre-allocated as a \code{data.table} and filled by reference -#' using \code{data.table::set()}, avoiding repeated memory allocation during -#' the results-building step. -#' -#' Individuals in \code{parents_file} or \code{progeny_file} that are absent -#' from \code{genotypes_file} are removed with a warning. -#' -#' Progeny with fewer non-missing markers than \code{min_markers} are flagged -#' \code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best -#' match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. +#' @param plot_results Logical. If \code{TRUE}, plots the Mendelian error +#' distribution. Requires \code{ggplot2}. Default is \code{TRUE}. #' -#' @examples -#' \dontrun{ -#' # Assign best male-female parent pair to each progeny -#' results <- find_parentage( -#' genotypes_file = "genotypes.txt", -#' parents_file = "parents.txt", -#' progeny_file = "progeny.txt", -#' method = "best_pair", -#' min_markers = 50, -#' error_threshold = 5.0, -#' show_ties = TRUE, -#' allow_selfing = FALSE -#' ) -#' -#' # Find best individual parent match (ignoring sex) -#' results <- find_parentage( -#' genotypes_file = "genotypes.txt", -#' parents_file = "parents.txt", -#' progeny_file = "progeny.txt", -#' method = "best_match", -#' min_markers = 30, -#' error_threshold = 3.0 -#' ) +#' @return A named list (returned invisibly) with elements: +#' \describe{ +#' \item{pass}{Progeny with a confident parentage assignment.} +#' \item{high_error}{Progeny whose best assignment exceeds the error threshold.} +#' \item{low_markers}{Progeny with insufficient markers for a valid assignment.} +#' \item{full_results}{Complete data.table with all progeny and all output columns.} +#' \item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} #' } #' -#' @importFrom data.table fread fwrite copy CJ rbindlist set data.table as.data.table +#' @author Josue Chinchilla-Vargas +#' +#' @importFrom data.table fread copy CJ rbindlist set data.table as.data.table is.data.table #' @export find_parentage <- function(genotypes_file, parents_file, progeny_file, method = "best_pair", @@ -136,10 +47,15 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE) { + plot_results = TRUE) { + + ## silence R CMD check NOTEs + id <- sex <- male_parent <- female_parent <- NULL + mendelian_error_pct <- plot_status <- status <- NULL - #### Input Validation and Data Loading #### - allowed_methods <- c("best_male_parent", "best_female_parent", "best_match", "best_pair") + #### Input Validation #### + allowed_methods <- c("best_male_parent", "best_female_parent", + "best_match", "best_pair") if (!method %in% allowed_methods) stop("Method must be one of: ", paste(allowed_methods, collapse = ", ")) if (min_markers < 1) @@ -147,83 +63,91 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (error_threshold < 0 || error_threshold > 100) stop("error_threshold must be between 0 and 100.") + # Accept file path OR in-memory data.frame / data.table + read_flex <- function(x, label, ...) { + if (is.character(x) && length(x) == 1) { + if (!file.exists(x)) + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") + data.table::fread(x, sep = "auto", ...) + } else if (is.data.frame(x) || data.table::is.data.table(x)) { + data.table::as.data.table(x) + } else { + stop(label, " must be a file path (character) or a data.frame / data.table.") + } + } + tryCatch({ - genos <- data.table::fread(genotypes_file, sep = "auto") - all_parents <- data.table::fread(parents_file, sep = "auto") - progeny_candidates <- data.table::fread(progeny_file, sep = "auto") + genos <- read_flex(genotypes_file, "genotypes_file") + all_parents <- read_flex(parents_file, "parents_file") + progeny_candidates <- read_flex(progeny_file, "progeny_file") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) - valid_ids <- genos$ID - removed_parents <- base::setdiff(all_parents$ID, valid_ids) + valid_ids <- genos$id + removed_parents <- base::setdiff(all_parents$id, valid_ids) if (base::length(removed_parents) > 0) { warning("The following parent IDs were not in the genotype file and will not be analyzed: ", paste(removed_parents, collapse = ", "), call. = FALSE) - all_parents <- all_parents[ID %in% valid_ids] + all_parents <- all_parents[id %in% valid_ids] } - removed_progeny <- base::setdiff(progeny_candidates$ID, valid_ids) + removed_progeny <- base::setdiff(progeny_candidates$id, valid_ids) if (base::length(removed_progeny) > 0) { warning("The following progeny IDs were not in the genotype file and will not be analyzed: ", paste(removed_progeny, collapse = ", "), call. = FALSE) - progeny_candidates <- progeny_candidates[ID %in% valid_ids] + progeny_candidates <- progeny_candidates[id %in% valid_ids] } - if (!"Sex" %in% base::colnames(all_parents)) { - warning("No 'Sex' column in parents file. All parents treated as ambiguous ('A').") - all_parents[, Sex := "A"] + if (!"sex" %in% base::colnames(all_parents)) { + warning("No 'sex' column in parents file. All parents treated as ambiguous ('A').") + all_parents[, sex := "A"] } - all_parents[, Sex := base::toupper(Sex)] - male_parent_candidates <- all_parents[Sex %in% c("M", "A", "NA"), .SD] - female_parent_candidates <- all_parents[Sex %in% c("F", "A", "NA")] + all_parents[, sex := base::toupper(sex)] + male_parent_candidates <- all_parents[sex %in% c("M", "A", "NA"), .SD] + female_parent_candidates <- all_parents[sex %in% c("F", "A", "NA")] - if (base::nrow(male_parent_candidates) == 0 && method %in% c("best_male_parent", "best_pair")) + if (base::nrow(male_parent_candidates) == 0 && + method %in% c("best_male_parent", "best_pair")) warning("No valid male parent candidates remain after filtering.", call. = FALSE) - if (base::nrow(female_parent_candidates) == 0 && method %in% c("best_female_parent", "best_pair")) + if (base::nrow(female_parent_candidates) == 0 && + method %in% c("best_female_parent", "best_pair")) warning("No valid female parent candidates remain after filtering.", call. = FALSE) if (base::nrow(progeny_candidates) == 0) stop("No valid progeny candidates remain after filtering.") - #### Pre-compute genotype matrices once (shared across all methods) #### - # Full genotype matrix used by best_pair - genos_mat <- base::as.matrix(genos, rownames = "ID") - - # Homozygous-only matrix (het markers set to NA) used by hom methods + #### Pre-compute genotype matrices once #### + genos_mat <- base::as.matrix(genos, rownames = "id") genos_hom <- data.table::copy(genos) - marker_cols <- base::setdiff(base::names(genos_hom), "ID") + marker_cols <- base::setdiff(base::names(genos_hom), "id") for (col in marker_cols) genos_hom[base::get(col) == 1, (col) := NA_integer_] - genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "id") - #### Assignment_Status from markers and error rate #### - # Returns LOW_MARKERS, HIGH_ERROR, or PASS + #### Status helper #### assign_status <- function(markers, error_pct) { - base::ifelse(markers < min_markers, "LOW_MARKERS", - base::ifelse(error_pct > error_threshold, "HIGH_ERROR", "PASS")) + base::ifelse(markers < min_markers, "low_markers", + base::ifelse(error_pct > error_threshold, "high_error", "pass")) } #### Logic for Homozygous Matching Methods #### if (method %in% c("best_male_parent", "best_female_parent", "best_match")) { parent_ids <- base::switch(method, - "best_male_parent" = male_parent_candidates$ID, - "best_female_parent" = female_parent_candidates$ID, - "best_match" = base::union(male_parent_candidates$ID, - female_parent_candidates$ID)) - - # Subset pre-computed homozygous matrix for relevant parents and progeny - parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] - progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$ID, , drop = FALSE] + "best_male_parent" = male_parent_candidates$id, + "best_female_parent" = female_parent_candidates$id, + "best_match" = base::union(male_parent_candidates$id, + female_parent_candidates$id)) + parent_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% parent_ids, , drop = FALSE] + progeny_genos <- genos_hom_mat[base::rownames(genos_hom_mat) %in% progeny_candidates$id, , drop = FALSE] - # Pre-allocate results data.table; fill by reference with set() n_progeny <- base::nrow(progeny_genos) results_dt <- data.table::data.table( - Progeny = base::rownames(progeny_genos), - Best_Match = NA_character_, - Mendelian_Error_Pct = NA_real_, - Markers_Tested = NA_integer_, - Assignment_Status = NA_character_ + id = base::rownames(progeny_genos), + best_match = NA_character_, + mendelian_error_pct = NA_real_, + markers_tested = NA_integer_, + status = NA_character_ ) for (i in base::seq_len(n_progeny)) { @@ -234,47 +158,39 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, percent_mismatch[base::is.nan(percent_mismatch)] <- NA best_idx <- base::which.min(percent_mismatch) - - # No candidate found — flag LOW_MARKERS and continue if (base::length(best_idx) == 0) { - data.table::set(results_dt, i, "Markers_Tested", 0L) - data.table::set(results_dt, i, "Assignment_Status", "LOW_MARKERS") + data.table::set(results_dt, i, "markers_tested", 0L) + data.table::set(results_dt, i, "status", "low_markers") next } best_markers <- comparisons[best_idx] best_error <- base::round(percent_mismatch[best_idx], 2) - - data.table::set(results_dt, i, "Best_Match", base::rownames(parent_genos)[best_idx]) - data.table::set(results_dt, i, "Mendelian_Error_Pct", best_error) - data.table::set(results_dt, i, "Markers_Tested", base::as.integer(best_markers)) - data.table::set(results_dt, i, "Assignment_Status", assign_status(best_markers, best_error)) + data.table::set(results_dt, i, "best_match", base::rownames(parent_genos)[best_idx]) + data.table::set(results_dt, i, "mendelian_error_pct", best_error) + data.table::set(results_dt, i, "markers_tested", base::as.integer(best_markers)) + data.table::set(results_dt, i, "status", assign_status(best_markers, best_error)) } final_df <- results_dt } #### Logic for Best Pair Method #### if (method == "best_pair") { - parent_pairs <- data.table::CJ(Male_Parent = male_parent_candidates$ID, - Female_Parent = female_parent_candidates$ID) + parent_pairs <- data.table::CJ(male_parent = male_parent_candidates$id, + female_parent = female_parent_candidates$id) if (!allow_selfing) { - parent_pairs <- parent_pairs[Male_Parent != Female_Parent] + parent_pairs <- parent_pairs[male_parent != female_parent] if (verbose) base::cat("Selfing is disallowed. Pairs with identical parents are removed.\n") } if (base::nrow(parent_pairs) == 0) stop("No valid parent pairs to test.") - # Pre-extract parent genotype row blocks for vectorised operations - male_parent_genos_mat <- genos_mat[parent_pairs$Male_Parent, , drop = FALSE] - female_parent_genos_mat <- genos_mat[parent_pairs$Female_Parent, , drop = FALSE] - - # Subset full genotype matrix to progeny only - progeny_ids <- progeny_candidates$ID + male_parent_genos_mat <- genos_mat[parent_pairs$male_parent, , drop = FALSE] + female_parent_genos_mat <- genos_mat[parent_pairs$female_parent, , drop = FALSE] + progeny_ids <- progeny_candidates$id progeny_mat <- genos_mat[progeny_ids, , drop = FALSE] n_progeny <- base::nrow(progeny_mat) n_pairs <- base::nrow(parent_pairs) - # Vectorised mismatch computation across ALL progeny at once - # Wrapped in matrix() to handle the n_pairs = 1 edge case mismatch_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { progeny_vec <- progeny_mat[j, ] @@ -293,8 +209,6 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, nrow = n_pairs, ncol = n_progeny ) - # Vectorised comparison count across ALL progeny at once - # Wrapped in matrix() to handle the n_pairs = 1 edge case comparison_mat <- base::matrix( base::vapply(base::seq_len(n_progeny), function(j) { progeny_vec <- progeny_mat[j, ] @@ -305,21 +219,18 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, nrow = n_pairs, ncol = n_progeny ) - # Percent mismatch matrix: n_pairs x n_progeny pct_mismatch_mat <- (mismatch_mat / comparison_mat) * 100 pct_mismatch_mat[base::is.nan(pct_mismatch_mat)] <- NA - # Pre-allocate base results data.table; tie columns added dynamically results_dt <- data.table::data.table( - Progeny = progeny_ids, - Male_Parent = NA_character_, - Female_Parent = NA_character_, - Mendelian_Error_Pct = NA_character_, - Markers_Tested = NA_integer_, - Assignment_Status = NA_character_ + id = progeny_ids, + male_parent = NA_character_, + female_parent = NA_character_, + mendelian_error_pct = NA_character_, + markers_tested = NA_integer_, + status = NA_character_ ) - # Per-progeny result extraction from pre-computed matrices results_list <- base::vector("list", n_progeny) for (j in base::seq_len(n_progeny)) { prog_id <- progeny_ids[j] @@ -327,16 +238,13 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, comparisons <- comparison_mat[, j] min_mismatch_val <- base::min(percent_mismatch, na.rm = TRUE) - # No markers overlap at all — flag LOW_MARKERS if (base::is.infinite(min_mismatch_val)) { - data.table::set(results_dt, j, "Markers_Tested", 0L) - data.table::set(results_dt, j, "Assignment_Status", "LOW_MARKERS") + data.table::set(results_dt, j, "markers_tested", 0L) + data.table::set(results_dt, j, "status", "low_markers") next } best_indices <- base::which(percent_mismatch == min_mismatch_val) - - # Tie-break: prefer pair(s) with the most markers tested if (base::length(best_indices) > 1) { best_markers_per_pair <- comparisons[best_indices] max_markers <- base::max(best_markers_per_pair) @@ -350,74 +258,153 @@ find_parentage <- function(genotypes_file, parents_file, progeny_file, if (!show_ties && base::nrow(best_pairs) > 1) { warning("Progeny '", prog_id, "' has ", base::nrow(best_pairs), - " tied best pairs. Only one is reported as show_ties=FALSE.", call. = FALSE) + " tied best pairs. Only one is reported as show_ties=FALSE.", + call. = FALSE) } num_to_report <- base::min(base::nrow(best_pairs), if (show_ties) base::nrow(best_pairs) else 1) - # Always populate base columns with the top result - data.table::set(results_dt, j, "Male_Parent", best_pairs$Male_Parent[1]) - data.table::set(results_dt, j, "Female_Parent", best_pairs$Female_Parent[1]) - data.table::set(results_dt, j, "Mendelian_Error_Pct", base::sprintf("%.2f", min_mismatch_val)) - data.table::set(results_dt, j, "Markers_Tested", base::as.integer(best_markers)) - data.table::set(results_dt, j, "Assignment_Status", a_status) + data.table::set(results_dt, j, "male_parent", best_pairs$male_parent[1]) + data.table::set(results_dt, j, "female_parent", best_pairs$female_parent[1]) + data.table::set(results_dt, j, "mendelian_error_pct", base::sprintf("%.2f", min_mismatch_val)) + data.table::set(results_dt, j, "markers_tested", base::as.integer(best_markers)) + data.table::set(results_dt, j, "status", a_status) - # If ties remain after tie-breaking and show_ties is TRUE, - # store tie details for later column binding if (show_ties && num_to_report > 1) { - tie_row <- base::list(Progeny = prog_id) - for (k in base::seq_len(num_to_report)) { - tie_row[[base::paste0("Male_Parent_", k)]] <- best_pairs$Male_Parent[k] - tie_row[[base::paste0("Female_Parent_", k)]] <- best_pairs$Female_Parent[k] - tie_row[[base::paste0("Mendelian_Error_Pct_", k)]] <- min_mismatch_val - tie_row[[base::paste0("Markers_Tested_", k)]] <- comparisons[best_indices[k]] + tie_row <- base::list(id = prog_id) + for (k in base::seq(2, num_to_report)) { + tie_row[[base::paste0("male_parent_", k)]] <- best_pairs$male_parent[k] + tie_row[[base::paste0("female_parent_", k)]] <- best_pairs$female_parent[k] + tie_row[[base::paste0("mendelian_error_pct_", k)]] <- min_mismatch_val + tie_row[[base::paste0("markers_tested_", k)]] <- comparisons[best_indices[k]] } results_list[[j]] <- data.table::as.data.table(tie_row) } } - # Merge any tie suffix columns onto the pre-allocated base table tie_rows <- data.table::rbindlist( base::Filter(Negate(base::is.null), results_list), - fill = TRUE + fill = TRUE, + use.names = TRUE ) if (base::nrow(tie_rows) > 0) { - final_df <- merge(results_dt, tie_rows, by = "Progeny", all.x = TRUE) + final_df <- merge(results_dt, tie_rows, by = "id", all.x = TRUE) + for (col in base::names(final_df)) + data.table::set(final_df, which(final_df[[col]] == ""), col, NA_character_) } else { final_df <- results_dt } } - #### Summary #### + #### Compile named list #### + output_list <- list( + pass = final_df[status == "pass"], + high_error = final_df[status == "high_error"], + low_markers = final_df[status == "low_markers"], + full_results = final_df, + plot = NULL + ) + + #### Verbose output #### if (verbose) { - total <- base::nrow(final_df) - a_counts <- base::table(final_df$Assignment_Status) - base::cat("\n--- Parentage Assignment Summary ---\n") - base::cat("Total progeny evaluated:", total, "\n") - for (s in base::names(a_counts)) - base::cat(base::sprintf(" %-14s: %d (%.1f%%)\n", s, - a_counts[s], (a_counts[s] / total) * 100)) - base::cat("Min markers threshold :", min_markers, "\n") - base::cat("Error threshold :", error_threshold, "%\n\n") - } + total_progeny <- base::nrow(final_df) + base::cat("\n=== Parentage Assignment Report ===\n") + base::cat("\nTotal progeny evaluated:", total_progeny, "\n") + base::cat("Method:", method, " | ", + "Error threshold:", error_threshold, "% | ", + "Minimum markers:", min_markers, "\n") + + n_pass <- base::nrow(output_list$pass) + if (n_pass > 0) { + base::cat(base::sprintf("\n%d progeny passed (%.1f%%).\n", + n_pass, (n_pass / total_progeny) * 100)) + } else { + base::cat("\nNo progeny passed.\n") + } + + n_high <- base::nrow(output_list$high_error) + if (n_high > 0) { + base::cat(base::sprintf("\n%d progeny flagged high_error (%.1f%%):\n", + n_high, (n_high / total_progeny) * 100)) + base::print(output_list$high_error) + } else { + base::cat("\nNo progeny flagged for high error.\n") + } + + n_low <- base::nrow(output_list$low_markers) + if (n_low > 0) { + base::cat(base::sprintf("\n%d progeny flagged low_markers (%.1f%%):\n", + n_low, (n_low / total_progeny) * 100)) + base::print(output_list$low_markers) + } else { + base::cat("\nNo progeny flagged for low marker count.\n") + } - #### Output #### - if (write_txt) { - output_filename <- "parentage_results_dt.txt" - tryCatch({ - data.table::fwrite(final_df, file = output_filename, sep = "\t", quote = FALSE) - if (verbose) base::cat("Results successfully written to:", output_filename, "\n") - }, error = function(e) { - warning("Could not write results to file. Error: ", e$message, call. = FALSE) - }) + base::cat("\nFull results are included in the returned list as $full_results.\n") } - if (verbose) { - base::cat("\n--- Parentage Assignment Results ---\n") - base::print(final_df) - return(base::invisible(final_df)) - } else { - return(final_df) + #### Plot Results #### + if (plot_results) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("ggplot2 is required for plot_results = TRUE. Please install it.", + call. = FALSE) + } else { + plot_df <- final_df[!is.na(final_df$mendelian_error_pct)] + plot_df$mendelian_error_pct <- base::as.numeric(plot_df$mendelian_error_pct) + plot_df$plot_status <- base::ifelse( + plot_df$status == "pass", "pass", + base::ifelse( + plot_df$status == "high_error", "high_error", + base::ifelse( + plot_df$status == "low_markers", "low_markers", "other"))) + + n_total <- base::nrow(plot_df) + n_pass <- base::sum(plot_df$status == "pass", na.rm = TRUE) + n_high <- base::sum(plot_df$status == "high_error", na.rm = TRUE) + n_low <- base::sum(plot_df$status == "low_markers", na.rm = TRUE) + + threshold_label <- base::paste0( + "Error Threshold: ", error_threshold, "% | ", + "Pass: ", n_pass, " | ", + "High Error: ", n_high, " | ", + "Low Markers: ", n_low + ) + + p <- ggplot2::ggplot( + plot_df, + ggplot2::aes(x = mendelian_error_pct, fill = plot_status) + ) + + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + + ggplot2::geom_vline(xintercept = error_threshold, + linetype = "dashed", color = "black", linewidth = 1) + + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_y_continuous(breaks = seq(0, 10000, by = 5)) + + ggplot2::scale_fill_manual( + values = c("pass" = "#339900", + "high_error" = "#cc3333", + "low_markers" = "#F1C40F", + "other" = "#BDC3C7"), + labels = c("pass" = "Pass", + "high_error" = "High Error", + "low_markers" = "Low Markers", + "other" = "Other") + ) + + ggplot2::labs( + title = "Parentage Mendelian Error Distribution", + subtitle = base::paste0("Progeny Tested: ", n_total, + "\n \n", threshold_label), + x = "Mendelian Error (%)", + y = "Number of Progeny", + fill = "Status" + ) + + ggplot2::theme_classic(base_size = 13) + + ggplot2::theme(legend.position = "top") + + base::print(p) + output_list$plot <- p + } } + + return(base::invisible(output_list)) } diff --git a/R/imputation_concordance.R b/R/imputation_concordance.R index 918071a..7ebbdcc 100644 --- a/R/imputation_concordance.R +++ b/R/imputation_concordance.R @@ -1,79 +1,60 @@ -#' Calculate Concordance between Imputed and Reference Genotypes -#' -#' This function calculates the concordance between imputed and reference -#' genotypes. It assumes that samples are rows and markers are columns. -#' Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. -#' Missing data in either dataset can be excluded from the concordance calculation -#' using the `missing_code` argument. Specific markers can be excluded using -#' the `snps_2_exclude` argument. -#' -#' @param reference_genos A data frame containing reference genotype data, -#' with rows as samples and columns as markers. Must include a column named `ID`. -#' -#' @param imputed_genos A data frame containing imputed genotype data, -#' with rows as samples and columns as markers. Must include a column named `ID`. -#' -#' @param missing_code Optional value specifying missing data. If provided, -#' loci with this value in either dataset will be excluded from the concordance calculation. -#' -#' @param snps_2_exclude Optional vector of marker IDs to exclude from the concordance calculation. -#' -#' @param verbose Logical. If `TRUE`, prints summary statistics (minimum, quartiles, -#' median, mean, maximum) of concordance percentages. -#' -#' @param plot Logical. If `TRUE`, produces a bar plot of concordance percentage -#' by sample. -#' -#' @param print_result Logical. If `TRUE` (default), prints the concordance -#' results data frame to the console. If `FALSE`, results are returned invisibly. -#' -#' @return A data frame with: -#' \itemize{ -#' \item \code{ID}: Sample identifiers shared between the datasets. -#' \item \code{Concordance}: Percentage of matching genotypes per sample. +#' Calculate concordance between imputed and reference genotypes +#' +#' Compares imputed and reference genotype datasets sample by sample, returning +#' the percentage of matching genotypes per sample. Expects samples as rows and +#' markers as columns, with allele dosages (0, 1, 2) recommended. Other numeric +#' formats are supported. +#' +#' @param reference_genos A data frame of reference genotypes (samples × markers) +#' with a column named `ID`. +#' @param imputed_genos A data frame of imputed genotypes (samples × markers) +#' with a column named `ID`. +#' @param missing_code Optional value indicating missing data. Loci carrying this +#' value in either dataset are excluded from the concordance calculation. +#' @param snps_2_exclude Optional character vector of marker names to exclude. +#' @param verbose Logical. Print a five-number summary of concordance? Default `FALSE`. +#' @param plot Logical. Produce a bar plot of concordance by sample? Default `FALSE`. +#' @param print_result Logical. Print the results data frame to the console? +#' Default `TRUE`. If `FALSE`, results are returned invisibly. +#' +#' @return A data frame with columns: +#' \describe{ +#' \item{ID}{Sample identifiers shared between the two datasets.} +#' \item{Concordance}{Percentage of matching genotypes per sample.} #' } -#' If \code{print_result = FALSE}, the data frame is returned invisibly. #' #' @details -#' The function: -#' \enumerate{ -#' \item Identifies common samples and markers between the datasets. -#' \item Optionally excludes specified SNPs. -#' \item Removes loci with missing data (if \code{missing_code} is provided). -#' \item Computes per-sample concordance as the percentage of matching genotypes. -#' } -#' -#' When \code{plot = TRUE}, a bar plot showing concordance percentage per sample -#' is generated using \pkg{ggplot2}. -#' -#' @import dplyr +#' The function identifies common samples and markers between the two datasets, +#' optionally removes specified SNPs and loci with missing data, then computes +#' per-sample concordance as the percentage of matching genotypes over valid loci. +#' When `plot = TRUE`, a bar plot is produced with \pkg{ggplot2}. #' #' @examples #' ref <- data.frame( -#' ID = c("S1", "S2", "S3"), +#' ID = c("S1", "S2", "S3"), #' SNP1 = c(0, 1, 2), #' SNP2 = c(1, 1, 0), #' SNP3 = c(2, 5, 1) #' ) -#' -#' test <- data.frame( -#' ID = c("S1", "S2", "S3"), +#' imp <- data.frame( +#' ID = c("S1", "S2", "S3"), #' SNP1 = c(0, 0, 2), #' SNP2 = c(1, 1, 1), #' SNP3 = c(2, 5, 0) #' ) -#' -#' result <- imputation_concordance( +#' imputation_concordance( #' reference_genos = ref, -#' imputed_genos = test, -#' snps_2_exclude = "SNP2", -#' missing_code = 5, -#' print_result = FALSE +#' imputed_genos = imp, +#' snps_2_exclude = "SNP2", +#' missing_code = 5, +#' print_result = FALSE #' ) #' -#' result +#' @author Josué Chinchilla-Vargas #' +#' @importFrom dplyr %>% filter arrange #' @importFrom stats reorder +#' @importFrom ggplot2 ggplot aes geom_bar labs theme_minimal theme element_text #' @export imputation_concordance <- function(reference_genos, imputed_genos, diff --git a/R/validate_pedigree.R b/R/validate_pedigree.R index 8b2ecbc..fc4cd0f 100644 --- a/R/validate_pedigree.R +++ b/R/validate_pedigree.R @@ -1,64 +1,60 @@ #' Validate Pedigree Trios Using Mendelian Error Analysis #' -#' Validates parent-offspring trios by calculating Mendelian error rates from -#' SNP genotype data. Identifies incorrect parentage assignments and suggests -#' best-matching replacements. If a list of founders is supplied, trios that -#' are declared founders (both parents coded as 0) are preserved unchanged -#' with no recommendations. Trios removed due to missing genotype data are -#' retained in the output with a NO_GENOTYPE_DATA status. +#' Validates parent-offspring trios against SNP genotype data using Mendelian +#' error rates. Identifies incorrect parentage assignments, suggests +#' best-matching replacements, and outputs a corrected pedigree. Founder trios +#' (both parents coded as 0) are preserved unchanged if a founders file is +#' supplied. Trios absent from the genotype file are retained as +#' no_genotype_data. #' -#' @param pedigree_file Character. Path to the pedigree file (TSV/CSV/TXT) -#' with columns: ID, Male_Parent, Female_Parent. -#' @param genotypes_file Character. Path to the genotypes file (TSV/CSV/TXT) -#' with an ID column followed by marker columns coded as 0, 1, 2. -#' @param founders_file Character, optional. Path to a one-column file -#' listing the IDs of founder individuals. Founders with both parents -#' coded as 0 are left unchanged with no recommendations. Defaults to NULL. -#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage -#' to classify a trio as PASS (default: 5.0). Must be between 0 and 100. -#' @param min_markers Integer. Minimum number of non-missing markers -#' required to evaluate a trio (default: 10). +#' @param pedigree_file Path to the pedigree file (TSV/CSV/TXT), OR a +#' data.frame / data.table with columns: id, male_parent, female_parent. +#' @param genotypes_file Path to the genotypes file (TSV/CSV/TXT), OR a +#' data.frame / data.table with an id column followed by marker columns +#' coded as 0, 1, 2. +#' @param founders_file Character, optional. Path to a one-column file listing +#' founder IDs. Founders with both parents coded as 0 are left unchanged. +#' Defaults to NULL. +#' @param trio_error_threshold Numeric. Maximum Mendelian error percentage to +#' classify a trio as pass (default: 5.0). Must be between 0 and 100. +#' @param min_markers Integer. Minimum non-missing markers required to evaluate +#' a trio (default: 10). #' @param single_parent_error_threshold Numeric. Maximum homozygous-marker -#' mismatch percentage for a parent to be considered acceptable during -#' parent-level evaluation (default: 2.0). Must be between 0 and 100. -#' @param verbose Logical. If TRUE, prints progress messages, a summary -#' table, and results to the console (default: TRUE). -#' @param write_txt Logical. If TRUE, writes validation results to -#' output_filename (default: TRUE). -#' @param output_filename Character. Path/name of the output file -#' (default: "pedigree_validation_results.txt"). +#' mismatch percentage for a parent to be considered acceptable +#' (default: 2.0). Must be between 0 and 100. +#' @param verbose Logical. If TRUE, prints progress, summary, and results to +#' the console (default: TRUE). +#' @param plot_results Logical. If TRUE, prints a histogram of trio Mendelian +#' error percentages with a threshold line (default: TRUE). #' -#' @return A data.table (returned invisibly) with one row per trio and -#' the following columns: -#' \describe{ -#' \item{ID}{Individual ID.} -#' \item{Male_Parent}{Declared male parent ID.} -#' \item{Female_Parent}{Declared female parent ID.} -#' \item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -#' \item{Markers_Tested}{Number of markers with non-missing genotypes.} -#' \item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, -#' MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, -#' or NO_GENOTYPE_DATA.} -#' \item{Correction_Decision}{One of NONE, KEEP_BOTH, -#' REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, -#' LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, -#' LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} -#' \item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} -#' \item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -#' \item{Best_Male_Parent}{Best-matching male parent candidate ID.} -#' \item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} -#' \item{Best_Female_Parent}{Best-matching female parent candidate ID.} -#' \item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} -#' } +#' @return An invisible named list with the following elements: +#' \describe{ +#' \item{pass}{Trios that passed the Mendelian error threshold.} +#' \item{fail}{Trios that failed the Mendelian error threshold.} +#' \item{low_markers}{Trios with insufficient markers for evaluation.} +#' \item{no_genotype_data}{Trios absent from the genotype file.} +#' \item{founders}{Trios identified as founders.} +#' \item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} +#' \item{full_results}{Complete data.table with all trios and all output columns.} +#' \item{corrected_pedigree}{Pedigree table after applying recommended corrections.} +#' \item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} +#' } +#' +#' @author Josue Chinchilla-Vargas +#' +#' @importFrom data.table fread copy data.table set rbindlist as.data.table is.data.table #' @export validate_pedigree <- function(pedigree_file, genotypes_file, - founders_file = NULL, - trio_error_threshold = 5.0, - min_markers = 10, + founders_file = NULL, + trio_error_threshold = 5.0, + min_markers = 10, single_parent_error_threshold = 2.0, - verbose = TRUE, - write_txt = TRUE, - output_filename = "pedigree_validation_results.txt") { + verbose = TRUE, + plot_results = TRUE) { + + ## silence R CMD check NOTEs + id <- male_parent <- female_parent <- status <- trio_mendelian_error_pct <- NULL + plot_status <- recommended_correction <- NULL #### Input validation #### if (trio_error_threshold < 0 || trio_error_threshold > 100) @@ -66,25 +62,37 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (single_parent_error_threshold < 0 || single_parent_error_threshold > 100) stop("single_parent_error_threshold must be between 0 and 100") + # Accept file path OR in-memory data.frame / data.table + read_flex <- function(x, label, ...) { + if (is.character(x) && length(x) == 1) { + if (!file.exists(x)) + stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") + data.table::fread(x, sep = "auto", ...) + } else if (is.data.frame(x) || data.table::is.data.table(x)) { + data.table::as.data.table(x) + } else { + stop(label, " must be a file path (character) or a data.frame / data.table.") + } + } + tryCatch({ - pedigree <- data.table::fread(pedigree_file, sep = "auto", colClasses = "character") - genos <- data.table::fread(genotypes_file, sep = "auto") + pedigree <- read_flex(pedigree_file, "pedigree_file", colClasses = "character") + genos <- read_flex(genotypes_file, "genotypes_file") }, error = function(e) { stop("Error reading input files. Ensure paths are correct and files are TXT/TSV/CSV.") }) #### Check required columns #### - required_ped_cols <- c("ID", "Male_Parent", "Female_Parent") + required_ped_cols <- c("id", "male_parent", "female_parent") missing_cols <- base::setdiff(required_ped_cols, base::names(pedigree)) if (base::length(missing_cols) > 0) stop("Pedigree file missing required columns: ", base::paste(missing_cols, collapse = ", ")) - if (!"ID" %in% base::names(genos)) - stop("Genotypes file must have an 'ID' column") + if (!"id" %in% base::names(genos)) + stop("Genotypes file must have an 'id' column") - # Ensure parent columns are character for consistent "0" comparisons - pedigree[, Male_Parent := as.character(Male_Parent)] - pedigree[, Female_Parent := as.character(Female_Parent)] + pedigree[, male_parent := as.character(male_parent)] + pedigree[, female_parent := as.character(female_parent)] original_pedigree <- data.table::copy(pedigree) #### Read founders list #### @@ -99,41 +107,35 @@ validate_pedigree <- function(pedigree_file, genotypes_file, founder_ids <- character(0) } - #### Identify trios missing from the genotype file #### - valid_ids <- as.character(genos$ID) - has_geno <- pedigree[ID %in% valid_ids & - (Male_Parent %in% valid_ids | Male_Parent == "0") & - (Female_Parent %in% valid_ids | Female_Parent == "0")] - - no_geno_rows <- pedigree[!(ID %in% valid_ids) | - (!(Male_Parent %in% valid_ids) & Male_Parent != "0") | - (!(Female_Parent %in% valid_ids) & Female_Parent != "0")] + #### Build genotype matrices #### + genos_mat <- base::as.matrix(genos, rownames = "id") + genos_hom <- data.table::copy(genos) + marker_cols <- base::setdiff(base::names(genos_hom), "id") + for (col in marker_cols) + genos_hom[base::get(col) == 1, (col) := NA_integer_] + genos_hom_mat <- base::as.matrix(genos_hom, rownames = "id") + #### Identify trios missing from the genotype file #### + valid_ids <- as.character(genos$id) + has_geno <- pedigree[id %in% valid_ids & + (male_parent %in% valid_ids | male_parent == "0") & + (female_parent %in% valid_ids | female_parent == "0")] + no_geno_rows <- pedigree[!(id %in% valid_ids) | + (!(male_parent %in% valid_ids) & male_parent != "0") | + (!(female_parent %in% valid_ids) & female_parent != "0")] if (base::nrow(no_geno_rows) > 0 && verbose) base::cat("Found", base::nrow(no_geno_rows), - "trios with missing genotype data; flagged as NO_GENOTYPE_DATA.\n") - + "trios with missing genotype data; flagged as no_genotype_data.\n") pedigree <- has_geno if (base::nrow(pedigree) == 0) stop("No valid trios remain after filtering for genotype availability.") - #### Build genotype matrices #### - genos_mat <- base::as.matrix(genos, rownames = "ID") - - # Homozygous-only matrix (het markers set to NA) - genos_hom <- data.table::copy(genos) - marker_cols <- base::setdiff(base::names(genos_hom), "ID") - for (col in marker_cols) - genos_hom[base::get(col) == 1, (col) := NA_integer_] - genos_hom_mat <- base::as.matrix(genos_hom, rownames = "ID") - - #### Helper: find best matching parent via homozygous mismatch #### + #### Find best matching parent via homozygous mismatch #### find_best_parent <- function(prog_id, exclude_ids = base::character(0)) { candidates <- base::setdiff(base::rownames(genos_hom_mat), c(prog_id, exclude_ids)) if (base::length(candidates) == 0) return(base::list(id = NA_character_, error_pct = NA_real_)) - prog_hom <- genos_hom_mat[prog_id, ] errors <- base::sapply(candidates, function(cand_id) { cand_hom <- genos_hom_mat[cand_id, ] @@ -141,7 +143,8 @@ validate_pedigree <- function(pedigree_file, genotypes_file, if (comparisons == 0) return(NA_real_) (base::sum(cand_hom != prog_hom, na.rm = TRUE) / comparisons) * 100 }) - + if (base::all(base::is.na(errors))) + return(base::list(id = NA_character_, error_pct = NA_real_)) best_idx <- base::which.min(errors) base::list(id = candidates[best_idx], error_pct = base::round(errors[best_idx], 2)) @@ -149,15 +152,12 @@ validate_pedigree <- function(pedigree_file, genotypes_file, #### Main trio evaluation loop #### results_list <- base::lapply(base::seq_len(base::nrow(pedigree)), function(i) { - - prog_id <- pedigree$ID[i] - male_parent_id <- pedigree$Male_Parent[i] - female_parent_id <- pedigree$Female_Parent[i] - - # Default values - correction_decision <- "NONE" + prog_id <- pedigree$id[i] + male_parent_id <- pedigree$male_parent[i] + female_parent_id <- pedigree$female_parent[i] + correction_decision <- "none" error_pct <- NA_real_ - status <- "NO_DATA" + status <- "no_data" markers_tested <- 0L male_parent_error_pct <- NA_real_ female_parent_error_pct <- NA_real_ @@ -166,50 +166,36 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- NA_character_ best_female_parent_pct <- NA_real_ - ## Founder check — both parents "0" and ID in founders list if (male_parent_id == "0" && female_parent_id == "0" && prog_id %in% founder_ids) { - status <- "FOUNDERS" - correction_decision <- "NONE" - + status <- "founders" + correction_decision <- "none" } else { - - ## Missing parent(s) — recommendations only, "0"s preserved in pedigree if (male_parent_id == "0" && female_parent_id == "0") { - status <- "MISSING_BOTH_PARENTS" - correction_decision <- "NONE" - + status <- "missing_both_parents" + correction_decision <- "none" best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - best_f <- find_best_parent(prog_id, exclude_ids = c(best_m$id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct - } else if (male_parent_id == "0" && female_parent_id != "0") { - status <- "MISSING_MALE_PARENT" - correction_decision <- "NONE" - + status <- "missing_male_parent" + correction_decision <- "none" best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct - } else if (male_parent_id != "0" && female_parent_id == "0") { - status <- "MISSING_FEMALE_PARENT" - correction_decision <- "NONE" - + status <- "missing_female_parent" + correction_decision <- "none" best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct - } else { - - ## Both parents present — Mendelian error calculation progeny_vec <- genos_mat[prog_id, ] male_parent_vec <- genos_mat[male_parent_id, ] female_parent_vec <- genos_mat[female_parent_id, ] - mismatches <- base::sum( (male_parent_vec == 0 & female_parent_vec == 0 & progeny_vec > 0) | (male_parent_vec == 2 & female_parent_vec == 2 & progeny_vec < 2) | @@ -221,67 +207,54 @@ validate_pedigree <- function(pedigree_file, genotypes_file, (male_parent_vec == 2 & female_parent_vec == 0)) & (progeny_vec != 1), na.rm = TRUE ) - markers_tested <- base::sum(!base::is.na(male_parent_vec) & !base::is.na(female_parent_vec) & !base::is.na(progeny_vec)) - if (markers_tested == 0) { - status <- "NO_DATA" - correction_decision <- "NONE" - + status <- "no_data" + correction_decision <- "none" } else { error_pct <- (mismatches / markers_tested) * 100 - - # LOW_MARKERS still computes parent mismatch/recommendations if (markers_tested < min_markers) { - status <- "LOW_MARKERS" + status <- "low_markers" } else if (error_pct <= trio_error_threshold) { - status <- "PASS" - correction_decision <- "NONE" + status <- "pass" + correction_decision <- "none" } else { - status <- "FAIL" + status <- "fail" } - - # Run parent-level evaluation for both FAIL and LOW_MARKERS - if (status %in% c("FAIL", "LOW_MARKERS")) { - - # Homozygous mismatch per parent + if (status %in% c("fail", "low_markers")) { progeny_hom <- genos_hom_mat[prog_id, ] male_parent_hom <- genos_hom_mat[male_parent_id, ] female_parent_hom <- genos_hom_mat[female_parent_id, ] - male_comparisons <- base::sum(!base::is.na(male_parent_hom) & !base::is.na(progeny_hom)) male_parent_error_pct <- if (male_comparisons == 0) NA_real_ else base::round((base::sum(male_parent_hom != progeny_hom, na.rm = TRUE) / male_comparisons) * 100, 2) - female_comparisons <- base::sum(!base::is.na(female_parent_hom) & !base::is.na(progeny_hom)) female_parent_error_pct <- if (female_comparisons == 0) NA_real_ else base::round((base::sum(female_parent_hom != progeny_hom, na.rm = TRUE) / female_comparisons) * 100, 2) - - male_acceptable <- !is.na(male_parent_error_pct) && + male_acceptable <- !is.na(male_parent_error_pct) && male_parent_error_pct <= single_parent_error_threshold female_acceptable <- !is.na(female_parent_error_pct) && female_parent_error_pct <= single_parent_error_threshold - if (male_acceptable && female_acceptable) { - correction_decision <- "KEEP_BOTH" + correction_decision <- "keep_both" } else if (male_acceptable && !female_acceptable) { - correction_decision <- "REMOVE_FEMALE_PARENT" + correction_decision <- "remove_female_parent" best_f <- find_best_parent(prog_id, exclude_ids = c(male_parent_id)) best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } else if (!male_acceptable && female_acceptable) { - correction_decision <- "REMOVE_MALE_PARENT" + correction_decision <- "remove_male_parent" best_m <- find_best_parent(prog_id, exclude_ids = c(female_parent_id)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct } else { - correction_decision <- "REMOVE_BOTH" + correction_decision <- "remove_both" best_m <- find_best_parent(prog_id, exclude_ids = character(0)) best_male_parent <- best_m$id best_male_parent_pct <- best_m$error_pct @@ -289,112 +262,170 @@ validate_pedigree <- function(pedigree_file, genotypes_file, best_female_parent <- best_f$id best_female_parent_pct <- best_f$error_pct } - - # Do not alter corrected pedigree for LOW_MARKERS rows - if (status == "LOW_MARKERS") - correction_decision <- paste0("LOW_MARKERS_", correction_decision) + if (status == "low_markers") + correction_decision <- paste0("low_markers_", correction_decision) } } } } data.table::data.table( - ID = prog_id, - Male_Parent = male_parent_id, - Female_Parent = female_parent_id, - Mendelian_Error_Pct = base::round(error_pct, 2), - Markers_Tested = markers_tested, - Status = status, - Correction_Decision = correction_decision, - Male_Parent_Hom_Error_Pct = male_parent_error_pct, - Female_Parent_Hom_Error_Pct = female_parent_error_pct, - Best_Male_Parent = best_male_parent, - Best_Male_Parent_Error_Pct = best_male_parent_pct, - Best_Female_Parent = best_female_parent, - Best_Female_Parent_Error_Pct = best_female_parent_pct + id = prog_id, + orig_male_parent = male_parent_id, + orig_female_parent = female_parent_id, + trio_mendelian_error_pct = base::round(error_pct, 2), + trio_markers_tested = markers_tested, + status = status, + recommended_correction = correction_decision, + male_parent_hom_error_pct = male_parent_error_pct, + female_parent_hom_error_pct = female_parent_error_pct, + best_male_candidate = best_male_parent, + best_male_candidate_error_pct = best_male_parent_pct, + best_female_candidate = best_female_parent, + best_female_candidate_error_pct = best_female_parent_pct ) }) final_df <- data.table::rbindlist(results_list) - #### Append NO_GENOTYPE_DATA rows to the final report #### + #### Append no_genotype_data rows #### if (base::nrow(no_geno_rows) > 0) { no_geno_df <- data.table::data.table( - ID = no_geno_rows$ID, - Male_Parent = no_geno_rows$Male_Parent, - Female_Parent = no_geno_rows$Female_Parent, - Mendelian_Error_Pct = NA_real_, - Markers_Tested = 0L, - Status = "NO_GENOTYPE_DATA", - Correction_Decision = "NONE", - Male_Parent_Hom_Error_Pct = NA_real_, - Female_Parent_Hom_Error_Pct = NA_real_, - Best_Male_Parent = NA_character_, - Best_Male_Parent_Error_Pct = NA_real_, - Best_Female_Parent = NA_character_, - Best_Female_Parent_Error_Pct = NA_real_ + id = no_geno_rows$id, + orig_male_parent = no_geno_rows$male_parent, + orig_female_parent = no_geno_rows$female_parent, + trio_mendelian_error_pct = NA_real_, + trio_markers_tested = 0L, + status = "no_genotype_data", + recommended_correction = "none", + male_parent_hom_error_pct = NA_real_, + female_parent_hom_error_pct = NA_real_, + best_male_candidate = NA_character_, + best_male_candidate_error_pct = NA_real_, + best_female_candidate = NA_character_, + best_female_candidate_error_pct = NA_real_ ) final_df <- data.table::rbindlist(list(final_df, no_geno_df)) } - #### Write corrected pedigree #### + #### Build corrected pedigree #### corrected_pedigree <- data.table::copy(original_pedigree) for (i in base::seq_len(base::nrow(final_df))) { - prog_id <- final_df$ID[i] - decision <- final_df$Correction_Decision[i] - row_idx <- base::which(corrected_pedigree$ID == prog_id) - - if (decision == "REMOVE_MALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - } else if (decision == "REMOVE_FEMALE_PARENT") { - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") - } else if (decision == "REMOVE_BOTH") { - data.table::set(corrected_pedigree, row_idx, "Male_Parent", "0") - data.table::set(corrected_pedigree, row_idx, "Female_Parent", "0") + prog_id <- final_df$id[i] + decision <- final_df$recommended_correction[i] + row_idx <- base::which(corrected_pedigree$id == prog_id) + if (decision == "remove_male_parent") { + data.table::set(corrected_pedigree, row_idx, "male_parent", "0") + } else if (decision == "remove_female_parent") { + data.table::set(corrected_pedigree, row_idx, "female_parent", "0") + } else if (decision %in% c("remove_both", + "low_markers_remove_both", + "low_markers_remove_male_parent", + "low_markers_remove_female_parent")) { + if (grepl("male", decision)) + data.table::set(corrected_pedigree, row_idx, "male_parent", "0") + if (grepl("female", decision)) + data.table::set(corrected_pedigree, row_idx, "female_parent", "0") + if (decision %in% c("low_markers_remove_both", "remove_both")) { + data.table::set(corrected_pedigree, row_idx, "male_parent", "0") + data.table::set(corrected_pedigree, row_idx, "female_parent", "0") + } } } - tryCatch({ - data.table::fwrite(corrected_pedigree, file = "corrected_pedigree.txt", - sep = "\t", quote = FALSE) - if (verbose) base::cat("Corrected pedigree written to: corrected_pedigree.txt\n") - }, error = function(e) { - warning("Could not write corrected pedigree. Error: ", e$message, call. = FALSE) - }) - #### Summary output #### if (verbose) { total_trios <- base::nrow(final_df) - status_counts <- base::table(final_df$Status) + status_counts <- base::table(final_df$status) base::cat("\n--- Trio Validation Summary ---\n") base::cat("Total trios in pedigree:", total_trios, "\n") for (s in base::names(status_counts)) - base::cat(base::sprintf("%-24s: %d (%.1f%%)\n", s, + base::cat(base::sprintf("%-28s: %d (%.1f%%)\n", s, status_counts[s], (status_counts[s] / total_trios) * 100)) base::cat("Error threshold:", trio_error_threshold, "%\n") base::cat("Homozygous threshold:", single_parent_error_threshold, "%\n") base::cat("Minimum markers required:", min_markers, "\n\n") - - corrections <- base::table(final_df$Correction_Decision) + corrections <- base::table(final_df$recommended_correction) base::cat("Correction summary:\n") for (decision in base::names(corrections)) - if (decision != "NONE") + if (decision != "none") base::cat(" ", decision, ":", corrections[decision], "\n") base::cat("\n") base::print(final_df) } - #### Write results #### - if (write_txt) { - tryCatch({ - data.table::fwrite(final_df, file = output_filename, - sep = "\t", quote = FALSE) - if (verbose) base::cat("Results written to:", output_filename, "\n") - }, error = function(e) { - warning("Could not write results. Error: ", e$message, call. = FALSE) - }) + #### Plot results #### + p <- NULL + if (plot_results) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + warning("ggplot2 is required for plot_results = TRUE. Please install it.", call. = FALSE) + } else { + plot_df <- final_df[!is.na(final_df$trio_mendelian_error_pct)] + plot_df$plot_status <- dplyr::case_when( + plot_df$recommended_correction %in% c("none", "keep_both", + "low_markers_keep_both") ~ "pass", + plot_df$recommended_correction %in% c("remove_male_parent", + "remove_female_parent", + "low_markers_remove_male_parent", + "low_markers_remove_female_parent") ~ "fail_one_parent", + plot_df$recommended_correction %in% c("remove_both", + "low_markers_remove_both") ~ "fail_both_parents", + TRUE ~ "other" + ) + n_total <- nrow(plot_df) + n_fail <- sum(plot_df$trio_mendelian_error_pct > trio_error_threshold) + n_pass <- sum(plot_df$trio_mendelian_error_pct <= trio_error_threshold) + threshold_label <- paste0( + "Mendelian Error Threshold: ", trio_error_threshold, "% | ", + "Lost: ", n_fail, " trios | ", + "Kept: ", n_pass, " trios" + ) + p <- ggplot2::ggplot(plot_df, + ggplot2::aes(x = trio_mendelian_error_pct, + fill = plot_status)) + + ggplot2::geom_histogram(binwidth = 1, color = "white", alpha = 0.9) + + ggplot2::geom_vline(xintercept = trio_error_threshold, + linetype = "dashed", color = "black", linewidth = 1) + + ggplot2::scale_x_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_y_continuous(breaks = seq(0, 100, by = 5)) + + ggplot2::scale_fill_manual( + values = c("pass" = "#339900", + "fail_one_parent" = "#F1C40F", + "fail_both_parents" = "#cc3333", + "other" = "#BDC3C7"), + labels = c("pass" = "Pass", + "fail_one_parent" = "Fail - One Parent", + "fail_both_parents" = "Fail - Both Parents", + "other" = "Other") + ) + + ggplot2::labs( + title = "Trio Mendelian Error Distribution", + subtitle = paste0("Trios with Genotype Data Tested: ", n_total, + "\n \n", threshold_label), + x = "Mendelian Error (%)", + y = "Number of Trios", + fill = "Status" + ) + + ggplot2::theme_classic(base_size = 13) + + ggplot2::theme(legend.position = "top") + print(p) + } } - return(base::invisible(final_df)) + #### Compile and return named list #### + output_list <- base::list( + pass = final_df[status == "pass"], + fail = final_df[status == "fail"], + low_markers = final_df[status == "low_markers"], + no_genotype_data = final_df[status == "no_genotype_data"], + founders = final_df[status == "founders"], + missing_parents = final_df[status %in% c("missing_both_parents", + "missing_male_parent", + "missing_female_parent")], + full_results = final_df, + corrected_pedigree = corrected_pedigree, + plot = p + ) + return(base::invisible(output_list)) } diff --git a/inst/check_ped_test.txt b/inst/check_ped_test.txt index ae7bc08..27eba9d 100644 --- a/inst/check_ped_test.txt +++ b/inst/check_ped_test.txt @@ -1,4 +1,4 @@ -id sire dam +id male_parent female_parent off1 sire1 dam1 off2 sire2 dam2 off3 sire3 dam3 @@ -13,4 +13,9 @@ dam1 grandfather6 grandmother6 dam2 grandfather7 grandmother7 dam3 grandfather8 grandmother8 dam4 0 0 -dam5 0 0 \ No newline at end of file +dam5 0 0 +grandmother4 0 0 +grandmother5 0 0 +grandfather6 0 0 +grandmother6 0 0 +grandfather7 0 0 \ No newline at end of file diff --git a/man/allele_freq_poly.Rd b/man/allele_freq_poly.Rd index 407d158..6d56f79 100644 --- a/man/allele_freq_poly.Rd +++ b/man/allele_freq_poly.Rd @@ -2,48 +2,51 @@ % Please edit documentation in R/breedtools_functions.R \name{allele_freq_poly} \alias{allele_freq_poly} -\title{Compute Allele Frequencies for Populations} +\title{Compute allele frequencies for populations} \usage{ allele_freq_poly(geno, populations, ploidy = 2) } \arguments{ -\item{geno}{matrix of genotypes coded as the dosage of allele B \code{{0, 1, 2, ..., ploidy}} -with individuals in rows (named) and SNPs in columns (named)} +\item{geno}{Numeric matrix of genotypes coded as dosage of allele B +\code{{0, 1, 2, ..., ploidy}}, with individuals in rows (named) and +SNPs in columns (named).} -\item{populations}{list of named populations. Each population has a vector of IDs -that belong to the population. Allele frequencies will be derived from all animals} +\item{populations}{Named list of populations, each containing a character +vector of individual IDs belonging to that population.} -\item{ploidy}{integer indicating the ploidy level (default is 2 for diploid)} +\item{ploidy}{Integer. Ploidy level of the species. Default is \code{2}.} } \value{ -data.frame consisting of allele_frequencies for populations (columns) for -each SNP (rows) +A data frame of allele frequencies with SNPs as rows and populations +as columns. } \description{ -Computes allele frequencies for specified populations given SNP array data +Computes allele frequencies for specified populations from SNP array data +coded as dosage of allele B. } \examples{ -# Example inputs geno_matrix <- matrix( -c(4, 1, 4, 0, # S1 - 2, 2, 1, 3, # S2 - 0, 4, 0, 4, # S3 - 3, 3, 2, 2, # S4 - 1, 4, 2, 3),# S5 -nrow = 4, ncol = 5, byrow = FALSE, # individuals=rows, SNPs=cols -dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) + c(4, 1, 4, 0, + 2, 2, 1, 3, + 0, 4, 0, 4, + 3, 3, 2, 2, + 1, 4, 2, 3), + nrow = 4, ncol = 5, byrow = FALSE, + dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) ) - pop_list <- list( -PopA = c("Ind1", "Ind2"), -PopB = c("Ind3", "Ind4") + PopA = c("Ind1", "Ind2"), + PopB = c("Ind3", "Ind4") ) - allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) print(allele_freqs) } \references{ -Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +Estimation of genome-wide and locus-specific breed composition in pigs. +\emph{Transl Anim Sci.} 2017;1(1):36–44. +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/check_ped.Rd b/man/check_ped.Rd index ea63de7..85d7ac3 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,58 +2,61 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Check a pedigree file for accuracy and report/correct common errors} +\title{Check and Correct Common Pedigree Errors} \usage{ -check_ped(ped.file, seed = NULL, verbose = TRUE) +check_ped( + ped.file, + seed = NULL, + verbose = TRUE, + correct_conflicting_trios = TRUE, + correct_inconsistent_sex_roles = TRUE +) } \arguments{ -\item{ped.file}{Path to the pedigree text file.} +\item{ped.file}{Path to the pedigree text file (TSV/CSV/TXT), OR a +data.frame / data.table with columns: id, male_parent, female_parent.} -\item{seed}{Optional seed for reproducibility.} +\item{seed}{Optional integer seed for reproducibility.} -\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} +\item{verbose}{Logical. If TRUE (default), prints the report to the console.} + +\item{correct_conflicting_trios}{Logical. If TRUE (default), sets conflicting +male_parent and female_parent to 0 and collapses to one row per ID.} + +\item{correct_inconsistent_sex_roles}{Logical. If TRUE (default), sets +male_parent and female_parent to 0 for rows involving IDs found as both, +then removes any resulting exact duplicates.} } \value{ -A list of data.frames containing detected issues: -\itemize{ -\item \code{exact_duplicates}: rows that were exact duplicates -\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam -\item \code{messy_parents}: IDs appearing as both sire and dam -\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam -\item \code{dependencies}: detected cycles in the pedigree +An invisible named list of data frames: +\describe{ +\item{exact_duplicates}{Exact duplicate rows found in the input.} +\item{conflicting_trios}{IDs with conflicting male_parent or female_parent assignments.} +\item{inconsistent_sex_roles}{Rows where a conflicting ID appears as male_parent or female_parent.} +\item{missing_parents}{Parent IDs absent from id, added as founders.} +\item{dependencies}{Cycles detected in the pedigree. Must be resolved manually.} +\item{corrected_pedigree}{Corrected pedigree table.} } } \description{ -\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) -and performs quality checks, optionally correcting or flagging errors. -} -\details{ -The function checks for: -\itemize{ -\item Exact duplicate rows and removes them (keeping one copy) -\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") -\item IDs that appear in both sire and dam columns -\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" -\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant -} - -After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. - -The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: -\itemize{ -\item Corrected pedigree as a dataframe in the global environment -\item A report listing all detected issues -} +Reads a 3-column pedigree file (id, male_parent, female_parent) and performs +quality checks, optionally correcting detected errors. Exact duplicates and +missing parents are always corrected. Conflicting trios and inconsistent sex +roles are corrected when their respective arguments are TRUE. Cycles are +reported only and must be resolved manually. } \examples{ ped_file <- system.file("check_ped_test.txt", package = "BIGr") -ped_errors <- check_ped(ped.file = ped_file, seed = 101919) +ped_errors <- check_ped(ped.file = ped_file, seed = 101919, verbose = FALSE) -# Access messy parents -ped_errors$messy_parents - -# IDs with messy parents -messy_ids <- ped_errors$messy_parents$id -print(messy_ids) +# Also accepts a data.table directly +library(data.table) +ped_dt <- data.table(id = c("A","B","C"), + male_parent = c("0","0","A"), + female_parent = c("0","0","B")) +ped_errors <- check_ped(ped.file = ped_dt, verbose = FALSE) } +\author{ +Josue Chinchilla-Vargas +} diff --git a/man/find_parentage.Rd b/man/find_parentage.Rd index 706a15f..e4c1953 100644 --- a/man/find_parentage.Rd +++ b/man/find_parentage.Rd @@ -14,146 +14,57 @@ find_parentage( show_ties = TRUE, allow_selfing = TRUE, verbose = TRUE, - write_txt = TRUE + plot_results = TRUE ) } \arguments{ -\item{genotypes_file}{Path to a TSV/CSV/TXT file containing genotype data. -Must include an 'ID' column followed by marker columns coded as 0, 1, 2 -(allele dosage).} +\item{genotypes_file}{Path to a TSV/CSV/TXT file, OR a data.frame / +data.table with an 'id' column followed by marker columns coded as 0, 1, 2.} -\item{parents_file}{Path to a TSV/CSV/TXT file listing candidate parent IDs. -Must include an 'ID' column. An optional 'Sex' column with values -'M' (male parent), 'F' (female parent), or 'A' (ambiguous) determines -which parents are tested for each role. If absent, all parents are treated -as ambiguous.} +\item{parents_file}{Path to a TSV/CSV/TXT file, OR a data.frame / +data.table with an 'id' column and an optional 'sex' column +('M', 'F', or 'A'). If absent, all parents are treated as ambiguous.} -\item{progeny_file}{Path to a TSV/CSV/TXT file listing progeny IDs to assign. -Must include an 'ID' column.} +\item{progeny_file}{Path to a TSV/CSV/TXT file, OR a data.frame / +data.table with an 'id' column.} -\item{method}{Character. Parentage assignment method. One of: -\itemize{ -\item \code{"best_male_parent"} — finds the best male parent for each -progeny using homozygous mismatch rate. -\item \code{"best_female_parent"} — finds the best female parent for each -progeny using homozygous mismatch rate. -\item \code{"best_match"} — finds the single best parent (either sex) -using homozygous mismatch rate. -\item \code{"best_pair"} — finds the best male-female parent pair for -each progeny using full Mendelian error rate (default). -}} +\item{method}{Character. One of \code{"best_male_parent"}, +\code{"best_female_parent"}, \code{"best_match"}, or +\code{"best_pair"} (default).} -\item{min_markers}{Integer. Minimum number of non-missing markers required -to report a parentage assignment. Progeny-parent comparisons with fewer -markers are flagged as \code{LOW_MARKERS} and no assignment is made -(default: \code{10}).} +\item{min_markers}{Integer. Minimum markers required; fewer flags +\code{low_markers} (default: \code{10}).} -\item{error_threshold}{Numeric. Maximum mismatch percentage to report a -parentage assignment as confident. Assignments above this threshold are -flagged as \code{HIGH_ERROR} in the \code{Assignment_Status} column -(default: \code{5.0}). Must be between 0 and 100.} +\item{error_threshold}{Numeric. Maximum mismatch percentage; exceeded values +flag \code{high_error} (default: \code{5.0}). Must be between 0 and 100.} -\item{show_ties}{Logical. If \code{TRUE}, all tied best pairs (after -tie-breaking by maximum markers tested) are reported as additional columns -(\code{Male_Parent_1}, \code{Male_Parent_2}, etc.) when -\code{method = "best_pair"}. The base columns (\code{Male_Parent}, -\code{Female_Parent}, etc.) are always populated with the top result. -If \code{FALSE}, only one tied pair is reported with a warning. -Default is \code{TRUE}.} +\item{show_ties}{Logical. If \code{TRUE}, tied best pairs are appended as +suffix columns. Default is \code{TRUE}.} -\item{allow_selfing}{Logical. If \code{FALSE}, male-female parent pairs where -both IDs are identical are excluded when \code{method = "best_pair"}. -Default is \code{TRUE}.} +\item{allow_selfing}{Logical. If \code{FALSE}, pairs with identical male and +female parent IDs are excluded. Default is \code{TRUE}.} -\item{verbose}{Logical. If \code{TRUE}, prints progress messages, summary -statistics, and the results table to the console. Default is \code{TRUE}.} +\item{verbose}{Logical. If \code{TRUE}, prints progress and summary. +Default is \code{TRUE}.} -\item{write_txt}{Logical. If \code{TRUE}, writes results to -\code{parentage_results_dt.txt} in the working directory. Default is -\code{TRUE}.} +\item{plot_results}{Logical. If \code{TRUE}, plots the Mendelian error +distribution. Requires \code{ggplot2}. Default is \code{TRUE}.} } \value{ -A \code{data.table} with one row per progeny. Columns depend on the -method used: -\itemize{ -\item \code{best_male_parent} / \code{best_female_parent} / \code{best_match}: -\code{Progeny}, \code{Best_Match}, \code{Mendelian_Error_Pct}, -\code{Markers_Tested}, \code{Assignment_Status}. -\item \code{best_pair} (no ties after tie-breaking): \code{Progeny}, -\code{Male_Parent}, \code{Female_Parent}, \code{Mendelian_Error_Pct}, -\code{Markers_Tested}, \code{Assignment_Status}. -\item \code{best_pair} (ties remain after tie-breaking, -\code{show_ties = TRUE}): base columns are always populated with the -top result, plus suffix columns \code{Male_Parent_1}, -\code{Female_Parent_1}, etc. for each tied pair. +A named list (returned invisibly) with elements: +\describe{ +\item{pass}{Progeny with a confident parentage assignment.} +\item{high_error}{Progeny whose best assignment exceeds the error threshold.} +\item{low_markers}{Progeny with insufficient markers for a valid assignment.} +\item{full_results}{Complete data.table with all progeny and all output columns.} +\item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} } -\code{Assignment_Status} is one of \code{PASS}, \code{HIGH_ERROR}, or -\code{LOW_MARKERS}. Returned invisibly when \code{verbose = TRUE}. } \description{ -Assigns the most likely parent(s) to each progeny individual based on -genotypic data using Mendelian error rates or homozygous mismatch rates. -} -\details{ -A homozygous-only genotype matrix is pre-computed once at startup and shared -across all methods that require it, avoiding redundant computation. - -For \code{"best_male_parent"}, \code{"best_female_parent"}, and -\code{"best_match"}, only homozygous markers (coded 0 or 2) are used for -comparison; heterozygous markers (coded 1) are set to \code{NA}. This -reduces false mismatches caused by phase ambiguity. - -For \code{"best_pair"}, all markers are used and full Mendelian inheritance -rules are applied. Mismatch rates and comparison counts are computed across -all progeny simultaneously using vectorised \code{vapply} calls, producing -\code{n_pairs x n_progeny} matrices and giving substantial speed gains for -large datasets. Both matrices are explicitly coerced to matrix form to -handle the edge case of a single parent pair correctly. - -When multiple pairs share the lowest Mendelian error rate, ties are broken -by selecting the pair(s) with the greatest number of markers tested. If ties -still remain after this step, all remaining tied pairs are reported when -\code{show_ties = TRUE}. - -The base columns (\code{Male_Parent}, \code{Female_Parent}, -\code{Mendelian_Error_Pct}, \code{Markers_Tested}) are always populated with -the top result, ensuring no missing values in these columns regardless of -tie status. - -Output rows are pre-allocated as a \code{data.table} and filled by reference -using \code{data.table::set()}, avoiding repeated memory allocation during -the results-building step. - -Individuals in \code{parents_file} or \code{progeny_file} that are absent -from \code{genotypes_file} are removed with a warning. - -Progeny with fewer non-missing markers than \code{min_markers} are flagged -\code{LOW_MARKERS} and no parent assignment is reported. Progeny whose best -match exceeds \code{error_threshold} are flagged \code{HIGH_ERROR}. -} -\examples{ -\dontrun{ -# Assign best male-female parent pair to each progeny -results <- find_parentage( - genotypes_file = "genotypes.txt", - parents_file = "parents.txt", - progeny_file = "progeny.txt", - method = "best_pair", - min_markers = 50, - error_threshold = 5.0, - show_ties = TRUE, - allow_selfing = FALSE -) - -# Find best individual parent match (ignoring sex) -results <- find_parentage( - genotypes_file = "genotypes.txt", - parents_file = "parents.txt", - progeny_file = "progeny.txt", - method = "best_match", - min_markers = 30, - error_threshold = 3.0 -) +Assigns the most likely parent(s) to each progeny from SNP genotype data +using Mendelian error rates or homozygous mismatch rates. Parents or progeny +absent from the genotype file are removed with a warning. } - +\author{ +Josue Chinchilla-Vargas } diff --git a/man/imputation_concordance.Rd b/man/imputation_concordance.Rd index 22e9462..f96493f 100644 --- a/man/imputation_concordance.Rd +++ b/man/imputation_concordance.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/imputation_concordance.R \name{imputation_concordance} \alias{imputation_concordance} -\title{Calculate Concordance between Imputed and Reference Genotypes} +\title{Calculate concordance between imputed and reference genotypes} \usage{ imputation_concordance( reference_genos, @@ -15,77 +15,65 @@ imputation_concordance( ) } \arguments{ -\item{reference_genos}{A data frame containing reference genotype data, -with rows as samples and columns as markers. Must include a column named \code{ID}.} +\item{reference_genos}{A data frame of reference genotypes (samples × markers) +with a column named \code{ID}.} -\item{imputed_genos}{A data frame containing imputed genotype data, -with rows as samples and columns as markers. Must include a column named \code{ID}.} +\item{imputed_genos}{A data frame of imputed genotypes (samples × markers) +with a column named \code{ID}.} -\item{missing_code}{Optional value specifying missing data. If provided, -loci with this value in either dataset will be excluded from the concordance calculation.} +\item{missing_code}{Optional value indicating missing data. Loci carrying this +value in either dataset are excluded from the concordance calculation.} -\item{snps_2_exclude}{Optional vector of marker IDs to exclude from the concordance calculation.} +\item{snps_2_exclude}{Optional character vector of marker names to exclude.} -\item{verbose}{Logical. If \code{TRUE}, prints summary statistics (minimum, quartiles, -median, mean, maximum) of concordance percentages.} +\item{verbose}{Logical. Print a five-number summary of concordance? Default \code{FALSE}.} -\item{plot}{Logical. If \code{TRUE}, produces a bar plot of concordance percentage -by sample.} +\item{plot}{Logical. Produce a bar plot of concordance by sample? Default \code{FALSE}.} -\item{print_result}{Logical. If \code{TRUE} (default), prints the concordance -results data frame to the console. If \code{FALSE}, results are returned invisibly.} +\item{print_result}{Logical. Print the results data frame to the console? +Default \code{TRUE}. If \code{FALSE}, results are returned invisibly.} } \value{ -A data frame with: -\itemize{ -\item \code{ID}: Sample identifiers shared between the datasets. -\item \code{Concordance}: Percentage of matching genotypes per sample. +A data frame with columns: +\describe{ +\item{ID}{Sample identifiers shared between the two datasets.} +\item{Concordance}{Percentage of matching genotypes per sample.} } -If \code{print_result = FALSE}, the data frame is returned invisibly. } \description{ -This function calculates the concordance between imputed and reference -genotypes. It assumes that samples are rows and markers are columns. -Allele dosages (0, 1, 2) are recommended but other numeric formats are supported. -Missing data in either dataset can be excluded from the concordance calculation -using the \code{missing_code} argument. Specific markers can be excluded using -the \code{snps_2_exclude} argument. +Compares imputed and reference genotype datasets sample by sample, returning +the percentage of matching genotypes per sample. Expects samples as rows and +markers as columns, with allele dosages (0, 1, 2) recommended. Other numeric +formats are supported. } \details{ -The function: -\enumerate{ -\item Identifies common samples and markers between the datasets. -\item Optionally excludes specified SNPs. -\item Removes loci with missing data (if \code{missing_code} is provided). -\item Computes per-sample concordance as the percentage of matching genotypes. -} - -When \code{plot = TRUE}, a bar plot showing concordance percentage per sample -is generated using \pkg{ggplot2}. +The function identifies common samples and markers between the two datasets, +optionally removes specified SNPs and loci with missing data, then computes +per-sample concordance as the percentage of matching genotypes over valid loci. +When \code{plot = TRUE}, a bar plot is produced with \pkg{ggplot2}. } \examples{ ref <- data.frame( - ID = c("S1", "S2", "S3"), + ID = c("S1", "S2", "S3"), SNP1 = c(0, 1, 2), SNP2 = c(1, 1, 0), SNP3 = c(2, 5, 1) ) - -test <- data.frame( - ID = c("S1", "S2", "S3"), +imp <- data.frame( + ID = c("S1", "S2", "S3"), SNP1 = c(0, 0, 2), SNP2 = c(1, 1, 1), SNP3 = c(2, 5, 0) ) - -result <- imputation_concordance( +imputation_concordance( reference_genos = ref, - imputed_genos = test, - snps_2_exclude = "SNP2", - missing_code = 5, - print_result = FALSE + imputed_genos = imp, + snps_2_exclude = "SNP2", + missing_code = 5, + print_result = FALSE ) -result - +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/solve_composition_poly.Rd b/man/solve_composition_poly.Rd index bd92739..128f7ad 100644 --- a/man/solve_composition_poly.Rd +++ b/man/solve_composition_poly.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/breedtools_functions.R \name{solve_composition_poly} \alias{solve_composition_poly} -\title{Compute Genome-Wide Breed Composition} +\title{Compute genome-wide breed composition} \usage{ solve_composition_poly( Y, @@ -16,41 +16,40 @@ solve_composition_poly( ) } \arguments{ -\item{Y}{numeric matrix of genotypes (columns) from all animals (rows) in population -coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}} +\item{Y}{Numeric matrix of genotypes with individuals as rows and SNPs as +columns, coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}.} -\item{X}{numeric matrix of allele frequencies (rows) from each reference panel (columns). Frequencies are -relative to allele B.} +\item{X}{Numeric matrix of allele frequencies with SNPs as rows and +breeds/populations as columns.} -\item{ped}{data.frame giving pedigree information. Must be formatted "ID", "Sire", "Dam"} +\item{ped}{Optional data frame with pedigree information formatted with +columns \code{ID}, \code{Sire}, and \code{Dam}. When supplied, \code{QPsolve_par} is used +and only animals with genotyped parents are included.} -\item{groups}{list of IDs categorized by breed/population. If specified, output will be a list -of results categorized by breed/population.} +\item{groups}{Optional named list of IDs grouped by breed/population. +When supplied, results are returned as a named list partitioned by group.} -\item{mia}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing the inferred maternally inherited allele for each locus for each animal instead -of breed composition results.} +\item{mia}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +the inferred maternally inherited allele per locus per animal. Default \code{FALSE}.} -\item{sire}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing sire genotypes for each locus for each animal instead of breed composition results.} +\item{sire}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +sire genotypes per locus per animal. Default \code{FALSE}.} -\item{dam}{logical. Only applies if ped argument is supplied. If true, returns a data.frame -containing dam genotypes for each locus for each animal instead of breed composition results.} +\item{dam}{Logical. Only applies when \code{ped} is supplied. If \code{TRUE}, returns +dam genotypes per locus per animal. Default \code{FALSE}.} -\item{ploidy}{integer. The ploidy level of the species (e.g., 2 for diploid, 3 for triploid, etc.).} +\item{ploidy}{Integer. Ploidy level of the species. Default is \code{2}.} } \value{ -A data.frame or list of data.frames (if groups is !NULL) with breed/ancestry composition -results +A data frame, or a named list of data frames when \code{groups} is +supplied, containing breed/ancestry composition estimates. } \description{ -Computes genome-wide breed/ancestry composition using quadratic programming on a -batch of animals. +Estimates genome-wide breed/ancestry composition for a batch of animals +using quadratic programming, with optional pedigree-assisted and +grouped-output modes. } \examples{ -# Example inputs for solve_composition_poly (ploidy = 4) - -# (This would typically be the output from allele_freq_poly) allele_freqs_matrix <- matrix( c(0.625, 0.500, 0.500, 0.500, @@ -60,16 +59,12 @@ allele_freqs_matrix <- matrix( nrow = 5, ncol = 2, byrow = TRUE, dimnames = list(paste0("SNP", 1:5), c("VarA", "VarB")) ) - -# Validation Genotypes (individuals x SNPs) val_geno_matrix <- matrix( - c(2, 1, 2, 3, 4, # Test1 dosages for SNP1-5 - 3, 4, 2, 3, 0), # Test2 dosages for SNP1-5 + c(2, 1, 2, 3, 4, + 3, 4, 2, 3, 0), nrow = 2, ncol = 5, byrow = TRUE, dimnames = list(paste0("Test", 1:2), paste0("SNP", 1:5)) ) - -# Calculate Breed Composition composition <- solve_composition_poly(Y = val_geno_matrix, X = allele_freqs_matrix, ploidy = 4) @@ -77,6 +72,10 @@ print(composition) } \references{ -Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. +Estimation of genome-wide and locus-specific breed composition in pigs. +\emph{Transl Anim Sci.} 2017;1(1):36–44. +} +\author{ +Josué Chinchilla-Vargas } diff --git a/man/validate_pedigree.Rd b/man/validate_pedigree.Rd index 8f47a7d..3daa9d1 100644 --- a/man/validate_pedigree.Rd +++ b/man/validate_pedigree.Rd @@ -12,69 +12,59 @@ validate_pedigree( min_markers = 10, single_parent_error_threshold = 2, verbose = TRUE, - write_txt = TRUE, - output_filename = "pedigree_validation_results.txt" + plot_results = TRUE ) } \arguments{ -\item{pedigree_file}{Character. Path to the pedigree file (TSV/CSV/TXT) -with columns: ID, Male_Parent, Female_Parent.} +\item{pedigree_file}{Path to the pedigree file (TSV/CSV/TXT), OR a +data.frame / data.table with columns: id, male_parent, female_parent.} -\item{genotypes_file}{Character. Path to the genotypes file (TSV/CSV/TXT) -with an ID column followed by marker columns coded as 0, 1, 2.} +\item{genotypes_file}{Path to the genotypes file (TSV/CSV/TXT), OR a +data.frame / data.table with an id column followed by marker columns +coded as 0, 1, 2.} -\item{founders_file}{Character, optional. Path to a one-column file -listing the IDs of founder individuals. Founders with both parents -coded as 0 are left unchanged with no recommendations. Defaults to NULL.} +\item{founders_file}{Character, optional. Path to a one-column file listing +founder IDs. Founders with both parents coded as 0 are left unchanged. +Defaults to NULL.} -\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage -to classify a trio as PASS (default: 5.0). Must be between 0 and 100.} +\item{trio_error_threshold}{Numeric. Maximum Mendelian error percentage to +classify a trio as pass (default: 5.0). Must be between 0 and 100.} -\item{min_markers}{Integer. Minimum number of non-missing markers -required to evaluate a trio (default: 10).} +\item{min_markers}{Integer. Minimum non-missing markers required to evaluate +a trio (default: 10).} \item{single_parent_error_threshold}{Numeric. Maximum homozygous-marker -mismatch percentage for a parent to be considered acceptable during -parent-level evaluation (default: 2.0). Must be between 0 and 100.} +mismatch percentage for a parent to be considered acceptable +(default: 2.0). Must be between 0 and 100.} -\item{verbose}{Logical. If TRUE, prints progress messages, a summary -table, and results to the console (default: TRUE).} +\item{verbose}{Logical. If TRUE, prints progress, summary, and results to +the console (default: TRUE).} -\item{write_txt}{Logical. If TRUE, writes validation results to -output_filename (default: TRUE).} - -\item{output_filename}{Character. Path/name of the output file -(default: "pedigree_validation_results.txt").} +\item{plot_results}{Logical. If TRUE, prints a histogram of trio Mendelian +error percentages with a threshold line (default: TRUE).} } \value{ -A data.table (returned invisibly) with one row per trio and -the following columns: +An invisible named list with the following elements: \describe{ -\item{ID}{Individual ID.} -\item{Male_Parent}{Declared male parent ID.} -\item{Female_Parent}{Declared female parent ID.} -\item{Mendelian_Error_Pct}{Trio-level Mendelian error percentage.} -\item{Markers_Tested}{Number of markers with non-missing genotypes.} -\item{Status}{One of PASS, FAIL, LOW_MARKERS, NO_DATA, FOUNDERS, -MISSING_MALE_PARENT, MISSING_FEMALE_PARENT, MISSING_BOTH_PARENTS, -or NO_GENOTYPE_DATA.} -\item{Correction_Decision}{One of NONE, KEEP_BOTH, -REMOVE_MALE_PARENT, REMOVE_FEMALE_PARENT, REMOVE_BOTH, -LOW_MARKERS_KEEP_BOTH, LOW_MARKERS_REMOVE_MALE_PARENT, -LOW_MARKERS_REMOVE_FEMALE_PARENT, LOW_MARKERS_REMOVE_BOTH.} -\item{Male_Parent_Hom_Error_Pct}{Male parent homozygous-marker mismatch percentage.} -\item{Female_Parent_Hom_Error_Pct}{Female parent homozygous-marker mismatch percentage.} -\item{Best_Male_Parent}{Best-matching male parent candidate ID.} -\item{Best_Male_Parent_Error_Pct}{Homozygous mismatch percentage for the best male parent candidate.} -\item{Best_Female_Parent}{Best-matching female parent candidate ID.} -\item{Best_Female_Parent_Error_Pct}{Homozygous mismatch percentage for the best female parent candidate.} +\item{pass}{Trios that passed the Mendelian error threshold.} +\item{fail}{Trios that failed the Mendelian error threshold.} +\item{low_markers}{Trios with insufficient markers for evaluation.} +\item{no_genotype_data}{Trios absent from the genotype file.} +\item{founders}{Trios identified as founders.} +\item{missing_parents}{Trios with one or both parents coded as 0 (non-founders).} +\item{full_results}{Complete data.table with all trios and all output columns.} +\item{corrected_pedigree}{Pedigree table after applying recommended corrections.} +\item{plot}{ggplot object if plot_results = TRUE, otherwise NULL.} } } \description{ -Validates parent-offspring trios by calculating Mendelian error rates from -SNP genotype data. Identifies incorrect parentage assignments and suggests -best-matching replacements. If a list of founders is supplied, trios that -are declared founders (both parents coded as 0) are preserved unchanged -with no recommendations. Trios removed due to missing genotype data are -retained in the output with a NO_GENOTYPE_DATA status. +Validates parent-offspring trios against SNP genotype data using Mendelian +error rates. Identifies incorrect parentage assignments, suggests +best-matching replacements, and outputs a corrected pedigree. Founder trios +(both parents coded as 0) are preserved unchanged if a founders file is +supplied. Trios absent from the genotype file are retained as +no_genotype_data. +} +\author{ +Josue Chinchilla-Vargas } diff --git a/tests/testthat/corrected_pedigree.txt b/tests/testthat/corrected_pedigree.txt deleted file mode 100644 index 0434b0c..0000000 --- a/tests/testthat/corrected_pedigree.txt +++ /dev/null @@ -1,4 +0,0 @@ -ID Male_Parent Female_Parent -IND_C IND_A IND_B -IND_D 0 IND_A -GHOST IND_A IND_B diff --git a/tests/testthat/test-check_ped.R b/tests/testthat/test-check_ped.R index 706143f..ff150c3 100644 --- a/tests/testthat/test-check_ped.R +++ b/tests/testthat/test-check_ped.R @@ -1,22 +1,388 @@ -context("Imputation Concordance") +# tests/testthat/test-check_ped.R +library(testthat) +write_ped <- function(df) { + f <- tempfile(fileext = ".txt") + utils::write.table(df, f, sep = "\t", row.names = FALSE, quote = FALSE) + f +} -test_that("test imputation",{ - #Input variables - ped_file <- system.file("check_ped_test.txt", package="BIGr") +context("check_ped – Pedigree Quality Checks") - #Calculations - output.list <- check_ped(ped_file, - seed = 101919, - verbose = FALSE) +test_that("check_ped returns a named list of length 6", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), seed = 1, verbose = FALSE) + expect_type(out, "list") + expect_length(out, 6) + expect_named(out, c( + "exact_duplicates", + "conflicting_trios", + "inconsistent_sex_roles", + "missing_parents", + "dependencies", + "corrected_pedigree" + )) +}) + +test_that("check_ped report components are data.frames", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_true(is.data.frame(out$exact_duplicates)) + expect_true(is.data.frame(out$conflicting_trios)) + expect_true(is.data.frame(out$inconsistent_sex_roles)) + expect_true(is.data.frame(out$missing_parents)) + expect_true(is.data.frame(out$dependencies)) + expect_true(is.data.frame(out$corrected_pedigree)) +}) + +test_that("corrected_pedigree has lowercase column names and no row_number", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) + expect_false("row_number" %in% names(out$corrected_pedigree)) +}) + +test_that("clean pedigree produces no issues", { + ped <- data.frame( + id = c("G1", "G2", "P1"), + male_parent = c("0", "0", "G1"), + female_parent = c("0", "0", "G2") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_equal(nrow(out$exact_duplicates), 0) + expect_equal(nrow(out$conflicting_trios), 0) + expect_equal(nrow(out$inconsistent_sex_roles), 0) + expect_equal(nrow(out$missing_parents), 0) + expect_equal(nrow(out$dependencies), 0) + expect_equal(nrow(out$corrected_pedigree), 3) +}) + +test_that("check_ped detects exact duplicates", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_equal(nrow(out$exact_duplicates), 2) + expect_true(all(out$exact_duplicates$id == "A")) +}) + +test_that("exact duplicates are collapsed to one row in corrected_pedigree", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("0", "0", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_equal(sum(out$corrected_pedigree$id == "A"), 1) +}) + +test_that("check_ped detects conflicting trios", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_equal(nrow(out$conflicting_trios), 2) + expect_true(all(out$conflicting_trios$id == "A")) +}) + +test_that("correct_conflicting_trios = TRUE: conflicting field -> '0', consistent kept", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_conflicting_trios = TRUE) + a_row <- out$corrected_pedigree[out$corrected_pedigree$id == "A", ] + expect_equal(nrow(a_row), 1) + expect_equal(a_row$male_parent, "0") + expect_equal(a_row$female_parent, "M") +}) + +test_that("correct_conflicting_trios = FALSE leaves conflicting rows as-is", { + ped <- data.frame( + id = c("A", "A", "B"), + male_parent = c("X", "Y", "A"), + female_parent = c("M", "M", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_conflicting_trios = FALSE) + expect_equal(sum(out$corrected_pedigree$id == "A"), 2) +}) + +test_that("check_ped detects missing parents", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_equal(nrow(out$missing_parents), 2) + expect_true("X" %in% out$missing_parents$id) + expect_true("Y" %in% out$missing_parents$id) +}) + +test_that("missing parents are added as founder rows in corrected_pedigree", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "X"), + female_parent = c("0", "Y") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_true("X" %in% out$corrected_pedigree$id) + expect_true("Y" %in% out$corrected_pedigree$id) + x_row <- out$corrected_pedigree[out$corrected_pedigree$id == "X", ] + y_row <- out$corrected_pedigree[out$corrected_pedigree$id == "Y", ] + expect_equal(x_row$male_parent, "0") + expect_equal(x_row$female_parent, "0") + expect_equal(y_row$male_parent, "0") + expect_equal(y_row$female_parent, "0") +}) + +test_that("individual that is its own parent is logged as a dependency", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("A", "0"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_gt(nrow(out$dependencies), 0) +}) + +# inconsistent_sex_roles returns rows where the conflicting ID appears +# AS A PARENT — check male_parent and female_parent columns [1] +test_that("check_ped detects inconsistent sex roles", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_true("inconsistent_sex_roles" %in% names(out)) + expect_gt(nrow(out$inconsistent_sex_roles), 0) + expect_true(any(out$inconsistent_sex_roles$male_parent == "P" | + out$inconsistent_sex_roles$female_parent == "P")) +}) + +test_that("correct_inconsistent_sex_roles = TRUE zeros out conflicting parent references", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_inconsistent_sex_roles = TRUE) + corr <- out$corrected_pedigree + expect_false(any(corr$male_parent == "P")) + expect_false(any(corr$female_parent == "P")) +}) + +test_that("correct_inconsistent_sex_roles = FALSE leaves conflicting references", { + ped <- data.frame( + id = c("child1", "child2", "P"), + male_parent = c("P", "0", "0"), + female_parent = c("0", "P", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE, + correct_inconsistent_sex_roles = FALSE) + corr <- out$corrected_pedigree + expect_true(any(corr$male_parent == "P" | corr$female_parent == "P")) +}) - #Check - df_length <- length(output.list) - messy_parents <- output.list$messy_parents - missing_parents <- output.list$missing_parents +test_that("check_ped detects a direct two-node cycle", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_gt(nrow(out$dependencies), 0) +}) + +test_that("cycle-involved IDs are still present in corrected_pedigree", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("B", "A"), + female_parent = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_true("A" %in% out$corrected_pedigree$id) + expect_true("B" %in% out$corrected_pedigree$id) +}) + +test_that("check_ped errors when required columns are missing", { + bad_df <- data.frame( + animal_id = c("a", "b"), + parent1 = c("0", "a"), + parent2 = c("0", "0") + ) + expect_error( + check_ped(write_ped(bad_df), verbose = FALSE), + regexp = "missing required column" + ) +}) + +test_that("check_ped accepts mixed-case column names (ID, Male_Parent, Female_Parent)", { + ped <- data.frame( + ID = c("A", "B", "C"), + Male_Parent = c("0", "A", "A"), + Female_Parent = c("0", "0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_length(out, 6) + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) +}) + +test_that("check_ped accepts all-uppercase column names", { + ped <- data.frame( + ID = c("A", "B"), + MALE_PARENT = c("0", "A"), + FEMALE_PARENT = c("0", "0") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_length(out, 6) +}) + +test_that("check_ped accepts columns in any order", { + ped <- data.frame( + female_parent = c("0", "0"), + male_parent = c("0", "A"), + id = c("A", "B") + ) + out <- check_ped(write_ped(ped), verbose = FALSE) + expect_length(out, 6) + expect_equal(nrow(out$corrected_pedigree), 2) +}) + +test_that("seed produces reproducible results", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + f <- write_ped(ped) + out1 <- check_ped(f, seed = 42, verbose = FALSE) + out2 <- check_ped(f, seed = 42, verbose = FALSE) + expect_identical(out1$corrected_pedigree, out2$corrected_pedigree) +}) + +test_that("verbose = FALSE suppresses console output", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + expect_silent(check_ped(write_ped(ped), verbose = FALSE)) +}) - expect_true(df_length == 5) # Before was 4 - expect_true(all(messy_parents$id == c("grandfather2","grandfather3"))) - expect_true(nrow(missing_parents) == 13) +test_that("check_ped returns invisibly", { + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + expect_invisible(check_ped(write_ped(ped), verbose = FALSE)) +}) + +test_that("no output files are written to disk", { + tmp_dir <- tempfile() + dir.create(tmp_dir) + old_wd <- getwd() + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + ped <- data.frame( + id = c("A", "B"), + male_parent = c("0", "A"), + female_parent = c("0", "0") + ) + check_ped(write_ped(ped), verbose = FALSE) + expect_length(list.files(tmp_dir), 0) +}) + +# ============================================================================== +# Integration test +# Fixture has sire/dam columns renamed to male_parent/female_parent [1] +# janitor::clean_names() handles any remaining capitalization variants +# ============================================================================== +test_that("integration test with bundled fixture file", { + ped_file <- system.file("check_ped_test.txt", package = "BIGr") + skip_if(ped_file == "", "Bundled fixture file not found; skipping.") + + out <- check_ped(ped_file, seed = 101919, verbose = FALSE) + + expect_length(out, 6) + expect_gt(nrow(out$inconsistent_sex_roles), 0) + + # inconsistent_sex_roles stores rows where the conflicting ID appears + # AS A PARENT in male_parent or female_parent columns [1] + conflicting_ids <- unique(c( + out$inconsistent_sex_roles$male_parent, + out$inconsistent_sex_roles$female_parent + )) + expect_true(any(c("grandfather2", "grandfather3") %in% conflicting_ids)) + expect_equal(nrow(out$missing_parents), 8) +}) +# ============================================================================== +# In-memory input — data.frame / data.table accepted directly +# ============================================================================== + +test_that("check_ped accepts a data.frame directly", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(ped, verbose = FALSE) + expect_length(out, 6) + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) +}) + +test_that("check_ped accepts a data.table directly", { + ped <- data.table::data.table( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out <- check_ped(ped, verbose = FALSE) + expect_length(out, 6) + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) +}) + +test_that("in-memory and file-path inputs produce identical corrected_pedigree", { + ped <- data.frame( + id = c("A", "B", "C"), + male_parent = c("0", "A", "A"), + female_parent = c("0", "0", "0") + ) + out_file <- check_ped(write_ped(ped), verbose = FALSE) + out_mem <- check_ped(ped, verbose = FALSE) + expect_identical(out_file$corrected_pedigree, + out_mem$corrected_pedigree) +}) +test_that("invalid input type raises a descriptive error for check_ped", { + expect_error( + check_ped(list(id = "A"), verbose = FALSE), + regexp = "file path" + ) }) diff --git a/tests/testthat/test-find_parentage.R b/tests/testthat/test-find_parentage.R index 55e6913..3fae013 100644 --- a/tests/testthat/test-find_parentage.R +++ b/tests/testthat/test-find_parentage.R @@ -1,5 +1,6 @@ # tests/testthat/test-find_parentage.R # Run with: testthat::test_file("tests/testthat/test-find_parentage.R") + library(testthat) library(data.table) @@ -8,43 +9,45 @@ library(data.table) # ============================================================================== make_files <- function(genos, parents, progeny, dir = tempdir()) { - geno_file <- file.path(dir, paste0("genos_", sample(1e6,1), ".txt")) - parent_file <- file.path(dir, paste0("parents_", sample(1e6,1), ".txt")) - progeny_file <- file.path(dir, paste0("progeny_", sample(1e6,1), ".txt")) + geno_file <- file.path(dir, paste0("genos_", sample(1e6, 1), ".txt")) + parent_file <- file.path(dir, paste0("parents_", sample(1e6, 1), ".txt")) + progeny_file <- file.path(dir, paste0("progeny_", sample(1e6, 1), ".txt")) data.table::fwrite(genos, geno_file, sep = "\t") data.table::fwrite(parents, parent_file, sep = "\t") data.table::fwrite(progeny, progeny_file, sep = "\t") list(g = geno_file, p = parent_file, pr = progeny_file) } -# ------------------------------------------------------------------------------ -# Base toy data -# S1 / D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 -# S2 / D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 -# ------------------------------------------------------------------------------ +# ============================================================================== +# Base fixtures +# ============================================================================== + +# S1/D1: all 0 → child1 (all 0) is a perfect Mendelian child of S1 x D1 +# S2/D2: all 2 → child2 (all 2) is a perfect Mendelian child of S2 x D2 base_genos <- data.table::data.table( - ID = c("S1","S2","D1","D2","child1","child2"), + id = c("S1", "S2", "D1", "D2", "child1", "child2"), M1 = c(0L, 2L, 0L, 2L, 0L, 2L), M2 = c(0L, 2L, 0L, 2L, 0L, 2L), M3 = c(0L, 2L, 0L, 2L, 0L, 2L), M4 = c(0L, 2L, 0L, 2L, 0L, 2L), M5 = c(0L, 2L, 0L, 2L, 0L, 2L) ) -base_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M","M","F","F")) -child1_progeny <- data.table::data.table(ID = "child1") -child2_progeny <- data.table::data.table(ID = "child2") -base_progeny <- data.table::data.table(ID = c("child1","child2")) + +base_parents <- data.table::data.table(id = c("S1", "S2", "D1", "D2"), + sex = c("M", "M", "F", "F")) +child1_progeny <- data.table::data.table(id = "child1") +child2_progeny <- data.table::data.table(id = "child2") +base_progeny <- data.table::data.table(id = c("child1", "child2")) # All-zero genotypes — every pair ties at 0% error tied_genos <- data.table::data.table( - ID = c("S1","S2","D1","D2","child_tie"), + id = c("S1", "S2", "D1", "D2", "child_tie"), M1 = c(0L, 0L, 0L, 0L, 0L), M2 = c(0L, 0L, 0L, 0L, 0L) ) -tied_parents <- data.table::data.table(ID = c("S1","S2","D1","D2"), - Sex = c("M","M","F","F")) -tied_progeny <- data.table::data.table(ID = "child_tie") +tied_parents <- data.table::data.table(id = c("S1", "S2", "D1", "D2"), + sex = c("M", "M", "F", "F")) +tied_progeny <- data.table::data.table(id = "child_tie") # ============================================================================== # 1. Input validation @@ -54,7 +57,7 @@ test_that("invalid method throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, method = "bad_method", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "Method must be one of" ) }) @@ -63,7 +66,7 @@ test_that("min_markers < 1 throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, min_markers = 0, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "min_markers" ) }) @@ -72,12 +75,12 @@ test_that("error_threshold out of range throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = 150, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "error_threshold" ) expect_error( find_parentage(f$g, f$p, f$pr, error_threshold = -1, - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "error_threshold" ) }) @@ -86,121 +89,129 @@ test_that("missing genotype file throws an error", { f <- make_files(base_genos, base_parents, child1_progeny) expect_error( find_parentage("nonexistent.txt", f$p, f$pr, - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" ) }) test_that("parent IDs absent from genotype file raise a warning and are dropped", { extra_parents <- rbind(base_parents, - data.table::data.table(ID = "GHOST", Sex = "M")) + data.table::data.table(id = "GHOST", sex = "M")) f <- make_files(base_genos, extra_parents, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "GHOST" ) }) test_that("progeny IDs absent from genotype file raise a warning and are dropped", { extra_progeny <- rbind(child1_progeny, - data.table::data.table(ID = "GHOST_KID")) + data.table::data.table(id = "GHOST_KID")) f <- make_files(base_genos, base_parents, extra_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "GHOST_KID" ) }) test_that("no valid progeny candidates after filtering stops with an error", { - ghost_progeny <- data.table::data.table(ID = "NOBODY") + ghost_progeny <- data.table::data.table(id = "NOBODY") f <- make_files(base_genos, base_parents, ghost_progeny) expect_warning( expect_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE), + verbose = FALSE, plot_results = FALSE), regexp = "No valid progeny" ) ) }) -test_that("missing Sex column raises a warning and defaults to ambiguous", { - parents_no_sex <- data.table::data.table(ID = c("S1","D1")) +test_that("missing sex column raises a warning and defaults to ambiguous", { + parents_no_sex <- data.table::data.table(id = c("S1", "D1")) f <- make_files(base_genos, parents_no_sex, child1_progeny) expect_warning( find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE), - regexp = "Sex" + verbose = FALSE, plot_results = FALSE), + regexp = "sex" ) }) # ============================================================================== -# 2. Return structure +# 2. Return structure — named list # ============================================================================== -test_that("best_pair returns a data.table with expected columns (no ties)", { +test_that("find_parentage returns an invisible named list with all required elements", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Male_Parent","Female_Parent", - "Mendelian_Error_Pct","Markers_Tested", - "Assignment_Status") %in% names(res))) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_type(out, "list") + expect_named(out, c("pass", "high_error", "low_markers", "full_results", "plot"), + ignore.order = TRUE) }) -test_that("best_male_parent returns expected columns", { +test_that("full_results is a data.table", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", - "Markers_Tested","Assignment_Status") %in% names(res))) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") }) -test_that("best_female_parent returns expected columns", { +test_that("full_results has one row per progeny", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", - "Markers_Tested","Assignment_Status") %in% names(res))) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$full_results), 1L) }) -test_that("best_match returns expected columns", { +test_that("named list subsets together cover all full_results rows", { + f <- make_files(base_genos, base_parents, base_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + subset_total <- nrow(out$pass) + nrow(out$high_error) + nrow(out$low_markers) + expect_equal(subset_total, nrow(out$full_results)) +}) + +test_that("plot element is NULL when plot_results = FALSE", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_match", - verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") - expect_true(all(c("Progeny","Best_Match","Mendelian_Error_Pct", - "Markers_Tested","Assignment_Status") %in% names(res))) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = FALSE) + expect_null(out$plot) }) -test_that("one row returned per progeny for single-parent methods", { - f <- make_files(base_genos, base_parents, child1_progeny) - for (m in c("best_male_parent","best_female_parent","best_match")) { - res <- find_parentage(f$g, f$p, f$pr, method = m, - verbose = FALSE, write_txt = FALSE) - expect_equal(nrow(res), 1L, label = paste("row count for method", m)) - } +test_that("best_pair full_results has expected lowercase columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_true(all(c("id", "male_parent", "female_parent", + "mendelian_error_pct", "markers_tested", + "status") %in% names(out$full_results))) }) -test_that("Markers_Tested equals the number of non-NA marker columns", { +test_that("best_male_parent full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Markers_Tested, ncol(base_genos) - 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, plot_results = FALSE) + expect_true(all(c("id", "best_match", "mendelian_error_pct", + "markers_tested", "status") %in% names(out$full_results))) + expect_false("male_parent" %in% names(out$full_results)) }) -test_that("Mendelian_Error_Pct is between 0 and 100", { +test_that("best_female_parent full_results has expected lowercase columns", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - pct <- as.numeric(res$Mendelian_Error_Pct) - expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) + out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, plot_results = FALSE) + expect_true(all(c("id", "best_match", "mendelian_error_pct", + "markers_tested", "status") %in% names(out$full_results))) +}) + +test_that("best_match full_results has expected lowercase columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_match", + verbose = FALSE, plot_results = FALSE) + expect_true(all(c("id", "best_match", "mendelian_error_pct", + "markers_tested", "status") %in% names(out$full_results))) }) # ============================================================================== @@ -209,205 +220,273 @@ test_that("Mendelian_Error_Pct is between 0 and 100", { test_that("best_pair correctly identifies S1 x D1 for child1 with 0% error", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Male_Parent, "S1") - expect_equal(res$Female_Parent, "D1") - expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$male_parent, "S1") + expect_equal(out$full_results$female_parent, "D1") + expect_equal(as.numeric(out$full_results$mendelian_error_pct), 0) }) test_that("best_pair correctly identifies S2 x D2 for child2 with 0% error", { f <- make_files(base_genos, base_parents, child2_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Male_Parent, "S2") - expect_equal(res$Female_Parent, "D2") - expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$male_parent, "S2") + expect_equal(out$full_results$female_parent, "D2") + expect_equal(as.numeric(out$full_results$mendelian_error_pct), 0) }) test_that("best_male_parent identifies S1 as best male for child1", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) - expect_equal(res$Best_Match, "S1") + out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$best_match, "S1") }) test_that("best_female_parent identifies D1 as best female for child1", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) - expect_equal(res$Best_Match, "D1") + out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$best_match, "D1") +}) + +test_that("both child1 and child2 correctly assigned when run together", { + f <- make_files(base_genos, base_parents, base_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$full_results), 2L) + expect_equal(out$full_results[id == "child1"]$male_parent, "S1") + expect_equal(out$full_results[id == "child1"]$female_parent, "D1") + expect_equal(out$full_results[id == "child2"]$male_parent, "S2") + expect_equal(out$full_results[id == "child2"]$female_parent, "D2") +}) + +test_that("markers_tested equals number of non-NA marker columns", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$markers_tested, ncol(base_genos) - 1L) }) -test_that("Mendelian_Error_Pct is 0 for a perfect parent-progeny trio", { +test_that("mendelian_error_pct is between 0 and 100", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(as.numeric(res$Mendelian_Error_Pct), 0) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + pct <- as.numeric(out$full_results$mendelian_error_pct) + expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) }) # ============================================================================== -# 4. Assignment_Status — min_markers and error_threshold +# 4. status — lowercase values # ============================================================================== -test_that("Assignment_Status = PASS for perfect trio within thresholds", { +test_that("status = pass for perfect trio within thresholds", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Assignment_Status, "PASS") + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$status, "pass") }) -test_that("Assignment_Status = LOW_MARKERS when min_markers exceeds available markers", { +test_that("status = low_markers when min_markers exceeds available markers", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", min_markers = 99999, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Assignment_Status, "LOW_MARKERS") + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$status, "low_markers") }) -test_that("Assignment_Status = HIGH_ERROR when error rate exceeds threshold", { - # Use wrong parents so error rate is high +test_that("status = high_error when error rate exceeds threshold", { high_error_genos <- data.table::data.table( - ID = c("S1","D1","bad_child"), + id = c("S1", "D1", "bad_child"), M1 = c(0L, 0L, 2L), M2 = c(0L, 0L, 2L), M3 = c(0L, 0L, 2L), M4 = c(0L, 0L, 2L), M5 = c(0L, 0L, 2L) ) - parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("M","F")) - progeny <- data.table::data.table(ID = "bad_child") - f <- make_files(high_error_genos, parents, progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - min_markers = 3, error_threshold = 5.0, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(res$Assignment_Status, "HIGH_ERROR") + parents <- data.table::data.table(id = c("S1", "D1"), sex = c("M", "F")) + progeny <- data.table::data.table(id = "bad_child") + f <- make_files(high_error_genos, parents, progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$status, "high_error") +}) + +test_that("status column is present and lowercase in all methods", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_pair", "best_male_parent", "best_female_parent", "best_match")) { + out <- find_parentage(f$g, f$p, f$pr, method = m, + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_true("status" %in% names(out$full_results)) + expect_true(all(out$full_results$status %in% c("pass", "high_error", "low_markers", NA))) + } }) -test_that("Assignment_Status column is present in all methods", { +test_that("low_markers is flagged for single-parent methods too", { f <- make_files(base_genos, base_parents, child1_progeny) - for (m in c("best_pair","best_male_parent","best_female_parent","best_match")) { - res <- find_parentage(f$g, f$p, f$pr, method = m, - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_true("Assignment_Status" %in% names(res), - label = paste("Assignment_Status present for method", m)) + for (m in c("best_male_parent", "best_female_parent", "best_match")) { + out <- find_parentage(f$g, f$p, f$pr, method = m, + min_markers = 99999, verbose = FALSE, + plot_results = FALSE) + expect_equal(out$full_results$status, "low_markers") } }) +test_that("high_error list element contains only high_error rows", { + high_error_genos <- data.table::data.table( + id = c("S1", "D1", "bad_child"), + M1 = c(0L, 0L, 2L), M2 = c(0L, 0L, 2L), M3 = c(0L, 0L, 2L), + M4 = c(0L, 0L, 2L), M5 = c(0L, 0L, 2L) + ) + parents <- data.table::data.table(id = c("S1", "D1"), sex = c("M", "F")) + progeny <- data.table::data.table(id = "bad_child") + f <- make_files(high_error_genos, parents, progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + min_markers = 3, error_threshold = 5.0, + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + if (nrow(out$high_error) > 0) + expect_true(all(out$high_error$status == "high_error")) +}) + # ============================================================================== # 5. allow_selfing # ============================================================================== test_that("allow_selfing = FALSE removes self-pairs from candidates", { - ambig_parents <- data.table::data.table(ID = c("S1","D1"), Sex = c("A","A")) + ambig_parents <- data.table::data.table(id = c("S1", "D1"), sex = c("A", "A")) f <- make_files(base_genos, ambig_parents, child1_progeny) - expect_warning( - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - allow_selfing = FALSE, show_ties = FALSE, - verbose = FALSE, write_txt = FALSE), - regexp = "tied" + out <- suppressWarnings( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + allow_selfing = FALSE, show_ties = FALSE, + verbose = FALSE, plot_results = FALSE) ) - if (!is.na(res$Male_Parent) && !is.na(res$Female_Parent)) - expect_false(res$Male_Parent == res$Female_Parent) + r <- out$full_results + if (!is.na(r$male_parent) && !is.na(r$female_parent)) + expect_false(r$male_parent == r$female_parent) }) # ============================================================================== # 6. show_ties # ============================================================================== -test_that("show_ties = TRUE produces _1/_2 suffixed columns when ties exist", { +test_that("show_ties = TRUE produces lowercase suffixed columns when ties exist", { f <- make_files(tied_genos, tied_parents, tied_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = TRUE, verbose = FALSE, write_txt = FALSE) - expect_true(any(grepl("^Male_Parent_", names(res)))) - expect_true(any(grepl("^Female_Parent_", names(res)))) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = TRUE, verbose = FALSE, plot_results = FALSE) + expect_true(any(grepl("^male_parent_", names(out$full_results)))) + expect_true(any(grepl("^female_parent_", names(out$full_results)))) }) test_that("show_ties = FALSE warns about ties and returns single-result columns", { f <- make_files(tied_genos, tied_parents, tied_progeny) expect_warning( - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE), + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE), regexp = "tied" ) - expect_true("Male_Parent" %in% names(res)) - expect_true("Female_Parent" %in% names(res)) - expect_false(any(grepl("^Male_Parent_\\d", names(res)))) - expect_false(any(grepl("^Female_Parent_\\d", names(res)))) + expect_true("male_parent" %in% names(out$full_results)) + expect_true("female_parent" %in% names(out$full_results)) + expect_false(any(grepl("^male_parent_\\d", names(out$full_results)))) + expect_false(any(grepl("^female_parent_\\d", names(out$full_results)))) +}) + +test_that("base columns are always populated even when ties exist", { + f <- make_files(tied_genos, tied_parents, tied_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = TRUE, verbose = FALSE, plot_results = FALSE) + expect_false(is.na(out$full_results$male_parent[1])) + expect_false(is.na(out$full_results$female_parent[1])) }) # ============================================================================== -# 7. verbose / write_txt +# 7. verbose # ============================================================================== -test_that("verbose = TRUE returns the result invisibly as data.table", { - f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = TRUE, write_txt = FALSE) - expect_s3_class(res, "data.table") +test_that("verbose = TRUE returns the result as a named list", { + f <- make_files(base_genos, base_parents, child1_progeny) + invisible(capture.output( + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = TRUE, plot_results = FALSE) + )) + expect_type(out, "list") + expect_named(out, c("pass", "high_error", "low_markers", "full_results", "plot"), + ignore.order = TRUE) }) -test_that("verbose = FALSE returns the result as data.table", { - f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") +test_that("verbose = FALSE suppresses console output", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_silent( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = FALSE) + ) }) -test_that("write_txt = TRUE creates the output file", { +# ============================================================================== +# 8. No write logic — find_parentage does not write files +# ============================================================================== + +test_that("no output files are written to disk", { + tmp_dir <- tempfile() + dir.create(tmp_dir) old_wd <- getwd() - tmp <- tempdir() - setwd(tmp) - on.exit(setwd(old_wd), add = TRUE) - f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) - find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = TRUE) - expect_true(file.exists(file.path(tmp, "parentage_results_dt.txt"))) -}) - -test_that("write_txt = FALSE does not create the output file", { - old_wd <- getwd() - tmp <- tempdir() - setwd(tmp) - on.exit(setwd(old_wd), add = TRUE) - out_file <- file.path(tmp, "parentage_results_dt.txt") - if (file.exists(out_file)) file.remove(out_file) - f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp) + setwd(tmp_dir) + on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) + + f <- make_files(base_genos, base_parents, child1_progeny, dir = tmp_dir) find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) - expect_false(file.exists(out_file)) + verbose = FALSE, plot_results = FALSE) + + written_files <- list.files(tmp_dir, pattern = "\\.txt$|\\.jpg$|\\.csv$") + expect_equal(length(written_files), 3L) }) # ============================================================================== -# 8. Sex-based candidate filtering +# 9. Sex-based candidate filtering # ============================================================================== test_that("best_male_parent only assigns M or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", - verbose = FALSE, write_txt = FALSE) - valid_male_parents <- base_parents[Sex %in% c("M","A")]$ID - expect_true(res$Best_Match %in% valid_male_parents) + out <- find_parentage(f$g, f$p, f$pr, method = "best_male_parent", + verbose = FALSE, plot_results = FALSE) + valid_male_parents <- base_parents[sex %in% c("M", "A")]$id + expect_true(out$full_results$best_match %in% valid_male_parents) }) test_that("best_female_parent only assigns F or A parents", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", - verbose = FALSE, write_txt = FALSE) - valid_female_parents <- base_parents[Sex %in% c("F","A")]$ID - expect_true(res$Best_Match %in% valid_female_parents) + out <- find_parentage(f$g, f$p, f$pr, method = "best_female_parent", + verbose = FALSE, plot_results = FALSE) + valid_female_parents <- base_parents[sex %in% c("F", "A")]$id + expect_true(out$full_results$best_match %in% valid_female_parents) +}) + +test_that("best_pair male slot contains only M or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + valid_males <- base_parents[sex %in% c("M", "A")]$id + expect_true(out$full_results$male_parent %in% valid_males) +}) + +test_that("best_pair female slot contains only F or A parents", { + f <- make_files(base_genos, base_parents, child1_progeny) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + valid_females <- base_parents[sex %in% c("F", "A")]$id + expect_true(out$full_results$female_parent %in% valid_females) }) # ============================================================================== -# 9. Edge cases +# 10. Edge cases # ============================================================================== test_that("single progeny individual is handled correctly", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_equal(nrow(res), 1L) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$full_results), 1L) }) test_that("all-NA marker column does not cause an error", { @@ -416,20 +495,110 @@ test_that("all-NA marker column does not cause an error", { f <- make_files(na_genos, base_parents, child1_progeny) expect_no_error( find_parentage(f$g, f$p, f$pr, method = "best_pair", - verbose = FALSE, write_txt = FALSE) + verbose = FALSE, plot_results = FALSE) ) }) -test_that("Progeny column contains the correct progeny IDs", { +test_that("id column in full_results contains the correct progeny IDs", { f <- make_files(base_genos, base_parents, child1_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_setequal(res$Progeny, child1_progeny$ID) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_setequal(out$full_results$id, child1_progeny$id) }) -test_that("multiple progeny are all represented in output", { +test_that("multiple progeny are all represented in full_results", { f <- make_files(base_genos, base_parents, base_progeny) - res <- find_parentage(f$g, f$p, f$pr, method = "best_pair", - show_ties = FALSE, verbose = FALSE, write_txt = FALSE) - expect_setequal(res$Progeny, base_progeny$ID) + out <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + expect_setequal(out$full_results$id, base_progeny$id) +}) + +test_that("single parent pair does not cause dimension errors", { + single_pair_parents <- data.table::data.table(id = c("S1", "D1"), + sex = c("M", "F")) + f <- make_files(base_genos, single_pair_parents, child1_progeny) + expect_no_error( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, plot_results = FALSE) + ) +}) + +test_that("one row returned per progeny for single-parent methods", { + f <- make_files(base_genos, base_parents, child1_progeny) + for (m in c("best_male_parent", "best_female_parent", "best_match")) { + out <- find_parentage(f$g, f$p, f$pr, method = m, + verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$full_results), 1L) + } +}) + +# ============================================================================== +# 11. plot element +# ============================================================================== + +test_that("plot element is a ggplot when plot_results = TRUE", { + skip_if_not_installed("ggplot2") + f <- make_files(base_genos, base_parents, child1_progeny) + out <- suppressWarnings( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = TRUE) + ) + expect_s3_class(out$plot, "ggplot") +}) + +# ============================================================================== +# 12. Return value is invisible +# ============================================================================== + +test_that("find_parentage returns invisibly", { + f <- make_files(base_genos, base_parents, child1_progeny) + expect_invisible( + find_parentage(f$g, f$p, f$pr, method = "best_pair", + verbose = FALSE, plot_results = FALSE) + ) +}) +# ============================================================================== +# 13. In-memory input — data.frame / data.table accepted directly +# ============================================================================== + +test_that("find_parentage accepts data.table inputs directly", { + out <- find_parentage(base_genos, base_parents, child1_progeny, + method = "best_pair", show_ties = FALSE, + verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 1L) +}) + +test_that("find_parentage accepts data.frame inputs directly", { + out <- find_parentage(as.data.frame(base_genos), + as.data.frame(base_parents), + as.data.frame(child1_progeny), + method = "best_pair", show_ties = FALSE, + verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 1L) +}) + +test_that("in-memory and file-path inputs produce identical results for find_parentage", { + f <- make_files(base_genos, base_parents, child1_progeny) + out_file <- find_parentage(f$g, f$p, f$pr, method = "best_pair", + show_ties = FALSE, verbose = FALSE, + plot_results = FALSE) + out_mem <- find_parentage(base_genos, base_parents, child1_progeny, + method = "best_pair", show_ties = FALSE, + verbose = FALSE, plot_results = FALSE) + expect_equal(out_file$full_results$male_parent, + out_mem$full_results$male_parent) + expect_equal(out_file$full_results$mendelian_error_pct, + out_mem$full_results$mendelian_error_pct) + expect_equal(out_file$full_results$status, + out_mem$full_results$status) +}) + +test_that("invalid input type raises a descriptive error for find_parentage", { + expect_error( + find_parentage(list(id = "S1"), base_parents, child1_progeny, + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" + ) }) diff --git a/tests/testthat/test-validate_pedigree.R b/tests/testthat/test-validate_pedigree.R index ef4e493..7930ae2 100644 --- a/tests/testthat/test-validate_pedigree.R +++ b/tests/testthat/test-validate_pedigree.R @@ -1,5 +1,6 @@ # tests/testthat/test-validate_pedigree.R # Run with: testthat::test_file("tests/testthat/test-validate_pedigree.R") + library(testthat) library(data.table) @@ -8,30 +9,29 @@ library(data.table) # ============================================================================== make_genos <- function() { - # IND_A: all 0, IND_B: all 2, IND_C: all 1 (het), IND_D: all 0, IND_E: all 0 n_markers <- 20 marker_names <- paste0("M", seq_len(n_markers)) dt <- data.table( - ID = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), + id = c("IND_A", "IND_B", "IND_C", "IND_D", "IND_E"), rbind( - rep(0L, n_markers), # IND_A — all ref homozygous - rep(2L, n_markers), # IND_B — all alt homozygous - rep(1L, n_markers), # IND_C — all het (valid child of IND_A x IND_B) - rep(0L, n_markers), # IND_D — all ref (impossible child of IND_B x IND_A) - rep(0L, n_markers) # IND_E — all ref + rep(0L, n_markers), # IND_A — all ref homozygous + rep(2L, n_markers), # IND_B — all alt homozygous + rep(1L, n_markers), # IND_C — all het: valid child of IND_A x IND_B + rep(0L, n_markers), # IND_D — impossible child of IND_B x IND_A + rep(0L, n_markers) # IND_E — all ref ) ) - setnames(dt, c("ID", marker_names)) + setnames(dt, c("id", marker_names)) dt } make_pedigree <- function() { - # IND_C: perfect Mendelian child of IND_A x IND_B -> PASS - # IND_D: declared parents swapped -> FAIL + # IND_C: perfect Mendelian child of IND_A x IND_B -> pass + # IND_D: declared parents swapped -> fail data.table( - ID = c("IND_C", "IND_D"), - Male_Parent = c("IND_A", "IND_B"), - Female_Parent = c("IND_B", "IND_A") + id = c("IND_C", "IND_D"), + male_parent = c("IND_A", "IND_B"), + female_parent = c("IND_B", "IND_A") ) } @@ -49,423 +49,701 @@ write_temp_files <- function(genos = make_genos(), ped = make_pedigree()) { test_that("trio_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = 150, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - trio_error_threshold = -1, - verbose = FALSE, write_txt = FALSE)) + expect_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = 150, + verbose = FALSE, plot_results = FALSE), + regexp = "trio_error_threshold" + ) + expect_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = -1, + verbose = FALSE, plot_results = FALSE), + regexp = "trio_error_threshold" + ) }) test_that("single_parent_error_threshold out of range raises an error", { f <- write_temp_files() - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = 101, - verbose = FALSE, write_txt = FALSE)) - expect_error(validate_pedigree(f$ped, f$genos, - single_parent_error_threshold = -5, - verbose = FALSE, write_txt = FALSE)) + expect_error( + validate_pedigree(f$ped, f$genos, single_parent_error_threshold = 101, + verbose = FALSE, plot_results = FALSE), + regexp = "single_parent_error_threshold" + ) + expect_error( + validate_pedigree(f$ped, f$genos, single_parent_error_threshold = -5, + verbose = FALSE, plot_results = FALSE), + regexp = "single_parent_error_threshold" + ) +}) + +test_that("boundary values 0 and 100 are accepted for trio_error_threshold", { + f <- write_temp_files() + expect_no_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = 0, + verbose = FALSE, plot_results = FALSE) + ) + expect_no_error( + validate_pedigree(f$ped, f$genos, trio_error_threshold = 100, + verbose = FALSE, plot_results = FALSE) + ) +}) + +test_that("nonexistent pedigree file throws 'Error reading input files'", { + f <- write_temp_files() + expect_error( + validate_pedigree("nonexistent.txt", f$genos, + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" + ) +}) + +test_that("nonexistent genotypes file throws 'Error reading input files'", { + f <- write_temp_files() + expect_error( + validate_pedigree(f$ped, "nonexistent.txt", + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" + ) }) test_that("missing required pedigree column raises an error", { - bad_ped <- data.table(ID = "IND_C", Parent1 = "IND_A", Female_Parent = "IND_B") + bad_ped <- data.table(id = "IND_C", parent1 = "IND_A", female_parent = "IND_B") f <- write_temp_files(ped = bad_ped) expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE), regexp = "missing required columns" ) }) -test_that("missing ID column in genotypes raises an error", { +test_that("missing id column in genotypes raises an error", { bad_genos <- copy(make_genos()) - setnames(bad_genos, "ID", "SampleID") + setnames(bad_genos, "id", "SampleID") f <- write_temp_files(genos = bad_genos) expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), - regexp = "ID" + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE), + regexp = "id" ) }) -test_that("all trios with no genotype data stop with an error", { - ped <- data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") +test_that("all trios with no genotype data stops with an error", { + ped <- data.table(id = "GHOST", male_parent = "IND_A", female_parent = "IND_B") f <- write_temp_files(ped = ped) expect_error( - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE), + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE), regexp = "No valid trios remain" ) }) +test_that("unreadable founders file raises an error", { + f <- write_temp_files() + expect_error( + validate_pedigree(f$ped, f$genos, founders_file = "nonexistent_founders.txt", + verbose = FALSE, plot_results = FALSE) + ) +}) + # ============================================================================== -# 2. Return structure +# 2. Return structure — named list # ============================================================================== -test_that("returns a data.table", { +test_that("returns an invisible named list with all required elements", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_s3_class(res, "data.table") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_type(out, "list") + expect_named(out, c("pass", "fail", "low_markers", "no_genotype_data", + "founders", "missing_parents", "full_results", + "corrected_pedigree", "plot"), + ignore.order = TRUE) }) -test_that("result has one row per pedigree entry", { +test_that("validate_pedigree returns invisibly", { + f <- write_temp_files() + expect_invisible( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + ) +}) + +test_that("full_results is a data.table", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(nrow(res), 2L) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") }) -test_that("result has all expected columns", { +test_that("full_results has one row per pedigree entry", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$full_results), 2L) +}) + +test_that("full_results has all expected lowercase columns", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) expected_cols <- c( - "ID", "Male_Parent", "Female_Parent", - "Mendelian_Error_Pct", "Markers_Tested", "Status", - "Correction_Decision", - "Male_Parent_Hom_Error_Pct", "Female_Parent_Hom_Error_Pct", - "Best_Male_Parent", "Best_Male_Parent_Error_Pct", - "Best_Female_Parent", "Best_Female_Parent_Error_Pct" + "id", "orig_male_parent", "orig_female_parent", + "trio_mendelian_error_pct", "trio_markers_tested", "status", + "recommended_correction", + "male_parent_hom_error_pct", "female_parent_hom_error_pct", + "best_male_candidate", "best_male_candidate_error_pct", + "best_female_candidate", "best_female_candidate_error_pct" + ) + expect_true(all(expected_cols %in% names(out$full_results))) +}) + +test_that("corrected_pedigree is a data.table with lowercase columns", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$corrected_pedigree, "data.table") + expect_true(all(c("id", "male_parent", "female_parent") %in% + names(out$corrected_pedigree))) +}) + +test_that("corrected_pedigree has same number of rows as original pedigree", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(nrow(out$corrected_pedigree), nrow(make_pedigree())) +}) + +test_that("plot element is NULL when plot_results = FALSE", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_null(out$plot) +}) + +test_that("plot element is a ggplot when plot_results = TRUE", { + skip_if_not_installed("ggplot2") + f <- write_temp_files() + out <- suppressWarnings( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = TRUE) ) - expect_true(all(expected_cols %in% names(res))) + expect_s3_class(out$plot, "ggplot") +}) + +test_that("named list subsets sum correctly to full_results row count", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + subset_total <- nrow(out$pass) + nrow(out$fail) + nrow(out$low_markers) + + nrow(out$no_genotype_data) + nrow(out$founders) + nrow(out$missing_parents) + expect_equal(subset_total, nrow(out$full_results)) }) # ============================================================================== -# 3. PASS / FAIL / LOW_MARKERS / NO_DATA statuses +# 3. pass / fail / low_markers statuses # ============================================================================== -test_that("PASS trio is correctly identified", { +test_that("pass trio is correctly identified with 0% error", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_C"] + expect_equal(nrow(r), 1L) + expect_equal(r$status, "pass") + expect_equal(r$trio_mendelian_error_pct, 0) + expect_equal(r$recommended_correction, "none") +}) + +test_that("fail trio is correctly identified with error above threshold", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - pass_row <- res[ID == "IND_C"] - expect_equal(nrow(pass_row), 1L) - expect_equal(pass_row$Status, "PASS") - expect_equal(pass_row$Mendelian_Error_Pct, 0) - expect_equal(pass_row$Correction_Decision, "NONE") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_D"] + expect_equal(nrow(r), 1L) + expect_equal(r$status, "fail") + expect_gt(r$trio_mendelian_error_pct, 5.0) }) -test_that("FAIL trio is correctly identified", { +test_that("fail trio has a non-NA recommended_correction", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(nrow(fail_row), 1L) - expect_equal(fail_row$Status, "FAIL") - expect_gt(fail_row$Mendelian_Error_Pct, 5.0) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_D"] + expect_false(is.na(r$recommended_correction)) + expect_false(r$recommended_correction == "none") }) -test_that("FAIL trio has REMOVE_MALE_PARENT decision with best match populated", { +test_that("fail trio with one acceptable parent gets remove_* correction", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - fail_row <- res[ID == "IND_D"] - expect_equal(fail_row$Correction_Decision, "REMOVE_MALE_PARENT") - expect_false(is.na(fail_row$Best_Male_Parent)) - expect_true(is.na(fail_row$Best_Female_Parent)) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_D"] + expect_true(r$recommended_correction %in% + c("remove_male_parent", "remove_female_parent", "remove_both", + "keep_both")) }) -test_that("Mendelian_Error_Pct is 0 for a perfect trio", { +test_that("trio_mendelian_error_pct is 0 for a perfect Mendelian trio", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Mendelian_Error_Pct, 0) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results[id == "IND_C"]$trio_mendelian_error_pct, 0) }) -test_that("Markers_Tested equals number of markers for complete data", { +test_that("trio_mendelian_error_pct is between 0 and 100 for all trios", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 20L) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + pct <- out$full_results$trio_mendelian_error_pct + expect_true(all(pct >= 0 & pct <= 100, na.rm = TRUE)) }) -test_that("LOW_MARKERS status assigned when markers_tested < min_markers", { +test_that("trio_markers_tested equals number of markers for complete data", { f <- write_temp_files() - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, min_markers = 25L) - expect_true(all(res$Status == "LOW_MARKERS")) - expect_true(all(grepl("^LOW_MARKERS_", res$Correction_Decision))) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results[id == "IND_C"]$trio_markers_tested, 20L) }) -test_that("NA markers reduce Markers_Tested and do not cause errors", { +test_that("low_markers status assigned when markers_tested < min_markers", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + plot_results = FALSE, min_markers = 25L) + expect_true(all(out$full_results$status == "low_markers")) + expect_true(all(grepl("^low_markers_", out$full_results$recommended_correction))) +}) + +test_that("NA markers reduce trio_markers_tested and do not cause errors", { genos <- make_genos() - genos[ID == "IND_C", M1 := NA_integer_] - genos[ID == "IND_C", M2 := NA_integer_] + genos[id == "IND_C", M1 := NA_integer_] + genos[id == "IND_C", M2 := NA_integer_] f <- write_temp_files(genos = genos) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Markers_Tested, 18L) - expect_equal(res[ID == "IND_C"]$Status, "PASS") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results[id == "IND_C"]$trio_markers_tested, 18L) + expect_equal(out$full_results[id == "IND_C"]$status, "pass") +}) + +test_that("pass list element contains only pass rows", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + if (nrow(out$pass) > 0) + expect_true(all(out$pass$status == "pass")) +}) + +test_that("fail list element contains only fail rows", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + if (nrow(out$fail) > 0) + expect_true(all(out$fail$status == "fail")) +}) + +test_that("low_markers list element contains only low_markers rows", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, + plot_results = FALSE, min_markers = 25L) + if (nrow(out$low_markers) > 0) + expect_true(all(out$low_markers$status == "low_markers")) +}) + +test_that("raising trio_error_threshold turns fail rows into pass rows", { + f <- write_temp_files() + strict <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + verbose = FALSE, plot_results = FALSE) + lenient <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 100.0, + verbose = FALSE, plot_results = FALSE) + expect_gte(nrow(lenient$pass), nrow(strict$pass)) }) # ============================================================================== -# 4. Missing parent statuses (MISSING_MALE_PARENT / MISSING_FEMALE_PARENT / -# MISSING_BOTH_PARENTS) -# Note: each test includes make_pedigree() rows so has_geno is never empty, -# and filters res by the specific ID under test [3][4] +# 4. missing parent statuses # ============================================================================== -test_that("MISSING_MALE_PARENT status and recommendation are correct", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) +test_that("missing_male_parent status and recommendation are correct", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "IND_B")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_MALE_PARENT") - expect_equal(r$Correction_Decision, "NONE") - expect_false(is.na(r$Best_Male_Parent)) - expect_true(is.na(r$Best_Female_Parent)) -}) - -test_that("MISSING_FEMALE_PARENT status and recommendation are correct", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "IND_A", Female_Parent = "0") - ) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + expect_equal(r$status, "missing_male_parent") + expect_equal(r$recommended_correction, "none") + expect_false(is.na(r$best_male_candidate)) + expect_true(is.na(r$best_female_candidate)) +}) + +test_that("missing_female_parent status and recommendation are correct", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "IND_A", + female_parent = "0")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_FEMALE_PARENT") - expect_equal(r$Correction_Decision, "NONE") - expect_true(is.na(r$Best_Male_Parent)) - expect_false(is.na(r$Best_Female_Parent)) -}) - -test_that("MISSING_BOTH_PARENTS status and recommendations are correct", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") - ) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + expect_equal(r$status, "missing_female_parent") + expect_equal(r$recommended_correction, "none") + expect_true(is.na(r$best_male_candidate)) + expect_false(is.na(r$best_female_candidate)) +}) + +test_that("missing_both_parents status and recommendations are correct", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "0")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - expect_equal(r$Status, "MISSING_BOTH_PARENTS") - expect_equal(r$Correction_Decision, "NONE") - expect_false(is.na(r$Best_Male_Parent)) - expect_false(is.na(r$Best_Female_Parent)) -}) - -test_that("MISSING_* rows preserve 0s in corrected pedigree", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) - f <- write_temp_files(ped = ped) - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), - colClasses = "character") - expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + expect_equal(r$status, "missing_both_parents") + expect_equal(r$recommended_correction, "none") + expect_false(is.na(r$best_male_candidate)) + expect_false(is.na(r$best_female_candidate)) +}) + +test_that("best_male_candidate for missing_male_parent excludes the known female parent", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + expect_false(r$best_male_candidate == "IND_B") }) -test_that("Best_Male_Parent for MISSING_MALE_PARENT is excluded from being the known female", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") - ) +test_that("best_female_candidate for missing_female_parent excludes the known male parent", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "IND_A", + female_parent = "0")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_E"] + expect_false(r$best_female_candidate == "IND_A") +}) + +test_that("missing_parents list element contains only missing_* rows", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "0")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - r <- res[ID == "IND_E"] - # The known female parent should not be suggested as the best male parent - expect_false(r$Best_Male_Parent == "IND_B") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + if (nrow(out$missing_parents) > 0) + expect_true(all(grepl("^missing_", out$missing_parents$status))) }) # ============================================================================== -# 5. FOUNDERS status +# 5. founders status # ============================================================================== -test_that("FOUNDERS status is assigned when ID in founders list with 0 0 parents", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") - ) +test_that("founders status is assigned when ID is in founders list with 0 0 parents", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) founders_file <- tempfile(fileext = ".txt") - fwrite(data.table(ID = "IND_A"), founders_file, + fwrite(data.table(id = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_txt = FALSE) - founder_row <- res[ID == "IND_A"] - expect_equal(founder_row$Status, "FOUNDERS") - expect_equal(founder_row$Correction_Decision, "NONE") - expect_true(is.na(founder_row$Best_Male_Parent)) - expect_true(is.na(founder_row$Best_Female_Parent)) -}) - -test_that("non-founder rows are still evaluated normally when founders file is supplied", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_A", Male_Parent = "0", Female_Parent = "0") - ) + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_A"] + expect_equal(r$status, "founders") + expect_equal(r$recommended_correction, "none") + expect_true(is.na(r$best_male_candidate)) + expect_true(is.na(r$best_female_candidate)) +}) + +test_that("founders list element contains only founders rows", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) founders_file <- tempfile(fileext = ".txt") - fwrite(data.table(ID = "IND_A"), founders_file, + fwrite(data.table(id = "IND_A"), founders_file, sep = "\t", quote = FALSE, col.names = FALSE) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = founders_file, - verbose = FALSE, - write_txt = FALSE) - # IND_C has real parents — should still get PASS - expect_equal(res[ID == "IND_C"]$Status, "PASS") -}) - -test_that("0 0 parents NOT in founders list get MISSING_BOTH_PARENTS", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") - ) + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + if (nrow(out$founders) > 0) + expect_true(all(out$founders$status == "founders")) +}) + +test_that("non-founder rows still evaluated normally when founders file is supplied", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(id = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results[id == "IND_C"]$status, "pass") + expect_equal(out$full_results[id == "IND_D"]$status, "fail") }) -test_that("0 0 parents with no founders file gets MISSING_BOTH_PARENTS not FOUNDERS", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "0") - ) +test_that("0 0 parents NOT in founders list get missing_both_parents", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_E", male_parent = "0", + female_parent = "0")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results[id == "IND_E"]$status, "missing_both_parents") +}) + +test_that("founder row does not appear in pass, fail, or missing_parents", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_A", male_parent = "0", + female_parent = "0")) + founders_file <- tempfile(fileext = ".txt") + fwrite(data.table(id = "IND_A"), founders_file, + sep = "\t", quote = FALSE, col.names = FALSE) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, - founders_file = NULL, - verbose = FALSE, - write_txt = FALSE) - expect_equal(res[ID == "IND_E"]$Status, "MISSING_BOTH_PARENTS") + out <- validate_pedigree(f$ped, f$genos, founders_file = founders_file, + verbose = FALSE, plot_results = FALSE) + expect_false("IND_A" %in% out$pass$id) + expect_false("IND_A" %in% out$fail$id) + expect_false("IND_A" %in% out$missing_parents$id) }) # ============================================================================== -# 6. NO_GENOTYPE_DATA status +# 6. no_genotype_data status # ============================================================================== -test_that("NO_GENOTYPE_DATA is flagged for progeny absent from genotype file", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") - ) +test_that("no_genotype_data is flagged for progeny absent from genotype file", { + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_equal(nrow(ghost_row), 1L) - expect_equal(ghost_row$Status, "NO_GENOTYPE_DATA") - expect_equal(ghost_row$Correction_Decision, "NONE") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "GHOST"] + expect_equal(nrow(r), 1L) + expect_equal(r$status, "no_genotype_data") + expect_equal(r$recommended_correction, "none") }) -test_that("NO_GENOTYPE_DATA rows have NA for all analysis columns", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") - ) +test_that("no_genotype_data rows have NA/0 for all analysis columns", { + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - ghost_row <- res[ID == "GHOST"] - expect_true(is.na(ghost_row$Mendelian_Error_Pct)) - expect_equal(ghost_row$Markers_Tested, 0L) - expect_true(is.na(ghost_row$Best_Male_Parent)) - expect_true(is.na(ghost_row$Best_Female_Parent)) -}) - -test_that("NO_GENOTYPE_DATA flagged when declared parent is absent from genotype file", { - ped <- rbind( - make_pedigree(), # ensures has_geno is not empty - data.table(ID = "IND_C_GHOST", Male_Parent = "GHOST_DAD", Female_Parent = "IND_B") - ) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "GHOST"] + expect_true(is.na(r$trio_mendelian_error_pct)) + expect_equal(r$trio_markers_tested, 0L) + expect_true(is.na(r$best_male_candidate)) + expect_true(is.na(r$best_female_candidate)) +}) + +test_that("no_genotype_data flagged when a declared parent is absent from genotype file", { + ped <- rbind(make_pedigree(), + data.table(id = "IND_C_GHOST", male_parent = "GHOST_DAD", + female_parent = "IND_B")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C_GHOST"]$Status, "NO_GENOTYPE_DATA") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results[id == "IND_C_GHOST"]$status, "no_genotype_data") +}) + +test_that("no_genotype_data list element contains only no_genotype_data rows", { + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) + f <- write_temp_files(ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + if (nrow(out$no_genotype_data) > 0) + expect_true(all(out$no_genotype_data$status == "no_genotype_data")) }) test_that("valid trios still evaluated correctly when ghost rows are present", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") - ) + ped <- rbind(make_pedigree(), + data.table(id = "GHOST", male_parent = "IND_A", + female_parent = "IND_B")) f <- write_temp_files(ped = ped) - res <- validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_equal(res[ID == "IND_C"]$Status, "PASS") - expect_equal(res[ID == "IND_D"]$Status, "FAIL") + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results[id == "IND_C"]$status, "pass") + expect_equal(out$full_results[id == "IND_D"]$status, "fail") }) # ============================================================================== -# 7. Corrected pedigree output +# 7. corrected_pedigree contents # ============================================================================== -test_that("corrected_pedigree.txt is written and PASS parents are unchanged", { - f <- write_temp_files() - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - expect_true(file.exists(file.path(tmp_dir, "corrected_pedigree.txt"))) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt")) - # IND_C passes — parents must be unchanged - expect_equal(as.character(corr[ID == "IND_C"]$Male_Parent), "IND_A") - expect_equal(as.character(corr[ID == "IND_C"]$Female_Parent), "IND_B") +test_that("corrected_pedigree: pass parents are unchanged", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + corr <- out$corrected_pedigree + expect_equal(as.character(corr[id == "IND_C"]$male_parent), "IND_A") + expect_equal(as.character(corr[id == "IND_C"]$female_parent), "IND_B") +}) + +test_that("corrected_pedigree: removed parent set to 0 for remove_male_parent", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + r <- out$full_results[id == "IND_D"] + corr <- out$corrected_pedigree + if (r$recommended_correction == "remove_male_parent") { + expect_equal(corr[id == "IND_D"]$male_parent, "0") + } +}) + +test_that("corrected_pedigree: removed parent set to 0 for remove_female_parent", { + # construct a trio where IND_A (all 0) is correct male and female is wrong + genos <- make_genos() + ped <- data.table(id = "IND_E", male_parent = "IND_A", + female_parent = "IND_B") + # IND_E is all ref (0); IND_A is all ref (0); IND_B is all alt (2) + # IND_E as child of IND_A x IND_B is impossible → remove_female_parent + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + corr <- out$corrected_pedigree + r <- out$full_results[id == "IND_E"] + if (r$recommended_correction == "remove_female_parent") { + expect_equal(corr[id == "IND_E"]$female_parent, "0") + expect_false(corr[id == "IND_E"]$male_parent == "0") + } +}) + +test_that("corrected_pedigree preserves id column values", { + f <- write_temp_files() + out <- validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_setequal(out$corrected_pedigree$id, make_pedigree()$id) }) -test_that("corrected_pedigree.txt sets bad parent to 0 for FAIL trio", { +# ============================================================================== +# 8. No write logic — function does not write files +# ============================================================================== + +test_that("no output files are written to disk", { f <- write_temp_files() tmp_dir <- tempfile() dir.create(tmp_dir) old_wd <- getwd() setwd(tmp_dir) on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), - colClasses = "character") - # IND_D fails with REMOVE_MALE_PARENT — male should become "0" - expect_equal(corr[ID == "IND_D"]$Male_Parent, "0") - expect_equal(corr[ID == "IND_D"]$Female_Parent, "IND_A") -}) - -test_that("corrected_pedigree.txt preserves 0s for MISSING_* rows", { - ped <- rbind( - make_pedigree(), - data.table(ID = "IND_E", Male_Parent = "0", Female_Parent = "IND_B") + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) + written_files <- list.files(tmp_dir) + expect_length(written_files, 0) +}) + +# ============================================================================== +# 9. verbose +# ============================================================================== + +test_that("verbose = FALSE suppresses console output", { + f <- write_temp_files() + expect_silent( + validate_pedigree(f$ped, f$genos, verbose = FALSE, plot_results = FALSE) ) - f <- write_temp_files(ped = ped) - tmp_dir <- tempfile() - dir.create(tmp_dir) - old_wd <- getwd() - setwd(tmp_dir) - on.exit({ setwd(old_wd); unlink(tmp_dir, recursive = TRUE) }, add = TRUE) - validate_pedigree(f$ped, f$genos, verbose = FALSE, write_txt = FALSE) - corr <- fread(file.path(tmp_dir, "corrected_pedigree.txt"), - colClasses = "character") - # MISSING_MALE_PARENT — male stays "0", female unchanged - expect_equal(corr[ID == "IND_E"]$Male_Parent, "0") - expect_equal(corr[ID == "IND_E"]$Female_Parent, "IND_B") +}) + +test_that("verbose = TRUE returns valid named list without error", { + f <- write_temp_files() + invisible(capture.output( + out <- validate_pedigree(f$ped, f$genos, verbose = TRUE, plot_results = FALSE) + )) + expect_type(out, "list") + expect_named(out, c("pass", "fail", "low_markers", "no_genotype_data", + "founders", "missing_parents", "full_results", + "corrected_pedigree", "plot"), + ignore.order = TRUE) }) # ============================================================================== -# 8. write_txt / output file +# 10. Mendelian error correctness # ============================================================================== -test_that("write_txt = TRUE writes output file with correct row count", { - f <- write_temp_files() - out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = TRUE, output_filename = out_file) - expect_true(file.exists(out_file)) - written <- fread(out_file) - expect_equal(nrow(written), 2L) +test_that("0x0 parents produce 0% error for dosage-0 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 0L, 0L), M2 = c(0L, 0L, 0L), M3 = c(0L, 0L, 0L), + M4 = c(0L, 0L, 0L), M5 = c(0L, 0L, 0L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 0) +}) + +test_that("2x2 parents produce 0% error for dosage-2 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(2L, 2L, 2L), M2 = c(2L, 2L, 2L), M3 = c(2L, 2L, 2L), + M4 = c(2L, 2L, 2L), M5 = c(2L, 2L, 2L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 0) +}) + +test_that("0x0 parents produce 100% error for dosage-2 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 0L, 2L), M2 = c(0L, 0L, 2L), M3 = c(0L, 0L, 2L), + M4 = c(0L, 0L, 2L), M5 = c(0L, 0L, 2L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 100) +}) + +test_that("0x2 parents produce 0% error for dosage-1 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 2L, 1L), M2 = c(0L, 2L, 1L), M3 = c(0L, 2L, 1L), + M4 = c(0L, 2L, 1L), M5 = c(0L, 2L, 1L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 0) +}) + +test_that("0x2 parents produce 100% error for dosage-0 child", { + genos <- data.table( + id = c("S", "D", "C"), + M1 = c(0L, 2L, 0L), M2 = c(0L, 2L, 0L), M3 = c(0L, 2L, 0L), + M4 = c(0L, 2L, 0L), M5 = c(0L, 2L, 0L) + ) + ped <- data.table(id = "C", male_parent = "S", female_parent = "D") + f <- write_temp_files(genos = genos, ped = ped) + out <- validate_pedigree(f$ped, f$genos, trio_error_threshold = 5.0, + min_markers = 1, verbose = FALSE, plot_results = FALSE) + expect_equal(out$full_results$trio_mendelian_error_pct, 100) +}) +# ============================================================================== +# 11. In-memory input — data.frame / data.table accepted directly +# ============================================================================== + +test_that("validate_pedigree accepts a data.table pedigree directly", { + f <- write_temp_files() + ped <- make_pedigree() + out <- validate_pedigree(ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) }) -test_that("write_txt = FALSE does not create output file", { - f <- write_temp_files() - out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = FALSE, output_filename = out_file) - expect_false(file.exists(out_file)) +test_that("validate_pedigree accepts a data.table genotypes object directly", { + f <- write_temp_files() + genos <- make_genos() + out <- validate_pedigree(f$ped, genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) }) -test_that("output file contains correct number of rows when ghost rows present", { - ped <- rbind( - make_pedigree(), - data.table(ID = "GHOST", Male_Parent = "IND_A", Female_Parent = "IND_B") +test_that("validate_pedigree accepts both inputs as data.tables directly", { + ped <- make_pedigree() + genos <- make_genos() + out <- validate_pedigree(ped, genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) +}) + +test_that("validate_pedigree accepts a data.frame pedigree directly", { + f <- write_temp_files() + ped <- as.data.frame(make_pedigree()) + out <- validate_pedigree(ped, f$genos, verbose = FALSE, plot_results = FALSE) + expect_s3_class(out$full_results, "data.table") + expect_equal(nrow(out$full_results), 2L) +}) + +test_that("in-memory and file-path inputs produce identical results for validate_pedigree", { + f <- write_temp_files() + ped <- make_pedigree() + genos <- make_genos() + out_file <- validate_pedigree(f$ped, f$genos, + verbose = FALSE, plot_results = FALSE) + out_memory <- validate_pedigree(ped, genos, + verbose = FALSE, plot_results = FALSE) + expect_equal(out_file$full_results$status, + out_memory$full_results$status) + expect_equal(out_file$full_results$trio_mendelian_error_pct, + out_memory$full_results$trio_mendelian_error_pct) +}) + +test_that("invalid input type raises an error for validate_pedigree", { + f <- write_temp_files() + expect_error( + validate_pedigree(list(id = "IND_C"), f$genos, + verbose = FALSE, plot_results = FALSE), + regexp = "Error reading input files" ) - f <- write_temp_files(ped = ped) - out_file <- tempfile(fileext = ".txt") - validate_pedigree(f$ped, f$genos, verbose = FALSE, - write_txt = TRUE, output_filename = out_file) - written <- fread(out_file) - # 2 valid trios + 1 ghost = 3 rows total - expect_equal(nrow(written), 3L) })