From c086bd76e6a88c8072520a2c8d4e70cf7bb0407a Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Sat, 25 Apr 2026 17:08:20 -0700 Subject: [PATCH 1/2] add name check --- NAMESPACE | 1 + R/LD.R | 35 ++-------- R/file_utils.R | 112 +++++++++++++++++++++++-------- tests/testthat/test_file_utils.R | 10 +++ 4 files changed, 102 insertions(+), 56 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 65bc5db7..50241c8e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -198,6 +198,7 @@ importFrom(stringr,str_detect) importFrom(stringr,str_remove) importFrom(stringr,str_replace) importFrom(stringr,str_split) +importFrom(stringr,str_split_fixed) importFrom(susieR,coef.susie) importFrom(susieR,get_cs_correlation) importFrom(susieR,mr.ash) diff --git a/R/LD.R b/R/LD.R index d1a608dd..753eea02 100644 --- a/R/LD.R +++ b/R/LD.R @@ -143,38 +143,17 @@ process_LD_matrix <- function(LD_file_path, snp_file_path = NULL) { snp_file_path <- found[1] } - # Detect format by extension or header - is_pvar <- grepl("\\.(pvar|pvar\\.zst)$", snp_file_path) - if (!is_pvar) { - # Check if first line starts with #CHROM (pvar header) - first_line <- readLines(snp_file_path, n = 1) - is_pvar <- grepl("^#CHROM", first_line) - } - + LD_variants <- read_variant_metadata(snp_file_path) + is_pvar <- !("gpos" %in% names(LD_variants)) + LD_variants <- LD_variants %>% + mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))), + variants = normalize_variant_id(id)) if (is_pvar) { - # PLINK2 .pvar format: read via existing read_pvar() - LD_variants <- read_pvar(snp_file_path) - # read_pvar returns: chrom, id, pos, A2 (REF), A1 (ALT) - LD_variants <- LD_variants %>% - mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))), - variants = normalize_variant_id(id)) %>% - rename(GD = pos) # reuse pos column for ordering + LD_variants <- rename(LD_variants, GD = pos) LD_variants$GD <- LD_variants$pos <- as.integer( sapply(LD_variants$variants, function(v) strsplit(v, ":")[[1]][2])) } else { - # PLINK1 .bim format - LD_variants <- read.table(snp_file_path) - col_names <- if (ncol(LD_variants) == 9) { - c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss") - } else if (ncol(LD_variants) == 6) { - c("chrom", "variants", "GD", "pos", "A1", "A2") - } else { - stop("Unexpected number of columns (", ncol(LD_variants), ") in bim file: ", snp_file_path) - } - LD_variants <- LD_variants %>% - setNames(col_names) %>% - mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))), - variants = normalize_variant_id(variants)) + LD_variants <- rename(LD_variants, GD = gpos) } # Label and symmetrize the matrix diff --git a/R/file_utils.R b/R/file_utils.R index 59ce7162..92414ad7 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -182,7 +182,7 @@ load_plink2_data <- function(prefix, region = NULL, keep_indel = TRUE, keep_vari # --- Read samples from .psam --- psam <- as.data.frame(vroom(paths$psam, delim = "\t", show_col_types = FALSE)) - colnames(psam)[1:2] <- c("FID", "IID") + names(psam) <- sub("^#", "", names(psam)) # --- Read genotype dosage via pgenlibr --- pgen <- pgenlibr::NewPgen(paths$pgen) @@ -293,6 +293,41 @@ read_pvar <- function(pvar_path) { ) } +#' Read variant metadata from either .bim or .pvar/.pvar.zst file. +#' +#' Auto-detects the format by extension and header, then returns a +#' standardized data.frame. For PLINK1 .bim files, assigns column names +#' based on the number of columns (6 or 9). For PLINK2 .pvar files, +#' delegates to \code{read_pvar()}. +#' +#' @param snp_file_path Path to .bim, .pvar, or .pvar.zst file. +#' @return data.frame with at minimum columns: chrom, id, pos, A2, A1. +#' Extended .bim files (9 columns) also include: variance, allele_freq, n_nomiss. +#' @importFrom utils read.table +#' @noRd +read_variant_metadata <- function(snp_file_path) { + is_pvar <- grepl("\\.(pvar|pvar\\.zst)$", snp_file_path) + if (!is_pvar) { + first_line <- readLines(snp_file_path, n = 1) + is_pvar <- grepl("^#CHROM", first_line) + } + + if (is_pvar) { + read_pvar(snp_file_path) + } else { + df <- read.table(snp_file_path, stringsAsFactors = FALSE) + n <- ncol(df) + if (n == 6) { + names(df) <- c("chrom", "id", "gpos", "pos", "A1", "A2") + } else if (n == 9) { + names(df) <- c("chrom", "id", "gpos", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss") + } else { + stop("Unexpected number of columns (", n, ") in variant file: ", snp_file_path) + } + df + } +} + #' Get variant information from any LD reference source. #' #' Auto-detects the source type (PLINK2, PLINK1, VCF, GDS, or pre-computed @@ -341,17 +376,18 @@ get_ref_variant_info <- function(source, region = NULL) { info$allele_freq <- colMeans(result$X, na.rm = TRUE) / 2 return(info) # Already region-filtered by the loader } else { - # Pre-computed LD: read bim files via metadata + # Pre-computed LD: read bim/pvar files via metadata bim_paths <- get_regional_ld_meta(resolved$meta_path, region)$intersections$bim_file_paths info <- do.call(rbind, lapply(bim_paths, function(path) { - df <- as.data.frame(vroom(path, col_names = FALSE, show_col_types = FALSE)) + df <- read_variant_metadata(path) out <- data.frame( - chrom = df$X1, id = df$X2, pos = df$X4, - A2 = df$X5, A1 = df$X6, + chrom = df$chrom, id = df$id, pos = df$pos, + A2 = df$A2, A1 = df$A1, stringsAsFactors = FALSE ) - if (ncol(df) >= 8) { out$variance <- df$X7; out$allele_freq <- df$X8 } - if (ncol(df) >= 9) { out$n_nomiss <- df$X9 } + if ("variance" %in% names(df)) out$variance <- df$variance + if ("allele_freq" %in% names(df)) out$allele_freq <- df$allele_freq + if ("n_nomiss" %in% names(df)) out$n_nomiss <- df$n_nomiss out })) info$id <- normalize_variant_id(info$id) @@ -384,13 +420,23 @@ get_ref_variant_info <- function(source, region = NULL) { #' @noRd match_variants_to_keep <- function(variant_info, keep_variants_path) { keep_raw <- as.data.frame(vroom(keep_variants_path, show_col_types = FALSE)) - # parse_variant_id handles chr prefix, underscore/colon formats, build suffixes, - # and returns data.frame with integer chrom/pos and character A2/A1 - keep_variants <- parse_variant_id( - if ("chrom" %in% names(keep_raw) & "pos" %in% names(keep_raw)) keep_raw else keep_raw[[1]] - ) + if ("chrom" %in% names(keep_raw) & "pos" %in% names(keep_raw)) { + keep_variants <- parse_variant_id(keep_raw) + } else if (ncol(keep_raw) == 1) { + keep_variants <- parse_variant_id(keep_raw[[1]]) + } else { + stop("keep_variants file must contain either: (1) a single column of variant IDs, ", + "or (2) columns named 'chrom' and 'pos' (optionally 'A1' and 'A2').") + } vi_chrom <- as.integer(strip_chr_prefix(variant_info$chrom)) - paste0(vi_chrom, ":", variant_info$pos) %in% paste0(keep_variants$chrom, ":", keep_variants$pos) + has_alleles <- !any(is.na(keep_variants$A1)) && !any(is.na(keep_variants$A2)) + if (has_alleles) { + paste0(vi_chrom, ":", variant_info$pos, ":", variant_info$A2, ":", variant_info$A1) %in% + paste0(keep_variants$chrom, ":", keep_variants$pos, ":", keep_variants$A2, ":", keep_variants$A1) + } else { + paste0(vi_chrom, ":", variant_info$pos) %in% + paste0(keep_variants$chrom, ":", keep_variants$pos) + } } #' @importFrom vroom vroom @@ -511,12 +557,11 @@ load_plink1_data <- function(prefix, region = NULL, keep_indel = TRUE, keep_vari # --- Region filter via bim --- if (!is.null(region)) { parsed <- parse_region(region) - col_types <- list(col_character(), col_character(), col_guess(), col_integer(), col_guess(), col_guess()) - bim_data <- vroom(bim_file, col_names = FALSE, col_types = col_types) - bim_data$X1 <- strip_chr_prefix(bim_data$X1) - snp_ids <- bim_data$X2[bim_data$X1 == parsed$chrom & - bim_data$X4 >= parsed$start & - bim_data$X4 <= parsed$end] + bim_data <- read_bim(bed_file) + bim_data$chrom <- strip_chr_prefix(bim_data$chrom) + snp_ids <- bim_data$id[bim_data$chrom == parsed$chrom & + bim_data$pos >= parsed$start & + bim_data$pos <= parsed$end] if (length(snp_ids) == 0) { stop(NoSNPsError(paste("No SNPs found in the specified region", region))) } @@ -807,6 +852,18 @@ load_genotype_region <- function(genotype, region = NULL, keep_indel = TRUE, #' @importFrom readr read_delim cols #' @importFrom dplyr select mutate across everything #' @importFrom magrittr %>% +#' @noRd +read_single_covariate <- function(path) { + df <- read_delim(path, "\t", col_types = cols()) %>% select(-1) + non_numeric <- names(df)[!sapply(df, is.numeric)] + if (length(non_numeric) > 0) { + stop("Non-numeric columns found in covariate file ", path, ": ", + paste(non_numeric, collapse = ", "), + ". All columns except the first (sample ID) must be numeric.") + } + df %>% mutate(across(everything(), as.numeric)) %>% t() +} + #' @noRd load_covariate_data <- function(covariate_path) { # Validate all covariate files exist @@ -814,10 +871,7 @@ load_covariate_data <- function(covariate_path) { if (length(missing) > 0) { stop("Covariate file(s) not found: ", paste(missing, collapse = ", ")) } - return(map(covariate_path, ~ read_delim(.x, "\t", col_types = cols()) %>% - select(-1) %>% - mutate(across(everything(), as.numeric)) %>% - t())) + return(map(covariate_path, read_single_covariate)) } NoPhenotypeError <- function(message) { @@ -945,6 +999,7 @@ prepare_data_list <- function(geno_bed, phenotype, covariate, imiss_cutoff, maf_ #' @importFrom purrr map #' @importFrom dplyr intersect +#' @importFrom stringr str_split_fixed #' @importFrom magrittr %>% #' @noRd prepare_X_matrix <- function(geno_bed, data_list, imiss_cutoff, maf_cutoff, mac_cutoff, xvar_cutoff) { @@ -963,7 +1018,7 @@ prepare_X_matrix <- function(geno_bed, data_list, imiss_cutoff, maf_cutoff, mac_ colnames(X_filtered) <- normalize_variant_id(colnames(X_filtered)) # To keep a log message - variants <- as.data.frame(do.call(rbind, lapply(colnames(X_filtered), function(x) strsplit(x, ":")[[1]][1:2])), stringsAsFactors = FALSE) + variants <- str_split_fixed(colnames(X_filtered), ":", 3) message(paste0("Dimension of input genotype data is ", nrow(X_filtered), " rows and ", ncol(X_filtered), " columns for genomic region of ", variants[1, 1], ":", min(as.integer(variants[, 2])), "-", max(as.integer(variants[, 2])))) return(X_filtered) } @@ -1104,7 +1159,7 @@ load_regional_association_data <- function(genotype, # PLINK file data_list <- add_X_residuals(data_list, scale_residuals) # Get X matrix for union of samples X <- prepare_X_matrix(geno, data_list, imiss_cutoff, maf_cutoff, mac_cutoff, xvar_cutoff) - region <- if (!is.null(region)) unlist(strsplit(region, ":", fixed = TRUE)) + parsed_region <- if (!is.null(region)) parse_region(region) else NULL ## residual_Y: a list of y either vector or matrix (CpG for example), and they need to match with residual_X in terms of which samples are missing. ## residual_X: is a list of R conditions each is a matrix, with list names being the names of conditions, column names being SNP names and row names being sample names. ## X: is the somewhat original genotype matrix output from `filter_X`, with column names being SNP names and row names being sample names. Sample names of X should match example sample names of residual_Y matrix form (not list); but the matrices inside residual_X would be subsets of sample name of residual_Y matrix form (not list). @@ -1119,8 +1174,8 @@ load_regional_association_data <- function(genotype, # PLINK file X_data = data_list$X, X = X, maf = maf_list, - chrom = region[1], - grange = if (!is.null(region)) unlist(strsplit(region[2], "-", fixed = TRUE)) else NULL, + chrom = if (!is.null(parsed_region)) paste0("chr", parsed_region$chrom) else NULL, + grange = if (!is.null(parsed_region)) as.character(c(parsed_region$start, parsed_region$end)) else NULL, Y_coordinates = if (!is.null(region)) extract_phenotype_coordinates(pheno) else NULL )) } @@ -1693,7 +1748,8 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for conditions <- conditions_list_individual[pos] dat <- load_regional_univariate_data( genotype = genotype, phenotype = phenotype, - covariate = covariate, region = region, + covariate = covariate, + region = region, association_window = association_window, conditions = conditions, xvar_cutoff = xvar_cutoff, maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff, diff --git a/tests/testthat/test_file_utils.R b/tests/testthat/test_file_utils.R index c5a21654..6ca8f44f 100644 --- a/tests/testthat/test_file_utils.R +++ b/tests/testthat/test_file_utils.R @@ -355,6 +355,16 @@ test_that("Test load_covariate_data reads tab-delimited file", { file.remove(tmp) }) +test_that("load_covariate_data errors on non-numeric columns", { + tmp <- tempfile(fileext = ".tsv") + writeLines(c("SampleID\tPC1\tLabel", "S1\t0.1\tabc", "S2\t0.3\tdef"), tmp) + expect_error( + load_covariate_data(tmp), + "Non-numeric columns found in covariate file.*Label.*must be numeric" + ) + file.remove(tmp) +}) + test_that("load_covariate_data errors on missing file", { expect_error( load_covariate_data("/nonexistent/covar.tsv"), From ad5daeb4b8781c5c8f3e3c12869ff5da42946afb Mon Sep 17 00:00:00 2001 From: Daniel Nachun Date: Mon, 27 Apr 2026 11:48:13 -0700 Subject: [PATCH 2/2] update some tests --- NAMESPACE | 1 + R/file_utils.R | 18 ++++--- tests/testthat/test_file_utils.R | 82 ++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 50241c8e..18fb675b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -181,6 +181,7 @@ importFrom(readr,col_integer) importFrom(readr,cols) importFrom(readr,parse_number) importFrom(readr,read_delim) +importFrom(readr,read_lines) importFrom(rlang,"!!!") importFrom(stats,as.dist) importFrom(stats,coef) diff --git a/R/file_utils.R b/R/file_utils.R index 92414ad7..42d47cd0 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -417,19 +417,23 @@ get_ref_variant_info <- function(source, region = NULL) { #' Match variant_info against a whitelist file, returning logical index. #' Uses parse_variant_id() from misc.R to handle all variant ID formats. #' @importFrom vroom vroom +#' @importFrom readr read_lines #' @noRd match_variants_to_keep <- function(variant_info, keep_variants_path) { - keep_raw <- as.data.frame(vroom(keep_variants_path, show_col_types = FALSE)) - if ("chrom" %in% names(keep_raw) & "pos" %in% names(keep_raw)) { + keep_raw <- tryCatch( + as.data.frame(vroom(keep_variants_path, show_col_types = FALSE)), + error = function(e) NULL + ) + if (!is.null(keep_raw) && "chrom" %in% names(keep_raw) && "pos" %in% names(keep_raw)) { keep_variants <- parse_variant_id(keep_raw) - } else if (ncol(keep_raw) == 1) { - keep_variants <- parse_variant_id(keep_raw[[1]]) } else { - stop("keep_variants file must contain either: (1) a single column of variant IDs, ", - "or (2) columns named 'chrom' and 'pos' (optionally 'A1' and 'A2').") + # Fall back to reading as single-column variant IDs + ids <- read_lines(keep_variants_path) + keep_variants <- parse_variant_id(ids) } vi_chrom <- as.integer(strip_chr_prefix(variant_info$chrom)) - has_alleles <- !any(is.na(keep_variants$A1)) && !any(is.na(keep_variants$A2)) + has_alleles <- "A1" %in% names(keep_variants) && "A2" %in% names(keep_variants) && + !any(is.na(keep_variants$A1)) && !any(is.na(keep_variants$A2)) if (has_alleles) { paste0(vi_chrom, ":", variant_info$pos, ":", variant_info$A2, ":", variant_info$A1) %in% paste0(keep_variants$chrom, ":", keep_variants$pos, ":", keep_variants$A2, ":", keep_variants$A1) diff --git a/tests/testthat/test_file_utils.R b/tests/testthat/test_file_utils.R index 6ca8f44f..6361415f 100644 --- a/tests/testthat/test_file_utils.R +++ b/tests/testthat/test_file_utils.R @@ -2127,6 +2127,88 @@ test_that("match_variants_to_keep returns all FALSE for non-matching variants", expect_true(all(!mask)) }) +# =========================================================================== +# read_variant_metadata +# =========================================================================== + +test_that("read_variant_metadata reads 6-column bim file", { + tmp <- tempfile(fileext = ".bim") + on.exit(unlink(tmp), add = TRUE) + writeLines(c( + "1\trs1\t0\t100\tA\tG", + "1\trs2\t0\t200\tC\tT" + ), tmp) + res <- read_variant_metadata(tmp) + expect_equal(nrow(res), 2) + expect_true("gpos" %in% names(res)) + expect_equal(as.character(res$chrom), c("1", "1")) + expect_equal(res$pos, c(100L, 200L)) +}) + +test_that("read_variant_metadata reads 9-column bim file", { + tmp <- tempfile(fileext = ".bim") + on.exit(unlink(tmp), add = TRUE) + writeLines(c( + "1\trs1\t0\t100\tA\tG\t0.5\t0.3\t100", + "1\trs2\t0\t200\tC\tT\t0.4\t0.2\t99" + ), tmp) + res <- read_variant_metadata(tmp) + expect_equal(nrow(res), 2) + expect_true(all(c("variance", "allele_freq", "n_nomiss") %in% names(res))) +}) + +test_that("read_variant_metadata delegates to read_pvar for .pvar files", { + pvar_path <- test_path("test_data", "test_variants.pvar") + res <- read_variant_metadata(pvar_path) + expect_true(all(c("chrom", "id", "pos", "A1", "A2") %in% names(res))) + expect_false("gpos" %in% names(res)) +}) + +test_that("read_variant_metadata errors on unexpected column count", { + tmp <- tempfile(fileext = ".bim") + on.exit(unlink(tmp), add = TRUE) + writeLines(c("1\trs1\t0\t100\tA"), tmp) + expect_error(read_variant_metadata(tmp), "Unexpected number of columns") +}) + +# =========================================================================== +# match_variants_to_keep (additional coverage) +# =========================================================================== + +test_that("match_variants_to_keep works with single-column variant ID file", { + vi <- data.frame(chrom = c("1", "1", "1"), pos = c(100L, 200L, 300L), + A2 = c("A", "C", "G"), A1 = c("G", "T", "A"), + stringsAsFactors = FALSE) + keep_file <- tempfile(fileext = ".txt") + on.exit(unlink(keep_file), add = TRUE) + writeLines(c("1:100:A:G", "1:300:G:A"), keep_file) + + mask <- match_variants_to_keep(vi, keep_file) + expect_type(mask, "logical") + expect_equal(sum(mask), 2L) + expect_true(mask[1]) + expect_true(mask[3]) +}) + +test_that("match_variants_to_keep uses position-only matching when no alleles", { + skip_if_not_installed("pgenlibr") + td <- test_path("test_data") + result <- load_plink2_data(file.path(td, "test_variants")) + vi <- result$variant_info + + keep_file <- tempfile(fileext = ".tsv") + on.exit(unlink(keep_file), add = TRUE) + # Write keep file with chrom/pos only (no alleles) + keep_df <- vi[c(1, 5), c("chrom", "pos")] + vroom::vroom_write(keep_df, keep_file, delim = "\t") + + mask <- match_variants_to_keep(vi, keep_file) + expect_type(mask, "logical") + expect_equal(sum(mask), 2L) + expect_true(mask[1]) + expect_true(mask[5]) +}) + # =========================================================================== # standardise_sumstats_columns # ===========================================================================