Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ importFrom(readr,col_integer)
importFrom(readr,cols)
importFrom(readr,parse_number)
importFrom(readr,read_delim)
importFrom(readr,read_lines)
importFrom(rlang,"!!!")
importFrom(stats,as.dist)
importFrom(stats,coef)
Expand All @@ -198,6 +199,7 @@ importFrom(stringr,str_detect)
importFrom(stringr,str_remove)
importFrom(stringr,str_replace)
importFrom(stringr,str_split)
importFrom(stringr,str_split_fixed)
importFrom(susieR,coef.susie)
importFrom(susieR,get_cs_correlation)
importFrom(susieR,mr.ash)
Expand Down
35 changes: 7 additions & 28 deletions R/LD.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,38 +143,17 @@ process_LD_matrix <- function(LD_file_path, snp_file_path = NULL) {
snp_file_path <- found[1]
}

# Detect format by extension or header
is_pvar <- grepl("\\.(pvar|pvar\\.zst)$", snp_file_path)
if (!is_pvar) {
# Check if first line starts with #CHROM (pvar header)
first_line <- readLines(snp_file_path, n = 1)
is_pvar <- grepl("^#CHROM", first_line)
}

LD_variants <- read_variant_metadata(snp_file_path)
is_pvar <- !("gpos" %in% names(LD_variants))
LD_variants <- LD_variants %>%
mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
variants = normalize_variant_id(id))
if (is_pvar) {
# PLINK2 .pvar format: read via existing read_pvar()
LD_variants <- read_pvar(snp_file_path)
# read_pvar returns: chrom, id, pos, A2 (REF), A1 (ALT)
LD_variants <- LD_variants %>%
mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
variants = normalize_variant_id(id)) %>%
rename(GD = pos) # reuse pos column for ordering
LD_variants <- rename(LD_variants, GD = pos)
LD_variants$GD <- LD_variants$pos <- as.integer(
sapply(LD_variants$variants, function(v) strsplit(v, ":")[[1]][2]))
} else {
# PLINK1 .bim format
LD_variants <- read.table(snp_file_path)
col_names <- if (ncol(LD_variants) == 9) {
c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss")
} else if (ncol(LD_variants) == 6) {
c("chrom", "variants", "GD", "pos", "A1", "A2")
} else {
stop("Unexpected number of columns (", ncol(LD_variants), ") in bim file: ", snp_file_path)
}
LD_variants <- LD_variants %>%
setNames(col_names) %>%
mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
variants = normalize_variant_id(variants))
LD_variants <- rename(LD_variants, GD = gpos)
}

# Label and symmetrize the matrix
Expand Down
116 changes: 88 additions & 28 deletions R/file_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ load_plink2_data <- function(prefix, region = NULL, keep_indel = TRUE, keep_vari

# --- Read samples from .psam ---
psam <- as.data.frame(vroom(paths$psam, delim = "\t", show_col_types = FALSE))
colnames(psam)[1:2] <- c("FID", "IID")
names(psam) <- sub("^#", "", names(psam))

# --- Read genotype dosage via pgenlibr ---
pgen <- pgenlibr::NewPgen(paths$pgen)
Expand Down Expand Up @@ -293,6 +293,41 @@ read_pvar <- function(pvar_path) {
)
}

#' Read variant metadata from either .bim or .pvar/.pvar.zst file.
#'
#' Auto-detects the format by extension and header, then returns a
#' standardized data.frame. For PLINK1 .bim files, assigns column names
#' based on the number of columns (6 or 9). For PLINK2 .pvar files,
#' delegates to \code{read_pvar()}. Detection first checks the file
#' extension; failing that, it scans past any "##" meta-information
#' lines (permitted by the PLINK2 .pvar specification) looking for the
#' "#CHROM" column-header line.
#'
#' @param snp_file_path Path to .bim, .pvar, or .pvar.zst file.
#' @return data.frame with at minimum columns: chrom, id, pos, A2, A1.
#'   Extended .bim files (9 columns) also include: variance, allele_freq, n_nomiss.
#' @importFrom utils read.table
#' @noRd
read_variant_metadata <- function(snp_file_path) {
  is_pvar <- grepl("\\.pvar(\\.zst)?$", snp_file_path)
  if (!is_pvar) {
    # A .pvar header may open with "##" meta lines before the "#CHROM"
    # column-header line; checking only the first line would misclassify
    # such files as .bim. Reading a bounded number of lines keeps this
    # cheap on large files (assumes < 100 meta lines, which is typical).
    header <- readLines(snp_file_path, n = 100L)
    header <- header[!grepl("^##", header)]
    is_pvar <- length(header) > 0 && grepl("^#CHROM", header[1])
  }

  if (is_pvar) {
    read_pvar(snp_file_path)
  } else {
    df <- read.table(snp_file_path, stringsAsFactors = FALSE)
    n <- ncol(df)
    if (n == 6) {
      # Standard PLINK1 .bim: chrom, id, genetic distance, bp position, alleles
      names(df) <- c("chrom", "id", "gpos", "pos", "A1", "A2")
    } else if (n == 9) {
      # Extended .bim with per-variant summary statistics appended
      names(df) <- c("chrom", "id", "gpos", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss")
    } else {
      stop("Unexpected number of columns (", n, ") in variant file: ", snp_file_path)
    }
    df
  }
}

#' Get variant information from any LD reference source.
#'
#' Auto-detects the source type (PLINK2, PLINK1, VCF, GDS, or pre-computed
Expand Down Expand Up @@ -341,17 +376,18 @@ get_ref_variant_info <- function(source, region = NULL) {
info$allele_freq <- colMeans(result$X, na.rm = TRUE) / 2
return(info) # Already region-filtered by the loader
} else {
# Pre-computed LD: read bim files via metadata
# Pre-computed LD: read bim/pvar files via metadata
bim_paths <- get_regional_ld_meta(resolved$meta_path, region)$intersections$bim_file_paths
info <- do.call(rbind, lapply(bim_paths, function(path) {
df <- as.data.frame(vroom(path, col_names = FALSE, show_col_types = FALSE))
df <- read_variant_metadata(path)
out <- data.frame(
chrom = df$X1, id = df$X2, pos = df$X4,
A2 = df$X5, A1 = df$X6,
chrom = df$chrom, id = df$id, pos = df$pos,
A2 = df$A2, A1 = df$A1,
stringsAsFactors = FALSE
)
if (ncol(df) >= 8) { out$variance <- df$X7; out$allele_freq <- df$X8 }
if (ncol(df) >= 9) { out$n_nomiss <- df$X9 }
if ("variance" %in% names(df)) out$variance <- df$variance
if ("allele_freq" %in% names(df)) out$allele_freq <- df$allele_freq
if ("n_nomiss" %in% names(df)) out$n_nomiss <- df$n_nomiss
out
}))
info$id <- normalize_variant_id(info$id)
Expand Down Expand Up @@ -381,16 +417,30 @@ get_ref_variant_info <- function(source, region = NULL) {
#' Match variant_info against a whitelist file, returning logical index.
#' Uses parse_variant_id() from misc.R to handle all variant ID formats.
#'
#' @param variant_info data.frame with columns chrom and pos, and
#'   optionally A2/A1 allele columns used for allele-aware matching.
#' @param keep_variants_path Path to the whitelist: either a delimited
#'   table containing chrom/pos columns, or a plain file with one
#'   variant ID per line.
#' @return Logical vector with one element per row of variant_info.
#' @importFrom vroom vroom
#' @importFrom readr read_lines
#' @noRd
match_variants_to_keep <- function(variant_info, keep_variants_path) {
  # First attempt: a delimited table with explicit chrom/pos columns.
  # A read failure (e.g. a plain ID list that is not a table) yields NULL
  # and we fall through to the single-column reader below.
  keep_raw <- tryCatch(
    as.data.frame(vroom(keep_variants_path, show_col_types = FALSE)),
    error = function(e) NULL
  )
  if (!is.null(keep_raw) && "chrom" %in% names(keep_raw) && "pos" %in% names(keep_raw)) {
    keep_variants <- parse_variant_id(keep_raw)
  } else {
    # Fall back to reading as single-column variant IDs
    ids <- read_lines(keep_variants_path)
    keep_variants <- parse_variant_id(ids)
  }
  vi_chrom <- as.integer(strip_chr_prefix(variant_info$chrom))
  # Match on alleles only when the whitelist supplies complete A1/A2
  # information; otherwise fall back to chrom:pos matching.
  has_alleles <- "A1" %in% names(keep_variants) && "A2" %in% names(keep_variants) &&
    !any(is.na(keep_variants$A1)) && !any(is.na(keep_variants$A2))
  if (has_alleles) {
    paste0(vi_chrom, ":", variant_info$pos, ":", variant_info$A2, ":", variant_info$A1) %in%
      paste0(keep_variants$chrom, ":", keep_variants$pos, ":", keep_variants$A2, ":", keep_variants$A1)
  } else {
    paste0(vi_chrom, ":", variant_info$pos) %in%
      paste0(keep_variants$chrom, ":", keep_variants$pos)
  }
}

#' @importFrom vroom vroom
Expand Down Expand Up @@ -511,12 +561,11 @@ load_plink1_data <- function(prefix, region = NULL, keep_indel = TRUE, keep_vari
# --- Region filter via bim ---
if (!is.null(region)) {
parsed <- parse_region(region)
col_types <- list(col_character(), col_character(), col_guess(), col_integer(), col_guess(), col_guess())
bim_data <- vroom(bim_file, col_names = FALSE, col_types = col_types)
bim_data$X1 <- strip_chr_prefix(bim_data$X1)
snp_ids <- bim_data$X2[bim_data$X1 == parsed$chrom &
bim_data$X4 >= parsed$start &
bim_data$X4 <= parsed$end]
bim_data <- read_bim(bed_file)
bim_data$chrom <- strip_chr_prefix(bim_data$chrom)
snp_ids <- bim_data$id[bim_data$chrom == parsed$chrom &
bim_data$pos >= parsed$start &
bim_data$pos <= parsed$end]
if (length(snp_ids) == 0) {
stop(NoSNPsError(paste("No SNPs found in the specified region", region)))
}
Expand Down Expand Up @@ -807,17 +856,26 @@ load_genotype_region <- function(genotype, region = NULL, keep_indel = TRUE,
#' Read one tab-delimited covariate file, validate, and transpose.
#'
#' Drops the first column (sample IDs), verifies that every remaining
#' column is numeric, and returns the transposed matrix so covariates
#' become rows and samples become columns.
#'
#' @importFrom readr read_delim cols
#' @importFrom dplyr select mutate across everything
#' @importFrom magrittr %>%
#' @noRd
read_single_covariate <- function(path) {
  covars <- read_delim(path, "\t", col_types = cols()) %>% select(-1)
  bad_cols <- names(covars)[!vapply(covars, is.numeric, logical(1))]
  if (length(bad_cols) > 0) {
    stop("Non-numeric columns found in covariate file ", path, ": ",
      paste(bad_cols, collapse = ", "),
      ". All columns except the first (sample ID) must be numeric.")
  }
  covars %>%
    mutate(across(everything(), as.numeric)) %>%
    t()
}

#' Load each covariate file as a transposed numeric matrix.
#' @noRd
load_covariate_data <- function(covariate_path) {
  # Fail fast when any requested covariate file is absent
  absent <- covariate_path[!file.exists(covariate_path)]
  if (length(absent) > 0) {
    stop("Covariate file(s) not found: ", paste(absent, collapse = ", "))
  }
  map(covariate_path, read_single_covariate)
}

NoPhenotypeError <- function(message) {
Expand Down Expand Up @@ -945,6 +1003,7 @@ prepare_data_list <- function(geno_bed, phenotype, covariate, imiss_cutoff, maf_

#' @importFrom purrr map
#' @importFrom dplyr intersect
#' @importFrom stringr str_split_fixed
#' @importFrom magrittr %>%
#' @noRd
prepare_X_matrix <- function(geno_bed, data_list, imiss_cutoff, maf_cutoff, mac_cutoff, xvar_cutoff) {
Expand All @@ -963,7 +1022,7 @@ prepare_X_matrix <- function(geno_bed, data_list, imiss_cutoff, maf_cutoff, mac_
colnames(X_filtered) <- normalize_variant_id(colnames(X_filtered))

# To keep a log message
variants <- as.data.frame(do.call(rbind, lapply(colnames(X_filtered), function(x) strsplit(x, ":")[[1]][1:2])), stringsAsFactors = FALSE)
variants <- str_split_fixed(colnames(X_filtered), ":", 3)
message(paste0("Dimension of input genotype data is ", nrow(X_filtered), " rows and ", ncol(X_filtered), " columns for genomic region of ", variants[1, 1], ":", min(as.integer(variants[, 2])), "-", max(as.integer(variants[, 2]))))
return(X_filtered)
}
Expand Down Expand Up @@ -1104,7 +1163,7 @@ load_regional_association_data <- function(genotype, # PLINK file
data_list <- add_X_residuals(data_list, scale_residuals)
# Get X matrix for union of samples
X <- prepare_X_matrix(geno, data_list, imiss_cutoff, maf_cutoff, mac_cutoff, xvar_cutoff)
region <- if (!is.null(region)) unlist(strsplit(region, ":", fixed = TRUE))
parsed_region <- if (!is.null(region)) parse_region(region) else NULL
## residual_Y: a list of y either vector or matrix (CpG for example), and they need to match with residual_X in terms of which samples are missing.
## residual_X: is a list of R conditions each is a matrix, with list names being the names of conditions, column names being SNP names and row names being sample names.
## X: is the somewhat original genotype matrix output from `filter_X`, with column names being SNP names and row names being sample names. Sample names of X should match example sample names of residual_Y matrix form (not list); but the matrices inside residual_X would be subsets of sample name of residual_Y matrix form (not list).
Expand All @@ -1119,8 +1178,8 @@ load_regional_association_data <- function(genotype, # PLINK file
X_data = data_list$X,
X = X,
maf = maf_list,
chrom = region[1],
grange = if (!is.null(region)) unlist(strsplit(region[2], "-", fixed = TRUE)) else NULL,
chrom = if (!is.null(parsed_region)) paste0("chr", parsed_region$chrom) else NULL,
grange = if (!is.null(parsed_region)) as.character(c(parsed_region$start, parsed_region$end)) else NULL,
Y_coordinates = if (!is.null(region)) extract_phenotype_coordinates(pheno) else NULL
))
}
Expand Down Expand Up @@ -1693,7 +1752,8 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
conditions <- conditions_list_individual[pos]
dat <- load_regional_univariate_data(
genotype = genotype, phenotype = phenotype,
covariate = covariate, region = region,
covariate = covariate,
region = region,
association_window = association_window,
conditions = conditions, xvar_cutoff = xvar_cutoff,
maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff,
Expand Down
92 changes: 92 additions & 0 deletions tests/testthat/test_file_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,16 @@ test_that("Test load_covariate_data reads tab-delimited file", {
file.remove(tmp)
})

test_that("load_covariate_data errors on non-numeric columns", {
  tmp <- tempfile(fileext = ".tsv")
  # on.exit guarantees cleanup even when the expectation fails, matching
  # the cleanup style used by the other tests in this file
  on.exit(unlink(tmp), add = TRUE)
  writeLines(c("SampleID\tPC1\tLabel", "S1\t0.1\tabc", "S2\t0.3\tdef"), tmp)
  expect_error(
    load_covariate_data(tmp),
    "Non-numeric columns found in covariate file.*Label.*must be numeric"
  )
})

test_that("load_covariate_data errors on missing file", {
expect_error(
load_covariate_data("/nonexistent/covar.tsv"),
Expand Down Expand Up @@ -2117,6 +2127,88 @@ test_that("match_variants_to_keep returns all FALSE for non-matching variants",
expect_true(all(!mask))
})

# ===========================================================================
# read_variant_metadata
# ===========================================================================

test_that("read_variant_metadata reads 6-column bim file", {
  bim_path <- tempfile(fileext = ".bim")
  on.exit(unlink(bim_path), add = TRUE)
  bim_lines <- c(
    "1\trs1\t0\t100\tA\tG",
    "1\trs2\t0\t200\tC\tT"
  )
  writeLines(bim_lines, bim_path)

  parsed <- read_variant_metadata(bim_path)
  # A 6-column bim keeps its genetic-distance column under the name "gpos"
  expect_equal(nrow(parsed), 2)
  expect_true("gpos" %in% names(parsed))
  expect_equal(as.character(parsed$chrom), c("1", "1"))
  expect_equal(parsed$pos, c(100L, 200L))
})

test_that("read_variant_metadata reads 9-column bim file", {
  bim_path <- tempfile(fileext = ".bim")
  on.exit(unlink(bim_path), add = TRUE)
  writeLines(
    c(
      "1\trs1\t0\t100\tA\tG\t0.5\t0.3\t100",
      "1\trs2\t0\t200\tC\tT\t0.4\t0.2\t99"
    ),
    bim_path
  )

  parsed <- read_variant_metadata(bim_path)
  expect_equal(nrow(parsed), 2)
  # Extended-bim summary columns must be present under their standard names
  expect_true(all(c("variance", "allele_freq", "n_nomiss") %in% names(parsed)))
})

test_that("read_variant_metadata delegates to read_pvar for .pvar files", {
  res <- read_variant_metadata(test_path("test_data", "test_variants.pvar"))
  # read_pvar output carries the standard columns but no genetic distance
  expect_true(all(c("chrom", "id", "pos", "A1", "A2") %in% names(res)))
  expect_false("gpos" %in% names(res))
})

test_that("read_variant_metadata errors on unexpected column count", {
  bad_path <- tempfile(fileext = ".bim")
  on.exit(unlink(bad_path), add = TRUE)
  # Five columns matches neither the 6- nor the 9-column bim layout
  writeLines("1\trs1\t0\t100\tA", bad_path)
  expect_error(read_variant_metadata(bad_path), "Unexpected number of columns")
})

# ===========================================================================
# match_variants_to_keep (additional coverage)
# ===========================================================================

test_that("match_variants_to_keep works with single-column variant ID file", {
  info <- data.frame(
    chrom = rep("1", 3),
    pos = c(100L, 200L, 300L),
    A2 = c("A", "C", "G"),
    A1 = c("G", "T", "A"),
    stringsAsFactors = FALSE
  )
  keep_file <- tempfile(fileext = ".txt")
  on.exit(unlink(keep_file), add = TRUE)
  writeLines(c("1:100:A:G", "1:300:G:A"), keep_file)

  mask <- match_variants_to_keep(info, keep_file)
  expect_type(mask, "logical")
  # Only the first and third variants appear in the keep list
  expect_equal(sum(mask), 2L)
  expect_true(mask[1])
  expect_true(mask[3])
})

test_that("match_variants_to_keep uses position-only matching when no alleles", {
  skip_if_not_installed("pgenlibr")
  plink_prefix <- file.path(test_path("test_data"), "test_variants")
  vi <- load_plink2_data(plink_prefix)$variant_info

  keep_file <- tempfile(fileext = ".tsv")
  on.exit(unlink(keep_file), add = TRUE)
  # The keep file carries chrom/pos only, so allele-aware matching is
  # impossible and the function must fall back to positional matching
  vroom::vroom_write(vi[c(1, 5), c("chrom", "pos")], keep_file, delim = "\t")

  mask <- match_variants_to_keep(vi, keep_file)
  expect_type(mask, "logical")
  expect_equal(sum(mask), 2L)
  expect_true(mask[1])
  expect_true(mask[5])
})

# ===========================================================================
# standardise_sumstats_columns
# ===========================================================================
Expand Down