From c086bd76e6a88c8072520a2c8d4e70cf7bb0407a Mon Sep 17 00:00:00 2001
From: Daniel Nachun <daniel.nachun@gmail.com>
Date: Sat, 25 Apr 2026 17:08:20 -0700
Subject: [PATCH 1/2] add name check

---
 NAMESPACE                        |   1 +
 R/LD.R                           |  35 ++--------
 R/file_utils.R                   | 112 +++++++++++++++++++++++--------
 tests/testthat/test_file_utils.R |  10 +++
 4 files changed, 102 insertions(+), 56 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 65bc5db7..50241c8e 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -198,6 +198,7 @@ importFrom(stringr,str_detect)
 importFrom(stringr,str_remove)
 importFrom(stringr,str_replace)
 importFrom(stringr,str_split)
+importFrom(stringr,str_split_fixed)
 importFrom(susieR,coef.susie)
 importFrom(susieR,get_cs_correlation)
 importFrom(susieR,mr.ash)
diff --git a/R/LD.R b/R/LD.R
index d1a608dd..753eea02 100644
--- a/R/LD.R
+++ b/R/LD.R
@@ -143,38 +143,17 @@ process_LD_matrix <- function(LD_file_path, snp_file_path = NULL) {
     snp_file_path <- found[1]
   }
 
-  # Detect format by extension or header
-  is_pvar <- grepl("\\.(pvar|pvar\\.zst)$", snp_file_path)
-  if (!is_pvar) {
-    # Check if first line starts with #CHROM (pvar header)
-    first_line <- readLines(snp_file_path, n = 1)
-    is_pvar <- grepl("^#CHROM", first_line)
-  }
-
+  LD_variants <- read_variant_metadata(snp_file_path)
+  is_pvar <- !("gpos" %in% names(LD_variants))
+  LD_variants <- LD_variants %>%
+    mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
+           variants = normalize_variant_id(id))
   if (is_pvar) {
-    # PLINK2 .pvar format: read via existing read_pvar()
-    LD_variants <- read_pvar(snp_file_path)
-    # read_pvar returns: chrom, id, pos, A2 (REF), A1 (ALT)
-    LD_variants <- LD_variants %>%
-      mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
-             variants = normalize_variant_id(id)) %>%
-      rename(GD = pos)  # reuse pos column for ordering
+    LD_variants <- rename(LD_variants, GD = pos)
     LD_variants$GD <- LD_variants$pos <- as.integer(
       sapply(LD_variants$variants, function(v) strsplit(v, ":")[[1]][2]))
   } else {
-    # PLINK1 .bim format
-    LD_variants <- read.table(snp_file_path)
-    col_names <- if (ncol(LD_variants) == 9) {
-      c("chrom", "variants", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss")
-    } else if (ncol(LD_variants) == 6) {
-      c("chrom", "variants", "GD", "pos", "A1", "A2")
-    } else {
-      stop("Unexpected number of columns (", ncol(LD_variants), ") in bim file: ", snp_file_path)
-    }
-    LD_variants <- LD_variants %>%
-      setNames(col_names) %>%
-      mutate(chrom = as.character(as.integer(strip_chr_prefix(chrom))),
-             variants = normalize_variant_id(variants))
+    LD_variants <- rename(LD_variants, GD = gpos)
   }
 
   # Label and symmetrize the matrix
diff --git a/R/file_utils.R b/R/file_utils.R
index 59ce7162..92414ad7 100644
--- a/R/file_utils.R
+++ b/R/file_utils.R
@@ -182,7 +182,7 @@ load_plink2_data <- function(prefix, region = NULL, keep_indel = TRUE, keep_vari
 
   # --- Read samples from .psam ---
   psam <- as.data.frame(vroom(paths$psam, delim = "\t", show_col_types = FALSE))
-  colnames(psam)[1:2] <- c("FID", "IID")
+  names(psam) <- sub("^#", "", names(psam))
 
   # --- Read genotype dosage via pgenlibr ---
   pgen <- pgenlibr::NewPgen(paths$pgen)
@@ -293,6 +293,41 @@ read_pvar <- function(pvar_path) {
   )
 }
 
+#' Read variant metadata from either .bim or .pvar/.pvar.zst file.
+#'
+#' Auto-detects the format by extension and header, then returns a
+#' standardized data.frame. For PLINK1 .bim files, assigns column names
+#' based on the number of columns (6 or 9). For PLINK2 .pvar files,
+#' delegates to \code{read_pvar()}.
+#'
+#' @param snp_file_path Path to .bim, .pvar, or .pvar.zst file.
+#' @return data.frame with at minimum columns: chrom, id, pos, A2, A1.
+#'   Extended .bim files (9 columns) also include: variance, allele_freq, n_nomiss.
+#' @importFrom utils read.table
+#' @noRd
+read_variant_metadata <- function(snp_file_path) {
+  is_pvar <- grepl("\\.(pvar|pvar\\.zst)$", snp_file_path)
+  if (!is_pvar) {
+    first_line <- readLines(snp_file_path, n = 1)
+    is_pvar <- grepl("^#CHROM", first_line)
+  }
+
+  if (is_pvar) {
+    read_pvar(snp_file_path)
+  } else {
+    df <- read.table(snp_file_path, stringsAsFactors = FALSE)
+    n <- ncol(df)
+    if (n == 6) {
+      names(df) <- c("chrom", "id", "gpos", "pos", "A1", "A2")
+    } else if (n == 9) {
+      names(df) <- c("chrom", "id", "gpos", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss")
+    } else {
+      stop("Unexpected number of columns (", n, ") in variant file: ", snp_file_path)
+    }
+    df
+  }
+}
+
 #' Get variant information from any LD reference source.
 #'
 #' Auto-detects the source type (PLINK2, PLINK1, VCF, GDS, or pre-computed
@@ -341,17 +376,18 @@ get_ref_variant_info <- function(source, region = NULL) {
     info$allele_freq <- colMeans(result$X, na.rm = TRUE) / 2
     return(info)  # Already region-filtered by the loader
   } else {
-    # Pre-computed LD: read bim files via metadata
+    # Pre-computed LD: read bim/pvar files via metadata
     bim_paths <- get_regional_ld_meta(resolved$meta_path, region)$intersections$bim_file_paths
     info <- do.call(rbind, lapply(bim_paths, function(path) {
-      df <- as.data.frame(vroom(path, col_names = FALSE, show_col_types = FALSE))
+      df <- read_variant_metadata(path)
       out <- data.frame(
-        chrom = df$X1, id = df$X2, pos = df$X4,
-        A2 = df$X5, A1 = df$X6,
+        chrom = df$chrom, id = df$id, pos = df$pos,
+        A2 = df$A2, A1 = df$A1,
         stringsAsFactors = FALSE
       )
-      if (ncol(df) >= 8) { out$variance <- df$X7; out$allele_freq <- df$X8 }
-      if (ncol(df) >= 9) { out$n_nomiss <- df$X9 }
+      if ("variance" %in% names(df)) out$variance <- df$variance
+      if ("allele_freq" %in% names(df)) out$allele_freq <- df$allele_freq
+      if ("n_nomiss" %in% names(df)) out$n_nomiss <- df$n_nomiss
       out
     }))
     info$id <- normalize_variant_id(info$id)
@@ -384,13 +420,23 @@ get_ref_variant_info <- function(source, region = NULL) {
 #' @noRd
 match_variants_to_keep <- function(variant_info, keep_variants_path) {
   keep_raw <- as.data.frame(vroom(keep_variants_path, show_col_types = FALSE))
-  # parse_variant_id handles chr prefix, underscore/colon formats, build suffixes,
-  # and returns data.frame with integer chrom/pos and character A2/A1
-  keep_variants <- parse_variant_id(
-    if ("chrom" %in% names(keep_raw) & "pos" %in% names(keep_raw)) keep_raw else keep_raw[[1]]
-  )
+  if ("chrom" %in% names(keep_raw) & "pos" %in% names(keep_raw)) {
+    keep_variants <- parse_variant_id(keep_raw)
+  } else if (ncol(keep_raw) == 1) {
+    keep_variants <- parse_variant_id(keep_raw[[1]])
+  } else {
+    stop("keep_variants file must contain either: (1) a single column of variant IDs, ",
+         "or (2) columns named 'chrom' and 'pos' (optionally 'A1' and 'A2').")
+  }
   vi_chrom <- as.integer(strip_chr_prefix(variant_info$chrom))
-  paste0(vi_chrom, ":", variant_info$pos) %in% paste0(keep_variants$chrom, ":", keep_variants$pos)
+  has_alleles <- !any(is.na(keep_variants$A1)) && !any(is.na(keep_variants$A2))
+  if (has_alleles) {
+    paste0(vi_chrom, ":", variant_info$pos, ":", variant_info$A2, ":", variant_info$A1) %in%
+      paste0(keep_variants$chrom, ":", keep_variants$pos, ":", keep_variants$A2, ":", keep_variants$A1)
+  } else {
+    paste0(vi_chrom, ":", variant_info$pos) %in%
+      paste0(keep_variants$chrom, ":", keep_variants$pos)
+  }
 }
 
 #' @importFrom vroom vroom
@@ -511,12 +557,11 @@ load_plink1_data <- function(prefix, region = NULL, keep_indel = TRUE, keep_vari
   # --- Region filter via bim ---
   if (!is.null(region)) {
     parsed <- parse_region(region)
-    col_types <- list(col_character(), col_character(), col_guess(), col_integer(), col_guess(), col_guess())
-    bim_data <- vroom(bim_file, col_names = FALSE, col_types = col_types)
-    bim_data$X1 <- strip_chr_prefix(bim_data$X1)
-    snp_ids <- bim_data$X2[bim_data$X1 == parsed$chrom &
-                            bim_data$X4 >= parsed$start &
-                            bim_data$X4 <= parsed$end]
+    bim_data <- read_bim(bed_file)
+    bim_data$chrom <- strip_chr_prefix(bim_data$chrom)
+    snp_ids <- bim_data$id[bim_data$chrom == parsed$chrom &
+                            bim_data$pos >= parsed$start &
+                            bim_data$pos <= parsed$end]
     if (length(snp_ids) == 0) {
       stop(NoSNPsError(paste("No SNPs found in the specified region", region)))
     }
@@ -807,6 +852,18 @@ load_genotype_region <- function(genotype, region = NULL, keep_indel = TRUE,
 #' @importFrom readr read_delim cols
 #' @importFrom dplyr select mutate across everything
 #' @importFrom magrittr %>%
+#' @noRd
+read_single_covariate <- function(path) {
+  df <- read_delim(path, "\t", col_types = cols()) %>% select(-1)
+  non_numeric <- names(df)[!sapply(df, is.numeric)]
+  if (length(non_numeric) > 0) {
+    stop("Non-numeric columns found in covariate file ", path, ": ",
+         paste(non_numeric, collapse = ", "),
+         ". All columns except the first (sample ID) must be numeric.")
+  }
+  df %>% mutate(across(everything(), as.numeric)) %>% t()
+}
+
 #' @noRd
 load_covariate_data <- function(covariate_path) {
   # Validate all covariate files exist
@@ -814,10 +871,7 @@ load_covariate_data <- function(covariate_path) {
   if (length(missing) > 0) {
     stop("Covariate file(s) not found: ", paste(missing, collapse = ", "))
   }
-  return(map(covariate_path, ~ read_delim(.x, "\t", col_types = cols()) %>%
-    select(-1) %>%
-    mutate(across(everything(), as.numeric)) %>%
-    t()))
+  return(map(covariate_path, read_single_covariate))
 }
 
 NoPhenotypeError <- function(message) {
@@ -945,6 +999,7 @@ prepare_data_list <- function(geno_bed, phenotype, covariate, imiss_cutoff, maf_
 
 #' @importFrom purrr map
 #' @importFrom dplyr intersect
+#' @importFrom stringr str_split_fixed
 #' @importFrom magrittr %>%
 #' @noRd
 prepare_X_matrix <- function(geno_bed, data_list, imiss_cutoff, maf_cutoff, mac_cutoff, xvar_cutoff) {
@@ -963,7 +1018,7 @@ prepare_X_matrix <- function(geno_bed, data_list, imiss_cutoff, maf_cutoff, mac_
   colnames(X_filtered) <- normalize_variant_id(colnames(X_filtered))
 
   # To keep a log message
-  variants <- as.data.frame(do.call(rbind, lapply(colnames(X_filtered), function(x) strsplit(x, ":")[[1]][1:2])), stringsAsFactors = FALSE)
+  variants <- str_split_fixed(colnames(X_filtered), ":", 3)
   message(paste0("Dimension of input genotype data is ", nrow(X_filtered), " rows and ", ncol(X_filtered), " columns for genomic region of ", variants[1, 1], ":", min(as.integer(variants[, 2])), "-", max(as.integer(variants[, 2]))))
   return(X_filtered)
 }
@@ -1104,7 +1159,7 @@ load_regional_association_data <- function(genotype, # PLINK file
   data_list <- add_X_residuals(data_list, scale_residuals)
   # Get X matrix for union of samples
   X <- prepare_X_matrix(geno, data_list, imiss_cutoff, maf_cutoff, mac_cutoff, xvar_cutoff)
-  region <- if (!is.null(region)) unlist(strsplit(region, ":", fixed = TRUE))
+  parsed_region <- if (!is.null(region)) parse_region(region) else NULL
   ## residual_Y: a list of y either vector or matrix (CpG for example), and they need to match with residual_X in terms of which samples are missing.
   ## residual_X: is a list of R conditions each is a matrix, with list names being the names of conditions, column names being SNP names and row names being sample names.
   ## X: is the somewhat original genotype matrix output from `filter_X`, with column names being SNP names and row names being sample names. Sample names of X should match example sample names of residual_Y matrix form (not list); but the matrices inside residual_X would be subsets of sample name of residual_Y matrix form (not list).
@@ -1119,8 +1174,8 @@ load_regional_association_data <- function(genotype, # PLINK file
     X_data = data_list$X,
     X = X,
     maf = maf_list,
-    chrom = region[1],
-    grange = if (!is.null(region)) unlist(strsplit(region[2], "-", fixed = TRUE)) else NULL,
+    chrom = if (!is.null(parsed_region)) paste0("chr", parsed_region$chrom) else NULL,
+    grange = if (!is.null(parsed_region)) as.character(c(parsed_region$start, parsed_region$end)) else NULL,
     Y_coordinates = if (!is.null(region)) extract_phenotype_coordinates(pheno) else NULL
   ))
 }
@@ -1693,7 +1748,8 @@ load_multitask_regional_data <- function(region, # a string of chr:start-end for
       conditions <- conditions_list_individual[pos]
       dat <- load_regional_univariate_data(
         genotype = genotype, phenotype = phenotype,
-        covariate = covariate, region = region,
+        covariate = covariate,
+        region = region,
         association_window = association_window,
         conditions = conditions, xvar_cutoff = xvar_cutoff,
         maf_cutoff = maf_cutoff, mac_cutoff = mac_cutoff,
diff --git a/tests/testthat/test_file_utils.R b/tests/testthat/test_file_utils.R
index c5a21654..6ca8f44f 100644
--- a/tests/testthat/test_file_utils.R
+++ b/tests/testthat/test_file_utils.R
@@ -355,6 +355,16 @@ test_that("Test load_covariate_data reads tab-delimited file", {
   file.remove(tmp)
 })
 
+test_that("load_covariate_data errors on non-numeric columns", {
+  tmp <- tempfile(fileext = ".tsv")
+  writeLines(c("SampleID\tPC1\tLabel", "S1\t0.1\tabc", "S2\t0.3\tdef"), tmp)
+  expect_error(
+    load_covariate_data(tmp),
+    "Non-numeric columns found in covariate file.*Label.*must be numeric"
+  )
+  file.remove(tmp)
+})
+
 test_that("load_covariate_data errors on missing file", {
   expect_error(
     load_covariate_data("/nonexistent/covar.tsv"),

From ad5daeb4b8781c5c8f3e3c12869ff5da42946afb Mon Sep 17 00:00:00 2001
From: Daniel Nachun <daniel.nachun@gmail.com>
Date: Mon, 27 Apr 2026 11:48:13 -0700
Subject: [PATCH 2/2] update some tests

---
 NAMESPACE                        |  1 +
 R/file_utils.R                   | 18 ++++---
 tests/testthat/test_file_utils.R | 82 ++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+), 7 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 50241c8e..18fb675b 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -181,6 +181,7 @@ importFrom(readr,col_integer)
 importFrom(readr,cols)
 importFrom(readr,parse_number)
 importFrom(readr,read_delim)
+importFrom(readr,read_lines)
 importFrom(rlang,"!!!")
 importFrom(stats,as.dist)
 importFrom(stats,coef)
diff --git a/R/file_utils.R b/R/file_utils.R
index 92414ad7..42d47cd0 100644
--- a/R/file_utils.R
+++ b/R/file_utils.R
@@ -417,19 +417,23 @@ get_ref_variant_info <- function(source, region = NULL) {
 #' Match variant_info against a whitelist file, returning logical index.
 #' Uses parse_variant_id() from misc.R to handle all variant ID formats.
 #' @importFrom vroom vroom
+#' @importFrom readr read_lines
 #' @noRd
 match_variants_to_keep <- function(variant_info, keep_variants_path) {
-  keep_raw <- as.data.frame(vroom(keep_variants_path, show_col_types = FALSE))
-  if ("chrom" %in% names(keep_raw) & "pos" %in% names(keep_raw)) {
+  keep_raw <- tryCatch(
+    as.data.frame(vroom(keep_variants_path, show_col_types = FALSE)),
+    error = function(e) NULL
+  )
+  if (!is.null(keep_raw) && "chrom" %in% names(keep_raw) && "pos" %in% names(keep_raw)) {
     keep_variants <- parse_variant_id(keep_raw)
-  } else if (ncol(keep_raw) == 1) {
-    keep_variants <- parse_variant_id(keep_raw[[1]])
   } else {
-    stop("keep_variants file must contain either: (1) a single column of variant IDs, ",
-         "or (2) columns named 'chrom' and 'pos' (optionally 'A1' and 'A2').")
+    # Fall back to reading as single-column variant IDs
+    ids <- read_lines(keep_variants_path)
+    keep_variants <- parse_variant_id(ids)
   }
   vi_chrom <- as.integer(strip_chr_prefix(variant_info$chrom))
-  has_alleles <- !any(is.na(keep_variants$A1)) && !any(is.na(keep_variants$A2))
+  has_alleles <- "A1" %in% names(keep_variants) && "A2" %in% names(keep_variants) &&
+    !any(is.na(keep_variants$A1)) && !any(is.na(keep_variants$A2))
   if (has_alleles) {
     paste0(vi_chrom, ":", variant_info$pos, ":", variant_info$A2, ":", variant_info$A1) %in%
       paste0(keep_variants$chrom, ":", keep_variants$pos, ":", keep_variants$A2, ":", keep_variants$A1)
diff --git a/tests/testthat/test_file_utils.R b/tests/testthat/test_file_utils.R
index 6ca8f44f..6361415f 100644
--- a/tests/testthat/test_file_utils.R
+++ b/tests/testthat/test_file_utils.R
@@ -2127,6 +2127,88 @@ test_that("match_variants_to_keep returns all FALSE for non-matching variants",
   expect_true(all(!mask))
 })
 
+# ===========================================================================
+# read_variant_metadata
+# ===========================================================================
+
+test_that("read_variant_metadata reads 6-column bim file", {
+  tmp <- tempfile(fileext = ".bim")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines(c(
+    "1\trs1\t0\t100\tA\tG",
+    "1\trs2\t0\t200\tC\tT"
+  ), tmp)
+  res <- read_variant_metadata(tmp)
+  expect_equal(nrow(res), 2)
+  expect_true("gpos" %in% names(res))
+  expect_equal(as.character(res$chrom), c("1", "1"))
+  expect_equal(res$pos, c(100L, 200L))
+})
+
+test_that("read_variant_metadata reads 9-column bim file", {
+  tmp <- tempfile(fileext = ".bim")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines(c(
+    "1\trs1\t0\t100\tA\tG\t0.5\t0.3\t100",
+    "1\trs2\t0\t200\tC\tT\t0.4\t0.2\t99"
+  ), tmp)
+  res <- read_variant_metadata(tmp)
+  expect_equal(nrow(res), 2)
+  expect_true(all(c("variance", "allele_freq", "n_nomiss") %in% names(res)))
+})
+
+test_that("read_variant_metadata delegates to read_pvar for .pvar files", {
+  pvar_path <- test_path("test_data", "test_variants.pvar")
+  res <- read_variant_metadata(pvar_path)
+  expect_true(all(c("chrom", "id", "pos", "A1", "A2") %in% names(res)))
+  expect_false("gpos" %in% names(res))
+})
+
+test_that("read_variant_metadata errors on unexpected column count", {
+  tmp <- tempfile(fileext = ".bim")
+  on.exit(unlink(tmp), add = TRUE)
+  writeLines(c("1\trs1\t0\t100\tA"), tmp)
+  expect_error(read_variant_metadata(tmp), "Unexpected number of columns")
+})
+
+# ===========================================================================
+# match_variants_to_keep (additional coverage)
+# ===========================================================================
+
+test_that("match_variants_to_keep works with single-column variant ID file", {
+  vi <- data.frame(chrom = c("1", "1", "1"), pos = c(100L, 200L, 300L),
+                   A2 = c("A", "C", "G"), A1 = c("G", "T", "A"),
+                   stringsAsFactors = FALSE)
+  keep_file <- tempfile(fileext = ".txt")
+  on.exit(unlink(keep_file), add = TRUE)
+  writeLines(c("1:100:A:G", "1:300:G:A"), keep_file)
+
+  mask <- match_variants_to_keep(vi, keep_file)
+  expect_type(mask, "logical")
+  expect_equal(sum(mask), 2L)
+  expect_true(mask[1])
+  expect_true(mask[3])
+})
+
+test_that("match_variants_to_keep uses position-only matching when no alleles", {
+  skip_if_not_installed("pgenlibr")
+  td <- test_path("test_data")
+  result <- load_plink2_data(file.path(td, "test_variants"))
+  vi <- result$variant_info
+
+  keep_file <- tempfile(fileext = ".tsv")
+  on.exit(unlink(keep_file), add = TRUE)
+  # Write keep file with chrom/pos only (no alleles)
+  keep_df <- vi[c(1, 5), c("chrom", "pos")]
+  vroom::vroom_write(keep_df, keep_file, delim = "\t")
+
+  mask <- match_variants_to_keep(vi, keep_file)
+  expect_type(mask, "logical")
+  expect_equal(sum(mask), 2L)
+  expect_true(mask[1])
+  expect_true(mask[5])
+})
+
 # ===========================================================================
 # standardise_sumstats_columns
 # ===========================================================================