From 26abd4f57d2f9398de88ff773329210cde545795 Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Tue, 12 May 2026 10:45:12 -0600 Subject: [PATCH 1/3] importing data table, add export flags, relative duckdb --- NAMESPACE | 1 + R/data_curation.R | 2 ++ 2 files changed, 3 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 1f96344..a82433c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,3 +6,4 @@ export(CDHIT2duckdb) export(prepareGenomes) export(runDataProcessing) export(runPanaroo2Duckdb) +importFrom(data.table,":=") diff --git a/R/data_curation.R b/R/data_curation.R index c12ed6f..6e71824 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -732,6 +732,7 @@ #' @return A list with: #' - duckdbConnection: live DBI connection to the created DuckDB #' - table_name: "metadata" +#' @export retrieveMetadata <- function(user_bacs, filter_type = "AMR", base_dir = ".", @@ -1326,6 +1327,7 @@ retrieveMetadata <- function(user_bacs, #' @param chunk_size Genomes per chunk container (default 50). #' @param verbose Verbose messages. #' @return Character vector of genome IDs with complete file sets on disk. +#' @export retrieveGenomes <- function(base_dir = ".", user_bacs, method = c("ftp", "cli"), From 1c0d9184ac1ec6426a3978af7c9e17fccac64a29 Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Tue, 12 May 2026 10:47:54 -0600 Subject: [PATCH 2/3] relativize path --- R/data_processing.R | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/R/data_processing.R b/R/data_processing.R index aeb35a0..512fdd4 100644 --- a/R/data_processing.R +++ b/R/data_processing.R @@ -1349,13 +1349,17 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE) + # Views below reference parquet files by bare filename. Point DuckDB at the + # parquet directory so schema inference at CREATE VIEW time can resolve them. + DBI::dbExecute(con_new, sprintf("SET file_search_path='%s'", path)) + # gene_count -> long parquet + view DBI::dbReadTable(con, "gene_count") |> tidyr::pivot_longer(-genome_id, names_to = "gene", values_to = "value") |> dplyr::filter(!is.na(value) & value != "") |> dplyr::mutate(value = as.integer(value)) |> writeCompressedParquet(genes_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_count AS SELECT * FROM read_parquet('%s')", genes_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_count AS SELECT * FROM read_parquet('%s')", basename(genes_parquet))) # protein_count -> long parquet + view DBI::dbReadTable(con, "protein_count") |> @@ -1363,7 +1367,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { dplyr::filter(!is.na(value) & value != "") |> dplyr::mutate(value = as.integer(value)) |> writeCompressedParquet(proteins_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_count AS SELECT * FROM read_parquet('%s')", proteins_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_count AS SELECT * FROM read_parquet('%s')", basename(proteins_parquet))) # domain_count -> long parquet + view DBI::dbReadTable(con, "domain_count") |> @@ -1371,7 +1375,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { dplyr::filter(!is.na(value) & value != "") |> dplyr::mutate(value = as.integer(value)) |> writeCompressedParquet(domains_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW domain_count AS SELECT * FROM read_parquet('%s')", domains_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW domain_count AS SELECT * FROM read_parquet('%s')", basename(domains_parquet))) # gene_struct -> long parquet + view DBI::dbReadTable(con, "gene_struct") |> @@ -1379,43 +1383,43 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { dplyr::filter(!is.na(value) & value != "") |> dplyr::mutate(value = as.integer(value)) |> writeCompressedParquet(struct_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW struct AS SELECT * FROM read_parquet('%s')", struct_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW struct AS SELECT * FROM read_parquet('%s')", basename(struct_parquet))) # cleaned_metadata -> parquet + view (as metadata) DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", metadata_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", basename(metadata_parquet))) # names/seq tables -> parquet + views DBI::dbReadTable(con, "gene_names") |> writeCompressedParquet(gene_names_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_names AS SELECT * FROM read_parquet('%s')", gene_names_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_names AS SELECT * FROM read_parquet('%s')", basename(gene_names_parquet))) DBI::dbReadTable(con, "protein_names") |> dplyr::select(-locus_tag) |> writeCompressedParquet(protein_names_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_names AS SELECT * FROM read_parquet('%s')", protein_names_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_names AS SELECT * FROM read_parquet('%s')", basename(protein_names_parquet))) DBI::dbReadTable(con, "domain_names") |> dplyr::select(-c(IPRAcc, IPRDesc)) |> writeCompressedParquet(domain_names_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW domain_names AS SELECT * FROM read_parquet('%s')", domain_names_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW domain_names AS SELECT * FROM read_parquet('%s')", basename(domain_names_parquet))) DBI::dbReadTable(con, "gene_ref_seq") |> writeCompressedParquet(gene_ref_seq_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_seqs AS SELECT * FROM read_parquet('%s')", gene_ref_seq_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_seqs AS SELECT * FROM read_parquet('%s')", basename(gene_ref_seq_parquet))) DBI::dbReadTable(con, "protein_cluster_seq") |> writeCompressedParquet(protein_cluster_seq_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_seqs AS SELECT * FROM read_parquet('%s')", protein_cluster_seq_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_seqs AS SELECT * FROM read_parquet('%s')", basename(protein_cluster_seq_parquet))) DBI::dbReadTable(con, "genome_gene_protein") |> writeCompressedParquet(genome_gene_protein_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')", genome_gene_protein_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')", basename(genome_gene_protein_parquet))) # debug/complete views: amr_phenotype, genome_data, original_metadata DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", original_metadata_parquet)) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", basename(amr_phenotype_parquet))) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", basename(genome_data_parquet))) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", basename(original_metadata_parquet))) invisible(TRUE) } From cb01b12312e0af6bfb764bba21004f5915ee5025 Mon Sep 17 00:00:00 2001 From: Emily Boyer Date: Fri, 15 May 2026 13:58:58 -0600 Subject: [PATCH 3/3] regenerate NAMESPACE; add imports.R for data.table := --- NAMESPACE | 2 ++ R/imports.R | 2 ++ 2 files changed, 4 insertions(+) create mode 100644 R/imports.R diff --git a/NAMESPACE b/NAMESPACE index a82433c..a6b5194 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,8 @@ export(.extractAMRtable) export(.updateBVBRCdata) export(CDHIT2duckdb) export(prepareGenomes) +export(retrieveGenomes) +export(retrieveMetadata) export(runDataProcessing) export(runPanaroo2Duckdb) importFrom(data.table,":=") diff --git a/R/imports.R b/R/imports.R new file mode 100644 index 0000000..df44fe6 --- /dev/null +++ b/R/imports.R @@ -0,0 +1,2 @@ +#' @importFrom data.table := +NULL