diff --git a/build.gradle.kts b/build.gradle.kts index db32b42..88c52b6 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2.9" +version = "0.2.10" repositories { mavenCentral() diff --git a/docs/commands.md b/docs/commands.md index 14590aa..8a1760c 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -110,7 +110,11 @@ seq_sim setup-environment -w my_workdir ## align-assemblies (Step 01) -Aligns multiple query assemblies to a reference genome using AnchorWave and minimap2. +Aligns multiple query assemblies to a reference genome via the PHGv2 +[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters) +command, which itself drives AnchorWave + minimap2 under the hood. This wrapper +keeps seq_sim's CLI surface (`--ref-gff`, `--ref-fasta`, `--query-fasta`, ...) +and the `maf_file_paths.txt` output contract that downstream steps depend on. **Usage:** ```bash @@ -119,31 +123,47 @@ seq_sim align-assemblies [OPTIONS] **Options:** - `--work-dir`, `-w`: Working directory (default: `seq_sim_work`) -- `--ref-gff`, `-g`: Reference GFF file (required) -- `--ref-fasta`, `-r`: Reference FASTA file (required) -- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line -- `--threads`, `-t`: Number of threads to use (default: 1) +- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`) +- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`. +- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line. Translated to a PHGv2 `--assembly-file-list` internally. +- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1) +- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count. +- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1) +- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1) +- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps (anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location. +- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. Useful for SLURM array workflows; no per-query MAFs and no `maf_file_paths.txt` are produced. +- `--output-dir`, `-o`: Custom output directory (default: `/output/01_anchorwave_results`) **What it does:** -1. Extracts CDS sequences from reference GFF using `anchorwave gff2seq` -2. Aligns reference to CDS with `minimap2` (once for all queries) -3. For each query, runs `minimap2` and `anchorwave proali` to produce alignments -4. Generates `maf_file_paths.txt` listing all produced MAF files +1. Collects the query FASTA list from `--query-fasta` and writes it as + `/assemblies_list.txt` (the PHGv2 `--assembly-file-list`). +2. Invokes `phg align-assemblies` from `/src/phg_v2/bin/phg`. PHGv2 + then runs `anchorwave gff2seq`, `minimap2`, and `anchorwave proali` + internally. +3. Collects the resulting `.maf` files PHGv2 wrote to the output directory and + produces `maf_file_paths.txt` so downstream steps (`maf-to-gvcf`, + `create-chain-files`) continue to work unchanged. **Output:** -- `/output/01_anchorwave_results/{refBase}_cds.fa` -- `/output/01_anchorwave_results/{refBase}.sam` -- `/output/01_anchorwave_results/{queryName}/` containing `{queryName}.sam`, `*.anchors`, `*.maf`, `*.f.maf` +- `/output/01_anchorwave_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper) +- `/output/01_anchorwave_results/{queryName}.maf` (per-query alignment, one file each) +- `/output/01_anchorwave_results/{queryName}.sam` +- `/output/01_anchorwave_results/{queryName}_{refBase}.anchorspro` +- `/output/01_anchorwave_results/{queryName}.svg` (dot plot) +- `/output/01_anchorwave_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs) - `/output/01_anchorwave_results/maf_file_paths.txt` - `/logs/01_align_assemblies.log` **Examples:** ```bash -# Directory of queries +# Directory of queries, 8 threads seq_sim align-assemblies -g ref.gff -r ref.fa -q queries/ -t 8 -# Text list of query paths -seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4 +# Text list of query paths, 4 threads, run 2 alignments in parallel +seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4 --in-parallel 2 + +# Reference-prep only (for SLURM array workflows) +seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt --just-ref-prep ``` --- @@ -409,7 +429,12 @@ seq_sim format-recombined-fastas \ ## align-mutated-assemblies (Step 10) Realigns the formatted recombined (or otherwise mutated) FASTA files back to -the reference genome. This is the first step of the PS4G creation workflow. +the reference genome via the PHGv2 +[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters) +command, which itself drives AnchorWave + minimap2 under the hood. This is the +first step of the PS4G creation workflow. The wrapper keeps seq_sim's existing +CLI surface and the `maf_file_paths.txt` output contract that step 11 +(`mutated-maf-to-gvcf`) depends on. **Usage:** ```bash @@ -418,16 +443,24 @@ seq_sim align-mutated-assemblies [OPTIONS] **Options:** - `--work-dir`, `-w`: Working directory (default: `seq_sim_work`) -- `--ref-gff`, `-g`: Reference GFF file (required) -- `--ref-fasta`, `-r`: Reference FASTA file (required) -- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list -- `--threads`, `-t`: Number of threads to use (default: 1) -- `--output-dir`, `-o`: Custom output directory (default: `work_dir/output/10_mutated_alignment_results`) +- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`) +- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`. +- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list. Translated to a PHGv2 `--assembly-file-list` internally. +- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1) +- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count. +- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1) +- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1) +- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps. Defaults to the `phgv2-conda` env in its standard location. +- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. No per-query MAFs and no `maf_file_paths.txt` are produced. +- `--output-dir`, `-o`: Custom output directory (default: `/output/10_mutated_alignment_results`) **Output:** -- `/output/10_mutated_alignment_results/{refBase}_cds.fa` -- `/output/10_mutated_alignment_results/{refBase}.sam` -- `/output/10_mutated_alignment_results/{fastaName}/` containing alignments +- `/output/10_mutated_alignment_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper) +- `/output/10_mutated_alignment_results/{fastaName}.maf` (per-FASTA alignment, one file each) +- `/output/10_mutated_alignment_results/{fastaName}.sam` +- `/output/10_mutated_alignment_results/{fastaName}_{refBase}.anchorspro` +- `/output/10_mutated_alignment_results/{fastaName}.svg` (dot plot) +- `/output/10_mutated_alignment_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs) - `/output/10_mutated_alignment_results/maf_file_paths.txt` - `/logs/10_align_mutated_assemblies.log` @@ -435,6 +468,11 @@ seq_sim align-mutated-assemblies [OPTIONS] ```bash seq_sim align-mutated-assemblies \ -g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ -t 8 + +# Run 2 alignments in parallel with 4 total threads +seq_sim align-mutated-assemblies \ + -g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ \ + -t 4 --in-parallel 2 ``` --- diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 6a00c6d..3c4f9e5 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -74,7 +74,12 @@ align_assemblies: ref_gff: "path/to/reference.gff" # Required: Reference GFF annotation file ref_fasta: "path/to/reference.fa" # Required: Reference FASTA file query_fasta: "path/to/queries.txt" # Required: Single query file, directory, or text list - threads: 1 # Optional: Number of threads (default: 1) + threads: 1 # Optional: PHGv2 --total-threads (default: 1) + # in_parallel: 2 # Optional: PHGv2 --in-parallel (omit for auto-tuning) + # ref_max_align_cov: 1 # Optional: PHGv2 --ref-max-align-cov (proali -R, default 1) + # query_max_align_cov: 1 # Optional: PHGv2 --query-max-align-cov (proali -Q, default 1) + # conda_env_prefix: "/path/to/env" # Optional: PHGv2 --conda-env-prefix (overrides phgv2-conda) + # just_ref_prep: false # Optional: PHGv2 --just-ref-prep (ref-prep only, no MAFs) # output: "/custom/output/path/" # Optional: Custom output directory # Step 2: MAF to GVCF Conversion @@ -191,7 +196,12 @@ align_mutated_assemblies: # Uses align_assemblies.ref_fasta if not specified # fasta_input: "/custom/fasta/input/" # Optional: Query FASTA input (file, directory, or text list) # Uses format_recombined_fastas output if not specified - threads: 1 # Optional: Number of threads (default: 1) + threads: 1 # Optional: PHGv2 --total-threads (default: 1) + # in_parallel: 2 # Optional: PHGv2 --in-parallel (omit for auto-tuning) + # ref_max_align_cov: 1 # Optional: PHGv2 --ref-max-align-cov (proali -R, default 1) + # query_max_align_cov: 1 # Optional: PHGv2 --query-max-align-cov (proali -Q, default 1) + # conda_env_prefix: "/path/to/env" # Optional: PHGv2 --conda-env-prefix (overrides phgv2-conda) + # just_ref_prep: false # Optional: PHGv2 --just-ref-prep (ref-prep only, no MAFs) # output: "/custom/alignment/output/" # Optional: Custom output directory # Step 11: Mutated MAF to GVCF Conversion diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt index 8f494c8..5f86da9 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt @@ -1,257 +1,67 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand -import com.github.ajalt.clikt.parameters.options.default +import com.github.ajalt.clikt.parameters.groups.provideDelegate import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required -import com.github.ajalt.clikt.parameters.types.int import com.github.ajalt.clikt.parameters.types.path -import net.maizegenetics.Constants -import net.maizegenetics.utils.FileUtils -import net.maizegenetics.utils.LoggingUtils -import net.maizegenetics.utils.ProcessRunner -import net.maizegenetics.utils.SeqSimCommandException -import net.maizegenetics.utils.ValidationUtils +import net.maizegenetics.commands.align.PhgAlignParams +import net.maizegenetics.commands.align.PhgAlignRunner +import net.maizegenetics.commands.align.PhgAlignSharedOptions import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.Logger -import java.nio.file.Path -import kotlin.io.path.* +/** + * Wraps the PHGv2 `align-assemblies` command, which itself drives AnchorWave + + * minimap2 to align query assemblies against a reference. This wrapper keeps + * seq_sim's existing inputs (`--ref-gff`, `--ref-fasta`, `--query-fasta`, ...) + * and existing output contract (`output/01_anchorwave_results/maf_file_paths.txt`) + * so downstream pipeline steps continue to work unchanged. + * + * All the heavy lifting (validating PHG, materializing the assembly file list, + * invoking PHGv2, writing `maf_file_paths.txt`) lives in [PhgAlignRunner] and + * is shared with every other align iteration; the only thing this wrapper + * declares is the step-specific input flag and per-step metadata (log + * filename and output subdirectory). + * + * See: https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters + */ class AlignAssemblies : CliktCommand(name = "align-assemblies") { companion object { private const val LOG_FILE_NAME = "01_align_assemblies.log" - private const val OUTPUT_DIR = "output" private const val ANCHORWAVE_RESULTS_DIR = "01_anchorwave_results" - private const val MAF_PATHS_FILE = "maf_file_paths.txt" - - // minimap2 parameters - private const val MINIMAP2_PRESET = "splice" - private const val MINIMAP2_KMER_SIZE = "12" - private const val MINIMAP2_P_VALUE = "0.4" - private const val MINIMAP2_N_VALUE = "20" - - // anchorwave proali parameters - private const val ANCHORWAVE_R_VALUE = "1" - private const val ANCHORWAVE_Q_VALUE = "1" - - // Default values - private const val DEFAULT_THREADS = 1 } private val logger: Logger = LogManager.getLogger(AlignAssemblies::class.java) - private val workDir by option( - "--work-dir", "-w", - help = "Working directory for files and scripts" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - .default(Path.of(Constants.DEFAULT_WORK_DIR)) - - private val refGff by option( - "--ref-gff", "-g", - help = "Reference GFF file" - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() - - private val refFasta by option( - "--ref-fasta", "-r", - help = "Reference FASTA file" - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() + private val shared by PhgAlignSharedOptions() private val queryInput by option( "--query-fasta", "-q", - help = "Query FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line)" + help = "Query FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line). " + + "Translated to a PHGv2 --assembly-file-list internally." ).path(mustExist = true) .required() - private val threads by option( - "--threads", "-t", - help = "Number of threads to use" - ).int() - .default(DEFAULT_THREADS) - - private val outputDir by option( - "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/01_anchorwave_results)" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - - private fun collectQueryFiles(): List { - return FileUtils.collectFiles( - queryInput, - Constants.FASTA_EXTENSIONS, - "FASTA", - logger - ) - } - override fun run() { - // Validate working directory - ValidationUtils.validateWorkingDirectory(workDir, logger) - - // Configure file logging to working directory - LoggingUtils.setupFileLogging(workDir, LOG_FILE_NAME, logger) - - logger.info("Starting assembly alignment") - logger.info("Working directory: $workDir") - logger.info("Reference GFF: $refGff") - logger.info("Reference FASTA: $refFasta") - logger.info("Threads: $threads") - - // Collect query files - val queryFiles = collectQueryFiles() - logger.info("Processing ${queryFiles.size} query file(s)") - - // Create base output directory (use custom or default) - val baseOutputDir = FileUtils.resolveOutputDirectory(workDir, outputDir, ANCHORWAVE_RESULTS_DIR) - FileUtils.createOutputDirectory(baseOutputDir, logger) - - // Derive reference base name - val refBase = refFasta.nameWithoutExtension - logger.info("Reference base name: $refBase") - - // Step 1: Run anchorwave gff2seq (once for reference) - logger.info("Step 1: Extracting CDS sequences with anchorwave gff2seq") - val cdsFile = baseOutputDir.resolve("${refBase}_cds.fa") - val gff2seqExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "gff2seq", - "-i", refGff.toString(), - "-r", refFasta.toString(), - "-o", cdsFile.toString(), - workingDir = workDir.toFile(), - logger = logger - ) - if (gff2seqExitCode != 0) { - logger.error("anchorwave gff2seq failed with exit code $gff2seqExitCode") - throw SeqSimCommandException("anchorwave gff2seq failed with exit code $gff2seqExitCode", gff2seqExitCode) - } - logger.info("CDS file created: $cdsFile") - - // Step 2: Run minimap2 for reference (once for all queries) - logger.info("Step 2: Running minimap2 alignment for reference") - val refSam = baseOutputDir.resolve("${refBase}.sam") - val minimap2RefExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - refFasta.toString(), - cdsFile.toString(), - workingDir = workDir.toFile(), - outputFile = refSam.toFile(), - logger = logger - ) - if (minimap2RefExitCode != 0) { - logger.error("minimap2 (reference) failed with exit code $minimap2RefExitCode") - throw SeqSimCommandException("minimap2 (reference) failed with exit code $minimap2RefExitCode", minimap2RefExitCode) - } - logger.info("Reference SAM file created: $refSam") - - // Step 3: Process each query file - var successCount = 0 - var failureCount = 0 - val mafFilePaths = mutableListOf() - - queryFiles.forEachIndexed { index, queryFasta -> - logger.info("=".repeat(80)) - logger.info("Processing query ${index + 1}/${queryFiles.size}: ${queryFasta.name}") - logger.info("=".repeat(80)) - - try { - val mafPath = alignQuery(queryFasta, refBase, refSam, cdsFile, baseOutputDir) - mafFilePaths.add(mafPath) - successCount++ - logger.info("Successfully completed alignment for: ${queryFasta.name}") - } catch (e: Exception) { - failureCount++ - logger.error("Failed to align query: ${queryFasta.name}", e) - logger.error("Continuing with next query...") - } - } - - // Write MAF file paths to text file - FileUtils.writeFilePaths( - mafFilePaths, - baseOutputDir.resolve(MAF_PATHS_FILE), - logger, - "MAF file" - ) - - logger.info("=".repeat(80)) - logger.info("All alignments completed!") - logger.info("Total queries processed: ${queryFiles.size}") - logger.info("Successful: $successCount") - logger.info("Failed: $failureCount") - logger.info("Output directory: $baseOutputDir") - } - - private fun alignQuery(queryFasta: Path, refBase: String, refSam: Path, cdsFile: Path, baseOutputDir: Path): Path { - val queryName = queryFasta.nameWithoutExtension - - // Create query-specific output directory - val queryOutputDir = baseOutputDir.resolve(queryName) - if (!queryOutputDir.exists()) { - queryOutputDir.createDirectories() - } - - // Step 1: Run minimap2 for query - logger.info("Running minimap2 alignment for query") - val querySam = queryOutputDir.resolve("${queryName}.sam") - val minimap2QueryExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - queryFasta.toString(), - cdsFile.toString(), - workingDir = workDir.toFile(), - outputFile = querySam.toFile(), - logger = logger - ) - if (minimap2QueryExitCode != 0) { - throw RuntimeException("minimap2 (query) failed with exit code $minimap2QueryExitCode") - } - logger.info("Query SAM file created: $querySam") - - // Step 2: Run anchorwave proali - logger.info("Running anchorwave proali") - val anchorsFile = queryOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${queryName}_Q${ANCHORWAVE_Q_VALUE}.anchors") - val mafFile = queryOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${queryName}_Q${ANCHORWAVE_Q_VALUE}.maf") - val fMafFile = queryOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${queryName}_Q${ANCHORWAVE_Q_VALUE}.f.maf") - - val proaliExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "proali", - "-i", refGff.toString(), - "-as", cdsFile.toString(), - "-r", refFasta.toString(), - "-a", querySam.toString(), - "-ar", refSam.toString(), - "-s", queryFasta.toString(), - "-n", anchorsFile.toString(), - "-R", ANCHORWAVE_R_VALUE, - "-Q", ANCHORWAVE_Q_VALUE, - "-o", mafFile.toString(), - "-f", fMafFile.toString(), - "-t", threads.toString(), - workingDir = workDir.toFile(), - logger = logger + PhgAlignRunner.run( + PhgAlignParams( + workDir = shared.workDir, + refGff = shared.refGff, + refFasta = shared.refFasta, + queryInput = queryInput, + threads = shared.threads, + inParallel = shared.inParallel, + refMaxAlignCov = shared.refMaxAlignCov, + queryMaxAlignCov = shared.queryMaxAlignCov, + condaEnvPrefix = shared.condaEnvPrefix, + justRefPrep = shared.justRefPrep, + customOutputDir = shared.outputDir, + logFileName = LOG_FILE_NAME, + outputSubdir = ANCHORWAVE_RESULTS_DIR, + inputKind = "query", + ), + logger ) - if (proaliExitCode != 0) { - throw RuntimeException("anchorwave proali failed with exit code $proaliExitCode") - } - - logger.info("Output files for ${queryName}:") - logger.info(" Query SAM: $querySam") - logger.info(" Anchors: $anchorsFile") - logger.info(" MAF: $mafFile") - logger.info(" Filtered MAF: $fMafFile") - - // Return the MAF file path (not the filtered one) - return mafFile } } diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt index c642224..d2a48fb 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt @@ -1,256 +1,68 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand -import com.github.ajalt.clikt.parameters.options.default +import com.github.ajalt.clikt.parameters.groups.provideDelegate import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required -import com.github.ajalt.clikt.parameters.types.int import com.github.ajalt.clikt.parameters.types.path -import net.maizegenetics.Constants -import net.maizegenetics.utils.FileUtils -import net.maizegenetics.utils.LoggingUtils -import net.maizegenetics.utils.ProcessRunner -import net.maizegenetics.utils.ValidationUtils +import net.maizegenetics.commands.align.PhgAlignParams +import net.maizegenetics.commands.align.PhgAlignRunner +import net.maizegenetics.commands.align.PhgAlignSharedOptions import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.Logger -import java.nio.file.Path -import kotlin.io.path.* -import kotlin.system.exitProcess +/** + * Wraps the PHGv2 `align-assemblies` command for the "circular" mutated / + * recombined FASTA realignment step (step 10). PHGv2 internally drives + * AnchorWave + minimap2; this wrapper keeps seq_sim's existing inputs + * (`--ref-gff`, `--ref-fasta`, `--fasta-input`, ...) and existing output + * contract (`output/10_mutated_alignment_results/maf_file_paths.txt`) so + * downstream pipeline steps continue to work unchanged. + * + * All the heavy lifting (validating PHG, materializing the assembly file list, + * invoking PHGv2, writing `maf_file_paths.txt`) lives in [PhgAlignRunner] and + * is shared with [AlignAssemblies]; the only thing this wrapper declares is + * the step-specific input flag and per-step metadata (log filename and + * output subdirectory). + * + * See: https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters + */ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { companion object { private const val LOG_FILE_NAME = "10_align_mutated_assemblies.log" private const val MUTATED_ALIGNMENT_RESULTS_DIR = "10_mutated_alignment_results" - private const val MAF_PATHS_FILE = "maf_file_paths.txt" - - // minimap2 parameters - private const val MINIMAP2_PRESET = "splice" - private const val MINIMAP2_KMER_SIZE = "12" - private const val MINIMAP2_P_VALUE = "0.4" - private const val MINIMAP2_N_VALUE = "20" - - // anchorwave proali parameters - private const val ANCHORWAVE_R_VALUE = "1" - private const val ANCHORWAVE_Q_VALUE = "1" - - // Default values - private const val DEFAULT_THREADS = 1 } private val logger: Logger = LogManager.getLogger(AlignMutatedAssemblies::class.java) - private val workDir by option( - "--work-dir", "-w", - help = "Working directory for files and scripts" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - .default(Path.of(Constants.DEFAULT_WORK_DIR)) - - private val refGff by option( - "--ref-gff", "-g", - help = "Reference GFF file" - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() - - private val refFasta by option( - "--ref-fasta", "-r", - help = "Reference FASTA file" - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() + private val shared by PhgAlignSharedOptions() private val fastaInput by option( "--fasta-input", "-f", - help = "FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line)" + help = "FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line). " + + "Translated to a PHGv2 --assembly-file-list internally." ).path(mustExist = true) .required() - private val threads by option( - "--threads", "-t", - help = "Number of threads to use" - ).int() - .default(DEFAULT_THREADS) - - private val outputDir by option( - "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/10_mutated_alignment_results)" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - - private fun collectFastaFiles(): List { - return FileUtils.collectFiles( - fastaInput, - Constants.FASTA_EXTENSIONS, - "FASTA", - logger - ) - } - override fun run() { - // Validate working directory exists - ValidationUtils.validateWorkingDirectory(workDir, logger) - - // Configure file logging to working directory - LoggingUtils.setupFileLogging(workDir, LOG_FILE_NAME, logger) - - logger.info("Starting mutated assembly alignment") - logger.info("Working directory: $workDir") - logger.info("Reference GFF: $refGff") - logger.info("Reference FASTA: $refFasta") - logger.info("Threads: $threads") - - // Collect FASTA files - val fastaFiles = collectFastaFiles() - logger.info("Processing ${fastaFiles.size} FASTA file(s)") - - // Create base output directory (use custom or default) - val baseOutputDir = FileUtils.resolveOutputDirectory(workDir, outputDir, MUTATED_ALIGNMENT_RESULTS_DIR) - FileUtils.createOutputDirectory(baseOutputDir, logger) - - // Derive reference base name - val refBase = refFasta.nameWithoutExtension - logger.info("Reference base name: $refBase") - - // Step 1: Run anchorwave gff2seq (once for reference) - logger.info("Step 1: Extracting CDS sequences with anchorwave gff2seq") - val cdsFile = baseOutputDir.resolve("${refBase}_cds.fa") - val gff2seqExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "gff2seq", - "-i", refGff.toString(), - "-r", refFasta.toString(), - "-o", cdsFile.toString(), - workingDir = workDir.toFile(), - logger = logger - ) - if (gff2seqExitCode != 0) { - logger.error("anchorwave gff2seq failed with exit code $gff2seqExitCode") - exitProcess(gff2seqExitCode) - } - logger.info("CDS file created: $cdsFile") - - // Step 2: Run minimap2 for reference (once for all queries) - logger.info("Step 2: Running minimap2 alignment for reference") - val refSam = baseOutputDir.resolve("${refBase}.sam") - val minimap2RefExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - refFasta.toString(), - cdsFile.toString(), - workingDir = workDir.toFile(), - outputFile = refSam.toFile(), - logger = logger - ) - if (minimap2RefExitCode != 0) { - logger.error("minimap2 (reference) failed with exit code $minimap2RefExitCode") - exitProcess(minimap2RefExitCode) - } - logger.info("Reference SAM file created: $refSam") - - // Step 3: Process each FASTA file - var successCount = 0 - var failureCount = 0 - val mafFilePaths = mutableListOf() - - fastaFiles.forEachIndexed { index, fastaFile -> - logger.info("=".repeat(80)) - logger.info("Processing FASTA ${index + 1}/${fastaFiles.size}: ${fastaFile.name}") - logger.info("=".repeat(80)) - - try { - val mafPath = alignFasta(fastaFile, refBase, refSam, cdsFile, baseOutputDir) - mafFilePaths.add(mafPath) - successCount++ - logger.info("Successfully completed alignment for: ${fastaFile.name}") - } catch (e: Exception) { - failureCount++ - logger.error("Failed to align FASTA: ${fastaFile.name}", e) - logger.error("Continuing with next FASTA...") - } - } - - // Write MAF file paths to text file - FileUtils.writeFilePaths( - mafFilePaths, - baseOutputDir.resolve(MAF_PATHS_FILE), - logger, - "MAF file" - ) - - logger.info("=".repeat(80)) - logger.info("All alignments completed!") - logger.info("Total FASTA files processed: ${fastaFiles.size}") - logger.info("Successful: $successCount") - logger.info("Failed: $failureCount") - logger.info("Output directory: $baseOutputDir") - } - - private fun alignFasta(fastaFile: Path, refBase: String, refSam: Path, cdsFile: Path, baseOutputDir: Path): Path { - val fastaName = fastaFile.nameWithoutExtension - - // Create FASTA-specific output directory - val fastaOutputDir = baseOutputDir.resolve(fastaName) - if (!fastaOutputDir.exists()) { - fastaOutputDir.createDirectories() - } - - // Step 1: Run minimap2 for FASTA - logger.info("Running minimap2 alignment for FASTA") - val fastaSam = fastaOutputDir.resolve("${fastaName}.sam") - val minimap2FastaExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - fastaFile.toString(), - cdsFile.toString(), - workingDir = workDir.toFile(), - outputFile = fastaSam.toFile(), - logger = logger - ) - if (minimap2FastaExitCode != 0) { - throw RuntimeException("minimap2 (FASTA) failed with exit code $minimap2FastaExitCode") - } - logger.info("FASTA SAM file created: $fastaSam") - - // Step 2: Run anchorwave proali - logger.info("Running anchorwave proali") - val anchorsFile = fastaOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${fastaName}_Q${ANCHORWAVE_Q_VALUE}.anchors") - val mafFile = fastaOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${fastaName}_Q${ANCHORWAVE_Q_VALUE}.maf") - val fMafFile = fastaOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${fastaName}_Q${ANCHORWAVE_Q_VALUE}.f.maf") - - val proaliExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "proali", - "-i", refGff.toString(), - "-as", cdsFile.toString(), - "-r", refFasta.toString(), - "-a", fastaSam.toString(), - "-ar", refSam.toString(), - "-s", fastaFile.toString(), - "-n", anchorsFile.toString(), - "-R", ANCHORWAVE_R_VALUE, - "-Q", ANCHORWAVE_Q_VALUE, - "-o", mafFile.toString(), - "-f", fMafFile.toString(), - "-t", threads.toString(), - workingDir = workDir.toFile(), - logger = logger + PhgAlignRunner.run( + PhgAlignParams( + workDir = shared.workDir, + refGff = shared.refGff, + refFasta = shared.refFasta, + queryInput = fastaInput, + threads = shared.threads, + inParallel = shared.inParallel, + refMaxAlignCov = shared.refMaxAlignCov, + queryMaxAlignCov = shared.queryMaxAlignCov, + condaEnvPrefix = shared.condaEnvPrefix, + justRefPrep = shared.justRefPrep, + customOutputDir = shared.outputDir, + logFileName = LOG_FILE_NAME, + outputSubdir = MUTATED_ALIGNMENT_RESULTS_DIR, + inputKind = "FASTA", + ), + logger ) - if (proaliExitCode != 0) { - throw RuntimeException("anchorwave proali failed with exit code $proaliExitCode") - } - - logger.info("Output files for ${fastaName}:") - logger.info(" FASTA SAM: $fastaSam") - logger.info(" Anchors: $anchorsFile") - logger.info(" MAF: $mafFile") - logger.info(" Filtered MAF: $fMafFile") - - // Return the MAF file path (not the filtered one) - return mafFile } } diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index f1306f0..e1fc13b 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -40,8 +40,13 @@ data class AlignAssembliesConfig( val ref_gff: String, val ref_fasta: String, val query_fasta: String, - val threads: Int? = null, - val output: String? = null // Custom output directory + val threads: Int? = null, // PHGv2 --total-threads + val in_parallel: Int? = null, // PHGv2 --in-parallel + val ref_max_align_cov: Int? = null, // PHGv2 --ref-max-align-cov (proali -R) + val query_max_align_cov: Int? = null, // PHGv2 --query-max-align-cov (proali -Q) + val conda_env_prefix: String? = null, // PHGv2 --conda-env-prefix + val just_ref_prep: Boolean? = null, // PHGv2 --just-ref-prep + val output: String? = null // Custom output directory ) data class MafToGvcfConfig( @@ -71,11 +76,16 @@ data class ConvertToFastaConfig( ) data class AlignMutatedAssembliesConfig( - val ref_gff: String? = null, // Optional: Reference GFF (uses align_assemblies.ref_gff if not specified) - val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) - val fasta_input: String? = null, // Optional: Query FASTA input (uses format_recombined_fastas output if not specified) - val threads: Int? = null, - val output: String? = null // Custom output directory + val ref_gff: String? = null, // Optional: Reference GFF (uses align_assemblies.ref_gff if not specified) + val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) + val fasta_input: String? = null, // Optional: Query FASTA input (uses format_recombined_fastas output if not specified) + val threads: Int? = null, // PHGv2 --total-threads + val in_parallel: Int? = null, // PHGv2 --in-parallel + val ref_max_align_cov: Int? = null, // PHGv2 --ref-max-align-cov (proali -R) + val query_max_align_cov: Int? = null, // PHGv2 --query-max-align-cov (proali -Q) + val conda_env_prefix: String? = null, // PHGv2 --conda-env-prefix + val just_ref_prep: Boolean? = null, // PHGv2 --just-ref-prep + val output: String? = null // Custom output directory ) data class PickCrossoversConfig( @@ -151,6 +161,47 @@ class Orchestrate : CliktCommand(name = "orchestrate") { return stepName in config.run_steps } + /** + * Appends the optional PHGv2 align-assemblies knobs shared by every + * align step (threads, in-parallel, proali coverage caps, conda env + * prefix, just-ref-prep, output dir override) to [args]. Each option + * is included only when its config field is non-null/true, matching + * the existing inline behaviour for both align_assemblies and + * align_mutated_assemblies. + */ + private fun appendPhgAlignSharedArgs( + args: MutableList, + threads: Int?, + inParallel: Int?, + refMaxAlignCov: Int?, + queryMaxAlignCov: Int?, + condaEnvPrefix: String?, + justRefPrep: Boolean?, + customOutput: Path?, + ) { + if (threads != null) { + args.add("--threads=$threads") + } + if (inParallel != null) { + args.add("--in-parallel=$inParallel") + } + if (refMaxAlignCov != null) { + args.add("--ref-max-align-cov=$refMaxAlignCov") + } + if (queryMaxAlignCov != null) { + args.add("--query-max-align-cov=$queryMaxAlignCov") + } + if (condaEnvPrefix != null) { + args.add("--conda-env-prefix=$condaEnvPrefix") + } + if (justRefPrep == true) { + args.add("--just-ref-prep") + } + if (customOutput != null) { + args.add("--output-dir=$customOutput") + } + } + private fun validateEnvironment(workDir: Path): Boolean { // Check if working directory exists if (!workDir.exists()) { @@ -179,6 +230,16 @@ class Orchestrate : CliktCommand(name = "orchestrate") { return false } + // Check if PHGv2 binary exists (align-assemblies + later steps shell out to it) + val phgBinary = workDir.resolve(Constants.SRC_DIR) + .resolve(Constants.PHGV2_DIR) + .resolve("bin") + .resolve("phg") + if (!phgBinary.exists()) { + logger.info("PHGv2 binary not found: $phgBinary") + return false + } + // All checks passed logger.info("Environment validation passed - all required tools are present") return true @@ -234,6 +295,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ref_fasta = it["ref_fasta"] as? String ?: throw IllegalArgumentException("align_assemblies.ref_fasta is required"), query_fasta = it["query_fasta"] as? String ?: throw IllegalArgumentException("align_assemblies.query_fasta is required"), threads = it["threads"] as? Int, + in_parallel = it["in_parallel"] as? Int, + ref_max_align_cov = it["ref_max_align_cov"] as? Int, + query_max_align_cov = it["query_max_align_cov"] as? Int, + conda_env_prefix = it["conda_env_prefix"] as? String, + just_ref_prep = it["just_ref_prep"] as? Boolean, output = it["output"] as? String ) } @@ -288,6 +354,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ref_fasta = alignMutatedAssembliesMap?.get("ref_fasta") as? String, fasta_input = alignMutatedAssembliesMap?.get("fasta_input") as? String, threads = alignMutatedAssembliesMap?.get("threads") as? Int, + in_parallel = alignMutatedAssembliesMap?.get("in_parallel") as? Int, + ref_max_align_cov = alignMutatedAssembliesMap?.get("ref_max_align_cov") as? Int, + query_max_align_cov = alignMutatedAssembliesMap?.get("query_max_align_cov") as? Int, + conda_env_prefix = alignMutatedAssembliesMap?.get("conda_env_prefix") as? String, + just_ref_prep = alignMutatedAssembliesMap?.get("just_ref_prep") as? Boolean, output = alignMutatedAssembliesMap?.get("output") as? String ) } else null @@ -471,18 +542,22 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("Reference FASTA: $refFasta") logger.info("Query FASTA: $queryFasta") - val args = buildList { - add("--work-dir=$workDir") - add("--ref-gff=$refGff") - add("--ref-fasta=$refFasta") - add("--query-fasta=$queryFasta") - if (config.align_assemblies.threads != null) { - add("--threads=${config.align_assemblies.threads}") - } - if (customOutput != null) { - add("--output-dir=$customOutput") - } - } + val args = mutableListOf( + "--work-dir=$workDir", + "--ref-gff=$refGff", + "--ref-fasta=$refFasta", + "--query-fasta=$queryFasta", + ) + appendPhgAlignSharedArgs( + args, + threads = config.align_assemblies.threads, + inParallel = config.align_assemblies.in_parallel, + refMaxAlignCov = config.align_assemblies.ref_max_align_cov, + queryMaxAlignCov = config.align_assemblies.query_max_align_cov, + condaEnvPrefix = config.align_assemblies.conda_env_prefix, + justRefPrep = config.align_assemblies.just_ref_prep, + customOutput = customOutput, + ) AlignAssemblies().parse(args) restoreOrchestratorLogging(workDir) @@ -1174,18 +1249,22 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Determine output directory (custom or default) val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } - val args = buildList { - add("--work-dir=${workDir}") - add("--ref-gff=${step10RefGff}") - add("--ref-fasta=${step10RefFasta}") - add("--fasta-input=${step10FastaInput}") - if (config.align_mutated_assemblies.threads != null) { - add("--threads=${config.align_mutated_assemblies.threads}") - } - if (customOutput != null) { - add("--output-dir=${customOutput}") - } - } + val args = mutableListOf( + "--work-dir=$workDir", + "--ref-gff=$step10RefGff", + "--ref-fasta=$step10RefFasta", + "--fasta-input=$step10FastaInput", + ) + appendPhgAlignSharedArgs( + args, + threads = config.align_mutated_assemblies.threads, + inParallel = config.align_mutated_assemblies.in_parallel, + refMaxAlignCov = config.align_mutated_assemblies.ref_max_align_cov, + queryMaxAlignCov = config.align_mutated_assemblies.query_max_align_cov, + condaEnvPrefix = config.align_mutated_assemblies.conda_env_prefix, + justRefPrep = config.align_mutated_assemblies.just_ref_prep, + customOutput = customOutput, + ) AlignMutatedAssemblies().parse(args) restoreOrchestratorLogging(workDir) diff --git a/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignParams.kt b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignParams.kt new file mode 100644 index 0000000..24b86a8 --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignParams.kt @@ -0,0 +1,35 @@ +package net.maizegenetics.commands.align + +import java.nio.file.Path + +/** + * All inputs the [PhgAlignRunner] needs to wrap PHGv2 `align-assemblies`. + * + * Combines: + * - the user-facing PHGv2 knobs that are common to every align step (see + * [PhgAlignSharedOptions]), + * - per-caller metadata (log filename, output subdirectory, label used in + * log strings) that lets each thin wrapper place its outputs and logs + * in step-specific locations, and + * - the caller's unique input (collected as a single [Path] -- single file, + * directory, or .txt list -- just like the previous standalone commands). + * + * Adding a new align iteration is a new [PhgAlignParams] instance from a + * new Clikt wrapper; the runner itself does not change. + */ +data class PhgAlignParams( + val workDir: Path, + val refGff: Path, + val refFasta: Path, + val queryInput: Path, + val threads: Int, + val inParallel: Int? = null, + val refMaxAlignCov: Int? = null, + val queryMaxAlignCov: Int? = null, + val condaEnvPrefix: Path? = null, + val justRefPrep: Boolean = false, + val customOutputDir: Path? = null, + val logFileName: String, + val outputSubdir: String, + val inputKind: String, +) diff --git a/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignRunner.kt b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignRunner.kt new file mode 100644 index 0000000..5798370 --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignRunner.kt @@ -0,0 +1,177 @@ +package net.maizegenetics.commands.align + +import net.maizegenetics.Constants +import net.maizegenetics.utils.FileUtils +import net.maizegenetics.utils.LoggingUtils +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.SeqSimCommandException +import net.maizegenetics.utils.ValidationUtils +import org.apache.logging.log4j.Logger +import java.nio.file.Path +import kotlin.io.path.listDirectoryEntries +import kotlin.io.path.isRegularFile +import kotlin.io.path.name +import kotlin.io.path.writeLines + +/** + * Consolidated runner for every command that wraps PHGv2 `align-assemblies`. + * + * Validates the PHG binary, materializes a PHGv2 `--assembly-file-list` from + * the caller's input, invokes the PHG CLI, and writes the standard + * `maf_file_paths.txt` output. Per-caller variation (log filename, output + * subdirectory, label used in log strings) is provided via [PhgAlignParams] + * so adding a new align iteration is just a new thin Clikt wrapper -- no + * runner changes required. + */ +object PhgAlignRunner { + + private const val MAF_PATHS_FILE = "maf_file_paths.txt" + private const val ASSEMBLY_LIST_FILE = "assemblies_list.txt" + + /** + * Run PHGv2 align-assemblies for [params] and return the resolved output + * directory the run wrote to. + */ + fun run(params: PhgAlignParams, logger: Logger): Path { + // Validate working directory and PHG binary + val phgBinary = ValidationUtils.validatePhgSetup(params.workDir, logger) + + // Configure file logging to working directory + LoggingUtils.setupFileLogging(params.workDir, params.logFileName, logger) + + logger.info("Starting assembly alignment via PHGv2 `align-assemblies`") + logger.info("Working directory: ${params.workDir}") + logger.info("Reference GFF: ${params.refGff}") + logger.info("Reference FASTA: ${params.refFasta}") + logger.info("Total threads: ${params.threads}") + params.inParallel?.let { logger.info("In-parallel: $it") } + params.refMaxAlignCov?.let { logger.info("Ref max align cov (proali -R): $it") } + params.queryMaxAlignCov?.let { logger.info("Query max align cov (proali -Q): $it") } + params.condaEnvPrefix?.let { logger.info("Conda env prefix: $it") } + if (params.justRefPrep) { + logger.info("Just-ref-prep mode enabled (will not produce per-query MAFs)") + } + + // Collect input files into a PHGv2-shaped assembly-file-list + val inputFiles = FileUtils.collectFiles( + params.queryInput, + Constants.FASTA_EXTENSIONS, + "FASTA", + logger + ) + logger.info("Processing ${inputFiles.size} ${params.inputKind} file(s)") + + // Create base output directory (use custom or default). + // PHGv2 requires the output directory to exist before invocation. + val baseOutputDir = FileUtils.resolveOutputDirectory( + params.workDir, + params.customOutputDir, + params.outputSubdir + ) + FileUtils.createOutputDirectory(baseOutputDir, logger) + + val assemblyListFile = writeAssemblyFileList( + inputFiles, + params.refFasta, + params.inputKind, + baseOutputDir, + logger + ) + + // Build the PHGv2 align-assemblies command + val commandArgs = mutableListOf( + phgBinary.toString(), + "align-assemblies", + "--gff", params.refGff.toAbsolutePath().toString(), + "--reference-file", params.refFasta.toAbsolutePath().toString(), + "--assembly-file-list", assemblyListFile.toAbsolutePath().toString(), + "--total-threads", params.threads.toString(), + "-o", baseOutputDir.toAbsolutePath().toString() + ) + params.inParallel?.let { commandArgs += listOf("--in-parallel", it.toString()) } + params.refMaxAlignCov?.let { commandArgs += listOf("--ref-max-align-cov", it.toString()) } + params.queryMaxAlignCov?.let { commandArgs += listOf("--query-max-align-cov", it.toString()) } + params.condaEnvPrefix?.let { + commandArgs += listOf("--conda-env-prefix", it.toAbsolutePath().toString()) + } + if (params.justRefPrep) { + commandArgs += "--just-ref-prep" + } + + logger.info("Running PHG align-assemblies...") + val exitCode = ProcessRunner.runCommand( + *commandArgs.toTypedArray(), + workingDir = params.workDir.toFile(), + logger = logger + ) + + if (exitCode != 0) { + logger.error("PHG align-assemblies failed with exit code $exitCode") + throw SeqSimCommandException( + "PHG align-assemblies failed with exit code $exitCode", + exitCode + ) + } + + if (params.justRefPrep) { + logger.info("--just-ref-prep was set; skipping MAF collection.") + logger.info("Reference-prep outputs written to: $baseOutputDir") + return baseOutputDir + } + + // Collect MAF outputs PHGv2 wrote into the output directory and + // surface them via the standard maf_file_paths.txt contract so + // downstream pipeline steps (maf-to-gvcf, create-chain-files, ...) + // keep working unchanged. + val mafFiles = baseOutputDir.listDirectoryEntries() + .filter { it.isRegularFile() && it.name.endsWith(".maf") } + .sorted() + + FileUtils.writeFilePaths( + mafFiles, + baseOutputDir.resolve(MAF_PATHS_FILE), + logger, + "MAF file" + ) + + logger.info("=".repeat(80)) + logger.info("PHG align-assemblies completed successfully") + logger.info("Total assemblies aligned: ${inputFiles.size}") + logger.info("MAF files written: ${mafFiles.size}") + logger.info("Output directory: $baseOutputDir") + + return baseOutputDir + } + + /** + * Materializes a PHGv2 `--assembly-file-list` from whatever the caller + * passed (a single FASTA, a directory, or a .txt list). The reference + * FASTA is filtered out if it accidentally appears in the collected list + * (PHGv2 warns against including the reference here). + */ + private fun writeAssemblyFileList( + inputFiles: List, + refFasta: Path, + inputKind: String, + baseOutputDir: Path, + logger: Logger + ): Path { + val refAbsolute = refFasta.toAbsolutePath().normalize() + val filtered = inputFiles + .map { it.toAbsolutePath().normalize() } + .filter { it != refAbsolute } + .distinct() + + if (filtered.size != inputFiles.size) { + logger.warn( + "Reference FASTA was present in the $inputKind input list and was removed; " + + "PHGv2 expects the reference to be passed only via --reference-file." + ) + } + + val listFile = baseOutputDir.resolve(ASSEMBLY_LIST_FILE) + listFile.writeLines(filtered.map { it.toString() }) + logger.info("Wrote PHGv2 assembly file list (${filtered.size} entries): $listFile") + return listFile + } +} diff --git a/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignSharedOptions.kt b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignSharedOptions.kt new file mode 100644 index 0000000..37ab8c5 --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignSharedOptions.kt @@ -0,0 +1,83 @@ +package net.maizegenetics.commands.align + +import com.github.ajalt.clikt.parameters.groups.OptionGroup +import com.github.ajalt.clikt.parameters.options.default +import com.github.ajalt.clikt.parameters.options.flag +import com.github.ajalt.clikt.parameters.options.option +import com.github.ajalt.clikt.parameters.options.required +import com.github.ajalt.clikt.parameters.types.int +import com.github.ajalt.clikt.parameters.types.path +import net.maizegenetics.Constants +import java.nio.file.Path + +/** + * Shared Clikt option group for every command that wraps PHGv2 + * `align-assemblies`. Owns the options that are identical across align + * iterations (working dir, reference files, threading, AnchorWave proali + * knobs, conda env prefix, ref-prep-only flag, and the custom output dir + * override) so option names, help text, and defaults live in exactly one + * place. The per-step unique input flag (e.g. `--query-fasta` / + * `--fasta-input`) is declared on each wrapper itself. + */ +class PhgAlignSharedOptions : OptionGroup(name = "PHGv2 align-assemblies options") { + + val workDir by option( + "--work-dir", "-w", + help = "Working directory for files and scripts" + ).path(mustExist = false, canBeFile = false, canBeDir = true) + .default(Path.of(Constants.DEFAULT_WORK_DIR)) + + val refGff by option( + "--ref-gff", "-g", + help = "Reference GFF file (passed to PHGv2 as --gff)" + ).path(mustExist = true, canBeFile = true, canBeDir = false) + .required() + + val refFasta by option( + "--ref-fasta", "-r", + help = "Reference FASTA file (passed to PHGv2 as --reference-file). For best results " + + "this should be the output of `phg prepare-assemblies`." + ).path(mustExist = true, canBeFile = true, canBeDir = false) + .required() + + val threads by option( + "--threads", "-t", + help = "Total number of threads available to PHGv2 (--total-threads)" + ).int() + .default(1) + + val inParallel by option( + "--in-parallel", + help = "Number of alignments to run in parallel (PHGv2 --in-parallel). " + + "If omitted, PHGv2 picks a value from system memory + thread count." + ).int() + + val refMaxAlignCov by option( + "--ref-max-align-cov", + help = "Maximum reference genome alignment coverage for AnchorWave proali (PHGv2 --ref-max-align-cov, " + + "passed through as proali's `-R`). PHGv2 defaults this to 1." + ).int() + + val queryMaxAlignCov by option( + "--query-max-align-cov", + help = "Maximum query genome alignment coverage for AnchorWave proali (PHGv2 --query-max-align-cov, " + + "passed through as proali's `-Q`). PHGv2 defaults this to 1." + ).int() + + val condaEnvPrefix by option( + "--conda-env-prefix", + help = "Path to a Conda environment that contains PHGv2's runtime dependencies " + + "(anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location." + ).path(mustExist = false, canBeFile = false, canBeDir = true) + + val justRefPrep by option( + "--just-ref-prep", + help = "Only run PHGv2's reference-prep phase (writes ref.cds.fasta + Ref.sam) and stop. " + + "Useful when feeding a SLURM array; skips writing maf_file_paths.txt because no MAFs are produced." + ).flag() + + val outputDir by option( + "--output-dir", "-o", + help = "Custom output directory (default: work_dir/output/)" + ).path(mustExist = false, canBeFile = false, canBeDir = true) +} diff --git a/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt index d1cc466..60b008b 100644 --- a/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt @@ -8,13 +8,15 @@ import org.junit.jupiter.api.Test import org.junit.jupiter.api.io.TempDir import java.io.File import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText import kotlin.test.assertEquals import kotlin.test.assertTrue /** - * Unit tests for [AlignAssemblies] that don't actually shell out to - * anchorwave/minimap2 -- we install a [RecordingProcessExecutor] and verify - * the exact command lines seq-sim would send. + * Unit tests for [AlignAssemblies] that don't actually shell out to the + * PHGv2 binary -- we install a [RecordingProcessExecutor] and verify the + * exact command line seq-sim would send to `phg align-assemblies`. */ class AlignAssembliesUnitTest { @@ -26,8 +28,22 @@ class AlignAssembliesUnitTest { ProcessRunner.resetExecutor() } + /** + * Create a fake PHG layout (bin/phg) inside [workDir] so the command's + * [net.maizegenetics.utils.ValidationUtils.validatePhgSetup] passes. + */ + private fun stubPhgBinary(workDir: Path): Path { + val phgDir = workDir.resolve("src/phg_v2/bin") + phgDir.createDirectories() + val phg = phgDir.resolve("phg") + phg.writeText("#!/bin/sh\nexit 0\n") + phg.toFile().setExecutable(true) + return phg + } + @Test - fun gff2seqAndMinimap2AreInvokedOncePerReference(@TempDir workDir: Path) { + fun phgAlignAssembliesIsInvokedExactlyOnce(@TempDir workDir: Path) { + stubPhgBinary(workDir) val executor = RecordingProcessExecutor(defaultExitCode = 0) ProcessRunner.withExecutor(executor) { @@ -42,35 +58,44 @@ class AlignAssembliesUnitTest { ) } - // Exactly one gff2seq invocation for the reference. - val gff2seqCalls = executor.invocations.filter { - it.command.contains("gff2seq") - } - assertEquals(1, gff2seqCalls.size, "gff2seq should run once per reference") - assertTrue( - executor.containsSubsequence("anchorwave", "gff2seq"), - "anchorwave gff2seq should be invoked" - ) + assertEquals(1, executor.invocations.size, "phg align-assemblies should be invoked exactly once") + val inv = executor.invocations.single() + assertTrue(inv.command.first().endsWith("phg"), "First token should be the phg binary") + assertEquals("align-assemblies", inv.command[1]) - // minimap2 is invoked once for the reference plus once per query (3). - val minimap2Calls = executor.invocationsOf("pixi").filter { - it.command.contains("minimap2") - } + executor.invocationsOf("minimap2") + // Required PHGv2 args are present + assertEquals( + smallseqRoot.resolve("anchors.gff").toAbsolutePath().toString(), + inv.argAfter("--gff") + ) assertEquals( - 4, minimap2Calls.size, - "minimap2 should run once for the reference and once per query" + smallseqRoot.resolve("Ref.fa").toAbsolutePath().toString(), + inv.argAfter("--reference-file") ) + assertEquals("2", inv.argAfter("--total-threads")) - // proali is invoked once per query (3 queries in smallseq). - val proaliCalls = executor.invocations.filter { - it.command.contains("proali") - } - assertEquals(3, proaliCalls.size, "anchorwave proali should run once per query") + // PHGv2 expects the output dir to exist before running and we hand it + // an assembly-file-list materialized inside that output dir. + val expectedOutputDir = workDir.resolve("output/01_anchorwave_results") + assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o")) + val assemblyList = expectedOutputDir.resolve("assemblies_list.txt").toFile() + assertTrue(assemblyList.exists(), "assemblies_list.txt should have been written") + val listed = assemblyList.readLines().filter { it.isNotBlank() } + assertEquals(3, listed.size, "Smallseq queries directory contains 3 FASTAs") + + // Optional flags should NOT be present when not set + assertTrue(!inv.command.contains("--in-parallel")) + assertTrue(!inv.command.contains("--ref-max-align-cov")) + assertTrue(!inv.command.contains("--query-max-align-cov")) + assertTrue(!inv.command.contains("--conda-env-prefix")) + assertTrue(!inv.command.contains("--just-ref-prep")) } @Test - fun anchorwaveProaliCommandIncludesRefGffAndQuerySam(@TempDir workDir: Path) { + fun optionalPhgParametersAreForwardedWhenProvided(@TempDir workDir: Path) { + stubPhgBinary(workDir) val executor = RecordingProcessExecutor(defaultExitCode = 0) + val condaPrefix = workDir.resolve("conda_env").also { it.createDirectories() } ProcessRunner.withExecutor(executor) { AlignAssemblies().parse( @@ -79,23 +104,68 @@ class AlignAssembliesUnitTest { "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), "--query-fasta", smallseqRoot.resolve("queries/LineA.fa").toString(), - "--threads", "4" + "--threads", "4", + "--in-parallel", "2", + "--ref-max-align-cov", "3", + "--query-max-align-cov", "5", + "--conda-env-prefix", condaPrefix.toString() ) ) } - val proali = executor.invocations.single { it.command.contains("proali") } - assertEquals(smallseqRoot.resolve("anchors.gff").toString(), proali.argAfter("-i")) - assertEquals(smallseqRoot.resolve("Ref.fa").toString(), proali.argAfter("-r")) - assertEquals("4", proali.argAfter("-t")) - assertEquals("1", proali.argAfter("-R")) - assertEquals("1", proali.argAfter("-Q")) + val inv = executor.invocations.single() + assertEquals("4", inv.argAfter("--total-threads")) + assertEquals("2", inv.argAfter("--in-parallel")) + assertEquals("3", inv.argAfter("--ref-max-align-cov")) + assertEquals("5", inv.argAfter("--query-max-align-cov")) + assertEquals(condaPrefix.toAbsolutePath().toString(), inv.argAfter("--conda-env-prefix")) } @Test - fun mafFilePathsTextFileIsWrittenForEachQuery(@TempDir workDir: Path) { + fun justRefPrepSkipsMafFilePathsTextFile(@TempDir workDir: Path) { + stubPhgBinary(workDir) val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + AlignAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--query-fasta", smallseqRoot.resolve("queries").toString(), + "--just-ref-prep" + ) + ) + } + + val inv = executor.invocations.single() + assertTrue(inv.command.contains("--just-ref-prep"), "--just-ref-prep should be forwarded") + + val mafPathsFile = workDir.resolve("output/01_anchorwave_results/maf_file_paths.txt").toFile() + assertTrue( + !mafPathsFile.exists(), + "maf_file_paths.txt should NOT be written when --just-ref-prep is set" + ) + } + + @Test + fun mafFilePathsTextFileListsMafsWrittenByPhg(@TempDir workDir: Path) { + stubPhgBinary(workDir) + + // RecordingProcessExecutor doesn't actually run phg, so simulate its + // side-effect: drop one .maf file per query into the output directory + // before the phg invocation "returns". + val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputDir = inv.command.dropWhile { it != "-o" }.getOrNull(1)?.let { File(it) } + outputDir?.mkdirs() + val listFile = inv.command.dropWhile { it != "--assembly-file-list" }.getOrNull(1)?.let { File(it) } + listFile?.readLines()?.filter { it.isNotBlank() }?.forEach { fastaPath -> + val sampleName = File(fastaPath).nameWithoutExtension + File(outputDir, "$sampleName.maf").writeText("##maf version=1\n") + } + 0 + } + ProcessRunner.withExecutor(executor) { AlignAssemblies().parse( listOf( @@ -110,6 +180,7 @@ class AlignAssembliesUnitTest { val mafPathsFile = workDir.resolve("output/01_anchorwave_results/maf_file_paths.txt").toFile() assertTrue(mafPathsFile.exists(), "maf_file_paths.txt should be written") val lines = mafPathsFile.readLines().filter { it.isNotBlank() } - assertEquals(3, lines.size, "Should list one MAF path per query") + assertEquals(3, lines.size, "Should list one MAF path per simulated phg output") + assertTrue(lines.all { it.endsWith(".maf") }, "Every listed path should be a .maf file") } } diff --git a/src/test/kotlin/net/maizegenetics/commands/AlignMutatedAssembliesUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/AlignMutatedAssembliesUnitTest.kt new file mode 100644 index 0000000..61884e6 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/AlignMutatedAssembliesUnitTest.kt @@ -0,0 +1,189 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [AlignMutatedAssemblies] that don't actually shell out to + * the PHGv2 binary -- we install a [RecordingProcessExecutor] and verify the + * exact command line seq-sim would send to `phg align-assemblies`. + * + * Mirrors [AlignAssembliesUnitTest] since both commands now wrap the same + * PHGv2 subcommand. + */ +class AlignMutatedAssembliesUnitTest { + + private val smallseqRoot: Path = File("src/test/resources/smallseq") + .absoluteFile.toPath() + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + } + + /** + * Create a fake PHG layout (bin/phg) inside [workDir] so the command's + * [net.maizegenetics.utils.ValidationUtils.validatePhgSetup] passes. + */ + private fun stubPhgBinary(workDir: Path): Path { + val phgDir = workDir.resolve("src/phg_v2/bin") + phgDir.createDirectories() + val phg = phgDir.resolve("phg") + phg.writeText("#!/bin/sh\nexit 0\n") + phg.toFile().setExecutable(true) + return phg + } + + @Test + fun phgAlignAssembliesIsInvokedExactlyOnce(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries").toString(), + "--threads", "2" + ) + ) + } + + assertEquals(1, executor.invocations.size, "phg align-assemblies should be invoked exactly once") + val inv = executor.invocations.single() + assertTrue(inv.command.first().endsWith("phg"), "First token should be the phg binary") + assertEquals("align-assemblies", inv.command[1]) + + // Required PHGv2 args are present + assertEquals( + smallseqRoot.resolve("anchors.gff").toAbsolutePath().toString(), + inv.argAfter("--gff") + ) + assertEquals( + smallseqRoot.resolve("Ref.fa").toAbsolutePath().toString(), + inv.argAfter("--reference-file") + ) + assertEquals("2", inv.argAfter("--total-threads")) + + // PHGv2 expects the output dir to exist before running and we hand it + // an assembly-file-list materialized inside that output dir. + val expectedOutputDir = workDir.resolve("output/10_mutated_alignment_results") + assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o")) + val assemblyList = expectedOutputDir.resolve("assemblies_list.txt").toFile() + assertTrue(assemblyList.exists(), "assemblies_list.txt should have been written") + val listed = assemblyList.readLines().filter { it.isNotBlank() } + assertEquals(3, listed.size, "Smallseq queries directory contains 3 FASTAs") + + // Optional flags should NOT be present when not set + assertTrue(!inv.command.contains("--in-parallel")) + assertTrue(!inv.command.contains("--ref-max-align-cov")) + assertTrue(!inv.command.contains("--query-max-align-cov")) + assertTrue(!inv.command.contains("--conda-env-prefix")) + assertTrue(!inv.command.contains("--just-ref-prep")) + } + + @Test + fun optionalPhgParametersAreForwardedWhenProvided(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + val condaPrefix = workDir.resolve("conda_env").also { it.createDirectories() } + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries/LineA.fa").toString(), + "--threads", "4", + "--in-parallel", "2", + "--ref-max-align-cov", "3", + "--query-max-align-cov", "5", + "--conda-env-prefix", condaPrefix.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals("4", inv.argAfter("--total-threads")) + assertEquals("2", inv.argAfter("--in-parallel")) + assertEquals("3", inv.argAfter("--ref-max-align-cov")) + assertEquals("5", inv.argAfter("--query-max-align-cov")) + assertEquals(condaPrefix.toAbsolutePath().toString(), inv.argAfter("--conda-env-prefix")) + } + + @Test + fun justRefPrepSkipsMafFilePathsTextFile(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries").toString(), + "--just-ref-prep" + ) + ) + } + + val inv = executor.invocations.single() + assertTrue(inv.command.contains("--just-ref-prep"), "--just-ref-prep should be forwarded") + + val mafPathsFile = workDir.resolve("output/10_mutated_alignment_results/maf_file_paths.txt").toFile() + assertTrue( + !mafPathsFile.exists(), + "maf_file_paths.txt should NOT be written when --just-ref-prep is set" + ) + } + + @Test + fun mafFilePathsTextFileListsMafsWrittenByPhg(@TempDir workDir: Path) { + stubPhgBinary(workDir) + + // RecordingProcessExecutor doesn't actually run phg, so simulate its + // side-effect: drop one .maf file per query into the output directory + // before the phg invocation "returns". + val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputDir = inv.command.dropWhile { it != "-o" }.getOrNull(1)?.let { File(it) } + outputDir?.mkdirs() + val listFile = inv.command.dropWhile { it != "--assembly-file-list" }.getOrNull(1)?.let { File(it) } + listFile?.readLines()?.filter { it.isNotBlank() }?.forEach { fastaPath -> + val sampleName = File(fastaPath).nameWithoutExtension + File(outputDir, "$sampleName.maf").writeText("##maf version=1\n") + } + 0 + } + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries").toString() + ) + ) + } + + val mafPathsFile = workDir.resolve("output/10_mutated_alignment_results/maf_file_paths.txt").toFile() + assertTrue(mafPathsFile.exists(), "maf_file_paths.txt should be written") + val lines = mafPathsFile.readLines().filter { it.isNotBlank() } + assertEquals(3, lines.size, "Should list one MAF path per simulated phg output") + assertTrue(lines.all { it.endsWith(".maf") }, "Every listed path should be a .maf file") + } +} diff --git a/src/test/kotlin/net/maizegenetics/commands/align/PhgAlignRunnerUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/align/PhgAlignRunnerUnitTest.kt new file mode 100644 index 0000000..babf808 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/align/PhgAlignRunnerUnitTest.kt @@ -0,0 +1,215 @@ +package net.maizegenetics.commands.align + +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.apache.logging.log4j.LogManager +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [PhgAlignRunner] -- the shared backend used by both + * [net.maizegenetics.commands.AlignAssemblies] and + * [net.maizegenetics.commands.AlignMutatedAssemblies]. The wrappers' own + * tests still exercise the externally observable behaviour (command line, + * `assemblies_list.txt`, `maf_file_paths.txt`), so these tests focus on + * the runner's invariants directly: it should work for any caller, with + * any combination of step metadata. + */ +class PhgAlignRunnerUnitTest { + + private val smallseqRoot: Path = File("src/test/resources/smallseq") + .absoluteFile.toPath() + + private val logger = LogManager.getLogger(PhgAlignRunnerUnitTest::class.java) + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + } + + /** + * Create a fake PHG layout (bin/phg) inside [workDir] so the runner's + * [net.maizegenetics.utils.ValidationUtils.validatePhgSetup] passes. + */ + private fun stubPhgBinary(workDir: Path) { + val phgDir = workDir.resolve("src/phg_v2/bin") + phgDir.createDirectories() + val phg = phgDir.resolve("phg") + phg.writeText("#!/bin/sh\nexit 0\n") + phg.toFile().setExecutable(true) + } + + private fun baseParams(workDir: Path, queryInput: Path) = PhgAlignParams( + workDir = workDir, + refGff = smallseqRoot.resolve("anchors.gff"), + refFasta = smallseqRoot.resolve("Ref.fa"), + queryInput = queryInput, + threads = 2, + logFileName = "test_align.log", + outputSubdir = "test_align_results", + inputKind = "query", + ) + + @Test + fun happyPathInvokesPhgWithSharedArgs(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + PhgAlignRunner.run( + baseParams(workDir, smallseqRoot.resolve("queries")), + logger, + ) + } + + assertEquals(1, executor.invocations.size, "phg align-assemblies should be invoked exactly once") + val inv = executor.invocations.single() + assertTrue(inv.command.first().endsWith("phg"), "First token should be the phg binary") + assertEquals("align-assemblies", inv.command[1]) + + assertEquals( + smallseqRoot.resolve("anchors.gff").toAbsolutePath().toString(), + inv.argAfter("--gff") + ) + assertEquals( + smallseqRoot.resolve("Ref.fa").toAbsolutePath().toString(), + inv.argAfter("--reference-file") + ) + assertEquals("2", inv.argAfter("--total-threads")) + + // Output dir uses params.outputSubdir under /output/ + val expectedOutputDir = workDir.resolve("output/test_align_results") + assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o")) + + // assemblies_list.txt is materialized inside the output dir + val assemblyList = expectedOutputDir.resolve("assemblies_list.txt").toFile() + assertTrue(assemblyList.exists(), "assemblies_list.txt should have been written") + val listed = assemblyList.readLines().filter { it.isNotBlank() } + assertEquals(3, listed.size, "Smallseq queries directory contains 3 FASTAs") + + // Optional knobs absent unless set + assertTrue(!inv.command.contains("--in-parallel")) + assertTrue(!inv.command.contains("--ref-max-align-cov")) + assertTrue(!inv.command.contains("--query-max-align-cov")) + assertTrue(!inv.command.contains("--conda-env-prefix")) + assertTrue(!inv.command.contains("--just-ref-prep")) + } + + @Test + fun runnerHonoursPerCallerLogAndOutputMetadata(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + // Different log filename + subdir from the default test params + val params = baseParams(workDir, smallseqRoot.resolve("queries")).copy( + logFileName = "99_custom_step.log", + outputSubdir = "99_custom_step_results", + inputKind = "FASTA", + ) + + ProcessRunner.withExecutor(executor) { + PhgAlignRunner.run(params, logger) + } + + val inv = executor.invocations.single() + val expectedOutputDir = workDir.resolve("output/99_custom_step_results") + assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o")) + assertTrue( + workDir.resolve("logs/99_custom_step.log").toFile().exists(), + "Custom log file should be created under /logs/" + ) + } + + @Test + fun justRefPrepSkipsMafFilePathsTextFile(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + val params = baseParams(workDir, smallseqRoot.resolve("queries")).copy( + justRefPrep = true, + ) + + ProcessRunner.withExecutor(executor) { + PhgAlignRunner.run(params, logger) + } + + val inv = executor.invocations.single() + assertTrue(inv.command.contains("--just-ref-prep"), "--just-ref-prep should be forwarded") + + val mafPathsFile = workDir.resolve("output/test_align_results/maf_file_paths.txt").toFile() + assertTrue( + !mafPathsFile.exists(), + "maf_file_paths.txt should NOT be written when --just-ref-prep is set" + ) + } + + @Test + fun referenceFastaIsFilteredOutOfAssemblyList(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + // Build a text-list input that intentionally includes Ref.fa alongside + // the three real queries -- the runner should drop the reference. + val refFasta = smallseqRoot.resolve("Ref.fa").toAbsolutePath() + val queryList = workDir.resolve("queries_with_ref.txt") + queryList.writeText( + buildString { + appendLine(refFasta.toString()) + appendLine(smallseqRoot.resolve("queries/LineA.fa").toAbsolutePath().toString()) + appendLine(smallseqRoot.resolve("queries/LineB.fa").toAbsolutePath().toString()) + appendLine(smallseqRoot.resolve("queries/LineC.fa").toAbsolutePath().toString()) + } + ) + + ProcessRunner.withExecutor(executor) { + PhgAlignRunner.run(baseParams(workDir, queryList), logger) + } + + val inv = executor.invocations.single() + val assemblyListPath = inv.argAfter("--assembly-file-list") + ?: error("--assembly-file-list not present in command") + val listed = File(assemblyListPath).readLines().filter { it.isNotBlank() } + assertEquals( + 3, + listed.size, + "Reference FASTA should have been filtered out, leaving the 3 queries" + ) + assertTrue( + listed.none { it == refFasta.toString() }, + "Reference FASTA should not be present in the assembly file list" + ) + } + + @Test + fun optionalKnobsAreForwardedWhenProvided(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + val condaPrefix = workDir.resolve("conda_env").also { it.createDirectories() } + + val params = baseParams(workDir, smallseqRoot.resolve("queries/LineA.fa")).copy( + threads = 4, + inParallel = 2, + refMaxAlignCov = 3, + queryMaxAlignCov = 5, + condaEnvPrefix = condaPrefix, + ) + + ProcessRunner.withExecutor(executor) { + PhgAlignRunner.run(params, logger) + } + + val inv = executor.invocations.single() + assertEquals("4", inv.argAfter("--total-threads")) + assertEquals("2", inv.argAfter("--in-parallel")) + assertEquals("3", inv.argAfter("--ref-max-align-cov")) + assertEquals("5", inv.argAfter("--query-max-align-cov")) + assertEquals(condaPrefix.toAbsolutePath().toString(), inv.argAfter("--conda-env-prefix")) + } +} diff --git a/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt b/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt index cfcbf1e..468d471 100644 --- a/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt @@ -12,8 +12,8 @@ import kotlin.io.path.createDirectories import kotlin.test.assertTrue /** - * Integration test that actually invokes AnchorWave + minimap2 against the - * smallseq test resources. + * Integration test that actually invokes `phg align-assemblies` (which itself + * drives AnchorWave + minimap2) against the smallseq test resources. * * Runs only inside the seq-sim-dev container (gated by [IntegrationGuard]). * Outside the container this is a no-op assumption skip. @@ -26,10 +26,15 @@ class AlignAssembliesIntegrationTest { @Test fun alignsQueryAgainstSmallseqReference(@TempDir workDir: Path) { + IntegrationGuard.requirePhg() IntegrationGuard.requireAnchorwave() // seq-sim expects a pre-existing work dir (validateWorkingDirectory). + // We also need a phg binary at /src/phg_v2/bin/phg. workDir.createDirectories() + val phgSrcDir = workDir.resolve("src/phg_v2/bin").also { it.createDirectories() } + val phgFromEnv = File("${IntegrationGuard.phgDir}/bin/phg") + java.nio.file.Files.createSymbolicLink(phgSrcDir.resolve("phg"), phgFromEnv.toPath()) // Copy the single query into a dir so align-assemblies globs it. val queriesDir = workDir.resolve("queries").also { it.createDirectories() } diff --git a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt index f7e214c..d01aaf1 100644 --- a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt @@ -25,6 +25,10 @@ class OrchestrateE2ETest { @Test fun orchestrateSmallseqPipelineProducesMafAndGvcf(@TempDir workDir: Path) { + // align-assemblies now drives PHGv2 internally, so we need both the + // phg binary and AnchorWave on PATH. The orchestrator's setup-environment + // step takes care of populating /src/phg_v2 from SEQ_SIM_PHG_DIR. + IntegrationGuard.requirePhg() IntegrationGuard.requireAnchorwave() workDir.createDirectories()