Robaina · Robaina · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,30 @@
+name: tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          # Quoted so YAML keeps the version a string instead of reading it as the float 3.1.
+          python-version: "3.10"
+
+      # The samtools binary is installed for the end-to-end parallel test, whose
+      # splitting step (provided by parallelbam) is samtools-backed.
+      - name: Install samtools
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y samtools
+
+      # Dependencies are listed explicitly; the package itself is then installed
+      # in editable mode with --no-deps, i.e. without pip installing its declared
+      # requirements.
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install pysam numpy pytest parallelbam
+          python -m pip install -e . --no-deps
+
+      # Test discovery options (testpaths, addopts) come from pytest.ini.
+      - name: Run tests
+        run: python -m pytest
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,4 @@ lib64
 
 __pycache__
 .ipynb_checkpoints
+.pytest_cache
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+addopts = -ra
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,89 @@
+"""
+Shared fixtures for the filtersam test suite.
+
+A small, hand-crafted SAM file with segments whose percent-identity and
+percent-matched values are known exactly is used throughout. The values were
+verified against pysam's own ``get_aligned_pairs``/CIGAR parsing:
+
+    name            %identity   %matched   MD tag
+    read_perfect       100.0      100.0     MD:Z:10
+    read_90id           90.0       90.0     MD:Z:5A4
+    read_softclip      100.0       80.0     MD:Z:8     (CIGAR 2S8M)
+    read_70id           70.0       70.0     MD:Z:1A2C2A2
+    read_nomd           ----       ----     (no MD tag, always dropped)
+"""
+
+import pysam
+import pytest
+
+# Header plus five records, all with flag 0 and aligned at position 1 of the
+# 100 bp reference "ref". Fields are tab-separated; the last field is the MD tag
+# for every read except read_nomd, which only carries NM.
+SAM_TEXT = """@HD\tVN:1.6\tSO:unsorted
+@SQ\tSN:ref\tLN:100
+read_perfect\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tMD:Z:10
+read_90id\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTAGGTAC\t*\tMD:Z:5A4
+read_softclip\t0\tref\t1\t60\t2S8M\t*\t0\t0\tTTACGTACGT\t*\tMD:Z:8
+read_70id\t0\tref\t1\t60\t10M\t*\t0\t0\tAGGTGGTGAC\t*\tMD:Z:1A2C2A2
+read_nomd\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tNM:i:0
+"""
+
+# Expected segments retained for a given filter and cutoff.
+# Keys are cutoffs in percent; values are the query names expected in the
+# output. A read whose value equals the cutoff is expected to be kept
+# (e.g. read_90id at 90.0), and read_nomd never appears.
+IDENTITY_KEPT = {
+    95.0: {"read_perfect", "read_softclip"},
+    90.0: {"read_perfect", "read_90id", "read_softclip"},
+    70.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"},
+}
+# Same layout for the percent-matched filter; read_softclip (80% matched) is
+# the read that separates the 85.0 and 50.0 cutoffs.
+MATCHED_KEPT = {
+    100.0: {"read_perfect"},
+    85.0: {"read_perfect", "read_90id"},
+    50.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"},
+}
+
+
+def _write_sam(path):
+    """Write the fixture SAM text to ``path`` and hand the same path back."""
+    with path.open("w") as stream:
+        stream.write(SAM_TEXT)
+    return path
+
+
+def read_segment_names(path):
+    """Return the set of query names in a SAM/BAM file (format auto-detected).
+
+    pysam's verbosity is set to 0 while the file is read and the previous
+    level is restored afterwards, even if opening or iterating the file raises.
+    """
+    previous_verbosity = pysam.set_verbosity(0)
+    try:
+        with pysam.AlignmentFile(str(path), "r") as handle:
+            return {seg.query_name for seg in handle}
+    finally:
+        # The verbosity setting is global, so it must be restored on every
+        # exit path; otherwise one failing read would leave later tests silenced.
+        pysam.set_verbosity(previous_verbosity)
+
+
+def detect_format(path):
+    """Classify a file as 'bam' or 'sam' from its first two bytes.
+
+    'bam' is returned when the file starts with the gzip magic number
+    (0x1f 0x8b); any other content, including an empty file, is reported
+    as 'sam'.
+    """
+    gzip_magic = b"\x1f\x8b"
+    with open(path, "rb") as stream:
+        leading_bytes = stream.read(2)
+    if leading_bytes == gzip_magic:
+        return "bam"
+    return "sam"
+
+
+@pytest.fixture
+def sam_path(tmp_path):
+    """Provide the fixture records as a plain-text SAM file under ``tmp_path``."""
+    target = tmp_path / "sample.sam"
+    return _write_sam(target)
+
+
+@pytest.fixture
+def bam_path(tmp_path):
+    """A BAM fixture file built from the same records as ``sam_path``.
+
+    The SAM text is written to a scratch file and every record is copied into
+    ``sample.bam`` using the SAM header as template. pysam's verbosity is
+    lowered during the conversion and restored afterwards, even on failure.
+    """
+    sam = _write_sam(tmp_path / "_src.sam")
+    bam = tmp_path / "sample.bam"
+    previous_verbosity = pysam.set_verbosity(0)
+    try:
+        with pysam.AlignmentFile(str(sam), "r") as src, \
+                pysam.AlignmentFile(str(bam), "wb", template=src) as dst:
+            for seg in src:
+                dst.write(seg)
+    finally:
+        # Global setting: restore it on every exit path so a failed conversion
+        # does not silence pysam for the rest of the session.
+        pysam.set_verbosity(previous_verbosity)
+    return bam
+
+
+@pytest.fixture
+def segments(sam_path):
+    """The parsed AlignedSegment objects, keyed by query name.
+
+    pysam's verbosity is lowered while the SAM file is parsed and restored
+    afterwards, even if parsing raises.
+    """
+    previous_verbosity = pysam.set_verbosity(0)
+    try:
+        with pysam.AlignmentFile(str(sam_path), "r") as handle:
+            return {seg.query_name: seg for seg in handle}
+    finally:
+        # Global setting: always put the previous level back.
+        pysam.set_verbosity(previous_verbosity)
diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -0,0 +1,87 @@
+"""
+Tests for the single-file filtering functions:
+filterSAMbyIdentity and filterSAMbyPercentMatched.
+"""
+
+from pathlib import Path
+
+import pytest
+
+from filtersam.filtersam import filterSAMbyIdentity, filterSAMbyPercentMatched
+
+from conftest import IDENTITY_KEPT, MATCHED_KEPT, detect_format, read_segment_names
+
+
+@pytest.mark.parametrize("cutoff", sorted(IDENTITY_KEPT))
+def test_filter_by_identity_keeps_expected(sam_path, tmp_path, cutoff):
+    """Each identity cutoff keeps exactly the reads listed in IDENTITY_KEPT."""
+    filtered = tmp_path / "out.sam"
+    filterSAMbyIdentity(sam_path, filtered, identity_cutoff=cutoff)
+    kept = read_segment_names(filtered)
+    assert kept == IDENTITY_KEPT[cutoff]
+
+
+@pytest.mark.parametrize("cutoff", sorted(MATCHED_KEPT))
+def test_filter_by_matched_keeps_expected(sam_path, tmp_path, cutoff):
+    """Each percent-matched cutoff keeps exactly the reads in MATCHED_KEPT."""
+    filtered = tmp_path / "out.sam"
+    filterSAMbyPercentMatched(sam_path, filtered, matched_cutoff=cutoff)
+    kept = read_segment_names(filtered)
+    assert kept == MATCHED_KEPT[cutoff]
+
+
+def test_segments_without_md_tag_are_always_dropped(sam_path, tmp_path):
+    """A read lacking the MD tag is absent from the output even at cutoff 0."""
+    filtered = tmp_path / "out.sam"
+    # Cutoff 0 keeps everything that has an MD tag, but never the MD-less read.
+    filterSAMbyIdentity(sam_path, filtered, identity_cutoff=0.0)
+    kept = read_segment_names(filtered)
+    assert "read_nomd" not in kept
+
+
+def test_works_on_bam_input(bam_path, tmp_path):
+    """Filtering a BAM input yields the same reads as the SAM expectation."""
+    filtered = tmp_path / "out.bam"
+    filterSAMbyIdentity(bam_path, filtered, identity_cutoff=95.0)
+    kept = read_segment_names(filtered)
+    assert kept == IDENTITY_KEPT[95.0]
+
+
+# --- Output format selection (regression guard for the single-process BAM fix) ---
+
+def test_output_sam_is_text(sam_path, tmp_path):
+    """An output path ending in .sam is written as plain text."""
+    filtered = tmp_path / "out.sam"
+    filterSAMbyIdentity(sam_path, filtered, identity_cutoff=95.0)
+    written_format = detect_format(filtered)
+    assert written_format == "sam"
+
+
+def test_output_bam_is_binary(sam_path, tmp_path):
+    """An output path ending in .bam is written as compressed BAM."""
+    filtered = tmp_path / "out.bam"
+    filterSAMbyIdentity(sam_path, filtered, identity_cutoff=95.0)
+    written_format = detect_format(filtered)
+    assert written_format == "bam"
+
+
+def test_output_format_follows_output_extension_not_input(bam_path, tmp_path):
+    """The output extension, not the input format, decides the written format."""
+    # BAM input but a .sam output request must yield text SAM.
+    filtered = tmp_path / "out.sam"
+    filterSAMbyIdentity(bam_path, filtered, identity_cutoff=95.0)
+    written_format = detect_format(filtered)
+    assert written_format == "sam"
+    kept = read_segment_names(filtered)
+    assert kept == IDENTITY_KEPT[95.0]
+
+
+# --- Default output path naming (regression guard for the suffix fix) ---
+
+def test_default_output_path_naming(tmp_path):
+    """Omitting the output path derives it from the input name and extension.
+
+    A filename containing 'sam' before the extension used to break the old
+    regex-based extension detection; the suffix-based logic handles it.
+    """
+    # Imported locally because this module does not import them at the top.
+    import pysam
+    from conftest import SAM_TEXT
+
+    # Build a tiny BAM from the SAM fixture text.
+    src = tmp_path / "mysample.bam"
+    sam = tmp_path / "_seed.sam"
+    sam.write_text(SAM_TEXT)
+    previous_verbosity = pysam.set_verbosity(0)
+    try:
+        with pysam.AlignmentFile(str(sam), "r") as s, \
+                pysam.AlignmentFile(str(src), "wb", template=s) as d:
+            for seg in s:
+                d.write(seg)
+    finally:
+        # Global setting: restore it even if building the BAM fails.
+        pysam.set_verbosity(previous_verbosity)
+
+    filterSAMbyIdentity(src, identity_cutoff=95.0)
+
+    expected = tmp_path / "mysample.identity_filtered_at_95.0.bam"
+    assert expected.is_file()
+    assert detect_format(expected) == "bam"
+    assert read_segment_names(expected) == IDENTITY_KEPT[95.0]
diff --git a/tests/test_filterSAM.py b/tests/test_filterSAM.py
@@ -0,0 +1,60 @@
+"""
+Tests for the filterSAM dispatcher: argument validation and the
+single-vs-parallel routing logic.
+"""
+
+import pytest
+
+from filtersam import filtersam as fs
+from filtersam.filtersam import filterSAM
+
+from conftest import IDENTITY_KEPT, read_segment_names
+
+
+def test_invalid_filter_by_raises(sam_path, tmp_path):
+    """An unrecognised ``filter_by`` value is rejected with ValueError."""
+    destination = tmp_path / "out.sam"
+    with pytest.raises(ValueError):
+        filterSAM(sam_path, destination, filter_by="nonsense", cutoff=95.0)
+
+
+@pytest.mark.parametrize("cutoff", [-1.0, 100.1, 1000.0])
+def test_out_of_range_cutoff_raises(sam_path, tmp_path, cutoff):
+    """Cutoffs below 0 or above 100 are rejected with ValueError."""
+    destination = tmp_path / "out.sam"
+    with pytest.raises(ValueError):
+        filterSAM(sam_path, destination, filter_by="identity", cutoff=cutoff)
+
+
+def test_none_processes_uses_single_path(bam_path, tmp_path, monkeypatch):
+    """With ``n_processes=None`` the parallel helper is never called."""
+    calls = []
+
+    def record_call(*args, **kwargs):
+        calls.append((args, kwargs))
+
+    monkeypatch.setattr(fs, "parallelizeBAMoperation", record_call)
+    filtered = tmp_path / "out.bam"
+    filterSAM(bam_path, filtered, filter_by="identity", cutoff=95.0,
+              n_processes=None)
+    assert calls == []
+    assert read_segment_names(filtered) == IDENTITY_KEPT[95.0]
+
+
+def test_single_process_does_not_split(bam_path, tmp_path, monkeypatch):
+    """With ``n_processes=1`` the parallel helper is never called."""
+    # Regression guard for #3 / PR #6: `-p 1` must take the direct path and
+    # never invoke the (expensive) parallel splitting machinery.
+    calls = []
+
+    def record_call(*args, **kwargs):
+        calls.append((args, kwargs))
+
+    monkeypatch.setattr(fs, "parallelizeBAMoperation", record_call)
+    filtered = tmp_path / "out.bam"
+    filterSAM(bam_path, filtered, filter_by="identity", cutoff=95.0,
+              n_processes=1)
+    assert calls == [], "n_processes=1 should not call parallelizeBAMoperation"
+    assert read_segment_names(filtered) == IDENTITY_KEPT[95.0]
+
+
+def test_multiple_processes_use_parallel_path(bam_path, tmp_path, monkeypatch):
+    """With ``n_processes=2`` the parallel helper is called exactly once."""
+    calls = []
+
+    def record_call(*args, **kwargs):
+        calls.append((args, kwargs))
+
+    monkeypatch.setattr(fs, "parallelizeBAMoperation", record_call)
+    filtered = tmp_path / "out.bam"
+    filterSAM(bam_path, filtered, filter_by="identity", cutoff=95.0,
+              n_processes=2)
+    assert len(calls) == 1, "n_processes>1 should call parallelizeBAMoperation"
+
+
+def test_parallel_run_matches_serial(bam_path, tmp_path):
+    """A real two-process run keeps the reads expected at cutoff 70."""
+    # End-to-end parallel run (uses samtools-backed splitting from parallelbam).
+    filtered = tmp_path / "out_parallel.bam"
+    filterSAM(bam_path, filtered, filter_by="identity", cutoff=70.0,
+              n_processes=2)
+    kept = read_segment_names(filtered)
+    assert kept == IDENTITY_KEPT[70.0]
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -0,0 +1,59 @@
+"""
+Unit tests for the per-segment metric helpers in filtersam.filtersam.
+"""
+
+import pytest
+
+from filtersam import filtersam as fs
+
+
+def test_has_md_tag(segments):
+    """has_MD_tag is truthy for a read with MD and falsy for one without."""
+    with_md = segments["read_perfect"]
+    assert fs.has_MD_tag(with_md)
+    without_md = segments["read_nomd"]
+    assert not fs.has_MD_tag(without_md)
+
+
+def test_sum_matches_and_mismatches(segments):
+    """sumMatchesAndMismatches returns the summed CIGAR M lengths."""
+    expected_m_lengths = [
+        # Sum of CIGAR M lengths.
+        ("read_perfect", 10),
+        ("read_90id", 10),
+        # Soft-clipped bases (2S) do not count towards M.
+        ("read_softclip", 8),
+    ]
+    for read_name, m_length in expected_m_lengths:
+        assert fs.sumMatchesAndMismatches(segments[read_name]) == m_length
+
+
+def test_get_number_of_matches(segments):
+    """getNumberOfMatches returns the expected match count per fixture read."""
+    expected_matches = [
+        ("read_perfect", 10),
+        ("read_90id", 9),
+        ("read_softclip", 8),
+        ("read_70id", 7),
+    ]
+    for read_name, match_count in expected_matches:
+        assert fs.getNumberOfMatches(segments[read_name]) == match_count
+
+
+def test_get_query_length(segments):
+    """getQueryLength is 10 for both the unclipped and the soft-clipped read."""
+    # M + I + S + = + X consume query; soft clip is included.
+    for read_name in ("read_perfect", "read_softclip"):
+        assert fs.getQueryLength(segments[read_name]) == 10
+
+
+@pytest.mark.parametrize(
+    "name,expected",
+    [
+        ("read_perfect", 100.0),
+        ("read_90id", 90.0),
+        ("read_softclip", 100.0),
+        ("read_70id", 70.0),
+    ],
+)
+def test_percent_identity(segments, name, expected):
+    """percent_identity matches the known value for each fixture read."""
+    observed = fs.percent_identity(segments[name])
+    assert observed == pytest.approx(expected)
+
+
+@pytest.mark.parametrize(
+    "name,expected",
+    [
+        ("read_perfect", 100.0),
+        ("read_90id", 90.0),
+        ("read_softclip", 80.0),
+        ("read_70id", 70.0),
+    ],
+)
+def test_percent_matched(segments, name, expected):
+    """percent_matched matches the known value for each fixture read."""
+    observed = fs.percent_matched(segments[name])
+    assert observed == pytest.approx(expected)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -14,3 +14,4 @@ lib64

		__pycache__
		.ipynb_checkpoints
		.pytest_cache