Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: tests

on:
push:
branches: [main]
pull_request:

jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: Install samtools
run: |
sudo apt-get update
sudo apt-get install -y samtools

- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pysam numpy pytest parallelbam
python -m pip install -e . --no-deps

- name: Run tests
run: python -m pytest
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ lib64

__pycache__
.ipynb_checkpoints
.pytest_cache
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
testpaths = tests
addopts = -ra
89 changes: 89 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""
Shared fixtures for the filtersam test suite.

A small, hand-crafted SAM file with segments whose percent-identity and
percent-matched values are known exactly is used throughout. The values were
verified against pysam's own ``get_aligned_pairs``/CIGAR parsing:

name %identity %matched MD tag
read_perfect 100.0 100.0 MD:Z:10
read_90id 90.0 90.0 MD:Z:5A4
read_softclip 100.0 80.0 MD:Z:8 (CIGAR 2S8M)
read_70id 70.0 70.0 MD:Z:1A2C2A2
read_nomd ---- ---- (no MD tag, always dropped)
"""

import pysam
import pytest

SAM_TEXT = """@HD\tVN:1.6\tSO:unsorted
@SQ\tSN:ref\tLN:100
read_perfect\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tMD:Z:10
read_90id\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTAGGTAC\t*\tMD:Z:5A4
read_softclip\t0\tref\t1\t60\t2S8M\t*\t0\t0\tTTACGTACGT\t*\tMD:Z:8
read_70id\t0\tref\t1\t60\t10M\t*\t0\t0\tAGGTGGTGAC\t*\tMD:Z:1A2C2A2
read_nomd\t0\tref\t1\t60\t10M\t*\t0\t0\tACGTACGTAC\t*\tNM:i:0
"""

# Expected segments retained for a given filter and cutoff.
IDENTITY_KEPT = {
95.0: {"read_perfect", "read_softclip"},
90.0: {"read_perfect", "read_90id", "read_softclip"},
70.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"},
}
MATCHED_KEPT = {
100.0: {"read_perfect"},
85.0: {"read_perfect", "read_90id"},
50.0: {"read_perfect", "read_90id", "read_softclip", "read_70id"},
}


def _write_sam(path):
path.write_text(SAM_TEXT)
return path


def read_segment_names(path):
"""Return the set of query names in a SAM/BAM file (format auto-detected)."""
save = pysam.set_verbosity(0)
with pysam.AlignmentFile(str(path), "r") as handle:
names = {seg.query_name for seg in handle}
pysam.set_verbosity(save)
return names


def detect_format(path):
"""Return 'bam' if the file is BGZF/BAM-compressed, 'sam' if plain text."""
with open(path, "rb") as handle:
magic = handle.read(2)
return "bam" if magic == b"\x1f\x8b" else "sam"


@pytest.fixture
def sam_path(tmp_path):
"""A plain-text SAM fixture file."""
return _write_sam(tmp_path / "sample.sam")


@pytest.fixture
def bam_path(tmp_path):
"""A BAM fixture file built from the same records as ``sam_path``."""
sam = _write_sam(tmp_path / "_src.sam")
bam = tmp_path / "sample.bam"
save = pysam.set_verbosity(0)
with pysam.AlignmentFile(str(sam), "r") as src:
with pysam.AlignmentFile(str(bam), "wb", template=src) as dst:
for seg in src:
dst.write(seg)
pysam.set_verbosity(save)
return bam


@pytest.fixture
def segments(sam_path):
"""The parsed AlignedSegment objects, keyed by query name."""
save = pysam.set_verbosity(0)
with pysam.AlignmentFile(str(sam_path), "r") as handle:
segs = {seg.query_name: seg for seg in handle}
pysam.set_verbosity(save)
return segs
87 changes: 87 additions & 0 deletions tests/test_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""
Tests for the single-file filtering functions:
filterSAMbyIdentity and filterSAMbyPercentMatched.
"""

from pathlib import Path

import pytest

from filtersam.filtersam import filterSAMbyIdentity, filterSAMbyPercentMatched

from conftest import IDENTITY_KEPT, MATCHED_KEPT, detect_format, read_segment_names


@pytest.mark.parametrize("cutoff", sorted(IDENTITY_KEPT))
def test_filter_by_identity_keeps_expected(sam_path, tmp_path, cutoff):
out = tmp_path / "out.sam"
filterSAMbyIdentity(sam_path, out, identity_cutoff=cutoff)
assert read_segment_names(out) == IDENTITY_KEPT[cutoff]


@pytest.mark.parametrize("cutoff", sorted(MATCHED_KEPT))
def test_filter_by_matched_keeps_expected(sam_path, tmp_path, cutoff):
out = tmp_path / "out.sam"
filterSAMbyPercentMatched(sam_path, out, matched_cutoff=cutoff)
assert read_segment_names(out) == MATCHED_KEPT[cutoff]


def test_segments_without_md_tag_are_always_dropped(sam_path, tmp_path):
# Cutoff 0 keeps everything that has an MD tag, but never the MD-less read.
out = tmp_path / "out.sam"
filterSAMbyIdentity(sam_path, out, identity_cutoff=0.0)
assert "read_nomd" not in read_segment_names(out)


def test_works_on_bam_input(bam_path, tmp_path):
out = tmp_path / "out.bam"
filterSAMbyIdentity(bam_path, out, identity_cutoff=95.0)
assert read_segment_names(out) == IDENTITY_KEPT[95.0]


# --- Output format selection (regression guard for the single-process BAM fix) ---

def test_output_sam_is_text(sam_path, tmp_path):
out = tmp_path / "out.sam"
filterSAMbyIdentity(sam_path, out, identity_cutoff=95.0)
assert detect_format(out) == "sam"


def test_output_bam_is_binary(sam_path, tmp_path):
out = tmp_path / "out.bam"
filterSAMbyIdentity(sam_path, out, identity_cutoff=95.0)
assert detect_format(out) == "bam"


def test_output_format_follows_output_extension_not_input(bam_path, tmp_path):
# BAM input but a .sam output request must yield text SAM.
out = tmp_path / "out.sam"
filterSAMbyIdentity(bam_path, out, identity_cutoff=95.0)
assert detect_format(out) == "sam"
assert read_segment_names(out) == IDENTITY_KEPT[95.0]


# --- Default output path naming (regression guard for the suffix fix) ---

def test_default_output_path_naming(tmp_path):
# A filename containing 'sam' before the extension used to break the old
# regex-based extension detection; the suffix-based logic handles it.
src = tmp_path / "mysample.bam"
# Build a tiny BAM from the SAM fixture text.
from conftest import SAM_TEXT
import pysam
sam = tmp_path / "_seed.sam"
sam.write_text(SAM_TEXT)
save = pysam.set_verbosity(0)
with pysam.AlignmentFile(str(sam), "r") as s, \
pysam.AlignmentFile(str(src), "wb", template=s) as d:
for seg in s:
d.write(seg)
pysam.set_verbosity(save)

filterSAMbyIdentity(src, identity_cutoff=95.0)

expected = tmp_path / "mysample.identity_filtered_at_95.0.bam"
assert expected.is_file()
assert detect_format(expected) == "bam"
assert read_segment_names(expected) == IDENTITY_KEPT[95.0]
60 changes: 60 additions & 0 deletions tests/test_filterSAM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""
Tests for the filterSAM dispatcher: argument validation and the
single-vs-parallel routing logic.
"""

import pytest

from filtersam import filtersam as fs
from filtersam.filtersam import filterSAM

from conftest import IDENTITY_KEPT, read_segment_names


def test_invalid_filter_by_raises(sam_path, tmp_path):
with pytest.raises(ValueError):
filterSAM(sam_path, tmp_path / "out.sam", filter_by="nonsense", cutoff=95.0)


@pytest.mark.parametrize("cutoff", [-1.0, 100.1, 1000.0])
def test_out_of_range_cutoff_raises(sam_path, tmp_path, cutoff):
with pytest.raises(ValueError):
filterSAM(sam_path, tmp_path / "out.sam", filter_by="identity", cutoff=cutoff)


def test_none_processes_uses_single_path(bam_path, tmp_path, monkeypatch):
calls = []
monkeypatch.setattr(fs, "parallelizeBAMoperation",
lambda *a, **k: calls.append((a, k)))
out = tmp_path / "out.bam"
filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=None)
assert calls == []
assert read_segment_names(out) == IDENTITY_KEPT[95.0]


def test_single_process_does_not_split(bam_path, tmp_path, monkeypatch):
# Regression guard for #3 / PR #6: `-p 1` must take the direct path and
# never invoke the (expensive) parallel splitting machinery.
calls = []
monkeypatch.setattr(fs, "parallelizeBAMoperation",
lambda *a, **k: calls.append((a, k)))
out = tmp_path / "out.bam"
filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=1)
assert calls == [], "n_processes=1 should not call parallelizeBAMoperation"
assert read_segment_names(out) == IDENTITY_KEPT[95.0]


def test_multiple_processes_use_parallel_path(bam_path, tmp_path, monkeypatch):
calls = []
monkeypatch.setattr(fs, "parallelizeBAMoperation",
lambda *a, **k: calls.append((a, k)))
out = tmp_path / "out.bam"
filterSAM(bam_path, out, filter_by="identity", cutoff=95.0, n_processes=2)
assert len(calls) == 1, "n_processes>1 should call parallelizeBAMoperation"


def test_parallel_run_matches_serial(bam_path, tmp_path):
# End-to-end parallel run (uses samtools-backed splitting from parallelbam).
out = tmp_path / "out_parallel.bam"
filterSAM(bam_path, out, filter_by="identity", cutoff=70.0, n_processes=2)
assert read_segment_names(out) == IDENTITY_KEPT[70.0]
59 changes: 59 additions & 0 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Unit tests for the per-segment metric helpers in filtersam.filtersam.
"""

import pytest

from filtersam import filtersam as fs


def test_has_md_tag(segments):
assert fs.has_MD_tag(segments["read_perfect"])
assert not fs.has_MD_tag(segments["read_nomd"])


def test_sum_matches_and_mismatches(segments):
# Sum of CIGAR M lengths.
assert fs.sumMatchesAndMismatches(segments["read_perfect"]) == 10
assert fs.sumMatchesAndMismatches(segments["read_90id"]) == 10
# Soft-clipped bases (2S) do not count towards M.
assert fs.sumMatchesAndMismatches(segments["read_softclip"]) == 8


def test_get_number_of_matches(segments):
assert fs.getNumberOfMatches(segments["read_perfect"]) == 10
assert fs.getNumberOfMatches(segments["read_90id"]) == 9
assert fs.getNumberOfMatches(segments["read_softclip"]) == 8
assert fs.getNumberOfMatches(segments["read_70id"]) == 7


def test_get_query_length(segments):
# M + I + S + = + X consume query; soft clip is included.
assert fs.getQueryLength(segments["read_perfect"]) == 10
assert fs.getQueryLength(segments["read_softclip"]) == 10


@pytest.mark.parametrize(
"name,expected",
[
("read_perfect", 100.0),
("read_90id", 90.0),
("read_softclip", 100.0),
("read_70id", 70.0),
],
)
def test_percent_identity(segments, name, expected):
assert fs.percent_identity(segments[name]) == pytest.approx(expected)


@pytest.mark.parametrize(
"name,expected",
[
("read_perfect", 100.0),
("read_90id", 90.0),
("read_softclip", 80.0),
("read_70id", 70.0),
],
)
def test_percent_matched(segments, name, expected):
assert fs.percent_matched(segments[name]) == pytest.approx(expected)
Loading