Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
"openssh-client",
"libaio-dev",
"unzip",
"yamllint"
"yamllint",
"time"
],
"install Ruby": [
"/usr/bin/apt",
Expand Down
41 changes: 41 additions & 0 deletions .github/workflows/acceptable_memory_usage_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Scheduled / manually-triggered workflow that runs the tests marked "slow"
# under memray on each supported Python version.
name: performance_test

on:
  schedule:
    # NOTE(review): confirm that the `timezone` key is honoured by the
    # schedule trigger; if it is not, this cron expression is read as UTC.
    - cron: "0 0 * * *"
      timezone: "America/Vancouver"
  workflow_dispatch:

concurrency:
  group: test-${{ github.head_ref }}
  cancel-in-progress: true

env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"

jobs:
  run:
    name: Python ${{ matrix.python-version }} Performance Tests
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.11", "3.12", "3.13"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        # Steps run as a non-root user, so the package manager needs sudo;
        # -y keeps the install from waiting on a confirmation prompt.
        run: |
          sudo apt-get update && sudo apt-get install -y yamllint
          pip install uv

      - name: Run slow tests
        run: uv run pytest --memray -m slow --no-cov
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ dependencies = [

[dependency-groups]
dev = [
"gprof2dot>=2025.4.14",
"mypy>=1.15.0",
"mypy-extensions>=1.0.0",
"pandas>=2.2.3",
Expand All @@ -47,6 +48,7 @@ dev = [
"pytest>=8.3.5",
"pytest-cov>=6.0.0",
"pytest-html>=4.1.1",
"pytest-memray>=1.8.0",
"pytest-mock>=3.14.0",
"pytest-xdist>=3.6.1",
"ruff>=0.9.9",
Expand Down
12 changes: 1 addition & 11 deletions src/hla_algorithm/hla_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,6 @@ def combine_standards_stepper(
- this is below our mismatch threshold.
If the mismatch threshold is 0, then we will only ever get the former.
"""
# Keep track of matches we've already found:
combos: dict[tuple[int, ...], int] = {}

current_rejection_threshold: int | float = float("inf")
for std_ai, std_a in enumerate(matching_stds):
if std_a.mismatch > current_rejection_threshold:
Expand All @@ -278,14 +275,7 @@ def combine_standards_stepper(
# same sequence, so check if this one's already been found.
combined_std_bin: tuple[int, ...] = tuple(int(s) for s in std_bin)

mismatches: int = -1
if combined_std_bin in combos:
mismatches = combos[combined_std_bin]

else:
# Note that seq is implicitly cast to a NumPy array:
mismatches = np.count_nonzero(std_bin ^ seq != 0)
combos[combined_std_bin] = mismatches # cache this value
mismatches: int = np.count_nonzero(std_bin ^ seq != 0)

if mismatches > current_rejection_threshold:
continue
Expand Down
103 changes: 103 additions & 0 deletions src/scripts/measure_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#! /usr/bin/env python

import argparse
import csv
import json
import re
import subprocess
from pathlib import Path
from typing import TypedDict

# Patterns for the two fields of interest in the verbose report produced by
# `/usr/bin/time -v`; each one captures the remainder of its line.
TIME_REGEX = re.compile(
    r"^\s*Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): (.*)$",
    flags=re.MULTILINE,
)
MEMORY_REGEX = re.compile(
    r"^\s*Maximum resident set size \(kbytes\): (.*)$",
    flags=re.MULTILINE,
)
# Captures the sample name from an input file name ending in ".BA.txt".
SAMPLE_REGEX = re.compile(r"^(.*)\.BA\.txt$")


def _extract_field(pattern: re.Pattern[str], time_output: str, label: str) -> str:
    """Return the first capture group of *pattern* found in *time_output*.

    Raises:
        ValueError: if *pattern* does not occur in *time_output*; *label*
            names the missing field in the error message.
    """
    found = pattern.search(time_output)
    if found is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            f"Could not find {label} in the output of time: {time_output!r}"
        )
    return found.group(1)


def get_wall_clock_time(time_output: str) -> str:
    """Return the elapsed wall clock time reported in *time_output*.

    The value is returned exactly as printed (h:mm:ss or m:ss).

    Raises:
        ValueError: if the report does not contain the elapsed-time line.
    """
    return _extract_field(TIME_REGEX, time_output, "the elapsed wall clock time")


def get_max_memory_usage(time_output: str) -> str:
    """Return the maximum resident set size (in kbytes) from *time_output*.

    The value is returned as the string that was printed, not as a number.

    Raises:
        ValueError: if the report does not contain the memory line.
    """
    return _extract_field(
        MEMORY_REGEX, time_output, "the maximum resident set size"
    )


class ResourceSummary(TypedDict):
    """One row of the resource-usage CSV: the measurements for one sample."""

    # Input file name with its ".BA.txt" suffix removed.
    sample_name: str
    # Elapsed wall clock time, kept as the text printed by `time`
    # (h:mm:ss or m:ss).
    wall_clock_time: str
    # Maximum resident set size in kbytes, kept as the text printed by `time`.
    max_memory_usage_kb: str


def main() -> None:
    """Measure the time and memory used to interpret each sample in a directory.

    For every ``<sample>.BA.txt`` file in the input directory, the matching
    ``<sample>.BB.txt`` file is read, both sequences are written to
    ``<sample>.json`` in the same directory (with the locus fixed to "B"),
    and ``interpret_from_json`` is run on that file under ``/usr/bin/time -v``.
    The captured stdout/stderr are echoed, and the wall clock time and maximum
    resident set size of every sample are written to the output CSV.

    Samples whose ``.BB.txt`` file is missing are reported and skipped.
    """
    # The text must be passed as `description`: the first positional parameter
    # of ArgumentParser is `prog`, which would put it in the usage line.
    parser = argparse.ArgumentParser(
        description="Process HLA sequences and report the resource usage."
    )
    parser.add_argument(
        "input_dir",
        help="Directory to scan for HLA sequences",
        type=Path,
    )
    parser.add_argument(
        "--output_csv",
        help="CSV file summary",
        type=Path,
        default=Path("out.csv"),
    )
    args = parser.parse_args()

    resource_summaries: list[ResourceSummary] = []

    # glob yields files in arbitrary order; sort so that the samples are
    # processed, and the CSV rows written, in a reproducible order.
    for exon1_filepath in sorted(args.input_dir.glob("*.BA.txt")):
        sample_name: str = SAMPLE_REGEX.match(exon1_filepath.name).group(1)
        exon2_filepath: Path = args.input_dir / f"{sample_name}.BB.txt"
        if not exon2_filepath.is_file():
            # One unpaired file should not abort the whole measurement run.
            print(
                f"----\nSample {sample_name}: skipped, "
                f"{exon2_filepath.name} not found"
            )
            continue

        exon1: str = exon1_filepath.read_text(encoding="utf-8").strip()
        exon2: str = exon2_filepath.read_text(encoding="utf-8").strip()

        json_input = {
            "seq1": exon1,
            "seq2": exon2,
            "locus": "B",
        }
        json_filepath: Path = args.input_dir / f"{sample_name}.json"
        json_filepath.write_text(json.dumps(json_input), encoding="utf-8")

        print(f"----\nSample {sample_name}:")
        result = subprocess.run(
            [
                "/usr/bin/time",
                "-v",
                "interpret_from_json",
                json_filepath.as_posix(),
            ],
            capture_output=True,
            text=True,
        )
        print("stdout:")
        print(result.stdout)
        print("stderr:")
        print(result.stderr)

        # The resource report is parsed from the captured stderr.
        resource_summaries.append(
            {
                "sample_name": sample_name,
                "wall_clock_time": get_wall_clock_time(result.stderr),
                "max_memory_usage_kb": get_max_memory_usage(result.stderr),
            }
        )

    # newline="" lets the csv module control line endings itself, as its
    # documentation requires for files handed to a writer.
    with open(args.output_csv, "w", newline="", encoding="utf-8") as f:
        resource_summary_writer = csv.DictWriter(
            f,
            fieldnames=("sample_name", "wall_clock_time", "max_memory_usage_kb"),
        )
        resource_summary_writer.writeheader()
        resource_summary_writer.writerows(resource_summaries)
Comment thread
va7eex marked this conversation as resolved.


if __name__ == "__main__":
main()
27 changes: 27 additions & 0 deletions tests/acceptable_memory_usage_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
import pytest

from hla_algorithm.hla_algorithm import HLAAlgorithm
from hla_algorithm.models import HLASequence, HLAStandard


@pytest.mark.slow
@pytest.mark.limit_memory("500 MB")
def test_acceptable_memory_usage():
    """Interpret a deliberately expensive locus-B sequence under a memory cap.

    The test carries the ``slow`` marker and a ``limit_memory`` marker of
    500 MB.
    """
    # We process a sequence produced by "mushing together" B*07:02:01G
    # and B*45:01:01G, which as of the v2.63.0-alpha HLA alleles produces
    # an expensive calculation.
    algorithm = HLAAlgorithm()
    b_standards = algorithm.hla_standards["B"]

    first: HLAStandard = b_standards["B*07:02:01G"]
    second: HLAStandard = b_standards["B*45:01:01G"]

    # Element-wise OR of the two standards, one array per field.
    merged_two = np.array(first.two) | np.array(second.two)
    merged_three = np.array(first.three) | np.array(second.three)

    expensive_sequence = HLASequence(
        two=(int(value) for value in merged_two),
        intron=(),
        three=(int(value) for value in merged_three),
        name="expensive_sequence",
        locus="B",
    )

    algorithm.interpret(expensive_sequence)
Loading
Loading