From 56d412daa64698ab4794033533e668882cfa6981 Mon Sep 17 00:00:00 2001 From: Juan Sugg Date: Mon, 15 Jun 2026 23:30:06 -0300 Subject: [PATCH 1/5] chore(dev): make quality gates reproducible Use frozen uv dev envs for local Make targets and default setup to the repository Python pin when .python-version exists. --- Makefile | 20 ++++++++++---------- scripts/setup_compatible_env.sh | 6 ++++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index fe80eb3..1dc5c92 100644 --- a/Makefile +++ b/Makefile @@ -31,22 +31,22 @@ setup-runtime: SER_SETUP_INCLUDE_DEV=false ./scripts/setup_compatible_env.sh fmt: - uv run --extra dev pyupgrade --py312-plus --exit-zero-even-if-changed $$(rg --files ser tests -g '*.py') - uv run ruff check --fix ser tests - uv run isort ser tests - uv run black ser tests + uv run --frozen --extra dev pyupgrade --py312-plus --exit-zero-even-if-changed $$(rg --files ser tests -g '*.py') + uv run --frozen --extra dev ruff check --fix ser tests + uv run --frozen --extra dev isort ser tests + uv run --frozen --extra dev black ser tests lint: - uv run ruff check ser tests - uv run black --check ser tests - uv run isort --check-only ser tests + uv run --frozen --extra dev ruff check ser tests + uv run --frozen --extra dev black --check ser tests + uv run --frozen --extra dev isort --check-only ser tests type: - uv run mypy ser tests - uv run pyright --pythonversion 3.12 ser tests + uv run --frozen --extra dev mypy ser tests + uv run --frozen --extra dev pyright --pythonversion 3.12 ser tests test: - uv run pytest -q + uv run --frozen --extra dev pytest -q test-cov: uv run --frozen --extra dev coverage erase diff --git a/scripts/setup_compatible_env.sh b/scripts/setup_compatible_env.sh index 7fbe8aa..e667908 100755 --- a/scripts/setup_compatible_env.sh +++ b/scripts/setup_compatible_env.sh @@ -75,6 +75,12 @@ done os_name="$(uname -s)" arch_name="$(uname -m)" default_python="3.13" +if [[ -f .python-version ]]; then + pinned_python="$(head -n 1 .python-version | tr -d '[:space:]')" + if [[ -n "$pinned_python" ]]; then + default_python="$pinned_python" + fi +fi python_version="${SER_SETUP_PYTHON:-$default_python}" include_dev="$(normalize_bool "${SER_SETUP_INCLUDE_DEV:-true}" "SER_SETUP_INCLUDE_DEV")" From 5089b4bb7cb08a5eac1206d41ca8b106354b9b32 Mon Sep 17 00:00:00 2001 From: Juan Sugg Date: Mon, 15 Jun 2026 23:30:06 -0300 Subject: [PATCH 2/5] refactor(transcription): simplify MPS tensor guards --- ser/transcript/backends/stable_whisper_mps_compat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ser/transcript/backends/stable_whisper_mps_compat.py b/ser/transcript/backends/stable_whisper_mps_compat.py index 0a1b6fd..489866e 100644 --- a/ser/transcript/backends/stable_whisper_mps_compat.py +++ b/ser/transcript/backends/stable_whisper_mps_compat.py @@ -229,8 +229,8 @@ def _resolve_mps_log_mel_target_device( if explicit_device is None or explicit_device.type != "mps": return None return explicit_device - if torch.is_tensor(audio) and cast(torch.Tensor, audio).device.type == "mps": - return cast(torch.Tensor, audio).device + if isinstance(audio, torch.Tensor) and audio.device.type == "mps": + return audio.device return None @@ -256,7 +256,7 @@ def _log_mel_cpu_safe( device=device, ), ) - cpu_audio = cast(torch.Tensor, audio).float().cpu() if torch.is_tensor(audio) else audio + cpu_audio = audio.float().cpu() if isinstance(audio, torch.Tensor) else audio cpu_log_mel = cast( torch.Tensor, original_log_mel_spectrogram( From 6fd3c953e60f0b4e6bc5a4f4e3479dd7625ada6f Mon Sep 17 00:00:00 2001 From: Juan Sugg Date: Mon, 15 Jun 2026 23:30:06 -0300 Subject: [PATCH 3/5] fix(data): preflight dataset download size Abort provider downloads before writing partial files when known expected size or HTTP Content-Length exceeds destination free space. --- ser/data/provider_downloads.py | 75 ++++++++- .../unit/data/test_provider_downloads.py | 155 +++++++++++++++++- 2 files changed, 228 insertions(+), 2 deletions(-) diff --git a/ser/data/provider_downloads.py b/ser/data/provider_downloads.py index 5db80b5..3c42d29 100644 --- a/ser/data/provider_downloads.py +++ b/ser/data/provider_downloads.py @@ -13,8 +13,9 @@ import time from collections.abc import Callable from dataclasses import dataclass +from email.message import Message from pathlib import Path -from typing import Protocol +from typing import Protocol, runtime_checkable from urllib import error, request @@ -63,6 +64,69 @@ def __call__( ) -> None: ... +@runtime_checkable +class _ResponseWithGetHeader(Protocol): + """HTTP response subset exposing header lookup.""" + + def getheader(self, name: str, default: str | None = None) -> str | None: ... + + +def _format_bytes(size_bytes: int) -> str: + """Formats byte counts for actionable diagnostics.""" + units = ("B", "KB", "MB", "GB", "TB") + value = float(size_bytes) + unit = units[0] + for current_unit in units: + unit = current_unit + if value < 1024.0 or current_unit == units[-1]: + break + value /= 1024.0 + if unit == "B": + return f"{int(value)} {unit}" + return f"{value:.2f} {unit}" + + +def _parse_content_length(value: str | None) -> int | None: + """Parses a positive HTTP content length value.""" + if value is None: + return None + normalized = value.strip() + if not normalized: + return None + try: + parsed = int(normalized) + except ValueError: + return None + if parsed <= 0: + return None + return parsed + + +def _response_content_length(response: object) -> int | None: + """Returns a response content length when the server exposes one.""" + if isinstance(response, _ResponseWithGetHeader): + return _parse_content_length(response.getheader("Content-Length")) + headers = getattr(response, "headers", None) + if isinstance(headers, Message): + return _parse_content_length(headers.get("Content-Length")) + return None + + +def _ensure_download_disk_space(*, destination_path: Path, required_bytes: int | None) -> None: + """Raises when known download size exceeds free space at the destination.""" + if required_bytes is None or required_bytes <= 0: + return + free_bytes = shutil.disk_usage(destination_path.parent).free + if free_bytes >= required_bytes: + return + raise RuntimeError( + "Dataset download aborted due to insufficient disk space. " + f"Required at least {_format_bytes(required_bytes)}, " + f"free {_format_bytes(free_bytes)} at {destination_path.parent}. " + "Use `--dataset-root` on a volume with enough space or free local storage first." + ) + + def is_retryable_http_status(status_code: int) -> bool: """Returns whether one HTTP status is safe to retry.""" return status_code == 429 or 500 <= status_code <= 599 @@ -168,6 +232,8 @@ def download_file_with_retries( elif existing_size > 0: return destination_path tmp_path = destination_path.with_suffix(destination_path.suffix + ".partial") + tmp_path.unlink(missing_ok=True) + _ensure_download_disk_space(destination_path=destination_path, required_bytes=expected_size) def _action() -> None: req = request.Request( @@ -179,6 +245,13 @@ def _action() -> None: method="GET", ) with request.urlopen(req, timeout=timeout_seconds) as response: + response_size = _response_content_length(response) + if response_size is not None: + required_bytes = max(expected_size or 0, response_size) + _ensure_download_disk_space( + destination_path=destination_path, + required_bytes=required_bytes, + ) with tmp_path.open("wb") as output_handle: while True: chunk = response.read(chunk_size) diff --git a/tests/suites/unit/data/test_provider_downloads.py b/tests/suites/unit/data/test_provider_downloads.py index 3f290bd..e0eaa1b 100644 --- a/tests/suites/unit/data/test_provider_downloads.py +++ b/tests/suites/unit/data/test_provider_downloads.py @@ -5,7 +5,7 @@ from collections.abc import Callable from email.message import Message from pathlib import Path -from typing import Any +from typing import Any, NamedTuple from urllib import error import pytest @@ -13,6 +13,14 @@ from ser.data import provider_downloads +class _DiskUsage(NamedTuple): + """Minimal disk-usage result for download preflight tests.""" + + total: int + used: int + free: int + + def test_download_file_with_retries_reuses_existing_file_without_hash( tmp_path: Path, ) -> None: @@ -99,6 +107,151 @@ def _with_retries(*, description: str, action: Callable[[], None]) -> None: assert not destination_path.with_suffix(".zip.partial").exists() +def test_download_file_with_retries_aborts_before_request_when_expected_size_exceeds_disk( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + """Known-size downloads should fail before network I/O when disk is too small.""" + destination_path = tmp_path / "archive.zip" + urlopen_called = False + retried = False + + def _urlopen(req: object, timeout: float) -> object: + nonlocal urlopen_called + del req, timeout + urlopen_called = True + raise AssertionError("network I/O should not start") + + def _with_retries(*, description: str, action: Callable[[], None]) -> None: + nonlocal retried + del description, action + retried = True + + monkeypatch.setattr(provider_downloads.request, "urlopen", _urlopen) + monkeypatch.setattr( + provider_downloads.shutil, + "disk_usage", + lambda _path: _DiskUsage(total=10, used=9, free=1), + ) + + with pytest.raises(RuntimeError, match="insufficient disk space"): + provider_downloads.download_file_with_retries( + url="https://example.invalid/archive.zip", + destination_path=destination_path, + expected_size=10, + with_retries=_with_retries, + compute_file_md5=lambda _path: "unused", + timeout_seconds=1.0, + chunk_size=1024, + ) + + assert retried is False + assert urlopen_called is False + assert not destination_path.exists() + + +def test_download_file_with_retries_aborts_on_content_length_when_disk_is_too_small( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + """Unknown-size downloads should use Content-Length for disk-space preflight.""" + destination_path = tmp_path / "archive.zip" + + class _FakeResponse: + def __enter__(self) -> _FakeResponse: + return self + + def __exit__(self, exc_type: object, exc: object, tb: object) -> None: + del exc_type, exc, tb + + def getheader(self, name: str, default: str | None = None) -> str | None: + if name.lower() == "content-length": + return "10" + return default + + def read(self, _size: int) -> bytes: + raise AssertionError("payload should not be read after disk-space failure") + + def _urlopen(req: object, timeout: float) -> _FakeResponse: + del req, timeout + return _FakeResponse() + + def _with_retries(*, description: str, action: Callable[[], None]) -> None: + del description + action() + + monkeypatch.setattr(provider_downloads.request, "urlopen", _urlopen) + monkeypatch.setattr( + provider_downloads.shutil, + "disk_usage", + lambda _path: _DiskUsage(total=10, used=9, free=1), + ) + + with pytest.raises(RuntimeError, match="insufficient disk space"): + provider_downloads.download_file_with_retries( + url="https://example.invalid/archive.zip", + destination_path=destination_path, + with_retries=_with_retries, + compute_file_md5=lambda _path: "unused", + timeout_seconds=1.0, + chunk_size=1024, + ) + + assert not destination_path.exists() + assert not destination_path.with_suffix(".zip.partial").exists() + + +def test_download_file_with_retries_uses_larger_content_length_than_expected_size( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + """Response size should protect disk even when stale metadata underestimates bytes.""" + destination_path = tmp_path / "archive.zip" + + class _FakeResponse: + def __enter__(self) -> _FakeResponse: + return self + + def __exit__(self, exc_type: object, exc: object, tb: object) -> None: + del exc_type, exc, tb + + def getheader(self, name: str, default: str | None = None) -> str | None: + if name.lower() == "content-length": + return "10" + return default + + def read(self, _size: int) -> bytes: + raise AssertionError("payload should not be read after disk-space failure") + + def _urlopen(req: object, timeout: float) -> _FakeResponse: + del req, timeout + return _FakeResponse() + + def _with_retries(*, description: str, action: Callable[[], None]) -> None: + del description + action() + + monkeypatch.setattr(provider_downloads.request, "urlopen", _urlopen) + monkeypatch.setattr( + provider_downloads.shutil, + "disk_usage", + lambda _path: _DiskUsage(total=10, used=5, free=5), + ) + + with pytest.raises(RuntimeError, match="insufficient disk space"): + provider_downloads.download_file_with_retries( + url="https://example.invalid/archive.zip", + destination_path=destination_path, + expected_size=2, + with_retries=_with_retries, + compute_file_md5=lambda _path: "unused", + timeout_seconds=1.0, + chunk_size=1024, + ) + + assert not destination_path.exists() + + def test_read_github_latest_release_assets_parses_expected_payload() -> None: """GitHub helper should read latest release tag and downloadable assets.""" From 59ba563b95733cbcf4eff50a92fb26f5fe3ccb25 Mon Sep 17 00:00:00 2001 From: Juan Sugg Date: Mon, 15 Jun 2026 23:30:07 -0300 Subject: [PATCH 4/5] fix(ci): harden validation smoke scripts Use platform-aware full-gate artifact directories and force fresh same-version wheel installs during package smoke tests. --- scripts/run_full_dataset_quality_gate.sh | 14 +++++++++- scripts/workflows/smoke_test_wheel_install.sh | 3 ++- ...st_run_full_dataset_quality_gate_script.py | 26 +++++++++++++++++++ .../test_smoke_test_wheel_install_script.py | 22 ++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py create mode 100644 tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py diff --git a/scripts/run_full_dataset_quality_gate.sh b/scripts/run_full_dataset_quality_gate.sh index e414938..9eed3a2 100755 --- a/scripts/run_full_dataset_quality_gate.sh +++ b/scripts/run_full_dataset_quality_gate.sh @@ -12,7 +12,19 @@ medium_model_file_name="${SER_FULL_GATE_MEDIUM_MODEL_FILE_NAME:-ser_model_medium medium_training_report_file_name="${SER_FULL_GATE_MEDIUM_TRAINING_REPORT_FILE_NAME:-training_report_medium_full.json}" report_path="${SER_FULL_GATE_REPORT_PATH:-profile_quality_gate_report_full.json}" progress_every="${SER_FULL_GATE_PROGRESS_EVERY:-120}" -models_dir="${SER_MODELS_DIR:-$HOME/Library/Application Support/ser/models}" + +default_models_dir() { + case "$(uname -s)" in + Darwin) + printf '%s\n' "$HOME/Library/Application Support/ser/models" + ;; + *) + printf '%s\n' "${XDG_DATA_HOME:-$HOME/.local/share}/ser/models" + ;; + esac +} + +models_dir="${SER_MODELS_DIR:-$(default_models_dir)}" if [[ "$run_training" != "true" && "$run_training" != "false" ]]; then printf 'SER_FULL_GATE_RUN_TRAINING must be true or false, got: %s\n' "$run_training" >&2 diff --git a/scripts/workflows/smoke_test_wheel_install.sh b/scripts/workflows/smoke_test_wheel_install.sh index 56d74c6..5e325fd 100755 --- a/scripts/workflows/smoke_test_wheel_install.sh +++ b/scripts/workflows/smoke_test_wheel_install.sh @@ -11,10 +11,11 @@ if [[ ${#wheels[@]} -eq 0 ]]; then exit 2 fi +rm -rf .pkg-smoke python -m venv .pkg-smoke . .pkg-smoke/bin/activate python -m pip install --upgrade pip -pip install --no-deps "${wheels[@]}" +pip install --force-reinstall --no-deps "${wheels[@]}" tmp_dir="$(mktemp -d)" cd "$tmp_dir" diff --git a/tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py b/tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py new file mode 100644 index 0000000..69d4fe8 --- /dev/null +++ b/tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py @@ -0,0 +1,26 @@ +"""Contracts for full-dataset quality gate shell script defaults.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + + +def test_full_dataset_quality_gate_script_has_valid_bash_syntax(repo_root: Path) -> None: + """Full-gate script should parse before CI invokes expensive profile work.""" + script_path = repo_root / "scripts" / "run_full_dataset_quality_gate.sh" + + subprocess.run(["bash", "-n", str(script_path)], check=True) + + +def test_full_dataset_quality_gate_default_models_dir_is_platform_aware(repo_root: Path) -> None: + """Default artifact lookup should match SER platform data-dir conventions.""" + script_path = repo_root / "scripts" / "run_full_dataset_quality_gate.sh" + script_text = script_path.read_text(encoding="utf-8") + + assert "Library/Application Support/ser/models" in script_text + assert "${XDG_DATA_HOME:-$HOME/.local/share}/ser/models" in script_text + assert ( + 'models_dir="${SER_MODELS_DIR:-$HOME/Library/Application Support/ser/models}"' + not in script_text + ) diff --git a/tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py b/tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py new file mode 100644 index 0000000..7b1ce10 --- /dev/null +++ b/tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py @@ -0,0 +1,22 @@ +"""Contracts for wheel-install smoke test script.""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + + +def test_wheel_smoke_script_has_valid_bash_syntax(repo_root: Path) -> None: + """Wheel smoke script should parse before packaging CI invokes it.""" + script_path = repo_root / "scripts" / "workflows" / "smoke_test_wheel_install.sh" + + subprocess.run(["bash", "-n", str(script_path)], check=True) + + +def test_wheel_smoke_script_forces_fresh_same_version_install(repo_root: Path) -> None: + """Wheel smoke should install the current wheel even when package version is unchanged.""" + script_path = repo_root / "scripts" / "workflows" / "smoke_test_wheel_install.sh" + script_text = script_path.read_text(encoding="utf-8") + + assert "rm -rf .pkg-smoke" in script_text + assert "pip install --force-reinstall --no-deps" in script_text From 7ec89f31e30386ccece5dabf7b693dd84c089a1b Mon Sep 17 00:00:00 2001 From: Juan Sugg Date: Mon, 15 Jun 2026 23:30:20 -0300 Subject: [PATCH 5/5] test(docs): guard local markdown links --- docs/adr/README.md | 1 - .../architecture/test_documentation_links.py | 46 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 tests/suites/integration/architecture/test_documentation_links.py diff --git a/docs/adr/README.md b/docs/adr/README.md index e5d212f..4939e5f 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -10,7 +10,6 @@ The repository currently keeps its architecture guidance in these maintained doc - [`../codebase-architecture.md`](../codebase-architecture.md): narrative codebase analysis - [`../subsystem-dependency-map.md`](../subsystem-dependency-map.md): subsystem dependency directions and soft-boundary policy - [`../refactor-hotspot-checks.md`](../refactor-hotspot-checks.md): hotspot inventory for careful refactors -- [`../architecture-refactor-roadmap.md`](../architecture-refactor-roadmap.md): staged refactor priorities ## How this directory should be used diff --git a/tests/suites/integration/architecture/test_documentation_links.py b/tests/suites/integration/architecture/test_documentation_links.py new file mode 100644 index 0000000..9bcc32e --- /dev/null +++ b/tests/suites/integration/architecture/test_documentation_links.py @@ -0,0 +1,46 @@ +"""Repository documentation link contract tests.""" + +from __future__ import annotations + +import re +from pathlib import Path + +_MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)]+)\)") +_EXTERNAL_LINK_RE = re.compile(r"^[a-z][a-z0-9+.-]*://", re.IGNORECASE) + + +def _markdown_files(repo_root: Path) -> list[Path]: + """Returns repository Markdown files covered by local link checks.""" + return [ + repo_root / "README.md", + repo_root / "CONTRIBUTING.md", + *sorted((repo_root / "docs").rglob("*.md")), + ] + + +def _local_markdown_link_targets(markdown_file: Path) -> list[str]: + """Extracts non-external Markdown link targets from one file.""" + targets: list[str] = [] + for match in _MARKDOWN_LINK_RE.finditer(markdown_file.read_text(encoding="utf-8")): + raw_target = match.group(1).split("#", 1)[0] + if ( + not raw_target + or _EXTERNAL_LINK_RE.match(raw_target) + or raw_target.startswith("mailto:") + ): + continue + targets.append(raw_target) + return targets + + +def test_local_markdown_links_resolve(repo_root: Path) -> None: + """Local documentation links should point at files present in this repository.""" + missing_links: list[str] = [] + for markdown_file in _markdown_files(repo_root): + for target in _local_markdown_link_targets(markdown_file): + resolved_target = (markdown_file.parent / target).resolve() + if not resolved_target.exists(): + relative_source = markdown_file.relative_to(repo_root) + missing_links.append(f"{relative_source}: {target}") + + assert missing_links == []