From 56d412daa64698ab4794033533e668882cfa6981 Mon Sep 17 00:00:00 2001
From: Juan Sugg <juanpedrosugg@gmail.com>
Date: Mon, 15 Jun 2026 23:30:06 -0300
Subject: [PATCH 1/5] chore(dev): make quality gates reproducible

Use frozen uv dev envs for local Make targets and default setup to the repository Python pin when .python-version exists.
---
 Makefile                        | 20 ++++++++++----------
 scripts/setup_compatible_env.sh |  6 ++++++
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index fe80eb3..1dc5c92 100644
--- a/Makefile
+++ b/Makefile
@@ -31,22 +31,22 @@ setup-runtime:
 	SER_SETUP_INCLUDE_DEV=false ./scripts/setup_compatible_env.sh
 
 fmt:
-	uv run --extra dev pyupgrade --py312-plus --exit-zero-even-if-changed $$(rg --files ser tests -g '*.py')
-	uv run ruff check --fix ser tests
-	uv run isort ser tests
-	uv run black ser tests
+	uv run --frozen --extra dev pyupgrade --py312-plus --exit-zero-even-if-changed $$(rg --files ser tests -g '*.py')
+	uv run --frozen --extra dev ruff check --fix ser tests
+	uv run --frozen --extra dev isort ser tests
+	uv run --frozen --extra dev black ser tests
 
 lint:
-	uv run ruff check ser tests
-	uv run black --check ser tests
-	uv run isort --check-only ser tests
+	uv run --frozen --extra dev ruff check ser tests
+	uv run --frozen --extra dev black --check ser tests
+	uv run --frozen --extra dev isort --check-only ser tests
 
 type:
-	uv run mypy ser tests
-	uv run pyright --pythonversion 3.12 ser tests
+	uv run --frozen --extra dev mypy ser tests
+	uv run --frozen --extra dev pyright --pythonversion 3.12 ser tests
 
 test:
-	uv run pytest -q
+	uv run --frozen --extra dev pytest -q
 
 test-cov:
 	uv run --frozen --extra dev coverage erase
diff --git a/scripts/setup_compatible_env.sh b/scripts/setup_compatible_env.sh
index 7fbe8aa..e667908 100755
--- a/scripts/setup_compatible_env.sh
+++ b/scripts/setup_compatible_env.sh
@@ -75,6 +75,12 @@ done
 os_name="$(uname -s)"
 arch_name="$(uname -m)"
 default_python="3.13"
+if [[ -f .python-version ]]; then
+  pinned_python="$(head -n 1 .python-version | tr -d '[:space:]')"
+  if [[ -n "$pinned_python" ]]; then
+    default_python="$pinned_python"
+  fi
+fi
 
 python_version="${SER_SETUP_PYTHON:-$default_python}"
 include_dev="$(normalize_bool "${SER_SETUP_INCLUDE_DEV:-true}" "SER_SETUP_INCLUDE_DEV")"

From 5089b4bb7cb08a5eac1206d41ca8b106354b9b32 Mon Sep 17 00:00:00 2001
From: Juan Sugg <juanpedrosugg@gmail.com>
Date: Mon, 15 Jun 2026 23:30:06 -0300
Subject: [PATCH 2/5] refactor(transcription): simplify MPS tensor guards

---
 ser/transcript/backends/stable_whisper_mps_compat.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ser/transcript/backends/stable_whisper_mps_compat.py b/ser/transcript/backends/stable_whisper_mps_compat.py
index 0a1b6fd..489866e 100644
--- a/ser/transcript/backends/stable_whisper_mps_compat.py
+++ b/ser/transcript/backends/stable_whisper_mps_compat.py
@@ -229,8 +229,8 @@ def _resolve_mps_log_mel_target_device(
         if explicit_device is None or explicit_device.type != "mps":
             return None
         return explicit_device
-    if torch.is_tensor(audio) and cast(torch.Tensor, audio).device.type == "mps":
-        return cast(torch.Tensor, audio).device
+    if isinstance(audio, torch.Tensor) and audio.device.type == "mps":
+        return audio.device
     return None
 
 
@@ -256,7 +256,7 @@ def _log_mel_cpu_safe(
                     device=device,
                 ),
             )
-        cpu_audio = cast(torch.Tensor, audio).float().cpu() if torch.is_tensor(audio) else audio
+        cpu_audio = audio.float().cpu() if isinstance(audio, torch.Tensor) else audio
         cpu_log_mel = cast(
             torch.Tensor,
             original_log_mel_spectrogram(

From 6fd3c953e60f0b4e6bc5a4f4e3479dd7625ada6f Mon Sep 17 00:00:00 2001
From: Juan Sugg <juanpedrosugg@gmail.com>
Date: Mon, 15 Jun 2026 23:30:06 -0300
Subject: [PATCH 3/5] fix(data): preflight dataset download size

Abort provider downloads before writing partial files when known expected size or HTTP Content-Length exceeds destination free space.
---
 ser/data/provider_downloads.py                |  75 ++++++++-
 .../unit/data/test_provider_downloads.py      | 155 +++++++++++++++++-
 2 files changed, 228 insertions(+), 2 deletions(-)

diff --git a/ser/data/provider_downloads.py b/ser/data/provider_downloads.py
index 5db80b5..3c42d29 100644
--- a/ser/data/provider_downloads.py
+++ b/ser/data/provider_downloads.py
@@ -13,8 +13,9 @@
 import time
 from collections.abc import Callable
 from dataclasses import dataclass
+from email.message import Message
 from pathlib import Path
-from typing import Protocol
+from typing import Protocol, runtime_checkable
 from urllib import error, request
 
 
@@ -63,6 +64,69 @@ def __call__(
     ) -> None: ...
 
 
+@runtime_checkable  # allows ``isinstance`` checks against this protocol
+class _ResponseWithGetHeader(Protocol):
+    """Structural type for response objects that expose a ``getheader`` lookup method."""
+
+    def getheader(self, name: str, default: str | None = None) -> str | None: ...
+
+
+def _format_bytes(size_bytes: int) -> str:
+    """Renders a byte count as a short human-readable string using 1024-based units."""
+    suffixes = ("B", "KB", "MB", "GB", "TB")
+    amount = float(size_bytes)
+    index = 0
+    # Scale down until the value fits the current unit or the largest unit is reached.
+    while amount >= 1024.0 and index < len(suffixes) - 1:
+        amount /= 1024.0
+        index += 1
+    # Whole bytes are shown without decimals; larger units keep two decimal places.
+    if index == 0:
+        return f"{int(amount)} {suffixes[index]}"
+    return f"{amount:.2f} {suffixes[index]}"
+
+
+def _parse_content_length(value: str | None) -> int | None:
+    """Converts a raw ``Content-Length`` header value to a positive int, else ``None``."""
+    text = "" if value is None else value.strip()
+    if text == "":
+        return None
+    try:
+        length = int(text)
+    except ValueError:
+        # A malformed header is treated as "size unknown" rather than as an error.
+        return None
+    # Zero or negative lengths give the caller nothing to check against.
+    if length > 0:
+        return length
+    return None
+
+
+def _response_content_length(response: object) -> int | None:
+    """Reads ``Content-Length`` from a response via ``getheader`` or its ``headers``."""
+    raw_length: str | None = None
+    if isinstance(response, _ResponseWithGetHeader):
+        raw_length = response.getheader("Content-Length")
+    elif isinstance(headers := getattr(response, "headers", None), Message):
+        raw_length = headers.get("Content-Length")
+    return _parse_content_length(raw_length)
+
+
+def _ensure_download_disk_space(*, destination_path: Path, required_bytes: int | None) -> None:
+    """Raises ``RuntimeError`` if ``required_bytes`` exceeds free space at the destination."""
+    # Unknown or non-positive sizes cannot be checked, so they always pass.
+    if required_bytes is not None and required_bytes > 0:
+        target_dir = destination_path.parent
+        available_bytes = shutil.disk_usage(target_dir).free
+        if available_bytes < required_bytes:
+            raise RuntimeError(
+                "Dataset download aborted due to insufficient disk space. "
+                f"Required at least {_format_bytes(required_bytes)}, "
+                f"free {_format_bytes(available_bytes)} at {target_dir}. "
+                "Use `--dataset-root` on a volume with enough space or free local storage first."
+            )
+
+
 def is_retryable_http_status(status_code: int) -> bool:
     """Returns whether one HTTP status is safe to retry."""
     return status_code == 429 or 500 <= status_code <= 599
@@ -168,6 +232,8 @@ def download_file_with_retries(
         elif existing_size > 0:
             return destination_path
     tmp_path = destination_path.with_suffix(destination_path.suffix + ".partial")
+    tmp_path.unlink(missing_ok=True)
+    _ensure_download_disk_space(destination_path=destination_path, required_bytes=expected_size)
 
     def _action() -> None:
         req = request.Request(
@@ -179,6 +245,13 @@ def _action() -> None:
             method="GET",
         )
         with request.urlopen(req, timeout=timeout_seconds) as response:
+            response_size = _response_content_length(response)
+            if response_size is not None:
+                required_bytes = max(expected_size or 0, response_size)
+                _ensure_download_disk_space(
+                    destination_path=destination_path,
+                    required_bytes=required_bytes,
+                )
             with tmp_path.open("wb") as output_handle:
                 while True:
                     chunk = response.read(chunk_size)
diff --git a/tests/suites/unit/data/test_provider_downloads.py b/tests/suites/unit/data/test_provider_downloads.py
index 3f290bd..e0eaa1b 100644
--- a/tests/suites/unit/data/test_provider_downloads.py
+++ b/tests/suites/unit/data/test_provider_downloads.py
@@ -5,7 +5,7 @@
 from collections.abc import Callable
 from email.message import Message
 from pathlib import Path
-from typing import Any
+from typing import Any, NamedTuple
 from urllib import error
 
 import pytest
@@ -13,6 +13,14 @@
 from ser.data import provider_downloads
 
 
+class _DiskUsage(NamedTuple):
+    """Stand-in for the ``shutil.disk_usage`` result returned by the patched tests."""
+
+    total: int
+    used: int
+    free: int  # the value the tests vary to trigger the insufficient-space error
+
+
 def test_download_file_with_retries_reuses_existing_file_without_hash(
     tmp_path: Path,
 ) -> None:
@@ -99,6 +107,151 @@ def _with_retries(*, description: str, action: Callable[[], None]) -> None:
     assert not destination_path.with_suffix(".zip.partial").exists()
 
 
+def test_download_file_with_retries_aborts_before_request_when_expected_size_exceeds_disk(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    """A known ``expected_size`` above free space should fail before any request is made."""
+    destination_path = tmp_path / "archive.zip"
+    urlopen_called = False
+    retried = False
+
+    def _urlopen(req: object, timeout: float) -> object:
+        nonlocal urlopen_called
+        del req, timeout
+        urlopen_called = True
+        raise AssertionError("network I/O should not start")
+
+    def _with_retries(*, description: str, action: Callable[[], None]) -> None:
+        nonlocal retried
+        del description, action
+        retried = True
+
+    monkeypatch.setattr(provider_downloads.request, "urlopen", _urlopen)
+    monkeypatch.setattr(
+        provider_downloads.shutil,
+        "disk_usage",
+        lambda _path: _DiskUsage(total=10, used=9, free=1),  # only 1 byte reported free
+    )
+
+    with pytest.raises(RuntimeError, match="insufficient disk space"):
+        provider_downloads.download_file_with_retries(
+            url="https://example.invalid/archive.zip",
+            destination_path=destination_path,
+            expected_size=10,  # larger than the single free byte reported above
+            with_retries=_with_retries,
+            compute_file_md5=lambda _path: "unused",
+            timeout_seconds=1.0,
+            chunk_size=1024,
+        )
+
+    assert retried is False  # the retry wrapper must never be entered
+    assert urlopen_called is False
+    assert not destination_path.exists()
+
+
+def test_download_file_with_retries_aborts_on_content_length_when_disk_is_too_small(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    """Without ``expected_size``, the response Content-Length should drive the preflight."""
+    destination_path = tmp_path / "archive.zip"
+
+    class _FakeResponse:
+        def __enter__(self) -> _FakeResponse:
+            return self
+
+        def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
+            del exc_type, exc, tb
+
+        def getheader(self, name: str, default: str | None = None) -> str | None:
+            if name.lower() == "content-length":
+                return "10"  # advertised size exceeds the 1 free byte patched in below
+            return default
+
+        def read(self, _size: int) -> bytes:
+            raise AssertionError("payload should not be read after disk-space failure")
+
+    def _urlopen(req: object, timeout: float) -> _FakeResponse:
+        del req, timeout
+        return _FakeResponse()
+
+    def _with_retries(*, description: str, action: Callable[[], None]) -> None:
+        del description
+        action()  # run the download action exactly once, without retrying
+
+    monkeypatch.setattr(provider_downloads.request, "urlopen", _urlopen)
+    monkeypatch.setattr(
+        provider_downloads.shutil,
+        "disk_usage",
+        lambda _path: _DiskUsage(total=10, used=9, free=1),
+    )
+
+    with pytest.raises(RuntimeError, match="insufficient disk space"):
+        provider_downloads.download_file_with_retries(
+            url="https://example.invalid/archive.zip",
+            destination_path=destination_path,
+            with_retries=_with_retries,
+            compute_file_md5=lambda _path: "unused",
+            timeout_seconds=1.0,
+            chunk_size=1024,
+        )
+
+    assert not destination_path.exists()
+    assert not destination_path.with_suffix(".zip.partial").exists()  # no partial file left
+
+
+def test_download_file_with_retries_uses_larger_content_length_than_expected_size(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    """A Content-Length above a stale ``expected_size`` should still trip the preflight."""
+    destination_path = tmp_path / "archive.zip"
+
+    class _FakeResponse:
+        def __enter__(self) -> _FakeResponse:
+            return self
+
+        def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
+            del exc_type, exc, tb
+
+        def getheader(self, name: str, default: str | None = None) -> str | None:
+            if name.lower() == "content-length":
+                return "10"  # larger than both expected_size=2 and the 5 free bytes
+            return default
+
+        def read(self, _size: int) -> bytes:
+            raise AssertionError("payload should not be read after disk-space failure")
+
+    def _urlopen(req: object, timeout: float) -> _FakeResponse:
+        del req, timeout
+        return _FakeResponse()
+
+    def _with_retries(*, description: str, action: Callable[[], None]) -> None:
+        del description
+        action()  # run the download action exactly once, without retrying
+
+    monkeypatch.setattr(provider_downloads.request, "urlopen", _urlopen)
+    monkeypatch.setattr(
+        provider_downloads.shutil,
+        "disk_usage",
+        lambda _path: _DiskUsage(total=10, used=5, free=5),
+    )
+
+    with pytest.raises(RuntimeError, match="insufficient disk space"):
+        provider_downloads.download_file_with_retries(
+            url="https://example.invalid/archive.zip",
+            destination_path=destination_path,
+            expected_size=2,  # fits in the 5 free bytes, so only Content-Length can fail
+            with_retries=_with_retries,
+            compute_file_md5=lambda _path: "unused",
+            timeout_seconds=1.0,
+            chunk_size=1024,
+        )
+
+    assert not destination_path.exists()
+
+
 def test_read_github_latest_release_assets_parses_expected_payload() -> None:
     """GitHub helper should read latest release tag and downloadable assets."""
 

From 59ba563b95733cbcf4eff50a92fb26f5fe3ccb25 Mon Sep 17 00:00:00 2001
From: Juan Sugg <juanpedrosugg@gmail.com>
Date: Mon, 15 Jun 2026 23:30:07 -0300
Subject: [PATCH 4/5] fix(ci): harden validation smoke scripts

Use platform-aware full-gate artifact directories and force fresh same-version wheel installs during package smoke tests.
---
 scripts/run_full_dataset_quality_gate.sh      | 14 +++++++++-
 scripts/workflows/smoke_test_wheel_install.sh |  3 ++-
 ...st_run_full_dataset_quality_gate_script.py | 26 +++++++++++++++++++
 .../test_smoke_test_wheel_install_script.py   | 22 ++++++++++++++++
 4 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py
 create mode 100644 tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py

diff --git a/scripts/run_full_dataset_quality_gate.sh b/scripts/run_full_dataset_quality_gate.sh
index e414938..9eed3a2 100755
--- a/scripts/run_full_dataset_quality_gate.sh
+++ b/scripts/run_full_dataset_quality_gate.sh
@@ -12,7 +12,19 @@ medium_model_file_name="${SER_FULL_GATE_MEDIUM_MODEL_FILE_NAME:-ser_model_medium
 medium_training_report_file_name="${SER_FULL_GATE_MEDIUM_TRAINING_REPORT_FILE_NAME:-training_report_medium_full.json}"
 report_path="${SER_FULL_GATE_REPORT_PATH:-profile_quality_gate_report_full.json}"
 progress_every="${SER_FULL_GATE_PROGRESS_EVERY:-120}"
-models_dir="${SER_MODELS_DIR:-$HOME/Library/Application Support/ser/models}"
+
+# Prints the default SER models directory for the current platform.
+default_models_dir() {
+  if [[ "$(uname -s)" == "Darwin" ]]; then
+    # macOS keeps per-user application data under ~/Library/Application Support.
+    printf '%s\n' "$HOME/Library/Application Support/ser/models"
+  else
+    # Everywhere else uses the XDG data home, defaulting to ~/.local/share.
+    printf '%s\n' "${XDG_DATA_HOME:-$HOME/.local/share}/ser/models"
+  fi
+}
+
+models_dir="${SER_MODELS_DIR:-$(default_models_dir)}"
 
 if [[ "$run_training" != "true" && "$run_training" != "false" ]]; then
   printf 'SER_FULL_GATE_RUN_TRAINING must be true or false, got: %s\n' "$run_training" >&2
diff --git a/scripts/workflows/smoke_test_wheel_install.sh b/scripts/workflows/smoke_test_wheel_install.sh
index 56d74c6..5e325fd 100755
--- a/scripts/workflows/smoke_test_wheel_install.sh
+++ b/scripts/workflows/smoke_test_wheel_install.sh
@@ -11,10 +11,11 @@ if [[ ${#wheels[@]} -eq 0 ]]; then
   exit 2
 fi
 
+rm -rf .pkg-smoke
 python -m venv .pkg-smoke
 . .pkg-smoke/bin/activate
 python -m pip install --upgrade pip
-pip install --no-deps "${wheels[@]}"
+pip install --force-reinstall --no-deps "${wheels[@]}"
 
 tmp_dir="$(mktemp -d)"
 cd "$tmp_dir"
diff --git a/tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py b/tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py
new file mode 100644
index 0000000..69d4fe8
--- /dev/null
+++ b/tests/suites/unit/scripts/test_run_full_dataset_quality_gate_script.py
@@ -0,0 +1,26 @@
+"""Contracts for full-dataset quality gate shell script defaults."""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+
+def test_full_dataset_quality_gate_script_has_valid_bash_syntax(repo_root: Path) -> None:
+    """Full-gate script should pass a parse-only bash check."""
+    script = repo_root.joinpath("scripts", "run_full_dataset_quality_gate.sh")
+    # ``bash -n`` reads the script without executing it; ``check=True`` raises on failure.
+    subprocess.run(["bash", "-n", str(script)], check=True)
+
+
+def test_full_dataset_quality_gate_default_models_dir_is_platform_aware(repo_root: Path) -> None:
+    """Script should carry both macOS and XDG model-dir defaults, not a macOS-only one."""
+    script = repo_root.joinpath("scripts", "run_full_dataset_quality_gate.sh")
+    contents = script.read_text(encoding="utf-8")
+    macos_only_default = (
+        'models_dir="${SER_MODELS_DIR:-$HOME/Library/Application Support/ser/models}"'
+    )
+
+    assert "Library/Application Support/ser/models" in contents
+    assert "${XDG_DATA_HOME:-$HOME/.local/share}/ser/models" in contents
+    assert macos_only_default not in contents
diff --git a/tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py b/tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py
new file mode 100644
index 0000000..7b1ce10
--- /dev/null
+++ b/tests/suites/unit/scripts/test_smoke_test_wheel_install_script.py
@@ -0,0 +1,22 @@
+"""Contracts for wheel-install smoke test script."""
+
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+
+def test_wheel_smoke_script_has_valid_bash_syntax(repo_root: Path) -> None:
+    """Wheel smoke script should pass a parse-only bash check."""
+    script = repo_root.joinpath("scripts", "workflows", "smoke_test_wheel_install.sh")
+    # ``bash -n`` reads the script without executing it; ``check=True`` raises on failure.
+    subprocess.run(["bash", "-n", str(script)], check=True)
+
+
+def test_wheel_smoke_script_forces_fresh_same_version_install(repo_root: Path) -> None:
+    """Smoke script should recreate its venv and force-reinstall the built wheel."""
+    script = repo_root.joinpath("scripts", "workflows", "smoke_test_wheel_install.sh")
+    contents = script.read_text(encoding="utf-8")
+
+    for required_command in ("rm -rf .pkg-smoke", "pip install --force-reinstall --no-deps"):
+        assert required_command in contents

From 7ec89f31e30386ccece5dabf7b693dd84c089a1b Mon Sep 17 00:00:00 2001
From: Juan Sugg <juanpedrosugg@gmail.com>
Date: Mon, 15 Jun 2026 23:30:20 -0300
Subject: [PATCH 5/5] test(docs): guard local markdown links

---
 docs/adr/README.md                            |  1 -
 .../architecture/test_documentation_links.py  | 46 +++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 tests/suites/integration/architecture/test_documentation_links.py

diff --git a/docs/adr/README.md b/docs/adr/README.md
index e5d212f..4939e5f 100644
--- a/docs/adr/README.md
+++ b/docs/adr/README.md
@@ -10,7 +10,6 @@ The repository currently keeps its architecture guidance in these maintained doc
 - [`../codebase-architecture.md`](../codebase-architecture.md): narrative codebase analysis
 - [`../subsystem-dependency-map.md`](../subsystem-dependency-map.md): subsystem dependency directions and soft-boundary policy
 - [`../refactor-hotspot-checks.md`](../refactor-hotspot-checks.md): hotspot inventory for careful refactors
-- [`../architecture-refactor-roadmap.md`](../architecture-refactor-roadmap.md): staged refactor priorities
 
 ## How this directory should be used
 
diff --git a/tests/suites/integration/architecture/test_documentation_links.py b/tests/suites/integration/architecture/test_documentation_links.py
new file mode 100644
index 0000000..9bcc32e
--- /dev/null
+++ b/tests/suites/integration/architecture/test_documentation_links.py
@@ -0,0 +1,46 @@
+"""Repository documentation link contract tests."""
+
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+_MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)]+)\)")
+_EXTERNAL_LINK_RE = re.compile(r"^[a-z][a-z0-9+.-]*://", re.IGNORECASE)
+
+
+def _markdown_files(repo_root: Path) -> list[Path]:
+    """Lists the Markdown files checked: top-level README/CONTRIBUTING plus ``docs/**``."""
+    # Top-level files are listed unconditionally, without checking that they exist.
+    top_level = [repo_root / name for name in ("README.md", "CONTRIBUTING.md")]
+    # Sorted so that docs pages are visited in a deterministic order.
+    docs_pages = sorted(repo_root.joinpath("docs").rglob("*.md"))
+    return top_level + docs_pages
+
+
+def _local_markdown_link_targets(markdown_file: Path) -> list[str]:
+    """Extracts non-external Markdown link destinations from one file.
+
+    Optional link titles (``(path "Title")``) and ``#fragment`` suffixes are dropped.
+    """
+    targets: list[str] = []
+    for match in _MARKDOWN_LINK_RE.finditer(markdown_file.read_text(encoding="utf-8")):
+        tokens = match.group(1).split()
+        destination = tokens[0].split("#", 1)[0] if tokens else ""
+        is_remote = _EXTERNAL_LINK_RE.match(destination) or destination.startswith("mailto:")
+        if destination and not is_remote:
+            targets.append(destination)
+    return targets
+
+
+def test_local_markdown_links_resolve(repo_root: Path) -> None:
+    """Every local Markdown link should resolve to an existing path in this repository."""
+    broken: list[str] = []
+    for source in _markdown_files(repo_root):
+        broken.extend(
+            f"{source.relative_to(repo_root)}: {link}"
+            for link in _local_markdown_link_targets(source)
+            if not (source.parent / link).resolve().exists()
+        )
+
+    assert broken == []