Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,22 @@ setup-runtime:
SER_SETUP_INCLUDE_DEV=false ./scripts/setup_compatible_env.sh

fmt:
uv run --extra dev pyupgrade --py312-plus --exit-zero-even-if-changed $$(rg --files ser tests -g '*.py')
uv run ruff check --fix ser tests
uv run isort ser tests
uv run black ser tests
uv run --frozen --extra dev pyupgrade --py312-plus --exit-zero-even-if-changed $$(rg --files ser tests -g '*.py')
uv run --frozen --extra dev ruff check --fix ser tests
uv run --frozen --extra dev isort ser tests
uv run --frozen --extra dev black ser tests

lint:
uv run ruff check ser tests
uv run black --check ser tests
uv run isort --check-only ser tests
uv run --frozen --extra dev ruff check ser tests
uv run --frozen --extra dev black --check ser tests
uv run --frozen --extra dev isort --check-only ser tests

type:
uv run mypy ser tests
uv run pyright --pythonversion 3.12 ser tests
uv run --frozen --extra dev mypy ser tests
uv run --frozen --extra dev pyright --pythonversion 3.12 ser tests

test:
uv run pytest -q
uv run --frozen --extra dev pytest -q

test-cov:
uv run --frozen --extra dev coverage erase
Expand Down
1 change: 0 additions & 1 deletion docs/adr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ The repository currently keeps its architecture guidance in these maintained doc
- [`../codebase-architecture.md`](../codebase-architecture.md): narrative codebase analysis
- [`../subsystem-dependency-map.md`](../subsystem-dependency-map.md): subsystem dependency directions and soft-boundary policy
- [`../refactor-hotspot-checks.md`](../refactor-hotspot-checks.md): hotspot inventory for careful refactors
- [`../architecture-refactor-roadmap.md`](../architecture-refactor-roadmap.md): staged refactor priorities

## How this directory should be used

Expand Down
14 changes: 13 additions & 1 deletion scripts/run_full_dataset_quality_gate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,19 @@ medium_model_file_name="${SER_FULL_GATE_MEDIUM_MODEL_FILE_NAME:-ser_model_medium
medium_training_report_file_name="${SER_FULL_GATE_MEDIUM_TRAINING_REPORT_FILE_NAME:-training_report_medium_full.json}"
report_path="${SER_FULL_GATE_REPORT_PATH:-profile_quality_gate_report_full.json}"
progress_every="${SER_FULL_GATE_PROGRESS_EVERY:-120}"
models_dir="${SER_MODELS_DIR:-$HOME/Library/Application Support/ser/models}"

# Prints the platform default directory for stored models.
# macOS uses the per-user Application Support folder; every other platform
# uses the XDG data directory, falling back to ~/.local/share when
# XDG_DATA_HOME is unset or empty.
default_models_dir() {
  if [[ "$(uname -s)" == "Darwin" ]]; then
    printf '%s\n' "$HOME/Library/Application Support/ser/models"
    return
  fi
  printf '%s\n' "${XDG_DATA_HOME:-$HOME/.local/share}/ser/models"
}

models_dir="${SER_MODELS_DIR:-$(default_models_dir)}"

if [[ "$run_training" != "true" && "$run_training" != "false" ]]; then
printf 'SER_FULL_GATE_RUN_TRAINING must be true or false, got: %s\n' "$run_training" >&2
Expand Down
6 changes: 6 additions & 0 deletions scripts/setup_compatible_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ done
os_name="$(uname -s)"
arch_name="$(uname -m)"
default_python="3.13"
if [[ -f .python-version ]]; then
pinned_python="$(head -n 1 .python-version | tr -d '[:space:]')"
if [[ -n "$pinned_python" ]]; then
default_python="$pinned_python"
fi
fi

python_version="${SER_SETUP_PYTHON:-$default_python}"
include_dev="$(normalize_bool "${SER_SETUP_INCLUDE_DEV:-true}" "SER_SETUP_INCLUDE_DEV")"
Expand Down
3 changes: 2 additions & 1 deletion scripts/workflows/smoke_test_wheel_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ if [[ ${#wheels[@]} -eq 0 ]]; then
exit 2
fi

rm -rf .pkg-smoke
python -m venv .pkg-smoke
. .pkg-smoke/bin/activate
python -m pip install --upgrade pip
pip install --no-deps "${wheels[@]}"
pip install --force-reinstall --no-deps "${wheels[@]}"

tmp_dir="$(mktemp -d)"
cd "$tmp_dir"
Expand Down
75 changes: 74 additions & 1 deletion ser/data/provider_downloads.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
import time
from collections.abc import Callable
from dataclasses import dataclass
from email.message import Message
from pathlib import Path
from typing import Protocol
from typing import Protocol, runtime_checkable
from urllib import error, request


Expand Down Expand Up @@ -63,6 +64,69 @@ def __call__(
) -> None: ...


@runtime_checkable
class _ResponseWithGetHeader(Protocol):
"""HTTP response subset exposing header lookup."""

def getheader(self, name: str, default: str | None = None) -> str | None: ...


def _format_bytes(size_bytes: int) -> str:
"""Formats byte counts for actionable diagnostics."""
units = ("B", "KB", "MB", "GB", "TB")
value = float(size_bytes)
unit = units[0]
for current_unit in units:
unit = current_unit
if value < 1024.0 or current_unit == units[-1]:
break
value /= 1024.0
if unit == "B":
return f"{int(value)} {unit}"
return f"{value:.2f} {unit}"


def _parse_content_length(value: str | None) -> int | None:
"""Parses a positive HTTP content length value."""
if value is None:
return None
normalized = value.strip()
if not normalized:
return None
try:
parsed = int(normalized)
except ValueError:
return None
if parsed <= 0:
return None
return parsed


def _response_content_length(response: object) -> int | None:
    """Extracts the advertised content length from a response, if any.

    Responses offering ``getheader`` are queried through it. Otherwise a
    ``headers`` attribute is consulted, but only when it is an
    ``email.message.Message``. Any other object yields ``None``.
    """
    if not isinstance(response, _ResponseWithGetHeader):
        header_map = getattr(response, "headers", None)
        if not isinstance(header_map, Message):
            return None
        return _parse_content_length(header_map.get("Content-Length"))
    return _parse_content_length(response.getheader("Content-Length"))


def _ensure_download_disk_space(*, destination_path: Path, required_bytes: int | None) -> None:
"""Raises when known download size exceeds free space at the destination."""
if required_bytes is None or required_bytes <= 0:
return
free_bytes = shutil.disk_usage(destination_path.parent).free
if free_bytes >= required_bytes:
return
raise RuntimeError(
"Dataset download aborted due to insufficient disk space. "
f"Required at least {_format_bytes(required_bytes)}, "
f"free {_format_bytes(free_bytes)} at {destination_path.parent}. "
"Use `--dataset-root` on a volume with enough space or free local storage first."
)


def is_retryable_http_status(status_code: int) -> bool:
    """Tells whether a request that got this HTTP status may be retried.

    Status 429 and every status in the 500-599 range count as retryable.
    """
    if status_code == 429:
        return True
    return 500 <= status_code <= 599
Expand Down Expand Up @@ -168,6 +232,8 @@ def download_file_with_retries(
elif existing_size > 0:
return destination_path
tmp_path = destination_path.with_suffix(destination_path.suffix + ".partial")
tmp_path.unlink(missing_ok=True)
_ensure_download_disk_space(destination_path=destination_path, required_bytes=expected_size)

def _action() -> None:
req = request.Request(
Expand All @@ -179,6 +245,13 @@ def _action() -> None:
method="GET",
)
with request.urlopen(req, timeout=timeout_seconds) as response:
response_size = _response_content_length(response)
if response_size is not None:
required_bytes = max(expected_size or 0, response_size)
_ensure_download_disk_space(
destination_path=destination_path,
required_bytes=required_bytes,
)
with tmp_path.open("wb") as output_handle:
while True:
chunk = response.read(chunk_size)
Expand Down
6 changes: 3 additions & 3 deletions ser/transcript/backends/stable_whisper_mps_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,8 @@ def _resolve_mps_log_mel_target_device(
if explicit_device is None or explicit_device.type != "mps":
return None
return explicit_device
if torch.is_tensor(audio) and cast(torch.Tensor, audio).device.type == "mps":
return cast(torch.Tensor, audio).device
if isinstance(audio, torch.Tensor) and audio.device.type == "mps":
return audio.device
return None


Expand All @@ -256,7 +256,7 @@ def _log_mel_cpu_safe(
device=device,
),
)
cpu_audio = cast(torch.Tensor, audio).float().cpu() if torch.is_tensor(audio) else audio
cpu_audio = audio.float().cpu() if isinstance(audio, torch.Tensor) else audio
cpu_log_mel = cast(
torch.Tensor,
original_log_mel_spectrogram(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Repository documentation link contract tests."""

from __future__ import annotations

import re
from pathlib import Path

_MARKDOWN_LINK_RE = re.compile(r"\[[^\]]+\]\(([^)]+)\)")
_EXTERNAL_LINK_RE = re.compile(r"^[a-z][a-z0-9+.-]*://", re.IGNORECASE)


def _markdown_files(repo_root: Path) -> list[Path]:
"""Returns repository Markdown files covered by local link checks."""
return [
repo_root / "README.md",
repo_root / "CONTRIBUTING.md",
*sorted((repo_root / "docs").rglob("*.md")),
]


def _local_markdown_link_targets(markdown_file: Path) -> list[str]:
    """Collects the repository-local link targets found in one Markdown file.

    Each target has its ``#fragment`` removed. Targets that end up empty,
    carry a URL scheme, or start with ``mailto:`` are left out.
    """
    text = markdown_file.read_text(encoding="utf-8")
    # Drop the fragment first so pure in-page anchors collapse to "".
    path_parts = (
        found.group(1).partition("#")[0] for found in _MARKDOWN_LINK_RE.finditer(text)
    )
    return [
        part
        for part in path_parts
        if part and not _EXTERNAL_LINK_RE.match(part) and not part.startswith("mailto:")
    ]


def test_local_markdown_links_resolve(repo_root: Path) -> None:
    """Every local documentation link must resolve to an existing path."""
    missing_links: list[str] = []
    for source_file in _markdown_files(repo_root):
        base_dir = source_file.parent
        for link in _local_markdown_link_targets(source_file):
            # Targets are resolved relative to the file that contains them.
            if (base_dir / link).resolve().exists():
                continue
            missing_links.append(f"{source_file.relative_to(repo_root)}: {link}")

    assert missing_links == []
Loading