Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ docs/ tests/unit/ tests/integration/
- **Filenames:** strictly lowercase `[a-z0-9_-]`
- **JSON string fields validated by `ClipMetadata`:** no characters above U+00A1
- Hebrew text goes in `.j2` templates or `.txt` transcripts only
- `ClipMetadata` enforces the string-field rule via `@field_validator` on `clip_id`, `project`, `tts_engine`, `violence_typology`, `generator_version`
- `ClipMetadata` enforces the string-field rule via `@field_validator` on `clip_id`, `project`, `violence_typology`, `generator_version`

## Audio format (hard constraints)

Expand Down
2 changes: 1 addition & 1 deletion docs/spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,6 @@ Every clip has a companion `{clip_id}.json` file with this schema:
"generation_date": "2026-04-10",
"generator_version": "0.1.0",
"is_synthetic": true,
"tts_engine": "azure_he_IL",
"acoustic_scene": {
"room_type": "apartment_kitchen",
"device": "phone_in_pocket",
Expand Down Expand Up @@ -345,6 +344,7 @@ Every clip has a companion `{clip_id}.json` file with this schema:
**Field notes**

- `preprocessing_applied.normalized_dbfs` is the **measured** post-preprocess peak (pair with `generation_metadata.loudness_target_peak_dbfs` to compute drift from target — see `labels/schema.py` for the docstring that pins this split).
- `tts_engine` was **removed in #109**. The TTS provider is now recorded per-speaker in `generation_metadata.tts_backend` (e.g. `{"AGG_M_30-45_001": "azure", "VIC_F_25-40_002": "google"}`); read backend diversity from the structured map. Pre-#109 corpus snapshots still carry the field — consumers should tolerate its presence but ignore its value.
- `generation_metadata` is **optional**: a JSON object when the generator recorded pipeline provenance, `null` otherwise. Treat absence as "unknown", not as failure. `generator_version` alone is not a reliable presence signal.
- `speakers[].voice_family` is **optional**: a stable family handle (e.g. `"Avri"`) when the speaker YAML overrides it, omitted otherwise. Consumers should fall back to `tts_voice_id`.
- `weak_label.has_violence` is **derived**, not asserted: `any(e.tier1_category != "NONE" for e in events)` — see `synthbanshee/labels/generator.py`. Corollaries: empty `events` → `False`; `NEG` typology clips are `False` (every event lands `tier1_category: "NONE"` by §4.1); `violence_typology` and `has_violence` may disagree (e.g. `SV` with `False` if no violent tier1 fired). The events are the ground truth; the flag is convenience. **External docs and downstream code must mirror this rule** — re-deriving from typology or intensity alone produces disagreement on every NEG row.
Expand Down
6 changes: 3 additions & 3 deletions synthbanshee/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,8 +1702,8 @@ def qa_report(
else str(rs.backend_count)
)
t_run.add_row("TTS backends", be_label)
for engine, count in sorted(rs.clips_by_tts_engine.items()):
t_run.add_row(f" {engine}", str(count))
for backend, count in sorted(rs.clips_by_tts_backend.items()):
t_run.add_row(f" {backend}", str(count))

# Overlap and emotion-downgrade ratios
ovr_label = (
Expand Down Expand Up @@ -1806,7 +1806,7 @@ def qa_report(
],
"voices_by_gender": rs.voices_by_gender,
"backend_count": rs.backend_count,
"clips_by_tts_engine": rs.clips_by_tts_engine,
"clips_by_tts_backend": rs.clips_by_tts_backend,
"overlap_ratio": rs.overlap_ratio,
"clips_with_i4_plus": rs.clips_with_i4_plus,
"emotion_downgrade_ratio": rs.emotion_downgrade_ratio,
Expand Down
3 changes: 2 additions & 1 deletion synthbanshee/labels/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,8 @@ def generate_clip_metadata(
generation_date=datetime.date.today().isoformat(),
generator_version=self.generator_version,
is_synthetic=True,
tts_engine="azure_he_IL",
# #109: tts_engine removed; provider is recorded per-speaker in
# generation_metadata.tts_backend.
acoustic_scene=acoustic_scene or ClipAcousticScene(),
speakers=speakers or [],
weak_label=WeakLabel(
Expand Down
9 changes: 7 additions & 2 deletions synthbanshee/labels/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,12 @@ class ClipMetadata(BaseModel):
generation_date: str
generator_version: str
is_synthetic: Literal[True] = True
tts_engine: str = "azure_he_IL"
# #109: tts_engine removed — it was a per-clip flat string hardcoded to
# "azure_he_IL" regardless of the actual provider, which mis-labeled
# google-rendered clips and made qa-report's single_backend warning a
# false positive. The structured per-speaker source of truth is
# ``generation_metadata.tts_backend``; downstream tools read backend
# diversity from there.
acoustic_scene: ClipAcousticScene = Field(default_factory=ClipAcousticScene)
speakers: list[SpeakerInfo] = Field(default_factory=list)
weak_label: WeakLabel
Expand Down Expand Up @@ -252,7 +257,7 @@ def valid_quality_flags(cls, v: list[str]) -> list[str]:
raise ValueError(f"Unknown quality_flags: {bad}. Valid: {sorted(_QUALITY_FLAGS)}")
return v

@field_validator("clip_id", "project", "tts_engine", "violence_typology", "generator_version")
@field_validator("clip_id", "project", "violence_typology", "generator_version")
@classmethod
def ascii_safe_string(cls, v: str, info) -> str:
return _assert_ascii_safe(v, info.field_name)
Expand Down
38 changes: 30 additions & 8 deletions synthbanshee/package/qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,17 @@ class RunSummary:
# Voice and backend diversity
voices_by_gender: dict[str, int] = field(default_factory=dict)
backend_count: int = 0
clips_by_tts_engine: dict[str, int] = field(default_factory=dict)
# #109: histogram of (clip, distinct backend used) pairs, derived from
# generation_metadata.tts_backend.values() rather than the old hardcoded
# ClipMetadata.tts_engine field. A clip with two speakers on different
# providers contributes one count to each backend, so
# ``sum(clips_by_tts_backend.values()) >= total_clips`` — the pre-#109
# sum invariant no longer holds. Clips without backend provenance —
# pre-#109 corpus clips that lack ``generation_metadata`` entirely, or
# clips whose ``tts_backend`` map is empty — are bucketed under the
# ``"unknown"`` backend key so they still appear in the histogram and
# in ``backend_count``; otherwise they would silently vanish from
# diversity metrics whenever an old + new corpus is mixed.
clips_by_tts_backend: dict[str, int] = field(default_factory=dict)

# Overlap and emotion-downgrade ratios
overlap_ratio: float = 0.0
Expand Down Expand Up @@ -366,8 +376,8 @@ def run_qa(
# M10b accumulators
_all_turns: list[TurnMetrics] = []
_voices_by_gender: dict[str, set[str]] = defaultdict(set)
_tts_engines: set[str] = set()
_clips_by_engine: dict[str, int] = defaultdict(int)
_tts_backends: set[str] = set()
_clips_by_backend: dict[str, int] = defaultdict(int)
_clips_with_overlap: int = 0
_clips_with_emotion_downgrade: int = 0
_clips_with_i4_plus: int = 0 # denominator for overlap ratio
Expand Down Expand Up @@ -421,10 +431,22 @@ def run_qa(
if any("Strong labels JSONL missing" in w for w in validation.warnings):
stats.clips_missing_strong_labels += 1

# M10b: track voice and backend diversity
# M10b: track voice and backend diversity (#109 — derive backend
# from generation_metadata.tts_backend per-speaker map, not the
# removed flat tts_engine field). Clips that lack
# generation_metadata (pre-#109 corpus snapshots), or whose
# tts_backend map is empty, bucket under "unknown" so they remain
# visible in the histogram rather than silently vanishing.
if run_summary:
_tts_engines.add(metadata.tts_engine)
_clips_by_engine[metadata.tts_engine] += 1
if metadata.generation_metadata is not None and (
metadata.generation_metadata.tts_backend
):
clip_backends = set(metadata.generation_metadata.tts_backend.values())
else:
clip_backends = {"unknown"}
_tts_backends.update(clip_backends)
for backend in clip_backends:
_clips_by_backend[backend] += 1
for spk in metadata.speakers:
_voices_by_gender[spk.gender].add(spk.tts_voice_id)

Expand Down Expand Up @@ -520,8 +542,8 @@ def run_qa(
summary = RunSummary(
role_intensity_stats=ri_stats,
voices_by_gender={g: len(v) for g, v in _voices_by_gender.items()},
backend_count=len(_tts_engines),
clips_by_tts_engine=dict(_clips_by_engine),
backend_count=len(_tts_backends),
clips_by_tts_backend=dict(_clips_by_backend),
overlap_ratio=(
_clips_with_overlap / _clips_with_i4_plus if _clips_with_i4_plus > 0 else 0.0
),
Expand Down
41 changes: 40 additions & 1 deletion tests/unit/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ def _write_valid_clip(tmp_path: Path, clip_id: str = "test_clip_01") -> Path:
"generation_date": datetime.date.today().isoformat(),
"generator_version": "0.1.0",
"is_synthetic": True,
"tts_engine": "azure_he_IL",
"acoustic_scene": {},
"speakers": [],
"weak_label": {
Expand Down Expand Up @@ -1172,6 +1171,46 @@ def test_metadata_paths_fall_back_to_absolute_when_outside_root(self, tmp_path):
# Out-of-root → absolute fallback. Not great, but not a crash.
assert Path(meta_json["transcript_path"]).is_absolute()

def test_pipeline_output_feeds_qa_backend_derivation(self, tmp_path):
    """Pin the #109 generator -> QA contract end to end.

    A clip written by ``_run_generate_pipeline`` must carry
    ``generation_metadata.tts_backend``, and ``run_qa`` must fold the
    values of that map into ``RunSummary.clips_by_tts_backend``.

    Guards against the generator's backend map drifting away from the
    shape qa.py reads: each side can pass its own unit tests while the
    hand-off between them is broken.
    """
    from synthbanshee.package.qa import run_qa

    scene_yaml = SCENES_DIR / "test_scene_001.yaml"
    out_dir = tmp_path / "out"
    dialogue = _make_dialogue_turns(n=1)
    rendered = _make_mixed_scene(n_turns=1)

    # Script generation and TTS rendering are stubbed out; only the
    # metadata the pipeline writes is under test here.
    with (
        patch("synthbanshee.script.generator.ScriptGenerator") as script_gen_cls,
        patch("synthbanshee.tts.renderer.TTSRenderer") as renderer_cls,
    ):
        script_gen_cls.return_value.generate.return_value = dialogue
        renderer_cls.return_value.render_scene.return_value = rendered
        wav_path, _ = _run_generate_pipeline(
            scene_yaml,
            out_dir,
            tmp_path / "cache",
            tmp_path / "dirty",
            tmp_path / "scripts",
        )

    assert wav_path is not None

    summary = run_qa(out_dir, run_summary=True).run_summary
    assert summary is not None, "run_summary should be populated when flag is set"

    # The example speaker (AGG_M_30-45_001) is azure-backed in the
    # example YAML; the generator records that in tts_backend and qa
    # builds its histogram from that map.
    histogram = summary.clips_by_tts_backend
    assert "azure" in histogram, f"expected 'azure' in {histogram}"
    assert "unknown" not in histogram, (
        "fresh clips should not bucket as 'unknown' — the generator "
        "must write generation_metadata.tts_backend per #109"
    )


# ---------------------------------------------------------------------------
# generate command — additional failure and warning branches
Expand Down
1 change: 0 additions & 1 deletion tests/unit/test_generation_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def _minimal_clip_metadata(**overrides) -> dict:
"generation_date": "2026-05-01",
"generator_version": "0.1.0",
"is_synthetic": True,
"tts_engine": "azure_he_IL",
"weak_label": {
"has_violence": False,
"violence_categories": [],
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/test_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import annotations

import json

import numpy as np
import pytest
from pydantic import ValidationError
Expand Down Expand Up @@ -169,6 +171,36 @@ def test_ascii_safe_clip_id(self):
with pytest.raises(ValidationError):
_make_metadata(clip_id="\u05e9\u05dc\u05d5\u05dd")

def test_tts_engine_field_removed(self):
    """#109: ``ClipMetadata`` no longer exposes ``tts_engine``.

    The per-speaker ``generation_metadata.tts_backend`` map is the
    source of truth for the TTS provider. Two halves of the contract
    are pinned here:

    * Breaking change: reading ``meta.tts_engine`` raises
      ``AttributeError``, so downstream code that still touches the
      attribute fails loudly.
    * Backward compatibility: old clip JSON that still carries the key
      parses cleanly, and the key is dropped rather than surfaced.

    A parse-only check would keep passing if the model ever switched to
    ``extra="allow"`` semantics; the explicit attribute access on the
    re-parsed model is what catches that drift.
    """
    fresh = _make_metadata()

    # Breaking-change half: the attribute is gone from the model.
    assert not hasattr(fresh, "tts_engine")
    with pytest.raises(AttributeError):
        _ = fresh.tts_engine  # type: ignore[attr-defined]

    # Compatibility half: inject the legacy key into a round-tripped
    # payload and re-parse it (Pydantic v2's default extra="ignore"
    # drops the unknown field).
    payload = json.loads(fresh.model_dump_json())
    payload["tts_engine"] = "azure_he_IL"
    reparsed = ClipMetadata.model_validate_json(json.dumps(payload))

    assert reparsed.clip_id == fresh.clip_id
    with pytest.raises(AttributeError):
        _ = reparsed.tts_engine  # type: ignore[attr-defined]


# ---------------------------------------------------------------------------
# LabelGenerator tests
Expand Down
2 changes: 0 additions & 2 deletions tests/unit/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ def _write_valid_clip(
"generation_date": datetime.date.today().isoformat(),
"generator_version": "0.1.0",
"is_synthetic": True,
"tts_engine": "azure_he_IL",
"acoustic_scene": {},
"speakers": [
{
Expand Down Expand Up @@ -360,7 +359,6 @@ def test_json_without_sibling_wav_is_skipped(self, tmp_path):
"generation_date": datetime.date.today().isoformat(),
"generator_version": "0.1.0",
"is_synthetic": True,
"tts_engine": "azure_he_IL",
"acoustic_scene": {},
"speakers": [
{
Expand Down
Loading
Loading