diff --git a/AGENTS.md b/AGENTS.md index 832b423..9919db1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,7 +27,7 @@ docs/ tests/unit/ tests/integration/ - **Filenames:** strictly lowercase `[a-z0-9_-]` - **JSON string fields validated by `ClipMetadata`:** no characters above U+00A1 - Hebrew text goes in `.j2` templates or `.txt` transcripts only -- `ClipMetadata` enforces the string-field rule via `@field_validator` on `clip_id`, `project`, `tts_engine`, `violence_typology`, `generator_version` +- `ClipMetadata` enforces the string-field rule via `@field_validator` on `clip_id`, `project`, `violence_typology`, `generator_version` ## Audio format (hard constraints) diff --git a/docs/spec.md b/docs/spec.md index 3381eac..1e7c844 100644 --- a/docs/spec.md +++ b/docs/spec.md @@ -284,7 +284,6 @@ Every clip has a companion `{clip_id}.json` file with this schema: "generation_date": "2026-04-10", "generator_version": "0.1.0", "is_synthetic": true, - "tts_engine": "azure_he_IL", "acoustic_scene": { "room_type": "apartment_kitchen", "device": "phone_in_pocket", @@ -345,6 +344,7 @@ Every clip has a companion `{clip_id}.json` file with this schema: **Field notes** - `preprocessing_applied.normalized_dbfs` is the **measured** post-preprocess peak (pair with `generation_metadata.loudness_target_peak_dbfs` to compute drift from target — see `labels/schema.py` for the docstring that pins this split). +- `tts_engine` was **removed in #109**. The TTS provider is now recorded per-speaker in `generation_metadata.tts_backend` (e.g. `{"AGG_M_30-45_001": "azure", "VIC_F_25-40_002": "google"}`); read backend diversity from the structured map. Pre-#109 corpus snapshots still carry the field — consumers should tolerate but ignore it. - `generation_metadata` is **optional**: a JSON object when the generator recorded pipeline provenance, `null` otherwise. Treat absence as "unknown", not as failure. `generator_version` alone is not a reliable presence signal. - `speakers[].voice_family` is **optional**: a stable family handle (e.g. `"Avri"`) when the speaker YAML overrides it, omitted otherwise. Consumers should fall back to `tts_voice_id`. - `weak_label.has_violence` is **derived**, not asserted: `any(e.tier1_category != "NONE" for e in events)` — see `synthbanshee/labels/generator.py`. Corollaries: empty `events` → `False`; `NEG` typology clips are `False` (every event lands `tier1_category: "NONE"` by §4.1); `violence_typology` and `has_violence` may disagree (e.g. `SV` with `False` if no violent tier1 fired). The events are the ground truth; the flag is convenience. **External docs and downstream code must mirror this rule** — re-deriving from typology or intensity alone produces disagreement on every NEG row. diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py index 74f45cb..0c7028b 100644 --- a/synthbanshee/cli.py +++ b/synthbanshee/cli.py @@ -1702,8 +1702,8 @@ def qa_report( else str(rs.backend_count) ) t_run.add_row("TTS backends", be_label) - for engine, count in sorted(rs.clips_by_tts_engine.items()): - t_run.add_row(f" {engine}", str(count)) + for backend, count in sorted(rs.clips_by_tts_backend.items()): + t_run.add_row(f" {backend}", str(count)) # Overlap and emotion-downgrade ratios ovr_label = ( @@ -1806,7 +1806,7 @@ def qa_report( ], "voices_by_gender": rs.voices_by_gender, "backend_count": rs.backend_count, - "clips_by_tts_engine": rs.clips_by_tts_engine, + "clips_by_tts_backend": rs.clips_by_tts_backend, "overlap_ratio": rs.overlap_ratio, "clips_with_i4_plus": rs.clips_with_i4_plus, "emotion_downgrade_ratio": rs.emotion_downgrade_ratio, diff --git a/synthbanshee/labels/generator.py b/synthbanshee/labels/generator.py index f17bfc1..c62d4f3 100644 --- a/synthbanshee/labels/generator.py +++ b/synthbanshee/labels/generator.py @@ -261,7 +261,8 @@ def generate_clip_metadata( generation_date=datetime.date.today().isoformat(), generator_version=self.generator_version, is_synthetic=True, - tts_engine="azure_he_IL", + # #109: tts_engine removed; provider is recorded per-speaker in + # generation_metadata.tts_backend. acoustic_scene=acoustic_scene or ClipAcousticScene(), speakers=speakers or [], weak_label=WeakLabel( diff --git a/synthbanshee/labels/schema.py b/synthbanshee/labels/schema.py index 934b7ea..5718183 100644 --- a/synthbanshee/labels/schema.py +++ b/synthbanshee/labels/schema.py @@ -210,7 +210,12 @@ class ClipMetadata(BaseModel): generation_date: str generator_version: str is_synthetic: Literal[True] = True - tts_engine: str = "azure_he_IL" + # #109: tts_engine removed — it was a per-clip flat string hardcoded to + # "azure_he_IL" regardless of the actual provider, which mis-labeled + # google-rendered clips and made qa-report's single_backend warning a + # false positive. The structured per-speaker source of truth is + # ``generation_metadata.tts_backend``; downstream tools read backend + # diversity from there. acoustic_scene: ClipAcousticScene = Field(default_factory=ClipAcousticScene) speakers: list[SpeakerInfo] = Field(default_factory=list) weak_label: WeakLabel @@ -252,7 +257,7 @@ def valid_quality_flags(cls, v: list[str]) -> list[str]: raise ValueError(f"Unknown quality_flags: {bad}. Valid: {sorted(_QUALITY_FLAGS)}") return v - @field_validator("clip_id", "project", "tts_engine", "violence_typology", "generator_version") + @field_validator("clip_id", "project", "violence_typology", "generator_version") @classmethod def ascii_safe_string(cls, v: str, info) -> str: return _assert_ascii_safe(v, info.field_name) diff --git a/synthbanshee/package/qa.py b/synthbanshee/package/qa.py index ec6cd26..026bb67 100644 --- a/synthbanshee/package/qa.py +++ b/synthbanshee/package/qa.py @@ -127,7 +127,17 @@ class RunSummary: # Voice and backend diversity voices_by_gender: dict[str, int] = field(default_factory=dict) backend_count: int = 0 - clips_by_tts_engine: dict[str, int] = field(default_factory=dict) + # #109: histogram of (clip, distinct backend used) pairs, derived from + # generation_metadata.tts_backend.values() rather than the old hardcoded + # ClipMetadata.tts_engine field. A clip with two speakers on different + # providers contributes one count to each backend, so + # ``sum(clips_by_tts_backend.values()) >= total_clips`` — the pre-#109 + # sum invariant no longer holds. Pre-#109 corpus clips (which lack + # ``generation_metadata`` entirely) are bucketed under the + # ``"unknown"`` backend key so they still appear in the histogram and + # in ``backend_count``; otherwise they would silently vanish from + # diversity metrics whenever an old + new corpus is mixed. + clips_by_tts_backend: dict[str, int] = field(default_factory=dict) # Overlap and emotion-downgrade ratios overlap_ratio: float = 0.0 @@ -366,8 +376,8 @@ def run_qa( # M10b accumulators _all_turns: list[TurnMetrics] = [] _voices_by_gender: dict[str, set[str]] = defaultdict(set) - _tts_engines: set[str] = set() - _clips_by_engine: dict[str, int] = defaultdict(int) + _tts_backends: set[str] = set() + _clips_by_backend: dict[str, int] = defaultdict(int) _clips_with_overlap: int = 0 _clips_with_emotion_downgrade: int = 0 _clips_with_i4_plus: int = 0 # denominator for overlap ratio @@ -421,10 +431,22 @@ def run_qa( if any("Strong labels JSONL missing" in w for w in validation.warnings): stats.clips_missing_strong_labels += 1 - # M10b: track voice and backend diversity + # M10b: track voice and backend diversity (#109 — derive backend + # from generation_metadata.tts_backend per-speaker map, not the + # removed flat tts_engine field). Clips that lack + # generation_metadata (pre-#109 corpus snapshots) bucket under + # "unknown" so they remain visible in the histogram rather than + # silently vanishing. if run_summary: - _tts_engines.add(metadata.tts_engine) - _clips_by_engine[metadata.tts_engine] += 1 + if metadata.generation_metadata is not None and ( + metadata.generation_metadata.tts_backend + ): + clip_backends = set(metadata.generation_metadata.tts_backend.values()) + else: + clip_backends = {"unknown"} + _tts_backends.update(clip_backends) + for backend in clip_backends: + _clips_by_backend[backend] += 1 for spk in metadata.speakers: _voices_by_gender[spk.gender].add(spk.tts_voice_id) @@ -520,8 +542,8 @@ def run_qa( summary = RunSummary( role_intensity_stats=ri_stats, voices_by_gender={g: len(v) for g, v in _voices_by_gender.items()}, - backend_count=len(_tts_engines), - clips_by_tts_engine=dict(_clips_by_engine), + backend_count=len(_tts_backends), + clips_by_tts_backend=dict(_clips_by_backend), overlap_ratio=( _clips_with_overlap / _clips_with_i4_plus if _clips_with_i4_plus > 0 else 0.0 ), diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 0b6e908..5eeac1c 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -121,7 +121,6 @@ def _write_valid_clip(tmp_path: Path, clip_id: str = "test_clip_01") -> Path: "generation_date": datetime.date.today().isoformat(), "generator_version": "0.1.0", "is_synthetic": True, - "tts_engine": "azure_he_IL", "acoustic_scene": {}, "speakers": [], "weak_label": { @@ -1172,6 +1171,46 @@ def test_metadata_paths_fall_back_to_absolute_when_outside_root(self, tmp_path): # Out-of-root → absolute fallback. Not great, but not a crash. assert Path(meta_json["transcript_path"]).is_absolute() + def test_pipeline_output_feeds_qa_backend_derivation(self, tmp_path): + """End-to-end #109 contract: a clip produced by ``_run_generate_pipeline`` + carries ``generation_metadata.tts_backend`` whose values land in + ``RunSummary.clips_by_tts_backend`` when fed to ``run_qa``. + + Catches the regression where the generator's ``_backends_map`` + shape diverges from qa.py's expectation — both unit-tested + components could pass in isolation while the integration breaks. + """ + from synthbanshee.package.qa import run_qa + + turns = _make_dialogue_turns(n=1) + mixed = _make_mixed_scene(n_turns=1) + with ( + patch("synthbanshee.script.generator.ScriptGenerator") as MockGen, + patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer, + ): + MockGen.return_value.generate.return_value = turns + MockRenderer.return_value.render_scene.return_value = mixed + wav, _ = _run_generate_pipeline( + SCENES_DIR / "test_scene_001.yaml", + tmp_path / "out", + tmp_path / "cache", + tmp_path / "dirty", + tmp_path / "scripts", + ) + + assert wav is not None + report = run_qa(tmp_path / "out", run_summary=True) + rs = report.run_summary + assert rs is not None, "run_summary should be populated when flag is set" + # The example speaker (AGG_M_30-45_001) is azure-backed in the + # example YAML; the generator writes that into tts_backend, and + # qa derives the histogram from there. + assert "azure" in rs.clips_by_tts_backend, f"expected 'azure' in {rs.clips_by_tts_backend}" + assert "unknown" not in rs.clips_by_tts_backend, ( + "fresh clips should not bucket as 'unknown' — the generator " + "must write generation_metadata.tts_backend per #109" + ) + # --------------------------------------------------------------------------- # generate command — additional failure and warning branches diff --git a/tests/unit/test_generation_metadata.py b/tests/unit/test_generation_metadata.py index e6c772a..8def675 100644 --- a/tests/unit/test_generation_metadata.py +++ b/tests/unit/test_generation_metadata.py @@ -25,7 +25,6 @@ def _minimal_clip_metadata(**overrides) -> dict: "generation_date": "2026-05-01", "generator_version": "0.1.0", "is_synthetic": True, - "tts_engine": "azure_he_IL", "weak_label": { "has_violence": False, "violence_categories": [], diff --git a/tests/unit/test_labels.py b/tests/unit/test_labels.py index c19d702..83743c1 100644 --- a/tests/unit/test_labels.py +++ b/tests/unit/test_labels.py @@ -2,6 +2,8 @@ from __future__ import annotations +import json + import numpy as np import pytest from pydantic import ValidationError @@ -169,6 +171,36 @@ def test_ascii_safe_clip_id(self): with pytest.raises(ValidationError): _make_metadata(clip_id="\u05e9\u05dc\u05d5\u05dd") + def test_tts_engine_field_removed(self): + """#109: the hardcoded ``tts_engine`` field was dropped \u2014 the + per-speaker ``generation_metadata.tts_backend`` is the source of + truth. Old corpus JSON carrying the field still parses (Pydantic + ignores unknown fields by default); the attribute is no longer + exposed on the model. + + This test pins the **breaking-change** half of the contract: + downstream code that touches ``meta.tts_engine`` must be + updated, and an ``AttributeError`` is what they will hit. The + test would silently pass even if Pydantic re-introduced the + field with ``extra="allow"`` semantics \u2014 we exercise the + attribute access explicitly to catch that drift. + """ + m = _make_metadata() + assert not hasattr(m, "tts_engine") + # Touching the attribute must raise AttributeError \u2014 this is + # the silent break a senior reviewer asked be made explicit. + with pytest.raises(AttributeError): + _ = m.tts_engine # type: ignore[attr-defined] + + # Old clip JSON with the legacy field still parses cleanly + # (Pydantic v2's default extra="ignore" drops the unknown field). + legacy_json = json.loads(m.model_dump_json()) + legacy_json["tts_engine"] = "azure_he_IL" + m2 = ClipMetadata.model_validate_json(json.dumps(legacy_json)) + assert m2.clip_id == m.clip_id + with pytest.raises(AttributeError): + _ = m2.tts_engine # type: ignore[attr-defined] + # --------------------------------------------------------------------------- # LabelGenerator tests diff --git a/tests/unit/test_manifest.py b/tests/unit/test_manifest.py index 20ecf00..c8c3f01 100644 --- a/tests/unit/test_manifest.py +++ b/tests/unit/test_manifest.py @@ -58,7 +58,6 @@ def _write_valid_clip( "generation_date": datetime.date.today().isoformat(), "generator_version": "0.1.0", "is_synthetic": True, - "tts_engine": "azure_he_IL", "acoustic_scene": {}, "speakers": [ { @@ -360,7 +359,6 @@ def test_json_without_sibling_wav_is_skipped(self, tmp_path): "generation_date": datetime.date.today().isoformat(), "generator_version": "0.1.0", "is_synthetic": True, - "tts_engine": "azure_he_IL", "acoustic_scene": {}, "speakers": [ { diff --git a/tests/unit/test_qa.py b/tests/unit/test_qa.py index 817d1c4..0b3a6f1 100644 --- a/tests/unit/test_qa.py +++ b/tests/unit/test_qa.py @@ -89,12 +89,22 @@ def _write_valid_clip( duration: float = 4.0, speaker_id: str = "AGG_M_30-45_001", quality_flags: list[str] | None = None, - tts_engine: str = "azure_he_IL", + tts_backend: dict[str, str] | None = None, tts_voice_id: str = "he-IL-AvriNeural", gender: str = "male", prosody_cap_events: list[dict] | None = None, + omit_generation_metadata: bool = False, ) -> Path: - """Write a minimal valid WAV + TXT + JSON triplet and return the WAV path.""" + """Write a minimal valid WAV + TXT + JSON triplet and return the WAV path. + + *tts_backend* populates ``generation_metadata.tts_backend`` per-speaker + (#109 replaced the old flat ``tts_engine`` field). Defaults to a + single-speaker azure map keyed on *speaker_id*. + + *omit_generation_metadata* writes the JSON with no ``generation_metadata`` + block at all — models the pre-#109 corpus snapshot shape, used by + tests that exercise the "unknown" backend bucket. + """ parent.mkdir(parents=True, exist_ok=True) wav_path = parent / f"{clip_id}.wav" txt_path = parent / f"{clip_id}.txt" @@ -124,7 +134,6 @@ def _write_valid_clip( "generation_date": datetime.date.today().isoformat(), "generator_version": "0.1.0", "is_synthetic": True, - "tts_engine": tts_engine, "acoustic_scene": {}, "speakers": [ { @@ -146,11 +155,17 @@ def _write_valid_clip( "annotator_confidence": 1.0, "iaa_reviewed": False, } - if prosody_cap_events is not None: - metadata["generation_metadata"] = { + # #109: every clip carries generation_metadata so qa-report's + # backend-diversity derivation has a per-speaker source of truth. + # ``omit_generation_metadata=True`` models a pre-#109 corpus snapshot. + if not omit_generation_metadata: + gen_meta: dict[str, object] = { "pipeline_version": "0.1.0", - "effective_prosody_caps": prosody_cap_events, + "tts_backend": tts_backend if tts_backend is not None else {speaker_id: "azure"}, } + if prosody_cap_events is not None: + gen_meta["effective_prosody_caps"] = prosody_cap_events + metadata["generation_metadata"] = gen_meta json_path.write_text(json.dumps(metadata), encoding="utf-8") return wav_path @@ -1012,9 +1027,75 @@ def test_run_summary_backend_diversity(self, tmp_path): report = run_qa(tmp_path, run_summary=True) rs = report.run_summary assert rs is not None - # Both clips use azure_he_IL by default + # Both clips default to azure backend (single-speaker) + assert rs.backend_count == 1 + assert rs.clips_by_tts_backend.get("azure") == 2 + + def test_run_summary_backend_diversity_mixed_azure_google(self, tmp_path): + """Two clips on different backends → ``backend_count`` is 2 and + ``single_backend`` is NOT in run_warnings (#109).""" + _write_valid_clip( + tmp_path / "spk_a", "clip_001_00", tts_backend={"AGG_M_30-45_001": "azure"} + ) + _write_valid_clip( + tmp_path / "spk_b", "clip_002_00", tts_backend={"AGG_M_30-45_001": "google"} + ) + report = run_qa(tmp_path, run_summary=True) + rs = report.run_summary + assert rs is not None + assert rs.backend_count == 2 + assert rs.clips_by_tts_backend.get("azure") == 1 + assert rs.clips_by_tts_backend.get("google") == 1 + assert "single_backend" not in rs.run_warnings + + def test_run_summary_backend_diversity_all_azure_fires_warning(self, tmp_path): + """Two clips both on azure → ``single_backend`` IS in run_warnings (#109).""" + _write_valid_clip( + tmp_path / "spk_a", "clip_001_00", tts_backend={"AGG_M_30-45_001": "azure"} + ) + _write_valid_clip( + tmp_path / "spk_b", "clip_002_00", tts_backend={"AGG_M_30-45_001": "azure"} + ) + report = run_qa(tmp_path, run_summary=True) + rs = report.run_summary + assert rs is not None + assert rs.backend_count == 1 + assert "single_backend" in rs.run_warnings + + def test_run_summary_clips_without_generation_metadata_bucket_as_unknown(self, tmp_path): + """Pre-#109 clips (no ``generation_metadata``) bucket under "unknown". + + Without this bucketing, an old corpus mixed with new clips would + silently lose those old clips from the histogram and from + ``backend_count`` — making backend diversity misleadingly low or + flipping ``single_backend`` the wrong way. The "unknown" key + keeps them visible. + """ + _write_valid_clip(tmp_path / "spk_a", "clip_legacy_00", omit_generation_metadata=True) + _write_valid_clip( + tmp_path / "spk_b", "clip_new_00", tts_backend={"AGG_M_30-45_001": "azure"} + ) + report = run_qa(tmp_path, run_summary=True) + rs = report.run_summary + assert rs is not None + # Two distinct backends total: "azure" (new) + "unknown" (legacy). + assert rs.backend_count == 2 + assert rs.clips_by_tts_backend.get("azure") == 1 + assert rs.clips_by_tts_backend.get("unknown") == 1 + # And consequently: no single_backend false-fire. + assert "single_backend" not in rs.run_warnings + + def test_run_summary_all_legacy_clips_fire_single_backend(self, tmp_path): + """A corpus where every clip lacks ``generation_metadata`` → all + bucket under "unknown" → still flagged as ``single_backend``.""" + _write_valid_clip(tmp_path / "spk_a", "clip_legacy_00", omit_generation_metadata=True) + _write_valid_clip(tmp_path / "spk_b", "clip_legacy_01", omit_generation_metadata=True) + report = run_qa(tmp_path, run_summary=True) + rs = report.run_summary + assert rs is not None assert rs.backend_count == 1 - assert rs.clips_by_tts_engine.get("azure_he_IL") == 2 + assert rs.clips_by_tts_backend.get("unknown") == 2 + assert "single_backend" in rs.run_warnings def test_run_summary_voice_diversity(self, tmp_path): _write_valid_clip(tmp_path / "spk_a", "clip_001_00", speaker_id="SPK_A")