DataHackIL · shaypal5 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -27,7 +27,7 @@ docs/ tests/unit/ tests/integration/
 - **Filenames:** strictly lowercase `[a-z0-9_-]`
 - **JSON string fields validated by `ClipMetadata`:** no characters above U+00A1
 - Hebrew text goes in `.j2` templates or `.txt` transcripts only
-- `ClipMetadata` enforces the string-field rule via `@field_validator` on `clip_id`, `project`, `tts_engine`, `violence_typology`, `generator_version`
+- `ClipMetadata` enforces the string-field rule via `@field_validator` on `clip_id`, `project`, `violence_typology`, `generator_version`
 
 ## Audio format (hard constraints)
 

diff --git a/docs/spec.md b/docs/spec.md
@@ -284,7 +284,6 @@ Every clip has a companion `{clip_id}.json` file with this schema:
   "generation_date": "2026-04-10",
   "generator_version": "0.1.0",
   "is_synthetic": true,
-  "tts_engine": "azure_he_IL",
   "acoustic_scene": {
     "room_type": "apartment_kitchen",
     "device": "phone_in_pocket",
@@ -345,6 +344,7 @@ Every clip has a companion `{clip_id}.json` file with this schema:
 **Field notes**
 
 - `preprocessing_applied.normalized_dbfs` is the **measured** post-preprocess peak (pair with `generation_metadata.loudness_target_peak_dbfs` to compute drift from target — see `labels/schema.py` for the docstring that pins this split).
+- `tts_engine` was **removed in #109**. The TTS provider is now recorded per-speaker in `generation_metadata.tts_backend` (e.g. `{"AGG_M_30-45_001": "azure", "VIC_F_25-40_002": "google"}`); read backend diversity from the structured map. Pre-#109 corpus snapshots still carry the field — consumers should tolerate but ignore it.
 - `generation_metadata` is **optional**: a JSON object when the generator recorded pipeline provenance, `null` otherwise. Treat absence as "unknown", not as failure. `generator_version` alone is not a reliable presence signal.
 - `speakers[].voice_family` is **optional**: a stable family handle (e.g. `"Avri"`) when the speaker YAML overrides it, omitted otherwise. Consumers should fall back to `tts_voice_id`.
 - `weak_label.has_violence` is **derived**, not asserted: `any(e.tier1_category != "NONE" for e in events)` — see `synthbanshee/labels/generator.py`. Corollaries: empty `events` → `False`; `NEG` typology clips are `False` (every event lands `tier1_category: "NONE"` by §4.1); `violence_typology` and `has_violence` may disagree (e.g. `SV` with `False` if no violent tier1 fired). The events are the ground truth; the flag is convenience. **External docs and downstream code must mirror this rule** — re-deriving from typology or intensity alone produces disagreement on every NEG row.

diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py
@@ -1702,8 +1702,8 @@ def qa_report(
             else str(rs.backend_count)
         )
         t_run.add_row("TTS backends", be_label)
-        for engine, count in sorted(rs.clips_by_tts_engine.items()):
-            t_run.add_row(f"  {engine}", str(count))
+        for backend, count in sorted(rs.clips_by_tts_backend.items()):
+            t_run.add_row(f"  {backend}", str(count))
 
         # Overlap and emotion-downgrade ratios
         ovr_label = (
@@ -1806,7 +1806,7 @@ def qa_report(
                 ],
                 "voices_by_gender": rs.voices_by_gender,
                 "backend_count": rs.backend_count,
-                "clips_by_tts_engine": rs.clips_by_tts_engine,
+                "clips_by_tts_backend": rs.clips_by_tts_backend,
                 "overlap_ratio": rs.overlap_ratio,
                 "clips_with_i4_plus": rs.clips_with_i4_plus,
                 "emotion_downgrade_ratio": rs.emotion_downgrade_ratio,

diff --git a/synthbanshee/labels/generator.py b/synthbanshee/labels/generator.py
@@ -261,7 +261,8 @@ def generate_clip_metadata(
             generation_date=datetime.date.today().isoformat(),
             generator_version=self.generator_version,
             is_synthetic=True,
-            tts_engine="azure_he_IL",
+            # #109: tts_engine removed; provider is recorded per-speaker in
+            # generation_metadata.tts_backend.
             acoustic_scene=acoustic_scene or ClipAcousticScene(),
             speakers=speakers or [],
             weak_label=WeakLabel(

diff --git a/synthbanshee/labels/schema.py b/synthbanshee/labels/schema.py
@@ -210,7 +210,12 @@ class ClipMetadata(BaseModel):
     generation_date: str
     generator_version: str
     is_synthetic: Literal[True] = True
-    tts_engine: str = "azure_he_IL"
+    # #109: tts_engine removed — it was a per-clip flat string hardcoded to
+    # "azure_he_IL" regardless of the actual provider, which mis-labeled
+    # google-rendered clips and made qa-report's single_backend warning a
+    # false positive. The structured per-speaker source of truth is
+    # ``generation_metadata.tts_backend``; downstream tools read backend
+    # diversity from there.
     acoustic_scene: ClipAcousticScene = Field(default_factory=ClipAcousticScene)
     speakers: list[SpeakerInfo] = Field(default_factory=list)
     weak_label: WeakLabel
@@ -252,7 +257,7 @@ def valid_quality_flags(cls, v: list[str]) -> list[str]:
             raise ValueError(f"Unknown quality_flags: {bad}. Valid: {sorted(_QUALITY_FLAGS)}")
         return v
 
-    @field_validator("clip_id", "project", "tts_engine", "violence_typology", "generator_version")
+    @field_validator("clip_id", "project", "violence_typology", "generator_version")
     @classmethod
     def ascii_safe_string(cls, v: str, info) -> str:
         return _assert_ascii_safe(v, info.field_name)

diff --git a/synthbanshee/package/qa.py b/synthbanshee/package/qa.py
@@ -127,7 +127,17 @@ class RunSummary:
     # Voice and backend diversity
     voices_by_gender: dict[str, int] = field(default_factory=dict)
     backend_count: int = 0
-    clips_by_tts_engine: dict[str, int] = field(default_factory=dict)
+    # #109: histogram of (clip, distinct backend used) pairs, derived from
+    # generation_metadata.tts_backend.values() rather than the old hardcoded
+    # ClipMetadata.tts_engine field. A clip with two speakers on different
+    # providers contributes one count to each backend, so
+    # ``sum(clips_by_tts_backend.values()) >= total_clips`` — the pre-#109
+    # sum invariant no longer holds. Pre-#109 corpus clips (which lack
+    # ``generation_metadata`` entirely) are bucketed under the
+    # ``"unknown"`` backend key so they still appear in the histogram and
+    # in ``backend_count``; otherwise they would silently vanish from
+    # diversity metrics whenever an old + new corpus is mixed.
+    clips_by_tts_backend: dict[str, int] = field(default_factory=dict)
 
     # Overlap and emotion-downgrade ratios
     overlap_ratio: float = 0.0
@@ -366,8 +376,8 @@ def run_qa(
     # M10b accumulators
     _all_turns: list[TurnMetrics] = []
     _voices_by_gender: dict[str, set[str]] = defaultdict(set)
-    _tts_engines: set[str] = set()
-    _clips_by_engine: dict[str, int] = defaultdict(int)
+    _tts_backends: set[str] = set()
+    _clips_by_backend: dict[str, int] = defaultdict(int)
     _clips_with_overlap: int = 0
     _clips_with_emotion_downgrade: int = 0
     _clips_with_i4_plus: int = 0  # denominator for overlap ratio
@@ -421,10 +431,22 @@ def run_qa(
         if any("Strong labels JSONL missing" in w for w in validation.warnings):
             stats.clips_missing_strong_labels += 1
 
-        # M10b: track voice and backend diversity
+        # M10b: track voice and backend diversity (#109 — derive backend
+        # from generation_metadata.tts_backend per-speaker map, not the
+        # removed flat tts_engine field). Clips that lack
+        # generation_metadata (pre-#109 corpus snapshots) bucket under
+        # "unknown" so they remain visible in the histogram rather than
+        # silently vanishing.
         if run_summary:
-            _tts_engines.add(metadata.tts_engine)
-            _clips_by_engine[metadata.tts_engine] += 1
+            if metadata.generation_metadata is not None and (
+                metadata.generation_metadata.tts_backend
+            ):
+                clip_backends = set(metadata.generation_metadata.tts_backend.values())
+            else:
+                clip_backends = {"unknown"}
+            _tts_backends.update(clip_backends)
+            for backend in clip_backends:
+                _clips_by_backend[backend] += 1
             for spk in metadata.speakers:
                 _voices_by_gender[spk.gender].add(spk.tts_voice_id)
 
@@ -520,8 +542,8 @@ def run_qa(
         summary = RunSummary(
             role_intensity_stats=ri_stats,
             voices_by_gender={g: len(v) for g, v in _voices_by_gender.items()},
-            backend_count=len(_tts_engines),
-            clips_by_tts_engine=dict(_clips_by_engine),
+            backend_count=len(_tts_backends),
+            clips_by_tts_backend=dict(_clips_by_backend),
             overlap_ratio=(
                 _clips_with_overlap / _clips_with_i4_plus if _clips_with_i4_plus > 0 else 0.0
             ),

diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
@@ -121,7 +121,6 @@ def _write_valid_clip(tmp_path: Path, clip_id: str = "test_clip_01") -> Path:
         "generation_date": datetime.date.today().isoformat(),
         "generator_version": "0.1.0",
         "is_synthetic": True,
-        "tts_engine": "azure_he_IL",
         "acoustic_scene": {},
         "speakers": [],
         "weak_label": {
@@ -1172,6 +1171,46 @@ def test_metadata_paths_fall_back_to_absolute_when_outside_root(self, tmp_path):
         # Out-of-root → absolute fallback. Not great, but not a crash.
         assert Path(meta_json["transcript_path"]).is_absolute()
 
+    def test_pipeline_output_feeds_qa_backend_derivation(self, tmp_path):
+        """End-to-end #109 contract: a clip produced by ``_run_generate_pipeline``
+        carries ``generation_metadata.tts_backend`` whose values land in
+        ``RunSummary.clips_by_tts_backend`` when fed to ``run_qa``.
+
+        Catches the regression where the generator's ``_backends_map``
+        shape diverges from qa.py's expectation — both unit-tested
+        components could pass in isolation while the integration breaks.
+        """
+        from synthbanshee.package.qa import run_qa
+
+        turns = _make_dialogue_turns(n=1)
+        mixed = _make_mixed_scene(n_turns=1)
+        with (
+            patch("synthbanshee.script.generator.ScriptGenerator") as MockGen,
+            patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer,
+        ):
+            MockGen.return_value.generate.return_value = turns
+            MockRenderer.return_value.render_scene.return_value = mixed
+            wav, _ = _run_generate_pipeline(
+                SCENES_DIR / "test_scene_001.yaml",
+                tmp_path / "out",
+                tmp_path / "cache",
+                tmp_path / "dirty",
+                tmp_path / "scripts",
+            )
+
+        assert wav is not None
+        report = run_qa(tmp_path / "out", run_summary=True)
+        rs = report.run_summary
+        assert rs is not None, "run_summary should be populated when flag is set"
+        # The example speaker (AGG_M_30-45_001) is azure-backed in the
+        # example YAML; the generator writes that into tts_backend, and
+        # qa derives the histogram from there.
+        assert "azure" in rs.clips_by_tts_backend, f"expected 'azure' in {rs.clips_by_tts_backend}"
+        assert "unknown" not in rs.clips_by_tts_backend, (
+            "fresh clips should not bucket as 'unknown' — the generator "
+            "must write generation_metadata.tts_backend per #109"
+        )
+
 
 # ---------------------------------------------------------------------------
 # generate command — additional failure and warning branches

diff --git a/tests/unit/test_generation_metadata.py b/tests/unit/test_generation_metadata.py
@@ -25,7 +25,6 @@ def _minimal_clip_metadata(**overrides) -> dict:
         "generation_date": "2026-05-01",
         "generator_version": "0.1.0",
         "is_synthetic": True,
-        "tts_engine": "azure_he_IL",
         "weak_label": {
             "has_violence": False,
             "violence_categories": [],

diff --git a/tests/unit/test_labels.py b/tests/unit/test_labels.py
@@ -2,6 +2,8 @@
 
 from __future__ import annotations
 
+import json
+
 import numpy as np
 import pytest
 from pydantic import ValidationError
@@ -169,6 +171,36 @@ def test_ascii_safe_clip_id(self):
         with pytest.raises(ValidationError):
             _make_metadata(clip_id="\u05e9\u05dc\u05d5\u05dd")
 
+    def test_tts_engine_field_removed(self):
+        """#109: the hardcoded ``tts_engine`` field was dropped \u2014 the
+        per-speaker ``generation_metadata.tts_backend`` is the source of
+        truth. Old corpus JSON carrying the field still parses (Pydantic
+        ignores unknown fields by default); the attribute is no longer
+        exposed on the model.
+
+        This test pins the **breaking-change** half of the contract:
+        downstream code that touches ``meta.tts_engine`` must be
+        updated, and an ``AttributeError`` is what they will hit. The
+        test would silently pass even if Pydantic re-introduced the
+        field with ``extra="allow"`` semantics \u2014 we exercise the
+        attribute access explicitly to catch that drift.
+        """
+        m = _make_metadata()
+        assert not hasattr(m, "tts_engine")
+        # Touching the attribute must raise AttributeError \u2014 this is
+        # the silent break a senior reviewer asked be made explicit.
+        with pytest.raises(AttributeError):
+            _ = m.tts_engine  # type: ignore[attr-defined]
+
+        # Old clip JSON with the legacy field still parses cleanly
+        # (Pydantic v2's default extra="ignore" drops the unknown field).
+        legacy_json = json.loads(m.model_dump_json())
+        legacy_json["tts_engine"] = "azure_he_IL"
+        m2 = ClipMetadata.model_validate_json(json.dumps(legacy_json))
+        assert m2.clip_id == m.clip_id
+        with pytest.raises(AttributeError):
+            _ = m2.tts_engine  # type: ignore[attr-defined]
+
 
 # ---------------------------------------------------------------------------
 # LabelGenerator tests

diff --git a/tests/unit/test_manifest.py b/tests/unit/test_manifest.py
@@ -58,7 +58,6 @@ def _write_valid_clip(
         "generation_date": datetime.date.today().isoformat(),
         "generator_version": "0.1.0",
         "is_synthetic": True,
-        "tts_engine": "azure_he_IL",
         "acoustic_scene": {},
         "speakers": [
             {
@@ -360,7 +359,6 @@ def test_json_without_sibling_wav_is_skipped(self, tmp_path):
                     "generation_date": datetime.date.today().isoformat(),
                     "generator_version": "0.1.0",
                     "is_synthetic": True,
-                    "tts_engine": "azure_he_IL",
                     "acoustic_scene": {},
                     "speakers": [
                         {