From bee7e149a0281a4a26462f07e4acb1679b58689b Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 09:29:18 +0300 Subject: [PATCH 1/3] =?UTF-8?q?fix(cli,manifest):=20#108=20=E2=80=94=20wri?= =?UTF-8?q?te=20repo-relative=20paths=20in=20clip=20JSON=20and=20manifest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ClipMetadata.dirty_file_path / transcript_path and ManifestRow.wav_path / strong_labels_path used to be written as absolute, machine-specific paths (e.g. /Users//clones/avdp-synth-corpus/...). delivery-002 committed relative paths, and delivery-003 (avdp-synth-corpus#4) had to post-process them on the way in. spec.md §5.1 already specifies repo-relative form (updated in #103); this brings the generator in line. Changes: - synthbanshee/cli.py: new _relative_to_data_root + _infer_data_root helpers; new --data-root option on `generate` and `generate-batch` (envvar SYNTHBANSHEE_DATA_ROOT, default = two parents above output-dir, matching the corpus convention /data/he/). Threaded through _run_generate_pipeline and _render_one. dirty_file_path / transcript_path go through the helper before metadata is built. - synthbanshee/package/manifest.py: new relative_to kwarg on generate_manifest; applied via _maybe_relative to wav_path and strong_labels_path. CLI passes the inferred data root. - synthbanshee/labels/schema.py: attribute docstrings on dirty_file_path / transcript_path documenting the relative-to-data-root contract. - tests/unit/test_cli.py: two new tests under TestRunGeneratePipeline — metadata paths are repo-relative under the corpus layout, and fall back to absolute when paths land outside the configured root. - tests/unit/test_manifest.py: two new tests on generate_manifest — relative_to rewrites wav_path / strong_labels_path columns; relative_to=None preserves the legacy absolute form (back-compat). Fixes #108. Co-Authored-By: Claude Opus 4.7 --- synthbanshee/cli.py | 85 +++++++++++++++++++++++++++-- synthbanshee/labels/schema.py | 12 +++++ synthbanshee/package/manifest.py | 27 +++++++++- tests/unit/test_cli.py | 91 ++++++++++++++++++++++++++++++++ tests/unit/test_manifest.py | 31 +++++++++++ 5 files changed, 241 insertions(+), 5 deletions(-) diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py index 0af074f..f353ebd 100644 --- a/synthbanshee/cli.py +++ b/synthbanshee/cli.py @@ -212,6 +212,40 @@ def _build_preprocessing_metadata(result: PreprocessingResult) -> PreprocessingA ) +def _relative_to_data_root(path: Path, data_root: Path | None) -> str: + """Render *path* as a string relative to *data_root* when possible. + + #108 — clip JSON / manifest CSV must record repo-relative paths so the + corpus is portable. Returns ``str(path.resolve().relative_to(data_root))`` + when *path* is genuinely under *data_root* (after resolving symlinks on + both sides), falling back to ``str(path)`` otherwise. *data_root* of + ``None`` short-circuits to the legacy absolute form. + """ + if data_root is None: + return str(path) + try: + return str(Path(path).resolve().relative_to(Path(data_root).resolve())) + except ValueError: + return str(path) + + +def _infer_data_root(output_dir: Path, override: Path | None) -> Path | None: + """Pick a data root for path-rewriting in clip metadata. + + Returns *override* when supplied. Otherwise infers from *output_dir*: + the corpus convention is ``/data/he/``, so two parents up is + the right anchor. Returns ``None`` only when no sensible inference is + possible (e.g. *output_dir* has fewer than two parents), in which case + callers leave paths absolute. + """ + if override is not None: + return Path(override).resolve() + resolved = Path(output_dir).resolve() + if len(resolved.parents) < 2: + return None + return resolved.parent.parent + + def _run_generate_pipeline( config: Path, output_dir: Path, @@ -222,6 +256,7 @@ def _run_generate_pipeline( speaker_overrides: dict[str, str] | None = None, project_profile: ProjectProfile | None = None, enable_breathiness: bool = False, + data_root: Path | None = None, ) -> tuple[Path | None, list[str]]: """Run the full single-clip generate pipeline. @@ -698,6 +733,10 @@ def vlog(msg: str) -> None: effective_prosody_caps=_cap_events, ) + # #108: clip metadata records repo-relative paths so the corpus is + # portable. Anchor at *data_root*; fall back to the legacy absolute form + # if a path is genuinely outside the root. + _resolved_data_root = _infer_data_root(output_dir, data_root) metadata = label_gen.generate_clip_metadata( clip_id=f"{clip_id}_00", project=scene.project, @@ -709,8 +748,12 @@ def vlog(msg: str) -> None: scene_config_path=str(config), random_seed=scene.random_seed, preprocessing=preprocessing_meta, - dirty_file_path=str(result.dirty_path) if result.dirty_path else None, - transcript_path=str(clip_txt), + dirty_file_path=( + _relative_to_data_root(result.dirty_path, _resolved_data_root) + if result.dirty_path + else None + ), + transcript_path=_relative_to_data_root(clip_txt, _resolved_data_root), acoustic_scene=acoustic_scene_meta, quality_flags=quality_flags, generation_metadata=gen_meta, @@ -788,6 +831,17 @@ def cli() -> None: envvar="SYNTHBANSHEE_SCRIPT_CACHE_DIR", help="LLM script generation cache directory.", ) +@click.option( + "--data-root", + type=click.Path(path_type=Path), + default=None, + envvar="SYNTHBANSHEE_DATA_ROOT", + help=( + "Root directory that paths in clip JSON metadata are written " + "relative to (corpus repo root by convention). Defaults to " + "two parents above --output-dir." + ), +) @click.option( "--dry-run", is_flag=True, @@ -817,6 +871,7 @@ def generate( cache_dir: Path, dirty_dir: Path, script_cache_dir: Path, + data_root: Path | None, dry_run: bool, project_profile: str | None, verbose: bool, @@ -858,6 +913,7 @@ def generate( script_cache_dir, verbose=verbose, project_profile=profile, + data_root=data_root, ) if wav_path is None: @@ -1040,6 +1096,7 @@ def _render_one( speaker_overrides: dict[str, str] | None = None, project_profile: ProjectProfile | None = None, enable_breathiness: bool = False, + data_root: Path | None = None, ) -> tuple[Path | None, list[str]]: """Render a single clip with retries. @@ -1067,6 +1124,7 @@ def _render_one( speaker_overrides=speaker_overrides, project_profile=project_profile, enable_breathiness=enable_breathiness, + data_root=data_root, ) if wav_path is not None: return wav_path, messages @@ -1112,6 +1170,16 @@ def _render_one( envvar="SYNTHBANSHEE_SCRIPT_CACHE_DIR", help="LLM script generation cache directory.", ) +@click.option( + "--data-root", + type=click.Path(path_type=Path), + default=None, + envvar="SYNTHBANSHEE_DATA_ROOT", + help=( + "Root directory that paths in clip JSON metadata and manifest CSV " + "are written relative to. Defaults to two parents above --output-dir." + ), +) @click.option( "--manifest-out", "-m", @@ -1159,6 +1227,7 @@ def generate_batch( cache_dir: Path, dirty_dir: Path, script_cache_dir: Path, + data_root: Path | None, manifest_out: Path | None, dry_run: bool, workers: int, @@ -1283,6 +1352,7 @@ def generate_batch( speaker_overrides=speaker_override_map.get(scene_yaml), project_profile=profile, enable_breathiness=run_cfg.enable_breathiness, + data_root=data_root, ) progress.advance(task_id) if wav_path is None: @@ -1317,6 +1387,7 @@ def generate_batch( speaker_override_map.get(scene_yaml), profile, run_cfg.enable_breathiness, + data_root, ): scene_yaml for scene_yaml in selected_paths } @@ -1366,7 +1437,15 @@ def generate_batch( ) # --- Manifest --- - rows = generate_manifest(out_dir, manifest_path, splits=splits, clip_ids=set(splits.keys())) + # #108: anchor manifest paths at the same data root used for clip JSON. + _manifest_data_root = _infer_data_root(out_dir, data_root) + rows = generate_manifest( + out_dir, + manifest_path, + splits=splits, + clip_ids=set(splits.keys()), + relative_to=_manifest_data_root, + ) console.print(f"[bold green]Manifest written:[/bold green] {manifest_path} ({len(rows)} rows)") # --- Summary --- diff --git a/synthbanshee/labels/schema.py b/synthbanshee/labels/schema.py index cd0aca4..934b7ea 100644 --- a/synthbanshee/labels/schema.py +++ b/synthbanshee/labels/schema.py @@ -216,7 +216,19 @@ class ClipMetadata(BaseModel): weak_label: WeakLabel preprocessing_applied: PreprocessingApplied = Field(default_factory=PreprocessingApplied) dirty_file_path: str | None = None + """Path to the retained pre-preprocessing WAV (#108). + + Written relative to the data root passed to ``synthbanshee generate`` + via ``--data-root`` / ``SYNTHBANSHEE_DATA_ROOT`` (default: two parents + above ``--output-dir``). Older corpus snapshots may still carry + absolute paths; consumers should treat both forms as accepted but + only emit relative paths going forward.""" + transcript_path: str | None = None + """Path to the ``.txt`` transcript file (#108). + + Written relative to the data root — see ``dirty_file_path`` for the + anchor convention.""" quality_flags: list[str] = Field(default_factory=list) annotator_confidence: float = Field(ge=0.0, le=1.0, default=1.0) iaa_reviewed: bool = False diff --git a/synthbanshee/package/manifest.py b/synthbanshee/package/manifest.py index 9cf37ac..c59d68d 100644 --- a/synthbanshee/package/manifest.py +++ b/synthbanshee/package/manifest.py @@ -51,12 +51,29 @@ class ManifestRow: strong_labels_path: str # path to .jsonl, empty string if absent +def _maybe_relative(path: Path, root: Path | None) -> str: + """Render *path* as repo-relative when *root* is provided and contains it. + + Mirrors ``synthbanshee.cli._relative_to_data_root`` so the manifest CSV + and per-clip JSON agree on path shape. Falls back to the absolute form + when *path* is genuinely outside *root* (after resolving symlinks on + both sides) or when *root* is ``None``. + """ + if root is None: + return str(path) + try: + return str(path.resolve().relative_to(root.resolve())) + except ValueError: + return str(path) + + def generate_manifest( data_dir: Path, output_path: Path, *, splits: dict[str, str] | None = None, clip_ids: set[str] | None = None, + relative_to: Path | None = None, ) -> list[ManifestRow]: """Scan data_dir recursively for clip JSON files and write a manifest CSV. @@ -73,6 +90,10 @@ def generate_manifest( whose clip_id is in this set are included in the manifest. Use this to restrict the manifest to a specific generation run when ``data_dir`` may contain clips from previous runs. + relative_to: Optional data root for path columns (#108). When + provided, ``wav_path`` and ``strong_labels_path`` are written + relative to this directory; paths outside the root fall back to + absolute form. Returns: List of ManifestRow objects that were written to output_path. @@ -112,8 +133,10 @@ def generate_manifest( max_intensity=metadata.weak_label.max_intensity, quality_flags=",".join(metadata.quality_flags), split=(splits or {}).get(metadata.clip_id, ""), - wav_path=str(json_path.with_suffix(".wav")), - strong_labels_path=str(jsonl_path) if jsonl_path.exists() else "", + wav_path=_maybe_relative(json_path.with_suffix(".wav"), relative_to), + strong_labels_path=( + _maybe_relative(jsonl_path, relative_to) if jsonl_path.exists() else "" + ), ) ) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 9af7acd..ba68cc3 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -838,6 +838,96 @@ def test_completely_unknown_emotional_state_fails_cleanly(self, tmp_path): "clip_json should not exist on pre-validation failure" ) + def test_metadata_paths_are_repo_relative(self, tmp_path): + """`dirty_file_path` and `transcript_path` in clip JSON are repo-relative (#108). + + Models the corpus layout: ``/data/he//`` + and ``/assets/speech/dirty/_dirty.wav``. With + ``data_root`` resolved to ````, both metadata path + fields must come out relative — not absolute, machine-specific + paths as in pre-#108 corpus snapshots. + """ + data_root = tmp_path / "corpus" + out_dir = data_root / "data" / "he" + cache_dir = data_root / "assets" / "speech" + dirty_dir = cache_dir / "dirty" + script_cache_dir = data_root / "assets" / "scripts" + for d in (out_dir, cache_dir, dirty_dir, script_cache_dir): + d.mkdir(parents=True, exist_ok=True) + + turns = _make_dialogue_turns(n=1) + mixed = _make_mixed_scene(n_turns=1) + + with ( + patch("synthbanshee.script.generator.ScriptGenerator") as MockGen, + patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer, + ): + MockGen.return_value.generate.return_value = turns + MockRenderer.return_value.render_scene.return_value = mixed + wav, _messages = _run_generate_pipeline( + SCENES_DIR / "test_scene_001.yaml", + out_dir, + cache_dir, + dirty_dir, + script_cache_dir, + data_root=data_root, + ) + + assert wav is not None + meta_json = json.loads(wav.with_suffix(".json").read_text(encoding="utf-8")) + assert meta_json["transcript_path"].startswith("data/he/"), ( + f"transcript_path should be repo-relative, got: {meta_json['transcript_path']}" + ) + assert not Path(meta_json["transcript_path"]).is_absolute() + if meta_json.get("dirty_file_path"): + assert meta_json["dirty_file_path"].startswith("assets/speech/dirty/"), ( + f"dirty_file_path should be repo-relative, got: {meta_json['dirty_file_path']}" + ) + assert not Path(meta_json["dirty_file_path"]).is_absolute() + + def test_metadata_paths_fall_back_to_absolute_when_outside_root(self, tmp_path): + """Paths outside *data_root* keep their absolute form (graceful fallback). + + Documents the contract: ``_relative_to_data_root`` does not raise + when a path is outside the root — it returns ``str(path)`` so the + clip is still written successfully. Future runs with a correct + ``data_root`` produce relative paths; legacy data is not retro- + actively rewritten. + """ + out_dir = tmp_path / "out" + cache_dir = tmp_path / "cache" + dirty_dir = tmp_path / "dirty" + script_cache_dir = tmp_path / "scripts" + for d in (out_dir, cache_dir, dirty_dir, script_cache_dir): + d.mkdir(parents=True, exist_ok=True) + # data_root deliberately points elsewhere — neither out_dir nor + # dirty_dir are under it. + bogus_root = tmp_path / "elsewhere" + bogus_root.mkdir() + + turns = _make_dialogue_turns(n=1) + mixed = _make_mixed_scene(n_turns=1) + + with ( + patch("synthbanshee.script.generator.ScriptGenerator") as MockGen, + patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer, + ): + MockGen.return_value.generate.return_value = turns + MockRenderer.return_value.render_scene.return_value = mixed + wav, _ = _run_generate_pipeline( + SCENES_DIR / "test_scene_001.yaml", + out_dir, + cache_dir, + dirty_dir, + script_cache_dir, + data_root=bogus_root, + ) + + assert wav is not None + meta_json = json.loads(wav.with_suffix(".json").read_text(encoding="utf-8")) + # Out-of-root → absolute fallback. Not great, but not a crash. + assert Path(meta_json["transcript_path"]).is_absolute() + # --------------------------------------------------------------------------- # generate command — additional failure and warning branches @@ -1185,6 +1275,7 @@ def _fail_then_succeed( speaker_overrides=None, project_profile=None, enable_breathiness=False, + data_root=None, ): call_count[0] += 1 if call_count[0] == 1: diff --git a/tests/unit/test_manifest.py b/tests/unit/test_manifest.py index 6f306d4..20ecf00 100644 --- a/tests/unit/test_manifest.py +++ b/tests/unit/test_manifest.py @@ -175,6 +175,37 @@ def test_csv_has_all_expected_columns(self, tmp_path): reader = csv.DictReader(fh) assert set(reader.fieldnames or []) == set(_MANIFEST_COLUMNS) + def test_relative_to_rewrites_path_columns(self, tmp_path): + """When ``relative_to`` is set, ``wav_path`` / ``strong_labels_path`` + are written relative to that root (#108).""" + speaker_dir = tmp_path / "data" / "he" / "agg_m_30-45_001" + wav_path = _write_valid_clip(speaker_dir, "clip_rel_00") + # Also drop a .jsonl so the strong_labels_path column is populated. + wav_path.with_suffix(".jsonl").write_text("{}\n", encoding="utf-8") + out = tmp_path / "manifest.csv" + + rows = generate_manifest(tmp_path, out, relative_to=tmp_path) + + assert len(rows) == 1 + assert rows[0].wav_path == "data/he/agg_m_30-45_001/clip_rel_00.wav" + assert rows[0].strong_labels_path == "data/he/agg_m_30-45_001/clip_rel_00.jsonl" + assert not Path(rows[0].wav_path).is_absolute() + + def test_relative_to_none_keeps_absolute_paths(self, tmp_path): + """Default ``relative_to=None`` preserves pre-#108 absolute-path behaviour. + + Documents the back-compat contract — callers that don't opt into + the new ``relative_to`` argument see no behaviour change. + """ + speaker_dir = tmp_path / "spk_abs" + _write_valid_clip(speaker_dir, "clip_abs_00") + out = tmp_path / "manifest.csv" + + rows = generate_manifest(tmp_path, out) + + assert len(rows) == 1 + assert Path(rows[0].wav_path).is_absolute() + def test_dirty_files_excluded(self, tmp_path): """JSON files whose stems contain '_dirty' must not appear in the manifest.""" clip_dir = tmp_path / "spk_000" From 94ca9a146841446209244e9a02364f26b28910a4 Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 15:00:37 +0300 Subject: [PATCH 2/3] =?UTF-8?q?fix(cli,manifest):=20#108=20=E2=80=94=20lou?= =?UTF-8?q?d-warn=20on=20absolute=20fallback=20+=20dedupe=20helper=20+=20a?= =?UTF-8?q?dd=20real=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-review caught five gaps in the first cut. This commit addresses them: 1. Silent fallback to absolute path now logs a WARNING. The original code returned ``str(path)`` whenever a path landed outside data_root — i.e. the same bug #108 was supposed to fix, silently. A misconfigured ``--data-root`` is now visible in CLI output via ``logger.warning``, making the failure mode loud. 2. Deduped helper. ``_relative_to_data_root`` in cli.py and ``_maybe_relative`` in manifest.py were near-identical duplicates. Extracted to ``synthbanshee/package/_paths.relative_to_data_root``; both call sites delegate to the single helper. 3. Direct tests for ``_infer_data_root`` and ``relative_to_data_root``. Previously the helpers were only tested through their call sites (which exercise the success path); the new ``TestInferDataRoot`` and ``TestRelativeToDataRoot`` classes cover every branch including the loud-warn fallback. ``test_path_outside_root_falls_back_to_absolute_with_warning`` uses ``caplog`` to verify the warning fires. 4. End-to-end batch CLI test. The new ``test_full_batch_writes_relative_paths_to_manifest_and_clip_json`` invokes ``generate-batch`` with ``--data-root`` set and asserts the resulting ``manifest.csv`` ``wav_path`` column AND the per-clip JSON ``transcript_path`` are both repo-relative. Catches the regression where ``data_root`` is threaded through one path but not the other — both must be relative-anchored, and the previous test suite only exercised them in isolation. 5. Parallel ``_render_one`` call now uses keyword args. The ``ThreadPoolExecutor.submit`` call mixed positional and keyword arguments (kwargs in the sequential path, positional in the parallel path); a single signature drift would silently transpose ``data_root`` and another arg. Both call sites now use the same kwarg shape. Full unit suite: 1708/1708 passing (up from 1700 — eight new tests). ruff + mypy clean. Co-Authored-By: Claude Opus 4.7 --- synthbanshee/cli.py | 39 ++----- synthbanshee/package/_paths.py | 35 ++++++ synthbanshee/package/manifest.py | 17 +-- tests/unit/test_cli.py | 177 +++++++++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 44 deletions(-) create mode 100644 synthbanshee/package/_paths.py diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py index f353ebd..74f45cb 100644 --- a/synthbanshee/cli.py +++ b/synthbanshee/cli.py @@ -32,6 +32,7 @@ from synthbanshee.config.scene_config import SceneConfig console = Console() +logger = logging.getLogger(__name__) class DiscoveredScene(NamedTuple): @@ -212,23 +213,6 @@ def _build_preprocessing_metadata(result: PreprocessingResult) -> PreprocessingA ) -def _relative_to_data_root(path: Path, data_root: Path | None) -> str: - """Render *path* as a string relative to *data_root* when possible. - - #108 — clip JSON / manifest CSV must record repo-relative paths so the - corpus is portable. Returns ``str(path.resolve().relative_to(data_root))`` - when *path* is genuinely under *data_root* (after resolving symlinks on - both sides), falling back to ``str(path)`` otherwise. *data_root* of - ``None`` short-circuits to the legacy absolute form. - """ - if data_root is None: - return str(path) - try: - return str(Path(path).resolve().relative_to(Path(data_root).resolve())) - except ValueError: - return str(path) - - def _infer_data_root(output_dir: Path, override: Path | None) -> Path | None: """Pick a data root for path-rewriting in clip metadata. @@ -735,7 +719,9 @@ def vlog(msg: str) -> None: # #108: clip metadata records repo-relative paths so the corpus is # portable. Anchor at *data_root*; fall back to the legacy absolute form - # if a path is genuinely outside the root. + # if a path is genuinely outside the root (with a warning logged). + from synthbanshee.package._paths import relative_to_data_root + _resolved_data_root = _infer_data_root(output_dir, data_root) metadata = label_gen.generate_clip_metadata( clip_id=f"{clip_id}_00", @@ -749,11 +735,11 @@ def vlog(msg: str) -> None: random_seed=scene.random_seed, preprocessing=preprocessing_meta, dirty_file_path=( - _relative_to_data_root(result.dirty_path, _resolved_data_root) + relative_to_data_root(result.dirty_path, _resolved_data_root) if result.dirty_path else None ), - transcript_path=_relative_to_data_root(clip_txt, _resolved_data_root), + transcript_path=relative_to_data_root(clip_txt, _resolved_data_root), acoustic_scene=acoustic_scene_meta, quality_flags=quality_flags, generation_metadata=gen_meta, @@ -1004,13 +990,10 @@ def _distribute_speakers( Scenes whose speakers already match the assigned variant get an empty override dict (no-op). """ - import logging import random from synthbanshee.config.speaker_config import SpeakerConfig - logger = logging.getLogger(__name__) - # 1. Discover all available speakers. all_speakers: dict[str, SpeakerConfig] = {} for search_dir in [Path("configs/speakers"), Path("configs/examples")]: @@ -1383,11 +1366,11 @@ def generate_batch( script_cache_dir, run_cfg.max_retries, stop_event, - verbose, - speaker_override_map.get(scene_yaml), - profile, - run_cfg.enable_breathiness, - data_root, + verbose=verbose, + speaker_overrides=speaker_override_map.get(scene_yaml), + project_profile=profile, + enable_breathiness=run_cfg.enable_breathiness, + data_root=data_root, ): scene_yaml for scene_yaml in selected_paths } diff --git a/synthbanshee/package/_paths.py b/synthbanshee/package/_paths.py new file mode 100644 index 0000000..0f4a87c --- /dev/null +++ b/synthbanshee/package/_paths.py @@ -0,0 +1,35 @@ +"""Shared helper for repo-relative path rendering in clip metadata +and manifest CSV. #108 — keep cli.py and manifest.py in sync. +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def relative_to_data_root(path: Path, data_root: Path | None) -> str: + """Render *path* as a string relative to *data_root* when possible. + + Returns ``str(path.resolve().relative_to(data_root.resolve()))`` when + *path* is genuinely under *data_root* (symlinks resolved on both + sides), falling back to ``str(path)`` otherwise. *data_root* of + ``None`` short-circuits to the absolute form. Emits a + ``logger.warning`` on the out-of-root fallback so a misconfigured + ``--data-root`` does not silently revert to the pre-#108 absolute- + path behaviour. + """ + if data_root is None: + return str(path) + try: + return str(Path(path).resolve().relative_to(Path(data_root).resolve())) + except ValueError: + logger.warning( + "Path %s is outside data_root %s; recording absolute path. " + "Configure --data-root / SYNTHBANSHEE_DATA_ROOT to fix.", + path, + data_root, + ) + return str(path) diff --git a/synthbanshee/package/manifest.py b/synthbanshee/package/manifest.py index c59d68d..96bf1cf 100644 --- a/synthbanshee/package/manifest.py +++ b/synthbanshee/package/manifest.py @@ -14,6 +14,7 @@ import pydantic from synthbanshee.labels.schema import ClipMetadata +from synthbanshee.package._paths import relative_to_data_root as _maybe_relative _MANIFEST_COLUMNS = [ "clip_id", @@ -51,22 +52,6 @@ class ManifestRow: strong_labels_path: str # path to .jsonl, empty string if absent -def _maybe_relative(path: Path, root: Path | None) -> str: - """Render *path* as repo-relative when *root* is provided and contains it. - - Mirrors ``synthbanshee.cli._relative_to_data_root`` so the manifest CSV - and per-clip JSON agree on path shape. Falls back to the absolute form - when *path* is genuinely outside *root* (after resolving symlinks on - both sides) or when *root* is ``None``. - """ - if root is None: - return str(path) - try: - return str(path.resolve().relative_to(root.resolve())) - except ValueError: - return str(path) - - def generate_manifest( data_dir: Path, output_path: Path, diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index ba68cc3..0de5281 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -21,12 +21,14 @@ _derive_event_type, _discover_scene_configs, _distribute_speakers, + _infer_data_root, _print_batch_summary, _print_selection_summary, _run_generate_pipeline, _select_configs_by_typology, cli, ) +from synthbanshee.package._paths import relative_to_data_root from synthbanshee.package.validator import ValidationResult SCENES_DIR = Path(__file__).parent.parent.parent / "configs" / "scenes" @@ -405,6 +407,113 @@ def test_full_batch_with_mocked_tts(self, tmp_path): # Manifest CSV must exist with at least a header row assert manifest_path.exists() + def test_full_batch_writes_relative_paths_to_manifest_and_clip_json( + self, tmp_path, monkeypatch + ): + """End-to-end #108 contract: with --data-root explicit, the resulting + manifest CSV's ``wav_path`` column and the per-clip JSON's + ``transcript_path`` are repo-relative — not absolute paths. + + Catches the failure mode where the data_root threading was correct + in `_run_generate_pipeline` but forgotten in the `generate_manifest` + call (or vice versa). Both must be relative-anchored. + """ + import csv as _csv + + import yaml as _yaml + + # #107: ensure none of the dir env vars leak into this CliRunner + # invocation. The strip fixture for that lives on the #107 PR + # branch (#110); this test keeps the protection local until that + # PR merges so the two fixes can land independently. + for _env in ( + "SYNTHBANSHEE_DATA_DIR", + "SYNTHBANSHEE_CACHE_DIR", + "SYNTHBANSHEE_SCRIPT_CACHE_DIR", + ): + monkeypatch.delenv(_env, raising=False) + + # Lay out the corpus layout `/data/he/`. + data_root = tmp_path / "corpus" + out_dir = data_root / "data" / "he" + out_dir.mkdir(parents=True) + + scenes_dir = tmp_path / "scenes" + scenes_dir.mkdir() + scene = { + "scene_id": "SP_NEU_A_0000", + "project": "she_proves", + "language": "he", + "violence_typology": "NEU", + "tier": "A", + "random_seed": 0, + "speakers": [{"speaker_id": "AGG_M_30-45_001", "role": "AGG"}], + "script_template": "synthbanshee/script/templates/she_proves/neutral_domestic_routine.j2", + "script_slots": {}, + "intensity_arc": [1, 1, 1], + "target_duration_minutes": 3.0, + "output_dir": str(out_dir), + } + (scenes_dir / "scene_000.yaml").write_text(_yaml.dump(scene), encoding="utf-8") + + run_cfg_path = tmp_path / "run.yaml" + run_cfg_path.write_text( + _BATCH_RUN_CONFIG_TEMPLATE.format( + output_dir=str(out_dir), + scene_configs_dir=str(scenes_dir), + ), + encoding="utf-8", + ) + + turns = _make_dialogue_turns(n=1) + mixed = _make_mixed_scene(n_turns=1) + + manifest_path = data_root / "manifest.csv" + runner = CliRunner() + with ( + patch("synthbanshee.script.generator.ScriptGenerator") as MockGen, + patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer, + ): + MockGen.return_value.generate.return_value = turns + MockRenderer.return_value.render_scene.return_value = mixed + result = runner.invoke( + cli, + [ + "generate-batch", + "--run-config", + str(run_cfg_path), + "--manifest-out", + str(manifest_path), + "--cache-dir", + str(data_root / "assets" / "speech"), + "--dirty-dir", + str(data_root / "assets" / "speech" / "dirty"), + "--script-cache-dir", + str(data_root / "assets" / "scripts"), + "--data-root", + str(data_root), + ], + ) + + assert result.exit_code == 0, result.output + + # Manifest CSV: wav_path column must be repo-relative. + with manifest_path.open(encoding="utf-8") as fh: + rows = list(_csv.DictReader(fh)) + assert len(rows) == 1, f"expected one row, got: {rows}" + wav_col = rows[0]["wav_path"] + assert not Path(wav_col).is_absolute(), ( + f"manifest wav_path should be relative, got: {wav_col!r}" + ) + assert wav_col.startswith("data/he/"), wav_col + + # Per-clip JSON: transcript_path must also be repo-relative. + clip_jsons = list(out_dir.rglob("*.json")) + assert clip_jsons, "no clip JSON written" + meta = json.loads(clip_jsons[0].read_text(encoding="utf-8")) + assert not Path(meta["transcript_path"]).is_absolute() + assert meta["transcript_path"].startswith("data/he/"), meta["transcript_path"] + def test_no_distribute_speakers_skips_distribution(self, tmp_path): """--no-distribute-speakers produces empty override maps for every scene.""" import yaml as _yaml @@ -1821,6 +1930,74 @@ def _fake_preprocessing_result(peak_dbfs: float): ) +class TestInferDataRoot: + """Direct unit tests for `_infer_data_root` — the helper that owns the + "two parents above output_dir" magic used by #108. Covers every branch + of the function so a future refactor can't silently regress the + inference contract. + """ + + def test_explicit_override_returned_resolved(self, tmp_path): + """An explicit override is returned (resolved, unchanged otherwise).""" + override = tmp_path / "explicit_root" + override.mkdir() + # output_dir is irrelevant when override is set + result = _infer_data_root(Path("/whatever/data/he"), override) + assert result == override.resolve() + + def test_corpus_shape_inferred_to_two_parents_up(self, tmp_path): + """`/data/he/` → infers `` (the corpus convention).""" + data_he = tmp_path / "data" / "he" + data_he.mkdir(parents=True) + result = _infer_data_root(data_he, override=None) + assert result == tmp_path.resolve() + + def test_shallow_root_returns_none(self): + """A path with fewer than two parents has no sensible data root.""" + # "/" has zero parents; "/foo" has one parent ("/"). Both bail. + assert _infer_data_root(Path("/"), override=None) is None + + def test_relative_path_resolves_against_cwd(self, tmp_path, monkeypatch): + """A relative output_dir resolves against cwd before inference.""" + nested = tmp_path / "corpus" / "data" / "he" + nested.mkdir(parents=True) + monkeypatch.chdir(tmp_path / "corpus") + result = _infer_data_root(Path("data/he"), override=None) + # Resolved path: /corpus/data/he → parent.parent → /corpus + assert result == (tmp_path / "corpus").resolve() + + +class TestRelativeToDataRoot: + """Direct unit tests for the shared helper that owns #108's + repo-relative path rendering — exercised by both cli.py + (clip JSON) and manifest.py (manifest CSV).""" + + def test_returns_relative_when_under_root(self, tmp_path): + root = tmp_path / "corpus" + path = root / "data" / "he" / "clip.wav" + path.parent.mkdir(parents=True) + path.touch() + assert relative_to_data_root(path, root) == "data/he/clip.wav" + + def test_data_root_none_returns_absolute_unchanged(self): + p = Path("/Users/whatever/corpus/data/he/clip.wav") + assert relative_to_data_root(p, None) == str(p) + + def test_path_outside_root_falls_back_to_absolute_with_warning(self, tmp_path, caplog): + root = tmp_path / "corpus" + root.mkdir() + outside = tmp_path / "elsewhere" / "clip.wav" + outside.parent.mkdir(parents=True) + outside.touch() + with caplog.at_level("WARNING", logger="synthbanshee.package._paths"): + result = relative_to_data_root(outside, root) + assert Path(result).is_absolute() + # The point of the warning: misconfigured data_root must not be silent. + assert any("is outside data_root" in rec.message for rec in caplog.records), ( + f"No fallback warning emitted. caplog records: {caplog.records}" + ) + + class TestBuildPreprocessingMetadata: """`_build_preprocessing_metadata` must pass the *measured* peak from `PreprocessingResult.peak_dbfs` into `PreprocessingApplied.normalized_dbfs` From d78f1fdeb7072ebfb42351b8c11ebee630799a0e Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Tue, 12 May 2026 16:28:05 +0300 Subject: [PATCH 3/3] =?UTF-8?q?fix(cli,manifest):=20#108=20=E2=80=94=20add?= =?UTF-8?q?ress=20Copilot=20review=20on=20path-shape=20stability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copilot caught two real gaps in the fallback path: 1. **Relative input + None data_root → relative output.** The original ``str(path)`` fallback claimed "absolute form" in the docstring but returned whatever shape the caller passed. A relative ``Path("data/he/clip.wav")`` produced a relative string in the JSON, making metadata depend on the process's working directory at write time. Fixed by always calling ``Path.resolve()`` before stringifying — every code path now produces a stable absolute string when no relative form is achievable. 2. **Windows path separators in JSON.** ``str(Path)`` on Windows emits backslashes; the same corpus loaded on Linux then breaks. Fixed by using ``Path.as_posix()`` on every path the helper returns, both for the relative happy path and the absolute fallback. The corpus is now portable across host OSes by contract, not by accident. New regression tests cover the combined edge cases: - relative input + data_root=None → absolute output - relative input + path outside root → absolute output (not "../../foo") - output strings carry no backslashes regardless of host OS Full unit suite: 1710/1710 passing (up from 1708 — two new edge-case tests in TestRelativeToDataRoot). Resolves Copilot review threads on _relative_to_data_root and _maybe_relative. Co-Authored-By: Claude Opus 4.7 --- synthbanshee/package/_paths.py | 37 ++++++++++++++++-------- tests/unit/test_cli.py | 52 ++++++++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 17 deletions(-) diff --git a/synthbanshee/package/_paths.py b/synthbanshee/package/_paths.py index 0f4a87c..3ed80b3 100644 --- a/synthbanshee/package/_paths.py +++ b/synthbanshee/package/_paths.py @@ -11,20 +11,33 @@ def relative_to_data_root(path: Path, data_root: Path | None) -> str: - """Render *path* as a string relative to *data_root* when possible. - - Returns ``str(path.resolve().relative_to(data_root.resolve()))`` when - *path* is genuinely under *data_root* (symlinks resolved on both - sides), falling back to ``str(path)`` otherwise. *data_root* of - ``None`` short-circuits to the absolute form. Emits a - ``logger.warning`` on the out-of-root fallback so a misconfigured - ``--data-root`` does not silently revert to the pre-#108 absolute- - path behaviour. + """Render *path* as a string anchored at *data_root* when possible. + + Returns a POSIX-style relative string (``"data/he/clip.wav"``) when + *path* resolves to a location under *data_root* (symlinks resolved + on both sides). Falls back to a POSIX-style **absolute** string in + every other case: + + - *data_root* is ``None``: nothing to anchor against. + - *path* resolves outside *data_root*: also emits a + ``logger.warning`` so a misconfigured ``--data-root`` is loud + rather than silent. + + Both branches always call ``Path.resolve()`` and ``Path.as_posix()`` + so the JSON / CSV path shape is **stable** regardless of: + + - whether the caller passed a relative or absolute ``path`` + (#108 review: a relative input used to produce a relative-string + fallback, contradicting the docstring and making metadata depend + on the working directory), + - the host OS (#108 review: Windows would otherwise emit + backslashes; POSIX separators keep the corpus portable). """ + resolved = Path(path).resolve() if data_root is None: - return str(path) + return resolved.as_posix() try: - return str(Path(path).resolve().relative_to(Path(data_root).resolve())) + return resolved.relative_to(Path(data_root).resolve()).as_posix() except ValueError: logger.warning( "Path %s is outside data_root %s; recording absolute path. " @@ -32,4 +45,4 @@ def relative_to_data_root(path: Path, data_root: Path | None) -> str: path, data_root, ) - return str(path) + return resolved.as_posix() diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 0de5281..d268afb 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -1972,18 +1972,42 @@ class TestRelativeToDataRoot: repo-relative path rendering — exercised by both cli.py (clip JSON) and manifest.py (manifest CSV).""" - def test_returns_relative_when_under_root(self, tmp_path): + def test_returns_posix_relative_when_under_root(self, tmp_path): root = tmp_path / "corpus" path = root / "data" / "he" / "clip.wav" path.parent.mkdir(parents=True) path.touch() + # POSIX separators, regardless of host OS (#108 Copilot review). assert relative_to_data_root(path, root) == "data/he/clip.wav" - def test_data_root_none_returns_absolute_unchanged(self): - p = Path("/Users/whatever/corpus/data/he/clip.wav") - assert relative_to_data_root(p, None) == str(p) + def test_data_root_none_returns_resolved_absolute_posix(self, tmp_path): + """`data_root=None` returns a *resolved* absolute POSIX string.""" + target = tmp_path / "corpus" / "data" / "he" / "clip.wav" + target.parent.mkdir(parents=True) + target.touch() + result = relative_to_data_root(target, None) + assert Path(result).is_absolute() + # No backslashes — POSIX everywhere. + assert "\\" not in result + + def test_relative_input_data_root_none_still_returns_absolute(self, tmp_path, monkeypatch): + """A relative input + ``data_root=None`` must resolve to absolute. + + Copilot's #108 review caught the original bug: the function + claimed "absolute fallback" but returned ``str(path)`` verbatim, + which is relative if the input was relative. The fix calls + ``Path.resolve()`` on every code path. + """ + nested = tmp_path / "corpus" / "data" / "he" + nested.mkdir(parents=True) + (nested / "clip.wav").touch() + monkeypatch.chdir(tmp_path / "corpus") + result = relative_to_data_root(Path("data/he/clip.wav"), None) + assert Path(result).is_absolute(), ( + f"relative input with data_root=None must resolve to absolute, got: {result!r}" + ) - def test_path_outside_root_falls_back_to_absolute_with_warning(self, tmp_path, caplog): + def test_path_outside_root_falls_back_to_resolved_absolute_with_warning(self, tmp_path, caplog): root = tmp_path / "corpus" root.mkdir() outside = tmp_path / "elsewhere" / "clip.wav" @@ -1992,11 +2016,29 @@ def test_path_outside_root_falls_back_to_absolute_with_warning(self, tmp_path, c with caplog.at_level("WARNING", logger="synthbanshee.package._paths"): result = relative_to_data_root(outside, root) assert Path(result).is_absolute() + # Must be resolved (no backslashes, no '..'). + assert "\\" not in result and ".." not in result # The point of the warning: misconfigured data_root must not be silent. assert any("is outside data_root" in rec.message for rec in caplog.records), ( f"No fallback warning emitted. caplog records: {caplog.records}" ) + def test_relative_input_outside_root_also_resolves_to_absolute( + self, tmp_path, monkeypatch, caplog + ): + """Combined edge case from Copilot's #108 review: relative input + + path outside the configured data_root must still produce an + absolute string (not a misleading relative one).""" + root = tmp_path / "corpus" + root.mkdir() + elsewhere = tmp_path / "elsewhere" + elsewhere.mkdir() + (elsewhere / "clip.wav").touch() + monkeypatch.chdir(tmp_path) + with caplog.at_level("WARNING", logger="synthbanshee.package._paths"): + result = relative_to_data_root(Path("elsewhere/clip.wav"), root) + assert Path(result).is_absolute(), f"expected absolute fallback, got: {result!r}" + class TestBuildPreprocessingMetadata: """`_build_preprocessing_metadata` must pass the *measured* peak from