Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 72 additions & 10 deletions synthbanshee/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from synthbanshee.config.scene_config import SceneConfig

console = Console()
logger = logging.getLogger(__name__)


class DiscoveredScene(NamedTuple):
Expand Down Expand Up @@ -212,6 +213,23 @@ def _build_preprocessing_metadata(result: PreprocessingResult) -> PreprocessingA
)


def _infer_data_root(output_dir: Path, override: Path | None) -> Path | None:
"""Pick a data root for path-rewriting in clip metadata.

Returns *override* when supplied. Otherwise infers from *output_dir*:
the corpus convention is ``<data_root>/data/he/``, so two parents up is
the right anchor. Returns ``None`` only when no sensible inference is
possible (e.g. *output_dir* has fewer than two parents), in which case
callers leave paths absolute.
"""
if override is not None:
return Path(override).resolve()
resolved = Path(output_dir).resolve()
if len(resolved.parents) < 2:
return None
return resolved.parent.parent


def _run_generate_pipeline(
config: Path,
output_dir: Path,
Expand All @@ -222,6 +240,7 @@ def _run_generate_pipeline(
speaker_overrides: dict[str, str] | None = None,
project_profile: ProjectProfile | None = None,
enable_breathiness: bool = False,
data_root: Path | None = None,
) -> tuple[Path | None, list[str]]:
"""Run the full single-clip generate pipeline.

Expand Down Expand Up @@ -698,6 +717,12 @@ def vlog(msg: str) -> None:
effective_prosody_caps=_cap_events,
)

# #108: clip metadata records repo-relative paths so the corpus is
# portable. Anchor at *data_root*; fall back to the legacy absolute form
# if a path is genuinely outside the root (with a warning logged).
from synthbanshee.package._paths import relative_to_data_root

_resolved_data_root = _infer_data_root(output_dir, data_root)
metadata = label_gen.generate_clip_metadata(
clip_id=f"{clip_id}_00",
project=scene.project,
Expand All @@ -709,8 +734,12 @@ def vlog(msg: str) -> None:
scene_config_path=str(config),
random_seed=scene.random_seed,
preprocessing=preprocessing_meta,
dirty_file_path=str(result.dirty_path) if result.dirty_path else None,
transcript_path=str(clip_txt),
dirty_file_path=(
relative_to_data_root(result.dirty_path, _resolved_data_root)
if result.dirty_path
else None
),
transcript_path=relative_to_data_root(clip_txt, _resolved_data_root),
acoustic_scene=acoustic_scene_meta,
quality_flags=quality_flags,
generation_metadata=gen_meta,
Expand Down Expand Up @@ -788,6 +817,17 @@ def cli() -> None:
envvar="SYNTHBANSHEE_SCRIPT_CACHE_DIR",
help="LLM script generation cache directory.",
)
@click.option(
"--data-root",
type=click.Path(path_type=Path),
default=None,
envvar="SYNTHBANSHEE_DATA_ROOT",
help=(
"Root directory that paths in clip JSON metadata are written "
"relative to (corpus repo root by convention). Defaults to "
"two parents above --output-dir."
),
)
@click.option(
"--dry-run",
is_flag=True,
Expand Down Expand Up @@ -817,6 +857,7 @@ def generate(
cache_dir: Path,
dirty_dir: Path,
script_cache_dir: Path,
data_root: Path | None,
dry_run: bool,
project_profile: str | None,
verbose: bool,
Expand Down Expand Up @@ -858,6 +899,7 @@ def generate(
script_cache_dir,
verbose=verbose,
project_profile=profile,
data_root=data_root,
)

if wav_path is None:
Expand Down Expand Up @@ -948,13 +990,10 @@ def _distribute_speakers(
Scenes whose speakers already match the assigned variant get an empty
override dict (no-op).
"""
import logging
import random

from synthbanshee.config.speaker_config import SpeakerConfig

logger = logging.getLogger(__name__)

# 1. Discover all available speakers.
all_speakers: dict[str, SpeakerConfig] = {}
for search_dir in [Path("configs/speakers"), Path("configs/examples")]:
Expand Down Expand Up @@ -1040,6 +1079,7 @@ def _render_one(
speaker_overrides: dict[str, str] | None = None,
project_profile: ProjectProfile | None = None,
enable_breathiness: bool = False,
data_root: Path | None = None,
) -> tuple[Path | None, list[str]]:
"""Render a single clip with retries.

Expand Down Expand Up @@ -1067,6 +1107,7 @@ def _render_one(
speaker_overrides=speaker_overrides,
project_profile=project_profile,
enable_breathiness=enable_breathiness,
data_root=data_root,
)
if wav_path is not None:
return wav_path, messages
Expand Down Expand Up @@ -1112,6 +1153,16 @@ def _render_one(
envvar="SYNTHBANSHEE_SCRIPT_CACHE_DIR",
help="LLM script generation cache directory.",
)
@click.option(
"--data-root",
type=click.Path(path_type=Path),
default=None,
envvar="SYNTHBANSHEE_DATA_ROOT",
help=(
"Root directory that paths in clip JSON metadata and manifest CSV "
"are written relative to. Defaults to two parents above --output-dir."
),
)
@click.option(
"--manifest-out",
"-m",
Expand Down Expand Up @@ -1159,6 +1210,7 @@ def generate_batch(
cache_dir: Path,
dirty_dir: Path,
script_cache_dir: Path,
data_root: Path | None,
manifest_out: Path | None,
dry_run: bool,
workers: int,
Expand Down Expand Up @@ -1283,6 +1335,7 @@ def generate_batch(
speaker_overrides=speaker_override_map.get(scene_yaml),
project_profile=profile,
enable_breathiness=run_cfg.enable_breathiness,
data_root=data_root,
)
progress.advance(task_id)
if wav_path is None:
Expand Down Expand Up @@ -1313,10 +1366,11 @@ def generate_batch(
script_cache_dir,
run_cfg.max_retries,
stop_event,
verbose,
speaker_override_map.get(scene_yaml),
profile,
run_cfg.enable_breathiness,
verbose=verbose,
speaker_overrides=speaker_override_map.get(scene_yaml),
project_profile=profile,
enable_breathiness=run_cfg.enable_breathiness,
data_root=data_root,
): scene_yaml
for scene_yaml in selected_paths
}
Expand Down Expand Up @@ -1366,7 +1420,15 @@ def generate_batch(
)

# --- Manifest ---
rows = generate_manifest(out_dir, manifest_path, splits=splits, clip_ids=set(splits.keys()))
# #108: anchor manifest paths at the same data root used for clip JSON.
_manifest_data_root = _infer_data_root(out_dir, data_root)
rows = generate_manifest(
out_dir,
manifest_path,
splits=splits,
clip_ids=set(splits.keys()),
relative_to=_manifest_data_root,
)
console.print(f"[bold green]Manifest written:[/bold green] {manifest_path} ({len(rows)} rows)")

# --- Summary ---
Expand Down
12 changes: 12 additions & 0 deletions synthbanshee/labels/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,19 @@ class ClipMetadata(BaseModel):
weak_label: WeakLabel
preprocessing_applied: PreprocessingApplied = Field(default_factory=PreprocessingApplied)
dirty_file_path: str | None = None
"""Path to the retained pre-preprocessing WAV (#108).

Written relative to the data root passed to ``synthbanshee generate``
via ``--data-root`` / ``SYNTHBANSHEE_DATA_ROOT`` (default: two parents
above ``--output-dir``). Older corpus snapshots may still carry
absolute paths; consumers should treat both forms as accepted but
only emit relative paths going forward."""

transcript_path: str | None = None
"""Path to the ``.txt`` transcript file (#108).

Written relative to the data root — see ``dirty_file_path`` for the
anchor convention."""
Comment on lines 218 to +231
quality_flags: list[str] = Field(default_factory=list)
annotator_confidence: float = Field(ge=0.0, le=1.0, default=1.0)
iaa_reviewed: bool = False
Expand Down
48 changes: 48 additions & 0 deletions synthbanshee/package/_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Shared helper for repo-relative path rendering in clip metadata
and manifest CSV. #108 — keep cli.py and manifest.py in sync.
"""

from __future__ import annotations

import logging
from pathlib import Path

logger = logging.getLogger(__name__)


def relative_to_data_root(path: Path, data_root: Path | None) -> str:
    """Return *path* as a POSIX string, relative to *data_root* if possible.

    *path* is always resolved first (symlinks included), and the result is
    always rendered with ``as_posix()``, so the emitted string does not
    depend on the working directory a relative input was given from, nor
    on the host OS path separator.

    Outcomes:

    - *data_root* is ``None``: the resolved absolute path is returned;
      there is nothing to anchor against.
    - *path* resolves to a location under the resolved *data_root*: the
      relative form is returned (e.g. ``"data/he/clip.wav"``).
    - *path* resolves outside *data_root*: a warning is logged, so that a
      misconfigured ``--data-root`` is noticed, and the resolved absolute
      path is returned.
    """
    target = Path(path).resolve()
    absolute_form = target.as_posix()

    if data_root is None:
        return absolute_form

    try:
        # Resolve the root as well so both sides have symlinks resolved
        # before the containment check.
        relative = target.relative_to(Path(data_root).resolve())
    except ValueError:
        logger.warning(
            "Path %s is outside data_root %s; recording absolute path. "
            "Configure --data-root / SYNTHBANSHEE_DATA_ROOT to fix.",
            path,
            data_root,
        )
        return absolute_form
    else:
        return relative.as_posix()
12 changes: 10 additions & 2 deletions synthbanshee/package/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import pydantic

from synthbanshee.labels.schema import ClipMetadata
from synthbanshee.package._paths import relative_to_data_root as _maybe_relative

_MANIFEST_COLUMNS = [
"clip_id",
Expand Down Expand Up @@ -57,6 +58,7 @@ def generate_manifest(
*,
splits: dict[str, str] | None = None,
clip_ids: set[str] | None = None,
relative_to: Path | None = None,
) -> list[ManifestRow]:
"""Scan data_dir recursively for clip JSON files and write a manifest CSV.

Expand All @@ -73,6 +75,10 @@ def generate_manifest(
whose clip_id is in this set are included in the manifest. Use this
to restrict the manifest to a specific generation run when
``data_dir`` may contain clips from previous runs.
relative_to: Optional data root for path columns (#108). When
provided, ``wav_path`` and ``strong_labels_path`` are written
relative to this directory; paths outside the root fall back to
absolute form.

Returns:
List of ManifestRow objects that were written to output_path.
Expand Down Expand Up @@ -112,8 +118,10 @@ def generate_manifest(
max_intensity=metadata.weak_label.max_intensity,
quality_flags=",".join(metadata.quality_flags),
split=(splits or {}).get(metadata.clip_id, ""),
wav_path=str(json_path.with_suffix(".wav")),
strong_labels_path=str(jsonl_path) if jsonl_path.exists() else "",
wav_path=_maybe_relative(json_path.with_suffix(".wav"), relative_to),
strong_labels_path=(
_maybe_relative(jsonl_path, relative_to) if jsonl_path.exists() else ""
),
)
)

Expand Down
Loading
Loading