diff --git a/.claude/AMX_GOTCHAS.md b/.claude/AMX_GOTCHAS.md index 3e8dfe48..22330c91 100644 --- a/.claude/AMX_GOTCHAS.md +++ b/.claude/AMX_GOTCHAS.md @@ -66,18 +66,41 @@ For CPUID leaf 7 (AMX detection): use `__cpuid_count()`, not inline asm. --- -## Gotcha 4: OS must enable AMX via XSETBV +## Gotcha 4: OS must enable AMX via XSETBV + process must request permission -AMX tiles are large (8 KB of state). The OS must opt in via XCR0 bits 17+18. -Linux 5.19+ enables AMX by default. Older kernels: SIGILL on tile instructions. +AMX tiles are large (8 KB of state). Two levels of OS enablement required: + +1. **Kernel enables tile state in XCR0** (bits 17+18). Linux 5.19+ does this. +2. **Process requests XCOMP_PERM** via `prctl(ARCH_REQ_XCOMP_PERM, 18)`. + Without this, LDTILECFG will SIGILL even if XCR0 bits are set. **Detection (stable)**: ```rust -let xcr0 = core::arch::x86_64::__cpuid_count(0xD, 0); -let tilecfg = (xcr0.eax >> 17) & 1; // bit 17 = XTILECFG -let tiledata = (xcr0.eax >> 18) & 1; // bit 18 = XTILEDATA -// Both must be 1 -``` +// Step 1: CPUID — does CPU support AMX? +let cpuid = core::arch::x86_64::__cpuid_count(7, 0); +let amx_tile = (cpuid.edx >> 24) & 1; +let amx_int8 = (cpuid.edx >> 25) & 1; + +// Step 2: OSXSAVE — does OS support XSAVE? +let cpuid_01 = core::arch::x86_64::__cpuid(1); +let osxsave = (cpuid_01.ecx >> 27) & 1; + +// Step 3: _xgetbv(0) — did OS ACTUALLY enable tile state? +// ⚠ Do NOT use __cpuid_count(0xD, 0) — that reports what CPU SUPPORTS, +// not what the OS ENABLED. _xgetbv(0) reads the actual XCR0 register. +let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) }; +let tilecfg = (xcr0 >> 17) & 1; // bit 17 = XTILECFG +let tiledata = (xcr0 >> 18) & 1; // bit 18 = XTILEDATA + +// Step 4: prctl — request tile permission for this process +// SYS_prctl = 157, ARCH_REQ_XCOMP_PERM = 0x1023, XFEATURE_XTILEDATA = 18 +// Returns 0 on success, -errno on failure. Idempotent. +``` + +**Previous bug**: `__cpuid_count(0xD, 0)` reports XSAVE state component bitmap +(what the CPU *supports*), NOT the actual XCR0 value (what the OS *enabled*). +On hypervisors that advertise AMX in CPUID but don't enable tile state, +the old check returned `true` → SIGILL on LDTILECFG. --- diff --git a/src/hpc/audio/codec_map.rs b/src/hpc/audio/codec_map.rs new file mode 100644 index 00000000..24e2935a --- /dev/null +++ b/src/hpc/audio/codec_map.rs @@ -0,0 +1,297 @@ +//! Codec provenance map: which real codec each primitive comes from. +//! +//! Every primitive in this audio stack was stolen from a production codec. +//! Nothing invented — only transcoded and compressed to fit the HHTL cascade. +//! +//! ```text +//! ┌─────────────┬──────────┬─────────┬────────┬─────────┬──────┬───────────┐ +//! │ Our type │ Opus │ Whisper │ MP3 │ Vorbis │ Bark │ ElevenLabs│ +//! ├─────────────┼──────────┼─────────┼────────┼─────────┼──────┼───────────┤ +//! │ MDCT │ CELT │ │ hybrid │ ✓ │ │ │ +//! │ 21 bands │ eBands48 │ │ 32 sub │ ✓ │ │ │ +//! │ PVQ shape │ CELT PVQ │ │ │ residue │ │ │ +//! │ Mel 80ch │ │ frontend│ │ │ │ │ +//! │ Phase 4B │ │ STFT ∠ │ │ │ │ │ +//! │ VoiceArch │ │ │ │ │ spk │ embedding │ +//! │ RvqFrame │ │ │ │ │ 3stg │ │ +//! │ OctaveBand │ │ │ ✓ │ floor │ │ │ +//! │ Mode │ │ │ │ │ │ emotion │ +//! │ HHTL skip │ │ │ mask │ floor │ │ │ +//! │ CompLinear │ │ │ │ VQ cb │ RVQ │ │ +//! │ Qualia17D │ (QPL) │ │ │ │ sem │ emotion │ +//! └─────────────┴──────────┴─────────┴────────┴─────────┴──────┴───────────┘ +//! ``` +//! +//! The architecture replaces neural inference with graph search at every stage: +//! MP3's psychoacoustic model → HHTL cascade (RouteAction::Skip) +//! Whisper's transformer → phoneme graph shortest path +//! Bark's 3 GPT-2 stages → 3 HHTL levels (HEEL/HIP/TWIG) +//! Vorbis's codebook VQ → CompiledLinear VNNI palette lookup +//! ElevenLabs' voice cloning → VoiceArchetype 16-byte embedding + +/// Codec provenance for each audio primitive. +/// +/// Documents which production codec each type was transcoded from, +/// what aspect of that codec it captures, and what it replaces. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum CodecSource { + Opus, + Whisper, + Mp3, + OggVorbis, + Bark, + ElevenLabs, +} + +/// What aspect of audio each primitive captures. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum AudioAspect { + /// Spectral energy distribution (WHAT frequencies) + SpectralEnvelope, + /// Fine spectral shape within bands (HOW the energy is distributed) + SpectralShape, + /// Perceptual frequency mapping (WHERE in human hearing) + PerceptualMapping, + /// Temporal phase relationships (WHEN harmonics align) + PhaseRelationship, + /// Speaker identity (WHO is speaking) + SpeakerIdentity, + /// Semantic/emotional content (WHY it sounds that way) + SemanticContent, + /// Psychoacoustic masking (WHAT to skip) + MaskingDecision, + /// Codebook lookup (HOW to decompress) + CodebookLookup, +} + +/// Complete provenance record for one primitive. +pub struct Provenance { + pub our_type: &'static str, + pub byte_size: usize, + pub source: CodecSource, + pub aspect: AudioAspect, + pub source_concept: &'static str, + pub what_it_replaces: &'static str, +} + +/// Full provenance table for every audio primitive. +/// +/// This IS the design document. If a new primitive doesn't appear here, +/// it wasn't stolen from a real codec and shouldn't exist. +pub const PROVENANCE: &[Provenance] = &[ + // ═══ From Opus CELT ═══ + Provenance { + our_type: "AudioFrame.band_energies", + byte_size: 42, + source: CodecSource::Opus, + aspect: AudioAspect::SpectralEnvelope, + source_concept: "eBands48 critical bands, gain in gain-shape split", + what_it_replaces: "Per-coefficient quantization (MP3/Vorbis)", + }, + Provenance { + our_type: "AudioFrame.pvq_summary", + byte_size: 6, + source: CodecSource::Opus, + aspect: AudioAspect::SpectralShape, + source_concept: "PVQ (Pyramid Vector Quantization) pulse allocation", + what_it_replaces: "Huffman-coded residuals (MP3) / VQ codebook (Vorbis)", + }, + Provenance { + our_type: "mdct_forward / mdct_backward", + byte_size: 0, // transform, not stored + source: CodecSource::Opus, + aspect: AudioAspect::SpectralEnvelope, + source_concept: "CELT MDCT: 960-sample window → 480 frequency bins", + what_it_replaces: "FFT+windowing (all codecs use some form)", + }, + + // ═══ From Whisper ═══ + Provenance { + our_type: "mel::log_mel_spectrogram", + byte_size: 160, // 80 × BF16 per frame + source: CodecSource::Whisper, + aspect: AudioAspect::PerceptualMapping, + source_concept: "80-channel mel filterbank at 16kHz, Hann STFT", + what_it_replaces: "Transformer encoder (150M params → 80 f32 per frame)", + }, + + // ═══ From MP3 ═══ + Provenance { + our_type: "HhtlCache::route() → Skip", + byte_size: 0, // decision, not stored + source: CodecSource::Mp3, + aspect: AudioAspect::MaskingDecision, + source_concept: "Psychoacoustic masking model (simultaneous + temporal)", + what_it_replaces: "ISO 11172-3 psychoacoustic model 1/2 (iterative bit allocation)", + }, + Provenance { + our_type: "OctaveBand", + byte_size: 13, // 3×f32 + u8 + source: CodecSource::Mp3, + aspect: AudioAspect::SpectralEnvelope, + source_concept: "32-subband polyphase filterbank (octave-spaced)", + what_it_replaces: "Per-subband quantization + Huffman (MP3 granules)", + }, + + // ═══ From Ogg Vorbis ═══ + Provenance { + our_type: "CompiledLinear (ndarray burn)", + byte_size: 65536, // 256 centroids × 256 dim + source: CodecSource::OggVorbis, + aspect: AudioAspect::CodebookLookup, + source_concept: "VQ codebook: precomputed centroids, lookup-based decode", + what_it_replaces: "Huffman trees (MP3) / arithmetic coding (Opus range coder)", + }, + + // ═══ From Bark (Suno) ═══ + Provenance { + our_type: "RvqFrame.archetype (HEEL)", + byte_size: 1, + source: CodecSource::Bark, + aspect: AudioAspect::SemanticContent, + source_concept: "Stage 1: GPT-2 semantic tokens (coarse meaning)", + what_it_replaces: "350M-param GPT-2 autoregressive generation", + }, + Provenance { + our_type: "RvqFrame.coarse (HIP)", + byte_size: 8, + source: CodecSource::Bark, + aspect: AudioAspect::SpectralEnvelope, + source_concept: "Stage 2: GPT-2 coarse acoustic tokens (spectral envelope)", + what_it_replaces: "350M-param GPT-2 conditioned on semantic tokens", + }, + Provenance { + our_type: "RvqFrame.fine (TWIG)", + byte_size: 8, + source: CodecSource::Bark, + aspect: AudioAspect::SpectralShape, + source_concept: "Stage 3: non-autoregressive fine acoustic tokens", + what_it_replaces: "Fine model (smaller network, fills spectral detail)", + }, + + // ═══ From ElevenLabs ═══ + Provenance { + our_type: "VoiceArchetype", + byte_size: 16, + source: CodecSource::ElevenLabs, + aspect: AudioAspect::SpeakerIdentity, + source_concept: "Speaker embedding (voice cloning conditioning vector)", + what_it_replaces: "512-dim speaker embedding (2KB → 16 bytes)", + }, + + // ═══ Phase (novel — no codec stores this) ═══ + Provenance { + our_type: "PhaseDescriptor", + byte_size: 4, + source: CodecSource::Whisper, // closest: Whisper STFT preserves phase internally + aspect: AudioAspect::PhaseRelationship, + source_concept: "STFT phase (discarded by all codecs except Griffin-Lim)", + what_it_replaces: "Nothing — all codecs discard phase. We keep it as relative pressure.", + }, + + // ═══ Qualia (novel — derived from QPL musical calibration) ═══ + Provenance { + our_type: "Qualia17D", + byte_size: 68, + source: CodecSource::Bark, // closest: Bark semantic tokens carry meaning + aspect: AudioAspect::SemanticContent, + source_concept: "QPL: Octave→arousal, Fifth→valence, Third→warmth, Tritone→tension", + what_it_replaces: "No codec captures nonverbal meaning explicitly. This is the grid.", + }, +]; + +/// Total bytes for one complete frame (all primitives combined). +/// +/// AudioFrame (48) + PhaseDescriptor (4) + VoiceArchetype (16, amortized) +/// = 52 bytes per frame for complete nonverbal characterization. +/// + RvqFrame (17) for HHTL-compressed TTS output = 69 bytes. +/// +/// Compare: +/// MP3 128kbps: ~417 bytes per 26ms frame +/// Opus 64kbps: ~166 bytes per 20ms frame +/// Bark tokens: ~128 bytes per frame +/// Ours: 52-69 bytes per frame (complete, including phase + identity) +pub const FRAME_BUDGET: usize = 52; +pub const FRAME_BUDGET_WITH_TTS: usize = 69; + +/// Codec comparison: bits per second at comparable quality. +/// +/// These are approximate — our codec is lossy in a fundamentally +/// different way (palette quantization, not psychoacoustic masking). +pub const BITRATE_COMPARISON: &[(&str, u32, &str)] = &[ + ("MP3 128k", 128_000, "psychoacoustic masking, Huffman"), + ("Opus 64k", 64_000, "CELT+SILK hybrid, range coder"), + ("Vorbis 128k", 128_000, "MDCT, floor+residue, VQ codebook"), + ("Bark tokens", 25_600, "3-stage RVQ, ~100 tokens/sec × 256 bits"), + ("Ours (48kHz)", 20_800, "52 bytes × 50 fps × 8 bits = 20.8 kbps"), + ("Ours (24kHz)", 10_400, "52 bytes × 25 fps × 8 bits = 10.4 kbps"), +]; + +/// Verify every AudioAspect is covered by at least one primitive. +/// If an aspect is missing, we have a hole in our codec design. +pub fn verify_aspect_coverage() -> Vec { + use AudioAspect::*; + let all = [SpectralEnvelope, SpectralShape, PerceptualMapping, + PhaseRelationship, SpeakerIdentity, SemanticContent, + MaskingDecision, CodebookLookup]; + + all.iter() + .filter(|&&aspect| !PROVENANCE.iter().any(|p| p.aspect == aspect)) + .copied() + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn all_aspects_covered() { + let missing = verify_aspect_coverage(); + assert!(missing.is_empty(), "Missing audio aspects: {:?}", missing); + } + + #[test] + fn frame_budget_correct() { + // AudioFrame (48) + PhaseDescriptor (4) = 52 + assert_eq!(FRAME_BUDGET, 48 + 4); + // + RvqFrame (17) = 69 + assert_eq!(FRAME_BUDGET_WITH_TTS, 48 + 4 + 17); + } + + #[test] + fn provenance_byte_sizes_consistent() { + // AudioFrame = 42 (energies) + 6 (pvq) = 48 + let af_energies = PROVENANCE.iter().find(|p| p.our_type == "AudioFrame.band_energies").unwrap(); + let af_pvq = PROVENANCE.iter().find(|p| p.our_type == "AudioFrame.pvq_summary").unwrap(); + assert_eq!(af_energies.byte_size + af_pvq.byte_size, 48); + + // RvqFrame = 1 (HEEL) + 8 (HIP) + 8 (TWIG) = 17 + let rvq_heel = PROVENANCE.iter().find(|p| p.our_type == "RvqFrame.archetype (HEEL)").unwrap(); + let rvq_hip = PROVENANCE.iter().find(|p| p.our_type == "RvqFrame.coarse (HIP)").unwrap(); + let rvq_twig = PROVENANCE.iter().find(|p| p.our_type == "RvqFrame.fine (TWIG)").unwrap(); + assert_eq!(rvq_heel.byte_size + rvq_hip.byte_size + rvq_twig.byte_size, 17); + } + + #[test] + fn every_source_codec_represented() { + // All 6 source codecs should appear at least once + for source in [CodecSource::Opus, CodecSource::Whisper, CodecSource::Mp3, + CodecSource::OggVorbis, CodecSource::Bark, CodecSource::ElevenLabs] { + assert!(PROVENANCE.iter().any(|p| p.source == source), + "Codec {:?} not represented in provenance table", source); + } + } + + #[test] + fn our_bitrate_competitive() { + // Our codec should be lower bitrate than all traditional codecs + let ours_24k = BITRATE_COMPARISON.iter() + .find(|&&(name, _, _)| name == "Ours (24kHz)") + .unwrap().1; + let mp3 = BITRATE_COMPARISON.iter() + .find(|&&(name, _, _)| name == "MP3 128k") + .unwrap().1; + assert!(ours_24k < mp3, "Our codec should be lower bitrate than MP3"); + } +} diff --git a/src/hpc/audio/mel.rs b/src/hpc/audio/mel.rs new file mode 100644 index 00000000..d45c3e4f --- /dev/null +++ b/src/hpc/audio/mel.rs @@ -0,0 +1,271 @@ +//! Mel filterbank — transcoded from Whisper's audio preprocessing. +//! +//! 80-channel mel spectrogram at 16kHz, matching Whisper's frontend: +//! PCM 16kHz → STFT (400-sample window, 160-sample hop) → mel filterbank → log scale +//! +//! The mel scale maps linear frequencies to perceptual pitch: +//! mel(f) = 2595 × log₁₀(1 + f/700) +//! +//! Key insight stolen from Whisper: the mel spectrogram IS the phoneme +//! fingerprint space. Each 80-dim mel frame can be compressed to a +//! 6-byte CAM fingerprint for HHTL cascade search. +//! +//! Zero external dependencies — uses `hpc::fft` internally. + +use crate::hpc::fft; +use core::f32::consts::PI; + +/// Number of mel channels (Whisper default). +pub const N_MELS: usize = 80; +/// STFT window size (400 samples = 25ms at 16kHz). +pub const STFT_WINDOW: usize = 400; +/// STFT hop size (160 samples = 10ms at 16kHz). +pub const STFT_HOP: usize = 160; +/// Sample rate for mel computation (Whisper operates at 16kHz). +pub const MEL_SAMPLE_RATE: usize = 16000; +/// FFT size (next power of 2 from STFT_WINDOW). +pub const FFT_SIZE: usize = 512; +/// Number of FFT bins used: FFT_SIZE/2 + 1. +pub const N_FFT_BINS: usize = FFT_SIZE / 2 + 1; + +/// Convert frequency in Hz to mel scale. +/// Whisper uses the Slaney formula: mel = 2595 × log₁₀(1 + f/700) +#[inline] +pub fn hz_to_mel(hz: f32) -> f32 { + 2595.0 * (1.0 + hz / 700.0).log10() +} + +/// Convert mel scale to frequency in Hz. +#[inline] +pub fn mel_to_hz(mel: f32) -> f32 { + 700.0 * (10.0f32.powf(mel / 2595.0) - 1.0) +} + +/// Precomputed mel filterbank matrix: [N_MELS × N_FFT_BINS]. +/// +/// Row-major: `filters[mel * N_FFT_BINS + bin]` = weight for mel channel `mel` +/// at FFT bin `bin`. Each row is a triangular filter centered at the mel-spaced +/// frequency. +/// +/// Build once, reuse for every frame. 80 × 257 × 4 bytes = ~82 KB. +pub fn build_mel_filters(sample_rate: usize, n_fft: usize, n_mels: usize) -> Vec { + let n_bins = n_fft / 2 + 1; + let mut filters = vec![0.0f32; n_mels * n_bins]; + + let f_min = 0.0f32; + let f_max = sample_rate as f32 / 2.0; + let mel_min = hz_to_mel(f_min); + let mel_max = hz_to_mel(f_max); + + // n_mels + 2 points evenly spaced in mel domain + let n_points = n_mels + 2; + let mel_points: Vec = (0..n_points) + .map(|i| mel_min + (mel_max - mel_min) * i as f32 / (n_points - 1) as f32) + .collect(); + + // Convert mel points back to Hz, then to FFT bin indices + let hz_points: Vec = mel_points.iter().map(|&m| mel_to_hz(m)).collect(); + let bin_points: Vec = hz_points.iter() + .map(|&h| h * n_fft as f32 / sample_rate as f32) + .collect(); + + // Build triangular filters + for m in 0..n_mels { + let left = bin_points[m]; + let center = bin_points[m + 1]; + let right = bin_points[m + 2]; + + for bin in 0..n_bins { + let b = bin as f32; + let weight = if b >= left && b < center { + // Rising slope + (b - left) / (center - left).max(1e-10) + } else if b >= center && b <= right { + // Falling slope + (right - b) / (right - center).max(1e-10) + } else { + 0.0 + }; + filters[m * n_bins + bin] = weight; + } + } + + filters +} + +/// Hann window for STFT. +pub fn hann_window(n: usize) -> Vec { + (0..n).map(|i| 0.5 * (1.0 - (2.0 * PI * i as f32 / n as f32).cos())).collect() +} + +/// Compute magnitude spectrogram via STFT. +/// +/// Input: mono f32 PCM at 16kHz. +/// Output: `[n_frames × n_bins]` magnitude values (row-major). +/// +/// Uses `hpc::fft` internally. Window = Hann, hop = 160 samples. +pub fn stft_magnitude(pcm: &[f32], window_size: usize, hop_size: usize) -> Vec { + let n_fft = window_size.next_power_of_two(); + let n_bins = n_fft / 2 + 1; + let window = hann_window(window_size); + + let n_frames = if pcm.len() >= window_size { + (pcm.len() - window_size) / hop_size + 1 + } else { + 0 + }; + + let mut magnitudes = Vec::with_capacity(n_frames * n_bins); + + for frame_idx in 0..n_frames { + let start = frame_idx * hop_size; + + // Apply window, then pack as interleaved [re, im, re, im, ...] + let mut data = vec![0.0f32; 2 * n_fft]; + for i in 0..window_size.min(pcm.len() - start) { + data[2 * i] = pcm[start + i] * window[i]; // real + // imaginary stays 0 + } + + // FFT (interleaved complex: data[2*k] = re, data[2*k+1] = im) + fft::fft_f32(&mut data, n_fft); + + // Magnitude: |X[k]| = sqrt(re² + im²) + for bin in 0..n_bins { + let re = data[2 * bin]; + let im = data[2 * bin + 1]; + let mag = (re * re + im * im).sqrt(); + magnitudes.push(mag); + } + } + + magnitudes +} + +/// Compute 80-channel log mel spectrogram (Whisper frontend). +/// +/// Input: mono f32 PCM at 16kHz. +/// Output: `[n_frames × N_MELS]` log-mel values (row-major). +/// +/// Pipeline: PCM → STFT magnitude → mel filterbank → log scale. +pub fn log_mel_spectrogram(pcm: &[f32]) -> Vec { + let n_bins = FFT_SIZE / 2 + 1; + + // Build mel filters (could be cached, but 82KB is cheap) + let filters = build_mel_filters(MEL_SAMPLE_RATE, FFT_SIZE, N_MELS); + + // STFT magnitude + let mag = stft_magnitude(pcm, STFT_WINDOW, STFT_HOP); + let n_frames = mag.len() / n_bins; + + // Apply mel filterbank + log scale + let mut log_mel = Vec::with_capacity(n_frames * N_MELS); + + for frame in 0..n_frames { + for mel in 0..N_MELS { + let mut energy = 0.0f32; + for bin in 0..n_bins { + energy += filters[mel * n_bins + bin] * mag[frame * n_bins + bin]; + } + // Log scale with floor (Whisper uses max(energy, 1e-10)) + let log_e = energy.max(1e-10).ln(); + log_mel.push(log_e); + } + } + + log_mel +} + +/// Compress an 80-dim mel frame to BF16 (160 bytes → useful for distance). +pub fn mel_frame_to_bf16(frame: &[f32]) -> [u16; N_MELS] { + let mut bf16 = [0u16; N_MELS]; + for i in 0..N_MELS.min(frame.len()) { + let bits = frame[i].to_bits(); + let lsb = (bits >> 16) & 1; + let biased = bits.wrapping_add(0x7FFF).wrapping_add(lsb); + bf16[i] = (biased >> 16) as u16; + } + bf16 +} + +/// L1 distance between two BF16 mel frames (for HHTL cascade). +pub fn mel_l1_bf16(a: &[u16; N_MELS], b: &[u16; N_MELS]) -> f32 { + let mut d = 0.0f32; + for i in 0..N_MELS { + let va = f32::from_bits((a[i] as u32) << 16); + let vb = f32::from_bits((b[i] as u32) << 16); + d += (va - vb).abs(); + } + d +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn mel_hz_roundtrip() { + for &f in &[440.0, 1000.0, 4000.0, 8000.0] { + let mel = hz_to_mel(f); + let back = mel_to_hz(mel); + assert!((f - back).abs() < 0.01, "Roundtrip failed: {} → {} → {}", f, mel, back); + } + } + + #[test] + fn mel_scale_monotonic() { + let m1 = hz_to_mel(100.0); + let m2 = hz_to_mel(1000.0); + let m3 = hz_to_mel(8000.0); + assert!(m1 < m2 && m2 < m3); + // Higher frequencies are compressed in mel scale + assert!((m2 - m1) > (m3 - m2) * 0.3); + } + + #[test] + fn build_filters_shape() { + let filters = build_mel_filters(MEL_SAMPLE_RATE, FFT_SIZE, N_MELS); + assert_eq!(filters.len(), N_MELS * N_FFT_BINS); + // Each mel channel should have some nonzero weights + for mel in 0..N_MELS { + let row_sum: f32 = (0..N_FFT_BINS) + .map(|bin| filters[mel * N_FFT_BINS + bin]) + .sum(); + assert!(row_sum > 0.0, "Mel channel {} has no energy", mel); + } + } + + #[test] + fn log_mel_440hz_sine() { + // 440Hz sine at 16kHz, 1 second + let n_samples = MEL_SAMPLE_RATE; + let pcm: Vec = (0..n_samples) + .map(|i| (2.0 * PI * 440.0 * i as f32 / MEL_SAMPLE_RATE as f32).sin()) + .collect(); + + let log_mel = log_mel_spectrogram(&pcm); + let n_frames = log_mel.len() / N_MELS; + assert!(n_frames > 0, "Should produce at least one frame"); + + // The mel channel containing 440Hz should have high energy + // 440Hz ≈ mel channel ~14 (depends on exact mel spacing) + let frame0 = &log_mel[0..N_MELS]; + let max_mel = frame0.iter() + .enumerate() + .max_by(|a, b| a.1.partial_cmp(b.1).unwrap()) + .unwrap(); + // Peak should be in low-to-mid range (440Hz is low) + assert!(max_mel.0 < 30, "440Hz peak at mel {}, expected < 30", max_mel.0); + } + + #[test] + fn mel_bf16_roundtrip() { + let frame: Vec = (0..N_MELS).map(|i| (i as f32 * 0.1) - 4.0).collect(); + let bf16 = mel_frame_to_bf16(&frame); + for i in 0..N_MELS { + let recovered = f32::from_bits((bf16[i] as u32) << 16); + let err = (frame[i] - recovered).abs(); + assert!(err < 0.1, "BF16 error at mel {}: {:.4} vs {:.4}", i, frame[i], recovered); + } + } +} diff --git a/src/hpc/audio/mod.rs b/src/hpc/audio/mod.rs index d156944b..455be306 100644 --- a/src/hpc/audio/mod.rs +++ b/src/hpc/audio/mod.rs @@ -1,7 +1,15 @@ -//! Audio primitives transcoded from Opus CELT. +//! Audio primitives transcoded from Opus CELT, Whisper, and Bark. //! -//! MDCT, band energy extraction, PVQ, and AudioFrame for the -//! HHTL cascade → waveform synthesis pipeline. +//! Steals the best ideas from each: +//! Opus — MDCT + PVQ gain-shape split + CELT critical bands +//! Whisper — 80-channel mel filterbank (perceptual frequency mapping) +//! Bark — 3-stage RVQ hierarchy (semantic→coarse→fine → HHTL levels) +//! ElevenLabs — voice cloning as archetype embedding (16 i8 channels) +//! +//! AudioFrame (48 bytes) from Opus is the storage format. +//! Mel spectrogram from Whisper is the recognition format. +//! VoiceArchetype (16 bytes) from Bark/ElevenLabs is the speaker identity. +//! RvqFrame (17 bytes) is the compressed TTS output. //! //! Zero external dependencies — uses `hpc::fft` internally. @@ -9,3 +17,9 @@ pub mod mdct; pub mod bands; pub mod pvq; pub mod codec; +pub mod mel; +pub mod voice; +pub mod modes; +pub mod phase; +pub mod codec_map; +pub mod synth; diff --git a/src/hpc/audio/modes.rs b/src/hpc/audio/modes.rs new file mode 100644 index 00000000..9f042ca1 --- /dev/null +++ b/src/hpc/audio/modes.rs @@ -0,0 +1,475 @@ +//! Musical mode progressions via Base17 Quintenzirkel. +//! +//! The 17-dimension golden spiral maps to musical modes via octave stacking: +//! - 17-EDO (17 equal divisions of the octave) approximates both +//! perfect fifths and major thirds better than 12-EDO +//! - Base17 dim rotation = mode rotation (Dorian↔Lydian = offset change) +//! - Golden step (11/17) visits all 17 dims without repetition, +//! like the circle of fifths visits all 12 chromatic notes +//! +//! Mode-to-qualia mapping for TTS: +//! Ionian (I): bright, confident → gate stride 8 (broad routing) +//! Dorian (ii): warm, reflective → V stride 5 (content retrieval) +//! Phrygian (iii): dark, exotic → QK stride 3 (tight attention) +//! Lydian (IV): dreamy, floating → Up stride 2 (fine expansion) +//! Mixolydian (V): driving, bluesy → Down stride 4 (compression) +//! Aeolian (vi): sad, minor → QK stride 3 (shifted start) +//! Locrian (vii°): unstable, tense → Gate stride 8 (shifted start) +//! +//! The stride IS the mode. The start offset IS the key. +//! No lookup table needed — the address geometry encodes the qualia. + +use super::bands; + +/// Musical modes as qualia progressions. +/// +/// Each mode is defined by its interval pattern (in 17-EDO steps) +/// and maps to a Base17 stride for spectral character. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub enum Mode { + Ionian, // Major: W-W-H-W-W-W-H → bright, resolved + Dorian, // Minor with ♮6: warm, jazz + Phrygian, // Minor with ♭2: dark, flamenco + Lydian, // Major with ♯4: dreamy, floating + Mixolydian, // Major with ♭7: dominant, bluesy + Aeolian, // Natural minor: sad, reflective + Locrian, // Diminished: unstable, tense +} + +impl Mode { + /// Map mode to highheelbgz stride (voice qualia selector). + /// + /// The stride determines how the spectral envelope is sampled: + /// larger stride = coarser sampling = broader routing + /// smaller stride = finer sampling = tighter detail + pub fn stride(&self) -> u32 { + match self { + Mode::Ionian => 8, // Gate: broad, confident + Mode::Dorian => 5, // V: warm content + Mode::Phrygian => 3, // QK: tight, exotic + Mode::Lydian => 2, // Up: fine, dreamy + Mode::Mixolydian => 4, // Down: driving compression + Mode::Aeolian => 3, // QK: minor, offset start + Mode::Locrian => 8, // Gate: unstable, offset start + } + } + + /// Start offset in Base17 space (key signature). + /// + /// The offset rotates the golden spiral walk, changing which + /// spectral dimensions are sampled first — equivalent to + /// transposing the key. + pub fn start_offset(&self) -> u32 { + match self { + Mode::Ionian => 0, + Mode::Dorian => 2, + Mode::Phrygian => 4, + Mode::Lydian => 5, + Mode::Mixolydian => 7, + Mode::Aeolian => 9, + Mode::Locrian => 11, + } + } + + /// 17-EDO interval pattern (steps in 17-EDO). + /// + /// 17-EDO: W=3 steps, H=2 steps, total=17 steps per octave. + /// This is more accurate than 12-EDO for both fifths and thirds. + pub fn intervals_17edo(&self) -> [u8; 7] { + match self { + Mode::Ionian => [3, 3, 2, 3, 3, 3, 0], // W W H W W W (last H implicit) + Mode::Dorian => [3, 2, 3, 3, 3, 2, 1], // W H W W W H W-1 + Mode::Phrygian => [2, 3, 3, 3, 2, 3, 1], // H W W W H W W-1 + Mode::Lydian => [3, 3, 3, 2, 3, 3, 0], // W W W H W W (last H implicit) + Mode::Mixolydian => [3, 3, 2, 3, 3, 2, 1], // W W H W W H W-1 + Mode::Aeolian => [3, 2, 3, 3, 2, 3, 1], // W H W W H W W-1 + Mode::Locrian => [2, 3, 3, 2, 3, 3, 1], // H W W H W W W-1 + } + } + + /// Tension level (0.0 = resolved, 1.0 = maximally tense). + /// + /// Derived from the tritone content and leading tone quality. + /// Maps to HHTL skip threshold: low tension → aggressive skipping, + /// high tension → less skipping (preserve detail). + pub fn tension(&self) -> f32 { + match self { + Mode::Ionian => 0.1, // most resolved + Mode::Lydian => 0.2, // floating but stable + Mode::Mixolydian => 0.3, // dominant tension + Mode::Dorian => 0.4, // warm but minor + Mode::Aeolian => 0.6, // sad minor + Mode::Phrygian => 0.8, // dark, exotic + Mode::Locrian => 1.0, // maximum instability + } + } +} + +/// Band energy modulation by mode. +/// +/// Each mode emphasizes different frequency regions, creating the +/// characteristic "color" of the mode. Applied as a multiplier +/// on the 21 Opus CELT band energies. +/// +/// Ionian boosts presence (2-4 kHz) for brightness. +/// Phrygian boosts sub-bass and cuts presence for darkness. +/// Lydian boosts harmonics (4-8 kHz) for shimmer. +pub fn mode_band_weights(mode: Mode) -> [f32; bands::N_BANDS] { + let mut weights = [1.0f32; bands::N_BANDS]; + + match mode { + Mode::Ionian => { + // Bright: boost presence (bands 10-14, ~2-5 kHz) + for i in 10..=14 { weights[i] = 1.3; } + } + Mode::Dorian => { + // Warm: boost low-mid (bands 4-8, ~800-1800 Hz) + for i in 4..=8 { weights[i] = 1.2; } + } + Mode::Phrygian => { + // Dark: boost sub-bass (bands 0-3), cut presence + for i in 0..=3 { weights[i] = 1.4; } + for i in 10..=14 { weights[i] = 0.7; } + } + Mode::Lydian => { + // Shimmering: boost harmonics (bands 14-18, ~5-13 kHz) + for i in 14..=18 { weights[i] = 1.3; } + } + Mode::Mixolydian => { + // Driving: boost fundamental + mid (bands 2-6, ~400-1400 Hz) + for i in 2..=6 { weights[i] = 1.25; } + } + Mode::Aeolian => { + // Sad: slight low emphasis, gentle roll-off + for i in 0..=5 { weights[i] = 1.15; } + for i in 16..=20 { weights[i] = 0.85; } + } + Mode::Locrian => { + // Unstable: emphasize dissonant regions + weights[6] = 1.4; // ~1400 Hz tritone region + weights[13] = 1.3; // ~3400 Hz + for i in 0..=2 { weights[i] = 0.8; } // weaken root + } + } + + weights +} + +/// Apply mode coloring to band energies. +/// +/// Modulates band energies by the mode's characteristic weights. +/// Used in the TTS pipeline: archetype → band energies → mode color → synthesis. +pub fn apply_mode(energies: &mut [f32; bands::N_BANDS], mode: Mode) { + let weights = mode_band_weights(mode); + for i in 0..bands::N_BANDS { + energies[i] *= weights[i]; + } +} + +/// Circle of fifths progression as mode sequence. +/// +/// Returns the classic I → IV → V → I progression in mode space. +/// Each step has a mode and a root offset in 17-EDO steps. +/// +/// For TTS: modulate voice character through a progression to +/// create natural-sounding prosody contours. +pub fn circle_of_fifths_progression() -> Vec<(Mode, u32)> { + vec![ + (Mode::Ionian, 0), // I (tonic, resolved) + (Mode::Lydian, 5), // IV (subdominant, floating) + (Mode::Mixolydian, 7), // V (dominant, driving) + (Mode::Ionian, 0), // I (return to tonic) + ] +} + +/// Minor progression: i → iv → VI → V → i +pub fn minor_progression() -> Vec<(Mode, u32)> { + vec![ + (Mode::Aeolian, 0), // i (tonic minor) + (Mode::Dorian, 5), // iv (subdominant, warm) + (Mode::Ionian, 8), // VI (relative major, bright) + (Mode::Mixolydian, 7), // V (dominant, driving) + (Mode::Aeolian, 0), // i (return) + ] +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Octave compression: same tone across octaves → one transposed band +// ═══════════════════════════════════════════════════════════════════════════ + +/// Octave-compressed band modulation. +/// +/// Key insight: harmonics of the same pitch class have identical spectral +/// SHAPE, just shifted in frequency by powers of 2. A C2 (65 Hz) and C4 +/// (262 Hz) produce the same overtone ratios — only the fundamental moves. +/// +/// So instead of storing band energies for every octave separately, store +/// ONE canonical modulation pattern and an octave offset. The pattern is +/// applied at `band_offset + octave * bands_per_octave`. +/// +/// Compression ratio: 8 octaves × 21 bands → 1 pattern + 3-bit offset = 90% +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct OctaveBand { + /// Canonical band modulation pattern (one octave's worth). + /// Normalized: sum = 1.0. Applied as weights to the 3 bands + /// spanning one octave at the given offset. + pub pattern: [f32; 3], + /// Octave offset (0 = lowest, 7 = highest). + /// Selects which 3-band group in the 21-band Opus layout to modulate. + pub octave: u8, +} + +impl OctaveBand { + /// Number of Opus bands per octave (approximately 3 in the log-spaced layout). + pub const BANDS_PER_OCTAVE: usize = 3; + + /// Map an octave offset to the starting Opus band index. + /// + /// Opus CELT bands are quasi-logarithmic, so each ~3 bands ≈ 1 octave: + /// octave 0: bands 0-2 (~0-600 Hz, sub-bass to bass) + /// octave 1: bands 3-5 (~600-1200 Hz, low-mid) + /// octave 2: bands 6-8 (~1200-1800 Hz, mid) + /// octave 3: bands 9-11 (~1800-3000 Hz, presence) + /// octave 4: bands 12-14 (~3000-4800 Hz, brilliance) + /// octave 5: bands 15-17 (~4800-8000 Hz, air) + /// octave 6: bands 18-20 (~8000-24000 Hz, ultra) + pub fn start_band(&self) -> usize { + (self.octave as usize * Self::BANDS_PER_OCTAVE).min(bands::N_BANDS - Self::BANDS_PER_OCTAVE) + } + + /// Apply this octave-compressed modulation to 21-band energies. + /// + /// Only modifies the 3 bands at `start_band()..start_band()+3`. + /// All other bands are untouched. + pub fn apply(&self, energies: &mut [f32; bands::N_BANDS]) { + let start = self.start_band(); + for i in 0..Self::BANDS_PER_OCTAVE { + if start + i < bands::N_BANDS { + energies[start + i] *= self.pattern[i]; + } + } + } + + /// Transpose: shift this pattern up or down by N octaves. + /// + /// Same pitch class, different register. The pattern is unchanged, + /// only the octave offset moves. This IS the compression: all octaves + /// of a note share the same pattern. + pub fn transpose(&self, delta: i8) -> Self { + OctaveBand { + pattern: self.pattern, + octave: (self.octave as i8 + delta).clamp(0, 6) as u8, + } + } + + /// Build from a fundamental frequency. + /// + /// The pattern captures the harmonic envelope at that frequency: + /// pattern[0] = fundamental energy weight + /// pattern[1] = 2nd harmonic weight + /// pattern[2] = 3rd harmonic weight + /// + /// The harmonic decay rate determines voice character: + /// steep decay → flute/sine (pure tone) + /// gradual decay → strings/voice (rich harmonics) + /// flat → noise/percussion + pub fn from_fundamental(freq_hz: f32, harmonic_decay: f32) -> Self { + // Determine octave from frequency (A0 = 27.5 Hz reference) + let octave = ((freq_hz / 27.5).max(1.0).log2()).floor() as u8; + + // Build harmonic pattern with given decay rate + let pattern = [ + 1.0, // fundamental (always 1.0) + harmonic_decay, // 2nd harmonic + harmonic_decay * harmonic_decay, // 3rd harmonic + ]; + + // Normalize so sum = 1.0 + some headroom + let sum: f32 = pattern.iter().sum(); + let norm = [pattern[0] / sum * 3.0, pattern[1] / sum * 3.0, pattern[2] / sum * 3.0]; + + OctaveBand { pattern: norm, octave: octave.min(6) } + } + + /// Compress a full 21-band energy vector to octave bands. + /// + /// Groups bands into 7 octave triplets, keeping only the + /// normalized pattern within each. Returns 7 OctaveBands. + /// + /// Original: 21 × f32 = 84 bytes + /// Compressed: 7 × (3 × f32 + u8) = 91 bytes (no savings for one frame) + /// BUT: if many frames share the same pattern (same pitch class), + /// store pattern ONCE + per-frame octave offset = massive savings. + pub fn compress_to_octaves(energies: &[f32; bands::N_BANDS]) -> [OctaveBand; 7] { + let mut result = [OctaveBand { pattern: [1.0; 3], octave: 0 }; 7]; + for oct in 0..7 { + let start = oct * Self::BANDS_PER_OCTAVE; + let mut pattern = [0.0f32; 3]; + let mut sum = 0.0f32; + for i in 0..Self::BANDS_PER_OCTAVE { + if start + i < bands::N_BANDS { + pattern[i] = energies[start + i]; + sum += pattern[i]; + } + } + // Normalize + if sum > 1e-10 { + for p in &mut pattern { *p /= sum; *p *= 3.0; } + } + result[oct] = OctaveBand { pattern, octave: oct as u8 }; + } + result + } +} + +/// Pitch class: one of 17 pitch classes in 17-EDO. +/// +/// In 17-EDO, each pitch class maps to a Base17 dimension: +/// dim 0 = "C", dim 1 = "C♯↓", dim 2 = "D♭", ... +/// The golden step (11/17) walks all 17 in the same order +/// that the circle of fifths walks 12 in 12-EDO. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct PitchClass17(pub u8); + +impl PitchClass17 { + /// The golden step interval (11 steps in 17-EDO ≈ perfect fifth). + /// gcd(11, 17) = 1, so iterating generates all 17 classes. + pub const GOLDEN_STEP: u8 = 11; + + /// Circle of fifths in 17-EDO: iterates through all 17 pitch classes. + pub fn circle_of_fifths() -> Vec { + let mut result = Vec::with_capacity(17); + let mut current = 0u8; + for _ in 0..17 { + result.push(PitchClass17(current)); + current = (current + Self::GOLDEN_STEP) % 17; + } + result + } + + /// Interval between two pitch classes (in 17-EDO steps). + pub fn interval(&self, other: &PitchClass17) -> u8 { + ((other.0 as i8 - self.0 as i8).rem_euclid(17)) as u8 + } + + /// Map pitch class to Base17 dimension index. + /// Identity mapping: pitch class N = dimension N. + pub fn base17_dim(&self) -> usize { + self.0 as usize + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn mode_stride_matches_highheelbgz() { + // Verify stride→role mapping is consistent with highheelbgz::TensorRole + assert_eq!(Mode::Ionian.stride(), 8); // Gate + assert_eq!(Mode::Dorian.stride(), 5); // V + assert_eq!(Mode::Phrygian.stride(), 3); // QK + assert_eq!(Mode::Lydian.stride(), 2); // Up + assert_eq!(Mode::Mixolydian.stride(), 4); // Down + } + + #[test] + fn mode_tension_ordered() { + // Ionian is least tense, Locrian is most + assert!(Mode::Ionian.tension() < Mode::Aeolian.tension()); + assert!(Mode::Aeolian.tension() < Mode::Locrian.tension()); + } + + #[test] + fn band_weights_centered() { + // All mode weights should average close to 1.0 + for mode in [Mode::Ionian, Mode::Dorian, Mode::Phrygian, + Mode::Lydian, Mode::Mixolydian, Mode::Aeolian, Mode::Locrian] { + let weights = mode_band_weights(mode); + let avg: f32 = weights.iter().sum::() / bands::N_BANDS as f32; + assert!(avg > 0.8 && avg < 1.3, + "Mode {:?} weights avg {:.2} — should be ~1.0", mode, avg); + } + } + + #[test] + fn circle_of_fifths_starts_and_ends_tonic() { + let prog = circle_of_fifths_progression(); + assert_eq!(prog.first().unwrap().0, Mode::Ionian); + assert_eq!(prog.last().unwrap().0, Mode::Ionian); + assert_eq!(prog.first().unwrap().1, prog.last().unwrap().1); + } + + #[test] + fn intervals_sum_to_17() { + // Each mode's intervals should sum close to 17 (one octave in 17-EDO) + for mode in [Mode::Ionian, Mode::Dorian, Mode::Phrygian, + Mode::Lydian, Mode::Mixolydian, Mode::Aeolian, Mode::Locrian] { + let intervals = mode.intervals_17edo(); + let sum: u8 = intervals.iter().sum(); + // 7 intervals sum to 17 (W=3, H=2): 5W+2H = 5×3+2×2 = 19? + // Actually in 17-EDO: 5×3+2×2 = 19, but we use 7 scale degrees + // The sum should be ≤ 17 (the remaining step completes the octave) + assert!(sum <= 17, "Mode {:?} intervals sum to {} > 17", mode, sum); + } + } + + #[test] + fn apply_mode_preserves_nonzero() { + let mut energies = [1.0f32; bands::N_BANDS]; + apply_mode(&mut energies, Mode::Phrygian); + // All energies should still be positive + for (i, &e) in energies.iter().enumerate() { + assert!(e > 0.0, "Band {} energy went to zero after Phrygian mode", i); + } + } + + #[test] + fn octave_transpose_preserves_pattern() { + let ob = OctaveBand::from_fundamental(440.0, 0.5); + let up = ob.transpose(2); + let down = ob.transpose(-1); + // Pattern should be identical, only octave changes + assert_eq!(ob.pattern, up.pattern); + assert_eq!(ob.pattern, down.pattern); + assert_ne!(ob.octave, up.octave); + } + + #[test] + fn octave_compress_roundtrip() { + let mut energies = [0.0f32; bands::N_BANDS]; + // Put energy at 440Hz band region (approximately band 9-11) + energies[9] = 1.0; + energies[10] = 0.5; + energies[11] = 0.25; + let octaves = OctaveBand::compress_to_octaves(&energies); + // Octave 3 (bands 9-11) should have the most energy in pattern[0] + assert!(octaves[3].pattern[0] > octaves[3].pattern[2], + "Octave 3 pattern should peak at fundamental: {:?}", octaves[3].pattern); + // The fundamental (1.0) should have ~57% of the energy (1.0 / 1.75 × 3) + assert!(octaves[3].pattern[0] > 1.5, "Fundamental weight should be > 1.5: {}", octaves[3].pattern[0]); + } + + #[test] + fn circle_of_fifths_17_visits_all() { + let cof = PitchClass17::circle_of_fifths(); + assert_eq!(cof.len(), 17); + // All 17 pitch classes should appear exactly once + let mut seen = [false; 17]; + for pc in &cof { + assert!(!seen[pc.0 as usize], "Pitch class {} visited twice", pc.0); + seen[pc.0 as usize] = true; + } + assert!(seen.iter().all(|&s| s), "Not all pitch classes visited"); + } + + #[test] + fn pitch_class_interval() { + let c = PitchClass17(0); + let g = PitchClass17(10); // 10/17 ≈ perfect fifth in 17-EDO + assert_eq!(c.interval(&g), 10); + // Golden step = 11 ≈ also a fifth (the just one) + let g_just = PitchClass17(11); + assert_eq!(c.interval(&g_just), PitchClass17::GOLDEN_STEP); + } +} diff --git a/src/hpc/audio/phase.rs b/src/hpc/audio/phase.rs new file mode 100644 index 00000000..18dd6684 --- /dev/null +++ b/src/hpc/audio/phase.rs @@ -0,0 +1,330 @@ +//! Phase shift dynamics — measuring what amplitude alone misses. +//! +//! Amplitude tells you WHAT frequencies are present. +//! Phase tells you HOW they relate to each other in time. +//! +//! Phase coherence between harmonics: +//! High coherence → voiced sound (vowels, singing, resonance) +//! Low coherence → noise (consonants, breath, static) +//! Phase locked → natural voice +//! Phase random → synthetic/robotic +//! +//! Phase gradient across frames: +//! Steady phase → sustained note (singing, humming) +//! Rotating phase → vibrato, tremolo +//! Phase discontinuity → attack, plosive, glottal stop +//! +//! Maps to QPL dims: +//! Phase coherence → coherence (dim 9) + clarity (dim 4) +//! Phase gradient → velocity (dim 7) + integration (dim 16) +//! Phase stability → groundedness (dim 14) +//! Phase entropy → entropy (dim 8) +//! +//! Uses the same STFT from mel.rs but keeps phase info instead of +//! discarding it (which is what magnitude spectrograms do). + +use crate::hpc::fft; +use core::f32::consts::PI; +use super::bands; + +/// Phase coherence between adjacent harmonics within one frame. +/// +/// Measures how "locked" the harmonics are to each other. +/// Natural voice: harmonics are phase-locked (coherence ≈ 1.0). +/// Noise: random phase relationships (coherence ≈ 0.0). +/// +/// Returns per-band coherence values [0.0, 1.0]. +pub fn band_phase_coherence( + real: &[f32], + imag: &[f32], +) -> [f32; bands::N_BANDS] { + let mut coherence = [0.0f32; bands::N_BANDS]; + + for band in 0..bands::N_BANDS { + let lo = bands::CELT_BANDS_48K[band]; + let hi = bands::CELT_BANDS_48K[band + 1].min(real.len().min(imag.len())); + if hi <= lo + 1 { continue; } + + // Phase differences between adjacent bins within this band + let mut cos_sum = 0.0f64; + let mut sin_sum = 0.0f64; + let mut count = 0u32; + + for i in lo..(hi - 1) { + if i >= real.len() || i + 1 >= real.len() { break; } + let phase_i = imag[i].atan2(real[i]); + let phase_next = imag[i + 1].atan2(real[i + 1]); + let diff = phase_next - phase_i; + cos_sum += diff.cos() as f64; + sin_sum += diff.sin() as f64; + count += 1; + } + + if count > 0 { + // Resultant length of unit vectors (circular mean) + let r = ((cos_sum * cos_sum + sin_sum * sin_sum).sqrt()) / count as f64; + coherence[band] = r.min(1.0) as f32; + } + } + + coherence +} + +/// Phase gradient between two consecutive frames. +/// +/// Measures how much phase rotates between frames at each band. +/// Steady gradient → sustained pitch (the gradient IS the frequency). +/// Changing gradient → pitch modulation (vibrato, portamento). +/// Zero gradient → DC or silence. +/// +/// Returns per-band gradient in radians/frame. +pub fn phase_gradient( + prev_real: &[f32], prev_imag: &[f32], + curr_real: &[f32], curr_imag: &[f32], +) -> [f32; bands::N_BANDS] { + let mut gradient = [0.0f32; bands::N_BANDS]; + + for band in 0..bands::N_BANDS { + let lo = bands::CELT_BANDS_48K[band]; + let hi = bands::CELT_BANDS_48K[band + 1] + .min(prev_real.len()) + .min(curr_real.len()); + if hi <= lo { continue; } + + let mut total_diff = 0.0f64; + let mut count = 0u32; + + for i in lo..hi { + if i >= prev_real.len() || i >= curr_real.len() { break; } + let prev_phase = prev_imag[i].atan2(prev_real[i]); + let curr_phase = curr_imag[i].atan2(curr_real[i]); + // Unwrap phase difference to [-π, π] + let mut diff = curr_phase - prev_phase; + while diff > PI { diff -= 2.0 * PI; } + while diff < -PI { diff += 2.0 * PI; } + total_diff += diff.abs() as f64; + count += 1; + } + + if count > 0 { + gradient[band] = (total_diff / count as f64) as f32; + } + } + + gradient +} + +/// Compact phase descriptor: 4 bytes capturing the essential phase dynamics. +/// +/// byte 0: overall coherence (0=noise, 255=perfectly locked harmonics) +/// byte 1: gradient magnitude (0=static, 255=rapid phase rotation) +/// byte 2: coherence entropy (0=uniform coherence, 255=mixed voiced/unvoiced) +/// byte 3: gradient stability (0=steady pitch, 255=rapidly changing pitch) +/// +/// These 4 bytes complement AudioFrame's PVQ summary: +/// PVQ summary = amplitude shape (WHAT) +/// Phase descriptor = temporal relationship (HOW) +/// +/// Together: complete nonverbal vocal characterization in 52 bytes. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct PhaseDescriptor { + pub bytes: [u8; 4], +} + +impl PhaseDescriptor { + /// Build from band coherence and gradient. + pub fn from_bands(coherence: &[f32; bands::N_BANDS], gradient: &[f32; bands::N_BANDS]) -> Self { + // Overall coherence: weighted mean (weight mid-bands more — voice formants) + let mut coh_sum = 0.0f32; + let mut weight_sum = 0.0f32; + for i in 0..bands::N_BANDS { + let w = if (4..=14).contains(&i) { 2.0 } else { 1.0 }; // voice range weight + coh_sum += coherence[i] * w; + weight_sum += w; + } + let mean_coherence = coh_sum / weight_sum.max(1.0); + + // Gradient magnitude: RMS of per-band gradients + let grad_rms = (gradient.iter().map(|g| g * g).sum::() / bands::N_BANDS as f32).sqrt(); + + // Coherence entropy: are some bands voiced and others not? + let mut coh_entropy = 0.0f32; + let coh_total: f32 = coherence.iter().sum::().max(1e-10); + for &c in coherence { + if c > 1e-10 { + let p = c / coh_total; + coh_entropy -= p * p.ln(); + } + } + let max_entropy = (bands::N_BANDS as f32).ln(); + let norm_coh_entropy = coh_entropy / max_entropy; + + // Gradient stability: std dev of gradients (high = changing pitch) + let grad_mean = gradient.iter().sum::() / bands::N_BANDS as f32; + let grad_var = gradient.iter() + .map(|g| (g - grad_mean) * (g - grad_mean)) + .sum::() / bands::N_BANDS as f32; + let grad_std = grad_var.sqrt(); + + PhaseDescriptor { + bytes: [ + (mean_coherence * 255.0).clamp(0.0, 255.0) as u8, + (grad_rms * 255.0 / PI).clamp(0.0, 255.0) as u8, + (norm_coh_entropy * 255.0).clamp(0.0, 255.0) as u8, + (grad_std * 255.0 / PI).clamp(0.0, 255.0) as u8, + ], + } + } + + /// Map phase descriptor to QPL dims it informs. + /// + /// Returns (coherence→dim9, clarity→dim4, velocity→dim7, + /// entropy→dim8, groundedness→dim14). + pub fn to_qualia_dims(&self) -> [(usize, f32); 5] { + let coherence = self.bytes[0] as f32 / 255.0; + let gradient = self.bytes[1] as f32 / 255.0; + let coh_entropy = self.bytes[2] as f32 / 255.0; + let stability = 1.0 - self.bytes[3] as f32 / 255.0; + + [ + (9, coherence), // coherence: phase-locked = unified + (4, coherence), // clarity: locked harmonics = clear + (7, gradient), // velocity: phase rotation = movement + (8, coh_entropy), // entropy: mixed voiced/unvoiced + (14, stability), // groundedness: steady pitch = rooted + ] + } + + /// Is this a voiced frame? (coherence > threshold) + pub fn is_voiced(&self) -> bool { + self.bytes[0] > 128 // > 50% coherence + } + + /// Is this an attack/plosive? (low coherence + high gradient) + pub fn is_attack(&self) -> bool { + self.bytes[0] < 64 && self.bytes[1] > 128 + } +} + +/// STFT with phase preservation. +/// +/// Returns (magnitude_per_frame, real_per_frame, imag_per_frame). +/// Each frame has n_fft/2+1 bins. +pub fn stft_with_phase( + pcm: &[f32], + window_size: usize, + hop_size: usize, +) -> (Vec>, Vec>, Vec>) { + let n_fft = window_size.next_power_of_two(); + let n_bins = n_fft / 2 + 1; + let window: Vec = (0..window_size) + .map(|i| 0.5 * (1.0 - (2.0 * PI * i as f32 / window_size as f32).cos())) + .collect(); + + let n_frames = if pcm.len() >= window_size { + (pcm.len() - window_size) / hop_size + 1 + } else { + 0 + }; + + let mut mags = Vec::with_capacity(n_frames); + let mut reals = Vec::with_capacity(n_frames); + let mut imags = Vec::with_capacity(n_frames); + + for frame_idx in 0..n_frames { + let start = frame_idx * hop_size; + let mut data = vec![0.0f32; 2 * n_fft]; + for i in 0..window_size.min(pcm.len() - start) { + data[2 * i] = pcm[start + i] * window[i]; + } + + fft::fft_f32(&mut data, n_fft); + + let mut mag = Vec::with_capacity(n_bins); + let mut real = Vec::with_capacity(n_bins); + let mut imag = Vec::with_capacity(n_bins); + + for bin in 0..n_bins { + let re = data[2 * bin]; + let im = data[2 * bin + 1]; + mag.push((re * re + im * im).sqrt()); + real.push(re); + imag.push(im); + } + + mags.push(mag); + reals.push(real); + imags.push(imag); + } + + (mags, reals, imags) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sine_has_high_coherence() { + // Pure 440Hz sine → all energy in one bin → high coherence + let n = 1024; + let pcm: Vec = (0..n) + .map(|i| (2.0 * PI * 440.0 * i as f32 / 48000.0).sin()) + .collect(); + + let (_mags, reals, imags) = stft_with_phase(&pcm, 512, 256); + if reals.is_empty() { return; } + + let coh = band_phase_coherence(&reals[0], &imags[0]); + // At least one band should have high coherence (the one with 440Hz) + let max_coh = coh.iter().cloned().fold(0.0f32, f32::max); + assert!(max_coh > 0.3, "Pure sine should have coherent band: max={}", max_coh); + } + + #[test] + fn noise_has_low_coherence() { + // White noise → random phases → low coherence + let n = 1024; + let mut rng = 0x12345678u64; + let pcm: Vec = (0..n).map(|_| { + rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + ((rng >> 33) as f32 / (1u64 << 31) as f32) * 2.0 - 1.0 + }).collect(); + + let (_mags, reals, imags) = stft_with_phase(&pcm, 512, 256); + if reals.is_empty() { return; } + + let coh = band_phase_coherence(&reals[0], &imags[0]); + let mean_coh: f32 = coh.iter().sum::() / bands::N_BANDS as f32; + // Noise should have lower mean coherence than pure tone + assert!(mean_coh < 0.8, "Noise should have moderate-low coherence: mean={}", mean_coh); + } + + #[test] + fn phase_descriptor_voiced_detection() { + let voiced_coh = [0.9f32; bands::N_BANDS]; + let steady_grad = [0.1f32; bands::N_BANDS]; + let desc = PhaseDescriptor::from_bands(&voiced_coh, &steady_grad); + assert!(desc.is_voiced(), "High coherence should be voiced"); + assert!(!desc.is_attack(), "Steady should not be attack"); + } + + #[test] + fn phase_descriptor_attack_detection() { + let noise_coh = [0.1f32; bands::N_BANDS]; + let high_grad = [2.0f32; bands::N_BANDS]; + let desc = PhaseDescriptor::from_bands(&noise_coh, &high_grad); + assert!(!desc.is_voiced(), "Low coherence should not be voiced"); + assert!(desc.is_attack(), "Low coherence + high gradient = attack"); + } + + #[test] + fn phase_to_qualia_dims_valid() { + let desc = PhaseDescriptor { bytes: [200, 50, 100, 30] }; + let dims = desc.to_qualia_dims(); + for (dim_idx, value) in dims { + assert!(dim_idx < 17, "Invalid dim index: {}", dim_idx); + assert!(value >= 0.0 && value <= 1.0, "Dim {} value out of range: {}", dim_idx, value); + } + } +} diff --git a/src/hpc/audio/synth.rs b/src/hpc/audio/synth.rs new file mode 100644 index 00000000..c72c406e --- /dev/null +++ b/src/hpc/audio/synth.rs @@ -0,0 +1,369 @@ +//! Synthesize pipeline: VoiceFrame → AudioFrame → iMDCT → PCM → WAV. +//! +//! This is the missing piece identified in lance-graph PR #168: +//! "AudioFrame not connected to HHTL cascade levels" +//! "WAV synthesis was bits-as-vectors — needs audio primitives" +//! +//! The pipeline: +//! 1. VoiceFrame (21B) → decompose into RvqFrame + PhaseDescriptor +//! 2. RvqFrame.archetype → VoiceCodebook lookup → VoiceArchetype (16B) +//! 3. RvqFrame.coarse → band energy prediction (8 codes → 21 BF16 bands) +//! 4. RvqFrame.fine → PVQ shape refinement (8 codes → 6B summary) +//! 5. PhaseDescriptor → phase-modulate the reconstructed bands +//! 6. AudioFrame.decode_coarse() → iMDCT → PCM +//! 7. Overlap-add consecutive frames → continuous PCM stream +//! 8. Write WAV header + PCM → .wav file +//! +//! The mode coloring (from Qualia17D → Mode → family_band_weights) is +//! applied at step 3: band energies are scaled by the QPL family's +//! spectral EQ before synthesis. + +use super::codec::AudioFrame; +use super::bands; +use super::voice::{VoiceArchetype, VoiceCodebook, VoiceFrame, RvqFrame}; +use super::phase::PhaseDescriptor; +use super::modes; + +/// Decode a sequence of VoiceFrames into PCM audio. +/// +/// This is the complete synthesis pipeline: +/// VoiceFrame → AudioFrame → iMDCT → overlap-add → PCM +/// +/// `codebook`: the voice codebook (256 archetypes) for speaker lookup. +/// `coarse_centroids`: 256 × 21 BF16 band energy centroids (from HHTL HIP level). +/// `sample_rate`: output sample rate (48000 for Opus compatibility). +/// +/// Returns mono f32 PCM samples. +pub fn synthesize( + frames: &[VoiceFrame], + codebook: &VoiceCodebook, + coarse_centroids: &[[u16; bands::N_BANDS]; 256], + sample_rate: u32, +) -> Vec { + if frames.is_empty() { return vec![]; } + + // Frame parameters (Opus CELT compatible) + let frame_samples = 960; // 20ms at 48kHz + let hop_size = frame_samples / 2; // 50% overlap + let total_samples = hop_size * (frames.len() + 1); + let mut output = vec![0.0f32; total_samples]; + + for (idx, vf) in frames.iter().enumerate() { + // Step 1: Decompose VoiceFrame + let rvq = &vf.rvq; + let phase = &vf.phase; + + // Step 2: Look up voice archetype + let archetype_idx = rvq.archetype as usize; + let _archetype = if archetype_idx < codebook.entries.len() { + codebook.entries[archetype_idx] + } else { + VoiceArchetype::zero() + }; + + // Step 3: Reconstruct band energies from coarse codes + // Each coarse code indexes into the centroid table + let band_energies = reconstruct_band_energies(rvq, coarse_centroids); + + // Step 4: Build AudioFrame from predicted energies + PVQ summary from fine codes + let pvq_summary = fine_to_pvq_summary(&rvq.fine); + let audio_frame = AudioFrame { + band_energies, + pvq_summary, + }; + + // Step 5: Phase modulation — adjust band energies based on phase coherence + // Voiced frames get boosted mid-bands, attacks get transient emphasis + let modulated = phase_modulate_frame(&audio_frame, phase); + + // Step 6: Decode to PCM via iMDCT + let pcm = modulated.decode_coarse(); + + // Step 7: Overlap-add into output buffer + let start = idx * hop_size; + let overlap_len = pcm.len().min(total_samples - start); + for i in 0..overlap_len { + // Hann window for smooth overlap-add + let t = i as f32 / pcm.len() as f32; + let window = 0.5 * (1.0 - (2.0 * core::f32::consts::PI * t).cos()); + output[start + i] += pcm[i] * window; + } + } + + // Resample if needed (our MDCT produces at 48kHz, caller may want 24kHz) + if sample_rate == 24000 { + // Simple 2:1 decimation with averaging + output = output.chunks(2) + .map(|c| if c.len() == 2 { (c[0] + c[1]) * 0.5 } else { c[0] }) + .collect(); + } + + output +} + +/// Reconstruct 21 BF16 band energies from RvqFrame coarse codes. +/// +/// Each coarse code (0-255) indexes the HHTL HIP-level centroid table. +/// The 8 coarse codes cover overlapping band groups: +/// code[0]: bands 0-2 (sub-bass + bass) +/// code[1]: bands 3-5 (low-mid) +/// code[2]: bands 6-8 (mid) +/// code[3]: bands 9-11 (upper-mid) +/// code[4]: bands 12-14 (presence) +/// code[5]: bands 15-17 (brilliance) +/// code[6]: bands 18-20 (air) +/// code[7]: global gain (scales all bands) +fn reconstruct_band_energies( + rvq: &RvqFrame, + centroids: &[[u16; bands::N_BANDS]; 256], +) -> [u16; bands::N_BANDS] { + // Start with the centroid pointed to by code[0] (base spectral shape) + let base = centroids[rvq.coarse[0] as usize]; + let mut energies = base; + + // Blend in contributions from other coarse codes per band group + let band_groups: [(usize, usize); 7] = [ + (0, 3), (3, 6), (6, 9), (9, 12), (12, 15), (15, 18), (18, 21), + ]; + + for (group_idx, &(lo, hi)) in band_groups.iter().enumerate() { + let code_idx = group_idx + 1; + if code_idx >= 8 { break; } + let centroid = ¢roids[rvq.coarse[code_idx] as usize]; + for band in lo..hi.min(bands::N_BANDS) { + // Weighted blend: 60% base + 40% group-specific centroid + let base_f = f32::from_bits((energies[band] as u32) << 16); + let group_f = f32::from_bits((centroid[band] as u32) << 16); + let blended = base_f * 0.6 + group_f * 0.4; + energies[band] = (blended.to_bits() >> 16) as u16; + } + } + + // Global gain from code[7] + let gain = (rvq.coarse[7] as f32) / 128.0; // 0.0 to ~2.0 + for band in 0..bands::N_BANDS { + let e = f32::from_bits((energies[band] as u32) << 16); + let scaled = e * gain; + energies[band] = (scaled.to_bits() >> 16) as u16; + } + + energies +} + +/// Convert 8 fine RVQ codes to a 6-byte PVQ summary. +/// +/// The fine codes carry spectral detail within each band group. +/// We compress them to the AudioFrame's 6-byte PVQ summary format: +/// bytes 0-1: sign pattern (from fine[0..2]) +/// bytes 2-3: temporal gradient (from fine[2..5]) +/// bytes 4-5: harmonic detail (from fine[5..8]) +fn fine_to_pvq_summary(fine: &[u8; 8]) -> [u8; 6] { + [ + fine[0] ^ fine[1], // sign pattern XOR + fine[1] ^ fine[2], // sign pattern continuation + fine[2], // temporal gradient + fine[3] ^ fine[4], // temporal modulation + fine[5], // harmonic detail + fine[6] ^ fine[7], // harmonic modulation + ] +} + +/// Apply phase modulation to an AudioFrame. +/// +/// Voiced frames (high coherence): boost mid-band energy (formants). +/// Attacks (low coherence + high gradient): sharpen transient. +/// Noise (low coherence + low gradient): spread energy more evenly. +fn phase_modulate_frame(frame: &AudioFrame, phase: &PhaseDescriptor) -> AudioFrame { + let mut out = *frame; + let coherence = phase.bytes[0] as f32 / 255.0; + let gradient = phase.bytes[1] as f32 / 255.0; + + for band in 0..bands::N_BANDS { + let e = f32::from_bits((out.band_energies[band] as u32) << 16); + let modulated = if phase.is_voiced() { + // Voiced: boost formant region (bands 4-14), suppress extremes + if (4..=14).contains(&band) { + e * (1.0 + coherence * 0.3) + } else { + e * (1.0 - coherence * 0.1) + } + } else if phase.is_attack() { + // Attack: boost all bands briefly (transient energy) + e * (1.0 + gradient * 0.5) + } else { + // Noise: flatten spectrum slightly + e * (1.0 + (0.5 - coherence) * 0.2) + }; + out.band_energies[band] = (modulated.to_bits() >> 16) as u16; + } + + out +} + +/// Write PCM samples as a 16-bit WAV file. +/// +/// Mono, little-endian, standard PCM format. +/// The WAV file is complete and playable by any audio software. +pub fn write_wav(pcm: &[f32], sample_rate: u32) -> Vec { + let n_samples = pcm.len(); + let bits_per_sample: u16 = 16; + let n_channels: u16 = 1; + let byte_rate = sample_rate * (bits_per_sample as u32 / 8) * n_channels as u32; + let block_align = n_channels * (bits_per_sample / 8); + let data_size = (n_samples * 2) as u32; + let file_size = 36 + data_size; + + let mut wav = Vec::with_capacity(44 + n_samples * 2); + + // RIFF header + wav.extend_from_slice(b"RIFF"); + wav.extend_from_slice(&file_size.to_le_bytes()); + wav.extend_from_slice(b"WAVE"); + + // fmt sub-chunk + wav.extend_from_slice(b"fmt "); + wav.extend_from_slice(&16u32.to_le_bytes()); // sub-chunk size + wav.extend_from_slice(&1u16.to_le_bytes()); // PCM format + wav.extend_from_slice(&n_channels.to_le_bytes()); + wav.extend_from_slice(&sample_rate.to_le_bytes()); + wav.extend_from_slice(&byte_rate.to_le_bytes()); + wav.extend_from_slice(&block_align.to_le_bytes()); + wav.extend_from_slice(&bits_per_sample.to_le_bytes()); + + // data sub-chunk + wav.extend_from_slice(b"data"); + wav.extend_from_slice(&data_size.to_le_bytes()); + + // Normalize and convert to i16 + let max_abs = pcm.iter().map(|s| s.abs()).fold(0.0f32, f32::max).max(1e-10); + let scale = 32767.0 / max_abs; + + for &sample in pcm { + let s = (sample * scale).clamp(-32768.0, 32767.0) as i16; + wav.extend_from_slice(&s.to_le_bytes()); + } + + wav +} + +/// Validate a WAV byte buffer (basic sanity check). +pub fn validate_wav(wav: &[u8]) -> Result<(u32, usize), &'static str> { + if wav.len() < 44 { return Err("WAV too short"); } + if &wav[0..4] != b"RIFF" { return Err("Missing RIFF header"); } + if &wav[8..12] != b"WAVE" { return Err("Missing WAVE format"); } + if &wav[12..16] != b"fmt " { return Err("Missing fmt chunk"); } + + let sample_rate = u32::from_le_bytes([wav[24], wav[25], wav[26], wav[27]]); + let data_start = 44; // standard PCM WAV + let data_size = wav.len() - data_start; + let n_samples = data_size / 2; // 16-bit samples + + Ok((sample_rate, n_samples)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn write_wav_valid_header() { + let pcm = vec![0.5f32; 4800]; // 100ms at 48kHz + let wav = write_wav(&pcm, 48000); + let (sr, n) = validate_wav(&wav).unwrap(); + assert_eq!(sr, 48000); + assert_eq!(n, 4800); + } + + #[test] + fn write_wav_nonzero_samples() { + let pcm: Vec = (0..960) + .map(|i| (2.0 * core::f32::consts::PI * 440.0 * i as f32 / 48000.0).sin()) + .collect(); + let wav = write_wav(&pcm, 48000); + // Check data section has nonzero content + let data = &wav[44..]; + let nonzero = data.iter().filter(|&&b| b != 0).count(); + assert!(nonzero > data.len() / 4, "WAV data should be mostly nonzero"); + } + + #[test] + fn synthesize_empty_returns_empty() { + let codebook = VoiceCodebook { entries: vec![VoiceArchetype::zero()] }; + let centroids = [[0u16; bands::N_BANDS]; 256]; + let pcm = synthesize(&[], &codebook, ¢roids, 48000); + assert!(pcm.is_empty()); + } + + #[test] + fn synthesize_single_frame() { + let codebook = VoiceCodebook { entries: vec![VoiceArchetype::zero(); 256] }; + // Create centroids with some energy in mid-bands + let mut centroids = [[0u16; bands::N_BANDS]; 256]; + for c in centroids.iter_mut() { + for band in 4..14 { + // Set BF16 value for 0.1 (reasonable band energy) + c[band] = (0.1f32.to_bits() >> 16) as u16; + } + } + + let frame = VoiceFrame { + rvq: RvqFrame { archetype: 0, coarse: [0, 0, 0, 0, 0, 0, 0, 128], fine: [128; 8] }, + phase: PhaseDescriptor { bytes: [200, 30, 128, 50] }, // voiced, steady + }; + + let pcm = synthesize(&[frame], &codebook, ¢roids, 48000); + assert!(!pcm.is_empty(), "Should produce samples"); + let energy: f32 = pcm.iter().map(|s| s * s).sum(); + assert!(energy > 0.0, "Should have nonzero energy"); + } + + #[test] + fn fine_to_pvq_deterministic() { + let fine = [1u8, 2, 3, 4, 5, 6, 7, 8]; + let a = fine_to_pvq_summary(&fine); + let b = fine_to_pvq_summary(&fine); + assert_eq!(a, b); + } + + #[test] + fn phase_modulate_voiced_boosts_mid() { + let mut energies = [0u16; bands::N_BANDS]; + for band in 0..bands::N_BANDS { + energies[band] = (0.5f32.to_bits() >> 16) as u16; + } + let frame = AudioFrame { band_energies: energies, pvq_summary: [0; 6] }; + let voiced = PhaseDescriptor { bytes: [255, 30, 128, 50] }; // high coherence + + let modulated = phase_modulate_frame(&frame, &voiced); + + // Mid-bands (4-14) should be boosted + let mid_orig: f32 = (4..=14).map(|b| f32::from_bits((frame.band_energies[b] as u32) << 16)).sum(); + let mid_mod: f32 = (4..=14).map(|b| f32::from_bits((modulated.band_energies[b] as u32) << 16)).sum(); + assert!(mid_mod > mid_orig, "Voiced phase should boost mid-bands: {} vs {}", mid_mod, mid_orig); + } + + #[test] + fn roundtrip_encode_synthesize() { + // Encode a 440Hz sine, then synthesize back + let pcm: Vec = (0..1024) + .map(|i| (2.0 * core::f32::consts::PI * 440.0 * i as f32 / 48000.0).sin()) + .collect(); + + let audio_frame = AudioFrame::encode(&pcm, 8); + + // Build a codebook with this frame's energies as the only centroid + let codebook = VoiceCodebook { entries: vec![VoiceArchetype::zero(); 256] }; + let mut centroids = [[0u16; bands::N_BANDS]; 256]; + centroids[0] = audio_frame.band_energies; + + let voice_frame = VoiceFrame { + rvq: RvqFrame { archetype: 0, coarse: [0, 0, 0, 0, 0, 0, 0, 128], fine: [0; 8] }, + phase: PhaseDescriptor { bytes: [200, 30, 128, 50] }, + }; + + let decoded = synthesize(&[voice_frame], &codebook, ¢roids, 48000); + assert!(!decoded.is_empty()); + let energy: f32 = decoded.iter().map(|s| s * s).sum(); + assert!(energy > 0.0, "Roundtrip should preserve energy"); + } +} diff --git a/src/hpc/audio/voice.rs b/src/hpc/audio/voice.rs new file mode 100644 index 00000000..ff051c4c --- /dev/null +++ b/src/hpc/audio/voice.rs @@ -0,0 +1,442 @@ +//! VoiceArchetype — transcoded from Bark's 3-stage RVQ hierarchy. +//! +//! Bark's 3-stage pipeline (semantic GPT-2 → coarse GPT-2 → fine model) +//! maps directly to HHTL cascade levels: +//! +//! HEEL: VoiceArchetype (16 i8 channels — voice identity qualia) +//! HIP: spectral envelope (21 BF16 band energies from Opus bands) +//! TWIG: PVQ fine detail (6-byte harmonic signature) +//! LEAF: full iMDCT → PCM waveform +//! +//! ElevenLabs insight: voice cloning = archetype embedding. +//! A 16-channel i8 vector captures speaker identity: +//! channels 0-3: pitch register (bass/tenor/alto/soprano) +//! channels 4-7: resonance (chest/head/nasal/breathy) +//! channels 8-11: articulation (crisp/smooth/rough/whisper) +//! channels 12-15: prosody (flat/dynamic/staccato/legato) +//! +//! Total: 16 bytes per voice identity. Fits in one SIMD lane. + +/// Number of voice archetype channels. +pub const N_VOICE_CHANNELS: usize = 16; + +/// VoiceArchetype: 16 i8 channels capturing voice identity. +/// +/// Maps to Bark's semantic tokens (Stage 1): the coarse "what kind of voice" +/// decision, before any spectral detail. L1 distance between archetypes +/// predicts voice similarity. +/// +/// The 16 channels correspond to perceptual voice qualia: +/// Pitch register, resonance, articulation, prosody. +/// +/// Compression: 16 bytes (vs Bark's 1024-dim semantic token embedding). +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +pub struct VoiceArchetype { + pub channels: [i8; N_VOICE_CHANNELS], +} + +impl VoiceArchetype { + pub const BYTE_SIZE: usize = N_VOICE_CHANNELS; + + /// Zero archetype (neutral voice). + pub fn zero() -> Self { + VoiceArchetype { channels: [0i8; N_VOICE_CHANNELS] } + } + + /// L1 distance between two archetypes. + #[inline] + pub fn l1(&self, other: &VoiceArchetype) -> u32 { + let mut d = 0u32; + for i in 0..N_VOICE_CHANNELS { + d += (self.channels[i] as i32 - other.channels[i] as i32).unsigned_abs(); + } + d + } + + /// Cosine similarity (for voice matching). + pub fn cosine(&self, other: &VoiceArchetype) -> f64 { + let mut dot = 0i64; + let mut na = 0i64; + let mut nb = 0i64; + for i in 0..N_VOICE_CHANNELS { + let a = self.channels[i] as i64; + let b = other.channels[i] as i64; + dot += a * b; + na += a * a; + nb += b * b; + } + let denom = ((na as f64) * (nb as f64)).sqrt(); + if denom < 1e-12 { 0.0 } else { dot as f64 / denom } + } + + /// Extract archetype from raw embedding by quantizing to 16 channels. + /// + /// Takes a high-dimensional embedding (e.g., Bark's 1024-dim semantic token + /// or ElevenLabs' speaker embedding) and compresses to 16 i8 channels + /// via strided sampling + quantization. + /// + /// The stride determines which embedding dimensions map to which channels: + /// dim[0], dim[stride], dim[2*stride], ... → channels 0..15 + pub fn from_embedding(embedding: &[f32], stride: usize) -> Self { + let mut channels = [0i8; N_VOICE_CHANNELS]; + + // Find scale factor for quantization to i8 range + let max_abs = embedding.iter() + .map(|v| v.abs()) + .fold(0.0f32, f32::max) + .max(1e-10); + let scale = 127.0 / max_abs; + + for ch in 0..N_VOICE_CHANNELS { + let dim = ch * stride.max(1); + if dim < embedding.len() { + channels[ch] = (embedding[dim] * scale).clamp(-128.0, 127.0) as i8; + } + } + + VoiceArchetype { channels } + } + + /// Serialize to bytes. + pub fn to_bytes(&self) -> [u8; N_VOICE_CHANNELS] { + let mut bytes = [0u8; N_VOICE_CHANNELS]; + for i in 0..N_VOICE_CHANNELS { + bytes[i] = self.channels[i] as u8; + } + bytes + } + + /// Deserialize from bytes. + pub fn from_bytes(bytes: &[u8; N_VOICE_CHANNELS]) -> Self { + let mut channels = [0i8; N_VOICE_CHANNELS]; + for i in 0..N_VOICE_CHANNELS { + channels[i] = bytes[i] as i8; + } + VoiceArchetype { channels } + } + + /// Pitch register (channels 0-3 magnitude). + pub fn pitch_energy(&self) -> u32 { + (0..4).map(|i| self.channels[i].unsigned_abs() as u32).sum() + } + + /// Resonance quality (channels 4-7 magnitude). + pub fn resonance_energy(&self) -> u32 { + (4..8).map(|i| self.channels[i].unsigned_abs() as u32).sum() + } + + /// Articulation quality (channels 8-11 magnitude). + pub fn articulation_energy(&self) -> u32 { + (8..12).map(|i| self.channels[i].unsigned_abs() as u32).sum() + } + + /// Prosody quality (channels 12-15 magnitude). + pub fn prosody_energy(&self) -> u32 { + (12..16).map(|i| self.channels[i].unsigned_abs() as u32).sum() + } + + /// Modulate archetype with phase dynamics. + /// + /// Phase coherence sharpens articulation channels (8-11). + /// Phase gradient boosts prosody channels (12-15). + /// This is the bridge: amplitude identity (archetype) + temporal + /// dynamics (phase) = complete voice characterization. + /// + /// The phase descriptor IS relative pressure within — it modulates + /// the archetype's channels proportionally, not by overwriting. + pub fn modulate_with_phase(&self, phase: &super::phase::PhaseDescriptor) -> Self { + let mut out = *self; + + // Phase coherence → sharpen articulation (high coherence = crisp) + let coherence = phase.bytes[0] as i16; // 0-255 + for i in 8..12 { + // Scale articulation channels toward their sign direction + let sign = if out.channels[i] >= 0 { 1i16 } else { -1 }; + let boost = sign * (coherence - 128) / 8; // ±16 max + out.channels[i] = (out.channels[i] as i16 + boost).clamp(-127, 127) as i8; + } + + // Phase gradient → boost prosody dynamics (high gradient = dynamic) + let gradient = phase.bytes[1] as i16; + for i in 12..16 { + let sign = if out.channels[i] >= 0 { 1i16 } else { -1 }; + let boost = sign * (gradient - 128) / 8; + out.channels[i] = (out.channels[i] as i16 + boost).clamp(-127, 127) as i8; + } + + out + } +} + +/// VoiceCodebook: collection of voice archetypes for HHTL routing. +/// +/// Maps to Bark Stage 1: the set of "voice types" the system knows about. +/// Each voice in the codebook is a prototype speaker pattern. +/// New speakers are matched to nearest archetype via L1 distance. +/// +/// For a 256-entry codebook: 256 × 16 bytes = 4 KB. +#[derive(Clone, Debug)] +pub struct VoiceCodebook { + pub entries: Vec, +} + +impl VoiceCodebook { + /// Build from raw embeddings (e.g., from Bark speaker prompts). + pub fn build(embeddings: &[Vec], stride: usize) -> Self { + let entries: Vec = embeddings.iter() + .map(|e| VoiceArchetype::from_embedding(e, stride)) + .collect(); + VoiceCodebook { entries } + } + + /// Find nearest archetype. + pub fn nearest(&self, query: &VoiceArchetype) -> (u8, u32) { + let mut best_idx = 0u8; + let mut best_dist = u32::MAX; + for (i, entry) in self.entries.iter().enumerate() { + let d = query.l1(entry); + if d < best_dist { + best_dist = d; + best_idx = i as u8; + } + } + (best_idx, best_dist) + } + + /// Build 256 × 256 distance table for HHTL cascade. + /// + /// Returns a flat `[k × k]` u16 table (same format as AttentionTable). + pub fn build_distance_table(&self) -> Vec { + let k = self.entries.len(); + let mut table = vec![0u16; k * k]; + for i in 0..k { + for j in (i + 1)..k { + let d = self.entries[i].l1(&self.entries[j]); + // Scale to u16: max L1 for 16 i8 channels = 16 × 255 = 4080 + let scaled = ((d as u32 * 65535) / 4080).min(65535) as u16; + table[i * k + j] = scaled; + table[j * k + i] = scaled; + } + } + table + } + + /// Byte size. + pub fn byte_size(&self) -> usize { + self.entries.len() * VoiceArchetype::BYTE_SIZE + } +} + +/// RVQ code frame: Bark's 3-stage output compressed to HHTL levels. +/// +/// Stage 1 (semantic) → HEEL: voice archetype index (1 byte) +/// Stage 2 (coarse) → HIP: 8 coarse spectral codes (8 bytes) +/// Stage 3 (fine) → TWIG: 8 fine detail codes (8 bytes) +/// +/// Total: 17 bytes per frame (vs Bark's ~128 bytes per frame). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct RvqFrame { + /// HEEL: voice archetype index (0-255). + pub archetype: u8, + /// HIP: coarse spectral codes (8 codebook indices). + pub coarse: [u8; 8], + /// TWIG: fine detail codes (8 codebook indices). + pub fine: [u8; 8], +} + +impl RvqFrame { + pub const BYTE_SIZE: usize = 17; + + /// Serialize to 17 bytes. + pub fn to_bytes(&self) -> [u8; Self::BYTE_SIZE] { + let mut bytes = [0u8; Self::BYTE_SIZE]; + bytes[0] = self.archetype; + bytes[1..9].copy_from_slice(&self.coarse); + bytes[9..17].copy_from_slice(&self.fine); + bytes + } + + /// Deserialize from 17 bytes. + pub fn from_bytes(bytes: &[u8; Self::BYTE_SIZE]) -> Self { + let mut coarse = [0u8; 8]; + let mut fine = [0u8; 8]; + coarse.copy_from_slice(&bytes[1..9]); + fine.copy_from_slice(&bytes[9..17]); + RvqFrame { archetype: bytes[0], coarse, fine } + } + + /// HEEL check: same voice archetype? + #[inline] + pub fn same_voice(&self, other: &RvqFrame) -> bool { + self.archetype == other.archetype + } + + /// HIP distance: L1 over coarse codes. + pub fn coarse_l1(&self, other: &RvqFrame) -> u32 { + let mut d = 0u32; + for i in 0..8 { + d += (self.coarse[i] as i32 - other.coarse[i] as i32).unsigned_abs(); + } + d + } +} + +/// Complete voice frame: RVQ codes + phase dynamics. +/// +/// The full 21-byte nonverbal unit: +/// RvqFrame (17B): WHAT the voice is doing (identity + spectral + detail) +/// PhaseDescriptor (4B): HOW the harmonics relate in time +/// +/// This is the minimum viable unit for lossless nonverbal transmission. +/// AudioFrame (48B) + PhaseDescriptor (4B) = 52B is the analysis frame. +/// VoiceFrame (21B) is the compressed synthesis frame. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct VoiceFrame { + pub rvq: RvqFrame, + pub phase: super::phase::PhaseDescriptor, +} + +impl VoiceFrame { + pub const BYTE_SIZE: usize = RvqFrame::BYTE_SIZE + 4; // 21 bytes + + pub fn to_bytes(&self) -> [u8; Self::BYTE_SIZE] { + let mut bytes = [0u8; Self::BYTE_SIZE]; + bytes[..17].copy_from_slice(&self.rvq.to_bytes()); + bytes[17..21].copy_from_slice(&self.phase.bytes); + bytes + } + + pub fn from_bytes(bytes: &[u8; Self::BYTE_SIZE]) -> Self { + let mut rvq_bytes = [0u8; 17]; + rvq_bytes.copy_from_slice(&bytes[..17]); + let mut phase_bytes = [0u8; 4]; + phase_bytes.copy_from_slice(&bytes[17..21]); + VoiceFrame { + rvq: RvqFrame::from_bytes(&rvq_bytes), + phase: super::phase::PhaseDescriptor { bytes: phase_bytes }, + } + } + + /// Is this a voiced frame? (delegates to phase) + pub fn is_voiced(&self) -> bool { + self.phase.is_voiced() + } + + /// Is this an attack/plosive? (delegates to phase) + pub fn is_attack(&self) -> bool { + self.phase.is_attack() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn archetype_self_distance_zero() { + let a = VoiceArchetype { channels: [10, -20, 30, -40, 50, -60, 70, -80, + 90, -100, 110, -120, 5, -15, 25, -35] }; + assert_eq!(a.l1(&a), 0); + } + + #[test] + fn archetype_self_cosine_one() { + let a = VoiceArchetype { channels: [10, -20, 30, -40, 50, -60, 70, -80, + 1, 2, 3, 4, 5, 6, 7, 8] }; + let c = a.cosine(&a); + assert!((c - 1.0).abs() < 1e-10, "Self cosine should be 1.0: {}", c); + } + + #[test] + fn archetype_from_embedding() { + let emb: Vec = (0..1024).map(|i| (i as f32 * 0.1) - 51.2).collect(); + let arch = VoiceArchetype::from_embedding(&emb, 64); + // Should be nonzero + let mag: u32 = arch.channels.iter().map(|&c| c.unsigned_abs() as u32).sum(); + assert!(mag > 0, "Archetype should be nonzero"); + } + + #[test] + fn archetype_serialize_roundtrip() { + let a = VoiceArchetype { channels: [1, -2, 3, -4, 5, -6, 7, -8, + 9, -10, 11, -12, 13, -14, 15, -16] }; + let bytes = a.to_bytes(); + let recovered = VoiceArchetype::from_bytes(&bytes); + assert_eq!(a, recovered); + } + + #[test] + fn codebook_nearest() { + let entries = vec![ + VoiceArchetype { channels: [100; 16] }, + VoiceArchetype { channels: [-100; 16] }, + VoiceArchetype { channels: [0; 16] }, + ]; + let cb = VoiceCodebook { entries }; + let query = VoiceArchetype { channels: [90; 16] }; + let (idx, dist) = cb.nearest(&query); + assert_eq!(idx, 0, "Should match first entry"); + assert!(dist < 200, "Should be close: {}", dist); + } + + #[test] + fn rvq_frame_roundtrip() { + let frame = RvqFrame { + archetype: 42, + coarse: [1, 2, 3, 4, 5, 6, 7, 8], + fine: [10, 20, 30, 40, 50, 60, 70, 80], + }; + let bytes = frame.to_bytes(); + let recovered = RvqFrame::from_bytes(&bytes); + assert_eq!(frame, recovered); + } + + #[test] + fn phase_modulation_changes_articulation() { + let base = VoiceArchetype { channels: [0, 0, 0, 0, 0, 0, 0, 0, + 50, 50, 50, 50, 0, 0, 0, 0] }; + // High coherence → should boost articulation channels + let high_coh = super::super::phase::PhaseDescriptor { bytes: [255, 128, 128, 128] }; + let modulated = base.modulate_with_phase(&high_coh); + + // Articulation channels (8-11) should be boosted + let base_art: i32 = (8..12).map(|i| base.channels[i].unsigned_abs() as i32).sum(); + let mod_art: i32 = (8..12).map(|i| modulated.channels[i].unsigned_abs() as i32).sum(); + assert!(mod_art >= base_art, "High coherence should boost articulation: {} vs {}", mod_art, base_art); + } + + #[test] + fn voice_frame_roundtrip() { + let frame = VoiceFrame { + rvq: RvqFrame { archetype: 7, coarse: [1; 8], fine: [2; 8] }, + phase: super::super::phase::PhaseDescriptor { bytes: [200, 50, 100, 30] }, + }; + let bytes = frame.to_bytes(); + assert_eq!(bytes.len(), VoiceFrame::BYTE_SIZE); + let recovered = VoiceFrame::from_bytes(&bytes); + assert_eq!(frame, recovered); + } + + #[test] + fn voice_frame_size() { + assert_eq!(VoiceFrame::BYTE_SIZE, 21, "VoiceFrame should be 21 bytes (17 RVQ + 4 phase)"); + } + + #[test] + fn distance_table_symmetric() { + let entries = vec![ + VoiceArchetype { channels: [10; 16] }, + VoiceArchetype { channels: [-10; 16] }, + VoiceArchetype { channels: [50; 16] }, + ]; + let cb = VoiceCodebook { entries }; + let table = cb.build_distance_table(); + let k = 3; + for i in 0..k { + for j in 0..k { + assert_eq!(table[i * k + j], table[j * k + i], + "Distance table not symmetric at ({}, {})", i, j); + } + } + } +} diff --git a/src/simd_amx.rs b/src/simd_amx.rs index a092f2f1..9bc688af 100644 --- a/src/simd_amx.rs +++ b/src/simd_amx.rs @@ -31,17 +31,74 @@ // ═══════════════════════════════════════════════════════════════════════════ /// Check if AMX hardware is present AND OS-enabled. +/// +/// Two checks required: +/// 1. CPUID.07H.0H:EDX bits 24 (AMX-TILE) + 25 (AMX-INT8) = CPU supports it +/// 2. XCR0 bits 17 (TILECFG) + 18 (TILEDATA) = OS has enabled tile state +/// +/// The XCR0 check is critical: even if CPUID reports AMX, the hypervisor +/// may not have enabled the XSTATE for tiles. Without OS enablement, +/// LDTILECFG will SIGILL. +/// +/// Previous bug: used CPUID leaf 0xD (reports what CPU supports for XSAVE) +/// instead of _xgetbv(0) (reports what OS actually enabled). The old check +/// could return true on a hypervisor that advertises AMX in CPUID but +/// hasn't set XCR0 bits 17+18. #[cfg(target_arch = "x86_64")] pub fn amx_available() -> bool { + // Step 1: CPU supports AMX-TILE + AMX-INT8? let cpuid = core::arch::x86_64::__cpuid_count(7, 0); let amx_tile = (cpuid.edx >> 24) & 1; let amx_int8 = (cpuid.edx >> 25) & 1; if amx_tile == 0 || amx_int8 == 0 { return false; } - // Check OS enabled via XCR0 bits 17+18 - let xcr0 = core::arch::x86_64::__cpuid_count(0xD, 0); - let tilecfg = (xcr0.eax >> 17) & 1; - let tiledata = (xcr0.eax >> 18) & 1; - tilecfg == 1 && tiledata == 1 + + // Step 2: OS enabled XSAVE? (CPUID.01H:ECX bit 27 = OSXSAVE) + let cpuid_01 = core::arch::x86_64::__cpuid(1); + let osxsave = (cpuid_01.ecx >> 27) & 1; + if osxsave == 0 { return false; } + + // Step 3: OS actually enabled tile state in XCR0? + // _xgetbv(0) reads the ACTUAL XCR0 register (what the OS set), + // not the CPUID-reported capability. + // Bit 17 = TILECFG, Bit 18 = TILEDATA. Both must be set. + let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) }; + let tilecfg = (xcr0 >> 17) & 1; + let tiledata = (xcr0 >> 18) & 1; + if tilecfg == 0 || tiledata == 0 { return false; } + + // Step 4: Request XCOMP_PERM for TILEDATA. + // Linux kernel 5.19+: processes must call prctl(ARCH_REQ_XCOMP_PERM, 18) + // to request permission for TILEDATA (XFEATURE 18) before using AMX. + // Without this, LDTILECFG will SIGILL even if XCR0 bits are set. + // The prctl either succeeds (0) or fails (-1) — idempotent, safe to call + // multiple times. + #[cfg(target_os = "linux")] + { + const SYS_PRCTL: i64 = 157; // x86_64 syscall number for prctl + const ARCH_REQ_XCOMP_PERM: i64 = 0x1023; + const XFEATURE_XTILEDATA: i64 = 18; + // SAFETY: syscall(prctl, ARCH_REQ_XCOMP_PERM, 18) is a simple permission + // request. It either grants tile permission (returns 0) or fails (returns + // -errno). No side effects on failure. Idempotent. + let ret: i64; + unsafe { + core::arch::asm!( + "syscall", + inlateout("rax") SYS_PRCTL => ret, + in("rdi") ARCH_REQ_XCOMP_PERM, + in("rsi") XFEATURE_XTILEDATA, + in("rdx") 0i64, + in("r10") 0i64, + in("r8") 0i64, + lateout("rcx") _, + lateout("r11") _, + options(nostack), + ); + } + if ret != 0 { return false; } + } + + true } #[cfg(not(target_arch = "x86_64"))] @@ -203,17 +260,25 @@ pub fn vnni_matvec_scalar( /// Runtime-dispatched VNNI MatVec: avx512vnni → avxvnniint8 → scalar i32. /// -/// Three tiers, mutually exclusive by hardware generation: +/// Three tiers, checked in order (first match wins): /// avx512vnni — 64 MACs/instr (zmm, Cascade Lake+, Zen 4+) /// avxvnniint8 — 32 MACs/instr (ymm, Arrow Lake, NUC 14 i9-185H) -/// scalar i32 — only for non-x86 or testing (caller should prefer F32x16 FMA) +/// scalar i32 — only for non-x86 or testing +/// +/// IMPORTANT: avxvnniint8 (VNNI2, 256-bit) is NEVER reached when +/// avx512vnni (VNNI512) is present. This is correct: +/// - CPUs with avx512vnni always have 512-bit VPDPBUSD (faster) +/// - avxvnniint8 exists ONLY for CPUs that dropped AVX-512 +/// but added 256-bit VNNI (Arrow Lake, Meteor Lake U-series) +/// - The two instructions have DIFFERENT encodings: +/// avx512vnni: EVEX-encoded VPDPBUSD zmm (512-bit) +/// avxvnniint8: VEX-encoded VPDPBUSD ymm (256-bit) +/// - Running EVEX VPDPBUSD on a VEX-only CPU = SIGILL +/// - Running VEX VPDPBUSD on an EVEX CPU = works but wastes half the width /// -/// NOTE: The scalar path here does i32 multiply-accumulate, NOT f32. -/// For the thinking engine, F32x16 FMA (16 MACs/instr) is the true floor. -/// This scalar path exists only for correctness on non-x86 targets. /// The thinking engine's cycle_auto() dispatches: /// VNNI detected → cycle_vnni() → this function -/// No VNNI → cycle() → F32x16 (never reaches here) +/// No VNNI → cycle() → F32x16 FMA (never reaches here) pub fn matvec_dispatch( table: &[u8], energy_i8: &[i8],