Skip to content
39 changes: 31 additions & 8 deletions .claude/AMX_GOTCHAS.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,41 @@ For CPUID leaf 7 (AMX detection): use `__cpuid_count()`, not inline asm.

---

## Gotcha 4: OS must enable AMX via XSETBV
## Gotcha 4: OS must enable AMX via XSETBV + process must request permission

AMX tiles are large (8 KB of state). The OS must opt in via XCR0 bits 17+18.
Linux 5.19+ enables AMX by default. Older kernels: SIGILL on tile instructions.
AMX tiles are large (8 KB of state). Two levels of OS enablement required:

1. **Kernel enables tile state in XCR0** (bits 17+18). Linux 5.19+ does this.
2. **Process requests XCOMP_PERM** via `arch_prctl(ARCH_REQ_XCOMP_PERM, 18)` (the x86-specific `arch_prctl` syscall, not `prctl`).
Without this, LDTILECFG will SIGILL even if XCR0 bits are set.

**Detection (stable)**:
```rust
let xcr0 = core::arch::x86_64::__cpuid_count(0xD, 0);
let tilecfg = (xcr0.eax >> 17) & 1; // bit 17 = XTILECFG
let tiledata = (xcr0.eax >> 18) & 1; // bit 18 = XTILEDATA
// Both must be 1
```
// Step 1: CPUID — does CPU support AMX?
let cpuid = core::arch::x86_64::__cpuid_count(7, 0);
let amx_tile = (cpuid.edx >> 24) & 1;
let amx_int8 = (cpuid.edx >> 25) & 1;

// Step 2: OSXSAVE — does OS support XSAVE?
let cpuid_01 = core::arch::x86_64::__cpuid(1);
let osxsave = (cpuid_01.ecx >> 27) & 1;

// Step 3: _xgetbv(0) — did OS ACTUALLY enable tile state?
// ⚠ Do NOT use __cpuid_count(0xD, 0) — that reports what CPU SUPPORTS,
// not what the OS ENABLED. _xgetbv(0) reads the actual XCR0 register.
let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) };
let tilecfg = (xcr0 >> 17) & 1; // bit 17 = XTILECFG
let tiledata = (xcr0 >> 18) & 1; // bit 18 = XTILEDATA

// Step 4: arch_prctl — request tile permission for this process
// SYS_arch_prctl = 158 (x86_64; 157 is SYS_prctl, which rejects this option),
// ARCH_REQ_XCOMP_PERM = 0x1023, XFEATURE_XTILEDATA = 18
// Raw syscall returns 0 on success, -errno on failure. Idempotent.
```

**Previous bug**: `__cpuid_count(0xD, 0)` reports XSAVE state component bitmap
(what the CPU *supports*), NOT the actual XCR0 value (what the OS *enabled*).
On hypervisors that advertise AMX in CPUID but don't enable tile state,
the old check returned `true` → SIGILL on LDTILECFG.

---

Expand Down
297 changes: 297 additions & 0 deletions src/hpc/audio/codec_map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
//! Codec provenance map: which real codec each primitive comes from.
//!
//! Every primitive in this audio stack was stolen from a production codec.
//! Nothing invented — only transcoded and compressed to fit the HHTL cascade.
//!
//! ```text
//! ┌─────────────┬──────────┬─────────┬────────┬─────────┬──────┬───────────┐
//! │ Our type │ Opus │ Whisper │ MP3 │ Vorbis │ Bark │ ElevenLabs│
//! ├─────────────┼──────────┼─────────┼────────┼─────────┼──────┼───────────┤
//! │ MDCT │ CELT │ │ hybrid │ ✓ │ │ │
//! │ 21 bands │ eBands48 │ │ 32 sub │ ✓ │ │ │
//! │ PVQ shape │ CELT PVQ │ │ │ residue │ │ │
//! │ Mel 80ch │ │ frontend│ │ │ │ │
//! │ Phase 4B │ │ STFT ∠ │ │ │ │ │
//! │ VoiceArch │ │ │ │ │ spk │ embedding │
//! │ RvqFrame │ │ │ │ │ 3stg │ │
//! │ OctaveBand │ │ │ ✓ │ floor │ │ │
//! │ Mode │ │ │ │ │ │ emotion │
//! │ HHTL skip │ │ │ mask │ floor │ │ │
//! │ CompLinear │ │ │ │ VQ cb │ RVQ │ │
//! │ Qualia17D │ (QPL) │ │ │ │ sem │ emotion │
//! └─────────────┴──────────┴─────────┴────────┴─────────┴──────┴───────────┘
//! ```
//!
//! The architecture replaces neural inference with graph search at every stage:
//! MP3's psychoacoustic model → HHTL cascade (RouteAction::Skip)
//! Whisper's transformer → phoneme graph shortest path
//! Bark's 3 GPT-2 stages → 3 HHTL levels (HEEL/HIP/TWIG)
//! Vorbis's codebook VQ → CompiledLinear VNNI palette lookup
//! ElevenLabs' voice cloning → VoiceArchetype 16-byte embedding

/// Production codec (or model) that an audio primitive was derived from.
///
/// Stored in the `source` field of [`Provenance`] to record where each
/// primitive in this stack originates.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CodecSource {
    /// Opus (the provenance table draws on its CELT layer: bands, PVQ, MDCT).
    Opus,
    /// Whisper (mel filterbank / STFT front end).
    Whisper,
    /// MP3 (psychoacoustic masking, subband filterbank).
    Mp3,
    /// Ogg Vorbis (VQ codebook lookup).
    OggVorbis,
    /// Bark by Suno (three-stage token pipeline).
    Bark,
    /// ElevenLabs (speaker embedding for voice cloning).
    ElevenLabs,
}

/// What aspect of audio each primitive captures.
///
/// Stored in the `aspect` field of [`Provenance`]; every variant is expected
/// to be covered by at least one provenance entry.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum AudioAspect {
    /// Spectral energy distribution (WHAT frequencies)
    SpectralEnvelope,
    /// Fine spectral shape within bands (HOW the energy is distributed)
    SpectralShape,
    /// Perceptual frequency mapping (WHERE in human hearing)
    PerceptualMapping,
    /// Temporal phase relationships (WHEN harmonics align)
    PhaseRelationship,
    /// Speaker identity (WHO is speaking)
    SpeakerIdentity,
    /// Semantic/emotional content (WHY it sounds that way)
    SemanticContent,
    /// Psychoacoustic masking (WHAT to skip)
    MaskingDecision,
    /// Codebook lookup (HOW to decompress)
    CodebookLookup,
}

/// Complete provenance record for one primitive.
///
/// Ties one of our types to the codec it was derived from, the aspect of
/// audio it captures, and the mechanism it stands in for.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Provenance {
    /// Label of our type, field or function (e.g. `"AudioFrame.band_energies"`).
    pub our_type: &'static str,
    /// Stored size in bytes; the table uses 0 for entries that are not stored
    /// (transforms and routing decisions).
    pub byte_size: usize,
    /// Codec the primitive was derived from.
    pub source: CodecSource,
    /// Aspect of audio the primitive captures.
    pub aspect: AudioAspect,
    /// The concept in the source codec that this primitive corresponds to.
    pub source_concept: &'static str,
    /// What this primitive replaces.
    pub what_it_replaces: &'static str,
}

/// Full provenance table for every audio primitive.
///
/// This IS the design document. If a new primitive doesn't appear here,
/// it wasn't stolen from a real codec and shouldn't exist.
///
/// Entries are grouped by source codec. A `byte_size` of 0 marks an entry
/// that is not stored (a transform or a routing decision).
pub const PROVENANCE: &[Provenance] = &[
// ═══ From Opus CELT ═══
Provenance {
our_type: "AudioFrame.band_energies",
byte_size: 42,
source: CodecSource::Opus,
aspect: AudioAspect::SpectralEnvelope,
source_concept: "eBands48 critical bands, gain in gain-shape split",
what_it_replaces: "Per-coefficient quantization (MP3/Vorbis)",
},
Provenance {
our_type: "AudioFrame.pvq_summary",
byte_size: 6,
source: CodecSource::Opus,
aspect: AudioAspect::SpectralShape,
source_concept: "PVQ (Pyramid Vector Quantization) pulse allocation",
what_it_replaces: "Huffman-coded residuals (MP3) / VQ codebook (Vorbis)",
},
Provenance {
our_type: "mdct_forward / mdct_backward",
byte_size: 0, // transform, not stored
source: CodecSource::Opus,
aspect: AudioAspect::SpectralEnvelope,
source_concept: "CELT MDCT: 960-sample window → 480 frequency bins",
what_it_replaces: "FFT+windowing (all codecs use some form)",
},

// ═══ From Whisper ═══
// NOTE(review): `byte_size` below assumes BF16 (80 × 2 = 160 bytes), but
// `what_it_replaces` says "80 f32 per frame" (which would be 320 bytes).
// Confirm which representation is intended and make the two agree.
Provenance {
our_type: "mel::log_mel_spectrogram",
byte_size: 160, // 80 × BF16 per frame
source: CodecSource::Whisper,
aspect: AudioAspect::PerceptualMapping,
source_concept: "80-channel mel filterbank at 16kHz, Hann STFT",
what_it_replaces: "Transformer encoder (150M params → 80 f32 per frame)",
},

// ═══ From MP3 ═══
Provenance {
our_type: "HhtlCache::route() → Skip",
byte_size: 0, // decision, not stored
source: CodecSource::Mp3,
aspect: AudioAspect::MaskingDecision,
source_concept: "Psychoacoustic masking model (simultaneous + temporal)",
what_it_replaces: "ISO 11172-3 psychoacoustic model 1/2 (iterative bit allocation)",
},
Provenance {
our_type: "OctaveBand",
byte_size: 13, // 3×f32 + u8
source: CodecSource::Mp3,
aspect: AudioAspect::SpectralEnvelope,
source_concept: "32-subband polyphase filterbank (octave-spaced)",
what_it_replaces: "Per-subband quantization + Huffman (MP3 granules)",
},

// ═══ From Ogg Vorbis ═══
Provenance {
our_type: "CompiledLinear (ndarray burn)",
// presumably one byte per element (256 × 256 × 1) — TODO confirm
byte_size: 65536, // 256 centroids × 256 dim
source: CodecSource::OggVorbis,
aspect: AudioAspect::CodebookLookup,
source_concept: "VQ codebook: precomputed centroids, lookup-based decode",
what_it_replaces: "Huffman trees (MP3) / arithmetic coding (Opus range coder)",
},

// ═══ From Bark (Suno) ═══
// The three RvqFrame entries below sum to 17 bytes (1 + 8 + 8).
Provenance {
our_type: "RvqFrame.archetype (HEEL)",
byte_size: 1,
source: CodecSource::Bark,
aspect: AudioAspect::SemanticContent,
source_concept: "Stage 1: GPT-2 semantic tokens (coarse meaning)",
what_it_replaces: "350M-param GPT-2 autoregressive generation",
},
Provenance {
our_type: "RvqFrame.coarse (HIP)",
byte_size: 8,
source: CodecSource::Bark,
aspect: AudioAspect::SpectralEnvelope,
source_concept: "Stage 2: GPT-2 coarse acoustic tokens (spectral envelope)",
what_it_replaces: "350M-param GPT-2 conditioned on semantic tokens",
},
Provenance {
our_type: "RvqFrame.fine (TWIG)",
byte_size: 8,
source: CodecSource::Bark,
aspect: AudioAspect::SpectralShape,
source_concept: "Stage 3: non-autoregressive fine acoustic tokens",
what_it_replaces: "Fine model (smaller network, fills spectral detail)",
},

// ═══ From ElevenLabs ═══
Provenance {
our_type: "VoiceArchetype",
byte_size: 16,
source: CodecSource::ElevenLabs,
aspect: AudioAspect::SpeakerIdentity,
source_concept: "Speaker embedding (voice cloning conditioning vector)",
what_it_replaces: "512-dim speaker embedding (2KB → 16 bytes)",
},

// ═══ Phase (novel — no codec stores this) ═══
// `source` here is the closest match, not a literal origin.
Provenance {
our_type: "PhaseDescriptor",
byte_size: 4,
source: CodecSource::Whisper, // closest: Whisper STFT preserves phase internally
aspect: AudioAspect::PhaseRelationship,
source_concept: "STFT phase (discarded by all codecs except Griffin-Lim)",
what_it_replaces: "Nothing — all codecs discard phase. We keep it as relative pressure.",
},

// ═══ Qualia (novel — derived from QPL musical calibration) ═══
// `source` here is the closest match, not a literal origin.
Provenance {
our_type: "Qualia17D",
byte_size: 68, // presumably 17 × f32 — TODO confirm
source: CodecSource::Bark, // closest: Bark semantic tokens carry meaning
aspect: AudioAspect::SemanticContent,
source_concept: "QPL: Octave→arousal, Fifth→valence, Third→warmth, Tritone→tension",
what_it_replaces: "No codec captures nonverbal meaning explicitly. This is the grid.",
},
];

/// Total bytes for one complete frame (all primitives combined).
///
/// AudioFrame (48) + PhaseDescriptor (4) = 52 bytes per frame for complete
/// nonverbal characterization. VoiceArchetype (16) is listed as amortized
/// and is not part of the 52-byte sum.
///
/// Compare:
///   MP3 128kbps:  ~417 bytes per 26ms frame
///   Opus 64kbps:  ~160 bytes per 20ms frame
///   Bark tokens:  ~128 bytes per frame
///   Ours:         52-69 bytes per frame (complete, including phase + identity)
pub const FRAME_BUDGET: usize = 48 + 4;

/// Bytes per frame including HHTL-compressed TTS output:
/// [`FRAME_BUDGET`] (52) + RvqFrame (17) = 69 bytes.
pub const FRAME_BUDGET_WITH_TTS: usize = FRAME_BUDGET + 17;

/// Approximate bitrates of reference codecs next to ours.
///
/// Each row is `(label, bits per second, short description)`. The figures
/// are approximate — our codec is lossy in a fundamentally different way
/// (palette quantization, not psychoacoustic masking).
pub const BITRATE_COMPARISON: &[(&str, u32, &str)] = &[
    ("MP3 128k", 128 * 1_000, "psychoacoustic masking, Huffman"),
    ("Opus 64k", 64 * 1_000, "CELT+SILK hybrid, range coder"),
    ("Vorbis 128k", 128 * 1_000, "MDCT, floor+residue, VQ codebook"),
    // tokens per second × bits per token
    ("Bark tokens", 100 * 256, "3-stage RVQ, ~100 tokens/sec × 256 bits"),
    // bytes per frame × frames per second × bits per byte
    ("Ours (48kHz)", 52 * 50 * 8, "52 bytes × 50 fps × 8 bits = 20.8 kbps"),
    ("Ours (24kHz)", 52 * 25 * 8, "52 bytes × 25 fps × 8 bits = 10.4 kbps"),
];

/// Verify every AudioAspect is covered by at least one primitive.
/// If an aspect is missing, we have a hole in our codec design.
pub fn verify_aspect_coverage() -> Vec<AudioAspect> {
use AudioAspect::*;
let all = [SpectralEnvelope, SpectralShape, PerceptualMapping,
PhaseRelationship, SpeakerIdentity, SemanticContent,
MaskingDecision, CodebookLookup];

all.iter()
.filter(|&&aspect| !PROVENANCE.iter().any(|p| p.aspect == aspect))
.copied()
.collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Looks up a provenance entry by its `our_type` label.
    ///
    /// Panics with the missing label so a renamed entry is easy to diagnose.
    fn entry(our_type: &str) -> &'static Provenance {
        PROVENANCE
            .iter()
            .find(|p| p.our_type == our_type)
            .unwrap_or_else(|| panic!("no provenance entry for {:?}", our_type))
    }

    /// Looks up the bits-per-second figure for a row of `BITRATE_COMPARISON`.
    fn bitrate(label: &str) -> u32 {
        BITRATE_COMPARISON
            .iter()
            .find(|row| row.0 == label)
            .unwrap_or_else(|| panic!("no bitrate row for {:?}", label))
            .1
    }

    #[test]
    fn all_aspects_covered() {
        let missing = verify_aspect_coverage();
        assert!(missing.is_empty(), "Missing audio aspects: {:?}", missing);
    }

    #[test]
    fn frame_budget_correct() {
        // AudioFrame (48) + PhaseDescriptor (4) = 52
        assert_eq!(FRAME_BUDGET, 48 + 4);
        // + RvqFrame (17) = 69
        assert_eq!(FRAME_BUDGET_WITH_TTS, 48 + 4 + 17);
    }

    #[test]
    fn provenance_byte_sizes_consistent() {
        // AudioFrame = 42 (energies) + 6 (pvq) = 48
        let audio_frame = entry("AudioFrame.band_energies").byte_size
            + entry("AudioFrame.pvq_summary").byte_size;
        assert_eq!(audio_frame, 48);

        // RvqFrame = 1 (HEEL) + 8 (HIP) + 8 (TWIG) = 17
        let rvq_frame = entry("RvqFrame.archetype (HEEL)").byte_size
            + entry("RvqFrame.coarse (HIP)").byte_size
            + entry("RvqFrame.fine (TWIG)").byte_size;
        assert_eq!(rvq_frame, 17);
    }

    #[test]
    fn every_source_codec_represented() {
        // All 6 source codecs should appear at least once
        for source in [
            CodecSource::Opus,
            CodecSource::Whisper,
            CodecSource::Mp3,
            CodecSource::OggVorbis,
            CodecSource::Bark,
            CodecSource::ElevenLabs,
        ] {
            assert!(
                PROVENANCE.iter().any(|p| p.source == source),
                "Codec {:?} not represented in provenance table",
                source
            );
        }
    }

    #[test]
    fn our_bitrate_competitive() {
        // Every "Ours" row must undercut every reference row, not just MP3.
        let is_ours = |label: &str| label.starts_with("Ours");
        let mut comparisons = 0;
        for &(our_label, our_bps, _) in BITRATE_COMPARISON.iter().filter(|row| is_ours(row.0)) {
            for &(other_label, other_bps, _) in
                BITRATE_COMPARISON.iter().filter(|row| !is_ours(row.0))
            {
                assert!(
                    our_bps < other_bps,
                    "{} ({} bps) should be lower bitrate than {} ({} bps)",
                    our_label,
                    our_bps,
                    other_label,
                    other_bps
                );
                comparisons += 1;
            }
        }
        // Guard against the check passing vacuously if labels are renamed.
        assert!(comparisons > 0, "no bitrate rows were compared");
    }

    #[test]
    fn our_bitrates_match_frame_budget() {
        // bytes per frame × frames per second × 8 bits, as stated in the row descriptions
        assert_eq!(bitrate("Ours (48kHz)") as usize, FRAME_BUDGET * 50 * 8);
        assert_eq!(bitrate("Ours (24kHz)") as usize, FRAME_BUDGET * 25 * 8);
    }
}
Loading
Loading