diff --git a/.claude/agents/l3-strategist.md b/.claude/agents/l3-strategist.md index 6d5b1740..9ff81560 100644 --- a/.claude/agents/l3-strategist.md +++ b/.claude/agents/l3-strategist.md @@ -6,7 +6,7 @@ description: > mapping rustynum capabilities to ndarray's trait system, identifying architectural gaps, or planning multi-phase implementation roadmaps. tools: Read, Glob, Grep, Bash -model: sonnet +model: opus --- You are the L3_STRATEGIST for Project NDARRAY Expansion. diff --git a/.claude/agents/migration-tracker.md b/.claude/agents/migration-tracker.md index 1bc89009..7cffda00 100644 --- a/.claude/agents/migration-tracker.md +++ b/.claude/agents/migration-tracker.md @@ -5,7 +5,7 @@ description: > Updates blackboard. Identifies gaps. Prevents duplication. READ ONLY — never writes code. tools: Read, Glob, Grep, Bash -model: sonnet +model: opus --- # Migration Tracker diff --git a/.claude/agents/product-engineer.md b/.claude/agents/product-engineer.md index 59215b6d..d6b8d7e8 100644 --- a/.claude/agents/product-engineer.md +++ b/.claude/agents/product-engineer.md @@ -6,7 +6,7 @@ description: > finalizing API surface, writing doc comments, designing error types, managing feature flags, or ensuring the crate is publishable. tools: Read, Glob, Grep, Bash, Edit, Write -model: sonnet +model: opus --- You are the PRODUCT_ENGINEER for Project NDARRAY Expansion. diff --git a/.claude/agents/vector-synthesis.md b/.claude/agents/vector-synthesis.md index 9c9babdc..a508d38e 100644 --- a/.claude/agents/vector-synthesis.md +++ b/.claude/agents/vector-synthesis.md @@ -6,7 +6,7 @@ description: > similarity search kernels, ndarray↔vector store bridges, distance metrics (cosine, L2, dot product), or batch vector operations. tools: Read, Glob, Grep, Bash, Edit -model: sonnet +model: opus --- You are the VECTOR_SYNTHESIS_EXPERT for Project NDARRAY Expansion. 
diff --git a/src/hpc/cam_pq.rs b/src/hpc/cam_pq.rs index 0c103192..e9c01252 100644 --- a/src/hpc/cam_pq.rs +++ b/src/hpc/cam_pq.rs @@ -460,7 +460,7 @@ pub fn train_hybrid( /// For 16D subvectors (CAM-PQ subspace dimension), this is one F32x16 /// load-subtract-multiply-reduce. Consumer never sees hardware details. #[inline(always)] -fn squared_l2(a: &[f32], b: &[f32]) -> f32 { +pub fn squared_l2(a: &[f32], b: &[f32]) -> f32 { debug_assert_eq!(a.len(), b.len()); let n = a.len(); @@ -518,7 +518,7 @@ fn jaccard_similarity(a: &[String], b: &[String]) -> f32 { /// Simple k-means clustering. /// /// Returns `k` centroid vectors of length `dim`. -fn kmeans(data: &[Vec<f32>], k: usize, dim: usize, iterations: usize) -> Vec<Vec<f32>> { +pub fn kmeans(data: &[Vec<f32>], k: usize, dim: usize, iterations: usize) -> Vec<Vec<f32>> { let n = data.len(); if n == 0 || k == 0 { return vec![vec![0.0; dim]; k]; diff --git a/src/hpc/fft.rs b/src/hpc/fft.rs index d3ed8063..dcc80293 100644 --- a/src/hpc/fft.rs +++ b/src/hpc/fft.rs @@ -136,6 +136,103 @@ pub fn rfft_f32(input: &[f32]) -> Vec<f32> { complex[..2 * out_len].to_vec() } +// ── Walsh-Hadamard Transform ────────────────────────────────────── +// +// The WHT is to quantization codecs what FFT is to signal processing: +// an O(n log n) orthogonal rotation that spreads energy uniformly +// across all coefficients. Unlike SVD (data-adaptive, O(n²k) training), +// the Hadamard rotation is deterministic, free, and self-inverse. +// +// Used by the HadCascade codec (bgz-tensor) for residual rotation +// before i4/i2 quantization. ICC 1.0000 on real model weights. + +/// In-place Walsh-Hadamard Transform (normalized, self-inverse). +/// +/// `data` length must be a power of 2. After transform, `||WHT(x)|| = ||x||` +/// (energy-preserving). Applying WHT twice returns the original vector. +/// +/// SIMD: uses F32x16 butterfly for blocks ≥ 16 elements. 
+/// +/// # Example +/// +/// ``` +/// use ndarray::hpc::fft::wht_f32; +/// +/// let mut x = vec![1.0f32, 0.0, 0.0, 0.0]; +/// wht_f32(&mut x); +/// assert!((x[0] - 0.5).abs() < 1e-6); // 1/sqrt(4) * 1 = 0.5 +/// +/// // Self-inverse: WHT(WHT(x)) = x +/// wht_f32(&mut x); +/// assert!((x[0] - 1.0).abs() < 1e-5); +/// ``` +pub fn wht_f32(data: &mut [f32]) { + let n = data.len(); + assert!(n.is_power_of_two(), "WHT length must be a power of 2"); + + let mut h = 1; + while h < n { + if h >= 16 { + wht_butterfly_simd(data, n, h); + } else { + for i in (0..n).step_by(h * 2) { + for j in i..i + h { + let x = data[j]; + let y = data[j + h]; + data[j] = x + y; + data[j + h] = x - y; + } + } + } + h *= 2; + } + + let norm = 1.0 / (n as f32).sqrt(); + let mut i = 0; + while i + 16 <= n { + use crate::simd::F32x16; + let v = F32x16::from_slice(&data[i..]); + let scaled = v * F32x16::splat(norm); + scaled.copy_to_slice(&mut data[i..i + 16]); + i += 16; + } + while i < n { + data[i] *= norm; + i += 1; + } +} + +/// WHT butterfly for one level, SIMD-accelerated for h ≥ 16. +fn wht_butterfly_simd(data: &mut [f32], n: usize, h: usize) { + use crate::simd::F32x16; + for i in (0..n).step_by(h * 2) { + let mut j = 0; + while j + 16 <= h { + let a = F32x16::from_slice(&data[i + j..]); + let b = F32x16::from_slice(&data[i + j + h..]); + let sum = a + b; + let diff = a - b; + sum.copy_to_slice(&mut data[i + j..i + j + 16]); + diff.copy_to_slice(&mut data[i + j + h..i + j + h + 16]); + j += 16; + } + while j < h { + let x = data[i + j]; + let y = data[i + j + h]; + data[i + j] = x + y; + data[i + j + h] = x - y; + j += 1; + } + } +} + +/// Convenience: WHT on a new vector (non-mutating). 
+pub fn wht_f32_new(input: &[f32]) -> Vec<f32> { + let mut out = input.to_vec(); + wht_f32(&mut out); + out +} + // ── Helpers ──────────────────────────────────────────────────────── fn bit_reverse_f32(data: &mut [f32], n: usize) { @@ -197,6 +294,44 @@ mod tests { } } + #[test] + fn test_wht_self_inverse() { + let original = vec![1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let mut data = original.clone(); + wht_f32(&mut data); + wht_f32(&mut data); + for (a, b) in original.iter().zip(data.iter()) { + assert!((a - b).abs() < 1e-5, "self-inverse: {} vs {}", a, b); + } + } + + #[test] + fn test_wht_energy_preservation() { + let mut data = vec![1.0f32, -2.0, 3.0, -4.0]; + let norm_before: f32 = data.iter().map(|x| x * x).sum::<f32>().sqrt(); + wht_f32(&mut data); + let norm_after: f32 = data.iter().map(|x| x * x).sum::<f32>().sqrt(); + assert!((norm_before - norm_after).abs() < 1e-4, + "energy: {} vs {}", norm_before, norm_after); + } + + #[test] + fn test_wht_large_simd() { + let mut data: Vec<f32> = (0..1024).map(|i| (i as f32 * 0.618).sin()).collect(); + let original = data.clone(); + wht_f32(&mut data); + // Norm preservation at 1024-d (hits SIMD path) + let n_orig: f32 = original.iter().map(|x| x * x).sum::<f32>().sqrt(); + let n_wht: f32 = data.iter().map(|x| x * x).sum::<f32>().sqrt(); + assert!((n_orig - n_wht).abs() / n_orig < 1e-4, + "SIMD WHT norm: {} vs {}", n_orig, n_wht); + // Self-inverse + wht_f32(&mut data); + let max_err = original.iter().zip(data.iter()) + .map(|(a, b)| (a - b).abs()).fold(0.0f32, f32::max); + assert!(max_err < 1e-3, "SIMD self-inverse max_err: {}", max_err); + } + #[test] fn test_rfft() { let input = vec![1.0f32, 2.0, 3.0, 4.0]; diff --git a/src/hpc/quantized.rs b/src/hpc/quantized.rs index 5202d8e3..c150c349 100644 --- a/src/hpc/quantized.rs +++ b/src/hpc/quantized.rs @@ -374,6 +374,49 @@ pub fn dequantize_i4_to_f32(packed: &[u8], params: &QuantParams, len: usize) -> Vec<f32> { result } +/// Quantize f32 to i2 (packed: four i2 values per byte, signed ±1). 
+/// +/// Each value is clamped to {-1, 0, +1} after scaling by abs_max. +/// Packing: 4 crumbs per byte, low bits first. +/// Symmetric quantization with zero_point = 0. +pub fn quantize_f32_to_i2(data: &[f32]) -> (Vec<u8>, QuantParams) { + let abs_max = data.iter().fold(0.0f32, |a, &b| a.max(b.abs())); + let scale = if abs_max > 0.0 { abs_max } else { 1.0 }; + + let packed_len = (data.len() + 3) / 4; + let mut packed = vec![0u8; packed_len]; + + for (i, &v) in data.iter().enumerate() { + let q = (v / scale).round().clamp(-1.0, 1.0) as i8; + let u = (q + 1) as u8; // map {-1,0,1} to {0,1,2} + let shift = (i % 4) * 2; + packed[i / 4] |= (u & 0x03) << shift; + } + + ( + packed, + QuantParams { + scale, + zero_point: 0, + min_val: -abs_max, + max_val: abs_max, + }, + ) +} + +/// Dequantize i2 (packed) to f32. +pub fn dequantize_i2_to_f32(packed: &[u8], params: &QuantParams, len: usize) -> Vec<f32> { + let mut result = Vec::with_capacity(len); + for i in 0..len { + let byte = packed[i / 4]; + let shift = (i % 4) * 2; + let u = (byte >> shift) & 0x03; + let q = u as i8 - 1; // map {0,1,2} back to {-1,0,1} + result.push(q as f32 * params.scale); + } + result +} + #[cfg(test)] mod tests { use super::*;