diff --git a/.claude/agents/arm-neon-specialist.md b/.claude/agents/arm-neon-specialist.md
new file mode 100644
index 00000000..99a701e5
--- /dev/null
+++ b/.claude/agents/arm-neon-specialist.md
@@ -0,0 +1,203 @@
---
name: arm-neon-specialist
description: >
  ARM NEON SIMD for single-board computers (Pi Zero 2W through Pi 5, Orange Pi 3-5).
  CPU tier detection, f16 via inline asm trick, codebook kernels, big.LITTLE awareness.
  Use for any aarch64 optimization, Pi deployment, or NEON intrinsic work.
tools: Read, Glob, Grep, Bash, Edit, Write
model: opus
---

You are the ARM_NEON_SPECIALIST for Project NDARRAY Expansion.

## Environment
- Rust 1.94 Stable (no nightly features)
- Target: aarch64-unknown-linux-gnu (Pi, Orange Pi, Rockchip SBCs)
- `f16` type is NIGHTLY ONLY — use `u16` carrier + inline asm (same trick as simd_amx.rs)
- `std::simd` (portable SIMD) is NIGHTLY ONLY — use our polyfill in simd.rs

## Your Domain: ARM Single-Board Computers

### Hardware Tiers (detected at runtime via LazyLock in simd_caps.rs)

```
┌────────────────────────────────────────────────────────────────────────┐
│ Tier       │ CPU         │ Arch   │ SBCs                              │
├────────────┼─────────────┼────────┼───────────────────────────────────┤
│ A53-Base   │ Cortex-A53  │ v8.0   │ Pi Zero 2W, Pi 3B+, OPi 3 LTS     │
│ A72-Fast   │ Cortex-A72  │ v8.0   │ Pi 4, OPi 4 LTS, OPi 4 Pro        │
│ A76-DotProd│ Cortex-A76  │ v8.2   │ Pi 5, OPi 5, OPi 5 Pro            │
└────────────┴─────────────┴────────┴───────────────────────────────────┘
```

### Feature Detection (ALL stable in Rust 1.94)

```rust
std::arch::is_aarch64_feature_detected!("neon")    // always true on aarch64
std::arch::is_aarch64_feature_detected!("dotprod") // true: Pi 5, OPi 5
std::arch::is_aarch64_feature_detected!("fp16")    // true: Pi 5, OPi 5
std::arch::is_aarch64_feature_detected!("aes")     // true: Pi 5, OPi boards; FALSE on Pi 3/4 (Broadcom omits the crypto ext)
std::arch::is_aarch64_feature_detected!("sha2")    // true: Pi 5, OPi boards; FALSE on Pi 3/4
std::arch::is_aarch64_feature_detected!("crc")     // true: all Pi 3+
```

### NEON Register Model

```
128-bit registers (v0-v31):
  float32x4_t = 4 × f32   (THE primary compute type)
  float64x2_t = 2 × f64
  int8x16_t   = 16 × i8
  int16x8_t   = 8 × i16   (Base17 L1 distance)
  int32x4_t   = 4 × i32
  uint8x16_t  = 16 × u8   (Hamming popcount via vcntq_u8)
  uint64x2_t  = 2 × u64
```

### Per-CPU Microarchitecture Differences

#### Cortex-A53 (Pi Zero 2W, Pi 3, Orange Pi 3 LTS)
- 1 NEON pipeline (NOT dual-issue)
- 4 cycle latency for FMLA (fused multiply-add)
- In-order execution (no out-of-order reordering)
- 32KB L1i + 32KB L1d, 512KB L2 (shared 4 cores)
- OPTIMIZATION: minimize instruction count, avoid data dependencies between adjacent ops
- ANTIPATTERN: unrolling hurts (an in-order core has no ROB to reorder around stalls; extra code just adds I-cache pressure)
- Throughput: ~500-2000 codebook tok/s

#### Cortex-A72 (Pi 4, Orange Pi 4 LTS/Pro)
- 2 NEON pipelines (dual-issue NEON!)
- 3 cycle latency for FMLA
- Out-of-order (superscalar, 3-wide decode)
- 48KB L1i + 32KB L1d, 1MB L2 (shared 4 cores)
- OPTIMIZATION: unroll 2× to saturate both NEON pipes
- OPTIMIZATION: interleave independent FMA chains (hides latency)
- Throughput: ~2000-5000 codebook tok/s

#### Cortex-A76 (Pi 5, Orange Pi 5/5 Pro)
- 2 NEON pipelines + dedicated dot product unit
- 3 cycle latency for FMLA, 2 cycle for SDOT (vdotq_s32)
- Out-of-order (4-wide decode, 128-entry ROB)
- 64KB L1i + 64KB L1d, 512KB L2 per core, 2MB L3 (shared)
- OPTIMIZATION: use vdotq_s32 for int8 paths (4× throughput vs manual widen)
- OPTIMIZATION: fp16 native (FCVTL/FCVTN 1 cycle, no penalty)
- Throughput: ~5000-10000 codebook tok/s

### big.LITTLE Awareness (Orange Pi 4, Orange Pi 5)

```
Orange Pi 4 LTS/Pro: RK3399 = 2× A72 (big) + 4× A53 (LITTLE)
  → Feature detection returns INTERSECTION of all cores
  → Both A72 and A53 are v8.0: neon=true, dotprod=false, crypto=true
  → Code can migrate between clusters — no core-pinning assumptions!
  → Optimization: if workload is latency-sensitive, use taskset to pin to big cores

Orange Pi 5/5 Pro: RK3588 = 4× A76 (big) + 4× A55 (LITTLE)
  → Both A76 and A55 are v8.2: neon=true, dotprod=true, fp16=true
  → Feature detection returns dotprod=true (all cores support it)
  → Safe to use vdotq_s32 unconditionally on Orange Pi 5
```

### F16 Trick (inline asm, stable Rust — like simd_amx.rs .byte trick)

The `f16` TYPE is nightly-only. But NEON f16 INSTRUCTIONS work on stable:

```rust
// FCVTL: 4× f16 → 4× f32 (one instruction, one cycle on A76)
unsafe fn f16x4_to_f32x4(input: &[u16; 4]) -> [f32; 4] {
    let mut output = [0.0f32; 4];
    core::arch::asm!(
        "ldr d0, [{src}]",
        "fcvtl v0.4s, v0.4h",
        "str q0, [{dst}]",
        src = in(reg) input.as_ptr(),
        dst = in(reg) output.as_mut_ptr(),
        out("v0") _,
        options(nostack),
    );
    output
}
```

Detection: `is_aarch64_feature_detected!("fp16")` (true on Pi 5, false on Pi 3/4)
Fallback: scalar IEEE 754 bit manipulation (works everywhere, ~2ns per value)

### F16 Precision Tricks (preserving information across format boundaries)

```
f16→f32: ALWAYS LOSSLESS (widening, zero error, exact)
f32→f16: LOSSY (23-bit mantissa → 10-bit = 13 bits lost)

Trick 1: Double-f16 (Error-Free Split)
  Store high + residual as two f16 values → ~20-bit effective precision
  Cost: 2× memory. Decode: f32 = f16_hi + f16_lo (both widenings exact)

Trick 2: Exponent-Aligned Scaling
  Pre-shift values into f16 sweet spot before conversion
  If all values ∈ [0.01, 1.0]: multiply by 1024 before encode
  Effectively uses all 10 mantissa bits in the target range

Trick 3: Kahan Summation
  Accumulate many f16 values in f32 without cumulative error
  Stores running compensation term to recapture rounding losses
```

### Key Files in This Repo

```
src/simd_neon.rs         — NEON implementations (Tier 1/2/3, f16 inline asm)
src/simd.rs              — LazyLock Tier detection (Neon, NeonDotProd variants)
src/hpc/simd_caps.rs     — SimdCaps struct (ARM fields: neon, dotprod, fp16, etc.)
src/hpc/simd_dispatch.rs — SimdDispatch (Neon + NeonDotProd tiers, fn ptr table)
src/simd_avx512.rs       — F16 IEEE 754 (F16C hardware path + scalar reference)
```

### Hard Rules for ARM Code

1. NEON is mandatory on aarch64 — never `#[cfg(feature = "neon")]`, it's always there
2. `vaddvq_f32` (horizontal sum) is baseline A64 but serializes through a `faddp` chain; on A53 an explicit `vpaddq` tree schedules better
3. dotprod (`vdotq_s32`) requires runtime detection — NOT compile-time gated
4. Never assume core affinity on big.LITTLE — feature detection returns intersection
5. f16 intrinsics via inline asm only — `f16` type is nightly
6. All inline asm must clobber used vector registers (`out("v0") _`)
7. Memory alignment: NEON loads are unaligned by default (vld1q), but aligned loads
   (vld1q with alignment hint) can save 1 cycle on A53
8. On A53 (in-order): avoid read-after-write in adjacent instructions (stall)
9. On A72/A76 (OoO): unroll to expose ILP, let hardware reorder

### Codebook Inference — Per-Tier Strategy

```
A53 (Pi Zero 2W): scalar-friendly, let compiler auto-vec
  → codebook_gather_f32x4_neon() with NO unrolling
  → ~200 tok/s, good enough for wake-word + short answers

A72 (Pi 4): dual-pipe, unroll 2×
  → codebook_gather_f32x4_a72() with 2× unrolled index pairs
  → ~2000 tok/s, handles 2-3 sentence responses in <1s

A76 (Pi 5): dotprod + fp16 + OoO
  → codebook_gather_i8_dotprod() for quantized centroids (4× throughput)
  → f16 centroids via FCVTL (half memory bandwidth)
  → ~5000 tok/s, handles full conversations in real-time
```

### ⚠️ GGUF Isolation Warning

F16 (this file) is for sensors/audio/ARM interchange.
BF16 pipeline (simd_avx512.rs bf16_* functions) is for GGUF model weight calibration.
They are NOT interchangeable. See the table in simd_avx512.rs line ~2362.

### Memory Budget on SBCs

```
Pi Zero 2W: 512MB RAM total. Budget: ~50MB for codebook + inference
Pi 3B+:     1GB RAM. Budget: ~200MB
Pi 4:       2/4/8GB. Budget: ~500MB-2GB
Pi 5:       4/8GB. Budget: ~2-4GB
OPi 5:      4/8/16/32GB. Budget: generous
```

Rule: Codebook centroids should fit in L2 cache for hot-path access.
A53 L2 = 512KB, A72 L2 = 1MB, A76 L2 = 512KB/core.
256 centroids × 64 dims × 4 bytes = 64KB → fits in ALL L2 caches.
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
index 823063d3..62fae415 100644
--- a/src/simd_avx2.rs
+++ b/src/simd_avx2.rs
@@ -1034,3 +1034,380 @@ mod tests {
        assert_eq!(distances[3], 8);
    }
}

// ════════════════════════════════════════════════════════════════════════════
// F16 IEEE 754 Precision Toolkit — AVX2-accelerated (F16C: 8 lanes per cycle)
//
// ⚠️ NOT FOR GGUF CALIBRATION — see simd_avx512.rs BF16 pipeline for that.
// This is for: sensor data, audio samples, ARM↔x86 interchange, memory savings.
//
// ┌─────────────────────────────────────────────────────────────────────────┐
// │ IEEE 754 binary16: 1 sign + 5 exponent (bias 15) + 10 mantissa          │
// │ Range: ±65504   Precision: 3.31 decimal digits   Subnormal: ±5.96e-8    │
// │                                                                         │
// │ f16→f32: ALWAYS EXACT (lossless widening, zero error)                   │
// │ f32→f16: LOSSY (23-bit → 10-bit mantissa = 13 bits lost)                │
// │ Max RNE error: ±0.5 ULP of f16 result (≈0.05% relative)                 │
// └─────────────────────────────────────────────────────────────────────────┘
//
// Hardware: F16C (VCVTPH2PS / VCVTPS2PH) available on Haswell+ (2013).
// AVX2 path uses __m128i → __m256 (8 lanes per instruction).
// AVX-512F path (16 lanes) lives in simd_avx512.rs.
//
// Tricks implemented:
//   1. Double-f16 (Error-Free Split) — ~20-bit effective precision in 2×u16
//   2. Kahan-compensated f16 accumulation — eliminates cumulative error
//   3. Exponent-aligned scaling — optimal mantissa utilization in known ranges
//
// All scalar paths use the IEEE 754 functions from simd_avx512.rs.
// AVX2 batch paths use F16C hardware (8 lanes) with scalar tail.
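//
// A hand-worked instance of Trick 1 (numbers checked by hand, not from a test
// run; illustrative only):
//   f32 π = 3.14159274
//   hi = f16(π) = 0x4248, which decodes to 3.140625 (error ≈ 9.7e-4)
//   lo = f16(π - 3.140625) recaptures most of that residual
//   decode: 3.140625 + f16_to_f32(lo) ≈ 3.1415930 (error ≈ 2.4e-7, >1000× smaller)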
// ════════════════════════════════════════════════════════════════════════════

// Re-use the exact IEEE 754 scalar functions from simd_avx512
pub use crate::simd_avx512::{
    f16_to_f32_ieee754,
    f32_to_f16_ieee754_rne,
    f16_to_f32_batch_ieee754,
    f32_to_f16_batch_ieee754_rne,
};

// ── Trick 1: Double-f16 (Error-Free Split) ──────────────────────────────
//
// Problem: f32→f16 loses 13 mantissa bits (23→10).
// Solution: store value as TWO f16 values: hi (main) + lo (residual).
//
// Encode:
//   hi = rne(value)                    // best f16 approximation
//   residual = value - f16_to_f32(hi)  // exact error (computed in f32)
//   lo = rne(residual)                 // error captured as second f16
//
// Decode:
//   value ≈ f16_to_f32(hi) + f16_to_f32(lo)  // both conversions exact
//
// Effective precision: ~20 mantissa bits (10 + ~10 from residual).
// Storage: 4 bytes (same as f32) but split across two u16 values.
// Use case: codebook centroids where f16 is too imprecise but f32 wastes RAM.
//
// Error analysis:
//   hi captures the value with ≤0.5 ULP_f16 error
//   lo captures the residual with ≤0.5 ULP_f16(residual) error
//   Total error: ≤0.5 ULP_f16(residual) ≈ 2^{-21} × |value|
//   vs single f16: ≤0.5 ULP_f16 ≈ 2^{-11} × |value|
//   → ~1000× better precision for same 4 bytes

/// Encode f32 as Double-f16 pair (hi, lo) with ~20-bit effective precision.
///
/// Both `hi` and `lo` are standard IEEE 754 f16 values (stored as u16).
/// Decode: `f16_to_f32(hi) + f16_to_f32(lo)` (both conversions are exact;
/// the final addition rounds once).
///
/// # Precision
/// - Single f16: 10 mantissa bits → 3.31 decimal digits
/// - Double-f16: ~20 mantissa bits → 6.02 decimal digits
/// - f32: 23 mantissa bits → 7.22 decimal digits
#[inline]
pub fn f16_double_encode(value: f32) -> (u16, u16) {
    let hi = f32_to_f16_ieee754_rne(value);
    let hi_f32 = f16_to_f32_ieee754(hi); // exact (lossless widening)
    let residual = value - hi_f32;       // exact (f32 subtraction)
    let lo = f32_to_f16_ieee754_rne(residual);
    (hi, lo)
}

/// Decode Double-f16 pair back to f32. Both f16→f32 conversions are exact.
#[inline]
pub fn f16_double_decode(hi: u16, lo: u16) -> f32 {
    f16_to_f32_ieee754(hi) + f16_to_f32_ieee754(lo)
}

/// Batch encode: f32 slice → Double-f16 (separate hi/lo arrays).
///
/// AVX2 acceleration via F16C for the f32→f16 conversions.
pub fn f16_double_encode_batch(input: &[f32], output_hi: &mut [u16], output_lo: &mut [u16]) {
    let n = input.len().min(output_hi.len()).min(output_lo.len());

    // Step 1: encode hi values (AVX2 F16C batch)
    f32_to_f16_batch_ieee754_rne(&input[..n], &mut output_hi[..n]);

    // Step 2: compute residuals and encode lo values
    let mut residuals = vec![0.0f32; n];
    f16_to_f32_batch_ieee754(&output_hi[..n], &mut residuals);
    for i in 0..n {
        residuals[i] = input[i] - residuals[i];
    }
    f32_to_f16_batch_ieee754_rne(&residuals, &mut output_lo[..n]);
}

/// Batch decode: Double-f16 → f32. Uses AVX2 F16C + f32x8 addition.
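///
/// # Example
///
/// A minimal round-trip sketch (illustrative; marked `ignore` so it is not
/// run as a doc-test — sizes and values are arbitrary):
///
/// ```ignore
/// let input = [3.14159274f32, -0.333, 6.25e-3];
/// let (mut hi, mut lo) = ([0u16; 3], [0u16; 3]);
/// f16_double_encode_batch(&input, &mut hi, &mut lo);
///
/// let mut out = [0.0f32; 3];
/// f16_double_decode_batch(&hi, &lo, &mut out);
/// // Round-trip error is ~2^-21 relative, vs ~2^-11 for a single f16.
/// assert!((out[0] - input[0]).abs() < 1e-5);
/// ```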
pub fn f16_double_decode_batch(hi: &[u16], lo: &[u16], output: &mut [f32]) {
    let n = hi.len().min(lo.len()).min(output.len());

    f16_to_f32_batch_ieee754(&hi[..n], &mut output[..n]);

    let mut lo_f32 = vec![0.0f32; n];
    f16_to_f32_batch_ieee754(&lo[..n], &mut lo_f32);

    // AVX2-accelerated f32 addition (8 lanes per cycle)
    let chunks = n / F32_LANES;
    for c in 0..chunks {
        let base = c * F32_LANES;
        let out_v = f32x8::from_slice(&output[base..]);
        let lo_v = f32x8::from_slice(&lo_f32[base..]);
        (out_v + lo_v).copy_to_slice(&mut output[base..base + F32_LANES]);
    }
    for i in (chunks * F32_LANES)..n {
        output[i] += lo_f32[i];
    }
}

// ── Trick 2: Kahan-compensated f16 accumulation ─────────────────────────
//
// Problem: summing many f16 values in f32 accumulates rounding error.
//   Naive sum of 10K × 0.1: error ≈ 0.05
//   Kahan sum of 10K × 0.1: error ≈ 0.0 (bounded by 2ε, independent of N)
//
// Precision: O(ε) total error instead of O(N·ε).
// Cost: ~3 extra f32 add/subs per element (negligible vs f16→f32).

/// Kahan-compensated sum of f16 values. Returns f32 with near-zero cumulative error.
///
/// Each f16→f32 conversion is exact (lossless widening).
/// Kahan algorithm tracks rounding error of each f32 addition.
///
/// # Error bound
/// - Naive sum of N values: error ≤ N × ε (ε ≈ 1.19e-7)
/// - Kahan sum of N values: error ≤ 2ε (independent of N!)
pub fn f16_kahan_sum(input: &[u16]) -> f32 {
    let mut f32_buf = vec![0.0f32; input.len()];
    f16_to_f32_batch_ieee754(input, &mut f32_buf);

    let mut sum = 0.0f32;
    let mut compensation = 0.0f32;
    for &v in &f32_buf {
        let y = v - compensation;
        let t = sum + y;
        compensation = (t - sum) - y;
        sum = t;
    }
    sum
}

/// Kahan-compensated dot product of two f16 vectors.
///
/// AVX2-accelerated: F16C for f16→f32, f32x8 multiply, Kahan accumulate.
pub fn f16_kahan_dot(a: &[u16], b: &[u16]) -> f32 {
    let n = a.len().min(b.len());
    let mut a_f32 = vec![0.0f32; n];
    let mut b_f32 = vec![0.0f32; n];
    f16_to_f32_batch_ieee754(&a[..n], &mut a_f32);
    f16_to_f32_batch_ieee754(&b[..n], &mut b_f32);

    let mut sum = 0.0f32;
    let mut compensation = 0.0f32;

    // AVX2: multiply 8-wide, reduce_sum, Kahan-accumulate partial sums
    let chunks = n / F32_LANES;
    for c in 0..chunks {
        let base = c * F32_LANES;
        let av = f32x8::from_slice(&a_f32[base..]);
        let bv = f32x8::from_slice(&b_f32[base..]);
        let prod_sum = (av * bv).reduce_sum();
        let y = prod_sum - compensation;
        let t = sum + y;
        compensation = (t - sum) - y;
        sum = t;
    }
    for i in (chunks * F32_LANES)..n {
        let prod = a_f32[i] * b_f32[i];
        let y = prod - compensation;
        let t = sum + y;
        compensation = (t - sum) - y;
        sum = t;
    }
    sum
}

// ── Trick 3: Exponent-aligned scaling ───────────────────────────────────
//
// Problem: f16 has 10 mantissa bits. Narrow-range values waste exponent bits.
//   Values in [0.001, 0.005]: only 3-4 mantissa bits significant → ~8 levels
//   After scale to [0.5, 2.0]: all 10 mantissa bits → ~1024 levels
//
// Precision improvement: up to ~128× for narrow-range data.
// Use case: codebook centroids, sensor readings, normalized weights.

/// Pre-computed scaling context for exponent-aligned f16 encoding.
///
/// Analyzes the input range, computes scale that maps |max| → 1.0,
/// then uses that scale for all encode/decode operations.
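///
/// # Example
///
/// A minimal sketch (illustrative; `ignore`d so it is not run as a doc-test —
/// the range and value are arbitrary):
///
/// ```ignore
/// let scaler = F16Scaler::from_range(-0.02, 0.05); // |max| = 0.05 → scale = 20.0
/// let bits = scaler.encode(0.013);                 // stored as f16(0.26)
/// let back = scaler.decode(bits);                  // ≈ 0.013
/// assert!((back - 0.013f32).abs() < 1e-5);
/// ```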
#[derive(Debug, Clone, Copy)]
pub struct F16Scaler {
    /// Multiply by this before f32→f16 (shifts into sweet spot)
    pub scale: f32,
    /// Multiply by this after f16→f32 (restores original range)
    pub inv_scale: f32,
}

impl F16Scaler {
    /// Create from known value range [min_val, max_val].
    pub fn from_range(min_val: f32, max_val: f32) -> Self {
        // Allow min == max so `from_data` on constant input doesn't panic.
        assert!(min_val <= max_val, "min must not exceed max");
        let abs_max = min_val.abs().max(max_val.abs());
        if abs_max < f32::EPSILON {
            return Self { scale: 1.0, inv_scale: 1.0 };
        }
        let scale = 1.0 / abs_max;
        Self { scale, inv_scale: abs_max }
    }

    /// Create by scanning data for min/max.
    pub fn from_data(data: &[f32]) -> Self {
        if data.is_empty() {
            return Self { scale: 1.0, inv_scale: 1.0 };
        }
        let mut min = f32::INFINITY;
        let mut max = f32::NEG_INFINITY;
        for &v in data {
            if v < min { min = v; }
            if v > max { max = v; }
        }
        Self::from_range(min, max)
    }

    #[inline]
    pub fn encode(&self, value: f32) -> u16 {
        f32_to_f16_ieee754_rne(value * self.scale)
    }

    #[inline]
    pub fn decode(&self, bits: u16) -> f32 {
        f16_to_f32_ieee754(bits) * self.inv_scale
    }

    /// Batch encode with AVX2: f32x8 scale multiply → F16C convert.
    pub fn encode_batch(&self, input: &[f32], output: &mut [u16]) {
        let n = input.len().min(output.len());
        let mut scaled = vec![0.0f32; n];
        let scale_v = f32x8::splat(self.scale);
        let chunks = n / F32_LANES;
        for c in 0..chunks {
            let base = c * F32_LANES;
            let v = f32x8::from_slice(&input[base..]);
            (v * scale_v).copy_to_slice(&mut scaled[base..base + F32_LANES]);
        }
        for i in (chunks * F32_LANES)..n {
            scaled[i] = input[i] * self.scale;
        }
        f32_to_f16_batch_ieee754_rne(&scaled, &mut output[..n]);
    }

    /// Batch decode with AVX2: F16C convert → f32x8 inv_scale multiply.
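    ///
    /// # Example
    ///
    /// Hedged batch round-trip sketch (`ignore`d; sizes and values arbitrary):
    ///
    /// ```ignore
    /// let data: Vec<f32> = (0..32).map(|i| i as f32 * 2.5e-4).collect();
    /// let scaler = F16Scaler::from_data(&data);
    /// let mut enc = vec![0u16; data.len()];
    /// scaler.encode_batch(&data, &mut enc);
    /// let mut dec = vec![0.0f32; data.len()];
    /// scaler.decode_batch(&enc, &mut dec);
    /// // Each element round-trips to within ~2^-11 relative error.
    /// ```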
    pub fn decode_batch(&self, input: &[u16], output: &mut [f32]) {
        let n = input.len().min(output.len());
        f16_to_f32_batch_ieee754(&input[..n], &mut output[..n]);
        let inv_v = f32x8::splat(self.inv_scale);
        let chunks = n / F32_LANES;
        for c in 0..chunks {
            let base = c * F32_LANES;
            let v = f32x8::from_slice(&output[base..]);
            (v * inv_v).copy_to_slice(&mut output[base..base + F32_LANES]);
        }
        for i in (chunks * F32_LANES)..n {
            output[i] *= self.inv_scale;
        }
    }
}

#[cfg(test)]
mod f16_precision_tests {
    use super::*;

    #[test]
    fn double_f16_better_than_single() {
        let value = std::f32::consts::PI;
        let single = f32_to_f16_ieee754_rne(value);
        let single_err = (value - f16_to_f32_ieee754(single)).abs();

        let (hi, lo) = f16_double_encode(value);
        let double_err = (value - f16_double_decode(hi, lo)).abs();

        assert!(double_err < single_err,
            "double should be better: single={:.8} double={:.8}", single_err, double_err);
        assert!(double_err < single_err / 100.0,
            "double should be >100× better: ratio={:.0}", single_err / double_err);
    }

    #[test]
    fn double_f16_batch_roundtrip() {
        let input: Vec<f32> = (0..100).map(|i| (i as f32 - 50.0) * 0.037).collect();
        let mut hi = vec![0u16; 100];
        let mut lo = vec![0u16; 100];
        f16_double_encode_batch(&input, &mut hi, &mut lo);

        let mut decoded = vec![0.0f32; 100];
        f16_double_decode_batch(&hi, &lo, &mut decoded);

        for i in 0..100 {
            let err = (input[i] - decoded[i]).abs();
            let tol = input[i].abs() * 1e-4 + 1e-7;
            assert!(err < tol, "at {}: {} → {} err={}", i, input[i], decoded[i], err);
        }
    }

    #[test]
    fn kahan_sum_consistent() {
        let val_f16 = f32_to_f16_ieee754_rne(0.1);
        let input = vec![val_f16; 10_000];
        let kahan = f16_kahan_sum(&input);
        let expected = 10_000.0 * f16_to_f32_ieee754(val_f16);
        let err = (kahan - expected).abs();
        assert!(err < 0.01, "kahan error too large: {} (expected {})", err, expected);
    }

    #[test]
    fn kahan_dot_vs_f64_reference() {
        let a: Vec<u16> = (0..64).map(|i| f32_to_f16_ieee754_rne(i as f32 * 0.1)).collect();
        let b: Vec<u16> = (0..64).map(|i| f32_to_f16_ieee754_rne(1.0 - i as f32 * 0.01)).collect();
        let dot = f16_kahan_dot(&a, &b);
        let mut ref_sum = 0.0f64;
        for i in 0..64 {
            ref_sum += f16_to_f32_ieee754(a[i]) as f64 * f16_to_f32_ieee754(b[i]) as f64;
        }
        assert!((dot as f64 - ref_sum).abs() < 0.01,
            "got={} expected={}", dot, ref_sum);
    }

    #[test]
    fn scaler_improves_small_values() {
        let data: Vec<f32> = (0..100).map(|i| 0.001 + (i as f32) * 0.00004).collect();

        let no_scale: Vec<u16> = data.iter().map(|&v| f32_to_f16_ieee754_rne(v)).collect();
        let no_scale_err: f64 = data.iter().enumerate()
            .map(|(i, &v)| (v as f64 - f16_to_f32_ieee754(no_scale[i]) as f64).powi(2)).sum();

        let scaler = F16Scaler::from_data(&data);
        let mut scaled = vec![0u16; 100];
        scaler.encode_batch(&data, &mut scaled);
        let mut back = vec![0.0f32; 100];
        scaler.decode_batch(&scaled, &mut back);
        let scaled_err: f64 = data.iter().enumerate()
            .map(|(i, &v)| (v as f64 - back[i] as f64).powi(2)).sum();

        assert!(scaled_err < no_scale_err,
            "scaling should help: unscaled={:.2e} scaled={:.2e}", no_scale_err, scaled_err);
    }

    #[test]
    fn scaler_roundtrip_batch() {
        let data: Vec<f32> = (0..50).map(|i| (i as f32 - 25.0) * 0.004).collect();
        let scaler = F16Scaler::from_data(&data);
        let mut enc = vec![0u16; 50];
        scaler.encode_batch(&data, &mut enc);
        let mut dec = vec![0.0f32; 50];
        scaler.decode_batch(&enc, &mut dec);
        for i in 0..50 {
            let err = (data[i] - dec[i]).abs();
            assert!(err < data[i].abs() * 0.01 + 1e-6,
                "at {}: {} → {} err={}", i, data[i], dec[i], err);
        }
    }
}