From 226d7ce43d4b3243b5f123d45ed20a189553477d Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 3 Apr 2026 13:16:23 +0000 Subject: [PATCH 1/3] feat: add pub accessors mu(), sigma(), observations() to Cascade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consumers (bgz-tensor) need read access to Welford σ tracking state for quarter-sigma band computation. Fields remain private; accessors provide read-only access. https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp --- src/hpc/cascade.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/hpc/cascade.rs b/src/hpc/cascade.rs index 20645ef9..cc5f6535 100644 --- a/src/hpc/cascade.rs +++ b/src/hpc/cascade.rs @@ -97,6 +97,15 @@ pub struct Cascade { } impl Cascade { + /// Current distribution mean (Welford online estimate). + pub fn mu(&self) -> f64 { self.mu } + + /// Current distribution standard deviation (Welford online estimate). + pub fn sigma(&self) -> f64 { self.sigma } + + /// Number of observations processed. + pub fn observations(&self) -> usize { self.observations } + pub fn from_threshold(threshold: u64, vec_bytes: usize) -> Self { Self { threshold, vec_bytes, mu: 0.0, sigma: 0.0, observations: 0 } } From 5277b2949cf44ae732ff5a9645ce9f980f67bdbc Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 3 Apr 2026 13:26:07 +0000 Subject: [PATCH 2/3] =?UTF-8?q?feat:=20heel=5Ff64x8=20=E2=80=94=20F64x8=20?= =?UTF-8?q?SIMD=20kernel=20for=208=20HEEL=20plane=20weighted=20distance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 8 HEEL planes × 1 distance each = 8 f64 = one F64x8 register. LazyLock dispatch: AVX-512 (vmulpd + vreducepd) → AVX2 (2×vmulpd) → scalar. Functions: heel_weighted_distance(&[f64;8], &[f64;8]) → f64 (weighted dot) heel_plane_distances(&[u64;8], &[u64;8]) → [f64;8] (Hamming per plane) heel_weighted_hamming(a, b, weights) → f64 (full pipeline) Predefined weights: UNIFORM_WEIGHTS = [1.0; 8] HEEL_7PLUS1_WEIGHTS = [1,1,1,1,1,1,1, 0.5] (contradiction at half) 6 tests passing. https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp --- src/hpc/heel_f64x8.rs | 178 ++++++++++++++++++++++++++++++++++++++++++ src/hpc/mod.rs | 2 + 2 files changed, 180 insertions(+) create mode 100644 src/hpc/heel_f64x8.rs diff --git a/src/hpc/heel_f64x8.rs b/src/hpc/heel_f64x8.rs new file mode 100644 index 00000000..a9940e8d --- /dev/null +++ b/src/hpc/heel_f64x8.rs @@ -0,0 +1,178 @@ +//! F64x8 HEEL distance: 8 f64 distances across 8 HEEL planes in one SIMD pass. +//! +//! p64 has 8 HEEL planes (u64 each). For weighted f64 distance computation, +//! each plane produces one f64 distance value → 8 values = one F64x8 register. +//! +//! This module provides the SIMD kernel; p64-bridge calls it. +//! ndarray = hardware acceleration, consumers use the kernel. +//! +//! Dispatch: AVX-512 (native __m512d) → AVX2 (2×__m256d) → scalar. +//! LazyLock selects at startup. + +use std::sync::LazyLock; + +/// Kernel signature: 8 distances in, weighted sum out. +/// `distances`: 8 f64 values (one per HEEL plane). +/// `weights`: 8 f64 weights (per-expert importance). +/// Returns: weighted sum = Σ(distance[i] × weight[i]). +type HeelF64x8DotFn = unsafe fn(&[f64; 8], &[f64; 8]) -> f64; + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f")] +unsafe fn heel_dot_avx512(a: &[f64; 8], b: &[f64; 8]) -> f64 { + use std::arch::x86_64::*; + let va = _mm512_loadu_pd(a.as_ptr()); + let vb = _mm512_loadu_pd(b.as_ptr()); + let prod = _mm512_mul_pd(va, vb); + _mm512_reduce_add_pd(prod) +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +unsafe fn heel_dot_avx2(a: &[f64; 8], b: &[f64; 8]) -> f64 { + use std::arch::x86_64::*; + // 2 passes of 4 lanes + let va0 = _mm256_loadu_pd(a.as_ptr()); + let vb0 = _mm256_loadu_pd(b.as_ptr()); + let p0 = _mm256_mul_pd(va0, vb0); + + let va1 = _mm256_loadu_pd(a[4..].as_ptr()); + let vb1 = _mm256_loadu_pd(b[4..].as_ptr()); + let p1 = _mm256_mul_pd(va1, vb1); + + let sum = _mm256_add_pd(p0, p1); + // Horizontal sum of 4 f64 + let hi = _mm256_extractf128_pd(sum, 1); + let lo = _mm256_castpd256_pd128(sum); + let pair = _mm_add_pd(lo, hi); + let hi64 = _mm_unpackhi_pd(pair, pair); + let result = _mm_add_sd(pair, hi64); + _mm_cvtsd_f64(result) +} + +fn heel_dot_scalar(a: &[f64; 8], b: &[f64; 8]) -> f64 { + let mut sum = 0.0f64; + for i in 0..8 { + sum += a[i] * b[i]; + } + sum +} + +static HEEL_DOT_KERNEL: LazyLock = LazyLock::new(|| { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx512f") { + return heel_dot_avx512 as HeelF64x8DotFn; + } + if is_x86_feature_detected!("avx2") { + return heel_dot_avx2 as HeelF64x8DotFn; + } + } + heel_dot_scalar as HeelF64x8DotFn +}); + +/// Compute weighted dot product of 8 HEEL plane distances. +/// +/// `distances[i]` = distance for HEEL plane i. +/// `weights[i]` = importance weight for plane i. +/// Returns: Σ(distances[i] × weights[i]). +/// +/// One SIMD pass on AVX-512 (single `vmulpd` + `vreducepd`). +/// Two passes on AVX2. Scalar fallback for non-x86. +#[inline] +pub fn heel_weighted_distance(distances: &[f64; 8], weights: &[f64; 8]) -> f64 { + unsafe { HEEL_DOT_KERNEL(distances, weights) } +} + +/// Compute L1-like distance across 8 HEEL planes. +/// +/// For each plane i: distance[i] = popcount(a[i] XOR b[i]) as f64. +/// Then weighted sum via F64x8 dot product. +/// +/// This converts binary Hamming distances to f64 for weighted combination, +/// where each plane's contribution is scaled by expert importance. +pub fn heel_plane_distances(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8] { + let mut dists = [0.0f64; 8]; + for i in 0..8 { + dists[i] = (a[i] ^ b[i]).count_ones() as f64; + } + dists +} + +/// Full pipeline: 8 HEEL planes → Hamming per plane → weighted F64x8 dot → scalar distance. +#[inline] +pub fn heel_weighted_hamming( + a_planes: &[u64; 8], + b_planes: &[u64; 8], + weights: &[f64; 8], +) -> f64 { + let dists = heel_plane_distances(a_planes, b_planes); + heel_weighted_distance(&dists, weights) +} + +/// Uniform weights (all planes equal). +pub const UNIFORM_WEIGHTS: [f64; 8] = [1.0; 8]; + +/// HEEL-weighted (7 constructive + 1 contradiction at reduced weight). +/// Contradiction plane (index 7) gets 0.5× weight. +pub const HEEL_7PLUS1_WEIGHTS: [f64; 8] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5]; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn dot_product_basic() { + let a = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; + let b = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]; + let result = heel_weighted_distance(&a, &b); + assert!((result - 36.0).abs() < 1e-10, "1+2+3+4+5+6+7+8 = 36, got {}", result); + } + + #[test] + fn dot_product_weighted() { + let distances = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0]; + let weights = [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5]; + let result = heel_weighted_distance(&distances, &weights); + assert!((result - 60.0).abs() < 1e-10, "10×2 + 80×0.5 = 60, got {}", result); + } + + #[test] + fn plane_distances_self_zero() { + let planes = [0x1234u64; 8]; + let dists = heel_plane_distances(&planes, &planes); + for d in &dists { + assert_eq!(*d, 0.0); + } + } + + #[test] + fn plane_distances_opposite() { + let a = [0u64; 8]; + let b = [u64::MAX; 8]; + let dists = heel_plane_distances(&a, &b); + for d in &dists { + assert_eq!(*d, 64.0); + } + } + + #[test] + fn full_pipeline_uniform() { + let a = [0xFFFF_0000_FFFF_0000u64; 8]; + let b = [0x0000_FFFF_0000_FFFFu64; 8]; + let d = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS); + // Each plane: all bits differ = 64 + assert!((d - 64.0 * 8.0).abs() < 1e-10, "8 planes × 64 bits = 512, got {}", d); + } + + #[test] + fn seven_plus_one_weights() { + let a = [0u64; 8]; + let b = [u64::MAX; 8]; + let d_uniform = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS); + let d_7plus1 = heel_weighted_hamming(&a, &b, &HEEL_7PLUS1_WEIGHTS); + // 7+1: plane 7 at 0.5× = 7×64 + 0.5×64 = 480 vs 512 + assert!((d_uniform - 512.0).abs() < 1e-10); + assert!((d_7plus1 - 480.0).abs() < 1e-10, "7×64 + 0.5×64 = 480, got {}", d_7plus1); + } +} diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index 3533baae..0796e56a 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -52,6 +52,8 @@ pub mod node; #[allow(missing_docs)] pub mod cascade; #[allow(missing_docs)] +pub mod heel_f64x8; +#[allow(missing_docs)] pub mod bf16_truth; #[allow(missing_docs)] pub mod causality; From 256b23d589798d76e0705dfd6e65c3ca448f50ac Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 3 Apr 2026 13:44:06 +0000 Subject: [PATCH 3/3] refactor: heel_f64x8 uses crate::simd::F64x8 polyfill, add SIMD cosine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace raw std::arch intrinsics with crate::simd::F64x8 polyfill. Automatic dispatch: AVX-512 (native __m512d) → AVX2 (2×__m256d) → scalar. Consumer writes crate::simd::F64x8 — polyfill handles tier selection. Added SIMD cosine kernels using F64x8 FMA: cosine_f64_simd() — single-pass dot + norm_a + norm_b via F64x8 cosine_f32_to_f64_simd() — f32 input, f64 precision cosine dot_f64_simd() — F64x8 FMA dot product on f64 slices sum_sq_f64_simd() — F64x8 sum of squares 12 tests passing (6 HEEL + 6 cosine). https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp --- src/hpc/heel_f64x8.rs | 312 ++++++++++++++++++++++++++++++------------ 1 file changed, 221 insertions(+), 91 deletions(-) diff --git a/src/hpc/heel_f64x8.rs b/src/hpc/heel_f64x8.rs index a9940e8d..2cb57cad 100644 --- a/src/hpc/heel_f64x8.rs +++ b/src/hpc/heel_f64x8.rs @@ -3,73 +3,13 @@ //! p64 has 8 HEEL planes (u64 each). For weighted f64 distance computation, //! each plane produces one f64 distance value → 8 values = one F64x8 register. //! -//! This module provides the SIMD kernel; p64-bridge calls it. -//! ndarray = hardware acceleration, consumers use the kernel. -//! -//! Dispatch: AVX-512 (native __m512d) → AVX2 (2×__m256d) → scalar. -//! LazyLock selects at startup. - -use std::sync::LazyLock; - -/// Kernel signature: 8 distances in, weighted sum out. -/// `distances`: 8 f64 values (one per HEEL plane). -/// `weights`: 8 f64 weights (per-expert importance). -/// Returns: weighted sum = Σ(distance[i] × weight[i]). -type HeelF64x8DotFn = unsafe fn(&[f64; 8], &[f64; 8]) -> f64; - -#[cfg(target_arch = "x86_64")] -#[target_feature(enable = "avx512f")] -unsafe fn heel_dot_avx512(a: &[f64; 8], b: &[f64; 8]) -> f64 { - use std::arch::x86_64::*; - let va = _mm512_loadu_pd(a.as_ptr()); - let vb = _mm512_loadu_pd(b.as_ptr()); - let prod = _mm512_mul_pd(va, vb); - _mm512_reduce_add_pd(prod) -} - -#[cfg(target_arch = "x86_64")] -#[target_feature(enable = "avx2")] -unsafe fn heel_dot_avx2(a: &[f64; 8], b: &[f64; 8]) -> f64 { - use std::arch::x86_64::*; - // 2 passes of 4 lanes - let va0 = _mm256_loadu_pd(a.as_ptr()); - let vb0 = _mm256_loadu_pd(b.as_ptr()); - let p0 = _mm256_mul_pd(va0, vb0); - - let va1 = _mm256_loadu_pd(a[4..].as_ptr()); - let vb1 = _mm256_loadu_pd(b[4..].as_ptr()); - let p1 = _mm256_mul_pd(va1, vb1); - - let sum = _mm256_add_pd(p0, p1); - // Horizontal sum of 4 f64 - let hi = _mm256_extractf128_pd(sum, 1); - let lo = _mm256_castpd256_pd128(sum); - let pair = _mm_add_pd(lo, hi); - let hi64 = _mm_unpackhi_pd(pair, pair); - let result = _mm_add_sd(pair, hi64); - _mm_cvtsd_f64(result) -} - -fn heel_dot_scalar(a: &[f64; 8], b: &[f64; 8]) -> f64 { - let mut sum = 0.0f64; - for i in 0..8 { - sum += a[i] * b[i]; - } - sum -} +//! Uses `crate::simd::F64x8` polyfill — automatic dispatch: +//! AVX-512: native __m512d (one register) +//! AVX2: 2× __m256d (two registers, same API) +//! Scalar: [f64; 8] fallback +//! Consumer writes `crate::simd::F64x8`. The polyfill handles the rest. -static HEEL_DOT_KERNEL: LazyLock = LazyLock::new(|| { - #[cfg(target_arch = "x86_64")] - { - if is_x86_feature_detected!("avx512f") { - return heel_dot_avx512 as HeelF64x8DotFn; - } - if is_x86_feature_detected!("avx2") { - return heel_dot_avx2 as HeelF64x8DotFn; - } - } - heel_dot_scalar as HeelF64x8DotFn -}); +use crate::simd::F64x8; /// Compute weighted dot product of 8 HEEL plane distances. /// @@ -77,20 +17,20 @@ static HEEL_DOT_KERNEL: LazyLock = LazyLock::new(|| { /// `weights[i]` = importance weight for plane i. /// Returns: Σ(distances[i] × weights[i]). /// -/// One SIMD pass on AVX-512 (single `vmulpd` + `vreducepd`). -/// Two passes on AVX2. Scalar fallback for non-x86. +/// One F64x8 multiply + reduce_sum. On AVX-512: single vmulpd + vreducepd. +/// On AVX2: 2× vmulpd + 2× haddpd. Scalar: 8 multiplies + sum. #[inline] pub fn heel_weighted_distance(distances: &[f64; 8], weights: &[f64; 8]) -> f64 { - unsafe { HEEL_DOT_KERNEL(distances, weights) } + let vd = F64x8::from_slice(distances); + let vw = F64x8::from_slice(weights); + (vd * vw).reduce_sum() } /// Compute L1-like distance across 8 HEEL planes. /// /// For each plane i: distance[i] = popcount(a[i] XOR b[i]) as f64. -/// Then weighted sum via F64x8 dot product. -/// -/// This converts binary Hamming distances to f64 for weighted combination, -/// where each plane's contribution is scaled by expert importance. +/// This is Hamming on binary HEEL planes — valid because HEEL planes +/// ARE uniform binary data (unlike bgz17 i16 which must use L1). pub fn heel_plane_distances(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8] { let mut dists = [0.0f64; 8]; for i in 0..8 { @@ -99,7 +39,7 @@ pub fn heel_plane_distances(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8] { dists } -/// Full pipeline: 8 HEEL planes → Hamming per plane → weighted F64x8 dot → scalar distance. +/// Full pipeline: 8 HEEL planes → Hamming per plane → weighted F64x8 dot → scalar. #[inline] pub fn heel_weighted_hamming( a_planes: &[u64; 8], @@ -117,20 +57,151 @@ pub const UNIFORM_WEIGHTS: [f64; 8] = [1.0; 8]; /// Contradiction plane (index 7) gets 0.5× weight. pub const HEEL_7PLUS1_WEIGHTS: [f64; 8] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5]; +// ═══════════════════════════════════════════════════════════════════════════ +// SIMD cosine similarity via F64x8 — for CLAM cosine clustering +// ═══════════════════════════════════════════════════════════════════════════ + +/// SIMD dot product on f64 slices via F64x8. +/// +/// Processes 8 elements per iteration. Remainder handled scalar. +/// Used by cosine_simd as the inner kernel. +pub fn dot_f64_simd(a: &[f64], b: &[f64]) -> f64 { + let n = a.len().min(b.len()); + let chunks = n / 8; + let remainder = n % 8; + + let mut acc = F64x8::splat(0.0); + for i in 0..chunks { + let va = F64x8::from_slice(&a[i * 8..]); + let vb = F64x8::from_slice(&b[i * 8..]); + acc = va.mul_add(vb, acc); // acc = va * vb + acc (FMA) + } + let mut sum = acc.reduce_sum(); + + // Scalar remainder + let offset = chunks * 8; + for i in 0..remainder { + sum += a[offset + i] * b[offset + i]; + } + sum +} + +/// SIMD sum of squares via F64x8. +pub fn sum_sq_f64_simd(a: &[f64]) -> f64 { + let n = a.len(); + let chunks = n / 8; + let remainder = n % 8; + + let mut acc = F64x8::splat(0.0); + for i in 0..chunks { + let va = F64x8::from_slice(&a[i * 8..]); + acc = va.mul_add(va, acc); // acc = va * va + acc + } + let mut sum = acc.reduce_sum(); + + let offset = chunks * 8; + for i in 0..remainder { + sum += a[offset + i] * a[offset + i]; + } + sum +} + +/// SIMD cosine similarity on f64 slices. +/// +/// Computes dot(a,b) / (||a|| × ||b||) using F64x8 FMA. +/// Single pass: accumulates dot, norm_a, norm_b simultaneously. +pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 { + let n = a.len().min(b.len()); + let chunks = n / 8; + let remainder = n % 8; + + let mut dot_acc = F64x8::splat(0.0); + let mut na_acc = F64x8::splat(0.0); + let mut nb_acc = F64x8::splat(0.0); + + for i in 0..chunks { + let va = F64x8::from_slice(&a[i * 8..]); + let vb = F64x8::from_slice(&b[i * 8..]); + dot_acc = va.mul_add(vb, dot_acc); // dot += a*b + na_acc = va.mul_add(va, na_acc); // na += a*a + nb_acc = vb.mul_add(vb, nb_acc); // nb += b*b + } + + let mut dot = dot_acc.reduce_sum(); + let mut na = na_acc.reduce_sum(); + let mut nb = nb_acc.reduce_sum(); + + let offset = chunks * 8; + for i in 0..remainder { + dot += a[offset + i] * b[offset + i]; + na += a[offset + i] * a[offset + i]; + nb += b[offset + i] * b[offset + i]; + } + + let denom = (na * nb).sqrt(); + if denom < 1e-12 { 0.0 } else { dot / denom } +} + +/// SIMD cosine similarity on f32 slices (converts to f64 internally for precision). +/// +/// For hot paths where input is f32 but you need f64 precision cosine. +/// Converts 8 f32 → 8 f64 per chunk via scalar widening, then F64x8 FMA. +pub fn cosine_f32_to_f64_simd(a: &[f32], b: &[f32]) -> f64 { + let n = a.len().min(b.len()); + let chunks = n / 8; + let remainder = n % 8; + + let mut dot_acc = F64x8::splat(0.0); + let mut na_acc = F64x8::splat(0.0); + let mut nb_acc = F64x8::splat(0.0); + + let mut buf_a = [0.0f64; 8]; + let mut buf_b = [0.0f64; 8]; + + for i in 0..chunks { + let off = i * 8; + for j in 0..8 { + buf_a[j] = a[off + j] as f64; + buf_b[j] = b[off + j] as f64; + } + let va = F64x8::from_slice(&buf_a); + let vb = F64x8::from_slice(&buf_b); + dot_acc = va.mul_add(vb, dot_acc); + na_acc = va.mul_add(va, na_acc); + nb_acc = vb.mul_add(vb, nb_acc); + } + + let mut dot = dot_acc.reduce_sum(); + let mut na = na_acc.reduce_sum(); + let mut nb = nb_acc.reduce_sum(); + + let offset = chunks * 8; + for i in 0..remainder { + let ai = a[offset + i] as f64; + let bi = b[offset + i] as f64; + dot += ai * bi; + na += ai * ai; + nb += bi * bi; + } + + let denom = (na * nb).sqrt(); + if denom < 1e-12 { 0.0 } else { dot / denom } +} + #[cfg(test)] mod tests { use super::*; #[test] - fn dot_product_basic() { + fn heel_dot_basic() { let a = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let b = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]; + let b = [1.0; 8]; let result = heel_weighted_distance(&a, &b); - assert!((result - 36.0).abs() < 1e-10, "1+2+3+4+5+6+7+8 = 36, got {}", result); + assert!((result - 36.0).abs() < 1e-10, "1+2+...+8 = 36, got {}", result); } #[test] - fn dot_product_weighted() { + fn heel_dot_weighted() { let distances = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0]; let weights = [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5]; let result = heel_weighted_distance(&distances, &weights); @@ -141,9 +212,7 @@ mod tests { fn plane_distances_self_zero() { let planes = [0x1234u64; 8]; let dists = heel_plane_distances(&planes, &planes); - for d in &dists { - assert_eq!(*d, 0.0); - } + for d in &dists { assert_eq!(*d, 0.0); } } #[test] @@ -151,9 +220,7 @@ mod tests { let a = [0u64; 8]; let b = [u64::MAX; 8]; let dists = heel_plane_distances(&a, &b); - for d in &dists { - assert_eq!(*d, 64.0); - } + for d in &dists { assert_eq!(*d, 64.0); } } #[test] @@ -161,18 +228,81 @@ mod tests { let a = [0xFFFF_0000_FFFF_0000u64; 8]; let b = [0x0000_FFFF_0000_FFFFu64; 8]; let d = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS); - // Each plane: all bits differ = 64 - assert!((d - 64.0 * 8.0).abs() < 1e-10, "8 planes × 64 bits = 512, got {}", d); + assert!((d - 512.0).abs() < 1e-10, "8×64 = 512, got {}", d); } #[test] fn seven_plus_one_weights() { let a = [0u64; 8]; let b = [u64::MAX; 8]; - let d_uniform = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS); - let d_7plus1 = heel_weighted_hamming(&a, &b, &HEEL_7PLUS1_WEIGHTS); - // 7+1: plane 7 at 0.5× = 7×64 + 0.5×64 = 480 vs 512 - assert!((d_uniform - 512.0).abs() < 1e-10); - assert!((d_7plus1 - 480.0).abs() < 1e-10, "7×64 + 0.5×64 = 480, got {}", d_7plus1); + let d = heel_weighted_hamming(&a, &b, &HEEL_7PLUS1_WEIGHTS); + assert!((d - 480.0).abs() < 1e-10, "7×64 + 0.5×64 = 480, got {}", d); + } + + // ── SIMD cosine tests ─────────────────────────────────────────── + + #[test] + fn cosine_identical() { + let a: Vec = (0..1024).map(|i| (i as f64 * 0.01).sin()).collect(); + let c = cosine_f64_simd(&a, &a); + assert!((c - 1.0).abs() < 1e-10, "self-cosine should be 1.0: {}", c); + } + + #[test] + fn cosine_opposite() { + let a: Vec = (0..256).map(|i| i as f64 * 0.1).collect(); + let b: Vec = a.iter().map(|v| -v).collect(); + let c = cosine_f64_simd(&a, &b); + assert!((c - (-1.0)).abs() < 1e-10, "opposite should be -1.0: {}", c); + } + + #[test] + fn cosine_orthogonal() { + let mut a = vec![0.0f64; 256]; + let mut b = vec![0.0f64; 256]; + a[0] = 1.0; + b[1] = 1.0; + let c = cosine_f64_simd(&a, &b); + assert!(c.abs() < 1e-10, "orthogonal should be 0.0: {}", c); + } + + #[test] + fn cosine_matches_scalar() { + let a: Vec = (0..333).map(|i| (i as f64 * 0.037).sin()).collect(); + let b: Vec = (0..333).map(|i| (i as f64 * 0.023).cos()).collect(); + + let simd_cos = cosine_f64_simd(&a, &b); + + // Scalar reference + let dot: f64 = a.iter().zip(&b).map(|(x, y)| x * y).sum(); + let na: f64 = a.iter().map(|x| x * x).sum(); + let nb: f64 = b.iter().map(|x| x * x).sum(); + let scalar_cos = dot / (na * nb).sqrt(); + + assert!((simd_cos - scalar_cos).abs() < 1e-10, + "SIMD {:.12} vs scalar {:.12}", simd_cos, scalar_cos); + } + + #[test] + fn cosine_f32_matches_f64() { + let a_f32: Vec = (0..500).map(|i| (i as f32 * 0.01).sin()).collect(); + let b_f32: Vec = (0..500).map(|i| (i as f32 * 0.02).cos()).collect(); + + let a_f64: Vec = a_f32.iter().map(|&v| v as f64).collect(); + let b_f64: Vec = b_f32.iter().map(|&v| v as f64).collect(); + + let cos_f64 = cosine_f64_simd(&a_f64, &b_f64); + let cos_f32 = cosine_f32_to_f64_simd(&a_f32, &b_f32); + + assert!((cos_f64 - cos_f32).abs() < 1e-6, + "f32 {:.10} vs f64 {:.10}", cos_f32, cos_f64); + } + + #[test] + fn dot_f64_simd_basic() { + let a = [1.0f64; 24]; + let b = [2.0f64; 24]; + let d = dot_f64_simd(&a, &b); + assert!((d - 48.0).abs() < 1e-10, "24×2 = 48, got {}", d); } }