From 226d7ce43d4b3243b5f123d45ed20a189553477d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 3 Apr 2026 13:16:23 +0000
Subject: [PATCH 1/3] feat: add pub accessors mu(), sigma(), observations() to
 Cascade
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consumers (bgz-tensor) need read access to Welford σ tracking state
for quarter-sigma band computation. Fields remain private; accessors
provide read-only access.

https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
---
 src/hpc/cascade.rs | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/hpc/cascade.rs b/src/hpc/cascade.rs
index 20645ef9..cc5f6535 100644
--- a/src/hpc/cascade.rs
+++ b/src/hpc/cascade.rs
@@ -97,6 +97,15 @@ pub struct Cascade {
 }
 
 impl Cascade {
+    /// Current distribution mean (Welford online estimate).
+    pub fn mu(&self) -> f64 { self.mu }
+
+    /// Current distribution standard deviation (Welford online estimate).
+    pub fn sigma(&self) -> f64 { self.sigma }
+
+    /// Number of observations processed.
+    pub fn observations(&self) -> usize { self.observations }
+
     pub fn from_threshold(threshold: u64, vec_bytes: usize) -> Self {
         Self { threshold, vec_bytes, mu: 0.0, sigma: 0.0, observations: 0 }
     }

From 5277b2949cf44ae732ff5a9645ce9f980f67bdbc Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 3 Apr 2026 13:26:07 +0000
Subject: [PATCH 2/3] =?UTF-8?q?feat:=20heel=5Ff64x8=20=E2=80=94=20F64x8=20?=
 =?UTF-8?q?SIMD=20kernel=20for=208=20HEEL=20plane=20weighted=20distance?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

8 HEEL planes × 1 distance each = 8 f64 = one F64x8 register.
LazyLock dispatch: AVX-512 (vmulpd + vreducepd) → AVX2 (2×vmulpd) → scalar.

Functions:
  heel_weighted_distance(&[f64;8], &[f64;8]) → f64  (weighted dot)
  heel_plane_distances(&[u64;8], &[u64;8]) → [f64;8] (Hamming per plane)
  heel_weighted_hamming(a, b, weights) → f64  (full pipeline)

Predefined weights:
  UNIFORM_WEIGHTS = [1.0; 8]
  HEEL_7PLUS1_WEIGHTS = [1,1,1,1,1,1,1, 0.5] (contradiction at half)

6 tests passing.

https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
---
 src/hpc/heel_f64x8.rs | 178 ++++++++++++++++++++++++++++++++++++++++++
 src/hpc/mod.rs        |   2 +
 2 files changed, 180 insertions(+)
 create mode 100644 src/hpc/heel_f64x8.rs

diff --git a/src/hpc/heel_f64x8.rs b/src/hpc/heel_f64x8.rs
new file mode 100644
index 00000000..a9940e8d
--- /dev/null
+++ b/src/hpc/heel_f64x8.rs
@@ -0,0 +1,178 @@
+//! F64x8 HEEL distance: 8 f64 distances across 8 HEEL planes in one SIMD pass.
+//!
+//! p64 has 8 HEEL planes (u64 each). For weighted f64 distance computation,
+//! each plane produces one f64 distance value → 8 values = one F64x8 register.
+//!
+//! This module provides the SIMD kernel; p64-bridge calls it.
+//! ndarray = hardware acceleration, consumers use the kernel.
+//!
+//! Dispatch: AVX-512 (native __m512d) → AVX2 (2×__m256d) → scalar.
+//! LazyLock selects at startup.
+
+use std::sync::LazyLock;
+
+/// Kernel signature: 8 distances in, weighted sum out.
+/// `distances`: 8 f64 values (one per HEEL plane).
+/// `weights`: 8 f64 weights (per-expert importance).
+/// Returns: weighted sum = Σ(distance[i] × weight[i]).
+type HeelF64x8DotFn = unsafe fn(&[f64; 8], &[f64; 8]) -> f64;
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+unsafe fn heel_dot_avx512(a: &[f64; 8], b: &[f64; 8]) -> f64 {
+    use std::arch::x86_64::*;
+    let va = _mm512_loadu_pd(a.as_ptr());
+    let vb = _mm512_loadu_pd(b.as_ptr());
+    let prod = _mm512_mul_pd(va, vb);
+    _mm512_reduce_add_pd(prod)
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+unsafe fn heel_dot_avx2(a: &[f64; 8], b: &[f64; 8]) -> f64 {
+    use std::arch::x86_64::*;
+    // 2 passes of 4 lanes
+    let va0 = _mm256_loadu_pd(a.as_ptr());
+    let vb0 = _mm256_loadu_pd(b.as_ptr());
+    let p0 = _mm256_mul_pd(va0, vb0);
+
+    let va1 = _mm256_loadu_pd(a[4..].as_ptr());
+    let vb1 = _mm256_loadu_pd(b[4..].as_ptr());
+    let p1 = _mm256_mul_pd(va1, vb1);
+
+    let sum = _mm256_add_pd(p0, p1);
+    // Horizontal sum of 4 f64
+    let hi = _mm256_extractf128_pd(sum, 1);
+    let lo = _mm256_castpd256_pd128(sum);
+    let pair = _mm_add_pd(lo, hi);
+    let hi64 = _mm_unpackhi_pd(pair, pair);
+    let result = _mm_add_sd(pair, hi64);
+    _mm_cvtsd_f64(result)
+}
+
+fn heel_dot_scalar(a: &[f64; 8], b: &[f64; 8]) -> f64 {
+    let mut sum = 0.0f64;
+    for i in 0..8 {
+        sum += a[i] * b[i];
+    }
+    sum
+}
+
+static HEEL_DOT_KERNEL: LazyLock<HeelF64x8DotFn> = LazyLock::new(|| {
+    #[cfg(target_arch = "x86_64")]
+    {
+        if is_x86_feature_detected!("avx512f") {
+            return heel_dot_avx512 as HeelF64x8DotFn;
+        }
+        if is_x86_feature_detected!("avx2") {
+            return heel_dot_avx2 as HeelF64x8DotFn;
+        }
+    }
+    heel_dot_scalar as HeelF64x8DotFn
+});
+
+/// Compute weighted dot product of 8 HEEL plane distances.
+///
+/// `distances[i]` = distance for HEEL plane i.
+/// `weights[i]` = importance weight for plane i.
+/// Returns: Σ(distances[i] × weights[i]).
+///
+/// One SIMD pass on AVX-512 (single `vmulpd` + `vreducepd`).
+/// Two passes on AVX2. Scalar fallback for non-x86.
+#[inline]
+pub fn heel_weighted_distance(distances: &[f64; 8], weights: &[f64; 8]) -> f64 {
+    unsafe { HEEL_DOT_KERNEL(distances, weights) }
+}
+
+/// Compute L1-like distance across 8 HEEL planes.
+///
+/// For each plane i: distance[i] = popcount(a[i] XOR b[i]) as f64.
+/// Then weighted sum via F64x8 dot product.
+///
+/// This converts binary Hamming distances to f64 for weighted combination,
+/// where each plane's contribution is scaled by expert importance.
+pub fn heel_plane_distances(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8] {
+    let mut dists = [0.0f64; 8];
+    for i in 0..8 {
+        dists[i] = (a[i] ^ b[i]).count_ones() as f64;
+    }
+    dists
+}
+
+/// Full pipeline: 8 HEEL planes → Hamming per plane → weighted F64x8 dot → scalar distance.
+#[inline]
+pub fn heel_weighted_hamming(
+    a_planes: &[u64; 8],
+    b_planes: &[u64; 8],
+    weights: &[f64; 8],
+) -> f64 {
+    let dists = heel_plane_distances(a_planes, b_planes);
+    heel_weighted_distance(&dists, weights)
+}
+
+/// Uniform weights (all planes equal).
+pub const UNIFORM_WEIGHTS: [f64; 8] = [1.0; 8];
+
+/// HEEL-weighted (7 constructive + 1 contradiction at reduced weight).
+/// Contradiction plane (index 7) gets 0.5× weight.
+pub const HEEL_7PLUS1_WEIGHTS: [f64; 8] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn dot_product_basic() {
+        let a = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+        let b = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
+        let result = heel_weighted_distance(&a, &b);
+        assert!((result - 36.0).abs() < 1e-10, "1+2+3+4+5+6+7+8 = 36, got {}", result);
+    }
+
+    #[test]
+    fn dot_product_weighted() {
+        let distances = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0];
+        let weights = [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5];
+        let result = heel_weighted_distance(&distances, &weights);
+        assert!((result - 60.0).abs() < 1e-10, "10×2 + 80×0.5 = 60, got {}", result);
+    }
+
+    #[test]
+    fn plane_distances_self_zero() {
+        let planes = [0x1234u64; 8];
+        let dists = heel_plane_distances(&planes, &planes);
+        for d in &dists {
+            assert_eq!(*d, 0.0);
+        }
+    }
+
+    #[test]
+    fn plane_distances_opposite() {
+        let a = [0u64; 8];
+        let b = [u64::MAX; 8];
+        let dists = heel_plane_distances(&a, &b);
+        for d in &dists {
+            assert_eq!(*d, 64.0);
+        }
+    }
+
+    #[test]
+    fn full_pipeline_uniform() {
+        let a = [0xFFFF_0000_FFFF_0000u64; 8];
+        let b = [0x0000_FFFF_0000_FFFFu64; 8];
+        let d = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS);
+        // Each plane: all bits differ = 64
+        assert!((d - 64.0 * 8.0).abs() < 1e-10, "8 planes × 64 bits = 512, got {}", d);
+    }
+
+    #[test]
+    fn seven_plus_one_weights() {
+        let a = [0u64; 8];
+        let b = [u64::MAX; 8];
+        let d_uniform = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS);
+        let d_7plus1 = heel_weighted_hamming(&a, &b, &HEEL_7PLUS1_WEIGHTS);
+        // 7+1: plane 7 at 0.5× = 7×64 + 0.5×64 = 480 vs 512
+        assert!((d_uniform - 512.0).abs() < 1e-10);
+        assert!((d_7plus1 - 480.0).abs() < 1e-10, "7×64 + 0.5×64 = 480, got {}", d_7plus1);
+    }
+}
diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs
index 3533baae..0796e56a 100644
--- a/src/hpc/mod.rs
+++ b/src/hpc/mod.rs
@@ -52,6 +52,8 @@ pub mod node;
 #[allow(missing_docs)]
 pub mod cascade;
 #[allow(missing_docs)]
+pub mod heel_f64x8;
+#[allow(missing_docs)]
 pub mod bf16_truth;
 #[allow(missing_docs)]
 pub mod causality;

From 256b23d589798d76e0705dfd6e65c3ca448f50ac Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 3 Apr 2026 13:44:06 +0000
Subject: [PATCH 3/3] refactor: heel_f64x8 uses crate::simd::F64x8 polyfill,
 add SIMD cosine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace raw std::arch intrinsics with crate::simd::F64x8 polyfill.
Automatic dispatch: AVX-512 (native __m512d) → AVX2 (2×__m256d) → scalar.
Consumer writes crate::simd::F64x8 — polyfill handles tier selection.

Added SIMD cosine kernels using F64x8 FMA:
  cosine_f64_simd() — single-pass dot + norm_a + norm_b via F64x8
  cosine_f32_to_f64_simd() — f32 input, f64 precision cosine
  dot_f64_simd() — F64x8 FMA dot product on f64 slices
  sum_sq_f64_simd() — F64x8 sum of squares

12 tests passing (6 HEEL + 6 cosine).

https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
---
 src/hpc/heel_f64x8.rs | 312 ++++++++++++++++++++++++++++++------------
 1 file changed, 221 insertions(+), 91 deletions(-)

diff --git a/src/hpc/heel_f64x8.rs b/src/hpc/heel_f64x8.rs
index a9940e8d..2cb57cad 100644
--- a/src/hpc/heel_f64x8.rs
+++ b/src/hpc/heel_f64x8.rs
@@ -3,73 +3,13 @@
 //! p64 has 8 HEEL planes (u64 each). For weighted f64 distance computation,
 //! each plane produces one f64 distance value → 8 values = one F64x8 register.
 //!
-//! This module provides the SIMD kernel; p64-bridge calls it.
-//! ndarray = hardware acceleration, consumers use the kernel.
-//!
-//! Dispatch: AVX-512 (native __m512d) → AVX2 (2×__m256d) → scalar.
-//! LazyLock selects at startup.
-
-use std::sync::LazyLock;
-
-/// Kernel signature: 8 distances in, weighted sum out.
-/// `distances`: 8 f64 values (one per HEEL plane).
-/// `weights`: 8 f64 weights (per-expert importance).
-/// Returns: weighted sum = Σ(distance[i] × weight[i]).
-type HeelF64x8DotFn = unsafe fn(&[f64; 8], &[f64; 8]) -> f64;
-
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx512f")]
-unsafe fn heel_dot_avx512(a: &[f64; 8], b: &[f64; 8]) -> f64 {
-    use std::arch::x86_64::*;
-    let va = _mm512_loadu_pd(a.as_ptr());
-    let vb = _mm512_loadu_pd(b.as_ptr());
-    let prod = _mm512_mul_pd(va, vb);
-    _mm512_reduce_add_pd(prod)
-}
-
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx2")]
-unsafe fn heel_dot_avx2(a: &[f64; 8], b: &[f64; 8]) -> f64 {
-    use std::arch::x86_64::*;
-    // 2 passes of 4 lanes
-    let va0 = _mm256_loadu_pd(a.as_ptr());
-    let vb0 = _mm256_loadu_pd(b.as_ptr());
-    let p0 = _mm256_mul_pd(va0, vb0);
-
-    let va1 = _mm256_loadu_pd(a[4..].as_ptr());
-    let vb1 = _mm256_loadu_pd(b[4..].as_ptr());
-    let p1 = _mm256_mul_pd(va1, vb1);
-
-    let sum = _mm256_add_pd(p0, p1);
-    // Horizontal sum of 4 f64
-    let hi = _mm256_extractf128_pd(sum, 1);
-    let lo = _mm256_castpd256_pd128(sum);
-    let pair = _mm_add_pd(lo, hi);
-    let hi64 = _mm_unpackhi_pd(pair, pair);
-    let result = _mm_add_sd(pair, hi64);
-    _mm_cvtsd_f64(result)
-}
-
-fn heel_dot_scalar(a: &[f64; 8], b: &[f64; 8]) -> f64 {
-    let mut sum = 0.0f64;
-    for i in 0..8 {
-        sum += a[i] * b[i];
-    }
-    sum
-}
+//! Uses `crate::simd::F64x8` polyfill — automatic dispatch:
+//!   AVX-512: native __m512d (one register)
+//!   AVX2:    2× __m256d (two registers, same API)
+//!   Scalar:  [f64; 8] fallback
+//! Consumer writes `crate::simd::F64x8`. The polyfill handles the rest.
 
-static HEEL_DOT_KERNEL: LazyLock<HeelF64x8DotFn> = LazyLock::new(|| {
-    #[cfg(target_arch = "x86_64")]
-    {
-        if is_x86_feature_detected!("avx512f") {
-            return heel_dot_avx512 as HeelF64x8DotFn;
-        }
-        if is_x86_feature_detected!("avx2") {
-            return heel_dot_avx2 as HeelF64x8DotFn;
-        }
-    }
-    heel_dot_scalar as HeelF64x8DotFn
-});
+use crate::simd::F64x8;
 
 /// Compute weighted dot product of 8 HEEL plane distances.
 ///
@@ -77,20 +17,20 @@ static HEEL_DOT_KERNEL: LazyLock<HeelF64x8DotFn> = LazyLock::new(|| {
 /// `weights[i]` = importance weight for plane i.
 /// Returns: Σ(distances[i] × weights[i]).
 ///
-/// One SIMD pass on AVX-512 (single `vmulpd` + `vreducepd`).
-/// Two passes on AVX2. Scalar fallback for non-x86.
+/// One F64x8 multiply + reduce_sum. On AVX-512: single vmulpd + vreducepd.
+/// On AVX2: 2× vmulpd + 2× haddpd. Scalar: 8 multiplies + sum.
 #[inline]
 pub fn heel_weighted_distance(distances: &[f64; 8], weights: &[f64; 8]) -> f64 {
-    unsafe { HEEL_DOT_KERNEL(distances, weights) }
+    let vd = F64x8::from_slice(distances);
+    let vw = F64x8::from_slice(weights);
+    (vd * vw).reduce_sum()
 }
 
 /// Compute L1-like distance across 8 HEEL planes.
 ///
 /// For each plane i: distance[i] = popcount(a[i] XOR b[i]) as f64.
-/// Then weighted sum via F64x8 dot product.
-///
-/// This converts binary Hamming distances to f64 for weighted combination,
-/// where each plane's contribution is scaled by expert importance.
+/// This is Hamming on binary HEEL planes — valid because HEEL planes
+/// ARE uniform binary data (unlike bgz17 i16 which must use L1).
 pub fn heel_plane_distances(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8] {
     let mut dists = [0.0f64; 8];
     for i in 0..8 {
@@ -99,7 +39,7 @@ pub fn heel_plane_distances(a: &[u64; 8], b: &[u64; 8]) -> [f64; 8] {
     dists
 }
 
-/// Full pipeline: 8 HEEL planes → Hamming per plane → weighted F64x8 dot → scalar distance.
+/// Full pipeline: 8 HEEL planes → Hamming per plane → weighted F64x8 dot → scalar.
 #[inline]
 pub fn heel_weighted_hamming(
     a_planes: &[u64; 8],
@@ -117,20 +57,151 @@ pub const UNIFORM_WEIGHTS: [f64; 8] = [1.0; 8];
 /// Contradiction plane (index 7) gets 0.5× weight.
 pub const HEEL_7PLUS1_WEIGHTS: [f64; 8] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5];
 
+// ═══════════════════════════════════════════════════════════════════════════
+// SIMD cosine similarity via F64x8 — for CLAM cosine clustering
+// ═══════════════════════════════════════════════════════════════════════════
+
+/// SIMD dot product on f64 slices via F64x8.
+///
+/// Processes 8 elements per iteration. Remainder handled scalar.
+/// Used by cosine_simd as the inner kernel.
+pub fn dot_f64_simd(a: &[f64], b: &[f64]) -> f64 {
+    let n = a.len().min(b.len());
+    let chunks = n / 8;
+    let remainder = n % 8;
+
+    let mut acc = F64x8::splat(0.0);
+    for i in 0..chunks {
+        let va = F64x8::from_slice(&a[i * 8..]);
+        let vb = F64x8::from_slice(&b[i * 8..]);
+        acc = va.mul_add(vb, acc); // acc = va * vb + acc (FMA)
+    }
+    let mut sum = acc.reduce_sum();
+
+    // Scalar remainder
+    let offset = chunks * 8;
+    for i in 0..remainder {
+        sum += a[offset + i] * b[offset + i];
+    }
+    sum
+}
+
+/// SIMD sum of squares via F64x8.
+pub fn sum_sq_f64_simd(a: &[f64]) -> f64 {
+    let n = a.len();
+    let chunks = n / 8;
+    let remainder = n % 8;
+
+    let mut acc = F64x8::splat(0.0);
+    for i in 0..chunks {
+        let va = F64x8::from_slice(&a[i * 8..]);
+        acc = va.mul_add(va, acc); // acc = va * va + acc
+    }
+    let mut sum = acc.reduce_sum();
+
+    let offset = chunks * 8;
+    for i in 0..remainder {
+        sum += a[offset + i] * a[offset + i];
+    }
+    sum
+}
+
+/// SIMD cosine similarity on f64 slices.
+///
+/// Computes dot(a,b) / (||a|| × ||b||) using F64x8 FMA.
+/// Single pass: accumulates dot, norm_a, norm_b simultaneously.
+pub fn cosine_f64_simd(a: &[f64], b: &[f64]) -> f64 {
+    let n = a.len().min(b.len());
+    let chunks = n / 8;
+    let remainder = n % 8;
+
+    let mut dot_acc = F64x8::splat(0.0);
+    let mut na_acc = F64x8::splat(0.0);
+    let mut nb_acc = F64x8::splat(0.0);
+
+    for i in 0..chunks {
+        let va = F64x8::from_slice(&a[i * 8..]);
+        let vb = F64x8::from_slice(&b[i * 8..]);
+        dot_acc = va.mul_add(vb, dot_acc);  // dot += a*b
+        na_acc = va.mul_add(va, na_acc);    // na += a*a
+        nb_acc = vb.mul_add(vb, nb_acc);    // nb += b*b
+    }
+
+    let mut dot = dot_acc.reduce_sum();
+    let mut na = na_acc.reduce_sum();
+    let mut nb = nb_acc.reduce_sum();
+
+    let offset = chunks * 8;
+    for i in 0..remainder {
+        dot += a[offset + i] * b[offset + i];
+        na += a[offset + i] * a[offset + i];
+        nb += b[offset + i] * b[offset + i];
+    }
+
+    let denom = (na * nb).sqrt();
+    if denom < 1e-12 { 0.0 } else { dot / denom }
+}
+
+/// SIMD cosine similarity on f32 slices (converts to f64 internally for precision).
+///
+/// For hot paths where input is f32 but you need f64 precision cosine.
+/// Converts 8 f32 → 8 f64 per chunk via scalar widening, then F64x8 FMA.
+pub fn cosine_f32_to_f64_simd(a: &[f32], b: &[f32]) -> f64 {
+    let n = a.len().min(b.len());
+    let chunks = n / 8;
+    let remainder = n % 8;
+
+    let mut dot_acc = F64x8::splat(0.0);
+    let mut na_acc = F64x8::splat(0.0);
+    let mut nb_acc = F64x8::splat(0.0);
+
+    let mut buf_a = [0.0f64; 8];
+    let mut buf_b = [0.0f64; 8];
+
+    for i in 0..chunks {
+        let off = i * 8;
+        for j in 0..8 {
+            buf_a[j] = a[off + j] as f64;
+            buf_b[j] = b[off + j] as f64;
+        }
+        let va = F64x8::from_slice(&buf_a);
+        let vb = F64x8::from_slice(&buf_b);
+        dot_acc = va.mul_add(vb, dot_acc);
+        na_acc = va.mul_add(va, na_acc);
+        nb_acc = vb.mul_add(vb, nb_acc);
+    }
+
+    let mut dot = dot_acc.reduce_sum();
+    let mut na = na_acc.reduce_sum();
+    let mut nb = nb_acc.reduce_sum();
+
+    let offset = chunks * 8;
+    for i in 0..remainder {
+        let ai = a[offset + i] as f64;
+        let bi = b[offset + i] as f64;
+        dot += ai * bi;
+        na += ai * ai;
+        nb += bi * bi;
+    }
+
+    let denom = (na * nb).sqrt();
+    if denom < 1e-12 { 0.0 } else { dot / denom }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn dot_product_basic() {
+    fn heel_dot_basic() {
         let a = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
-        let b = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
+        let b = [1.0; 8];
         let result = heel_weighted_distance(&a, &b);
-        assert!((result - 36.0).abs() < 1e-10, "1+2+3+4+5+6+7+8 = 36, got {}", result);
+        assert!((result - 36.0).abs() < 1e-10, "1+2+...+8 = 36, got {}", result);
     }
 
     #[test]
-    fn dot_product_weighted() {
+    fn heel_dot_weighted() {
         let distances = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0];
         let weights = [2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5];
         let result = heel_weighted_distance(&distances, &weights);
@@ -141,9 +212,7 @@ mod tests {
     fn plane_distances_self_zero() {
         let planes = [0x1234u64; 8];
         let dists = heel_plane_distances(&planes, &planes);
-        for d in &dists {
-            assert_eq!(*d, 0.0);
-        }
+        for d in &dists { assert_eq!(*d, 0.0); }
     }
 
     #[test]
@@ -151,9 +220,7 @@ mod tests {
         let a = [0u64; 8];
         let b = [u64::MAX; 8];
         let dists = heel_plane_distances(&a, &b);
-        for d in &dists {
-            assert_eq!(*d, 64.0);
-        }
+        for d in &dists { assert_eq!(*d, 64.0); }
     }
 
     #[test]
@@ -161,18 +228,81 @@ mod tests {
         let a = [0xFFFF_0000_FFFF_0000u64; 8];
         let b = [0x0000_FFFF_0000_FFFFu64; 8];
         let d = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS);
-        // Each plane: all bits differ = 64
-        assert!((d - 64.0 * 8.0).abs() < 1e-10, "8 planes × 64 bits = 512, got {}", d);
+        assert!((d - 512.0).abs() < 1e-10, "8×64 = 512, got {}", d);
     }
 
     #[test]
     fn seven_plus_one_weights() {
         let a = [0u64; 8];
         let b = [u64::MAX; 8];
-        let d_uniform = heel_weighted_hamming(&a, &b, &UNIFORM_WEIGHTS);
-        let d_7plus1 = heel_weighted_hamming(&a, &b, &HEEL_7PLUS1_WEIGHTS);
-        // 7+1: plane 7 at 0.5× = 7×64 + 0.5×64 = 480 vs 512
-        assert!((d_uniform - 512.0).abs() < 1e-10);
-        assert!((d_7plus1 - 480.0).abs() < 1e-10, "7×64 + 0.5×64 = 480, got {}", d_7plus1);
+        let d = heel_weighted_hamming(&a, &b, &HEEL_7PLUS1_WEIGHTS);
+        assert!((d - 480.0).abs() < 1e-10, "7×64 + 0.5×64 = 480, got {}", d);
+    }
+
+    // ── SIMD cosine tests ───────────────────────────────────────────
+
+    #[test]
+    fn cosine_identical() {
+        let a: Vec<f64> = (0..1024).map(|i| (i as f64 * 0.01).sin()).collect();
+        let c = cosine_f64_simd(&a, &a);
+        assert!((c - 1.0).abs() < 1e-10, "self-cosine should be 1.0: {}", c);
+    }
+
+    #[test]
+    fn cosine_opposite() {
+        let a: Vec<f64> = (0..256).map(|i| i as f64 * 0.1).collect();
+        let b: Vec<f64> = a.iter().map(|v| -v).collect();
+        let c = cosine_f64_simd(&a, &b);
+        assert!((c - (-1.0)).abs() < 1e-10, "opposite should be -1.0: {}", c);
+    }
+
+    #[test]
+    fn cosine_orthogonal() {
+        let mut a = vec![0.0f64; 256];
+        let mut b = vec![0.0f64; 256];
+        a[0] = 1.0;
+        b[1] = 1.0;
+        let c = cosine_f64_simd(&a, &b);
+        assert!(c.abs() < 1e-10, "orthogonal should be 0.0: {}", c);
+    }
+
+    #[test]
+    fn cosine_matches_scalar() {
+        let a: Vec<f64> = (0..333).map(|i| (i as f64 * 0.037).sin()).collect();
+        let b: Vec<f64> = (0..333).map(|i| (i as f64 * 0.023).cos()).collect();
+
+        let simd_cos = cosine_f64_simd(&a, &b);
+
+        // Scalar reference
+        let dot: f64 = a.iter().zip(&b).map(|(x, y)| x * y).sum();
+        let na: f64 = a.iter().map(|x| x * x).sum();
+        let nb: f64 = b.iter().map(|x| x * x).sum();
+        let scalar_cos = dot / (na * nb).sqrt();
+
+        assert!((simd_cos - scalar_cos).abs() < 1e-10,
+            "SIMD {:.12} vs scalar {:.12}", simd_cos, scalar_cos);
+    }
+
+    #[test]
+    fn cosine_f32_matches_f64() {
+        let a_f32: Vec<f32> = (0..500).map(|i| (i as f32 * 0.01).sin()).collect();
+        let b_f32: Vec<f32> = (0..500).map(|i| (i as f32 * 0.02).cos()).collect();
+
+        let a_f64: Vec<f64> = a_f32.iter().map(|&v| v as f64).collect();
+        let b_f64: Vec<f64> = b_f32.iter().map(|&v| v as f64).collect();
+
+        let cos_f64 = cosine_f64_simd(&a_f64, &b_f64);
+        let cos_f32 = cosine_f32_to_f64_simd(&a_f32, &b_f32);
+
+        assert!((cos_f64 - cos_f32).abs() < 1e-6,
+            "f32 {:.10} vs f64 {:.10}", cos_f32, cos_f64);
+    }
+
+    #[test]
+    fn dot_f64_simd_basic() {
+        let a = [1.0f64; 24];
+        let b = [2.0f64; 24];
+        let d = dot_f64_simd(&a, &b);
+        assert!((d - 48.0).abs() < 1e-10, "24×2 = 48, got {}", d);
     }
 }