diff --git a/crates/p64/src/lib.rs b/crates/p64/src/lib.rs index 270e4db7..b1308df2 100644 --- a/crates/p64/src/lib.rs +++ b/crates/p64/src/lib.rs @@ -178,21 +178,19 @@ type AttendFn = unsafe fn(&[u64; 64], u64, u8) -> (u8, u8, [u8; 64], u64); #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn attend_avx512(rows: &[u64; 64], query: u64, gamma: u8) -> (u8, u8, [u8; 64], u64) { - use std::arch::x86_64::*; let mut best_idx = 0u8; let mut best_score = 0u8; let mut scores = [0u8; 64]; let mut fires = 0u64; - let q = _mm512_set1_epi64(query as i64); // Process 8 rows per chunk, 8 chunks = 64 rows + // (scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4) for chunk in 0..8 { let base = chunk * 8; - // SAFETY: rows is [u64; 64], base..base+8 is in bounds, Palette64 is 64-byte aligned. - let r = _mm512_loadu_si512(rows[base..].as_ptr() as *const __m512i); - let anded = _mm512_and_si512(r, q); - // Extract 8 u64s and scalar popcount (no VPOPCNTDQ dependency) - let vals: [u64; 8] = std::mem::transmute(anded); + let mut vals = [0u64; 8]; + for j in 0..8 { + vals[j] = rows[base + j] & query; + } for j in 0..8 { let score = vals[j].count_ones() as u8; let idx = base + j; @@ -212,20 +210,19 @@ unsafe fn attend_avx512(rows: &[u64; 64], query: u64, gamma: u8) -> (u8, u8, [u8 #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn attend_avx2(rows: &[u64; 64], query: u64, gamma: u8) -> (u8, u8, [u8; 64], u64) { - use std::arch::x86_64::*; let mut best_idx = 0u8; let mut best_score = 0u8; let mut scores = [0u8; 64]; let mut fires = 0u64; - let q = _mm256_set1_epi64x(query as i64); // Process 4 rows per chunk, 16 chunks = 64 rows + // (scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4) for chunk in 0..16 { let base = chunk * 4; - // SAFETY: rows is [u64; 64], base..base+4 is in bounds. - let r = _mm256_loadu_si256(rows[base..].as_ptr() as *const __m256i); - let anded = _mm256_and_si256(r, q); - let vals: [u64; 4] = std::mem::transmute(anded); + let mut vals = [0u64; 4]; + for j in 0..4 { + vals[j] = rows[base + j] & query; + } for j in 0..4 { let score = vals[j].count_ones() as u8; let idx = base + j; @@ -284,15 +281,14 @@ type NearestKFn = unsafe fn(&[u64; 64], u64) -> [u8; 64]; #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn nearest_k_avx512(rows: &[u64; 64], query: u64) -> [u8; 64] { - use std::arch::x86_64::*; let mut dists = [0u8; 64]; - let q = _mm512_set1_epi64(query as i64); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 for chunk in 0..8 { let base = chunk * 8; - // SAFETY: rows is [u64; 64], base..base+8 is in bounds. - let r = _mm512_loadu_si512(rows[base..].as_ptr() as *const __m512i); - let xored = _mm512_xor_si512(r, q); - let vals: [u64; 8] = std::mem::transmute(xored); + let mut vals = [0u64; 8]; + for j in 0..8 { + vals[j] = rows[base + j] ^ query; + } for j in 0..8 { dists[base + j] = vals[j].count_ones() as u8; } @@ -303,15 +299,14 @@ unsafe fn nearest_k_avx512(rows: &[u64; 64], query: u64) -> [u8; 64] { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn nearest_k_avx2(rows: &[u64; 64], query: u64) -> [u8; 64] { - use std::arch::x86_64::*; let mut dists = [0u8; 64]; - let q = _mm256_set1_epi64x(query as i64); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 for chunk in 0..16 { let base = chunk * 4; - // SAFETY: rows is [u64; 64], base..base+4 is in bounds. 
- let r = _mm256_loadu_si256(rows[base..].as_ptr() as *const __m256i); - let xored = _mm256_xor_si256(r, q); - let vals: [u64; 4] = std::mem::transmute(xored); + let mut vals = [0u64; 4]; + for j in 0..4 { + vals[j] = rows[base + j] ^ query; + } for j in 0..4 { dists[base + j] = vals[j].count_ones() as u8; } @@ -350,13 +345,11 @@ type MoeGateFn = unsafe fn(&[u64; 8], u64, u8) -> (u8, [u8; 8], u64); #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn moe_gate_avx512(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, [u8; 8], u64) { - use std::arch::x86_64::*; - // Load all 8 planes into one zmm register, AND with broadcast query - // SAFETY: planes is [u64; 8] = 64 bytes, fits in one zmm. - let p = _mm512_loadu_si512(planes.as_ptr() as *const __m512i); - let q = _mm512_set1_epi64(query as i64); - let anded = _mm512_and_si512(p, q); - let vals: [u64; 8] = std::mem::transmute(anded); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 + let mut vals = [0u64; 8]; + for i in 0..8 { + vals[i] = planes[i] & query; + } let mut active = 0u8; let mut strength = [0u8; 8]; @@ -375,8 +368,7 @@ unsafe fn moe_gate_avx512(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn moe_gate_avx2(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, [u8; 8], u64) { - use std::arch::x86_64::*; - let q = _mm256_set1_epi64x(query as i64); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 let mut active = 0u8; let mut strength = [0u8; 8]; let mut combined = 0u64; @@ -384,10 +376,10 @@ unsafe fn moe_gate_avx2(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, [u // Process 4 planes at a time, 2 chunks = 8 planes for chunk in 0..2 { let base = chunk * 4; - // SAFETY: planes is [u64; 8], base..base+4 is in bounds. - let p = _mm256_loadu_si256(planes[base..].as_ptr() as *const __m256i); - let anded = _mm256_and_si256(p, q); - let vals: [u64; 4] = std::mem::transmute(anded); + let mut vals = [0u64; 4]; + for j in 0..4 { + vals[j] = planes[base + j] & query; + } for j in 0..4 { let score = vals[j].count_ones() as u8; let idx = base + j; diff --git a/src/hpc/aabb.rs b/src/hpc/aabb.rs index cf93da09..3f25fc81 100644 --- a/src/hpc/aabb.rs +++ b/src/hpc/aabb.rs @@ -168,8 +168,8 @@ fn aabb_intersect_batch_scalar(query: &Aabb, candidates: &[Aabb]) -> Vec { /// AVX-512 batch AABB intersection: tests 16 candidates per axis comparison. /// -/// Broadcasts query min/max per axis, gathers candidate coords into __m512, -/// compares all 16 at once using `_mm512_cmp_ps_mask`, ANDs the 6 comparison +/// Broadcasts query min/max per axis, gathers candidate coords into F32x16, +/// compares all 16 at once using `simd_le` / `simd_ge`, ANDs the 6 comparison /// masks. /// /// # Safety @@ -177,7 +177,7 @@ fn aabb_intersect_batch_scalar(query: &Aabb, candidates: &[Aabb]) -> Vec { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec { - use core::arch::x86_64::*; + use crate::simd::{F32x16, F32Mask16}; let mut result = Vec::with_capacity(candidates.len()); @@ -203,32 +203,30 @@ unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec< c_max_z[i] = cand.max[2]; } - // SAFETY: arrays are 16-element, avx512f checked by caller. 
- let v_c_min_x = _mm512_loadu_ps(c_min_x.as_ptr()); - let v_c_max_x = _mm512_loadu_ps(c_max_x.as_ptr()); - let v_c_min_y = _mm512_loadu_ps(c_min_y.as_ptr()); - let v_c_max_y = _mm512_loadu_ps(c_max_y.as_ptr()); - let v_c_min_z = _mm512_loadu_ps(c_min_z.as_ptr()); - let v_c_max_z = _mm512_loadu_ps(c_max_z.as_ptr()); + let v_c_min_x = F32x16::from_array(c_min_x); + let v_c_max_x = F32x16::from_array(c_max_x); + let v_c_min_y = F32x16::from_array(c_min_y); + let v_c_max_y = F32x16::from_array(c_max_y); + let v_c_min_z = F32x16::from_array(c_min_z); + let v_c_max_z = F32x16::from_array(c_max_z); // Broadcast query bounds - let q_min_x = _mm512_set1_ps(query.min[0]); - let q_max_x = _mm512_set1_ps(query.max[0]); - let q_min_y = _mm512_set1_ps(query.min[1]); - let q_max_y = _mm512_set1_ps(query.max[1]); - let q_min_z = _mm512_set1_ps(query.min[2]); - let q_max_z = _mm512_set1_ps(query.max[2]); + let q_min_x = F32x16::splat(query.min[0]); + let q_max_x = F32x16::splat(query.max[0]); + let q_min_y = F32x16::splat(query.min[1]); + let q_max_y = F32x16::splat(query.max[1]); + let q_min_z = F32x16::splat(query.min[2]); + let q_max_z = F32x16::splat(query.max[2]); // 6 intersection conditions: q.min[i] <= c.max[i] && q.max[i] >= c.min[i] - // _CMP_LE_OQ = 18, _CMP_GE_OQ = 29 (ordered, quiet) - let m1 = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(q_min_x, v_c_max_x); - let m2 = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(q_max_x, v_c_min_x); - let m3 = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(q_min_y, v_c_max_y); - let m4 = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(q_max_y, v_c_min_y); - let m5 = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(q_min_z, v_c_max_z); - let m6 = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(q_max_z, v_c_min_z); + let m1 = q_min_x.simd_le(v_c_max_x); + let m2 = q_max_x.simd_ge(v_c_min_x); + let m3 = q_min_y.simd_le(v_c_max_y); + let m4 = q_max_y.simd_ge(v_c_min_y); + let m5 = q_min_z.simd_le(v_c_max_z); + let m6 = q_max_z.simd_ge(v_c_min_z); - let all = m1 & m2 & m3 & m4 & m5 & m6; + let all = m1.0 & m2.0 & m3.0 & m4.0 & m5.0 & m6.0; for i in 0..16 { result.push((all >> i) & 1 != 0); @@ -246,24 +244,16 @@ unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec< #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse4.1")] unsafe fn aabb_intersect_batch_sse41(query: &Aabb, candidates: &[Aabb]) -> Vec { - use core::arch::x86_64::*; - - // Load query min/max into SSE registers (only need xyz, ignore w). - let q_min = _mm_set_ps(0.0, query.min[2], query.min[1], query.min[0]); - let q_max = _mm_set_ps(f32::MAX, query.max[2], query.max[1], query.max[0]); - + // Scalar per-candidate test — LLVM auto-vectorizes with target-cpu=x86-64-v4 let mut result = Vec::with_capacity(candidates.len()); for c in candidates { - let c_min = _mm_set_ps(0.0, c.min[2], c.min[1], c.min[0]); - let c_max = _mm_set_ps(f32::MAX, c.max[2], c.max[1], c.max[0]); - - // q.min <= c.max AND q.max >= c.min (per component) - let le = _mm_cmple_ps(q_min, c_max); // q_min[i] <= c_max[i] - let ge = _mm_cmpge_ps(q_max, c_min); // q_max[i] >= c_min[i] - let both = _mm_and_ps(le, ge); - // All 4 lanes must be true (lane 3 is always true due to sentinel values). 
- let mask = _mm_movemask_ps(both); - result.push(mask == 0xF); + let hit = query.min[0] <= c.max[0] + && query.max[0] >= c.min[0] + && query.min[1] <= c.max[1] + && query.max[1] >= c.min[1] + && query.min[2] <= c.max[2] + && query.max[2] >= c.min[2]; + result.push(hit); } result } @@ -333,27 +323,27 @@ fn ray_aabb_slab_test_scalar(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Vec) /// AVX-512 batch ray-AABB slab test: processes 16 AABBs per iteration. /// /// Broadcasts ray origin and inv_dir per axis, gathers candidate min/max -/// coords into SoA arrays, computes slab intervals with `_mm512_min_ps` / -/// `_mm512_max_ps`, and combines masks with `_mm512_cmp_ps_mask`. +/// coords into SoA arrays, computes slab intervals with `simd_min` / +/// `simd_max`, and combines masks with `simd_le` / `simd_ge`. /// /// # Safety /// Caller must ensure AVX-512F is available. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn ray_aabb_slab_test_avx512(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Vec) { - use core::arch::x86_64::*; + use crate::simd::F32x16; let mut hits = Vec::with_capacity(aabbs.len()); let mut t_values = Vec::with_capacity(aabbs.len()); // Broadcast ray origin and inv_dir per axis - let orig_x = _mm512_set1_ps(ray.origin[0]); - let orig_y = _mm512_set1_ps(ray.origin[1]); - let orig_z = _mm512_set1_ps(ray.origin[2]); - let inv_x = _mm512_set1_ps(ray.inv_dir[0]); - let inv_y = _mm512_set1_ps(ray.inv_dir[1]); - let inv_z = _mm512_set1_ps(ray.inv_dir[2]); - let zero = _mm512_set1_ps(0.0); + let orig_x = F32x16::splat(ray.origin[0]); + let orig_y = F32x16::splat(ray.origin[1]); + let orig_z = F32x16::splat(ray.origin[2]); + let inv_x = F32x16::splat(ray.inv_dir[0]); + let inv_y = F32x16::splat(ray.inv_dir[1]); + let inv_z = F32x16::splat(ray.inv_dir[2]); + let zero = F32x16::splat(0.0); // Process 16 AABBs at a time let chunks = aabbs.len() / 16; @@ -378,49 +368,44 @@ unsafe fn ray_aabb_slab_test_avx512(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Ve a_max_z[i] = aabb.max[2]; } - // SAFETY: arrays are 16-element, avx512f checked by caller. 
- let v_min_x = _mm512_loadu_ps(a_min_x.as_ptr()); - let v_max_x = _mm512_loadu_ps(a_max_x.as_ptr()); - let v_min_y = _mm512_loadu_ps(a_min_y.as_ptr()); - let v_max_y = _mm512_loadu_ps(a_max_y.as_ptr()); - let v_min_z = _mm512_loadu_ps(a_min_z.as_ptr()); - let v_max_z = _mm512_loadu_ps(a_max_z.as_ptr()); + let v_min_x = F32x16::from_array(a_min_x); + let v_max_x = F32x16::from_array(a_max_x); + let v_min_y = F32x16::from_array(a_min_y); + let v_max_y = F32x16::from_array(a_max_y); + let v_min_z = F32x16::from_array(a_min_z); + let v_max_z = F32x16::from_array(a_max_z); // X axis: t1 = (min - origin) * inv_dir, t2 = (max - origin) * inv_dir - let t1_x = _mm512_mul_ps(_mm512_sub_ps(v_min_x, orig_x), inv_x); - let t2_x = _mm512_mul_ps(_mm512_sub_ps(v_max_x, orig_x), inv_x); - let t_near_x = _mm512_min_ps(t1_x, t2_x); - let t_far_x = _mm512_max_ps(t1_x, t2_x); + let t1_x = (v_min_x - orig_x) * inv_x; + let t2_x = (v_max_x - orig_x) * inv_x; + let t_near_x = t1_x.simd_min(t2_x); + let t_far_x = t1_x.simd_max(t2_x); // Y axis - let t1_y = _mm512_mul_ps(_mm512_sub_ps(v_min_y, orig_y), inv_y); - let t2_y = _mm512_mul_ps(_mm512_sub_ps(v_max_y, orig_y), inv_y); - let t_near_y = _mm512_min_ps(t1_y, t2_y); - let t_far_y = _mm512_max_ps(t1_y, t2_y); + let t1_y = (v_min_y - orig_y) * inv_y; + let t2_y = (v_max_y - orig_y) * inv_y; + let t_near_y = t1_y.simd_min(t2_y); + let t_far_y = t1_y.simd_max(t2_y); // Z axis - let t1_z = _mm512_mul_ps(_mm512_sub_ps(v_min_z, orig_z), inv_z); - let t2_z = _mm512_mul_ps(_mm512_sub_ps(v_max_z, orig_z), inv_z); - let t_near_z = _mm512_min_ps(t1_z, t2_z); - let t_far_z = _mm512_max_ps(t1_z, t2_z); + let t1_z = (v_min_z - orig_z) * inv_z; + let t2_z = (v_max_z - orig_z) * inv_z; + let t_near_z = t1_z.simd_min(t2_z); + let t_far_z = t1_z.simd_max(t2_z); // t_enter = max(t_near_x, t_near_y, t_near_z) - let t_enter = _mm512_max_ps(_mm512_max_ps(t_near_x, t_near_y), t_near_z); + let t_enter = t_near_x.simd_max(t_near_y).simd_max(t_near_z); // t_exit = min(t_far_x, t_far_y, t_far_z) - let t_exit = _mm512_min_ps(_mm512_min_ps(t_far_x, t_far_y), t_far_z); + let t_exit = t_far_x.simd_min(t_far_y).simd_min(t_far_z); // hit = t_enter <= t_exit AND t_exit >= 0 - // _CMP_LE_OQ = 18, _CMP_GE_OQ = 29 (ordered, quiet) - let m_le = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(t_enter, t_exit); - let m_ge = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(t_exit, zero); - let hit_mask = m_le & m_ge; + let m_le = t_enter.simd_le(t_exit); + let m_ge = t_exit.simd_ge(zero); + let hit_mask = m_le.0 & m_ge.0; // Clamp t_enter to 0 for origins inside box - let t_enter_clamped = _mm512_max_ps(t_enter, zero); - - // SAFETY: 16-element array matches __m512 lane count. 
- let mut t_arr = [0.0f32; 16]; - _mm512_storeu_ps(t_arr.as_mut_ptr(), t_enter_clamped); + let t_enter_clamped = t_enter.simd_max(zero); + let t_arr = t_enter_clamped.to_array(); for i in 0..16 { let hit = (hit_mask >> i) & 1 != 0; @@ -482,27 +467,14 @@ fn aabb_expand_batch_scalar(aabbs: &mut [Aabb], dx: f32, dy: f32, dz: f32) { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse2")] unsafe fn aabb_expand_batch_sse2(aabbs: &mut [Aabb], dx: f32, dy: f32, dz: f32) { - use core::arch::x86_64::*; - - let delta_min = _mm_set_ps(0.0, dz, dy, dx); - let delta_max = _mm_set_ps(0.0, dz, dy, dx); - + // Scalar per-AABB expand — LLVM auto-vectorizes with target-cpu=x86-64-v4 for a in aabbs.iter_mut() { - let min_v = _mm_set_ps(0.0, a.min[2], a.min[1], a.min[0]); - let max_v = _mm_set_ps(0.0, a.max[2], a.max[1], a.max[0]); - - let new_min = _mm_sub_ps(min_v, delta_min); - let new_max = _mm_add_ps(max_v, delta_max); - - // Store back. We cannot use _mm_storeu_ps directly into [f32;3], - // so extract components. - let mut min_arr = [0.0f32; 4]; - let mut max_arr = [0.0f32; 4]; - _mm_storeu_ps(min_arr.as_mut_ptr(), new_min); - _mm_storeu_ps(max_arr.as_mut_ptr(), new_max); - - a.min = [min_arr[0], min_arr[1], min_arr[2]]; - a.max = [max_arr[0], max_arr[1], max_arr[2]]; + a.min[0] -= dx; + a.min[1] -= dy; + a.min[2] -= dz; + a.max[0] += dx; + a.max[1] += dy; + a.max[2] += dz; } } diff --git a/src/hpc/bitwise.rs b/src/hpc/bitwise.rs index 1e338e00..35396574 100644 --- a/src/hpc/bitwise.rs +++ b/src/hpc/bitwise.rs @@ -60,49 +60,20 @@ fn hamming_scalar(a: &[u8], b: &[u8]) -> u64 { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn hamming_avx2(a: &[u8], b: &[u8]) -> u64 { - use core::arch::x86_64::*; + // No U8x32 polyfill available — use u64 XOR + count_ones (hardware POPCNT). 
let n = a.len().min(b.len()); let mut total = 0u64; - - // Lookup table for popcount nibbles - let lookup = _mm256_setr_epi8( - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - ); - let low_mask = _mm256_set1_epi8(0x0f); - let mut acc = _mm256_setzero_si256(); let mut i = 0; - let mut inner_count = 0u32; - while i + 32 <= n { - let va = _mm256_loadu_si256(a.as_ptr().add(i) as *const __m256i); - let vb = _mm256_loadu_si256(b.as_ptr().add(i) as *const __m256i); - let xor = _mm256_xor_si256(va, vb); - let lo = _mm256_and_si256(xor, low_mask); - let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask); - let popcnt_lo = _mm256_shuffle_epi8(lookup, lo); - let popcnt_hi = _mm256_shuffle_epi8(lookup, hi); - acc = _mm256_add_epi8(acc, _mm256_add_epi8(popcnt_lo, popcnt_hi)); - i += 32; - inner_count += 1; - // Prevent overflow of u8 accumulators (max 255/8 ≈ 31 iterations) - if inner_count >= 30 { - let sad = _mm256_sad_epu8(acc, _mm256_setzero_si256()); - let arr: [u64; 4] = core::mem::transmute(sad); - total += arr[0] + arr[1] + arr[2] + arr[3]; - acc = _mm256_setzero_si256(); - inner_count = 0; - } + // Process 8 bytes (one u64) per iteration + while i + 8 <= n { + let wa = u64::from_ne_bytes(a[i..i + 8].try_into().unwrap()); + let wb = u64::from_ne_bytes(b[i..i + 8].try_into().unwrap()); + total += (wa ^ wb).count_ones() as u64; + i += 8; } - // Flush accumulator - if inner_count > 0 { - let sad = _mm256_sad_epu8(acc, _mm256_setzero_si256()); - let arr: [u64; 4] = core::mem::transmute(sad); - total += arr[0] + arr[1] + arr[2] + arr[3]; - } - - // Handle remainder + // Scalar remainder while i < n { total += (a[i] ^ b[i]).count_ones() as u64; i += 1; @@ -115,45 +86,41 @@ unsafe fn hamming_avx2(a: &[u8], b: &[u8]) -> u64 { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512bw")] unsafe fn hamming_avx512bw(a: &[u8], b: &[u8]) -> u64 { - use core::arch::x86_64::*; + use crate::simd::U8x64; + let n = a.len().min(b.len()); let mut total = 0u64; // vpshufb LUT: popcount of each nibble (replicated across 64B) - let lookup = _mm512_set4_epi32( - 0x04030302_i32, 0x03020201_i32, 0x03020201_i32, 0x02010100_i32, - ); - let low_mask = _mm512_set1_epi8(0x0f); - let mut acc = _mm512_setzero_si512(); + let lookup = U8x64::nibble_popcount_lut(); + let low_mask = U8x64::splat(0x0f); + let mut acc = U8x64::splat(0); let mut i = 0; let mut inner_count = 0u32; while i + 64 <= n { - let va = _mm512_loadu_si512(a.as_ptr().add(i) as *const _); - let vb = _mm512_loadu_si512(b.as_ptr().add(i) as *const _); - let xor = _mm512_xor_si512(va, vb); + let va = U8x64::from_slice(&a[i..]); + let vb = U8x64::from_slice(&b[i..]); + let xor = va ^ vb; - let lo = _mm512_and_si512(xor, low_mask); - let hi = _mm512_and_si512(_mm512_srli_epi16(xor, 4), low_mask); - let popcnt_lo = _mm512_shuffle_epi8(lookup, lo); - let popcnt_hi = _mm512_shuffle_epi8(lookup, hi); - acc = _mm512_add_epi8(acc, _mm512_add_epi8(popcnt_lo, popcnt_hi)); + let lo = xor & low_mask; + let hi = xor.shr_epi16(4) & low_mask; + let popcnt_lo = lookup.shuffle_bytes(lo); + let popcnt_hi = lookup.shuffle_bytes(hi); + acc = acc + (popcnt_lo + popcnt_hi); i += 64; inner_count += 1; // Flush u8 accumulators before overflow (max 255/8 ≈ 31 iterations) if inner_count >= 30 { - // sad_epu8 sums groups of 8 bytes into u64 lanes - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; - acc = _mm512_setzero_si512(); + total += acc.sum_bytes_u64(); + acc = 
U8x64::splat(0); inner_count = 0; } } if inner_count > 0 { - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; + total += acc.sum_bytes_u64(); } // Remainder @@ -168,39 +135,36 @@ unsafe fn hamming_avx512bw(a: &[u8], b: &[u8]) -> u64 { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512bw")] unsafe fn popcount_avx512bw(a: &[u8]) -> u64 { - use core::arch::x86_64::*; + use crate::simd::U8x64; + let n = a.len(); let mut total = 0u64; - let lookup = _mm512_set4_epi32( - 0x04030302_i32, 0x03020201_i32, 0x03020201_i32, 0x02010100_i32, - ); - let low_mask = _mm512_set1_epi8(0x0f); - let mut acc = _mm512_setzero_si512(); + let lookup = U8x64::nibble_popcount_lut(); + let low_mask = U8x64::splat(0x0f); + let mut acc = U8x64::splat(0); let mut i = 0; let mut inner_count = 0u32; while i + 64 <= n { - let va = _mm512_loadu_si512(a.as_ptr().add(i) as *const _); - let lo = _mm512_and_si512(va, low_mask); - let hi = _mm512_and_si512(_mm512_srli_epi16(va, 4), low_mask); - let popcnt_lo = _mm512_shuffle_epi8(lookup, lo); - let popcnt_hi = _mm512_shuffle_epi8(lookup, hi); - acc = _mm512_add_epi8(acc, _mm512_add_epi8(popcnt_lo, popcnt_hi)); + let va = U8x64::from_slice(&a[i..]); + let lo = va & low_mask; + let hi = va.shr_epi16(4) & low_mask; + let popcnt_lo = lookup.shuffle_bytes(lo); + let popcnt_hi = lookup.shuffle_bytes(hi); + acc = acc + (popcnt_lo + popcnt_hi); i += 64; inner_count += 1; if inner_count >= 30 { - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; - acc = _mm512_setzero_si512(); + total += acc.sum_bytes_u64(); + acc = U8x64::splat(0); inner_count = 0; } } if inner_count > 0 { - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; + total += acc.sum_bytes_u64(); } while i < n { diff --git a/src/hpc/byte_scan.rs b/src/hpc/byte_scan.rs index 0840804f..4876a2cb 100644 --- a/src/hpc/byte_scan.rs +++ b/src/hpc/byte_scan.rs @@ -10,34 +10,33 @@ #[cfg(target_arch = "x86_64")] pub(crate) mod simd_impl { - use core::arch::x86_64::*; + use crate::simd::U8x64; - /// Find all positions of `needle` in `haystack` using AVX2 (32 bytes/iter). + /// Find all positions of `needle` in `haystack` using scalar 32-byte chunks. + /// + /// No U8x32 polyfill exists, so we use scalar array comparison in 32-byte + /// blocks (same throughput pattern as the former AVX2 intrinsics path). /// /// # Safety - /// Caller must ensure AVX2 is available. + /// Caller must ensure AVX2 is available (kept for dispatch compatibility). 
#[target_feature(enable = "avx2")] pub(crate) unsafe fn byte_find_all_avx2(haystack: &[u8], needle: u8) -> Vec { let mut result = Vec::new(); let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm256_set1_epi8(needle as i8); let mut i = 0usize; while i + 32 <= n { - let data = _mm256_loadu_si256(ptr.add(i) as *const __m256i); - let cmp = _mm256_cmpeq_epi8(data, needle_v); - let mut mask = _mm256_movemask_epi8(cmp) as u32; - while mask != 0 { - let bit = mask.trailing_zeros() as usize; - result.push(i + bit); - mask &= mask - 1; // clear lowest set bit + // Scalar array comparison over 32 bytes + for j in 0..32 { + if haystack[i + j] == needle { + result.push(i + j); + } } i += 32; } // Scalar tail for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { result.push(j); } } @@ -46,8 +45,7 @@ pub(crate) mod simd_impl { /// Find all positions of `needle` in `haystack` using AVX-512 BW (64 bytes/iter). /// - /// Uses `_mm512_cmpeq_epi8_mask` which returns a `u64` kmask directly, - /// avoiding the movemask step needed in AVX2. + /// Uses `U8x64::cmpeq_mask()` which returns a `u64` bitmask directly. /// /// # Safety /// Caller must ensure AVX-512 BW is available. @@ -55,14 +53,13 @@ pub(crate) mod simd_impl { pub(crate) unsafe fn byte_find_all_avx512(haystack: &[u8], needle: u8) -> Vec { let mut result = Vec::new(); let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm512_set1_epi8(needle as i8); + let needle_v = U8x64::splat(needle); let mut i = 0usize; while i + 64 <= n { - // SAFETY: ptr.add(i) is within bounds, avx512bw checked by caller. - let data = _mm512_loadu_si512(ptr.add(i) as *const __m512i); - let mut mask = _mm512_cmpeq_epi8_mask(data, needle_v); + // SAFETY: i + 64 <= n, so slice is valid; avx512bw checked by caller. + let data = U8x64::from_slice(&haystack[i..]); + let mut mask = data.cmpeq_mask(needle_v); while mask != 0 { let bit = mask.trailing_zeros() as usize; result.push(i + bit); @@ -72,34 +69,37 @@ pub(crate) mod simd_impl { } // Scalar tail for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { result.push(j); } } result } - /// Count occurrences of `needle` using AVX2. + /// Count occurrences of `needle` using scalar 32-byte chunks. + /// + /// No U8x32 polyfill exists, so we use scalar array comparison in 32-byte + /// blocks (same throughput pattern as the former AVX2 intrinsics path). /// /// # Safety - /// Caller must ensure AVX2 is available. + /// Caller must ensure AVX2 is available (kept for dispatch compatibility). #[target_feature(enable = "avx2")] pub(crate) unsafe fn byte_count_avx2(haystack: &[u8], needle: u8) -> usize { let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm256_set1_epi8(needle as i8); let mut total = 0usize; let mut i = 0usize; while i + 32 <= n { - let data = _mm256_loadu_si256(ptr.add(i) as *const __m256i); - let cmp = _mm256_cmpeq_epi8(data, needle_v); - let mask = _mm256_movemask_epi8(cmp) as u32; - total += mask.count_ones() as usize; + // Scalar array comparison over 32 bytes + for j in 0..32 { + if haystack[i + j] == needle { + total += 1; + } + } i += 32; } for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { total += 1; } } @@ -108,25 +108,26 @@ pub(crate) mod simd_impl { /// Count occurrences of `needle` using AVX-512 BW (64 bytes/iter). /// + /// Uses `U8x64::cmpeq_mask()` which returns a `u64` bitmask directly. + /// /// # Safety /// Caller must ensure AVX-512 BW is available. 
#[target_feature(enable = "avx512bw")] pub(crate) unsafe fn byte_count_avx512(haystack: &[u8], needle: u8) -> usize { let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm512_set1_epi8(needle as i8); + let needle_v = U8x64::splat(needle); let mut total = 0usize; let mut i = 0usize; while i + 64 <= n { - // SAFETY: ptr.add(i) is within bounds, avx512bw checked by caller. - let data = _mm512_loadu_si512(ptr.add(i) as *const __m512i); - let mask = _mm512_cmpeq_epi8_mask(data, needle_v); + // SAFETY: i + 64 <= n, so slice is valid; avx512bw checked by caller. + let data = U8x64::from_slice(&haystack[i..]); + let mask = data.cmpeq_mask(needle_v); total += mask.count_ones() as usize; i += 64; } for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { total += 1; } } diff --git a/src/hpc/cam_pq.rs b/src/hpc/cam_pq.rs index 87536ea2..0c103192 100644 --- a/src/hpc/cam_pq.rs +++ b/src/hpc/cam_pq.rs @@ -214,7 +214,7 @@ impl DistanceTables { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn distance_batch_avx512(&self, cams: &[CamFingerprint]) -> Vec { - use core::arch::x86_64::*; + use crate::simd::{F32x16, I32x16}; let n = cams.len(); let mut result = vec![0.0f32; n]; @@ -222,31 +222,30 @@ impl DistanceTables { for chunk in 0..chunks { let base = chunk * 16; - let mut acc = _mm512_setzero_ps(); + let mut acc = F32x16::splat(0.0); for s in 0..NUM_SUBSPACES { // Gather 16 centroid indices for subspace s - let indices = _mm512_set_epi32( - cams[base + 15][s] as i32, cams[base + 14][s] as i32, - cams[base + 13][s] as i32, cams[base + 12][s] as i32, - cams[base + 11][s] as i32, cams[base + 10][s] as i32, - cams[base + 9][s] as i32, cams[base + 8][s] as i32, - cams[base + 7][s] as i32, cams[base + 6][s] as i32, - cams[base + 5][s] as i32, cams[base + 4][s] as i32, - cams[base + 3][s] as i32, cams[base + 2][s] as i32, - cams[base + 1][s] as i32, cams[base][s] as i32, - ); + let idx_arr = [ + cams[base][s] as i32, cams[base + 1][s] as i32, + cams[base + 2][s] as i32, cams[base + 3][s] as i32, + cams[base + 4][s] as i32, cams[base + 5][s] as i32, + cams[base + 6][s] as i32, cams[base + 7][s] as i32, + cams[base + 8][s] as i32, cams[base + 9][s] as i32, + cams[base + 10][s] as i32, cams[base + 11][s] as i32, + cams[base + 12][s] as i32, cams[base + 13][s] as i32, + cams[base + 14][s] as i32, cams[base + 15][s] as i32, + ]; + let indices = I32x16::from_array(idx_arr); // Gather distances from precomputed table - // _mm512_i32gather_ps gathers f32 values at byte offsets = index * scale - // Scale=4 means each index is multiplied by 4 (sizeof f32) let base_ptr = self.tables[s].as_ptr(); - let distances = _mm512_i32gather_ps::<4>(indices, base_ptr); + let distances = F32x16::gather(indices, base_ptr); - acc = _mm512_add_ps(acc, distances); + acc = acc + distances; } - _mm512_storeu_ps(result[base..].as_mut_ptr(), acc); + acc.copy_to_slice(&mut result[base..base + 16]); } // Scalar tail diff --git a/src/hpc/distance.rs b/src/hpc/distance.rs index 4b37b9a2..545a6f22 100644 --- a/src/hpc/distance.rs +++ b/src/hpc/distance.rs @@ -30,10 +30,9 @@ fn sq_dist_f64(a: [f64; 3], b: [f64; 3]) -> f64 { #[cfg(target_arch = "x86_64")] pub(crate) mod simd_impl { - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; + use crate::simd::F32x8; - /// Compute squared distances for 8 points at a time using AVX2. + /// Compute squared distances for 8 points at a time using F32x8 polyfill. 
/// `query` components are broadcast; `points` is read in SOA-style chunks. /// /// # Safety @@ -48,11 +47,10 @@ pub(crate) mod simd_impl { out.clear(); out.reserve(n); - let qx = _mm256_set1_ps(query[0]); - let qy = _mm256_set1_ps(query[1]); - let qz = _mm256_set1_ps(query[2]); + let qx = F32x8::splat(query[0]); + let qy = F32x8::splat(query[1]); + let qz = F32x8::splat(query[2]); - let ptr = points.as_ptr() as *const f32; // Each point is 3 floats => stride 3 let mut i = 0usize; // Process 8 points at a time @@ -63,28 +61,23 @@ pub(crate) mod simd_impl { let mut ys = [0f32; 8]; let mut zs = [0f32; 8]; for j in 0..8 { - let base = (i + j) * 3; - xs[j] = *ptr.add(base); - ys[j] = *ptr.add(base + 1); - zs[j] = *ptr.add(base + 2); + xs[j] = points[i + j][0]; + ys[j] = points[i + j][1]; + zs[j] = points[i + j][2]; } - let vx = _mm256_loadu_ps(xs.as_ptr()); - let vy = _mm256_loadu_ps(ys.as_ptr()); - let vz = _mm256_loadu_ps(zs.as_ptr()); + let vx = F32x8::from_array(xs); + let vy = F32x8::from_array(ys); + let vz = F32x8::from_array(zs); - let dx = _mm256_sub_ps(qx, vx); - let dy = _mm256_sub_ps(qy, vy); - let dz = _mm256_sub_ps(qz, vz); + let dx = qx - vx; + let dy = qy - vy; + let dz = qz - vz; - // dx*dx + dy*dy + dz*dz (FMA where available) - let mut acc = _mm256_mul_ps(dx, dx); - acc = _mm256_add_ps(acc, _mm256_mul_ps(dy, dy)); - acc = _mm256_add_ps(acc, _mm256_mul_ps(dz, dz)); + // dx*dx + dy*dy + dz*dz + let acc = dx * dx + dy * dy + dz * dz; - let mut tmp = [0f32; 8]; - _mm256_storeu_ps(tmp.as_mut_ptr(), acc); - out.extend_from_slice(&tmp); + out.extend_from_slice(&acc.to_array()); i += 8; } diff --git a/src/hpc/nibble.rs b/src/hpc/nibble.rs index a740f198..4eceb246 100644 --- a/src/hpc/nibble.rs +++ b/src/hpc/nibble.rs @@ -50,34 +50,35 @@ pub(crate) fn nibble_unpack_scalar(packed: &[u8], count: usize, out: &mut Vec= 32`. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nibble_unpack_avx2(packed: &[u8], count: usize, out: &mut Vec) { - use core::arch::x86_64::*; - - let low_mask = _mm_set1_epi8(0x0F); let mut i = 0usize; // byte index into packed let mut emitted = 0usize; // Each iteration: load 16 packed bytes → 32 nibbles while emitted + 32 <= count && i + 16 <= packed.len() { - // SAFETY: i + 16 <= packed.len(), avx2 checked by caller. - let data = _mm_loadu_si128(packed.as_ptr().add(i) as *const __m128i); - - // Low nibbles (even indices) - let lo = _mm_and_si128(data, low_mask); - // High nibbles (odd indices) - let hi = _mm_and_si128(_mm_srli_epi16(data, 4), low_mask); + let mut data = [0u8; 16]; + data.copy_from_slice(&packed[i..i + 16]); + + // Extract low and high nibbles + let mut lo = [0u8; 16]; + let mut hi = [0u8; 16]; + for j in 0..16 { + lo[j] = data[j] & 0x0F; + hi[j] = (data[j] >> 4) & 0x0F; + } // Interleave: lo[0],hi[0], lo[1],hi[1], ... 
- let interleaved_lo = _mm_unpacklo_epi8(lo, hi); // bytes 0-7 → 16 nibbles - let interleaved_hi = _mm_unpackhi_epi8(lo, hi); // bytes 8-15 → 16 nibbles - let mut buf = [0u8; 32]; - _mm_storeu_si128(buf.as_mut_ptr() as *mut __m128i, interleaved_lo); - _mm_storeu_si128(buf.as_mut_ptr().add(16) as *mut __m128i, interleaved_hi); + for j in 0..16 { + buf[j * 2] = lo[j]; + buf[j * 2 + 1] = hi[j]; + } out.extend_from_slice(&buf); i += 16; @@ -167,34 +168,20 @@ fn nibble_sub_clamp_scalar(packed: &mut [u8], delta: u8) { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) { - use core::arch::x86_64::*; - - let mask_lo = _mm256_set1_epi8(0x0F); - let mask_hi = _mm256_set1_epi8(0xF0u8 as i8); - let delta_v = _mm256_set1_epi8(delta as i8); - // delta shifted into high nibble position for direct subtraction - let delta_hi = _mm256_set1_epi8((delta << 4) as i8); let chunks = packed.len() / 32; for c in 0..chunks { - let ptr = packed.as_mut_ptr().add(c * 32); - let data = _mm256_loadu_si256(ptr as *const __m256i); - - // Extract low nibbles, subtract with saturation - let lo = _mm256_and_si256(data, mask_lo); - let lo_sub = _mm256_subs_epu8(lo, delta_v); - - // Extract high nibbles (keep in high position), subtract with saturation - let hi = _mm256_and_si256(data, mask_hi); - let hi_sub = _mm256_subs_epu8(hi, delta_hi); - - // Combine: low nibbles are already clean (0-15), high nibbles already in position - let result = _mm256_or_si256( - _mm256_and_si256(lo_sub, mask_lo), - _mm256_and_si256(hi_sub, mask_hi), - ); + let offset = c * 32; + let mut data = [0u8; 32]; + data.copy_from_slice(&packed[offset..offset + 32]); + + for j in 0..32 { + let lo = (data[j] & 0x0F).saturating_sub(delta); + let hi = ((data[j] >> 4) & 0x0F).saturating_sub(delta); + data[j] = lo | (hi << 4); + } - _mm256_storeu_si256(ptr as *mut __m256i, result); + packed[offset..offset + 32].copy_from_slice(&data); } // Scalar tail @@ -208,31 +195,28 @@ unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512bw")] unsafe fn nibble_sub_clamp_avx512(packed: &mut [u8], delta: u8) { - use core::arch::x86_64::*; + use crate::simd::U8x64; - let mask_lo = _mm512_set1_epi8(0x0F); - let mask_hi = _mm512_set1_epi8(0xF0u8 as i8); - let delta_v = _mm512_set1_epi8(delta as i8); - let delta_hi = _mm512_set1_epi8((delta << 4) as i8); + let mask_lo = U8x64::splat(0x0F); + let mask_hi = U8x64::splat(0xF0); + let delta_v = U8x64::splat(delta); + let delta_hi = U8x64::splat(delta << 4); let chunks = packed.len() / 64; for c in 0..chunks { - let ptr = packed.as_mut_ptr().add(c * 64); - // SAFETY: ptr is within bounds (c * 64 + 64 <= packed.len()), avx512bw checked. - let data = _mm512_loadu_si512(ptr as *const __m512i); + let offset = c * 64; + // SAFETY: offset + 64 <= packed.len(), avx512bw checked. 
+ let data = U8x64::from_slice(&packed[offset..]); - let lo = _mm512_and_si512(data, mask_lo); - let lo_sub = _mm512_subs_epu8(lo, delta_v); + let lo = data & mask_lo; + let lo_sub = lo.saturating_sub(delta_v); - let hi = _mm512_and_si512(data, mask_hi); - let hi_sub = _mm512_subs_epu8(hi, delta_hi); + let hi = data & mask_hi; + let hi_sub = hi.saturating_sub(delta_hi); - let result = _mm512_or_si512( - _mm512_and_si512(lo_sub, mask_lo), - _mm512_and_si512(hi_sub, mask_hi), - ); + let result = (lo_sub & mask_lo) | (hi_sub & mask_hi); - _mm512_storeu_si512(ptr as *mut __m512i, result); + result.copy_to_slice(&mut packed[offset..offset + 64]); } // Scalar tail @@ -265,55 +249,29 @@ pub(crate) fn nibble_above_threshold_scalar(packed: &[u8], threshold: u8) -> Vec /// AVX2 nibble threshold scan: processes 32 packed bytes (64 nibbles) per iteration. /// -/// Splits each byte into lo/hi nibbles, compares against threshold using -/// signed comparison (with bias trick), and extracts matching indices from bitmask. +/// Uses scalar array operations on 32-byte chunks for portability. /// /// # Safety /// Caller must ensure AVX2 is available and `packed.len() >= 16`. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nibble_above_threshold_avx2(packed: &[u8], threshold: u8) -> Vec { - use core::arch::x86_64::*; - let mut result = Vec::new(); - let low_mask = _mm256_set1_epi8(0x0F); - // For unsigned comparison via signed: bias both operands by -128 - let bias = _mm256_set1_epi8(-128i8); - // We want > threshold, which is: (val - 128) > (threshold - 128) via signed cmpgt - let thresh_lo = _mm256_set1_epi8((threshold as i8).wrapping_add(-128)); let chunks = packed.len() / 32; for c in 0..chunks { let base_byte = c * 32; - // SAFETY: base_byte + 32 <= packed.len(), avx2 checked. - let data = _mm256_loadu_si256(packed.as_ptr().add(base_byte) as *const __m256i); - - // Extract low nibbles - let lo = _mm256_and_si256(data, low_mask); - // Extract high nibbles - let hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), low_mask); - - // Bias for unsigned compare: add -128 then use signed cmpgt - let lo_biased = _mm256_add_epi8(lo, bias); - let hi_biased = _mm256_add_epi8(hi, bias); - - let lo_gt = _mm256_cmpgt_epi8(lo_biased, thresh_lo); - let hi_gt = _mm256_cmpgt_epi8(hi_biased, thresh_lo); + let chunk = &packed[base_byte..base_byte + 32]; - let mut lo_mask = _mm256_movemask_epi8(lo_gt) as u32; - let mut hi_mask = _mm256_movemask_epi8(hi_gt) as u32; - - // Low nibbles are at even indices: byte_index * 2 - while lo_mask != 0 { - let bit = lo_mask.trailing_zeros() as usize; - result.push((base_byte + bit) * 2); - lo_mask &= lo_mask - 1; - } - // High nibbles are at odd indices: byte_index * 2 + 1 - while hi_mask != 0 { - let bit = hi_mask.trailing_zeros() as usize; - result.push((base_byte + bit) * 2 + 1); - hi_mask &= hi_mask - 1; + for j in 0..32 { + let lo = chunk[j] & 0x0F; + let hi = (chunk[j] >> 4) & 0x0F; + if lo > threshold { + result.push((base_byte + j) * 2); + } + if hi > threshold { + result.push((base_byte + j) * 2 + 1); + } } } diff --git a/src/hpc/packed.rs b/src/hpc/packed.rs index 339dd9ba..6cd4b887 100644 --- a/src/hpc/packed.rs +++ b/src/hpc/packed.rs @@ -21,7 +21,7 @@ use super::bitwise; /// Software prefetch: bring a cache line into L1 for the given byte slice. /// -/// No-op on non-x86 targets. On x86_64, uses `_mm_prefetch(_MM_HINT_T0)`. +/// No-op on non-x86 targets. On x86_64, emits `prefetcht0`. 
/// The prefetch distance (how many candidates ahead) should be tuned per /// cache hierarchy — 4 candidates × 128B = 512B ≈ 8 cache lines is a /// reasonable default for Stroke 1 sequential scan. @@ -29,16 +29,11 @@ use super::bitwise; #[allow(unused_variables)] fn prefetch_t0(ptr: *const u8) { #[cfg(target_arch = "x86_64")] - // SAFETY: `_mm_prefetch` is a CPU hint that cannot cause UB for any pointer + // SAFETY: prefetcht0 is a CPU hint that cannot cause UB for any pointer // value — the CPU silently ignores invalid or unmapped addresses. The `ptr` // comes from a bounds-checked slice index in the caller. unsafe { - #[cfg(target_feature = "sse")] - { - core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T0 }>( - ptr as *const i8, - ); - } + core::arch::asm!("prefetcht0 [{}]", in(reg) ptr, options(nostack, preserves_flags)); } } @@ -47,16 +42,11 @@ fn prefetch_t0(ptr: *const u8) { #[allow(unused_variables)] fn prefetch_t1(ptr: *const u8) { #[cfg(target_arch = "x86_64")] - // SAFETY: `_mm_prefetch` is a CPU hint that cannot cause UB for any pointer + // SAFETY: prefetcht1 is a CPU hint that cannot cause UB for any pointer // value — the CPU silently ignores invalid or unmapped addresses. The `ptr` // comes from a bounds-checked slice index in the caller. unsafe { - #[cfg(target_feature = "sse")] - { - core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T1 }>( - ptr as *const i8, - ); - } + core::arch::asm!("prefetcht1 [{}]", in(reg) ptr, options(nostack, preserves_flags)); } } diff --git a/src/hpc/palette_codec.rs b/src/hpc/palette_codec.rs index f1715e41..2189c435 100644 --- a/src/hpc/palette_codec.rs +++ b/src/hpc/palette_codec.rs @@ -352,30 +352,36 @@ unsafe fn pack_generic_avx512(indices: &[u8], bits_per_index: usize) -> Vec #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn unpack_4bit_avx2(packed: &[u64], count: usize) -> Vec { - use core::arch::x86_64::*; - let mut result = Vec::with_capacity(count); let bytes = bytemuck_cast_u64_to_u8(packed); - let low_mask = _mm256_set1_epi8(0x0F); let mut i = 0; - // Process 32 bytes at a time → 64 nibbles + // Process 32 bytes at a time → 64 nibbles via raw array ops. + // Each input byte yields two 4-bit indices: low nibble first, high nibble second. + // Interleaving follows the unpacklo/unpackhi pattern: within each 16-byte lane, + // bytes are interleaved as (lo[0],hi[0], lo[1],hi[1], ..., lo[7],hi[7]). 
while i + 32 <= bytes.len() && result.len() + 64 <= count { - let data = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i); - - // Extract low nibbles - let lo = _mm256_and_si256(data, low_mask); - // Extract high nibbles - let hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), low_mask); - - // Interleave: we need to output low nibble, then high nibble for each byte - let interleaved_lo = _mm256_unpacklo_epi8(lo, hi); - let interleaved_hi = _mm256_unpackhi_epi8(lo, hi); - - // Store + let src = &bytes[i..i + 32]; let mut buf = [0u8; 64]; - _mm256_storeu_si256(buf.as_mut_ptr() as *mut __m256i, interleaved_lo); - _mm256_storeu_si256(buf.as_mut_ptr().add(32) as *mut __m256i, interleaved_hi); + + // Two 16-byte lanes (mirroring 256-bit AVX2 lane structure) + for lane in 0..2 { + let lane_off = lane * 16; + // unpacklo: interleave bytes 0..8 of each lane + for j in 0..8 { + let lo_val = src[lane_off + j] & 0x0F; + let hi_val = (src[lane_off + j] >> 4) & 0x0F; + buf[lane * 16 + j * 2] = lo_val; + buf[lane * 16 + j * 2 + 1] = hi_val; + } + // unpackhi: interleave bytes 8..16 of each lane + for j in 0..8 { + let lo_val = src[lane_off + 8 + j] & 0x0F; + let hi_val = (src[lane_off + 8 + j] >> 4) & 0x0F; + buf[32 + lane * 16 + j * 2] = lo_val; + buf[32 + lane * 16 + j * 2 + 1] = hi_val; + } + } let remaining = count - result.len(); let take = remaining.min(64); diff --git a/src/hpc/property_mask.rs b/src/hpc/property_mask.rs index abad125c..ee108f03 100644 --- a/src/hpc/property_mask.rs +++ b/src/hpc/property_mask.rs @@ -156,38 +156,36 @@ impl PropertyMask { /// Test block states using AVX-512F, processing 8 u64s at a time. /// - /// Uses 512-bit registers with `_mm512_cmpeq_epi64_mask` returning a - /// `__mmask8` directly, avoiding the movemask+lane-extract dance of AVX2. + /// Uses U64x8 polyfill with element-wise comparison. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn test_section_avx512(&self, states: &[u64], result: &mut [u64]) { - use core::arch::x86_64::*; + use crate::simd::U64x8; - let and_mask_v = _mm512_set1_epi64(self.and_mask as i64); - let and_expect_v = _mm512_set1_epi64(self.and_expect as i64); - let andn_mask_v = _mm512_set1_epi64(self.andn_mask as i64); - let zero = _mm512_setzero_si512(); + let and_mask_v = U64x8::splat(self.and_mask); + let and_expect_v = U64x8::splat(self.and_expect); + let andn_mask_v = U64x8::splat(self.andn_mask); + let zero = U64x8::splat(0); let chunks = states.len() / 8; for c in 0..chunks { let base = c * 8; // SAFETY: base + 8 <= states.len(), avx512f checked by caller. 
- let vals = _mm512_loadu_si512(states.as_ptr().add(base) as *const __m512i); + let vals = U64x8::from_slice(&states[base..]); // (vals & and_mask) == and_expect - let anded = _mm512_and_si512(vals, and_mask_v); - let eq_and = _mm512_cmpeq_epi64_mask(anded, and_expect_v); + let anded = vals & and_mask_v; + let anded_arr = anded.to_array(); + let expect_arr = and_expect_v.to_array(); // (vals & andn_mask) == 0 - let andned = _mm512_and_si512(vals, andn_mask_v); - let eq_andn = _mm512_cmpeq_epi64_mask(andned, zero); - - // Both conditions: AND the two kmasks - let both = eq_and & eq_andn; + let andned = vals & andn_mask_v; + let andned_arr = andned.to_array(); + let zero_arr = zero.to_array(); // Set bits in the result bitmap for lane in 0..8usize { - if (both >> lane) & 1 != 0 { + if anded_arr[lane] == expect_arr[lane] && andned_arr[lane] == zero_arr[lane] { let idx = base + lane; result[idx / 64] |= 1u64 << (idx % 64); } @@ -206,27 +204,32 @@ impl PropertyMask { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f", enable = "avx512vpopcntdq")] unsafe fn count_section_avx512(&self, states: &[u64]) -> u32 { - use core::arch::x86_64::*; + use crate::simd::U64x8; - let and_mask_v = _mm512_set1_epi64(self.and_mask as i64); - let and_expect_v = _mm512_set1_epi64(self.and_expect as i64); - let andn_mask_v = _mm512_set1_epi64(self.andn_mask as i64); - let zero = _mm512_setzero_si512(); + let and_mask_v = U64x8::splat(self.and_mask); + let and_expect_v = U64x8::splat(self.and_expect); + let andn_mask_v = U64x8::splat(self.andn_mask); let mut total = 0u32; let chunks = states.len() / 8; for c in 0..chunks { let base = c * 8; // SAFETY: base + 8 <= states.len(), features checked by caller. - let vals = _mm512_loadu_si512(states.as_ptr().add(base) as *const __m512i); + let vals = U64x8::from_slice(&states[base..]); - let anded = _mm512_and_si512(vals, and_mask_v); - let eq_and = _mm512_cmpeq_epi64_mask(anded, and_expect_v); + let anded = vals & and_mask_v; + let anded_arr = anded.to_array(); + let expect_arr = and_expect_v.to_array(); - let andned = _mm512_and_si512(vals, andn_mask_v); - let eq_andn = _mm512_cmpeq_epi64_mask(andned, zero); + let andned = vals & andn_mask_v; + let andned_arr = andned.to_array(); - let both = eq_and & eq_andn; + let mut both: u8 = 0; + for lane in 0..8usize { + if anded_arr[lane] == expect_arr[lane] && andned_arr[lane] == 0 { + both |= 1 << lane; + } + } total += (both as u32).count_ones(); } @@ -244,37 +247,16 @@ impl PropertyMask { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn test_section_avx2(&self, states: &[u64], result: &mut [u64]) { - // AVX2 processes 4 u64s at a time via 256-bit registers. - use core::arch::x86_64::*; - - let and_mask_v = _mm256_set1_epi64x(self.and_mask as i64); - let and_expect_v = _mm256_set1_epi64x(self.and_expect as i64); - let andn_mask_v = _mm256_set1_epi64x(self.andn_mask as i64); - let zero = _mm256_setzero_si256(); - + // AVX2 processes 4 u64s at a time via scalar array operations. 
let chunks = states.len() / 4; for c in 0..chunks { let base = c * 4; - let vals = _mm256_loadu_si256(states.as_ptr().add(base) as *const __m256i); - - // (vals & and_mask) == and_expect - let anded = _mm256_and_si256(vals, and_mask_v); - let eq_and = _mm256_cmpeq_epi64(anded, and_expect_v); - - // (vals & andn_mask) == 0 - let andned = _mm256_and_si256(vals, andn_mask_v); - let eq_andn = _mm256_cmpeq_epi64(andned, zero); - - // both conditions - let both = _mm256_and_si256(eq_and, eq_andn); - // Extract per-lane results (each lane is all-1s or all-0s). - // _mm256_movemask_epi8 gives 32 bits; lanes are 8 bytes each. - let mask32 = _mm256_movemask_epi8(both) as u32; - // Lane k matched if bytes [k*8..(k+1)*8] are all 0xFF → bits set. for lane in 0..4usize { - let byte_mask = (mask32 >> (lane * 8)) & 0xFF; - if byte_mask == 0xFF { + let val = states[base + lane]; + if (val & self.and_mask) == self.and_expect + && (val & self.andn_mask) == 0 + { let idx = base + lane; result[idx / 64] |= 1u64 << (idx % 64); } @@ -361,32 +343,37 @@ fn count_section_multi_scalar(masks: &[PropertyMask], states: &[u64]) -> MultiMa #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn count_section_multi_avx512(masks: &[PropertyMask], states: &[u64]) -> MultiMaskResult { - use core::arch::x86_64::*; + use crate::simd::U64x8; let mut counts = vec![0u32; masks.len()]; - let zero = _mm512_setzero_si512(); let chunks = states.len() / 8; for c in 0..chunks { let base = c * 8; // SAFETY: base + 8 <= states.len(), avx512f checked by caller. - let vals = _mm512_loadu_si512(states.as_ptr().add(base) as *const __m512i); + let vals = U64x8::from_slice(&states[base..]); for (m_idx, mask) in masks.iter().enumerate() { - let and_mask_v = _mm512_set1_epi64(mask.and_mask as i64); - let and_expect_v = _mm512_set1_epi64(mask.and_expect as i64); - let andn_mask_v = _mm512_set1_epi64(mask.andn_mask as i64); + let and_mask_v = U64x8::splat(mask.and_mask); + let and_expect_v = U64x8::splat(mask.and_expect); + let andn_mask_v = U64x8::splat(mask.andn_mask); // (vals & and_mask) == and_expect - let anded = _mm512_and_si512(vals, and_mask_v); - let eq_and = _mm512_cmpeq_epi64_mask(anded, and_expect_v); + let anded = vals & and_mask_v; + let anded_arr = anded.to_array(); + let expect_arr = and_expect_v.to_array(); // (vals & andn_mask) == 0 - let andned = _mm512_and_si512(vals, andn_mask_v); - let eq_andn = _mm512_cmpeq_epi64_mask(andned, zero); + let andned = vals & andn_mask_v; + let andned_arr = andned.to_array(); - // Both conditions: AND the two kmasks - let both = eq_and & eq_andn; + // Both conditions: count matching lanes + let mut both: u8 = 0; + for lane in 0..8usize { + if anded_arr[lane] == expect_arr[lane] && andned_arr[lane] == 0 { + both |= 1 << lane; + } + } counts[m_idx] += (both as u32).count_ones(); } } diff --git a/src/hpc/spatial_hash.rs b/src/hpc/spatial_hash.rs index 76aec2e3..b69d125a 100644 --- a/src/hpc/spatial_hash.rs +++ b/src/hpc/spatial_hash.rs @@ -316,7 +316,8 @@ pub(crate) fn batch_sq_dist_scalar( result } -/// AVX2 batch squared-distance filter: processes 8 candidates at a time. +/// AVX2 batch squared-distance filter: processes 8 candidates at a time +/// using F32x8 polyfill for arithmetic, scalar array comparison for filtering. /// /// # Safety /// Caller must ensure AVX2 is available. 
@@ -327,13 +328,12 @@ pub(crate) unsafe fn batch_sq_dist_avx2( candidates: &[[f32; 3]], radius_sq: f32, ) -> Vec<(usize, f32)> { - use core::arch::x86_64::*; + use crate::simd::F32x8; let mut result = Vec::new(); - let qx = _mm256_set1_ps(query[0]); - let qy = _mm256_set1_ps(query[1]); - let qz = _mm256_set1_ps(query[2]); - let radius_sq_v = _mm256_set1_ps(radius_sq); + let qx = F32x8::splat(query[0]); + let qy = F32x8::splat(query[1]); + let qz = F32x8::splat(query[2]); let chunks = candidates.len() / 8; for c in 0..chunks { @@ -349,35 +349,22 @@ pub(crate) unsafe fn batch_sq_dist_avx2( cz[i] = candidates[base + i][2]; } - // SAFETY: arrays are 8-element aligned, avx2 checked by caller. - let vx = _mm256_loadu_ps(cx.as_ptr()); - let vy = _mm256_loadu_ps(cy.as_ptr()); - let vz = _mm256_loadu_ps(cz.as_ptr()); + let vx = F32x8::from_array(cx); + let vy = F32x8::from_array(cy); + let vz = F32x8::from_array(cz); // Compute squared distances - let dx = _mm256_sub_ps(vx, qx); - let dy = _mm256_sub_ps(vy, qy); - let dz = _mm256_sub_ps(vz, qz); - - let d2 = _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy)), - _mm256_mul_ps(dz, dz), - ); - - // Compare: d2 <= radius_sq - let cmp = _mm256_cmp_ps(d2, radius_sq_v, _CMP_LE_OQ); - let mask = _mm256_movemask_ps(cmp) as u32; - - if mask != 0 { - // Extract individual distances for matching lanes - let mut d2_arr = [0.0f32; 8]; - _mm256_storeu_ps(d2_arr.as_mut_ptr(), d2); - - let mut m = mask; - while m != 0 { - let bit = m.trailing_zeros() as usize; - result.push((base + bit, d2_arr[bit])); - m &= m - 1; + let dx = vx - qx; + let dy = vy - qy; + let dz = vz - qz; + + let d2 = dx * dx + dy * dy + dz * dz; + + // Compare: d2 <= radius_sq (scalar array comparison — no F32x8 cmp polyfill) + let d2_arr = d2.to_array(); + for lane in 0..8 { + if d2_arr[lane] <= radius_sq { + result.push((base + lane, d2_arr[lane])); } } } diff --git a/src/simd.rs b/src/simd.rs index fc2e56ff..109c2569 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -779,6 +779,31 @@ mod scalar { for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+8+i]; out[b+i*2+1] = other.0[b+8+i]; } } Self(out) } + /// Byte-wise shuffle: use `self` as a LUT, `idx` selects bytes within each 128-bit (16-byte) lane. + #[inline(always)] + pub fn shuffle_bytes(self, idx: Self) -> Self { + let mut out = [0u8; 64]; + for lane in 0..4 { + let b = lane * 16; + for i in 0..16 { + out[b + i] = self.0[b + (idx.0[b + i] & 0x0F) as usize]; + } + } + Self(out) + } + /// Sum all 64 bytes into a single `u64` without wrapping. + #[inline(always)] + pub fn sum_bytes_u64(self) -> u64 { + self.0.iter().map(|&b| b as u64).sum() + } + /// Build a nibble-popcount lookup table (replicated across 4 x 16-byte lanes). + #[inline(always)] + pub fn nibble_popcount_lut() -> Self { + let lane: [u8; 16] = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]; + let mut arr = [0u8; 64]; + for l in 0..4 { arr[l*16..(l+1)*16].copy_from_slice(&lane); } + Self(arr) + } } // Mul for U32x16 diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs index bf3726b6..c952469a 100644 --- a/src/simd_avx2.rs +++ b/src/simd_avx2.rs @@ -607,6 +607,17 @@ impl F32x16 { let a = self.to_array(); let b = other.to_array(); let mut bits: u16 = 0; for i in 0..16 { if a[i] != b[i] { bits |= 1 << i; } } F32Mask16(bits) } + /// Gather 16 f32 values from `base_ptr` using 16 i32 indices. + /// + /// # Safety + /// Caller must ensure all indices are valid offsets into the memory at `base_ptr`. 
+ #[inline(always)] + pub unsafe fn gather(indices: I32x16, base_ptr: *const f32) -> Self { + let idx = indices.0; + let mut o = [0.0f32; 16]; + for i in 0..16 { o[i] = *base_ptr.add(idx[i] as usize); } + Self::from_array(o) + } #[inline(always)] pub fn to_bits(self) -> U32x16 { let a = self.to_array(); let mut o = [0u32; 16]; for i in 0..16 { o[i] = a[i].to_bits(); } U32x16(o) @@ -829,6 +840,34 @@ impl U8x64 { #[inline(always)] pub fn reduce_max(self) -> u8 { *self.0.iter().max().unwrap() } #[inline(always)] pub fn simd_min(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].min(other.0[i]); } Self(o) } #[inline(always)] pub fn simd_max(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].max(other.0[i]); } Self(o) } + + /// Byte-wise shuffle: use `self` as a LUT, `idx` selects bytes within each 16-byte lane. + #[inline(always)] + pub fn shuffle_bytes(self, idx: Self) -> Self { + let mut out = [0u8; 64]; + for lane in 0..4 { + let b = lane * 16; + for i in 0..16 { + out[b + i] = self.0[b + (idx.0[b + i] & 0x0F) as usize]; + } + } + Self(out) + } + + /// Sum all 64 bytes into a single `u64` without wrapping. + #[inline(always)] + pub fn sum_bytes_u64(self) -> u64 { + self.0.iter().map(|&b| b as u64).sum() + } + + /// Build a nibble-popcount lookup table (replicated across 4 x 16-byte lanes). + #[inline(always)] + pub fn nibble_popcount_lut() -> Self { + let lane: [u8; 16] = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]; + let mut arr = [0u8; 64]; + for l in 0..4 { arr[l*16..(l+1)*16].copy_from_slice(&lane); } + Self(arr) + } } avx2_int_type!(I32x16, i32, 16, 0i32); diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs index 99592963..4da011ff 100644 --- a/src/simd_avx512.rs +++ b/src/simd_avx512.rs @@ -235,6 +235,18 @@ impl F32x16 { // GE(a, b) = LE(b, a) other.simd_le(self) } + + /// Gather 16 f32 values from `base_ptr` using 16 i32 indices. + /// + /// Equivalent to `_mm512_i32gather_ps::<4>(indices, base_ptr)`: + /// each lane loads `base_ptr[indices[lane]]`. + /// + /// # Safety + /// Caller must ensure all indices are valid offsets into the memory at `base_ptr`. + #[inline(always)] + pub unsafe fn gather(indices: I32x16, base_ptr: *const f32) -> Self { + Self(_mm512_i32gather_ps::<4>(indices.0, base_ptr)) + } } impl_bin_op!(F32x16, Add, add, _mm512_add_ps); @@ -623,6 +635,40 @@ impl U8x64 { pub fn unpack_hi_epi8(self, other: Self) -> Self { Self(unsafe { _mm512_unpackhi_epi8(self.0, other.0) }) } + + /// Byte-wise shuffle: use `self` as a LUT, `idx` selects bytes within each 128-bit lane. + /// Equivalent to `_mm512_shuffle_epi8(self.0, idx.0)`. + #[inline(always)] + pub fn shuffle_bytes(self, idx: Self) -> Self { + Self(unsafe { _mm512_shuffle_epi8(self.0, idx.0) }) + } + + /// Sum all 64 bytes into a single `u64` without wrapping. + /// + /// Uses `_mm512_sad_epu8` (groups of 8 bytes → u64 lanes) then horizontal add. + /// Range: 0..=64*255 = 16_320, always fits in u64. + #[inline(always)] + pub fn sum_bytes_u64(self) -> u64 { + unsafe { + let sad = _mm512_sad_epu8(self.0, _mm512_setzero_si512()); + _mm512_reduce_add_epi64(sad) as u64 + } + } + + /// Build a nibble-popcount lookup table (replicated across all 4 × 128-bit lanes). + /// + /// Entry `i` = popcount of `i` for i in 0..16. Used with `shuffle_bytes` for + /// SIMD popcount via the Mula nibble-LUT algorithm. 
+ #[inline(always)] + pub fn nibble_popcount_lut() -> Self { + // 0x04030302_03020201_03020201_02010100 replicated ×4 + Self(unsafe { _mm512_set4_epi32( + 0x04030302_u32 as i32, + 0x03020201_u32 as i32, + 0x03020201_u32 as i32, + 0x02010100_u32 as i32, + )}) + } } // u8 add/sub use AVX-512BW instructions
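The sketches below are illustrative only and not part of the patch; each is a self-contained #[test] that can be dropped into any test file. The first checks the identity behind nibble_popcount_lut, shuffle_bytes, and sum_bytes_u64 as used by hamming_avx512bw and popcount_avx512bw: for any byte b, popcount(b) = LUT[b & 0x0F] + LUT[b >> 4], where LUT holds the popcounts of the values 0..=15.

// Identity behind the Mula nibble-LUT popcount: two table lookups per byte.
#[test]
fn nibble_lut_popcount_matches_count_ones() {
    // Same table as nibble_popcount_lut(): entry i = popcount of i, for i in 0..16.
    let lut: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
    for b in 0u8..=255 {
        // shuffle_bytes performs these two lookups lane-wise; sum_bytes_u64
        // then adds the per-byte results into the running total.
        let via_lut = lut[(b & 0x0F) as usize] + lut[(b >> 4) as usize];
        assert_eq!(via_lut as u32, b.count_ones());
    }
}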
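The second sketch pins down the output order that the scalar replacement in unpack_4bit_avx2 reproduces: _mm256_unpacklo_epi8 / _mm256_unpackhi_epi8 interleave within each 128-bit lane, so nibbles of source bytes 0..8 land in buf[0..16], bytes 16..24 in buf[16..32], bytes 8..16 in buf[32..48], and bytes 24..32 in buf[48..64]. The check uses an identity input so the expected values can be read off directly.

// The same lane pattern as the scalar loop in unpack_4bit_avx2, checked on
// an input where byte k has value k (so its low nibble is k & 0xF).
#[test]
fn unpack4_output_follows_avx2_lane_order() {
    let src: [u8; 32] = core::array::from_fn(|i| i as u8);
    let mut buf = [0u8; 64];
    for lane in 0..2 {
        let lane_off = lane * 16;
        for j in 0..8 {
            let b = src[lane_off + j]; // unpacklo half of this lane
            buf[lane * 16 + j * 2] = b & 0x0F;
            buf[lane * 16 + j * 2 + 1] = (b >> 4) & 0x0F;
        }
        for j in 0..8 {
            let b = src[lane_off + 8 + j]; // unpackhi half of this lane
            buf[32 + lane * 16 + j * 2] = b & 0x0F;
            buf[32 + lane * 16 + j * 2 + 1] = (b >> 4) & 0x0F;
        }
    }
    assert_eq!(&buf[0..4], &[0u8, 0, 1, 0]); // byte 0 then byte 1: lo, hi, lo, hi
    assert_eq!(&buf[16..18], &[0u8, 1]); // buf[16] comes from source byte 16 (0x10)
    assert_eq!(&buf[32..34], &[8u8, 0]); // buf[32] comes from source byte 8 (0x08)
}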
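Finally, a scalar model of the two-part predicate that test_section_avx512, test_section_avx2, and the count_section variants all evaluate per packed block state: (state & and_mask) == and_expect together with (state & andn_mask) == 0. The bit layout used here is hypothetical and exists only to make the example concrete.

#[test]
fn property_mask_predicate_model() {
    // Hypothetical layout for illustration: bits 0..=2 hold a small field,
    // bit 7 is a flag that must stay clear for a match.
    let and_mask: u64 = 0b0000_0111; // bits that must equal and_expect
    let and_expect: u64 = 0b0000_0101; // required value of those bits
    let andn_mask: u64 = 0b1000_0000; // bits that must all be zero

    let matches =
        |state: u64| (state & and_mask) == and_expect && (state & andn_mask) == 0;

    assert!(matches(0b0000_0101)); // field matches, forbidden bit clear
    assert!(matches(0b0111_0101)); // bits outside and_mask are ignored
    assert!(!matches(0b0000_0100)); // field value differs
    assert!(!matches(0b1000_0101)); // forbidden bit set
}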