diff --git a/crates/p64/src/lib.rs b/crates/p64/src/lib.rs index 270e4db7..b1308df2 100644 --- a/crates/p64/src/lib.rs +++ b/crates/p64/src/lib.rs @@ -178,21 +178,19 @@ type AttendFn = unsafe fn(&[u64; 64], u64, u8) -> (u8, u8, [u8; 64], u64); #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn attend_avx512(rows: &[u64; 64], query: u64, gamma: u8) -> (u8, u8, [u8; 64], u64) { - use std::arch::x86_64::*; let mut best_idx = 0u8; let mut best_score = 0u8; let mut scores = [0u8; 64]; let mut fires = 0u64; - let q = _mm512_set1_epi64(query as i64); // Process 8 rows per chunk, 8 chunks = 64 rows + // (scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4) for chunk in 0..8 { let base = chunk * 8; - // SAFETY: rows is [u64; 64], base..base+8 is in bounds, Palette64 is 64-byte aligned. - let r = _mm512_loadu_si512(rows[base..].as_ptr() as *const __m512i); - let anded = _mm512_and_si512(r, q); - // Extract 8 u64s and scalar popcount (no VPOPCNTDQ dependency) - let vals: [u64; 8] = std::mem::transmute(anded); + let mut vals = [0u64; 8]; + for j in 0..8 { + vals[j] = rows[base + j] & query; + } for j in 0..8 { let score = vals[j].count_ones() as u8; let idx = base + j; @@ -212,20 +210,19 @@ unsafe fn attend_avx512(rows: &[u64; 64], query: u64, gamma: u8) -> (u8, u8, [u8 #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn attend_avx2(rows: &[u64; 64], query: u64, gamma: u8) -> (u8, u8, [u8; 64], u64) { - use std::arch::x86_64::*; let mut best_idx = 0u8; let mut best_score = 0u8; let mut scores = [0u8; 64]; let mut fires = 0u64; - let q = _mm256_set1_epi64x(query as i64); // Process 4 rows per chunk, 16 chunks = 64 rows + // (scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4) for chunk in 0..16 { let base = chunk * 4; - // SAFETY: rows is [u64; 64], base..base+4 is in bounds. - let r = _mm256_loadu_si256(rows[base..].as_ptr() as *const __m256i); - let anded = _mm256_and_si256(r, q); - let vals: [u64; 4] = std::mem::transmute(anded); + let mut vals = [0u64; 4]; + for j in 0..4 { + vals[j] = rows[base + j] & query; + } for j in 0..4 { let score = vals[j].count_ones() as u8; let idx = base + j; @@ -284,15 +281,14 @@ type NearestKFn = unsafe fn(&[u64; 64], u64) -> [u8; 64]; #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn nearest_k_avx512(rows: &[u64; 64], query: u64) -> [u8; 64] { - use std::arch::x86_64::*; let mut dists = [0u8; 64]; - let q = _mm512_set1_epi64(query as i64); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 for chunk in 0..8 { let base = chunk * 8; - // SAFETY: rows is [u64; 64], base..base+8 is in bounds. - let r = _mm512_loadu_si512(rows[base..].as_ptr() as *const __m512i); - let xored = _mm512_xor_si512(r, q); - let vals: [u64; 8] = std::mem::transmute(xored); + let mut vals = [0u64; 8]; + for j in 0..8 { + vals[j] = rows[base + j] ^ query; + } for j in 0..8 { dists[base + j] = vals[j].count_ones() as u8; } @@ -303,15 +299,14 @@ unsafe fn nearest_k_avx512(rows: &[u64; 64], query: u64) -> [u8; 64] { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn nearest_k_avx2(rows: &[u64; 64], query: u64) -> [u8; 64] { - use std::arch::x86_64::*; let mut dists = [0u8; 64]; - let q = _mm256_set1_epi64x(query as i64); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 for chunk in 0..16 { let base = chunk * 4; - // SAFETY: rows is [u64; 64], base..base+4 is in bounds. 
- let r = _mm256_loadu_si256(rows[base..].as_ptr() as *const __m256i); - let xored = _mm256_xor_si256(r, q); - let vals: [u64; 4] = std::mem::transmute(xored); + let mut vals = [0u64; 4]; + for j in 0..4 { + vals[j] = rows[base + j] ^ query; + } for j in 0..4 { dists[base + j] = vals[j].count_ones() as u8; } @@ -350,13 +345,11 @@ type MoeGateFn = unsafe fn(&[u64; 8], u64, u8) -> (u8, [u8; 8], u64); #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn moe_gate_avx512(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, [u8; 8], u64) { - use std::arch::x86_64::*; - // Load all 8 planes into one zmm register, AND with broadcast query - // SAFETY: planes is [u64; 8] = 64 bytes, fits in one zmm. - let p = _mm512_loadu_si512(planes.as_ptr() as *const __m512i); - let q = _mm512_set1_epi64(query as i64); - let anded = _mm512_and_si512(p, q); - let vals: [u64; 8] = std::mem::transmute(anded); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 + let mut vals = [0u64; 8]; + for i in 0..8 { + vals[i] = planes[i] & query; + } let mut active = 0u8; let mut strength = [0u8; 8]; @@ -375,8 +368,7 @@ unsafe fn moe_gate_avx512(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn moe_gate_avx2(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, [u8; 8], u64) { - use std::arch::x86_64::*; - let q = _mm256_set1_epi64x(query as i64); + // Scalar array ops — LLVM auto-vectorizes with target-cpu=x86-64-v4 let mut active = 0u8; let mut strength = [0u8; 8]; let mut combined = 0u64; @@ -384,10 +376,10 @@ unsafe fn moe_gate_avx2(planes: &[u64; 8], query: u64, threshold: u8) -> (u8, [u // Process 4 planes at a time, 2 chunks = 8 planes for chunk in 0..2 { let base = chunk * 4; - // SAFETY: planes is [u64; 8], base..base+4 is in bounds. - let p = _mm256_loadu_si256(planes[base..].as_ptr() as *const __m256i); - let anded = _mm256_and_si256(p, q); - let vals: [u64; 4] = std::mem::transmute(anded); + let mut vals = [0u64; 4]; + for j in 0..4 { + vals[j] = planes[base + j] & query; + } for j in 0..4 { let score = vals[j].count_ones() as u8; let idx = base + j; diff --git a/src/hpc/aabb.rs b/src/hpc/aabb.rs index cf93da09..3f25fc81 100644 --- a/src/hpc/aabb.rs +++ b/src/hpc/aabb.rs @@ -168,8 +168,8 @@ fn aabb_intersect_batch_scalar(query: &Aabb, candidates: &[Aabb]) -> Vec { /// AVX-512 batch AABB intersection: tests 16 candidates per axis comparison. /// -/// Broadcasts query min/max per axis, gathers candidate coords into __m512, -/// compares all 16 at once using `_mm512_cmp_ps_mask`, ANDs the 6 comparison +/// Broadcasts query min/max per axis, gathers candidate coords into F32x16, +/// compares all 16 at once using `simd_le` / `simd_ge`, ANDs the 6 comparison /// masks. /// /// # Safety @@ -177,7 +177,7 @@ fn aabb_intersect_batch_scalar(query: &Aabb, candidates: &[Aabb]) -> Vec { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec { - use core::arch::x86_64::*; + use crate::simd::{F32x16, F32Mask16}; let mut result = Vec::with_capacity(candidates.len()); @@ -203,32 +203,30 @@ unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec< c_max_z[i] = cand.max[2]; } - // SAFETY: arrays are 16-element, avx512f checked by caller. 
- let v_c_min_x = _mm512_loadu_ps(c_min_x.as_ptr()); - let v_c_max_x = _mm512_loadu_ps(c_max_x.as_ptr()); - let v_c_min_y = _mm512_loadu_ps(c_min_y.as_ptr()); - let v_c_max_y = _mm512_loadu_ps(c_max_y.as_ptr()); - let v_c_min_z = _mm512_loadu_ps(c_min_z.as_ptr()); - let v_c_max_z = _mm512_loadu_ps(c_max_z.as_ptr()); + let v_c_min_x = F32x16::from_array(c_min_x); + let v_c_max_x = F32x16::from_array(c_max_x); + let v_c_min_y = F32x16::from_array(c_min_y); + let v_c_max_y = F32x16::from_array(c_max_y); + let v_c_min_z = F32x16::from_array(c_min_z); + let v_c_max_z = F32x16::from_array(c_max_z); // Broadcast query bounds - let q_min_x = _mm512_set1_ps(query.min[0]); - let q_max_x = _mm512_set1_ps(query.max[0]); - let q_min_y = _mm512_set1_ps(query.min[1]); - let q_max_y = _mm512_set1_ps(query.max[1]); - let q_min_z = _mm512_set1_ps(query.min[2]); - let q_max_z = _mm512_set1_ps(query.max[2]); + let q_min_x = F32x16::splat(query.min[0]); + let q_max_x = F32x16::splat(query.max[0]); + let q_min_y = F32x16::splat(query.min[1]); + let q_max_y = F32x16::splat(query.max[1]); + let q_min_z = F32x16::splat(query.min[2]); + let q_max_z = F32x16::splat(query.max[2]); // 6 intersection conditions: q.min[i] <= c.max[i] && q.max[i] >= c.min[i] - // _CMP_LE_OQ = 18, _CMP_GE_OQ = 29 (ordered, quiet) - let m1 = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(q_min_x, v_c_max_x); - let m2 = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(q_max_x, v_c_min_x); - let m3 = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(q_min_y, v_c_max_y); - let m4 = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(q_max_y, v_c_min_y); - let m5 = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(q_min_z, v_c_max_z); - let m6 = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(q_max_z, v_c_min_z); + let m1 = q_min_x.simd_le(v_c_max_x); + let m2 = q_max_x.simd_ge(v_c_min_x); + let m3 = q_min_y.simd_le(v_c_max_y); + let m4 = q_max_y.simd_ge(v_c_min_y); + let m5 = q_min_z.simd_le(v_c_max_z); + let m6 = q_max_z.simd_ge(v_c_min_z); - let all = m1 & m2 & m3 & m4 & m5 & m6; + let all = m1.0 & m2.0 & m3.0 & m4.0 & m5.0 & m6.0; for i in 0..16 { result.push((all >> i) & 1 != 0); @@ -246,24 +244,16 @@ unsafe fn aabb_intersect_batch_avx512(query: &Aabb, candidates: &[Aabb]) -> Vec< #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse4.1")] unsafe fn aabb_intersect_batch_sse41(query: &Aabb, candidates: &[Aabb]) -> Vec { - use core::arch::x86_64::*; - - // Load query min/max into SSE registers (only need xyz, ignore w). - let q_min = _mm_set_ps(0.0, query.min[2], query.min[1], query.min[0]); - let q_max = _mm_set_ps(f32::MAX, query.max[2], query.max[1], query.max[0]); - + // Scalar per-candidate test — LLVM auto-vectorizes with target-cpu=x86-64-v4 let mut result = Vec::with_capacity(candidates.len()); for c in candidates { - let c_min = _mm_set_ps(0.0, c.min[2], c.min[1], c.min[0]); - let c_max = _mm_set_ps(f32::MAX, c.max[2], c.max[1], c.max[0]); - - // q.min <= c.max AND q.max >= c.min (per component) - let le = _mm_cmple_ps(q_min, c_max); // q_min[i] <= c_max[i] - let ge = _mm_cmpge_ps(q_max, c_min); // q_max[i] >= c_min[i] - let both = _mm_and_ps(le, ge); - // All 4 lanes must be true (lane 3 is always true due to sentinel values). 
- let mask = _mm_movemask_ps(both); - result.push(mask == 0xF); + let hit = query.min[0] <= c.max[0] + && query.max[0] >= c.min[0] + && query.min[1] <= c.max[1] + && query.max[1] >= c.min[1] + && query.min[2] <= c.max[2] + && query.max[2] >= c.min[2]; + result.push(hit); } result } @@ -333,27 +323,27 @@ fn ray_aabb_slab_test_scalar(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Vec) /// AVX-512 batch ray-AABB slab test: processes 16 AABBs per iteration. /// /// Broadcasts ray origin and inv_dir per axis, gathers candidate min/max -/// coords into SoA arrays, computes slab intervals with `_mm512_min_ps` / -/// `_mm512_max_ps`, and combines masks with `_mm512_cmp_ps_mask`. +/// coords into SoA arrays, computes slab intervals with `simd_min` / +/// `simd_max`, and combines masks with `simd_le` / `simd_ge`. /// /// # Safety /// Caller must ensure AVX-512F is available. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn ray_aabb_slab_test_avx512(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Vec) { - use core::arch::x86_64::*; + use crate::simd::F32x16; let mut hits = Vec::with_capacity(aabbs.len()); let mut t_values = Vec::with_capacity(aabbs.len()); // Broadcast ray origin and inv_dir per axis - let orig_x = _mm512_set1_ps(ray.origin[0]); - let orig_y = _mm512_set1_ps(ray.origin[1]); - let orig_z = _mm512_set1_ps(ray.origin[2]); - let inv_x = _mm512_set1_ps(ray.inv_dir[0]); - let inv_y = _mm512_set1_ps(ray.inv_dir[1]); - let inv_z = _mm512_set1_ps(ray.inv_dir[2]); - let zero = _mm512_set1_ps(0.0); + let orig_x = F32x16::splat(ray.origin[0]); + let orig_y = F32x16::splat(ray.origin[1]); + let orig_z = F32x16::splat(ray.origin[2]); + let inv_x = F32x16::splat(ray.inv_dir[0]); + let inv_y = F32x16::splat(ray.inv_dir[1]); + let inv_z = F32x16::splat(ray.inv_dir[2]); + let zero = F32x16::splat(0.0); // Process 16 AABBs at a time let chunks = aabbs.len() / 16; @@ -378,49 +368,44 @@ unsafe fn ray_aabb_slab_test_avx512(ray: &Ray, aabbs: &[Aabb]) -> (Vec, Ve a_max_z[i] = aabb.max[2]; } - // SAFETY: arrays are 16-element, avx512f checked by caller. 
- let v_min_x = _mm512_loadu_ps(a_min_x.as_ptr()); - let v_max_x = _mm512_loadu_ps(a_max_x.as_ptr()); - let v_min_y = _mm512_loadu_ps(a_min_y.as_ptr()); - let v_max_y = _mm512_loadu_ps(a_max_y.as_ptr()); - let v_min_z = _mm512_loadu_ps(a_min_z.as_ptr()); - let v_max_z = _mm512_loadu_ps(a_max_z.as_ptr()); + let v_min_x = F32x16::from_array(a_min_x); + let v_max_x = F32x16::from_array(a_max_x); + let v_min_y = F32x16::from_array(a_min_y); + let v_max_y = F32x16::from_array(a_max_y); + let v_min_z = F32x16::from_array(a_min_z); + let v_max_z = F32x16::from_array(a_max_z); // X axis: t1 = (min - origin) * inv_dir, t2 = (max - origin) * inv_dir - let t1_x = _mm512_mul_ps(_mm512_sub_ps(v_min_x, orig_x), inv_x); - let t2_x = _mm512_mul_ps(_mm512_sub_ps(v_max_x, orig_x), inv_x); - let t_near_x = _mm512_min_ps(t1_x, t2_x); - let t_far_x = _mm512_max_ps(t1_x, t2_x); + let t1_x = (v_min_x - orig_x) * inv_x; + let t2_x = (v_max_x - orig_x) * inv_x; + let t_near_x = t1_x.simd_min(t2_x); + let t_far_x = t1_x.simd_max(t2_x); // Y axis - let t1_y = _mm512_mul_ps(_mm512_sub_ps(v_min_y, orig_y), inv_y); - let t2_y = _mm512_mul_ps(_mm512_sub_ps(v_max_y, orig_y), inv_y); - let t_near_y = _mm512_min_ps(t1_y, t2_y); - let t_far_y = _mm512_max_ps(t1_y, t2_y); + let t1_y = (v_min_y - orig_y) * inv_y; + let t2_y = (v_max_y - orig_y) * inv_y; + let t_near_y = t1_y.simd_min(t2_y); + let t_far_y = t1_y.simd_max(t2_y); // Z axis - let t1_z = _mm512_mul_ps(_mm512_sub_ps(v_min_z, orig_z), inv_z); - let t2_z = _mm512_mul_ps(_mm512_sub_ps(v_max_z, orig_z), inv_z); - let t_near_z = _mm512_min_ps(t1_z, t2_z); - let t_far_z = _mm512_max_ps(t1_z, t2_z); + let t1_z = (v_min_z - orig_z) * inv_z; + let t2_z = (v_max_z - orig_z) * inv_z; + let t_near_z = t1_z.simd_min(t2_z); + let t_far_z = t1_z.simd_max(t2_z); // t_enter = max(t_near_x, t_near_y, t_near_z) - let t_enter = _mm512_max_ps(_mm512_max_ps(t_near_x, t_near_y), t_near_z); + let t_enter = t_near_x.simd_max(t_near_y).simd_max(t_near_z); // t_exit = min(t_far_x, t_far_y, t_far_z) - let t_exit = _mm512_min_ps(_mm512_min_ps(t_far_x, t_far_y), t_far_z); + let t_exit = t_far_x.simd_min(t_far_y).simd_min(t_far_z); // hit = t_enter <= t_exit AND t_exit >= 0 - // _CMP_LE_OQ = 18, _CMP_GE_OQ = 29 (ordered, quiet) - let m_le = _mm512_cmp_ps_mask::<{ _CMP_LE_OQ }>(t_enter, t_exit); - let m_ge = _mm512_cmp_ps_mask::<{ _CMP_GE_OQ }>(t_exit, zero); - let hit_mask = m_le & m_ge; + let m_le = t_enter.simd_le(t_exit); + let m_ge = t_exit.simd_ge(zero); + let hit_mask = m_le.0 & m_ge.0; // Clamp t_enter to 0 for origins inside box - let t_enter_clamped = _mm512_max_ps(t_enter, zero); - - // SAFETY: 16-element array matches __m512 lane count. 
- let mut t_arr = [0.0f32; 16]; - _mm512_storeu_ps(t_arr.as_mut_ptr(), t_enter_clamped); + let t_enter_clamped = t_enter.simd_max(zero); + let t_arr = t_enter_clamped.to_array(); for i in 0..16 { let hit = (hit_mask >> i) & 1 != 0; @@ -482,27 +467,14 @@ fn aabb_expand_batch_scalar(aabbs: &mut [Aabb], dx: f32, dy: f32, dz: f32) { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "sse2")] unsafe fn aabb_expand_batch_sse2(aabbs: &mut [Aabb], dx: f32, dy: f32, dz: f32) { - use core::arch::x86_64::*; - - let delta_min = _mm_set_ps(0.0, dz, dy, dx); - let delta_max = _mm_set_ps(0.0, dz, dy, dx); - + // Scalar per-AABB expand — LLVM auto-vectorizes with target-cpu=x86-64-v4 for a in aabbs.iter_mut() { - let min_v = _mm_set_ps(0.0, a.min[2], a.min[1], a.min[0]); - let max_v = _mm_set_ps(0.0, a.max[2], a.max[1], a.max[0]); - - let new_min = _mm_sub_ps(min_v, delta_min); - let new_max = _mm_add_ps(max_v, delta_max); - - // Store back. We cannot use _mm_storeu_ps directly into [f32;3], - // so extract components. - let mut min_arr = [0.0f32; 4]; - let mut max_arr = [0.0f32; 4]; - _mm_storeu_ps(min_arr.as_mut_ptr(), new_min); - _mm_storeu_ps(max_arr.as_mut_ptr(), new_max); - - a.min = [min_arr[0], min_arr[1], min_arr[2]]; - a.max = [max_arr[0], max_arr[1], max_arr[2]]; + a.min[0] -= dx; + a.min[1] -= dy; + a.min[2] -= dz; + a.max[0] += dx; + a.max[1] += dy; + a.max[2] += dz; } } diff --git a/src/hpc/bitwise.rs b/src/hpc/bitwise.rs index 1e338e00..35396574 100644 --- a/src/hpc/bitwise.rs +++ b/src/hpc/bitwise.rs @@ -60,49 +60,20 @@ fn hamming_scalar(a: &[u8], b: &[u8]) -> u64 { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn hamming_avx2(a: &[u8], b: &[u8]) -> u64 { - use core::arch::x86_64::*; + // No U8x32 polyfill available — use u64 XOR + count_ones (hardware POPCNT). 
let n = a.len().min(b.len()); let mut total = 0u64; - - // Lookup table for popcount nibbles - let lookup = _mm256_setr_epi8( - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - ); - let low_mask = _mm256_set1_epi8(0x0f); - let mut acc = _mm256_setzero_si256(); let mut i = 0; - let mut inner_count = 0u32; - while i + 32 <= n { - let va = _mm256_loadu_si256(a.as_ptr().add(i) as *const __m256i); - let vb = _mm256_loadu_si256(b.as_ptr().add(i) as *const __m256i); - let xor = _mm256_xor_si256(va, vb); - let lo = _mm256_and_si256(xor, low_mask); - let hi = _mm256_and_si256(_mm256_srli_epi16(xor, 4), low_mask); - let popcnt_lo = _mm256_shuffle_epi8(lookup, lo); - let popcnt_hi = _mm256_shuffle_epi8(lookup, hi); - acc = _mm256_add_epi8(acc, _mm256_add_epi8(popcnt_lo, popcnt_hi)); - i += 32; - inner_count += 1; - // Prevent overflow of u8 accumulators (max 255/8 ≈ 31 iterations) - if inner_count >= 30 { - let sad = _mm256_sad_epu8(acc, _mm256_setzero_si256()); - let arr: [u64; 4] = core::mem::transmute(sad); - total += arr[0] + arr[1] + arr[2] + arr[3]; - acc = _mm256_setzero_si256(); - inner_count = 0; - } + // Process 8 bytes (one u64) per iteration + while i + 8 <= n { + let wa = u64::from_ne_bytes(a[i..i + 8].try_into().unwrap()); + let wb = u64::from_ne_bytes(b[i..i + 8].try_into().unwrap()); + total += (wa ^ wb).count_ones() as u64; + i += 8; } - // Flush accumulator - if inner_count > 0 { - let sad = _mm256_sad_epu8(acc, _mm256_setzero_si256()); - let arr: [u64; 4] = core::mem::transmute(sad); - total += arr[0] + arr[1] + arr[2] + arr[3]; - } - - // Handle remainder + // Scalar remainder while i < n { total += (a[i] ^ b[i]).count_ones() as u64; i += 1; @@ -115,45 +86,41 @@ unsafe fn hamming_avx2(a: &[u8], b: &[u8]) -> u64 { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512bw")] unsafe fn hamming_avx512bw(a: &[u8], b: &[u8]) -> u64 { - use core::arch::x86_64::*; + use crate::simd::U8x64; + let n = a.len().min(b.len()); let mut total = 0u64; // vpshufb LUT: popcount of each nibble (replicated across 64B) - let lookup = _mm512_set4_epi32( - 0x04030302_i32, 0x03020201_i32, 0x03020201_i32, 0x02010100_i32, - ); - let low_mask = _mm512_set1_epi8(0x0f); - let mut acc = _mm512_setzero_si512(); + let lookup = U8x64::nibble_popcount_lut(); + let low_mask = U8x64::splat(0x0f); + let mut acc = U8x64::splat(0); let mut i = 0; let mut inner_count = 0u32; while i + 64 <= n { - let va = _mm512_loadu_si512(a.as_ptr().add(i) as *const _); - let vb = _mm512_loadu_si512(b.as_ptr().add(i) as *const _); - let xor = _mm512_xor_si512(va, vb); + let va = U8x64::from_slice(&a[i..]); + let vb = U8x64::from_slice(&b[i..]); + let xor = va ^ vb; - let lo = _mm512_and_si512(xor, low_mask); - let hi = _mm512_and_si512(_mm512_srli_epi16(xor, 4), low_mask); - let popcnt_lo = _mm512_shuffle_epi8(lookup, lo); - let popcnt_hi = _mm512_shuffle_epi8(lookup, hi); - acc = _mm512_add_epi8(acc, _mm512_add_epi8(popcnt_lo, popcnt_hi)); + let lo = xor & low_mask; + let hi = xor.shr_epi16(4) & low_mask; + let popcnt_lo = lookup.shuffle_bytes(lo); + let popcnt_hi = lookup.shuffle_bytes(hi); + acc = acc + (popcnt_lo + popcnt_hi); i += 64; inner_count += 1; // Flush u8 accumulators before overflow (max 255/8 ≈ 31 iterations) if inner_count >= 30 { - // sad_epu8 sums groups of 8 bytes into u64 lanes - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; - acc = _mm512_setzero_si512(); + total += acc.sum_bytes_u64(); + acc = 
U8x64::splat(0); inner_count = 0; } } if inner_count > 0 { - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; + total += acc.sum_bytes_u64(); } // Remainder @@ -168,39 +135,36 @@ unsafe fn hamming_avx512bw(a: &[u8], b: &[u8]) -> u64 { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512bw")] unsafe fn popcount_avx512bw(a: &[u8]) -> u64 { - use core::arch::x86_64::*; + use crate::simd::U8x64; + let n = a.len(); let mut total = 0u64; - let lookup = _mm512_set4_epi32( - 0x04030302_i32, 0x03020201_i32, 0x03020201_i32, 0x02010100_i32, - ); - let low_mask = _mm512_set1_epi8(0x0f); - let mut acc = _mm512_setzero_si512(); + let lookup = U8x64::nibble_popcount_lut(); + let low_mask = U8x64::splat(0x0f); + let mut acc = U8x64::splat(0); let mut i = 0; let mut inner_count = 0u32; while i + 64 <= n { - let va = _mm512_loadu_si512(a.as_ptr().add(i) as *const _); - let lo = _mm512_and_si512(va, low_mask); - let hi = _mm512_and_si512(_mm512_srli_epi16(va, 4), low_mask); - let popcnt_lo = _mm512_shuffle_epi8(lookup, lo); - let popcnt_hi = _mm512_shuffle_epi8(lookup, hi); - acc = _mm512_add_epi8(acc, _mm512_add_epi8(popcnt_lo, popcnt_hi)); + let va = U8x64::from_slice(&a[i..]); + let lo = va & low_mask; + let hi = va.shr_epi16(4) & low_mask; + let popcnt_lo = lookup.shuffle_bytes(lo); + let popcnt_hi = lookup.shuffle_bytes(hi); + acc = acc + (popcnt_lo + popcnt_hi); i += 64; inner_count += 1; if inner_count >= 30 { - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; - acc = _mm512_setzero_si512(); + total += acc.sum_bytes_u64(); + acc = U8x64::splat(0); inner_count = 0; } } if inner_count > 0 { - let sad = _mm512_sad_epu8(acc, _mm512_setzero_si512()); - total += _mm512_reduce_add_epi64(sad) as u64; + total += acc.sum_bytes_u64(); } while i < n { diff --git a/src/hpc/byte_scan.rs b/src/hpc/byte_scan.rs index 0840804f..4876a2cb 100644 --- a/src/hpc/byte_scan.rs +++ b/src/hpc/byte_scan.rs @@ -10,34 +10,33 @@ #[cfg(target_arch = "x86_64")] pub(crate) mod simd_impl { - use core::arch::x86_64::*; + use crate::simd::U8x64; - /// Find all positions of `needle` in `haystack` using AVX2 (32 bytes/iter). + /// Find all positions of `needle` in `haystack` using scalar 32-byte chunks. + /// + /// No U8x32 polyfill exists, so we use scalar array comparison in 32-byte + /// blocks (same throughput pattern as the former AVX2 intrinsics path). /// /// # Safety - /// Caller must ensure AVX2 is available. + /// Caller must ensure AVX2 is available (kept for dispatch compatibility). 
#[target_feature(enable = "avx2")] pub(crate) unsafe fn byte_find_all_avx2(haystack: &[u8], needle: u8) -> Vec { let mut result = Vec::new(); let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm256_set1_epi8(needle as i8); let mut i = 0usize; while i + 32 <= n { - let data = _mm256_loadu_si256(ptr.add(i) as *const __m256i); - let cmp = _mm256_cmpeq_epi8(data, needle_v); - let mut mask = _mm256_movemask_epi8(cmp) as u32; - while mask != 0 { - let bit = mask.trailing_zeros() as usize; - result.push(i + bit); - mask &= mask - 1; // clear lowest set bit + // Scalar array comparison over 32 bytes + for j in 0..32 { + if haystack[i + j] == needle { + result.push(i + j); + } } i += 32; } // Scalar tail for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { result.push(j); } } @@ -46,8 +45,7 @@ pub(crate) mod simd_impl { /// Find all positions of `needle` in `haystack` using AVX-512 BW (64 bytes/iter). /// - /// Uses `_mm512_cmpeq_epi8_mask` which returns a `u64` kmask directly, - /// avoiding the movemask step needed in AVX2. + /// Uses `U8x64::cmpeq_mask()` which returns a `u64` bitmask directly. /// /// # Safety /// Caller must ensure AVX-512 BW is available. @@ -55,14 +53,13 @@ pub(crate) mod simd_impl { pub(crate) unsafe fn byte_find_all_avx512(haystack: &[u8], needle: u8) -> Vec { let mut result = Vec::new(); let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm512_set1_epi8(needle as i8); + let needle_v = U8x64::splat(needle); let mut i = 0usize; while i + 64 <= n { - // SAFETY: ptr.add(i) is within bounds, avx512bw checked by caller. - let data = _mm512_loadu_si512(ptr.add(i) as *const __m512i); - let mut mask = _mm512_cmpeq_epi8_mask(data, needle_v); + // SAFETY: i + 64 <= n, so slice is valid; avx512bw checked by caller. + let data = U8x64::from_slice(&haystack[i..]); + let mut mask = data.cmpeq_mask(needle_v); while mask != 0 { let bit = mask.trailing_zeros() as usize; result.push(i + bit); @@ -72,34 +69,37 @@ pub(crate) mod simd_impl { } // Scalar tail for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { result.push(j); } } result } - /// Count occurrences of `needle` using AVX2. + /// Count occurrences of `needle` using scalar 32-byte chunks. + /// + /// No U8x32 polyfill exists, so we use scalar array comparison in 32-byte + /// blocks (same throughput pattern as the former AVX2 intrinsics path). /// /// # Safety - /// Caller must ensure AVX2 is available. + /// Caller must ensure AVX2 is available (kept for dispatch compatibility). #[target_feature(enable = "avx2")] pub(crate) unsafe fn byte_count_avx2(haystack: &[u8], needle: u8) -> usize { let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm256_set1_epi8(needle as i8); let mut total = 0usize; let mut i = 0usize; while i + 32 <= n { - let data = _mm256_loadu_si256(ptr.add(i) as *const __m256i); - let cmp = _mm256_cmpeq_epi8(data, needle_v); - let mask = _mm256_movemask_epi8(cmp) as u32; - total += mask.count_ones() as usize; + // Scalar array comparison over 32 bytes + for j in 0..32 { + if haystack[i + j] == needle { + total += 1; + } + } i += 32; } for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { total += 1; } } @@ -108,25 +108,26 @@ pub(crate) mod simd_impl { /// Count occurrences of `needle` using AVX-512 BW (64 bytes/iter). /// + /// Uses `U8x64::cmpeq_mask()` which returns a `u64` bitmask directly. + /// /// # Safety /// Caller must ensure AVX-512 BW is available. 
#[target_feature(enable = "avx512bw")] pub(crate) unsafe fn byte_count_avx512(haystack: &[u8], needle: u8) -> usize { let n = haystack.len(); - let ptr = haystack.as_ptr(); - let needle_v = _mm512_set1_epi8(needle as i8); + let needle_v = U8x64::splat(needle); let mut total = 0usize; let mut i = 0usize; while i + 64 <= n { - // SAFETY: ptr.add(i) is within bounds, avx512bw checked by caller. - let data = _mm512_loadu_si512(ptr.add(i) as *const __m512i); - let mask = _mm512_cmpeq_epi8_mask(data, needle_v); + // SAFETY: i + 64 <= n, so slice is valid; avx512bw checked by caller. + let data = U8x64::from_slice(&haystack[i..]); + let mask = data.cmpeq_mask(needle_v); total += mask.count_ones() as usize; i += 64; } for j in i..n { - if *ptr.add(j) == needle { + if haystack[j] == needle { total += 1; } } diff --git a/src/hpc/cam_pq.rs b/src/hpc/cam_pq.rs index 87536ea2..0c103192 100644 --- a/src/hpc/cam_pq.rs +++ b/src/hpc/cam_pq.rs @@ -214,7 +214,7 @@ impl DistanceTables { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn distance_batch_avx512(&self, cams: &[CamFingerprint]) -> Vec { - use core::arch::x86_64::*; + use crate::simd::{F32x16, I32x16}; let n = cams.len(); let mut result = vec![0.0f32; n]; @@ -222,31 +222,30 @@ impl DistanceTables { for chunk in 0..chunks { let base = chunk * 16; - let mut acc = _mm512_setzero_ps(); + let mut acc = F32x16::splat(0.0); for s in 0..NUM_SUBSPACES { // Gather 16 centroid indices for subspace s - let indices = _mm512_set_epi32( - cams[base + 15][s] as i32, cams[base + 14][s] as i32, - cams[base + 13][s] as i32, cams[base + 12][s] as i32, - cams[base + 11][s] as i32, cams[base + 10][s] as i32, - cams[base + 9][s] as i32, cams[base + 8][s] as i32, - cams[base + 7][s] as i32, cams[base + 6][s] as i32, - cams[base + 5][s] as i32, cams[base + 4][s] as i32, - cams[base + 3][s] as i32, cams[base + 2][s] as i32, - cams[base + 1][s] as i32, cams[base][s] as i32, - ); + let idx_arr = [ + cams[base][s] as i32, cams[base + 1][s] as i32, + cams[base + 2][s] as i32, cams[base + 3][s] as i32, + cams[base + 4][s] as i32, cams[base + 5][s] as i32, + cams[base + 6][s] as i32, cams[base + 7][s] as i32, + cams[base + 8][s] as i32, cams[base + 9][s] as i32, + cams[base + 10][s] as i32, cams[base + 11][s] as i32, + cams[base + 12][s] as i32, cams[base + 13][s] as i32, + cams[base + 14][s] as i32, cams[base + 15][s] as i32, + ]; + let indices = I32x16::from_array(idx_arr); // Gather distances from precomputed table - // _mm512_i32gather_ps gathers f32 values at byte offsets = index * scale - // Scale=4 means each index is multiplied by 4 (sizeof f32) let base_ptr = self.tables[s].as_ptr(); - let distances = _mm512_i32gather_ps::<4>(indices, base_ptr); + let distances = F32x16::gather(indices, base_ptr); - acc = _mm512_add_ps(acc, distances); + acc = acc + distances; } - _mm512_storeu_ps(result[base..].as_mut_ptr(), acc); + acc.copy_to_slice(&mut result[base..base + 16]); } // Scalar tail diff --git a/src/hpc/distance.rs b/src/hpc/distance.rs index 4b37b9a2..545a6f22 100644 --- a/src/hpc/distance.rs +++ b/src/hpc/distance.rs @@ -30,10 +30,9 @@ fn sq_dist_f64(a: [f64; 3], b: [f64; 3]) -> f64 { #[cfg(target_arch = "x86_64")] pub(crate) mod simd_impl { - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; + use crate::simd::F32x8; - /// Compute squared distances for 8 points at a time using AVX2. + /// Compute squared distances for 8 points at a time using F32x8 polyfill. 
/// `query` components are broadcast; `points` is read in SOA-style chunks. /// /// # Safety @@ -48,11 +47,10 @@ pub(crate) mod simd_impl { out.clear(); out.reserve(n); - let qx = _mm256_set1_ps(query[0]); - let qy = _mm256_set1_ps(query[1]); - let qz = _mm256_set1_ps(query[2]); + let qx = F32x8::splat(query[0]); + let qy = F32x8::splat(query[1]); + let qz = F32x8::splat(query[2]); - let ptr = points.as_ptr() as *const f32; // Each point is 3 floats => stride 3 let mut i = 0usize; // Process 8 points at a time @@ -63,28 +61,23 @@ pub(crate) mod simd_impl { let mut ys = [0f32; 8]; let mut zs = [0f32; 8]; for j in 0..8 { - let base = (i + j) * 3; - xs[j] = *ptr.add(base); - ys[j] = *ptr.add(base + 1); - zs[j] = *ptr.add(base + 2); + xs[j] = points[i + j][0]; + ys[j] = points[i + j][1]; + zs[j] = points[i + j][2]; } - let vx = _mm256_loadu_ps(xs.as_ptr()); - let vy = _mm256_loadu_ps(ys.as_ptr()); - let vz = _mm256_loadu_ps(zs.as_ptr()); + let vx = F32x8::from_array(xs); + let vy = F32x8::from_array(ys); + let vz = F32x8::from_array(zs); - let dx = _mm256_sub_ps(qx, vx); - let dy = _mm256_sub_ps(qy, vy); - let dz = _mm256_sub_ps(qz, vz); + let dx = qx - vx; + let dy = qy - vy; + let dz = qz - vz; - // dx*dx + dy*dy + dz*dz (FMA where available) - let mut acc = _mm256_mul_ps(dx, dx); - acc = _mm256_add_ps(acc, _mm256_mul_ps(dy, dy)); - acc = _mm256_add_ps(acc, _mm256_mul_ps(dz, dz)); + // dx*dx + dy*dy + dz*dz + let acc = dx * dx + dy * dy + dz * dz; - let mut tmp = [0f32; 8]; - _mm256_storeu_ps(tmp.as_mut_ptr(), acc); - out.extend_from_slice(&tmp); + out.extend_from_slice(&acc.to_array()); i += 8; } diff --git a/src/hpc/nibble.rs b/src/hpc/nibble.rs index a740f198..4eceb246 100644 --- a/src/hpc/nibble.rs +++ b/src/hpc/nibble.rs @@ -50,34 +50,35 @@ pub(crate) fn nibble_unpack_scalar(packed: &[u8], count: usize, out: &mut Vec= 32`. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nibble_unpack_avx2(packed: &[u8], count: usize, out: &mut Vec) { - use core::arch::x86_64::*; - - let low_mask = _mm_set1_epi8(0x0F); let mut i = 0usize; // byte index into packed let mut emitted = 0usize; // Each iteration: load 16 packed bytes → 32 nibbles while emitted + 32 <= count && i + 16 <= packed.len() { - // SAFETY: i + 16 <= packed.len(), avx2 checked by caller. - let data = _mm_loadu_si128(packed.as_ptr().add(i) as *const __m128i); - - // Low nibbles (even indices) - let lo = _mm_and_si128(data, low_mask); - // High nibbles (odd indices) - let hi = _mm_and_si128(_mm_srli_epi16(data, 4), low_mask); + let mut data = [0u8; 16]; + data.copy_from_slice(&packed[i..i + 16]); + + // Extract low and high nibbles + let mut lo = [0u8; 16]; + let mut hi = [0u8; 16]; + for j in 0..16 { + lo[j] = data[j] & 0x0F; + hi[j] = (data[j] >> 4) & 0x0F; + } // Interleave: lo[0],hi[0], lo[1],hi[1], ... 
- let interleaved_lo = _mm_unpacklo_epi8(lo, hi); // bytes 0-7 → 16 nibbles - let interleaved_hi = _mm_unpackhi_epi8(lo, hi); // bytes 8-15 → 16 nibbles - let mut buf = [0u8; 32]; - _mm_storeu_si128(buf.as_mut_ptr() as *mut __m128i, interleaved_lo); - _mm_storeu_si128(buf.as_mut_ptr().add(16) as *mut __m128i, interleaved_hi); + for j in 0..16 { + buf[j * 2] = lo[j]; + buf[j * 2 + 1] = hi[j]; + } out.extend_from_slice(&buf); i += 16; @@ -167,34 +168,20 @@ fn nibble_sub_clamp_scalar(packed: &mut [u8], delta: u8) { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) { - use core::arch::x86_64::*; - - let mask_lo = _mm256_set1_epi8(0x0F); - let mask_hi = _mm256_set1_epi8(0xF0u8 as i8); - let delta_v = _mm256_set1_epi8(delta as i8); - // delta shifted into high nibble position for direct subtraction - let delta_hi = _mm256_set1_epi8((delta << 4) as i8); let chunks = packed.len() / 32; for c in 0..chunks { - let ptr = packed.as_mut_ptr().add(c * 32); - let data = _mm256_loadu_si256(ptr as *const __m256i); - - // Extract low nibbles, subtract with saturation - let lo = _mm256_and_si256(data, mask_lo); - let lo_sub = _mm256_subs_epu8(lo, delta_v); - - // Extract high nibbles (keep in high position), subtract with saturation - let hi = _mm256_and_si256(data, mask_hi); - let hi_sub = _mm256_subs_epu8(hi, delta_hi); - - // Combine: low nibbles are already clean (0-15), high nibbles already in position - let result = _mm256_or_si256( - _mm256_and_si256(lo_sub, mask_lo), - _mm256_and_si256(hi_sub, mask_hi), - ); + let offset = c * 32; + let mut data = [0u8; 32]; + data.copy_from_slice(&packed[offset..offset + 32]); + + for j in 0..32 { + let lo = (data[j] & 0x0F).saturating_sub(delta); + let hi = ((data[j] >> 4) & 0x0F).saturating_sub(delta); + data[j] = lo | (hi << 4); + } - _mm256_storeu_si256(ptr as *mut __m256i, result); + packed[offset..offset + 32].copy_from_slice(&data); } // Scalar tail @@ -208,31 +195,28 @@ unsafe fn nibble_sub_clamp_avx2(packed: &mut [u8], delta: u8) { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512bw")] unsafe fn nibble_sub_clamp_avx512(packed: &mut [u8], delta: u8) { - use core::arch::x86_64::*; + use crate::simd::U8x64; - let mask_lo = _mm512_set1_epi8(0x0F); - let mask_hi = _mm512_set1_epi8(0xF0u8 as i8); - let delta_v = _mm512_set1_epi8(delta as i8); - let delta_hi = _mm512_set1_epi8((delta << 4) as i8); + let mask_lo = U8x64::splat(0x0F); + let mask_hi = U8x64::splat(0xF0); + let delta_v = U8x64::splat(delta); + let delta_hi = U8x64::splat(delta << 4); let chunks = packed.len() / 64; for c in 0..chunks { - let ptr = packed.as_mut_ptr().add(c * 64); - // SAFETY: ptr is within bounds (c * 64 + 64 <= packed.len()), avx512bw checked. - let data = _mm512_loadu_si512(ptr as *const __m512i); + let offset = c * 64; + // SAFETY: offset + 64 <= packed.len(), avx512bw checked. 
+ let data = U8x64::from_slice(&packed[offset..]); - let lo = _mm512_and_si512(data, mask_lo); - let lo_sub = _mm512_subs_epu8(lo, delta_v); + let lo = data & mask_lo; + let lo_sub = lo.saturating_sub(delta_v); - let hi = _mm512_and_si512(data, mask_hi); - let hi_sub = _mm512_subs_epu8(hi, delta_hi); + let hi = data & mask_hi; + let hi_sub = hi.saturating_sub(delta_hi); - let result = _mm512_or_si512( - _mm512_and_si512(lo_sub, mask_lo), - _mm512_and_si512(hi_sub, mask_hi), - ); + let result = (lo_sub & mask_lo) | (hi_sub & mask_hi); - _mm512_storeu_si512(ptr as *mut __m512i, result); + result.copy_to_slice(&mut packed[offset..offset + 64]); } // Scalar tail @@ -265,55 +249,29 @@ pub(crate) fn nibble_above_threshold_scalar(packed: &[u8], threshold: u8) -> Vec /// AVX2 nibble threshold scan: processes 32 packed bytes (64 nibbles) per iteration. /// -/// Splits each byte into lo/hi nibbles, compares against threshold using -/// signed comparison (with bias trick), and extracts matching indices from bitmask. +/// Uses scalar array operations on 32-byte chunks for portability. /// /// # Safety /// Caller must ensure AVX2 is available and `packed.len() >= 16`. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nibble_above_threshold_avx2(packed: &[u8], threshold: u8) -> Vec { - use core::arch::x86_64::*; - let mut result = Vec::new(); - let low_mask = _mm256_set1_epi8(0x0F); - // For unsigned comparison via signed: bias both operands by -128 - let bias = _mm256_set1_epi8(-128i8); - // We want > threshold, which is: (val - 128) > (threshold - 128) via signed cmpgt - let thresh_lo = _mm256_set1_epi8((threshold as i8).wrapping_add(-128)); let chunks = packed.len() / 32; for c in 0..chunks { let base_byte = c * 32; - // SAFETY: base_byte + 32 <= packed.len(), avx2 checked. - let data = _mm256_loadu_si256(packed.as_ptr().add(base_byte) as *const __m256i); - - // Extract low nibbles - let lo = _mm256_and_si256(data, low_mask); - // Extract high nibbles - let hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), low_mask); - - // Bias for unsigned compare: add -128 then use signed cmpgt - let lo_biased = _mm256_add_epi8(lo, bias); - let hi_biased = _mm256_add_epi8(hi, bias); - - let lo_gt = _mm256_cmpgt_epi8(lo_biased, thresh_lo); - let hi_gt = _mm256_cmpgt_epi8(hi_biased, thresh_lo); + let chunk = &packed[base_byte..base_byte + 32]; - let mut lo_mask = _mm256_movemask_epi8(lo_gt) as u32; - let mut hi_mask = _mm256_movemask_epi8(hi_gt) as u32; - - // Low nibbles are at even indices: byte_index * 2 - while lo_mask != 0 { - let bit = lo_mask.trailing_zeros() as usize; - result.push((base_byte + bit) * 2); - lo_mask &= lo_mask - 1; - } - // High nibbles are at odd indices: byte_index * 2 + 1 - while hi_mask != 0 { - let bit = hi_mask.trailing_zeros() as usize; - result.push((base_byte + bit) * 2 + 1); - hi_mask &= hi_mask - 1; + for j in 0..32 { + let lo = chunk[j] & 0x0F; + let hi = (chunk[j] >> 4) & 0x0F; + if lo > threshold { + result.push((base_byte + j) * 2); + } + if hi > threshold { + result.push((base_byte + j) * 2 + 1); + } } } diff --git a/src/hpc/packed.rs b/src/hpc/packed.rs index 339dd9ba..6cd4b887 100644 --- a/src/hpc/packed.rs +++ b/src/hpc/packed.rs @@ -21,7 +21,7 @@ use super::bitwise; /// Software prefetch: bring a cache line into L1 for the given byte slice. /// -/// No-op on non-x86 targets. On x86_64, uses `_mm_prefetch(_MM_HINT_T0)`. +/// No-op on non-x86 targets. On x86_64, emits `prefetcht0`. 
/// The prefetch distance (how many candidates ahead) should be tuned per /// cache hierarchy — 4 candidates × 128B = 512B ≈ 8 cache lines is a /// reasonable default for Stroke 1 sequential scan. @@ -29,16 +29,11 @@ use super::bitwise; #[allow(unused_variables)] fn prefetch_t0(ptr: *const u8) { #[cfg(target_arch = "x86_64")] - // SAFETY: `_mm_prefetch` is a CPU hint that cannot cause UB for any pointer + // SAFETY: prefetcht0 is a CPU hint that cannot cause UB for any pointer // value — the CPU silently ignores invalid or unmapped addresses. The `ptr` // comes from a bounds-checked slice index in the caller. unsafe { - #[cfg(target_feature = "sse")] - { - core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T0 }>( - ptr as *const i8, - ); - } + core::arch::asm!("prefetcht0 [{}]", in(reg) ptr, options(nostack, preserves_flags)); } } @@ -47,16 +42,11 @@ fn prefetch_t0(ptr: *const u8) { #[allow(unused_variables)] fn prefetch_t1(ptr: *const u8) { #[cfg(target_arch = "x86_64")] - // SAFETY: `_mm_prefetch` is a CPU hint that cannot cause UB for any pointer + // SAFETY: prefetcht1 is a CPU hint that cannot cause UB for any pointer // value — the CPU silently ignores invalid or unmapped addresses. The `ptr` // comes from a bounds-checked slice index in the caller. unsafe { - #[cfg(target_feature = "sse")] - { - core::arch::x86_64::_mm_prefetch::<{ core::arch::x86_64::_MM_HINT_T1 }>( - ptr as *const i8, - ); - } + core::arch::asm!("prefetcht1 [{}]", in(reg) ptr, options(nostack, preserves_flags)); } } diff --git a/src/hpc/palette_codec.rs b/src/hpc/palette_codec.rs index f1715e41..2189c435 100644 --- a/src/hpc/palette_codec.rs +++ b/src/hpc/palette_codec.rs @@ -352,30 +352,36 @@ unsafe fn pack_generic_avx512(indices: &[u8], bits_per_index: usize) -> Vec #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn unpack_4bit_avx2(packed: &[u64], count: usize) -> Vec { - use core::arch::x86_64::*; - let mut result = Vec::with_capacity(count); let bytes = bytemuck_cast_u64_to_u8(packed); - let low_mask = _mm256_set1_epi8(0x0F); let mut i = 0; - // Process 32 bytes at a time → 64 nibbles + // Process 32 bytes at a time → 64 nibbles via raw array ops. + // Each input byte yields two 4-bit indices: low nibble first, high nibble second. + // Interleaving follows the unpacklo/unpackhi pattern: within each 16-byte lane, + // bytes are interleaved as (lo[0],hi[0], lo[1],hi[1], ..., lo[7],hi[7]). 
while i + 32 <= bytes.len() && result.len() + 64 <= count { - let data = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i); - - // Extract low nibbles - let lo = _mm256_and_si256(data, low_mask); - // Extract high nibbles - let hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), low_mask); - - // Interleave: we need to output low nibble, then high nibble for each byte - let interleaved_lo = _mm256_unpacklo_epi8(lo, hi); - let interleaved_hi = _mm256_unpackhi_epi8(lo, hi); - - // Store + let src = &bytes[i..i + 32]; let mut buf = [0u8; 64]; - _mm256_storeu_si256(buf.as_mut_ptr() as *mut __m256i, interleaved_lo); - _mm256_storeu_si256(buf.as_mut_ptr().add(32) as *mut __m256i, interleaved_hi); + + // Two 16-byte lanes (mirroring 256-bit AVX2 lane structure) + for lane in 0..2 { + let lane_off = lane * 16; + // unpacklo: interleave bytes 0..8 of each lane + for j in 0..8 { + let lo_val = src[lane_off + j] & 0x0F; + let hi_val = (src[lane_off + j] >> 4) & 0x0F; + buf[lane * 16 + j * 2] = lo_val; + buf[lane * 16 + j * 2 + 1] = hi_val; + } + // unpackhi: interleave bytes 8..16 of each lane + for j in 0..8 { + let lo_val = src[lane_off + 8 + j] & 0x0F; + let hi_val = (src[lane_off + 8 + j] >> 4) & 0x0F; + buf[32 + lane * 16 + j * 2] = lo_val; + buf[32 + lane * 16 + j * 2 + 1] = hi_val; + } + } let remaining = count - result.len(); let take = remaining.min(64); diff --git a/src/hpc/property_mask.rs b/src/hpc/property_mask.rs index abad125c..ee108f03 100644 --- a/src/hpc/property_mask.rs +++ b/src/hpc/property_mask.rs @@ -156,38 +156,36 @@ impl PropertyMask { /// Test block states using AVX-512F, processing 8 u64s at a time. /// - /// Uses 512-bit registers with `_mm512_cmpeq_epi64_mask` returning a - /// `__mmask8` directly, avoiding the movemask+lane-extract dance of AVX2. + /// Uses U64x8 polyfill with element-wise comparison. #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn test_section_avx512(&self, states: &[u64], result: &mut [u64]) { - use core::arch::x86_64::*; + use crate::simd::U64x8; - let and_mask_v = _mm512_set1_epi64(self.and_mask as i64); - let and_expect_v = _mm512_set1_epi64(self.and_expect as i64); - let andn_mask_v = _mm512_set1_epi64(self.andn_mask as i64); - let zero = _mm512_setzero_si512(); + let and_mask_v = U64x8::splat(self.and_mask); + let and_expect_v = U64x8::splat(self.and_expect); + let andn_mask_v = U64x8::splat(self.andn_mask); + let zero = U64x8::splat(0); let chunks = states.len() / 8; for c in 0..chunks { let base = c * 8; // SAFETY: base + 8 <= states.len(), avx512f checked by caller. 
- let vals = _mm512_loadu_si512(states.as_ptr().add(base) as *const __m512i); + let vals = U64x8::from_slice(&states[base..]); // (vals & and_mask) == and_expect - let anded = _mm512_and_si512(vals, and_mask_v); - let eq_and = _mm512_cmpeq_epi64_mask(anded, and_expect_v); + let anded = vals & and_mask_v; + let anded_arr = anded.to_array(); + let expect_arr = and_expect_v.to_array(); // (vals & andn_mask) == 0 - let andned = _mm512_and_si512(vals, andn_mask_v); - let eq_andn = _mm512_cmpeq_epi64_mask(andned, zero); - - // Both conditions: AND the two kmasks - let both = eq_and & eq_andn; + let andned = vals & andn_mask_v; + let andned_arr = andned.to_array(); + let zero_arr = zero.to_array(); // Set bits in the result bitmap for lane in 0..8usize { - if (both >> lane) & 1 != 0 { + if anded_arr[lane] == expect_arr[lane] && andned_arr[lane] == zero_arr[lane] { let idx = base + lane; result[idx / 64] |= 1u64 << (idx % 64); } @@ -206,27 +204,32 @@ impl PropertyMask { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f", enable = "avx512vpopcntdq")] unsafe fn count_section_avx512(&self, states: &[u64]) -> u32 { - use core::arch::x86_64::*; + use crate::simd::U64x8; - let and_mask_v = _mm512_set1_epi64(self.and_mask as i64); - let and_expect_v = _mm512_set1_epi64(self.and_expect as i64); - let andn_mask_v = _mm512_set1_epi64(self.andn_mask as i64); - let zero = _mm512_setzero_si512(); + let and_mask_v = U64x8::splat(self.and_mask); + let and_expect_v = U64x8::splat(self.and_expect); + let andn_mask_v = U64x8::splat(self.andn_mask); let mut total = 0u32; let chunks = states.len() / 8; for c in 0..chunks { let base = c * 8; // SAFETY: base + 8 <= states.len(), features checked by caller. - let vals = _mm512_loadu_si512(states.as_ptr().add(base) as *const __m512i); + let vals = U64x8::from_slice(&states[base..]); - let anded = _mm512_and_si512(vals, and_mask_v); - let eq_and = _mm512_cmpeq_epi64_mask(anded, and_expect_v); + let anded = vals & and_mask_v; + let anded_arr = anded.to_array(); + let expect_arr = and_expect_v.to_array(); - let andned = _mm512_and_si512(vals, andn_mask_v); - let eq_andn = _mm512_cmpeq_epi64_mask(andned, zero); + let andned = vals & andn_mask_v; + let andned_arr = andned.to_array(); - let both = eq_and & eq_andn; + let mut both: u8 = 0; + for lane in 0..8usize { + if anded_arr[lane] == expect_arr[lane] && andned_arr[lane] == 0 { + both |= 1 << lane; + } + } total += (both as u32).count_ones(); } @@ -244,37 +247,16 @@ impl PropertyMask { #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx2")] unsafe fn test_section_avx2(&self, states: &[u64], result: &mut [u64]) { - // AVX2 processes 4 u64s at a time via 256-bit registers. - use core::arch::x86_64::*; - - let and_mask_v = _mm256_set1_epi64x(self.and_mask as i64); - let and_expect_v = _mm256_set1_epi64x(self.and_expect as i64); - let andn_mask_v = _mm256_set1_epi64x(self.andn_mask as i64); - let zero = _mm256_setzero_si256(); - + // AVX2 processes 4 u64s at a time via scalar array operations. 
let chunks = states.len() / 4; for c in 0..chunks { let base = c * 4; - let vals = _mm256_loadu_si256(states.as_ptr().add(base) as *const __m256i); - - // (vals & and_mask) == and_expect - let anded = _mm256_and_si256(vals, and_mask_v); - let eq_and = _mm256_cmpeq_epi64(anded, and_expect_v); - - // (vals & andn_mask) == 0 - let andned = _mm256_and_si256(vals, andn_mask_v); - let eq_andn = _mm256_cmpeq_epi64(andned, zero); - - // both conditions - let both = _mm256_and_si256(eq_and, eq_andn); - // Extract per-lane results (each lane is all-1s or all-0s). - // _mm256_movemask_epi8 gives 32 bits; lanes are 8 bytes each. - let mask32 = _mm256_movemask_epi8(both) as u32; - // Lane k matched if bytes [k*8..(k+1)*8] are all 0xFF → bits set. for lane in 0..4usize { - let byte_mask = (mask32 >> (lane * 8)) & 0xFF; - if byte_mask == 0xFF { + let val = states[base + lane]; + if (val & self.and_mask) == self.and_expect + && (val & self.andn_mask) == 0 + { let idx = base + lane; result[idx / 64] |= 1u64 << (idx % 64); } @@ -361,32 +343,37 @@ fn count_section_multi_scalar(masks: &[PropertyMask], states: &[u64]) -> MultiMa #[cfg(target_arch = "x86_64")] #[target_feature(enable = "avx512f")] unsafe fn count_section_multi_avx512(masks: &[PropertyMask], states: &[u64]) -> MultiMaskResult { - use core::arch::x86_64::*; + use crate::simd::U64x8; let mut counts = vec![0u32; masks.len()]; - let zero = _mm512_setzero_si512(); let chunks = states.len() / 8; for c in 0..chunks { let base = c * 8; // SAFETY: base + 8 <= states.len(), avx512f checked by caller. - let vals = _mm512_loadu_si512(states.as_ptr().add(base) as *const __m512i); + let vals = U64x8::from_slice(&states[base..]); for (m_idx, mask) in masks.iter().enumerate() { - let and_mask_v = _mm512_set1_epi64(mask.and_mask as i64); - let and_expect_v = _mm512_set1_epi64(mask.and_expect as i64); - let andn_mask_v = _mm512_set1_epi64(mask.andn_mask as i64); + let and_mask_v = U64x8::splat(mask.and_mask); + let and_expect_v = U64x8::splat(mask.and_expect); + let andn_mask_v = U64x8::splat(mask.andn_mask); // (vals & and_mask) == and_expect - let anded = _mm512_and_si512(vals, and_mask_v); - let eq_and = _mm512_cmpeq_epi64_mask(anded, and_expect_v); + let anded = vals & and_mask_v; + let anded_arr = anded.to_array(); + let expect_arr = and_expect_v.to_array(); // (vals & andn_mask) == 0 - let andned = _mm512_and_si512(vals, andn_mask_v); - let eq_andn = _mm512_cmpeq_epi64_mask(andned, zero); + let andned = vals & andn_mask_v; + let andned_arr = andned.to_array(); - // Both conditions: AND the two kmasks - let both = eq_and & eq_andn; + // Both conditions: count matching lanes + let mut both: u8 = 0; + for lane in 0..8usize { + if anded_arr[lane] == expect_arr[lane] && andned_arr[lane] == 0 { + both |= 1 << lane; + } + } counts[m_idx] += (both as u32).count_ones(); } } diff --git a/src/hpc/spatial_hash.rs b/src/hpc/spatial_hash.rs index 76aec2e3..b69d125a 100644 --- a/src/hpc/spatial_hash.rs +++ b/src/hpc/spatial_hash.rs @@ -316,7 +316,8 @@ pub(crate) fn batch_sq_dist_scalar( result } -/// AVX2 batch squared-distance filter: processes 8 candidates at a time. +/// AVX2 batch squared-distance filter: processes 8 candidates at a time +/// using F32x8 polyfill for arithmetic, scalar array comparison for filtering. /// /// # Safety /// Caller must ensure AVX2 is available. 
@@ -327,13 +328,12 @@ pub(crate) unsafe fn batch_sq_dist_avx2( candidates: &[[f32; 3]], radius_sq: f32, ) -> Vec<(usize, f32)> { - use core::arch::x86_64::*; + use crate::simd::F32x8; let mut result = Vec::new(); - let qx = _mm256_set1_ps(query[0]); - let qy = _mm256_set1_ps(query[1]); - let qz = _mm256_set1_ps(query[2]); - let radius_sq_v = _mm256_set1_ps(radius_sq); + let qx = F32x8::splat(query[0]); + let qy = F32x8::splat(query[1]); + let qz = F32x8::splat(query[2]); let chunks = candidates.len() / 8; for c in 0..chunks { @@ -349,35 +349,22 @@ pub(crate) unsafe fn batch_sq_dist_avx2( cz[i] = candidates[base + i][2]; } - // SAFETY: arrays are 8-element aligned, avx2 checked by caller. - let vx = _mm256_loadu_ps(cx.as_ptr()); - let vy = _mm256_loadu_ps(cy.as_ptr()); - let vz = _mm256_loadu_ps(cz.as_ptr()); + let vx = F32x8::from_array(cx); + let vy = F32x8::from_array(cy); + let vz = F32x8::from_array(cz); // Compute squared distances - let dx = _mm256_sub_ps(vx, qx); - let dy = _mm256_sub_ps(vy, qy); - let dz = _mm256_sub_ps(vz, qz); - - let d2 = _mm256_add_ps( - _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy)), - _mm256_mul_ps(dz, dz), - ); - - // Compare: d2 <= radius_sq - let cmp = _mm256_cmp_ps(d2, radius_sq_v, _CMP_LE_OQ); - let mask = _mm256_movemask_ps(cmp) as u32; - - if mask != 0 { - // Extract individual distances for matching lanes - let mut d2_arr = [0.0f32; 8]; - _mm256_storeu_ps(d2_arr.as_mut_ptr(), d2); - - let mut m = mask; - while m != 0 { - let bit = m.trailing_zeros() as usize; - result.push((base + bit, d2_arr[bit])); - m &= m - 1; + let dx = vx - qx; + let dy = vy - qy; + let dz = vz - qz; + + let d2 = dx * dx + dy * dy + dz * dz; + + // Compare: d2 <= radius_sq (scalar array comparison — no F32x8 cmp polyfill) + let d2_arr = d2.to_array(); + for lane in 0..8 { + if d2_arr[lane] <= radius_sq { + result.push((base + lane, d2_arr[lane])); } } } diff --git a/src/simd.rs b/src/simd.rs index fc2e56ff..109c2569 100644 --- a/src/simd.rs +++ b/src/simd.rs @@ -779,6 +779,31 @@ mod scalar { for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+8+i]; out[b+i*2+1] = other.0[b+8+i]; } } Self(out) } + /// Byte-wise shuffle: use `self` as a LUT, `idx` selects bytes within each 128-bit (16-byte) lane. + #[inline(always)] + pub fn shuffle_bytes(self, idx: Self) -> Self { + let mut out = [0u8; 64]; + for lane in 0..4 { + let b = lane * 16; + for i in 0..16 { + out[b + i] = self.0[b + (idx.0[b + i] & 0x0F) as usize]; + } + } + Self(out) + } + /// Sum all 64 bytes into a single `u64` without wrapping. + #[inline(always)] + pub fn sum_bytes_u64(self) -> u64 { + self.0.iter().map(|&b| b as u64).sum() + } + /// Build a nibble-popcount lookup table (replicated across 4 x 16-byte lanes). + #[inline(always)] + pub fn nibble_popcount_lut() -> Self { + let lane: [u8; 16] = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]; + let mut arr = [0u8; 64]; + for l in 0..4 { arr[l*16..(l+1)*16].copy_from_slice(&lane); } + Self(arr) + } } // Mul for U32x16 diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs index bf3726b6..c952469a 100644 --- a/src/simd_avx2.rs +++ b/src/simd_avx2.rs @@ -607,6 +607,17 @@ impl F32x16 { let a = self.to_array(); let b = other.to_array(); let mut bits: u16 = 0; for i in 0..16 { if a[i] != b[i] { bits |= 1 << i; } } F32Mask16(bits) } + /// Gather 16 f32 values from `base_ptr` using 16 i32 indices. + /// + /// # Safety + /// Caller must ensure all indices are valid offsets into the memory at `base_ptr`. 
+ #[inline(always)] + pub unsafe fn gather(indices: I32x16, base_ptr: *const f32) -> Self { + let idx = indices.0; + let mut o = [0.0f32; 16]; + for i in 0..16 { o[i] = *base_ptr.add(idx[i] as usize); } + Self::from_array(o) + } #[inline(always)] pub fn to_bits(self) -> U32x16 { let a = self.to_array(); let mut o = [0u32; 16]; for i in 0..16 { o[i] = a[i].to_bits(); } U32x16(o) @@ -829,6 +840,34 @@ impl U8x64 { #[inline(always)] pub fn reduce_max(self) -> u8 { *self.0.iter().max().unwrap() } #[inline(always)] pub fn simd_min(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].min(other.0[i]); } Self(o) } #[inline(always)] pub fn simd_max(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].max(other.0[i]); } Self(o) } + + /// Byte-wise shuffle: use `self` as a LUT, `idx` selects bytes within each 16-byte lane. + #[inline(always)] + pub fn shuffle_bytes(self, idx: Self) -> Self { + let mut out = [0u8; 64]; + for lane in 0..4 { + let b = lane * 16; + for i in 0..16 { + out[b + i] = self.0[b + (idx.0[b + i] & 0x0F) as usize]; + } + } + Self(out) + } + + /// Sum all 64 bytes into a single `u64` without wrapping. + #[inline(always)] + pub fn sum_bytes_u64(self) -> u64 { + self.0.iter().map(|&b| b as u64).sum() + } + + /// Build a nibble-popcount lookup table (replicated across 4 x 16-byte lanes). + #[inline(always)] + pub fn nibble_popcount_lut() -> Self { + let lane: [u8; 16] = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]; + let mut arr = [0u8; 64]; + for l in 0..4 { arr[l*16..(l+1)*16].copy_from_slice(&lane); } + Self(arr) + } } avx2_int_type!(I32x16, i32, 16, 0i32); diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs index 99592963..4da011ff 100644 --- a/src/simd_avx512.rs +++ b/src/simd_avx512.rs @@ -235,6 +235,18 @@ impl F32x16 { // GE(a, b) = LE(b, a) other.simd_le(self) } + + /// Gather 16 f32 values from `base_ptr` using 16 i32 indices. + /// + /// Equivalent to `_mm512_i32gather_ps::<4>(indices, base_ptr)`: + /// each lane loads `base_ptr[indices[lane]]`. + /// + /// # Safety + /// Caller must ensure all indices are valid offsets into the memory at `base_ptr`. + #[inline(always)] + pub unsafe fn gather(indices: I32x16, base_ptr: *const f32) -> Self { + Self(_mm512_i32gather_ps::<4>(indices.0, base_ptr)) + } } impl_bin_op!(F32x16, Add, add, _mm512_add_ps); @@ -623,6 +635,40 @@ impl U8x64 { pub fn unpack_hi_epi8(self, other: Self) -> Self { Self(unsafe { _mm512_unpackhi_epi8(self.0, other.0) }) } + + /// Byte-wise shuffle: use `self` as a LUT, `idx` selects bytes within each 128-bit lane. + /// Equivalent to `_mm512_shuffle_epi8(self.0, idx.0)`. + #[inline(always)] + pub fn shuffle_bytes(self, idx: Self) -> Self { + Self(unsafe { _mm512_shuffle_epi8(self.0, idx.0) }) + } + + /// Sum all 64 bytes into a single `u64` without wrapping. + /// + /// Uses `_mm512_sad_epu8` (groups of 8 bytes → u64 lanes) then horizontal add. + /// Range: 0..=64*255 = 16_320, always fits in u64. + #[inline(always)] + pub fn sum_bytes_u64(self) -> u64 { + unsafe { + let sad = _mm512_sad_epu8(self.0, _mm512_setzero_si512()); + _mm512_reduce_add_epi64(sad) as u64 + } + } + + /// Build a nibble-popcount lookup table (replicated across all 4 × 128-bit lanes). + /// + /// Entry `i` = popcount of `i` for i in 0..16. Used with `shuffle_bytes` for + /// SIMD popcount via the Mula nibble-LUT algorithm. 
+ #[inline(always)] + pub fn nibble_popcount_lut() -> Self { + // 0x04030302_03020201_03020201_02010100 replicated ×4 + Self(unsafe { _mm512_set4_epi32( + 0x04030302_u32 as i32, + 0x03020201_u32 as i32, + 0x03020201_u32 as i32, + 0x02010100_u32 as i32, + )}) + } } // u8 add/sub use AVX-512BW instructions
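The sketches below are illustrative only and not part of the patch; each is a self-contained #[test] that can be dropped into any test file. The first checks the identity behind nibble_popcount_lut, shuffle_bytes, and sum_bytes_u64 as used by hamming_avx512bw and popcount_avx512bw: for any byte b, popcount(b) = LUT[b & 0x0F] + LUT[b >> 4], where LUT holds the popcounts of the values 0..=15.

// Identity behind the Mula nibble-LUT popcount: two table lookups per byte.
#[test]
fn nibble_lut_popcount_matches_count_ones() {
    // Same table as nibble_popcount_lut(): entry i = popcount of i, for i in 0..16.
    let lut: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
    for b in 0u8..=255 {
        // shuffle_bytes performs these two lookups lane-wise; sum_bytes_u64
        // then adds the per-byte results into the running total.
        let via_lut = lut[(b & 0x0F) as usize] + lut[(b >> 4) as usize];
        assert_eq!(via_lut as u32, b.count_ones());
    }
}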
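The second sketch pins down the output order that the scalar replacement in unpack_4bit_avx2 reproduces: _mm256_unpacklo_epi8 / _mm256_unpackhi_epi8 interleave within each 128-bit lane, so nibbles of source bytes 0..8 land in buf[0..16], bytes 16..24 in buf[16..32], bytes 8..16 in buf[32..48], and bytes 24..32 in buf[48..64]. The check uses an identity input so the expected values can be read off directly.

// The same lane pattern as the scalar loop in unpack_4bit_avx2, checked on
// an input where byte k has value k (so its low nibble is k & 0xF).
#[test]
fn unpack4_output_follows_avx2_lane_order() {
    let src: [u8; 32] = core::array::from_fn(|i| i as u8);
    let mut buf = [0u8; 64];
    for lane in 0..2 {
        let lane_off = lane * 16;
        for j in 0..8 {
            let b = src[lane_off + j]; // unpacklo half of this lane
            buf[lane * 16 + j * 2] = b & 0x0F;
            buf[lane * 16 + j * 2 + 1] = (b >> 4) & 0x0F;
        }
        for j in 0..8 {
            let b = src[lane_off + 8 + j]; // unpackhi half of this lane
            buf[32 + lane * 16 + j * 2] = b & 0x0F;
            buf[32 + lane * 16 + j * 2 + 1] = (b >> 4) & 0x0F;
        }
    }
    assert_eq!(&buf[0..4], &[0u8, 0, 1, 0]); // byte 0 then byte 1: lo, hi, lo, hi
    assert_eq!(&buf[16..18], &[0u8, 1]); // buf[16] comes from source byte 16 (0x10)
    assert_eq!(&buf[32..34], &[8u8, 0]); // buf[32] comes from source byte 8 (0x08)
}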
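Finally, a scalar model of the two-part predicate that test_section_avx512, test_section_avx2, and the count_section variants all evaluate per packed block state: (state & and_mask) == and_expect together with (state & andn_mask) == 0. The bit layout used here is hypothetical and exists only to make the example concrete.

#[test]
fn property_mask_predicate_model() {
    // Hypothetical layout for illustration: bits 0..=2 hold a small field,
    // bit 7 is a flag that must stay clear for a match.
    let and_mask: u64 = 0b0000_0111; // bits that must equal and_expect
    let and_expect: u64 = 0b0000_0101; // required value of those bits
    let andn_mask: u64 = 0b1000_0000; // bits that must all be zero

    let matches =
        |state: u64| (state & and_mask) == and_expect && (state & andn_mask) == 0;

    assert!(matches(0b0000_0101)); // field matches, forbidden bit clear
    assert!(matches(0b0111_0101)); // bits outside and_mask are ignored
    assert!(!matches(0b0000_0100)); // field value differs
    assert!(!matches(0b1000_0101)); // forbidden bit set
}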