diff --git a/src/simd.rs b/src/simd.rs
index a6654b74..fc2e56ff 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -546,6 +546,25 @@ mod scalar {
             for i in 0..16 { out[i] = self.0[i].abs(); }
             Self(out)
         }
+        #[inline(always)]
+        pub fn from_i16_slice(s: &[i16]) -> Self {
+            assert!(s.len() >= 16);
+            let mut o = [0i32; 16];
+            for i in 0..16 { o[i] = s[i] as i32; }
+            Self(o)
+        }
+        #[inline(always)]
+        pub fn to_i16_array(self) -> [i16; 16] {
+            let mut o = [0i16; 16];
+            for i in 0..16 { o[i] = self.0[i] as i16; }
+            o
+        }
+        #[inline(always)]
+        pub fn cmpge_zero_mask(self) -> u16 {
+            let mut mask = 0u16;
+            for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } }
+            mask
+        }
     }
 
     impl Mul for I32x16 {
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
index b8f9ad84..bf3726b6 100644
--- a/src/simd_avx2.rs
+++ b/src/simd_avx2.rs
@@ -843,6 +843,31 @@ impl I32x16 {
     #[inline(always)] pub fn simd_max(self, other: Self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].max(other.0[i]); } Self(o) }
     #[inline(always)] pub fn cast_f32(self) -> F32x16 { let mut o = [0.0f32; 16]; for i in 0..16 { o[i] = self.0[i] as f32; } F32x16::from_array(o) }
     #[inline(always)] pub fn abs(self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].abs(); } Self(o) }
+
+    /// Load 16 × i16, sign-extend to 16 × i32.
+    #[inline(always)]
+    pub fn from_i16_slice(s: &[i16]) -> Self {
+        assert!(s.len() >= 16);
+        let mut o = [0i32; 16];
+        for i in 0..16 { o[i] = s[i] as i32; }
+        Self(o)
+    }
+
+    /// Narrow 16 × i32 to 16 × i16 (truncation).
+    #[inline(always)]
+    pub fn to_i16_array(self) -> [i16; 16] {
+        let mut o = [0i16; 16];
+        for i in 0..16 { o[i] = self.0[i] as i16; }
+        o
+    }
+
+    /// Mask: bit i set where lane i >= 0.
+    #[inline(always)]
+    pub fn cmpge_zero_mask(self) -> u16 {
+        let mut mask = 0u16;
+        for i in 0..16 { if self.0[i] >= 0 { mask |= 1 << i; } }
+        mask
+    }
 }
 impl Mul for I32x16 { type Output = Self; #[inline(always)] fn mul(self, r: Self) -> Self { let mut o = [0i32; 16]; for i in 0..16 { o[i] = self.0[i].wrapping_mul(r.0[i]); } Self(o) } }
 impl MulAssign for I32x16 { #[inline(always)] fn mul_assign(&mut self, r: Self) { *self = *self * r; } }
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs
index ad249d3d..99592963 100644
--- a/src/simd_avx512.rs
+++ b/src/simd_avx512.rs
@@ -758,6 +758,40 @@ impl I32x16 {
         unsafe { _mm512_reduce_max_epi32(self.0) }
     }
+
+    // ── Base17 i16[17] operations: load-widen, abs, narrow ──────────────
+    // Used by bgz17_bridge.rs for L1 distance, weighted L1, sign agreement, xor_bind.
+
+    /// Load 16 × i16 from slice, sign-extend to 16 × i32.
+    /// This is the first step of every Base17 kernel: i16 → i32 to avoid overflow.
+    #[inline(always)]
+    pub fn from_i16_slice(s: &[i16]) -> Self {
+        assert!(s.len() >= 16);
+        Self(unsafe { _mm512_cvtepi16_epi32(_mm256_loadu_si256(s.as_ptr() as *const __m256i)) })
+    }
+
+    /// Absolute value per lane.
+    #[inline(always)]
+    pub fn abs(self) -> Self {
+        Self(unsafe { _mm512_abs_epi32(self.0) })
+    }
+
+    /// Narrow 16 × i32 back to 16 × i16 (truncation, no saturation).
+    #[inline(always)]
+    pub fn to_i16_array(self) -> [i16; 16] {
+        unsafe {
+            let packed = _mm512_cvtepi32_epi16(self.0);
+            let mut arr = [0i16; 16];
+            _mm256_storeu_si256(arr.as_mut_ptr() as *mut __m256i, packed);
+            arr
+        }
+    }
+
+    /// Compare >= 0: returns a 16-bit mask with bit i set where lane i >= 0.
+    #[inline(always)]
+    pub fn cmpge_zero_mask(self) -> u16 {
+        unsafe { _mm512_cmpge_epi32_mask(self.0, _mm512_setzero_si512()) }
+    }
 
     #[inline(always)]
     pub fn simd_min(self, other: Self) -> Self {
         Self(unsafe { _mm512_min_epi32(self.0, other.0) })
@@ -773,11 +807,6 @@
     pub fn cast_f32(self) -> F32x16 {
         F32x16(unsafe { _mm512_cvtepi32_ps(self.0) })
     }
-
-    #[inline(always)]
-    pub fn abs(self) -> Self {
-        Self(unsafe { _mm512_abs_epi32(self.0) })
-    }
 }
 
 impl_bin_op!(I32x16, Add, add, _mm512_add_epi32);
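
Usage note (not part of the patch): a minimal sketch of how the new primitives compose, in the spirit of the bgz17_bridge.rs comment above. The `crate::simd` path and the `sign_agreement` helper are illustrative assumptions, not code from this diff; only the `I32x16` methods themselves are what the patch adds. The L1 and weighted-L1 kernels mentioned in the comments would additionally need a lane-wise subtract and a horizontal add, which this diff does not touch.

    // Hypothetical usage sketch. Assumes I32x16 is reachable at
    // crate::simd::I32x16; only from_i16_slice and cmpge_zero_mask
    // below are methods this diff actually adds.
    use crate::simd::I32x16;

    /// Count lanes where two 16-element i16 blocks agree in sign
    /// (0 counted as non-negative) — one building block of a
    /// sign-agreement kernel.
    fn sign_agreement(a: &[i16], b: &[i16]) -> u32 {
        let ma = I32x16::from_i16_slice(a).cmpge_zero_mask(); // bit i = (a[i] >= 0)
        let mb = I32x16::from_i16_slice(b).cmpge_zero_mask(); // bit i = (b[i] >= 0)
        (!(ma ^ mb)).count_ones() // equal bits <=> matching signs
    }

On AVX-512 this compiles down nicely: `_mm512_cmpge_epi32_mask` is a single `vpcmpd` into a mask register, so the per-block sign comparison costs one instruction plus the widening load.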