src/simd.rs (115 additions, 4 deletions)

@@ -22,6 +22,64 @@ static TIER: LazyLock<Tier> = LazyLock::new(|| {
#[inline(always)]
fn tier() -> Tier { *TIER }

// BF16 tier detection happens inline in bf16_to_f32_batch() via
// is_x86_feature_detected!("avx512bf16") — no LazyLock needed.
// The check is cheap (reads a cached cpuid result) and the batch
// function uses as_chunks::<16>() + as_chunks::<8>() for SIMD widths.
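// Editor's sketch (illustrative only, not part of this diff): the dispatch
// shape described above, with the wide path elided so the example stays
// self-contained. `bf16_batch_shape` is a hypothetical name.
#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
fn bf16_batch_shape(input: &[u16], output: &mut [f32]) {
    if std::arch::is_x86_feature_detected!("avx512bf16") {
        // Real code processes input.as_chunks::<16>() with 512-bit loads,
        // then an 8-wide pass, then falls through to the scalar tail.
    }
    // Scalar tail / fallback: a bf16 is the high 16 bits of an f32.
    for (o, &b) in output.iter_mut().zip(input) {
        *o = f32::from_bits((b as u32) << 16);
    }
}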

// ============================================================================
// Preferred SIMD lane widths — compile-time constants for array_windows
// ============================================================================
//
// Consumer code uses these to select array_windows size at compile time:
//
// for window in data.array_windows::<{crate::simd::PREFERRED_F64_LANES}>() {
// let v = F64x8::from_array(*window); // AVX-512: native 8-wide
// // or
// let v = F64x4::from_array(*window); // AVX2: native 4-wide
// }
//
// generic_const_exprs is nightly, so consumers must #[cfg] branch on window size.
// These constants document the preferred width per tier.

/// Preferred f64 SIMD width (elements per register).
/// AVX-512: 8 lanes (__m512d). AVX2/scalar: 4 lanes (__m256d).
#[cfg(target_feature = "avx512f")]
pub const PREFERRED_F64_LANES: usize = 8;
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
pub const PREFERRED_F64_LANES: usize = 4;
#[cfg(not(target_arch = "x86_64"))]
pub const PREFERRED_F64_LANES: usize = 4; // scalar fallback: same as AVX2 shape

/// Preferred f32 SIMD width.
/// AVX-512: 16 lanes (__m512). AVX2/scalar: 8 lanes (__m256).
#[cfg(target_feature = "avx512f")]
pub const PREFERRED_F32_LANES: usize = 16;
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
pub const PREFERRED_F32_LANES: usize = 8;
#[cfg(not(target_arch = "x86_64"))]
pub const PREFERRED_F32_LANES: usize = 8;

/// Preferred u64 SIMD width.
/// AVX-512: 8 lanes. AVX2/scalar: 4 lanes.
#[cfg(target_feature = "avx512f")]
pub const PREFERRED_U64_LANES: usize = 8;
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
pub const PREFERRED_U64_LANES: usize = 4;
#[cfg(not(target_arch = "x86_64"))]
pub const PREFERRED_U64_LANES: usize = 4;

/// Preferred i16 SIMD width (for Base17 L1 on i16[17]).
/// AVX-512: 32 lanes (__m512i via epi16). AVX2: 16 lanes (__m256i).
/// Base17 has 17 dims — AVX-512 covers 32 (load 17 + 15 padding),
/// AVX2 covers 16 + 1 scalar.
#[cfg(target_feature = "avx512f")]
pub const PREFERRED_I16_LANES: usize = 32;
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
pub const PREFERRED_I16_LANES: usize = 16;
#[cfg(not(target_arch = "x86_64"))]
pub const PREFERRED_I16_LANES: usize = 16;
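
// Editor's sketch (illustrative only, not part of this diff): one way a
// consumer branches on the preferred width. Stable slice::as_chunks stands
// in for the nightly array_windows shown above, and a scalar sum stands in
// for F64x8/F64x4 math. `sum_f64` is a hypothetical name.
#[allow(dead_code)]
fn sum_f64(data: &[f64]) -> f64 {
    // PREFERRED_F64_LANES resolves to 8 (AVX-512) or 4 (AVX2/scalar) at
    // compile time, so each chunk matches one native register.
    let (chunks, tail) = data.as_chunks::<PREFERRED_F64_LANES>();
    let body: f64 = chunks.iter().flatten().sum();
    body + tail.iter().sum::<f64>()
}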

// ============================================================================
// x86_64: re-export based on tier
// ============================================================================
@@ -41,6 +99,16 @@
pub use crate::simd_avx512::{
f32x16, f64x8, u8x64, i32x16, i64x8, u32x16, u64x8,
};

// BF16 types + batch conversion (always available — scalar fallback built in)
#[cfg(target_arch = "x86_64")]
pub use crate::simd_avx512::{
bf16_to_f32_scalar, f32_to_bf16_scalar,
bf16_to_f32_batch, f32_to_bf16_batch,
};
// BF16 SIMD types only available when avx512bf16 is enabled at compile time
#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))]
pub use crate::simd_avx512::{BF16x16, BF16x8};

#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
pub use crate::simd_avx512::{F32x8, F64x4, f32x8, f64x4};

@@ -645,22 +713,51 @@ mod scalar {
fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; }
}

// U8x64 extra methods — byte-level operations for palette codec, nibble, byte scan
impl U8x64 {
#[inline(always)]
pub fn reduce_min(self) -> u8 { *self.0.iter().min().unwrap_or(&0) }
#[inline(always)]
pub fn reduce_max(self) -> u8 { *self.0.iter().max().unwrap_or(&0) }
#[inline(always)]
pub fn simd_min(self, other: Self) -> Self {
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].min(other.0[i]); } Self(out)
}
#[inline(always)]
pub fn simd_max(self, other: Self) -> Self {
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].max(other.0[i]); } Self(out)
}
#[inline(always)]
pub fn cmpeq_mask(self, other: Self) -> u64 {
let mut mask = 0u64;
for i in 0..64 { if self.0[i] == other.0[i] { mask |= 1u64 << i; } }
mask
}
#[inline(always)]
pub fn shr_epi16(self, imm: u32) -> Self {
let mut out = [0u8; 64];
for i in (0..64).step_by(2) {
let val = u16::from_le_bytes([self.0[i], self.0[i + 1]]);
let shifted = val >> imm;
let bytes = shifted.to_le_bytes();
out[i] = bytes[0]; out[i + 1] = bytes[1];
}
Self(out)
}
#[inline(always)]
pub fn saturating_sub(self, other: Self) -> Self {
let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); } Self(out)
}
#[inline(always)]
pub fn unpack_lo_epi8(self, other: Self) -> Self {
let mut out = [0u8; 64];
for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+i]; out[b+i*2+1] = other.0[b+i]; } }
Self(out)
}
#[inline(always)]
pub fn unpack_hi_epi8(self, other: Self) -> Self {
let mut out = [0u8; 64];
for lane in 0..4 { let b = lane * 16; for i in 0..8 { out[b+i*2] = self.0[b+8+i]; out[b+i*2+1] = other.0[b+8+i]; } }
Self(out)
}
}
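
// Editor's usage sketch (illustrative only, not part of this diff): byte
// scanning with cmpeq_mask, the pattern the comment above alludes to.
// `find_byte` is a hypothetical name.
#[allow(dead_code)]
fn find_byte(block: [u8; 64], needle: u8) -> Option<usize> {
    let mask = U8x64(block).cmpeq_mask(U8x64([needle; 64]));
    // Bit i is set where block[i] == needle, so the lowest set bit is the
    // first match.
    (mask != 0).then(|| mask.trailing_zeros() as usize)
}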
@@ -697,6 +794,20 @@
pub use scalar::{
f32x8, f64x4,
};

// Scalar BF16 conversion — always available on all platforms
#[cfg(not(target_arch = "x86_64"))]
pub fn bf16_to_f32_scalar(bits: u16) -> f32 { f32::from_bits((bits as u32) << 16) }
#[cfg(not(target_arch = "x86_64"))]
pub fn f32_to_bf16_scalar(v: f32) -> u16 { (v.to_bits() >> 16) as u16 }
#[cfg(not(target_arch = "x86_64"))]
pub fn bf16_to_f32_batch(input: &[u16], output: &mut [f32]) {
for (i, &b) in input.iter().enumerate() { if i < output.len() { output[i] = bf16_to_f32_scalar(b); } }
}
#[cfg(not(target_arch = "x86_64"))]
pub fn f32_to_bf16_batch(input: &[f32], output: &mut [u16]) {
for (i, &v) in input.iter().enumerate() { if i < output.len() { output[i] = f32_to_bf16_scalar(v); } }
}
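
// Editor's round-trip sketch (illustrative only, not part of this diff):
// bf16 keeps f32's sign, full 8-bit exponent, and top 7 mantissa bits, so
// values exactly representable in bf16 survive the f32 -> bf16 -> f32 trip
// even though f32_to_bf16_scalar truncates rather than rounds.
#[cfg(all(test, not(target_arch = "x86_64")))]
#[test]
fn bf16_roundtrip_exact() {
    let xs = [1.0f32, -2.5, 0.15625];
    let mut bits = [0u16; 3];
    let mut back = [0f32; 3];
    f32_to_bf16_batch(&xs, &mut bits);
    bf16_to_f32_batch(&bits, &mut back);
    assert_eq!(xs, back); // all three fit in 7 mantissa bits
}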

// ============================================================================
// SIMD math functions — ndarray additions (not in std::simd)
// ============================================================================
src/simd_avx2.rs (70 additions, 0 deletions)

@@ -761,6 +761,76 @@ macro_rules! avx2_int_type {
}

avx2_int_type!(U8x64, u8, 64, 0u8);

// ── U8x64 byte-level operations (scalar fallback for AVX2 tier) ──────────
// These match the AVX-512 U8x64 methods in simd_avx512.rs.
impl U8x64 {
/// Byte-wise equality mask: bit i set if self[i] == other[i].
#[inline(always)]
pub fn cmpeq_mask(self, other: Self) -> u64 {
let mut mask = 0u64;
for i in 0..64 { if self.0[i] == other.0[i] { mask |= 1u64 << i; } }
mask
}

/// Shift right each 16-bit lane by imm bits (operates on pairs of u8 as u16).
#[inline(always)]
pub fn shr_epi16(self, imm: u32) -> Self {
let mut out = [0u8; 64];
for i in (0..64).step_by(2) {
let val = u16::from_le_bytes([self.0[i], self.0[i + 1]]);
let shifted = val >> imm;
let bytes = shifted.to_le_bytes();
out[i] = bytes[0];
out[i + 1] = bytes[1];
}
Self(out)
}

/// Saturating unsigned subtraction: max(a - b, 0) per byte.
#[inline(always)]
pub fn saturating_sub(self, other: Self) -> Self {
let mut out = [0u8; 64];
for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); }
Self(out)
}

/// Interleave low bytes within each 128-bit lane.
#[inline(always)]
pub fn unpack_lo_epi8(self, other: Self) -> Self {
let mut out = [0u8; 64];
// Operates per 16-byte lane (4 lanes in 512-bit)
for lane in 0..4 {
let base = lane * 16;
for i in 0..8 {
out[base + i * 2] = self.0[base + i];
out[base + i * 2 + 1] = other.0[base + i];
}
}
Self(out)
}

/// Interleave high bytes within each 128-bit lane.
#[inline(always)]
pub fn unpack_hi_epi8(self, other: Self) -> Self {
let mut out = [0u8; 64];
for lane in 0..4 {
let base = lane * 16;
for i in 0..8 {
out[base + i * 2] = self.0[base + 8 + i];
out[base + i * 2 + 1] = other.0[base + 8 + i];
}
}
Self(out)
}

/// Reduce min/max (not in macro).
#[inline(always)] pub fn reduce_min(self) -> u8 { *self.0.iter().min().unwrap() }
#[inline(always)] pub fn reduce_max(self) -> u8 { *self.0.iter().max().unwrap() }
#[inline(always)] pub fn simd_min(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].min(other.0[i]); } Self(o) }
#[inline(always)] pub fn simd_max(self, other: Self) -> Self { let mut o = [0u8; 64]; for i in 0..64 { o[i] = self.0[i].max(other.0[i]); } Self(o) }
}
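
// Editor's usage sketch (illustrative only, not part of this diff): the
// classic nibble-extraction pattern shr_epi16 exists for. Shifting 16-bit
// lanes right by 4 moves each byte's high nibble into its low-nibble slot
// (plus spill from the neighboring byte), and the 0x0F mask removes the
// spill. `high_nibbles` is a hypothetical name.
#[allow(dead_code)]
fn high_nibbles(v: U8x64) -> U8x64 {
    let shifted = v.shr_epi16(4);
    let mut out = [0u8; 64];
    for i in 0..64 { out[i] = shifted.0[i] & 0x0F; }
    U8x64(out)
}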

avx2_int_type!(I32x16, i32, 16, 0i32);
avx2_int_type!(I64x8, i64, 8, 0i64);
avx2_int_type!(U32x16, u32, 16, 0u32);