From 521d23f23cd51f5c62dbce9003766c424c0760e3 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 13 May 2026 16:22:41 +0000
Subject: [PATCH 1/2] =?UTF-8?q?feat(simd=5Favx2):=20add=20U8x32=20?=
 =?UTF-8?q?=E2=80=94=20native=20AVX2=20byte=20vector=20(round-3=20W1)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The keystone for the cosmetic-SIMD sweep agent #11 audited on PR #142.
That audit found 8 confirmed cosmetic SIMD wrappers in hpc/byte_scan.rs,
hpc/palette_codec.rs, and hpc/aabb.rs — `#[target_feature(enable = "avx2")]`
decorating scalar bodies that gave zero speedup over plain scalar. The
root cause: there was no `U8x32` type in the polyfill, so consumers
couldn't write SIMD byte code at AVX2's natural width (32 bytes = one
__m256i ymm register).

This PR adds U8x32 with real __m256i storage and 26 polyfill methods
mirroring `simd_avx512::U8x64`:

Constructors:    splat, from_slice, from_array, to_array, copy_to_slice
Reductions:      reduce_sum (wrap-add), reduce_min, reduce_max, sum_bytes_u64
Min/max:         simd_min, simd_max  (_mm256_min_epu8, _mm256_max_epu8)
Compare→mask:    cmpeq_mask → u32, cmpgt_mask → u32 (unsigned via xor 0x80),
                 movemask → u32  (matches _mm256_movemask_epi8 width)
Saturating:      saturating_add, saturating_sub  (_mm256_adds/subs_epu8)
Avg:             pairwise_avg  (_mm256_avg_epu8, round-up)
Shifts:          shr_epi16, shl_epi16  (16-bit lane shifts via _mm256_srl/sll_epi16)
Shuffles:        shuffle_bytes  (within-128-bit-lane, _mm256_shuffle_epi8)
                 permute_bytes  (cross-lane, scalar fallback — AVX2 has no
                 native cross-lane byte permute; matches U8x64's behavior
                 on AVX-512F-without-VBMI hosts)
                 unpack_lo_epi8, unpack_hi_epi8  (_mm256_unpacklo/hi_epi8)
Conditional:     mask_blend  (_mm256_blendv_epi8, MSB-driven, NOT bitmask)
LUT:             nibble_popcount_lut

Plus operators: BitAnd, BitOr, BitXor, Add (wrapping), Sub (wrapping),
Debug, Default. All ~26 methods.

Re-exported from `crate::simd::U8x32` for both AVX-512 and AVX2 build
tiers — U8x32 is the natural AVX2 byte width and is needed regardless
of whether AVX-512's U8x64 is the consumer's preferred width.

Soundness model matches the rest of simd_avx2.rs: `_mm256_*` intrinsics
are wrapped in `unsafe { }` blocks inside safe `pub fn`, trusting that
AVX2 is the compile target (x86-64-v3 is project baseline). The codebase
uses this pattern already in the AVX2 popcount at simd_avx2.rs:357.

Test coverage:
- 18 new tests in `mod u8x32_tests` covering: roundtrip, sum/min/max
  reductions, unsigned cmp masks (incl. high-byte > 127 to verify the
  XOR-0x80 unsigned trick), saturating add/sub clamps, pairwise_avg
  round-up, shr_epi16 nibble extraction, permute_bytes reverse,
  mask_blend per-MSB selection, nibble_popcount_lut via shuffle_bytes.
- All 18 pass. Total test count 1786 → 1804 with no regressions.

clippy --features rayon -- -D warnings: clean.

Companion: this PR unblocks the round-3 consumer fleet which will
rewrite byte_find_all_avx2 / pack_indices / aabb_intersect_batch_sse41
and friends to use `crate::simd::U8x32` instead of `#[target_feature]`
wrappers around scalar code. Each consumer rewrite ships as its own PR
in the next wave.
---
 src/simd.rs      |   8 +
 src/simd_avx2.rs | 517 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 525 insertions(+)

diff --git a/src/simd.rs b/src/simd.rs
index ccd35aa0..5f37eb4c 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -265,6 +265,14 @@ pub use crate::simd_avx2::{
     I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,
 };
 
+// U8x32 — native AVX2 byte width (one __m256i = 32 bytes). Available on
+// both AVX-512 and AVX2 builds: it's the natural width for byte-level
+// AVX2 ops, and on AVX-512 builds it's the half-register companion to
+// U8x64. Lives in simd_avx2.rs (single source of truth) and is re-exported
+// from both tier branches.
+#[cfg(target_arch = "x86_64")]
+pub use crate::simd_avx2::{u8x32, U8x32};
+
 // ============================================================================
 // Non-x86: scalar fallback types with identical API
 // ============================================================================
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
index 0f7799fe..c07d92d0 100644
--- a/src/simd_avx2.rs
+++ b/src/simd_avx2.rs
@@ -1728,6 +1728,520 @@ impl I64x8 {
     }
 }
 
+// ═══════════════════════════════════════════════════════════════════
+// U8x32 — native AVX2 byte vector (one __m256i = 32 bytes).
+//
+// The AVX2-tier "byte width" for the polyfill. AVX-512's U8x64 lives in
+// simd_avx512.rs and maps to one __m512i; on AVX2 the equivalent 64-byte
+// shape (the U8x64 macro above) is implemented as a scalar [u8; 64]
+// fallback because AVX2's natural byte width is 32, not 64. Consumers
+// that want REAL AVX2 SIMD speedup over scalar should chunk their data
+// in 32-byte windows and use U8x32.
+//
+// Requires AVX2 at compile time (project baseline is x86-64-v3, so this
+// holds on every supported build). Calling these methods on a baseline
+// x86_64 build (no AVX2) would SIGILL — same constraint as the rest of
+// the file's `_mm256_*` users (e.g. the AVX2 popcount at line ~357).
+// ═══════════════════════════════════════════════════════════════════
+
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+/// 32-byte unsigned-integer SIMD vector mapping to one AVX2 `__m256i`.
+///
+/// API mirrors `simd_avx512::U8x64` so consumer code can pick the natural
+/// byte width for its loop (32 on AVX2, 64 on AVX-512) and rely on the
+/// same method set. The polyfill in `simd.rs` re-exports both.
+#[cfg(target_arch = "x86_64")]
+#[derive(Copy, Clone)]
+#[repr(transparent)]
+pub struct U8x32(pub __m256i);
+
+#[cfg(target_arch = "x86_64")]
+impl U8x32 {
+    /// Number of u8 lanes (32 = one AVX2 ymm register).
+    pub const LANES: usize = 32;
+
+    // ── Constructors ────────────────────────────────────────────────
+
+    /// Broadcast a single byte to all 32 lanes.
+    #[inline(always)]
+    pub fn splat(v: u8) -> Self {
+        // SAFETY: AVX2 is the project baseline (x86-64-v3); calling
+        // `_mm256_set1_epi8` requires AVX, which AVX2 implies.
+        Self(unsafe { _mm256_set1_epi8(v as i8) })
+    }
+
+    /// Unaligned load 32 bytes from a slice. Panics if `s.len() < 32`.
+    #[inline(always)]
+    pub fn from_slice(s: &[u8]) -> Self {
+        assert!(s.len() >= 32, "U8x32::from_slice needs ≥32 bytes");
+        // SAFETY: bounds checked above; loadu allows unaligned src.
+        Self(unsafe { _mm256_loadu_si256(s.as_ptr() as *const __m256i) })
+    }
+
+    /// Load 32 bytes from a fixed-size array.
+    #[inline(always)]
+    pub fn from_array(arr: [u8; 32]) -> Self {
+        // SAFETY: `arr` is exactly 32 bytes contiguous; loadu allows any align.
+        Self(unsafe { _mm256_loadu_si256(arr.as_ptr() as *const __m256i) })
+    }
+
+    /// Store all 32 bytes to a `[u8; 32]` array.
+    #[inline(always)]
+    pub fn to_array(self) -> [u8; 32] {
+        let mut out = [0u8; 32];
+        // SAFETY: `out` is exactly 32 bytes contiguous; storeu allows any align.
+        unsafe { _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, self.0) };
+        out
+    }
+
+    /// Copy all 32 bytes into a mutable slice. Panics if `s.len() < 32`.
+    #[inline(always)]
+    pub fn copy_to_slice(self, s: &mut [u8]) {
+        assert!(s.len() >= 32, "U8x32::copy_to_slice needs ≥32 bytes");
+        // SAFETY: bounds checked above; storeu allows unaligned dst.
+        unsafe { _mm256_storeu_si256(s.as_mut_ptr() as *mut __m256i, self.0) };
+    }
+
+    // ── Reductions ──────────────────────────────────────────────────
+
+    /// Sum of all 32 bytes, modulo 2^8. Wraps on overflow.
+    #[inline(always)]
+    pub fn reduce_sum(self) -> u8 {
+        let arr = self.to_array();
+        arr.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
+    }
+
+    /// Unsigned minimum across all 32 lanes.
+    #[inline(always)]
+    pub fn reduce_min(self) -> u8 {
+        let arr = self.to_array();
+        *arr.iter().min().unwrap()
+    }
+
+    /// Unsigned maximum across all 32 lanes.
+    #[inline(always)]
+    pub fn reduce_max(self) -> u8 {
+        let arr = self.to_array();
+        *arr.iter().max().unwrap()
+    }
+
+    /// Sum-of-absolute-differences against zero ⇒ horizontal byte sum
+    /// folded into the low 64 bits of each 128-bit lane, then combined.
+    /// Returns the total as u64 (does NOT wrap at 2^8). Useful for
+    /// counting set bits in popcount-style masks.
+    #[inline(always)]
+    pub fn sum_bytes_u64(self) -> u64 {
+        // SAFETY: AVX2 baseline.
+        let sums = unsafe { _mm256_sad_epu8(self.0, _mm256_setzero_si256()) };
+        // sad_epu8 places 4 partial sums (one per 64-bit lane) in u16 slots.
+        // Pull them out and add manually — small N, scalar is fine.
+        let mut tmp = [0u64; 4];
+        unsafe { _mm256_storeu_si256(tmp.as_mut_ptr() as *mut __m256i, sums) };
+        tmp[0] + tmp[1] + tmp[2] + tmp[3]
+    }
+
+    // ── Min / max (lane-wise) ───────────────────────────────────────
+
+    /// Lane-wise unsigned min.
+    #[inline(always)]
+    pub fn simd_min(self, other: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_min_epu8(self.0, other.0) })
+    }
+
+    /// Lane-wise unsigned max.
+    #[inline(always)]
+    pub fn simd_max(self, other: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_max_epu8(self.0, other.0) })
+    }
+
+    // ── Comparison → bitmask ────────────────────────────────────────
+
+    /// Per-lane equality. Returns a 32-bit mask: bit `i` set iff
+    /// `self[i] == other[i]`. (Matches the shape of `U8x64::cmpeq_mask`
+    /// at the natural AVX2 width.)
+    #[inline(always)]
+    pub fn cmpeq_mask(self, other: Self) -> u32 {
+        // SAFETY: AVX2 baseline.
+        let eq = unsafe { _mm256_cmpeq_epi8(self.0, other.0) };
+        // movemask_epi8 extracts the MSB of each byte. After cmpeq, each
+        // lane is 0xFF (match) or 0x00 (mismatch); MSB matches what we want.
+        unsafe { _mm256_movemask_epi8(eq) as u32 }
+    }
+
+    /// Per-lane unsigned greater-than. Returns a 32-bit mask.
+    /// AVX2 only has signed `_mm256_cmpgt_epi8`, so we XOR both
+    /// operands with `0x80` to convert unsigned ↔ signed (preserves
+    /// ordering for unsigned compare).
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u32 {
+        // SAFETY: AVX2 baseline.
+        unsafe {
+            let bias = _mm256_set1_epi8(i8::MIN); // 0x80
+            let a_s = _mm256_xor_si256(self.0, bias);
+            let b_s = _mm256_xor_si256(other.0, bias);
+            let gt = _mm256_cmpgt_epi8(a_s, b_s);
+            _mm256_movemask_epi8(gt) as u32
+        }
+    }
+
+    /// Extract MSB of each lane as a 32-bit mask (matches
+    /// `U8x64::movemask` at AVX2 width).
+    #[inline(always)]
+    pub fn movemask(self) -> u32 {
+        // SAFETY: AVX2 baseline.
+        unsafe { _mm256_movemask_epi8(self.0) as u32 }
+    }
+
+    // ── Saturating arithmetic ────────────────────────────────────────
+
+    /// Per-lane saturating unsigned add: `min(a + b, 255)`.
+    #[inline(always)]
+    pub fn saturating_add(self, other: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_adds_epu8(self.0, other.0) })
+    }
+
+    /// Per-lane saturating unsigned sub: `max(a - b, 0)`.
+    #[inline(always)]
+    pub fn saturating_sub(self, other: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_subs_epu8(self.0, other.0) })
+    }
+
+    /// Per-lane unsigned rounded average: `(a + b + 1) >> 1`.
+    #[inline(always)]
+    pub fn pairwise_avg(self, other: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_avg_epu8(self.0, other.0) })
+    }
+
+    // ── 16-bit-lane shifts (used by nibble pack/unpack) ─────────────
+
+    /// Right shift each 16-bit lane by `imm` bits. (AVX2 has no native
+    /// 8-bit shift; 16-bit shift + mask is the standard idiom.)
+    #[inline(always)]
+    pub fn shr_epi16(self, imm: u32) -> Self {
+        // SAFETY: AVX2 baseline. `imm` is an arbitrary count; we use the
+        // vector-count form to avoid the const-generic constraint.
+        Self(unsafe { _mm256_srl_epi16(self.0, _mm_cvtsi32_si128(imm as i32)) })
+    }
+
+    /// Left shift each 16-bit lane by `imm` bits.
+    #[inline(always)]
+    pub fn shl_epi16(self, imm: u32) -> Self {
+        // SAFETY: AVX2 baseline. Vector-count form (see shr_epi16).
+        Self(unsafe { _mm256_sll_epi16(self.0, _mm_cvtsi32_si128(imm as i32)) })
+    }
+
+    // ── Lane shuffles ───────────────────────────────────────────────
+
+    /// Within-128-bit-lane byte shuffle. `idx[i]` (0..16) selects the
+    /// source byte within the SAME 128-bit half; high-bit set in
+    /// `idx[i]` zeroes the output lane. Matches `_mm256_shuffle_epi8`.
+    /// (Cross-lane permute is NOT available in pure AVX2 — use
+    /// `permute_bytes` for that, which falls back to scalar.)
+    #[inline(always)]
+    pub fn shuffle_bytes(self, idx: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_shuffle_epi8(self.0, idx.0) })
+    }
+
+    /// Cross-lane byte permute (full 32-byte). AVX2 has no native
+    /// cross-lane byte permute, so this falls back to scalar — same
+    /// shape as `simd_avx512::U8x64::permute_bytes` does on
+    /// AVX-512F-without-VBMI hosts.
+    ///
+    /// `idx[i]` selects the source byte at position `idx[i] & 0x1F`.
+    #[inline(always)]
+    pub fn permute_bytes(self, idx: Self) -> Self {
+        let src = self.to_array();
+        let idx_arr = idx.to_array();
+        let mut out = [0u8; 32];
+        for i in 0..32 {
+            out[i] = src[(idx_arr[i] & 0x1F) as usize];
+        }
+        Self::from_array(out)
+    }
+
+    /// Interleave low 8 bytes of each 128-bit half (`_mm256_unpacklo_epi8`).
+    /// Output: `[a0,b0, a1,b1, ..., a7,b7]` within each 128-bit half.
+    #[inline(always)]
+    pub fn unpack_lo_epi8(self, other: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_unpacklo_epi8(self.0, other.0) })
+    }
+
+    /// Interleave high 8 bytes of each 128-bit half (`_mm256_unpackhi_epi8`).
+    #[inline(always)]
+    pub fn unpack_hi_epi8(self, other: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_unpackhi_epi8(self.0, other.0) })
+    }
+
+    // ── Conditional move via bit mask ───────────────────────────────
+
+    /// Select `a` where mask bit is set, else `b`. The mask is a
+    /// `U8x32` whose lane MSB acts as the boolean (matches
+    /// `_mm256_blendv_epi8` semantics — different from the
+    /// 64-bit-bitmask shape of `U8x64::mask_blend`).
+    #[inline(always)]
+    pub fn mask_blend(mask: Self, a: Self, b: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_blendv_epi8(b.0, a.0, mask.0) })
+    }
+
+    // ── Pre-computed LUT used by nibble popcount ───────────────────
+
+    /// Returns a `U8x32` populated with the nibble-popcount lookup
+    /// table replicated across both 128-bit halves. `shuffle_bytes`
+    /// with this LUT computes `popcount(nibble)` for 32 bytes in
+    /// parallel.
+    #[inline(always)]
+    pub fn nibble_popcount_lut() -> Self {
+        // Index i ∈ [0,15] → number of set bits in i.
+        Self::from_array([
+            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        ])
+    }
+}
+
+// Bitwise + arithmetic operator impls so consumers can use natural
+// `a + b`, `a & b`, etc. without method chaining. Match the U8x64 shape.
+
+#[cfg(target_arch = "x86_64")]
+impl core::ops::BitAnd for U8x32 {
+    type Output = Self;
+    #[inline(always)]
+    fn bitand(self, rhs: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_and_si256(self.0, rhs.0) })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl core::ops::BitOr for U8x32 {
+    type Output = Self;
+    #[inline(always)]
+    fn bitor(self, rhs: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_or_si256(self.0, rhs.0) })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl core::ops::BitXor for U8x32 {
+    type Output = Self;
+    #[inline(always)]
+    fn bitxor(self, rhs: Self) -> Self {
+        // SAFETY: AVX2 baseline.
+        Self(unsafe { _mm256_xor_si256(self.0, rhs.0) })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl core::ops::Add for U8x32 {
+    type Output = Self;
+    #[inline(always)]
+    fn add(self, rhs: Self) -> Self {
+        // SAFETY: AVX2 baseline. WRAPS — use saturating_add for clamp.
+        Self(unsafe { _mm256_add_epi8(self.0, rhs.0) })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl core::ops::Sub for U8x32 {
+    type Output = Self;
+    #[inline(always)]
+    fn sub(self, rhs: Self) -> Self {
+        // SAFETY: AVX2 baseline. WRAPS — use saturating_sub for clamp.
+        Self(unsafe { _mm256_sub_epi8(self.0, rhs.0) })
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl core::fmt::Debug for U8x32 {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "U8x32({:?})", self.to_array())
+    }
+}
+
+#[cfg(target_arch = "x86_64")]
+impl Default for U8x32 {
+    #[inline(always)]
+    fn default() -> Self {
+        Self::splat(0)
+    }
+}
+
+// ═══════════════════════════════════════════════════════════════════
+// U8x32 tests
+// ═══════════════════════════════════════════════════════════════════
+
+#[cfg(all(test, target_arch = "x86_64"))]
+mod u8x32_tests {
+    use super::U8x32;
+
+    #[test]
+    fn splat_and_to_array() {
+        let v = U8x32::splat(42);
+        assert_eq!(v.to_array(), [42u8; 32]);
+    }
+
+    #[test]
+    fn from_array_roundtrip() {
+        let arr: [u8; 32] = core::array::from_fn(|i| i as u8);
+        let v = U8x32::from_array(arr);
+        assert_eq!(v.to_array(), arr);
+    }
+
+    #[test]
+    fn from_slice_to_slice() {
+        let src: Vec<u8> = (0..40).map(|i| i as u8).collect();
+        let v = U8x32::from_slice(&src);
+        let mut dst = vec![0u8; 32];
+        v.copy_to_slice(&mut dst);
+        assert_eq!(&dst[..], &src[..32]);
+    }
+
+    #[test]
+    fn reduce_sum_wraps() {
+        // 32 × 8 = 256 → wraps to 0 in u8.
+        let v = U8x32::splat(8);
+        assert_eq!(v.reduce_sum(), 0);
+    }
+
+    #[test]
+    fn sum_bytes_u64_does_not_wrap() {
+        // 32 × 100 = 3200 — beyond u8, but sum_bytes_u64 returns full u64.
+        let v = U8x32::splat(100);
+        assert_eq!(v.sum_bytes_u64(), 3200);
+    }
+
+    #[test]
+    fn reduce_min_max() {
+        let arr: [u8; 32] = core::array::from_fn(|i| (i * 7 + 3) as u8);
+        let v = U8x32::from_array(arr);
+        assert_eq!(v.reduce_min(), *arr.iter().min().unwrap());
+        assert_eq!(v.reduce_max(), *arr.iter().max().unwrap());
+    }
+
+    #[test]
+    fn simd_min_max_unsigned() {
+        let a = U8x32::from_array(core::array::from_fn(|i| i as u8));
+        let b = U8x32::from_array(core::array::from_fn(|i| (31 - i) as u8));
+        let lo = a.simd_min(b).to_array();
+        let hi = a.simd_max(b).to_array();
+        for i in 0..32 {
+            assert_eq!(lo[i], (i as u8).min((31 - i) as u8));
+            assert_eq!(hi[i], (i as u8).max((31 - i) as u8));
+        }
+    }
+
+    #[test]
+    fn cmpeq_mask_matches_scalar() {
+        let a: [u8; 32] = core::array::from_fn(|i| (i & 7) as u8);
+        let b: [u8; 32] = core::array::from_fn(|i| (i & 6) as u8);
+        let va = U8x32::from_array(a);
+        let vb = U8x32::from_array(b);
+        let m = va.cmpeq_mask(vb);
+        for i in 0..32 {
+            let bit = (m >> i) & 1 == 1;
+            assert_eq!(bit, a[i] == b[i], "lane {} disagrees", i);
+        }
+    }
+
+    #[test]
+    fn cmpgt_mask_matches_scalar_unsigned() {
+        // High bytes (>= 128) must compare as unsigned, NOT signed.
+        let a: [u8; 32] = core::array::from_fn(|i| (i * 9) as u8);
+        let b: [u8; 32] = core::array::from_fn(|i| (200u8.wrapping_sub(i as u8)) as u8);
+        let va = U8x32::from_array(a);
+        let vb = U8x32::from_array(b);
+        let m = va.cmpgt_mask(vb);
+        for i in 0..32 {
+            let bit = (m >> i) & 1 == 1;
+            assert_eq!(bit, a[i] > b[i], "lane {} disagrees (a={} b={})", i, a[i], b[i]);
+        }
+    }
+
+    #[test]
+    fn saturating_add_clamps() {
+        let a = U8x32::splat(200);
+        let b = U8x32::splat(100);
+        assert_eq!(a.saturating_add(b).to_array(), [255u8; 32]);
+    }
+
+    #[test]
+    fn saturating_sub_clamps() {
+        let a = U8x32::splat(10);
+        let b = U8x32::splat(50);
+        assert_eq!(a.saturating_sub(b).to_array(), [0u8; 32]);
+    }
+
+    #[test]
+    fn pairwise_avg_rounds_up() {
+        let a = U8x32::splat(7);
+        let b = U8x32::splat(8);
+        // (7 + 8 + 1) >> 1 = 8
+        assert_eq!(a.pairwise_avg(b).to_array(), [8u8; 32]);
+    }
+
+    #[test]
+    fn shr_epi16_extracts_nibble() {
+        // Pack 0xAB in low byte of each 16-bit pair, shift right 4 → 0x0A.
+        let mut arr = [0u8; 32];
+        for i in (0..32).step_by(2) {
+            arr[i] = 0xAB;
+        }
+        let shifted = U8x32::from_array(arr).shr_epi16(4).to_array();
+        for i in (0..32).step_by(2) {
+            assert_eq!(shifted[i], 0x0A);
+            assert_eq!(shifted[i + 1], 0x00);
+        }
+    }
+
+    #[test]
+    fn permute_bytes_reverse() {
+        let src: [u8; 32] = core::array::from_fn(|i| i as u8);
+        let idx: [u8; 32] = core::array::from_fn(|i| (31 - i) as u8);
+        let out = U8x32::from_array(src).permute_bytes(U8x32::from_array(idx)).to_array();
+        for i in 0..32 {
+            assert_eq!(out[i], src[31 - i]);
+        }
+    }
+
+    #[test]
+    fn mask_blend_selects_per_msb() {
+        let a = U8x32::splat(0xAA);
+        let b = U8x32::splat(0x55);
+        // Mask with MSB set on every other lane → selects a/b alternating.
+        let mask_arr: [u8; 32] = core::array::from_fn(|i| if i % 2 == 0 { 0x80 } else { 0x00 });
+        let mask = U8x32::from_array(mask_arr);
+        let out = U8x32::mask_blend(mask, a, b).to_array();
+        for i in 0..32 {
+            assert_eq!(out[i], if i % 2 == 0 { 0xAA } else { 0x55 });
+        }
+    }
+
+    #[test]
+    fn nibble_popcount_lut_via_shuffle() {
+        let lut = U8x32::nibble_popcount_lut();
+        // For each possible nibble value 0..15, shuffle should produce
+        // its popcount.
+        let idx: [u8; 32] = core::array::from_fn(|i| (i & 0x0F) as u8);
+        let counts = lut.shuffle_bytes(U8x32::from_array(idx)).to_array();
+        for i in 0..32 {
+            let n = (i & 0x0F) as u32;
+            assert_eq!(counts[i] as u32, n.count_ones(), "lane {}", i);
+        }
+    }
+}
+
 /// Lowercase aliases (std::simd convention).
 #[allow(non_camel_case_types)]
 pub type f32x16 = F32x16;
@@ -1735,6 +2249,9 @@ pub type f32x16 = F32x16;
 pub type f64x8 = F64x8;
 #[allow(non_camel_case_types)]
 pub type u8x64 = U8x64;
+#[cfg(target_arch = "x86_64")]
+#[allow(non_camel_case_types)]
+pub type u8x32 = U8x32;
 #[allow(non_camel_case_types)]
 pub type i32x16 = I32x16;
 #[allow(non_camel_case_types)]

From bc71b6bf3a17207692f244af5d6a56bc1bdd947a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 13 May 2026 17:15:56 +0000
Subject: [PATCH 2/2] style(simd_avx2): apply nightly rustfmt to U8x32
 additions (PR #144 fmt fix)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The format/nightly CI job on PR #144 flagged two sites in the U8x32
additions:

1. `nibble_popcount_lut` — 32-byte literal split into two 16-element
   rows for readability. Nightly rustfmt's chains_overflow_last_block
   + width budget collapse it to one line. Restored.

2. `permute_bytes_reverse` test — a 3-method chain was on one line.
   Nightly rustfmt wants each `.method()` on its own line under
   `chain_width = 60`. Restored.

No semantic change. `cargo +nightly fmt --all --check` clean after.
---
 src/simd_avx2.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
index c07d92d0..5e164eab 100644
--- a/src/simd_avx2.rs
+++ b/src/simd_avx2.rs
@@ -2004,8 +2004,7 @@ impl U8x32 {
     pub fn nibble_popcount_lut() -> Self {
         // Index i ∈ [0,15] → number of set bits in i.
         Self::from_array([
-            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
         ])
     }
 }
@@ -2209,7 +2208,9 @@ mod u8x32_tests {
     fn permute_bytes_reverse() {
         let src: [u8; 32] = core::array::from_fn(|i| i as u8);
         let idx: [u8; 32] = core::array::from_fn(|i| (31 - i) as u8);
-        let out = U8x32::from_array(src).permute_bytes(U8x32::from_array(idx)).to_array();
+        let out = U8x32::from_array(src)
+            .permute_bytes(U8x32::from_array(idx))
+            .to_array();
         for i in 0..32 {
             assert_eq!(out[i], src[31 - i]);
         }