diff --git a/src/hpc/arrow_bridge.rs b/src/hpc/arrow_bridge.rs
index 363c4e55..7ff1a7cf 100644
--- a/src/hpc/arrow_bridge.rs
+++ b/src/hpc/arrow_bridge.rs
@@ -17,13 +17,13 @@
 pub const PLANE_BINARY_BYTES: usize = 2048;
 pub const BINARY_BYTES: usize = PLANE_BYTES; // 2048

 /// Soaking accumulator length (i8 entries per plane).
-pub const SOAKING_DIMS: usize = 10000;
+pub const SOAKING_DIMS: usize = 16_384;

-/// Sigma attention mask width in bytes (10000-bit mask).
-pub const SIGMA_MASK_BYTES: usize = 1250;
+/// Sigma attention mask width in bytes (16384-bit mask = 2048 bytes).
+pub const SIGMA_MASK_BYTES: usize = 2048;

 /// Default soaking dimension count.
-pub const DEFAULT_SOAKING_DIM: usize = 10000;
+pub const DEFAULT_SOAKING_DIM: usize = 16_384;

 /// Schema field names for the bind_nodes_v2 three-plane layout.
 pub mod schema {
@@ -266,7 +266,7 @@ pub struct BindNodeV2 {
     pub object_soaking: Option<Vec<i8>>,
     /// Composite XOR fingerprint: S XOR P XOR O.
     pub spo_binary: [u8; PLANE_BINARY_BYTES],
-    /// 10000-bit attention mask (sigma).
+    /// 16384-bit attention mask (sigma).
     pub sigma_mask: [u8; SIGMA_MASK_BYTES],
     /// NARS frequency (u16 fixed-point, 0..65535).
     pub nars_frequency: u16,
@@ -572,7 +572,7 @@ impl BindNodeV2 {
     /// Convert a Plane accumulator (16384 i8) to a soaking vector (SOAKING_DIMS i8).
     ///
-    /// Truncates if the accumulator is longer than SOAKING_DIMS, pads with 0 if shorter.
+    /// With SOAKING_DIMS = 16384, the copy is one-to-one (no truncation, no padding).
     fn acc_to_soaking(acc: &[i8; 16384]) -> Vec<i8> {
         let mut soaking = vec![0i8; SOAKING_DIMS];
         let copy_len = SOAKING_DIMS.min(acc.len());
@@ -878,8 +878,8 @@ mod tests {
     #[test]
     fn schema_constants() {
         assert_eq!(PLANE_BINARY_BYTES, 2048);
-        assert_eq!(SOAKING_DIMS, 10000);
-        assert_eq!(SIGMA_MASK_BYTES, 1250);
+        assert_eq!(SOAKING_DIMS, 16_384);
+        assert_eq!(SIGMA_MASK_BYTES, 2048);
     }

     #[test]
@@ -1140,9 +1140,9 @@ mod tests {
         let (mut s, mut p, mut o) = make_test_planes();
         let node = BindNodeV2::new(&mut s, &mut p, &mut o, "test");
         assert_eq!(node.sigma_mask.len(), SIGMA_MASK_BYTES);
-        assert_eq!(node.sigma_mask.len(), 1250);
-        // sigma_mask * 8 = 10000 bits
-        assert_eq!(node.sigma_mask.len() * 8, 10000);
+        assert_eq!(node.sigma_mask.len(), 2048);
+        // sigma_mask * 8 = 16384 bits
+        assert_eq!(node.sigma_mask.len() * 8, 16_384);
     }

     #[test]
diff --git a/src/hpc/audio/codec_map.rs b/src/hpc/audio/codec_map.rs
index 24e2935a..df664327 100644
--- a/src/hpc/audio/codec_map.rs
+++ b/src/hpc/audio/codec_map.rs
@@ -35,11 +35,17 @@
 /// what aspect of that codec it captures, and what it replaces.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum CodecSource {
+    /// Opus / CELT.
     Opus,
+    /// OpenAI Whisper.
     Whisper,
+    /// MP3.
     Mp3,
+    /// Ogg Vorbis.
     OggVorbis,
+    /// Suno Bark.
     Bark,
+    /// ElevenLabs.
     ElevenLabs,
 }

@@ -66,11 +72,17 @@ pub enum AudioAspect {
 /// Complete provenance record for one primitive.
 pub struct Provenance {
+    /// Name of the primitive in this codebase.
     pub our_type: &'static str,
+    /// Byte size of the primitive (0 = transform/decision, not stored).
     pub byte_size: usize,
+    /// Production codec the primitive was transcoded from.
     pub source: CodecSource,
+    /// Aspect of audio the primitive captures.
     pub aspect: AudioAspect,
+    /// Concept in the source codec that this corresponds to.
     pub source_concept: &'static str,
+    /// What this primitive replaces (in the source codec or a peer).
    pub what_it_replaces: &'static str,
 }

@@ -212,6 +224,7 @@ pub const PROVENANCE: &[Provenance] = &[
 /// Bark tokens: ~128 bytes per frame
 /// Ours: 52-69 bytes per frame (complete, including phase + identity)
 pub const FRAME_BUDGET: usize = 52;
+/// Per-frame byte budget when the TTS RvqFrame (17 bytes) is also carried.
 pub const FRAME_BUDGET_WITH_TTS: usize = 69;

 /// Codec comparison: bits per second at comparable quality.
diff --git a/src/hpc/audio/modes.rs b/src/hpc/audio/modes.rs
index 9f042ca1..9464894d 100644
--- a/src/hpc/audio/modes.rs
+++ b/src/hpc/audio/modes.rs
@@ -27,12 +27,19 @@ use super::bands;
 /// and maps to a Base17 stride for spectral character.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 pub enum Mode {
+    /// Ionian (major): W-W-H-W-W-W-H — bright, resolved.
     Ionian,     // Major: W-W-H-W-W-W-H → bright, resolved
+    /// Dorian: minor with natural 6th — warm, jazz.
     Dorian,     // Minor with ♮6: warm, jazz
+    /// Phrygian: minor with flat 2nd — dark, flamenco.
     Phrygian,   // Minor with ♭2: dark, flamenco
+    /// Lydian: major with sharp 4th — dreamy, floating.
     Lydian,     // Major with ♯4: dreamy, floating
+    /// Mixolydian: major with flat 7th — dominant, bluesy.
     Mixolydian, // Major with ♭7: dominant, bluesy
+    /// Aeolian (natural minor) — sad, reflective.
     Aeolian,    // Natural minor: sad, reflective
+    /// Locrian (diminished) — unstable, tense.
     Locrian,    // Diminished: unstable, tense
 }
diff --git a/src/hpc/audio/phase.rs b/src/hpc/audio/phase.rs
index 18dd6684..348ed800 100644
--- a/src/hpc/audio/phase.rs
+++ b/src/hpc/audio/phase.rs
@@ -128,6 +128,7 @@ pub fn phase_gradient(
 /// Together: complete nonverbal vocal characterization in 52 bytes.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct PhaseDescriptor {
+    /// 4-byte packed descriptor (coherence, gradient, entropy, stability).
     pub bytes: [u8; 4],
 }
diff --git a/src/hpc/audio/voice.rs b/src/hpc/audio/voice.rs
index ff051c4c..c5cba037 100644
--- a/src/hpc/audio/voice.rs
+++ b/src/hpc/audio/voice.rs
@@ -32,10 +32,12 @@ pub const N_VOICE_CHANNELS: usize = 16;
 /// Compression: 16 bytes (vs Bark's 1024-dim semantic token embedding).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 pub struct VoiceArchetype {
+    /// 16 i8 voice-identity channels (pitch / resonance / articulation / prosody).
     pub channels: [i8; N_VOICE_CHANNELS],
 }

 impl VoiceArchetype {
+    /// Serialized size of a VoiceArchetype, in bytes.
     pub const BYTE_SIZE: usize = N_VOICE_CHANNELS;

     /// Zero archetype (neutral voice).
@@ -177,6 +179,7 @@ impl VoiceArchetype {
 /// For a 256-entry codebook: 256 × 16 bytes = 4 KB.
 #[derive(Clone, Debug)]
 pub struct VoiceCodebook {
+    /// Voice archetype prototypes; index = codebook ID.
     pub entries: Vec<VoiceArchetype>,
 }

@@ -245,6 +248,7 @@ pub struct RvqFrame {
 }

 impl RvqFrame {
+    /// Serialized size of an RvqFrame, in bytes.
     pub const BYTE_SIZE: usize = 17;

     /// Serialize to 17 bytes.
@@ -292,13 +296,17 @@ impl RvqFrame {
 /// VoiceFrame (21B) is the compressed synthesis frame.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct VoiceFrame {
+    /// Compressed RVQ codes (archetype + coarse + fine).
     pub rvq: RvqFrame,
+    /// Per-frame phase dynamics descriptor.
     pub phase: super::phase::PhaseDescriptor,
 }

 impl VoiceFrame {
+    /// Serialized size of a VoiceFrame, in bytes (RvqFrame + 4-byte phase).
     pub const BYTE_SIZE: usize = RvqFrame::BYTE_SIZE + 4; // 21 bytes

+    /// Serialize this VoiceFrame to its 21-byte wire representation.
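+    /// Layout: bytes[0..17] = RvqFrame codes, bytes[17..21] = PhaseDescriptor.
+    ///
+    /// A minimal round-trip sketch (`ignore`d doc test; the `rvq` and `phase`
+    /// values are assumed to be built elsewhere):
+    ///
+    /// ```ignore
+    /// let frame = VoiceFrame { rvq, phase };
+    /// let wire = frame.to_bytes();                      // [u8; 21]
+    /// assert_eq!(VoiceFrame::from_bytes(&wire), frame);
+    /// ```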
    pub fn to_bytes(&self) -> [u8; Self::BYTE_SIZE] {
         let mut bytes = [0u8; Self::BYTE_SIZE];
         bytes[..17].copy_from_slice(&self.rvq.to_bytes());
@@ -306,6 +314,7 @@ impl VoiceFrame {
         bytes
     }

+    /// Deserialize a VoiceFrame from its 21-byte wire representation.
     pub fn from_bytes(bytes: &[u8; Self::BYTE_SIZE]) -> Self {
         let mut rvq_bytes = [0u8; 17];
         rvq_bytes.copy_from_slice(&bytes[..17]);
diff --git a/src/hpc/deepnsm.rs b/src/hpc/deepnsm.rs
index bdacb4b0..b855cab9 100644
--- a/src/hpc/deepnsm.rs
+++ b/src/hpc/deepnsm.rs
@@ -646,14 +646,14 @@ pub fn nsm_decompose(text: &str) -> NsmDecomposition {
     NsmDecomposition { weights, dominant }
 }

-/// Encode an NSM decomposition as a 10000-bit binary vector (1250 bytes).
+/// Encode an NSM decomposition as a 16384-bit binary vector (2048 bytes).
 ///
 /// For each prime with weight > 0, hash prime_index with blake3 to produce
 /// a deterministic bit pattern, then XOR into result for primes whose
 /// normalised weight exceeds 0.5 of the max weight (or any nonzero weight
 /// when only one prime is present).
-pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 1250] {
-    let mut result = [0u8; 1250];
+pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 2048] {
+    let mut result = [0u8; 2048];

     let max_w = decomp.weights.iter().cloned().fold(0.0f32, f32::max);
     if max_w == 0.0 {
@@ -665,20 +665,20 @@ pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 1250] {
         if w < threshold {
             continue;
         }
-        // Hash the prime index to get a deterministic 1250-byte pattern
+        // Hash the prime index to get a deterministic 2048-byte pattern
         let hash_input = (i as u32).to_le_bytes();
-        let mut pattern = [0u8; 1250];
-        // Use blake3 in extended-output mode to fill 1250 bytes
+        let mut pattern = [0u8; 2048];
+        // Use blake3 in extended-output mode to fill 2048 bytes
         let mut hasher = blake3::Hasher::new();
         hasher.update(&hash_input);
         let mut reader = hasher.finalize_xof();
         reader.fill(&mut pattern);

-        // XOR 1250 bytes via crate::simd::U8x64.
-        // 1250 = 19×64 (1216) + 34 scalar remainder.
+        // XOR 2048 bytes via crate::simd::U8x64.
+        // 2048 = 32×64 exactly (no scalar remainder, SIMD-clean).
         {
             use crate::simd::U8x64;
-            let chunks = 1250 / 64; // 19
+            let chunks = 2048 / 64; // 32
             for c in 0..chunks {
                 let off = c * 64;
                 let vr = U8x64::from_slice(&result[off..off + 64]);
@@ -686,10 +686,6 @@ pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 1250] {
                 let xored = vr ^ vp;
                 xored.copy_to_slice(&mut result[off..off + 64]);
             }
-            // Scalar remainder (34 bytes).
-            for j in (chunks * 64)..1250 {
-                result[j] ^= pattern[j];
-            }
         }
     }
diff --git a/src/hpc/framebuffer.rs b/src/hpc/framebuffer.rs
new file mode 100644
index 00000000..a0d214d6
--- /dev/null
+++ b/src/hpc/framebuffer.rs
@@ -0,0 +1,1303 @@
+//! Palette-indexed framebuffer — ndarray IS the graphics card.
+//!
+//! Composes a screen as a `[u8; W*H]` palette-indexed bitmap. Wire format
+//! is palette_codec-compressed (4-bit nibble at 16 colors → 8× smaller
+//! than the RGBA8888 the canvas consumes). q2 receives a ready-made bitmap
+//! and blits with `canvas.putImageData(...)`.
+//!
+//! # Tier-adaptive palette
+//!
+//! The detected SIMD tier determines the palette depth AND foveal detail
+//! budget. Lower-capability hardware gets fewer colors and simpler sprites
+//! that compress better and process faster:
+//!
+//! | Tier        | Palette | Bits/px | Sprite | Wire KB (1024²) |
+//! |-------------|---------|---------|--------|-----------------|
+//! | AVX-512/AMX | 16      | 4       | 8×8    | 512             |
+//! | AVX2        | 8       | 3       | 6×6    | 384             |
+//! | NEON/scalar | 4       | 2       | 4×4    | 256             |
+//!
+//! # Views
+//!
+//! - **MRI view** — full-screen density heatmap, all nodes visible,
+//!   palette maps to intensity (white=hot, black=cold). Overview radar.
+//! - **Neo4j view** — nodes as dot sprites, edges as Bresenham lines,
+//!   labels as glyph sprites. Interactive-style graph display.
+//! - **Cloud view** — distant/peripheral nodes as a nibble-packed
+//!   density field at mipmap L1/L2. Foveal region sharp, periphery fog.
+
+use crate::hpc::palette_codec::{bits_for_palette_size, pack_indices};
+use crate::simd::PREFERRED_F32_LANES;
+
+// ─────────────────────────────────────────────────────────────────────
+// Tier-adaptive palette selection
+// ─────────────────────────────────────────────────────────────────────
+
+/// Palette depth based on detected SIMD tier.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PaletteTier {
+    /// AVX-512 / AMX: 16 colors, 4 bits/pixel, 8×8 sprites.
+    Full16,
+    /// AVX2: 8 colors, 3 bits/pixel, 6×6 sprites.
+    Mid8,
+    /// NEON / scalar: 4 colors, 2 bits/pixel, 4×4 sprites.
+    Low4,
+}
+
+impl PaletteTier {
+    /// Auto-detect from the active SIMD lane width.
+    pub fn detect() -> Self {
+        match PREFERRED_F32_LANES {
+            16 => Self::Full16, // AVX-512 / AMX
+            8 => Self::Mid8,    // AVX2
+            _ => Self::Low4,    // NEON (4), scalar (≤4)
+        }
+    }
+
+    /// Number of palette entries for this tier.
+    #[inline]
+    pub fn palette_size(self) -> usize {
+        match self {
+            Self::Full16 => 16,
+            Self::Mid8 => 8,
+            Self::Low4 => 4,
+        }
+    }
+
+    /// Bits per pixel for this tier.
+    #[inline]
+    pub fn bits_per_pixel(self) -> usize {
+        bits_for_palette_size(self.palette_size())
+    }
+
+    /// Sprite edge length (square) for node dots.
+    #[inline]
+    pub fn sprite_size(self) -> usize {
+        match self {
+            Self::Full16 => 8,
+            Self::Mid8 => 6,
+            Self::Low4 => 4,
+        }
+    }
+
+    /// Wire size in bytes for a `width × height` framebuffer at this tier.
+    #[inline]
+    pub fn wire_bytes(self, width: usize, height: usize) -> usize {
+        let total_px = width * height;
+        let bpp = self.bits_per_pixel();
+        (total_px * bpp + 7) / 8
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Framebuffer + SpriteAtlas
+// ─────────────────────────────────────────────────────────────────────
+
+/// View mode — determines how the framebuffer is composed.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ViewMode {
+    /// Density heatmap — every node plots at its position, intensity =
+    /// confidence. Palette maps linearly: 0 = background, max = hottest.
+    Mri,
+    /// Nodes as dot sprites, edges as Bresenham lines. Neo4j-style.
+    Neo4j,
+    /// Foveal sharp, peripheral density fog. Hybrid.
+    Cloud,
+}
+
+/// Palette-indexed framebuffer. Each pixel is a u8 index into a palette
+/// whose size is determined by the SIMD tier.
+pub struct Framebuffer {
+    pub width: usize,
+    pub height: usize,
+    pub tier: PaletteTier,
+    /// Row-major palette indices, length = width × height.
+    pub pixels: Vec<u8>,
+    /// Dirty rectangle: (x0, y0, x1, y1). Only the region inside needs
+    /// re-encoding on the wire. Reset to (0,0,0,0) after each `pack()`.
+    pub dirty: (usize, usize, usize, usize),
+}
+
+impl Framebuffer {
+    /// Allocate a cleared framebuffer at the given resolution and auto-detected tier.
+    pub fn new(width: usize, height: usize) -> Self {
+        let tier = PaletteTier::detect();
+        Self {
+            width,
+            height,
+            tier,
+            pixels: vec![0u8; width * height],
+            dirty: (0, 0, width, height),
+        }
+    }
+
+    /// Allocate with an explicit tier (for testing or override).
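+    ///
+    /// A minimal sketch (`ignore`d doc test) of pinning the low tier in a test:
+    ///
+    /// ```ignore
+    /// let fb = Framebuffer::with_tier(64, 64, PaletteTier::Low4);
+    /// assert_eq!(fb.tier.palette_size(), 4);
+    /// assert_eq!(fb.pixels.len(), 64 * 64);
+    /// ```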
+    pub fn with_tier(width: usize, height: usize, tier: PaletteTier) -> Self {
+        Self {
+            width,
+            height,
+            tier,
+            pixels: vec![0u8; width * height],
+            dirty: (0, 0, width, height),
+        }
+    }
+
+    /// Clear the entire framebuffer to palette index 0 (background).
+    #[inline]
+    pub fn clear(&mut self) {
+        self.pixels.fill(0);
+        self.dirty = (0, 0, self.width, self.height);
+    }
+
+    /// Set a single pixel (with bounds check). Expands dirty rect.
+    #[inline]
+    pub fn set_pixel(&mut self, x: usize, y: usize, color: u8) {
+        if x < self.width && y < self.height {
+            self.pixels[y * self.width + x] = color;
+            self.expand_dirty(x, y, x + 1, y + 1);
+        }
+    }
+
+    /// Plot a filled dot (square sprite) centered at (cx, cy).
+    pub fn plot_dot(&mut self, cx: usize, cy: usize, color: u8) {
+        let r = self.tier.sprite_size() / 2;
+        let x0 = cx.saturating_sub(r);
+        let y0 = cy.saturating_sub(r);
+        let x1 = (cx + r).min(self.width);
+        let y1 = (cy + r).min(self.height);
+        for y in y0..y1 {
+            let row = y * self.width;
+            for x in x0..x1 {
+                self.pixels[row + x] = color;
+            }
+        }
+        self.expand_dirty(x0, y0, x1, y1);
+    }
+
+    /// Draw a Bresenham line from (x0,y0) to (x1,y1) with palette index.
+    pub fn draw_line(&mut self, mut x0: i32, mut y0: i32, x1: i32, y1: i32, color: u8) {
+        // Remember the starting endpoint: the loop below mutates (x0, y0)
+        // until it equals (x1, y1), and the dirty rect must span the whole
+        // line, not just its final pixel.
+        let (start_x, start_y) = (x0, y0);
+        let dx = (x1 - x0).abs();
+        let dy = -(y1 - y0).abs();
+        let sx: i32 = if x0 < x1 { 1 } else { -1 };
+        let sy: i32 = if y0 < y1 { 1 } else { -1 };
+        let mut err = dx + dy;
+
+        loop {
+            if x0 >= 0 && y0 >= 0 && (x0 as usize) < self.width && (y0 as usize) < self.height {
+                self.pixels[y0 as usize * self.width + x0 as usize] = color;
+            }
+            if x0 == x1 && y0 == y1 { break; }
+            let e2 = 2 * err;
+            if e2 >= dy { err += dy; x0 += sx; }
+            if e2 <= dx { err += dx; y0 += sy; }
+        }
+        let (lx, rx) = (start_x.min(x1).max(0) as usize, (start_x.max(x1).max(0) as usize + 1).min(self.width));
+        let (ly, ry) = (start_y.min(y1).max(0) as usize, (start_y.max(y1).max(0) as usize + 1).min(self.height));
+        self.expand_dirty(lx, ly, rx, ry);
+    }
+
+    /// MRI density blit — for each node, increment the pixel at its projected
+    /// position. Clamped to palette max so saturated regions show as hottest.
+    pub fn blit_mri_density(&mut self, screen_xs: &[usize], screen_ys: &[usize]) {
+        let max_idx = (self.tier.palette_size() - 1) as u8;
+        for (&sx, &sy) in screen_xs.iter().zip(screen_ys.iter()) {
+            if sx < self.width && sy < self.height {
+                let idx = sy * self.width + sx;
+                self.pixels[idx] = self.pixels[idx].saturating_add(1).min(max_idx);
+            }
+        }
+        self.dirty = (0, 0, self.width, self.height);
+    }
+
+    /// Pack the framebuffer into palette_codec wire format.
+    ///
+    /// Returns `(packed_u64s, bits_per_pixel)`. The consumer unpacks with
+    /// `palette_codec::unpack_indices(&packed, bpp, w*h)`.
+    pub fn pack(&mut self) -> (Vec<u64>, usize) {
+        let bpp = self.tier.bits_per_pixel();
+        let packed = pack_indices(&self.pixels, bpp);
+        self.dirty = (0, 0, 0, 0);
+        (packed, bpp)
+    }
+
+    /// Byte count of the last `pack()` output (for bandwidth estimation).
+    pub fn packed_byte_estimate(&self) -> usize {
+        self.tier.wire_bytes(self.width, self.height)
+    }
+
+    fn expand_dirty(&mut self, x0: usize, y0: usize, x1: usize, y1: usize) {
+        self.dirty.0 = self.dirty.0.min(x0);
+        self.dirty.1 = self.dirty.1.min(y0);
+        self.dirty.2 = self.dirty.2.max(x1);
+        self.dirty.3 = self.dirty.3.max(y1);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Mipmap — bitwise 4× downsampling for LOD pyramid.
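+// Each level halves both axes; `build_mipmap_pyramid` stops once both
+// dimensions fall below `min_dim`.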
+// ─────────────────────────────────────────────────────────────────────
+
+/// Downsample a framebuffer 2× in each axis (4× total pixels).
+///
+/// Each 2×2 block maps to one pixel. Strategy: max (brightest wins),
+/// matching the MRI heatmap "any signal in this region" semantic.
+pub fn downsample_2x(src: &[u8], src_w: usize, src_h: usize) -> (Vec<u8>, usize, usize) {
+    let dst_w = src_w / 2;
+    let dst_h = src_h / 2;
+    let mut dst = vec![0u8; dst_w * dst_h];
+    for dy in 0..dst_h {
+        for dx in 0..dst_w {
+            let sx = dx * 2;
+            let sy = dy * 2;
+            let a = src[sy * src_w + sx];
+            let b = src[sy * src_w + sx + 1];
+            let c = src[(sy + 1) * src_w + sx];
+            let d = src[(sy + 1) * src_w + sx + 1];
+            dst[dy * dst_w + dx] = a.max(b).max(c).max(d);
+        }
+    }
+    (dst, dst_w, dst_h)
+}
+
+/// Full mipmap pyramid from L0 (original) down to the level where
+/// both dimensions are < `min_dim`.
+pub fn build_mipmap_pyramid(fb: &Framebuffer, min_dim: usize) -> Vec<(Vec<u8>, usize, usize)> {
+    let mut levels = Vec::new();
+    let mut cur = fb.pixels.clone();
+    let mut w = fb.width;
+    let mut h = fb.height;
+    levels.push((cur.clone(), w, h));
+    while w > min_dim && h > min_dim {
+        let (down, dw, dh) = downsample_2x(&cur, w, h);
+        levels.push((down.clone(), dw, dh));
+        cur = down;
+        w = dw;
+        h = dh;
+    }
+    levels
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Compose: RenderFrame → Framebuffer (the "graphics card" pipeline).
+// ─────────────────────────────────────────────────────────────────────
+
+/// Project a 3D position to 2D screen coordinates (orthographic).
+///
+/// Simple orthographic: x → screen_x, y → screen_y (z ignored).
+/// Scale and offset are applied. This is the dumbest projection that
+/// works; replace with perspective when q2 has a camera matrix.
+#[inline]
+pub fn project_ortho(
+    pos_x: f32, pos_y: f32,
+    scale: f32, offset_x: f32, offset_y: f32,
+    screen_w: usize, screen_h: usize,
+) -> (usize, usize) {
+    let sx = ((pos_x * scale + offset_x) as usize).min(screen_w.saturating_sub(1));
+    let sy = ((pos_y * scale + offset_y) as usize).min(screen_h.saturating_sub(1));
+    (sx, sy)
+}
+
+use crate::hpc::renderer::RenderFrame;
+
+/// Compose a Neo4j-style view: dots at nodes, lines for edges.
+///
+/// `edges` is a list of (source_idx, target_idx) pairs into the frame's
+/// node arrays. `node_color` and `edge_color` are the palette indices
+/// used for dot sprites and Bresenham lines respectively.
+pub fn compose_neo4j(
+    fb: &mut Framebuffer,
+    frame: &RenderFrame,
+    edges: &[(usize, usize)],
+    scale: f32,
+    offset: (f32, f32),
+    node_color: u8,
+    edge_color: u8,
+) {
+    fb.clear();
+    let w = fb.width;
+    let h = fb.height;
+
+    // Edges first (so nodes overdraw on top).
+    for &(src, tgt) in edges {
+        if src >= frame.len || tgt >= frame.len { continue; }
+        let (sx0, sy0) = project_ortho(
+            frame.positions[src * 3], frame.positions[src * 3 + 1],
+            scale, offset.0, offset.1, w, h,
+        );
+        let (sx1, sy1) = project_ortho(
+            frame.positions[tgt * 3], frame.positions[tgt * 3 + 1],
+            scale, offset.0, offset.1, w, h,
+        );
+        fb.draw_line(sx0 as i32, sy0 as i32, sx1 as i32, sy1 as i32, edge_color);
+    }
+
+    // Nodes as dot sprites.
+    for i in 0..frame.len {
+        let (sx, sy) = project_ortho(
+            frame.positions[i * 3], frame.positions[i * 3 + 1],
+            scale, offset.0, offset.1, w, h,
+        );
+        fb.plot_dot(sx, sy, node_color);
+    }
+}
+
+/// Compose an MRI density heatmap view.
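+///
+/// A minimal usage sketch (`ignore`d doc test; assumes `frame` is a
+/// populated `RenderFrame`):
+///
+/// ```ignore
+/// let mut fb = Framebuffer::new(1024, 1024);
+/// compose_mri(&mut fb, &frame, 1.0, (0.0, 0.0));
+/// let (packed, bpp) = fb.pack(); // palette_codec wire format for q2
+/// ```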
+pub fn compose_mri( + fb: &mut Framebuffer, + frame: &RenderFrame, + scale: f32, + offset: (f32, f32), +) { + fb.clear(); + let w = fb.width; + let h = fb.height; + + let mut xs = Vec::with_capacity(frame.len); + let mut ys = Vec::with_capacity(frame.len); + for i in 0..frame.len { + let (sx, sy) = project_ortho( + frame.positions[i * 3], frame.positions[i * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + xs.push(sx); + ys.push(sy); + } + fb.blit_mri_density(&xs, &ys); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::hpc::palette_codec::unpack_indices; + + #[test] + fn tier_detect_matches_lane_width() { + let tier = PaletteTier::detect(); + match PREFERRED_F32_LANES { + 16 => assert_eq!(tier, PaletteTier::Full16), + 8 => assert_eq!(tier, PaletteTier::Mid8), + _ => assert_eq!(tier, PaletteTier::Low4), + } + } + + #[test] + fn tier_palette_sizes() { + assert_eq!(PaletteTier::Full16.palette_size(), 16); + assert_eq!(PaletteTier::Mid8.palette_size(), 8); + assert_eq!(PaletteTier::Low4.palette_size(), 4); + } + + #[test] + fn tier_bits_per_pixel() { + assert_eq!(PaletteTier::Full16.bits_per_pixel(), 4); + assert_eq!(PaletteTier::Mid8.bits_per_pixel(), 3); + assert_eq!(PaletteTier::Low4.bits_per_pixel(), 2); + } + + #[test] + fn tier_sprite_sizes() { + assert_eq!(PaletteTier::Full16.sprite_size(), 8); + assert_eq!(PaletteTier::Mid8.sprite_size(), 6); + assert_eq!(PaletteTier::Low4.sprite_size(), 4); + } + + #[test] + fn framebuffer_clear_sets_all_zero() { + let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + fb.pixels[100] = 5; + fb.clear(); + assert!(fb.pixels.iter().all(|&p| p == 0)); + } + + #[test] + fn plot_dot_size_matches_tier() { + for tier in [PaletteTier::Full16, PaletteTier::Mid8, PaletteTier::Low4] { + let mut fb = Framebuffer::with_tier(64, 64, tier); + fb.plot_dot(32, 32, 1); + let lit: usize = fb.pixels.iter().filter(|&&p| p > 0).count(); + let expected = tier.sprite_size() * tier.sprite_size(); + assert_eq!(lit, expected, "tier {:?}", tier); + } + } + + #[test] + fn bresenham_horizontal_line() { + let mut fb = Framebuffer::with_tier(32, 32, PaletteTier::Full16); + fb.draw_line(2, 5, 10, 5, 3); + for x in 2..=10 { + assert_eq!(fb.pixels[5 * 32 + x], 3); + } + } + + #[test] + fn bresenham_diagonal_line() { + let mut fb = Framebuffer::with_tier(32, 32, PaletteTier::Full16); + fb.draw_line(0, 0, 7, 7, 2); + for i in 0..=7 { + assert_eq!(fb.pixels[i * 32 + i], 2); + } + } + + #[test] + fn mri_density_accumulates() { + let mut fb = Framebuffer::with_tier(16, 16, PaletteTier::Full16); + let xs = vec![5, 5, 5]; // same pixel hit 3 times + let ys = vec![5, 5, 5]; + fb.blit_mri_density(&xs, &ys); + assert_eq!(fb.pixels[5 * 16 + 5], 3); + } + + #[test] + fn mri_density_clamps_to_palette_max() { + let mut fb = Framebuffer::with_tier(16, 16, PaletteTier::Low4); + // Low4 = palette_size 4, max index = 3. 
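+        // Ten hits at one pixel would count to 10; the saturating_add +
+        // min clamp holds it at the palette max (3).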
+        let xs = vec![2; 10];
+        let ys = vec![2; 10];
+        fb.blit_mri_density(&xs, &ys);
+        assert_eq!(fb.pixels[2 * 16 + 2], 3); // clamped
+    }
+
+    #[test]
+    fn pack_roundtrips_through_palette_codec() {
+        let mut fb = Framebuffer::with_tier(16, 16, PaletteTier::Full16);
+        fb.plot_dot(8, 8, 7);
+        let original = fb.pixels.clone();
+        let (packed, bpp) = fb.pack();
+        let recovered = unpack_indices(&packed, bpp, 16 * 16);
+        assert_eq!(original, recovered);
+    }
+
+    #[test]
+    fn downsample_2x_shrinks_dimensions() {
+        let src = vec![1u8; 64 * 64];
+        let (dst, w, h) = downsample_2x(&src, 64, 64);
+        assert_eq!(w, 32);
+        assert_eq!(h, 32);
+        assert_eq!(dst.len(), 32 * 32);
+        assert!(dst.iter().all(|&p| p == 1));
+    }
+
+    #[test]
+    fn mipmap_pyramid_has_correct_levels() {
+        let fb = Framebuffer::with_tier(256, 256, PaletteTier::Full16);
+        let pyramid = build_mipmap_pyramid(&fb, 8);
+        // 256 → 128 → 64 → 32 → 16 → 8 = 6 levels (including L0).
+        assert!(pyramid.len() >= 5);
+        assert_eq!(pyramid[0].1, 256);
+        assert_eq!(pyramid[1].1, 128);
+    }
+
+    #[test]
+    fn compose_neo4j_plots_nodes_and_edges() {
+        let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16);
+        let mut frame = RenderFrame::with_capacity(16);
+        // Two nodes
+        frame.len = 2;
+        frame.positions[0] = 10.0; frame.positions[1] = 10.0; frame.positions[2] = 0.0;
+        frame.positions[3] = 50.0; frame.positions[4] = 50.0; frame.positions[5] = 0.0;
+        let edges = vec![(0, 1)];
+        compose_neo4j(&mut fb, &frame, &edges, 1.0, (0.0, 0.0), 5, 2);
+        // Node 0 should have a dot around (10, 10).
+        assert_eq!(fb.pixels[10 * 64 + 10], 5);
+        // Edge should have at least one pixel of color 2 on the diagonal.
+        let edge_count = fb.pixels.iter().filter(|&&p| p == 2).count();
+        assert!(edge_count > 0, "edge should have drawn pixels");
+    }
+
+    #[test]
+    fn compose_mri_plots_density() {
+        let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16);
+        let mut frame = RenderFrame::with_capacity(16);
+        frame.len = 3;
+        // Three nodes at same spot → density = 3.
+        for i in 0..3 {
+            frame.positions[i * 3] = 20.0;
+            frame.positions[i * 3 + 1] = 20.0;
+        }
+        compose_mri(&mut fb, &frame, 1.0, (0.0, 0.0));
+        assert_eq!(fb.pixels[20 * 64 + 20], 3);
+    }
+
+    #[test]
+    fn wire_bytes_decrease_with_lower_tier() {
+        let full = PaletteTier::Full16.wire_bytes(1024, 768);
+        let mid = PaletteTier::Mid8.wire_bytes(1024, 768);
+        let low = PaletteTier::Low4.wire_bytes(1024, 768);
+        assert!(full > mid, "16-color > 8-color wire");
+        assert!(mid > low, "8-color > 4-color wire");
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Wobble spring — organic node displacement that masks layout jitter.
+//
+// When a node moves (velocity exceeds threshold), wobble energy is
+// injected. It decays exponentially each tick. At render time, wobble
+// is added to the projected position — the node oscillates around its
+// physics-true location. The effect: the graph feels alive, and small
+// layout inaccuracies are hidden behind spring motion.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Per-node wobble state: displacement + decay.
+#[derive(Debug, Clone)]
+pub struct WobbleState {
+    /// Per-node wobble displacement (x, y interleaved; length = 2·capacity).
+    pub displace: Vec<f32>,
+    /// Decay factor per tick [0, 1). 0.92 ≈ an 8-frame half-life.
+    pub decay: f32,
+    /// Velocity threshold: inject wobble when speed exceeds this.
+    pub inject_threshold: f32,
+    /// Injection amplitude: max wobble pixels on injection.
pub amplitude: f32,
+}
+
+impl WobbleState {
+    /// Allocate zeroed wobble state for `capacity` nodes.
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            displace: vec![0.0; capacity * 2],
+            decay: 0.92,
+            inject_threshold: 0.5,
+            amplitude: 3.0,
+        }
+    }
+
+    /// Inject wobble for nodes whose velocity exceeds the threshold,
+    /// then decay all displacements. Call once per tick.
+    pub fn tick(&mut self, velocities: &[f32], len: usize) {
+        // Inject: if |v| > threshold, add random-ish displacement
+        // (use velocity direction × amplitude for deterministic wobble).
+        for i in 0..len {
+            let vx = velocities[i * 3];
+            let vy = velocities[i * 3 + 1];
+            let speed = (vx * vx + vy * vy).sqrt();
+            if speed > self.inject_threshold {
+                // Perpendicular to velocity direction → organic wobble
+                let norm = speed.recip();
+                self.displace[i * 2] += -vy * norm * self.amplitude;
+                self.displace[i * 2 + 1] += vx * norm * self.amplitude;
+            }
+        }
+        // Decay all
+        for d in self.displace.iter_mut() {
+            *d *= self.decay;
+        }
+    }
+
+    /// Get the wobble-adjusted screen position for node `node_idx`.
+    #[inline]
+    pub fn adjust(&self, sx: usize, sy: usize, node_idx: usize) -> (usize, usize) {
+        let dx = self.displace.get(node_idx * 2).copied().unwrap_or(0.0);
+        let dy = self.displace.get(node_idx * 2 + 1).copied().unwrap_or(0.0);
+        (
+            (sx as f32 + dx).max(0.0) as usize,
+            (sy as f32 + dy).max(0.0) as usize,
+        )
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Neuron firing — nodes pulse when the cognitive shader resolves them.
+//
+// fire_intensity[i] ∈ [0, 255]. The shader sets it to 255 on Commit,
+// 200 on Epiphany, 128 on FailureTicket. Each tick it decays by
+// `decay_rate`. The framebuffer maps fire_intensity to a brighter
+// palette index (additive blend).
+// ─────────────────────────────────────────────────────────────────────
+
+/// Per-node fire intensity for visual neuron-pulse feedback.
+#[derive(Debug, Clone)]
+pub struct FireState {
+    /// Intensity per node [0, 255]. 0 = dark, 255 = just fired.
+    pub intensity: Vec<u8>,
+    /// Subtracted per tick. Higher = faster fade.
+    pub decay_rate: u8,
+}
+
+impl FireState {
+    /// Allocate dark (all-zero) fire state for `capacity` nodes.
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            intensity: vec![0u8; capacity],
+            decay_rate: 16,
+        }
+    }
+
+    /// Fire a node at the given intensity (255 = max).
+    #[inline]
+    pub fn fire(&mut self, node_idx: usize, intensity: u8) {
+        if let Some(v) = self.intensity.get_mut(node_idx) {
+            *v = (*v).max(intensity);
+        }
+    }
+
+    /// Decay all intensities by `decay_rate`. Call once per tick.
+    pub fn tick(&mut self) {
+        for v in self.intensity.iter_mut() {
+            *v = v.saturating_sub(self.decay_rate);
+        }
+    }
+
+    /// Map fire intensity to a palette color boost.
+    /// Returns between 0 (no boost) and `palette_max / 2` extra palette indices.
+    #[inline]
+    pub fn color_boost(&self, node_idx: usize, palette_max: u8) -> u8 {
+        let raw = self.intensity.get(node_idx).copied().unwrap_or(0);
+        // Scale [0,255] → [0, palette_max/2] extra indices
+        let boost = (raw as u16 * (palette_max as u16 / 2)) / 255;
+        boost as u8
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Glyph atlas — 5×7 bitmap font for node labels.
+//
+// 128 ASCII slots stored as 5-byte columns (7 rows each). Total atlas:
+// 128 × 5 = 640 bytes — comfortably fits in L1.
+// ─────────────────────────────────────────────────────────────────────
+
+/// 5×7 bitmap glyph for one character. Column-major: glyph[col] has 7 bits (rows).
+pub type Glyph = [u8; 5];
+
+/// Minimal 5×7 ASCII glyph set. Covers A-Z, 0-9, space, common punctuation.
+/// Missing chars render as a filled block. +pub static GLYPH_ATLAS: [Glyph; 128] = { + let mut atlas = [[0x7Fu8; 5]; 128]; // default = filled block + // Space + atlas[b' ' as usize] = [0, 0, 0, 0, 0]; + // Digits 0-9 + atlas[b'0' as usize] = [0x3E, 0x51, 0x49, 0x45, 0x3E]; + atlas[b'1' as usize] = [0x00, 0x42, 0x7F, 0x40, 0x00]; + atlas[b'2' as usize] = [0x62, 0x51, 0x49, 0x49, 0x46]; + atlas[b'3' as usize] = [0x22, 0x41, 0x49, 0x49, 0x36]; + atlas[b'4' as usize] = [0x18, 0x14, 0x12, 0x7F, 0x10]; + atlas[b'5' as usize] = [0x27, 0x45, 0x45, 0x45, 0x39]; + atlas[b'6' as usize] = [0x3C, 0x4A, 0x49, 0x49, 0x30]; + atlas[b'7' as usize] = [0x01, 0x71, 0x09, 0x05, 0x03]; + atlas[b'8' as usize] = [0x36, 0x49, 0x49, 0x49, 0x36]; + atlas[b'9' as usize] = [0x06, 0x49, 0x49, 0x29, 0x1E]; + // Letters A-Z + atlas[b'A' as usize] = [0x7E, 0x09, 0x09, 0x09, 0x7E]; + atlas[b'B' as usize] = [0x7F, 0x49, 0x49, 0x49, 0x36]; + atlas[b'C' as usize] = [0x3E, 0x41, 0x41, 0x41, 0x22]; + atlas[b'D' as usize] = [0x7F, 0x41, 0x41, 0x41, 0x3E]; + atlas[b'E' as usize] = [0x7F, 0x49, 0x49, 0x49, 0x41]; + atlas[b'F' as usize] = [0x7F, 0x09, 0x09, 0x09, 0x01]; + atlas[b'G' as usize] = [0x3E, 0x41, 0x49, 0x49, 0x7A]; + atlas[b'H' as usize] = [0x7F, 0x08, 0x08, 0x08, 0x7F]; + atlas[b'I' as usize] = [0x00, 0x41, 0x7F, 0x41, 0x00]; + atlas[b'J' as usize] = [0x20, 0x40, 0x41, 0x3F, 0x01]; + atlas[b'K' as usize] = [0x7F, 0x08, 0x14, 0x22, 0x41]; + atlas[b'L' as usize] = [0x7F, 0x40, 0x40, 0x40, 0x40]; + atlas[b'M' as usize] = [0x7F, 0x02, 0x0C, 0x02, 0x7F]; + atlas[b'N' as usize] = [0x7F, 0x04, 0x08, 0x10, 0x7F]; + atlas[b'O' as usize] = [0x3E, 0x41, 0x41, 0x41, 0x3E]; + atlas[b'P' as usize] = [0x7F, 0x09, 0x09, 0x09, 0x06]; + atlas[b'Q' as usize] = [0x3E, 0x41, 0x51, 0x21, 0x5E]; + atlas[b'R' as usize] = [0x7F, 0x09, 0x19, 0x29, 0x46]; + atlas[b'S' as usize] = [0x26, 0x49, 0x49, 0x49, 0x32]; + atlas[b'T' as usize] = [0x01, 0x01, 0x7F, 0x01, 0x01]; + atlas[b'U' as usize] = [0x3F, 0x40, 0x40, 0x40, 0x3F]; + atlas[b'V' as usize] = [0x1F, 0x20, 0x40, 0x20, 0x1F]; + atlas[b'W' as usize] = [0x3F, 0x40, 0x38, 0x40, 0x3F]; + atlas[b'X' as usize] = [0x63, 0x14, 0x08, 0x14, 0x63]; + atlas[b'Y' as usize] = [0x03, 0x04, 0x78, 0x04, 0x03]; + atlas[b'Z' as usize] = [0x61, 0x51, 0x49, 0x45, 0x43]; + // Punctuation + atlas[b'.' as usize] = [0x00, 0x60, 0x60, 0x00, 0x00]; + atlas[b'-' as usize] = [0x08, 0x08, 0x08, 0x08, 0x08]; + atlas[b'_' as usize] = [0x40, 0x40, 0x40, 0x40, 0x40]; + atlas[b':' as usize] = [0x00, 0x36, 0x36, 0x00, 0x00]; + atlas +}; + +impl Framebuffer { + /// Blit a text label at (x, y) using the 5×7 glyph atlas. + pub fn draw_label(&mut self, x: usize, y: usize, text: &str, color: u8) { + let mut cx = x; + for ch in text.bytes() { + let idx = (ch as usize).min(127); + let glyph = &GLYPH_ATLAS[idx]; + for col in 0..5 { + let bits = glyph[col]; + for row in 0..7 { + if bits & (1 << row) != 0 { + let px = cx + col; + let py = y + row; + if px < self.width && py < self.height { + self.pixels[py * self.width + px] = color; + } + } + } + } + cx += 6; // 5 pixels + 1 gap + } + let text_w = text.len() * 6; + self.expand_dirty(x, y, (x + text_w).min(self.width), (y + 7).min(self.height)); + } +} + +// ───────────────────────────────────────────────────────────────────── +// Flyby ring buffer — Amiga demo scene trick. +// +// Pre-render N frames of a mathematically computed satellite orbit +// around the graph. Store as a ring of palette_codec-packed framebuffers. 
// During zoom/pan transitions or when the compute budget is spent,
+// play from the ring. The loop is seamless (Lissajous orbit completes
+// one full cycle over N frames). Higher N = smoother apparent frame rate
+// at the cost of memory.
+//
+// 300 frames × 512 KB each (16-color 1024²) = 150 MB.
+// 300 frames × 128 KB each (16-color 512²) = 38 MB — fits L3.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Pre-rendered flyby frame (palette_codec-packed + camera state).
+#[derive(Clone)]
+pub struct FlybyFrame {
+    /// Packed pixel indices (via palette_codec).
+    pub packed: Vec<u64>,
+    /// Bits per pixel used for packing.
+    pub bpp: usize,
+    /// Camera position at this keyframe.
+    pub cam_x: f32,
+    pub cam_y: f32,
+    pub cam_zoom: f32,
+}
+
+/// Ring buffer of pre-rendered flyby keyframes.
+pub struct FlybyCache {
+    pub frames: Vec<FlybyFrame>,
+    /// Current playback position in [0, frames.len()).
+    pub cursor: usize,
+    /// Width/height of pre-rendered frames.
+    pub width: usize,
+    pub height: usize,
+}
+
+impl FlybyCache {
+    /// Pre-render `n_frames` of a Lissajous satellite orbit.
+    ///
+    /// The orbit traces a figure-8 around the graph center, completing
+    /// one full loop over `n_frames`. `orbit_radius` sets the orbit size
+    /// in world units; `zoom_range` controls the min/max camera zoom.
+    pub fn prerender(
+        fb_template: &Framebuffer,
+        frame: &RenderFrame,
+        edges: &[(usize, usize)],
+        n_frames: usize,
+        orbit_radius: f32,
+        zoom_range: (f32, f32),
+        node_color: u8,
+        edge_color: u8,
+    ) -> Self {
+        let mut frames = Vec::with_capacity(n_frames);
+        let w = fb_template.width;
+        let h = fb_template.height;
+        let tier = fb_template.tier;
+
+        for i in 0..n_frames {
+            let t = (i as f32 / n_frames as f32) * std::f32::consts::TAU;
+            // Lissajous: x = A·sin(t), y = A·sin(2t) → figure-8 orbit
+            let cam_x = orbit_radius * t.sin() + (w as f32 / 2.0);
+            let cam_y = orbit_radius * (2.0 * t).sin() + (h as f32 / 2.0);
+            // Zoom oscillates between min and max over the orbit
+            let zoom_t = (t.cos() + 1.0) * 0.5; // [0, 1]
+            let cam_zoom = zoom_range.0 + (zoom_range.1 - zoom_range.0) * zoom_t;
+
+            let mut fb = Framebuffer::with_tier(w, h, tier);
+            compose_neo4j(
+                &mut fb, frame, edges,
+                cam_zoom, (-cam_x * cam_zoom + w as f32 / 2.0,
+                           -cam_y * cam_zoom + h as f32 / 2.0),
+                node_color, edge_color,
+            );
+            let (packed, bpp) = fb.pack();
+            frames.push(FlybyFrame { packed, bpp, cam_x, cam_y, cam_zoom });
+        }
+        Self { frames, cursor: 0, width: w, height: h }
+    }
+
+    /// Advance the cursor and return the next keyframe (looping).
+    pub fn next_frame(&mut self) -> &FlybyFrame {
+        // Read the index before advancing: returning `&self.frames[self.cursor]`
+        // while also assigning to `self.cursor` would not pass the borrow checker.
+        let idx = self.cursor;
+        self.cursor = (self.cursor + 1) % self.frames.len();
+        &self.frames[idx]
+    }
+
+    /// Seek to the keyframe closest to the given camera position.
+    /// Used when transitioning from interactive mode back to flyby.
+    pub fn seek_nearest(&mut self, cam_x: f32, cam_y: f32) {
+        let mut best_dist = f32::MAX;
+        let mut best_idx = 0;
+        for (i, f) in self.frames.iter().enumerate() {
+            let dx = f.cam_x - cam_x;
+            let dy = f.cam_y - cam_y;
+            let d = dx * dx + dy * dy;
+            if d < best_dist {
+                best_dist = d;
+                best_idx = i;
+            }
+        }
+        self.cursor = best_idx;
+    }
+
+    /// Total memory used by the cache.
+    pub fn memory_bytes(&self) -> usize {
+        self.frames.iter().map(|f| f.packed.len() * 8).sum()
+    }
+
+    /// Frame count.
+ pub fn len(&self) -> usize { self.frames.len() } + + pub fn is_empty(&self) -> bool { self.frames.is_empty() } +} + +// ───────────────────────────────────────────────────────────────────── +// Full composition: wobble + fire + labels + Neo4j view +// ───────────────────────────────────────────────────────────────────── + +/// Full Neo4j-style compose with wobble, neuron fire, and labels. +pub fn compose_neo4j_full( + fb: &mut Framebuffer, + frame: &RenderFrame, + edges: &[(usize, usize)], + scale: f32, + offset: (f32, f32), + wobble: &WobbleState, + fire: &FireState, + labels: &[&str], + node_base_color: u8, + edge_color: u8, + label_color: u8, +) { + fb.clear(); + let w = fb.width; + let h = fb.height; + let pal_max = (fb.tier.palette_size() - 1) as u8; + + // 1. Edges (drawn first so nodes overdraw). + for &(src, tgt) in edges { + if src >= frame.len || tgt >= frame.len { continue; } + let (sx0, sy0) = project_ortho( + frame.positions[src * 3], frame.positions[src * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (sx1, sy1) = project_ortho( + frame.positions[tgt * 3], frame.positions[tgt * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (wx0, wy0) = wobble.adjust(sx0, sy0, src); + let (wx1, wy1) = wobble.adjust(sx1, sy1, tgt); + fb.draw_line(wx0 as i32, wy0 as i32, wx1 as i32, wy1 as i32, edge_color); + } + + // 2. Nodes as dot sprites with fire boost. + for i in 0..frame.len { + let (sx, sy) = project_ortho( + frame.positions[i * 3], frame.positions[i * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (wx, wy) = wobble.adjust(sx, sy, i); + let boost = fire.color_boost(i, pal_max); + let color = (node_base_color + boost).min(pal_max); + fb.plot_dot(wx, wy, color); + } + + // 3. Labels (drawn last so text is on top). + for (i, &label) in labels.iter().enumerate().take(frame.len) { + if label.is_empty() { continue; } + let (sx, sy) = project_ortho( + frame.positions[i * 3], frame.positions[i * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (wx, wy) = wobble.adjust(sx, sy, i); + let label_y = wy + fb.tier.sprite_size() / 2 + 1; + fb.draw_label(wx.saturating_sub(label.len() * 3), label_y, label, label_color); + } +} + +#[cfg(test)] +mod visual_tests { + use super::*; + use crate::hpc::renderer::RenderFrame; + + #[test] + fn wobble_decays_toward_zero() { + let mut w = WobbleState::new(4); + w.displace[0] = 10.0; + w.displace[1] = -5.0; + let vels = vec![0.0f32; 12]; // no new injection + for _ in 0..200 { + w.tick(&vels, 4); + } + assert!(w.displace[0].abs() < 0.01, "got {}", w.displace[0]); + assert!(w.displace[1].abs() < 0.01, "got {}", w.displace[1]); + } + + #[test] + fn wobble_injects_on_high_velocity() { + let mut w = WobbleState::new(2); + let vels = vec![10.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + w.tick(&vels, 1); + // Perpendicular to (10, 0) → displacement in Y + assert!(w.displace[1].abs() > 0.1); + } + + #[test] + fn fire_decays_to_zero() { + let mut f = FireState::new(4); + f.fire(0, 255); + assert_eq!(f.intensity[0], 255); + for _ in 0..20 { + f.tick(); + } + assert_eq!(f.intensity[0], 0); + } + + #[test] + fn fire_color_boost_scales_with_intensity() { + let mut f = FireState::new(4); + f.fire(0, 255); + let boost_full = f.color_boost(0, 15); + assert!(boost_full > 0); + f.intensity[0] = 0; + let boost_zero = f.color_boost(0, 15); + assert_eq!(boost_zero, 0); + } + + #[test] + fn draw_label_renders_pixels() { + let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + fb.draw_label(4, 4, "AB", 5); + let lit: usize = fb.pixels.iter().filter(|&&p| p == 
5).count(); + assert!(lit > 10, "A+B glyphs should light at least 10 pixels"); + } + + #[test] + fn flyby_cache_loops_seamlessly() { + let mut frame = RenderFrame::with_capacity(16); + frame.len = 2; + frame.positions[0] = 10.0; frame.positions[1] = 10.0; + frame.positions[3] = 20.0; frame.positions[4] = 20.0; + let edges = vec![(0, 1)]; + let fb_template = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + let mut cache = FlybyCache::prerender( + &fb_template, &frame, &edges, 8, 10.0, (0.5, 2.0), 5, 2, + ); + assert_eq!(cache.len(), 8); + // Play through more than one loop — should not panic. + for _ in 0..20 { + let _ = cache.next_frame(); + } + // Cursor wraps around. + assert_eq!(cache.cursor, 20 % 8); + } + + #[test] + fn flyby_seek_nearest_finds_closest_frame() { + let mut frame = RenderFrame::with_capacity(16); + frame.len = 1; + frame.positions[0] = 32.0; frame.positions[1] = 32.0; + let fb_template = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + let mut cache = FlybyCache::prerender( + &fb_template, &frame, &[], 16, 10.0, (1.0, 1.0), 5, 2, + ); + cache.seek_nearest(32.0, 32.0); + let f = &cache.frames[cache.cursor]; + let dx = f.cam_x - 32.0; + let dy = f.cam_y - 32.0; + assert!((dx * dx + dy * dy).sqrt() < 20.0); + } + + #[test] + fn compose_neo4j_full_with_wobble_fire_labels() { + let mut fb = Framebuffer::with_tier(128, 128, PaletteTier::Full16); + let mut frame = RenderFrame::with_capacity(16); + frame.len = 2; + frame.positions[0] = 30.0; frame.positions[1] = 30.0; + frame.positions[3] = 90.0; frame.positions[4] = 90.0; + let edges = vec![(0, 1)]; + let wobble = WobbleState::new(16); + let mut fire = FireState::new(16); + fire.fire(0, 255); + let labels = vec!["NODE0", "NODE1"]; + compose_neo4j_full( + &mut fb, &frame, &edges, 1.0, (0.0, 0.0), + &wobble, &fire, &labels, 3, 1, 7, + ); + // Node 0 should be brighter (fire boost) than base color 3. + let node0_pixel = fb.pixels[30 * 128 + 30]; + assert!(node0_pixel >= 3, "node0 should have at least base color"); + // Label pixels should exist at color 7. + let label_count = fb.pixels.iter().filter(|&&p| p == 7).count(); + assert!(label_count > 0, "labels should render"); + } +} + +// ───────────────────────────────────────────────────────────────────── +// Pyramid shader — heat diffusion through the cache-aligned pyramid. +// +// The inverse Stufenpyramide IS a GPU shader pipeline: +// L1 (64²) → 4 KB → registers / L0 cache +// L2 (256²) → 64 KB → L1 data cache +// L3 (4K²) → 2 MB → L2 cache (bit) / 16 MB (byte) +// L4 (16K²) → 32 MB → L3 cache +// +// A perturbation enters at L1, diffuses at each level, then upscales +// 4× to the next. Each level physically runs in its matching CPU cache. +// The viewer sees cognition ripple through the hardware. +// ───────────────────────────────────────────────────────────────────── + +/// 3×3 box-blur diffusion: each pixel = average of itself + 8 neighbors. +/// In-place via double buffer (src → dst, then swap pointers). +/// Palette-safe: result is clamped to [0, max_palette]. 
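+///
+/// A minimal double-buffer sketch (`ignore`d doc test):
+///
+/// ```ignore
+/// let src = vec![0u8; 64 * 64];
+/// let mut dst = vec![0u8; 64 * 64];
+/// diffuse_step(&src, &mut dst, 64, 64, 15);
+/// // next step: swap the roles of src and dst and diffuse back
+/// ```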
pub fn diffuse_step(
+    src: &[u8], dst: &mut [u8],
+    width: usize, height: usize,
+    max_palette: u8,
+) {
+    for y in 0..height {
+        for x in 0..width {
+            let mut sum: u16 = 0;
+            let mut count: u16 = 0;
+            for dy in -1i32..=1 {
+                for dx in -1i32..=1 {
+                    let nx = x as i32 + dx;
+                    let ny = y as i32 + dy;
+                    if nx >= 0 && ny >= 0 && (nx as usize) < width && (ny as usize) < height {
+                        sum += src[ny as usize * width + nx as usize] as u16;
+                        count += 1;
+                    }
+                }
+            }
+            dst[y * width + x] = ((sum / count) as u8).min(max_palette);
+        }
+    }
+}
+
+/// Upscale 2× via nearest-neighbor (L_n → L_{n+1}).
+pub fn upscale_2x(src: &[u8], src_w: usize, src_h: usize) -> (Vec<u8>, usize, usize) {
+    let dst_w = src_w * 2;
+    let dst_h = src_h * 2;
+    let mut dst = vec![0u8; dst_w * dst_h];
+    for sy in 0..src_h {
+        for sx in 0..src_w {
+            let v = src[sy * src_w + sx];
+            let dy = sy * 2;
+            let dx = sx * 2;
+            dst[dy * dst_w + dx] = v;
+            dst[dy * dst_w + dx + 1] = v;
+            dst[(dy + 1) * dst_w + dx] = v;
+            dst[(dy + 1) * dst_w + dx + 1] = v;
+        }
+    }
+    (dst, dst_w, dst_h)
+}
+
+/// Four-level pyramid shader state.
+///
+/// Each level is a framebuffer at its native resolution. `tick()` runs
+/// one diffusion step at each level, then upscales L1→L2→L3→L4.
+/// Inject heat at L1 via `inject(x, y, intensity)`.
+pub struct PyramidShader {
+    /// L1: 64×64 (4 KB).
+    pub l1: Vec<u8>,
+    /// L2: 256×256 (64 KB).
+    pub l2: Vec<u8>,
+    /// L3: 1024×1024 (1 MB) — scaled down from 4K for practical display.
+    pub l3: Vec<u8>,
+    /// L4: 2048×2048 (4 MB) — the output surface.
+    pub l4: Vec<u8>,
+    /// Scratch buffer for double-buffer diffusion (same size as L4).
+    scratch: Vec<u8>,
+    /// Palette max (from tier).
+    pub palette_max: u8,
+    /// Tick counter.
+    pub tick: u64,
+}
+
+impl PyramidShader {
+    /// Allocate a zeroed four-level pyramid clamped to `palette_max`.
+    pub fn new(palette_max: u8) -> Self {
+        Self {
+            l1: vec![0u8; 64 * 64],
+            l2: vec![0u8; 256 * 256],
+            l3: vec![0u8; 1024 * 1024],
+            l4: vec![0u8; 2048 * 2048],
+            scratch: vec![0u8; 2048 * 2048],
+            palette_max,
+            tick: 0,
+        }
+    }
+
+    /// Inject heat at L1 coordinates (0..64, 0..64).
+    pub fn inject(&mut self, x: usize, y: usize, intensity: u8) {
+        if x < 64 && y < 64 {
+            self.l1[y * 64 + x] = self.l1[y * 64 + x].saturating_add(intensity).min(self.palette_max);
+        }
+    }
+
+    /// One shader tick: diffuse each level, then cascade upward.
+    ///
+    /// This IS the cognitive shader made visible. Each level physically
+    /// fits its CPU cache tier. The 4× widening at each step IS the
+    /// cache hierarchy doubling pattern.
+    pub fn tick(&mut self) {
+        // 1. Diffuse at each level independently.
+        //    L1: 64² = 4 KB → runs in registers / L0.
+        let mut scratch_l1 = vec![0u8; 64 * 64];
+        diffuse_step(&self.l1, &mut scratch_l1, 64, 64, self.palette_max);
+        self.l1.copy_from_slice(&scratch_l1);
+
+        //    L2: 256² = 64 KB → runs in L1 data cache.
+        let mut scratch_l2 = vec![0u8; 256 * 256];
+        diffuse_step(&self.l2, &mut scratch_l2, 256, 256, self.palette_max);
+        self.l2.copy_from_slice(&scratch_l2);
+
+        //    L3: 1024² = 1 MB → runs in L2 cache.
+        let mut scratch_l3 = vec![0u8; 1024 * 1024];
+        diffuse_step(&self.l3, &mut scratch_l3, 1024, 1024, self.palette_max);
+        self.l3.copy_from_slice(&scratch_l3);
+
+        // 2. Cascade: L1 upscales into L2, L2 into L3, L3 into L4.
+        //    Additive blend (saturating) so existing diffusion + upscaled signal combine.
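+        //    L1→L2 and L2→L3 are 4× edge jumps, so each needs two 2×
+        //    nearest-neighbor upscales (64² → 128² → 256²); L3→L4
+        //    (1024² → 2048²) needs only one.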
+ let (up1, _, _) = upscale_2x(&self.l1, 64, 64); // 128² + let (up1b, _, _) = upscale_2x(&up1, 128, 128); // 256² + for (dst, src) in self.l2.iter_mut().zip(up1b.iter()) { + *dst = dst.saturating_add(*src).min(self.palette_max); + } + + let (up2, _, _) = upscale_2x(&self.l2, 256, 256); // 512² + let (up2b, _, _) = upscale_2x(&up2, 512, 512); // 1024² + for (dst, src) in self.l3.iter_mut().zip(up2b.iter()) { + *dst = dst.saturating_add(*src).min(self.palette_max); + } + + let (up3, _, _) = upscale_2x(&self.l3, 1024, 1024); // 2048² + for (dst, src) in self.l4.iter_mut().zip(up3.iter()) { + *dst = dst.saturating_add(*src).min(self.palette_max); + } + + // 3. Global decay on L4 (prevents saturation). + for v in self.l4.iter_mut() { + *v = v.saturating_sub(1); + } + + self.tick += 1; + } + + /// Compose a 2×2 panel view of all four levels into a framebuffer. + /// + /// Top-left = L1 (upscaled to panel size), top-right = L2, + /// bottom-left = L3, bottom-right = L4. Each panel is `pw × ph`. + pub fn compose_quad_view(&self, fb: &mut Framebuffer) { + let pw = fb.width / 2; + let ph = fb.height / 2; + + // L1 → top-left (upscale from 64² to pw×ph) + blit_scaled(&self.l1, 64, 64, fb, 0, 0, pw, ph); + // L2 → top-right (upscale from 256² to pw×ph) + blit_scaled(&self.l2, 256, 256, fb, pw, 0, pw, ph); + // L3 → bottom-left (downscale from 1024² to pw×ph) + blit_scaled(&self.l3, 1024, 1024, fb, 0, ph, pw, ph); + // L4 → bottom-right (downscale from 2048² to pw×ph) + blit_scaled(&self.l4, 2048, 2048, fb, pw, ph, pw, ph); + + fb.dirty = (0, 0, fb.width, fb.height); + } + + /// Memory footprint across all levels. + pub fn memory_bytes(&self) -> usize { + self.l1.len() + self.l2.len() + self.l3.len() + self.l4.len() + self.scratch.len() + } +} + +/// Nearest-neighbor scale-blit from src (src_w × src_h) into a region +/// of the framebuffer at (dst_x, dst_y) with size (dst_w × dst_h). +fn blit_scaled( + src: &[u8], src_w: usize, src_h: usize, + fb: &mut Framebuffer, + dst_x: usize, dst_y: usize, + dst_w: usize, dst_h: usize, +) { + for dy in 0..dst_h { + let sy = (dy * src_h) / dst_h; + for dx in 0..dst_w { + let sx = (dx * src_w) / dst_w; + let px = dst_x + dx; + let py = dst_y + dy; + if px < fb.width && py < fb.height && sy < src_h && sx < src_w { + fb.pixels[py * fb.width + px] = src[sy * src_w + sx]; + } + } + } +} + +#[cfg(test)] +mod pyramid_tests { + use super::*; + + #[test] + fn pyramid_shader_inject_and_tick() { + let mut ps = PyramidShader::new(15); + ps.inject(32, 32, 15); + assert_eq!(ps.l1[32 * 64 + 32], 15); + ps.tick(); + // After one tick, heat should have diffused to neighbors at L1 + // and cascaded to L2/L3/L4. + assert!(ps.l1[32 * 64 + 33] > 0, "L1 should diffuse right"); + assert!(ps.l2[128 * 256 + 128] > 0, "L2 should receive cascade"); + } + + #[test] + fn pyramid_shader_decays_to_zero() { + let mut ps = PyramidShader::new(15); + ps.inject(32, 32, 15); + for _ in 0..200 { + ps.tick(); + } + let l4_max = ps.l4.iter().copied().max().unwrap_or(0); + assert_eq!(l4_max, 0, "L4 should decay to zero after enough ticks"); + } + + #[test] + fn pyramid_shader_compose_quad_view() { + let mut ps = PyramidShader::new(15); + ps.inject(32, 32, 15); + ps.tick(); + let mut fb = Framebuffer::with_tier(128, 128, PaletteTier::Full16); + ps.compose_quad_view(&mut fb); + // Top-left panel (L1 upscaled) should have nonzero pixels. 
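+        // (The slice [..64 * 128] is the first 64 rows — the whole top
+        // half of the quad view — which is where the L1 panel lives.)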
+        let tl_sum: u32 = fb.pixels[..64 * 128].iter().map(|&v| v as u32).sum();
+        assert!(tl_sum > 0, "L1 panel should show the injection");
+    }
+
+    #[test]
+    fn pyramid_shader_memory_footprint() {
+        let ps = PyramidShader::new(15);
+        // L1=4K + L2=64K + L3=1M + L4=4M + scratch=4M ≈ 9.07 MB
+        assert!(ps.memory_bytes() > 5_000_000);
+        assert!(ps.memory_bytes() < 20_000_000);
+    }
+
+    #[test]
+    fn upscale_2x_doubles_dimensions() {
+        let src = vec![5u8; 8 * 8];
+        let (dst, w, h) = upscale_2x(&src, 8, 8);
+        assert_eq!(w, 16);
+        assert_eq!(h, 16);
+        assert!(dst.iter().all(|&v| v == 5));
+    }
+
+    #[test]
+    fn diffuse_step_smooths_spike() {
+        let mut src = vec![0u8; 16 * 16];
+        src[8 * 16 + 8] = 15; // single hot pixel
+        let mut dst = vec![0u8; 16 * 16];
+        diffuse_step(&src, &mut dst, 16, 16, 15);
+        // Center should have decreased (averaged with zero neighbors).
+        assert!(dst[8 * 16 + 8] < 15);
+        // At least one neighbor should be nonzero.
+        let neighbor_sum: u16 = [
+            dst[7 * 16 + 8], dst[9 * 16 + 8],
+            dst[8 * 16 + 7], dst[8 * 16 + 9],
+        ].iter().map(|&v| v as u16).sum();
+        assert!(neighbor_sum > 0, "diffusion should spread to neighbors");
+    }
+}
diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs
index 3d41d5f8..f7106c72 100644
--- a/src/hpc/mod.rs
+++ b/src/hpc/mod.rs
@@ -227,6 +227,8 @@ pub mod jitson;
 pub mod jitson_cranelift;
 pub mod ocr_simd;
 pub mod ocr_felt;
+pub mod renderer;
+pub mod framebuffer;
 /// Audio primitives: MDCT, band energies, PVQ, AudioFrame codec.
 /// Transcoded from Opus CELT for the HHTL cascade → waveform pipeline.
 pub mod audio;
diff --git a/src/hpc/ocr_felt.rs b/src/hpc/ocr_felt.rs
index 72e984d7..1742da6b 100644
--- a/src/hpc/ocr_felt.rs
+++ b/src/hpc/ocr_felt.rs
@@ -23,10 +23,12 @@ const SKEW_FLOOR: f64 = EULER_GAMMA / (EULER_GAMMA + 1.0);
 /// A glyph's felt identity: 17 dimensions capturing shape qualia.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct GlyphBase17 {
+    /// 17 i16 dimensions encoding the glyph's shape.
     pub dims: [i16; 17],
 }

 impl GlyphBase17 {
+    /// All-zero glyph (used as a sentinel / default).
     pub const ZERO: Self = Self { dims: [0i16; 17] };

     /// Project a binary glyph patch to 17D via golden-step folding.
@@ -67,6 +69,7 @@ impl GlyphBase17 {
 /// Character codebook: 256 entries mapping u8 → (char, GlyphBase17).
 pub struct CharCodebook {
+    /// 256 codebook slots; each holds a character and its Base17 fingerprint.
     pub entries: [(char, GlyphBase17); 256],
 }

@@ -268,8 +271,11 @@ pub fn fast_skew_check(bin: &BinaryImage) -> SkewResult {
 /// Skew detection result.
 #[derive(Debug, Clone, Copy)]
 pub struct SkewResult {
+    /// Detected skew angle in radians.
     pub angle: f32,
+    /// Confidence in the result (0.0 - 1.0).
     pub confidence: f32,
+    /// Whether a full search was performed (vs. fast path skipped).
     pub searched: bool,
 }
diff --git a/src/hpc/ocr_simd.rs b/src/hpc/ocr_simd.rs
index a7b87224..753b6967 100644
--- a/src/hpc/ocr_simd.rs
+++ b/src/hpc/ocr_simd.rs
@@ -17,8 +17,11 @@ use crate::simd::{F32x16, U8x64};
 /// Grayscale image as flat row-major `&[u8]`.
 /// Width × Height pixels, one byte per pixel (0=black, 255=white).
 pub struct GrayImage<'a> {
+    /// Pixel bytes, row-major.
     pub data: &'a [u8],
+    /// Image width in pixels.
     pub width: usize,
+    /// Image height in pixels.
     pub height: usize,
 }

@@ -26,8 +29,11 @@
 /// Each byte stores 8 pixels (bit-packed, MSB = leftmost).
 #[derive(Debug)]
 pub struct BinaryImage {
+    /// Bit-packed pixels (8 pixels per byte, MSB = leftmost).
     pub bits: Vec<u8>,
+    /// Image width in pixels.
    pub width: usize,
+    /// Image height in pixels.
     pub height: usize,
 }
diff --git a/src/hpc/renderer.rs b/src/hpc/renderer.rs
new file mode 100644
index 00000000..242560ff
--- /dev/null
+++ b/src/hpc/renderer.rs
@@ -0,0 +1,953 @@
+//! SIMD-accelerated double-buffer renderer for SPO graph visualization.
+//!
+//! This is the hardware-acceleration mothership for q2 cockpit / Palantir
+//! Gotham / Neo4j-style visual rendering. Per-tier dispatch via the
+//! `crate::simd` polyfill — AVX-512 / AVX2 / AMX / NEON / scalar fallback,
+//! all transparent to the consumer. Same pattern as `hpc::vsa`.
+//!
+//! # Architecture
+//!
+//! ```text
+//! front: LazyLock<RwLock<RenderFrame>> ← readers (REST/SSE) read here
+//! back:  LazyLock<RwLock<RenderFrame>> ← shader cycle writes here
+//!
+//! tick(dt):
+//!   1. integrate forces into back-buffer (F32x16 mul_add fused multiply-add)
+//!   2. atomic swap front↔back via AtomicUsize index
+//!   3. readers pick up new frame on next .read()
+//! ```
+//!
+//! # SIMD dispatch
+//!
+//! All hot-path math (force accumulation, position integration, fingerprint
+//! similarity) uses `crate::simd::{F32x16, F64x8, U8x64}` which compile-time
+//! routes to:
+//!
+//! | Tier             | F32 lanes | FMA path           |
+//! |------------------|-----------|--------------------|
+//! | x86 AVX-512      | 16        | `_mm512_fmadd_ps`  |
+//! | x86 AVX2         | 8         | `_mm256_fmadd_ps`  |
+//! | x86 AMX          | 16+tile   | `_tile_dpbf16ps`   |
+//! | aarch64 NEON     | 4         | `vfmaq_f32`        |
+//! | scalar fallback  | 16 (loop) | `f32::mul_add`     |
+//!
+//! Consumer writes `crate::simd::F32x16`. The polyfill picks the path.
+//!
+//! # Frame layout (SoA, 64-byte aligned)
+//!
+//! - `positions: Vec<f32>` — flat x0,y0,z0,x1,y1,z1,… (3·N floats)
+//! - `velocities: Vec<f32>` — same shape, integrated each tick
+//! - `charges: Vec<f32>` — repulsion strength per node (Coulomb-like)
+//! - `fingerprints: Vec<u64>` — VSA_WORDS·N (16384-bit per node)
+//!
+//! All capacities are padded to a multiple of 16 floats (≥
+//! `PREFERRED_F32_LANES`) so `F32x16` passes never hit a scalar tail.
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+use std::sync::{LazyLock, RwLock};
+
+use crate::hpc::vsa::VSA_WORDS;
+use crate::simd::{F32x16, PREFERRED_F32_LANES};
+
+/// Number of f32 components per node position (3D = x,y,z).
+pub const POSITION_DIMS: usize = 3;
+
+/// Round `n` up to the nearest multiple of `lanes` so SIMD passes never
+/// hit a scalar tail. Always returns ≥ `n`.
+#[inline]
+pub const fn pad_to_lanes(n: usize, lanes: usize) -> usize {
+    (n + lanes - 1) / lanes * lanes
+}
+
+/// One frame of render state — Structure-of-Arrays, 64-byte aligned.
+///
+/// Allocated capacity is padded so every component buffer is a multiple
+/// of `PREFERRED_F32_LANES` floats / `VSA_WORDS` u64. The active node
+/// count is tracked in `len`; trailing slots are zero-padded and ignored
+/// by the renderer but still SIMD-aligned for the loop bound.
+#[derive(Debug, Clone)]
+pub struct RenderFrame {
+    /// Active node count (≤ capacity).
+    pub len: usize,
+    /// Padded capacity (multiple of PREFERRED_F32_LANES).
+    pub capacity: usize,
+    /// Flat 3D positions: x0,y0,z0,x1,y1,z1,… length = 3·capacity.
+    pub positions: Vec<f32>,
+    /// Flat 3D velocities, same shape as positions.
+    pub velocities: Vec<f32>,
+    /// Per-node repulsion charge (length = capacity).
+    pub charges: Vec<f32>,
+    /// Per-node VSA fingerprint (length = VSA_WORDS·capacity).
+    pub fingerprints: Vec<u64>,
+    /// Logical tick number when this frame was last written.
+
+impl RenderFrame {
+    /// Allocate an empty frame with capacity for `n` nodes (rounded up
+    /// to the SIMD chunk width).
+    pub fn with_capacity(n: usize) -> Self {
+        // Pad to a multiple of at least 16 so `as_chunks_mut::<16>()` in the
+        // integrate hot path never leaves a remainder, even on tiers where
+        // PREFERRED_F32_LANES is narrower than 16.
+        let lanes = if PREFERRED_F32_LANES > 16 { PREFERRED_F32_LANES } else { 16 };
+        let capacity = pad_to_lanes(n, lanes);
+        Self {
+            len: 0,
+            capacity,
+            positions: vec![0.0; POSITION_DIMS * capacity],
+            velocities: vec![0.0; POSITION_DIMS * capacity],
+            charges: vec![0.0; capacity],
+            fingerprints: vec![0u64; VSA_WORDS * capacity],
+            tick: 0,
+        }
+    }
+
+    /// Total bytes resident for this frame (debug / health).
+    pub fn byte_footprint(&self) -> usize {
+        self.positions.len() * 4
+            + self.velocities.len() * 4
+            + self.charges.len() * 4
+            + self.fingerprints.len() * 8
+    }
+}
+
+impl Default for RenderFrame {
+    fn default() -> Self {
+        Self::with_capacity(0)
+    }
+}
+
+/// Double-buffered renderer with atomic front/back swap.
+///
+/// Two pre-allocated `RenderFrame`s live in `frames[0]` / `frames[1]`.
+/// `front_idx` (0 or 1) names the frame readers see; the back frame
+/// is `1 - front_idx`. `swap()` flips the index — atomic, no allocation.
+///
+/// Readers acquire a read lock on the FRONT frame; the shader cycle
+/// acquires a write lock on the BACK frame. They never contend.
+pub struct Renderer {
+    /// Two pre-allocated frames (front + back).
+    pub frames: [RwLock<RenderFrame>; 2],
+    /// Index of the frame currently visible to readers.
+    front_idx: AtomicUsize,
+    /// Monotonic tick counter.
+    tick_count: AtomicU64,
+}
+
+impl Renderer {
+    /// Allocate a renderer with capacity for `n` nodes per frame.
+    pub fn with_capacity(n: usize) -> Self {
+        Self {
+            frames: [
+                RwLock::new(RenderFrame::with_capacity(n)),
+                RwLock::new(RenderFrame::with_capacity(n)),
+            ],
+            front_idx: AtomicUsize::new(0),
+            tick_count: AtomicU64::new(0),
+        }
+    }
+
+    /// Index of the currently-front frame (0 or 1).
+    #[inline]
+    pub fn front_index(&self) -> usize {
+        self.front_idx.load(Ordering::Acquire)
+    }
+
+    /// Index of the currently-back frame (1 - front_idx).
+    #[inline]
+    pub fn back_index(&self) -> usize {
+        1 - self.front_index()
+    }
+
+    /// Read-lock the front frame (for REST / SSE consumers).
+    pub fn read_front(&self) -> std::sync::RwLockReadGuard<'_, RenderFrame> {
+        self.frames[self.front_index()].read().expect("front lock poisoned")
+    }
+
+    /// Write-lock the back frame (for the shader cycle to mutate).
+    pub fn write_back(&self) -> std::sync::RwLockWriteGuard<'_, RenderFrame> {
+        self.frames[self.back_index()].write().expect("back lock poisoned")
+    }
+
+    /// Atomically swap front and back. Readers acquired BEFORE the swap
+    /// keep observing the old front; subsequent readers see the new front.
+    pub fn swap(&self) {
+        // XOR-flip via fetch_xor — single atomic write.
+        self.front_idx.fetch_xor(1, Ordering::AcqRel);
+    }
+
+    /// Current tick count (monotonically increasing across `tick()` calls).
+    #[inline]
+    pub fn tick_count(&self) -> u64 {
+        self.tick_count.load(Ordering::Acquire)
+    }
+
+    /// Advance physics by `dt` seconds and swap buffers.
+    ///
+    /// Hot path: SIMD-FMA velocity integration over the BACK frame, then
+    /// atomic swap. Friction `damping ∈ [0,1]` is applied per axis.
+    pub fn tick(&self, dt: f32, damping: f32) {
+        {
+            let mut back = self.write_back();
+            let RenderFrame { positions, velocities, tick, .. } = &mut *back;
+            integrate_simd(positions, velocities, dt, damping);
+            *tick = self.tick_count.load(Ordering::Acquire) + 1;
+        }
+        self.swap();
+        self.tick_count.fetch_add(1, Ordering::AcqRel);
+    }
+}
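The intended concurrency split, as a hedged usage sketch (assumes an `Arc`-shared `Renderer`; the loop shape and names are illustrative, not from the patch):

```rust
use std::sync::Arc;

// Illustration only: one writer thread ticks the back buffer while any
// number of REST/SSE handlers read the front. tick() ends with swap(),
// so a read_front() taken after tick() observes the frame just written.
fn render_thread(r: Arc<Renderer>) {
    loop {
        r.tick(1.0 / 60.0, 0.95);   // integrate BACK, then atomic swap
        let front = r.read_front(); // cheap read lock on the new FRONT
        let _n = front.len;         // e.g. serialize front.positions[..3 * front.len]
        drop(front);                // release the guard before the next tick
    }
}
```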
+
+impl Default for Renderer {
+    fn default() -> Self {
+        Self::with_capacity(0)
+    }
+}
+
+/// Process-global default renderer — single LazyLock-initialized instance.
+///
+/// Capacity is bootstrapped at 4096 nodes (rounded up to PREFERRED_F32_LANES).
+/// Consumers wanting a different capacity should construct their own
+/// `Renderer::with_capacity(...)` in their binary, not touch this static.
+pub static GLOBAL_RENDERER: LazyLock<Renderer> =
+    LazyLock::new(|| Renderer::with_capacity(4096));
+
+// ─────────────────────────────────────────────────────────────────────
+// SIMD hot path — integrate_simd dispatches via crate::simd::F32x16
+// which compile-time routes to AVX-512 / AVX2 / AMX / NEON / scalar.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Integrate positions += velocities·dt then apply damping, in SIMD chunks.
+///
+/// Uses `slice::as_chunks_mut::<16>()` for SIMD slicing — the array-window
+/// pattern documented in `crate::simd`. Both buffers are guaranteed to be
+/// multiples of the 16-float chunk (enforced by `RenderFrame::with_capacity`),
+/// so the remainder slice is empty and there's no scalar tail.
+///
+/// One pass = one fused multiply-add per lane:
+///   `position = velocity * dt + position`
+///   `velocity = velocity * damping`
+#[inline]
+pub fn integrate_simd(positions: &mut [f32], velocities: &mut [f32], dt: f32, damping: f32) {
+    debug_assert_eq!(positions.len(), velocities.len());
+    debug_assert_eq!(positions.len() % PREFERRED_F32_LANES, 0);
+
+    let dt_v = cached_splat(dt);
+    let damping_v = F32x16::splat(damping);
+
+    // SIMD slicing via stable as_chunks_mut::<16>(). The remainder is
+    // empty by construction (capacity is padded to the 16-float chunk).
+    let (p_chunks, p_tail) = positions.as_chunks_mut::<16>();
+    let (v_chunks, v_tail) = velocities.as_chunks_mut::<16>();
+    debug_assert!(p_tail.is_empty() && v_tail.is_empty());
+
+    for (p, v) in p_chunks.iter_mut().zip(v_chunks.iter_mut()) {
+        let pv = F32x16::from_array(*p);
+        let vv = F32x16::from_array(*v);
+        // FMA: position = velocity * dt + position
+        let p_new = vv.mul_add(dt_v, pv);
+        // Damping: velocity *= damping (one mul, no FMA needed)
+        let v_new = vv * damping_v;
+        p_new.copy_to_slice(p);
+        v_new.copy_to_slice(v);
+    }
+}
+
+/// Apply a uniform per-axis force to every node's velocity (e.g. gravity).
+/// `force` is `[fx, fy, fz]` accelerated by `dt`.
+///
+/// Per element: `velocity[axis] = force[axis] * dt + velocity[axis]`.
+///
+/// Scalar for now: 16 isn't a multiple of 3, so a single F32x16 pass would
+/// need a pre-tiled [fx,fy,fz,…] force pattern with a 48-float period. A
+/// future optimisation can reshape velocities to xs/ys/zs SoA and run one
+/// SIMD-FMA pass per axis; this initial cut keeps the code simple and correct.
+#[inline]
+pub fn apply_uniform_force(velocities: &mut [f32], force: [f32; 3], dt: f32) {
+    debug_assert_eq!(velocities.len() % PREFERRED_F32_LANES, 0);
+    debug_assert_eq!(velocities.len() % POSITION_DIMS, 0);
+
+    let n_nodes = velocities.len() / POSITION_DIMS;
+    for axis in 0..POSITION_DIMS {
+        for n in 0..n_nodes {
+            let idx = n * POSITION_DIMS + axis;
+            velocities[idx] = force[axis].mul_add(dt, velocities[idx]);
+        }
+    }
+}
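A small sketch showing the intended use of `apply_uniform_force` as a gravity pass before integration; both functions are from this file, but the wrapper itself is hypothetical:

```rust
// Illustration only: pull every node along -Y, then integrate.
fn gravity_tick(r: &Renderer, dt: f32) {
    {
        let mut back = r.write_back();
        let RenderFrame { velocities, .. } = &mut *back;
        apply_uniform_force(velocities, [0.0, -9.8, 0.0], dt);
    } // drop the write guard before tick() re-acquires it
    r.tick(dt, 0.99);
}
```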
+
+/// Per-tier SIMD lane width report — for tests / diagnostics.
+#[inline]
+pub const fn active_lane_width() -> usize {
+    PREFERRED_F32_LANES
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn pad_to_lanes_rounds_up() {
+        assert_eq!(pad_to_lanes(0, 16), 0);
+        assert_eq!(pad_to_lanes(1, 16), 16);
+        assert_eq!(pad_to_lanes(15, 16), 16);
+        assert_eq!(pad_to_lanes(16, 16), 16);
+        assert_eq!(pad_to_lanes(17, 16), 32);
+        assert_eq!(pad_to_lanes(100, 16), 112);
+    }
+
+    #[test]
+    fn frame_capacity_is_simd_aligned() {
+        let f = RenderFrame::with_capacity(100);
+        assert_eq!(f.capacity % PREFERRED_F32_LANES, 0);
+        assert_eq!(f.positions.len() % PREFERRED_F32_LANES, 0);
+        assert_eq!(f.velocities.len() % PREFERRED_F32_LANES, 0);
+        assert_eq!(f.charges.len() % PREFERRED_F32_LANES, 0);
+        // fingerprints: VSA_WORDS·capacity, VSA_WORDS = 256
+        assert_eq!(f.fingerprints.len() / VSA_WORDS, f.capacity);
+    }
+
+    #[test]
+    fn frame_byte_footprint_matches_capacity() {
+        let f = RenderFrame::with_capacity(16);
+        // 16 nodes × (3·4 pos + 3·4 vel + 4 charge + 256·8 fp) = 16 · (12+12+4+2048) = 16 · 2076
+        assert_eq!(f.byte_footprint(), f.capacity * (12 + 12 + 4 + 256 * 8));
+    }
+
+    #[test]
+    fn renderer_swap_flips_index() {
+        let r = Renderer::with_capacity(16);
+        assert_eq!(r.front_index(), 0);
+        assert_eq!(r.back_index(), 1);
+        r.swap();
+        assert_eq!(r.front_index(), 1);
+        assert_eq!(r.back_index(), 0);
+        r.swap();
+        assert_eq!(r.front_index(), 0);
+    }
+
+    #[test]
+    fn integrate_simd_applies_velocity_and_damping() {
+        let mut positions = vec![0.0f32; 16];
+        let mut velocities = vec![1.0f32; 16];
+        integrate_simd(&mut positions, &mut velocities, 0.5, 0.9);
+        // position += v·dt = 0 + 1·0.5 = 0.5
+        for &p in &positions {
+            assert!((p - 0.5).abs() < 1e-6, "p = {p}");
+        }
+        // velocity *= damping = 1 · 0.9 = 0.9
+        for &v in &velocities {
+            assert!((v - 0.9).abs() < 1e-6, "v = {v}");
+        }
+    }
+
+    #[test]
+    fn integrate_simd_handles_multi_chunk() {
+        let mut positions = vec![0.0f32; 64];
+        let mut velocities = vec![2.0f32; 64];
+        integrate_simd(&mut positions, &mut velocities, 0.25, 1.0);
+        for &p in &positions {
+            assert!((p - 0.5).abs() < 1e-6);
+        }
+        for &v in &velocities {
+            assert!((v - 2.0).abs() < 1e-6);
+        }
+    }
+
+    #[test]
+    fn renderer_tick_advances_count_and_swaps() {
+        let r = Renderer::with_capacity(16);
+        let initial_front = r.front_index();
+        let initial_tick = r.tick_count();
+        r.tick(0.016, 0.99); // 60 fps, light damping
+        assert_eq!(r.tick_count(), initial_tick + 1);
+        assert_eq!(r.front_index(), 1 - initial_front);
+    }
+
+    #[test]
+    fn renderer_60_ticks_keep_simd_alignment() {
+        let r = Renderer::with_capacity(1024);
+        for _ in 0..60 {
+            r.tick(1.0 / 60.0, 0.95);
+        }
+        assert_eq!(r.tick_count(), 60);
+        let front = r.read_front();
+        assert_eq!(front.positions.len() % PREFERRED_F32_LANES, 0);
+        assert_eq!(front.velocities.len() % PREFERRED_F32_LANES, 0);
+    }
+
+    #[test]
+    fn apply_uniform_force_accelerates_velocity() {
+        // 16 nodes × 3 axes = 48 floats. 48 = 3×16 → a multiple of 16.
+        let mut velocities = vec![0.0f32; 48];
+        apply_uniform_force(&mut velocities, [1.0, 2.0, 3.0], 0.5);
+        for n in 0..16 {
+            assert!((velocities[n * 3] - 0.5).abs() < 1e-6);     // X: 1·0.5
+            assert!((velocities[n * 3 + 1] - 1.0).abs() < 1e-6); // Y: 2·0.5
+            assert!((velocities[n * 3 + 2] - 1.5).abs() < 1e-6); // Z: 3·0.5
+        }
+    }
+
+    #[test]
+    fn active_lane_width_is_simd_aligned_constant() {
+        let w = active_lane_width();
+        assert!(w == 4 || w == 8 || w == 16);
+        // VSA_DIMS (16384) is divisible by every active tier's lane width.
+        assert_eq!(crate::hpc::vsa::VSA_DIMS % w, 0);
+    }
+
+    #[test]
+    fn global_renderer_starts_at_tick_zero() {
+        let _ = &*GLOBAL_RENDERER;
+        // First-touch: tick count is 0; capacity is at least 4096
+        // (could be greater if PREFERRED_F32_LANES > 16 at some future tier).
+        assert_eq!(GLOBAL_RENDERER.tick_count(), 0);
+        let f = GLOBAL_RENDERER.read_front();
+        assert!(f.capacity >= 4096);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// LazyLock-cached splat constants for the common tick rates.
+//
+// `F32x16::splat(dt)` is one CPU instruction at AVX-512 (`_mm512_set1_ps`)
+// but the renderer ticks at fixed rates 99% of the time, so we cache the
+// three canonical splat values: 60 fps / 30 fps / 15 fps.
+//
+// `cached_splat(dt)` returns the cached vector when `dt` matches one of
+// the canonical rates, falling back to a fresh splat otherwise. Tolerance
+// ±2 µs absorbs floating-point jitter without bypassing the cache.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Tick budget for 60 fps in seconds (1/60).
+pub const DT_60: f32 = 1.0 / 60.0;
+/// Tick budget for 30 fps in seconds (1/30).
+pub const DT_30: f32 = 1.0 / 30.0;
+/// Tick budget for 15 fps in seconds (1/15).
+pub const DT_15: f32 = 1.0 / 15.0;
+
+static SPLAT_60: LazyLock<F32x16> = LazyLock::new(|| F32x16::splat(DT_60));
+static SPLAT_30: LazyLock<F32x16> = LazyLock::new(|| F32x16::splat(DT_30));
+static SPLAT_15: LazyLock<F32x16> = LazyLock::new(|| F32x16::splat(DT_15));
+
+/// Splat `dt` into an `F32x16`, returning a cached value for the canonical
+/// rates (60 / 30 / 15 fps). Falls back to a fresh splat for arbitrary `dt`.
+#[inline]
+pub fn cached_splat(dt: f32) -> F32x16 {
+    const TOL: f32 = 2e-6;
+    if (dt - DT_60).abs() < TOL { *SPLAT_60 }
+    else if (dt - DT_30).abs() < TOL { *SPLAT_30 }
+    else if (dt - DT_15).abs() < TOL { *SPLAT_15 }
+    else { F32x16::splat(dt) }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Viewport + foveated rendering — only spend SIMD cycles on what's seen.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Camera + view-volume parameters for foveated rendering.
+///
+/// Nodes are classified by distance to `center` into four priority bands.
+/// The renderer ticks foveal nodes every frame, peripheral every other,
+/// distant every fourth, and skips off-screen entirely. Net effect:
+/// integration work scales with what the camera is actually looking at,
+/// not the graph total (the classification pass itself stays O(N)).
+#[derive(Debug, Clone, Copy)]
+pub struct Viewport {
+    /// Camera focus point in world coordinates.
+    pub center: [f32; 3],
+    /// Foveal radius — full-detail every-tick zone.
+    pub foveal_radius: f32,
+    /// Peripheral radius — half-rate update zone (foveal_radius < r ≤ peripheral).
+    pub peripheral_radius: f32,
+    /// Cull radius — beyond this, skip entirely (not just slow).
+    pub cull_radius: f32,
+}
+
+impl Viewport {
+    /// Default: 4.0 unit foveal, 16.0 peripheral, 64.0 cull.
+    pub fn default_at(center: [f32; 3]) -> Self {
+        Self { center, foveal_radius: 4.0, peripheral_radius: 16.0, cull_radius: 64.0 }
+    }
+}
+
+/// Update priority for one node — controls how often it's integrated.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(u8)]
+pub enum UpdatePriority {
+    /// Every tick (foveal zone).
+    Foveal = 0,
+    /// Every 2nd tick (peripheral zone).
+    Peripheral = 1,
+    /// Every 4th tick (distant but in-frustum).
+    Distant = 2,
+    /// Skip (out of cull radius).
+    OffScreen = 3,
+}
+
+impl UpdatePriority {
+    /// Stride between updates for this priority.
+    #[inline]
+    pub fn tick_stride(self) -> u64 {
+        match self {
+            Self::Foveal => 1,
+            Self::Peripheral => 2,
+            Self::Distant => 4,
+            Self::OffScreen => u64::MAX,
+        }
+    }
+
+    /// Should this node be updated on the given tick?
+    #[inline]
+    pub fn should_update(self, tick: u64) -> bool {
+        let stride = self.tick_stride();
+        if stride == u64::MAX { false } else { tick % stride == 0 }
+    }
+}
+
+/// Classify each node by distance to viewport center.
+///
+/// Returns a `Vec<UpdatePriority>` of length `len` (active nodes only).
+/// Trailing padded slots are not classified — they're never integrated
+/// regardless.
+pub fn classify_priorities(positions: &[f32], len: usize, vp: &Viewport) -> Vec<UpdatePriority> {
+    let mut out = Vec::with_capacity(len);
+    let f2 = vp.foveal_radius * vp.foveal_radius;
+    let p2 = vp.peripheral_radius * vp.peripheral_radius;
+    let c2 = vp.cull_radius * vp.cull_radius;
+    for i in 0..len {
+        let dx = positions[i * POSITION_DIMS] - vp.center[0];
+        let dy = positions[i * POSITION_DIMS + 1] - vp.center[1];
+        let dz = positions[i * POSITION_DIMS + 2] - vp.center[2];
+        let d2 = dx * dx + dy * dy + dz * dz;
+        out.push(
+            if d2 <= f2 { UpdatePriority::Foveal }
+            else if d2 <= p2 { UpdatePriority::Peripheral }
+            else if d2 <= c2 { UpdatePriority::Distant }
+            else { UpdatePriority::OffScreen }
+        );
+    }
+    out
+}
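A quick sketch of the classification step on its own (the `foveal_share` helper is hypothetical, not in the patch), useful for sizing how much work the foveated pass below will skip:

```rust
// Illustration only: fraction of active nodes in the every-tick foveal band.
fn foveal_share(frame: &RenderFrame, vp: &Viewport) -> f32 {
    let prio = classify_priorities(&frame.positions, frame.len, vp);
    let foveal = prio.iter().filter(|&&p| p == UpdatePriority::Foveal).count();
    foveal as f32 / frame.len.max(1) as f32
}
```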
+
+/// Foveated integrate — apply the SIMD-FMA update only to the 16-float
+/// chunks where at least one covered node's
+/// `priorities[node].should_update(tick)` is true.
+///
+/// Operates in 16-element chunks (the SIMD chunk size), so the smallest
+/// granularity of skipping is 16/POSITION_DIMS ≈ 5 nodes. For graphs
+/// where most nodes share a priority band this is near-optimal; for
+/// random-priority graphs, foveated savings drop toward zero (worst case
+/// is the same cost as `integrate_simd`).
+pub fn integrate_foveated(
+    positions: &mut [f32],
+    velocities: &mut [f32],
+    priorities: &[UpdatePriority],
+    tick: u64,
+    dt: f32,
+    damping: f32,
+) {
+    debug_assert_eq!(positions.len(), velocities.len());
+    debug_assert_eq!(positions.len() % PREFERRED_F32_LANES, 0);
+
+    let dt_v = cached_splat(dt);
+    let damping_v = F32x16::splat(damping);
+
+    // Each 16-float chunk covers ceil(16/3) = 6 nodes (with overlap on
+    // the chunk boundary). We skip a chunk only if EVERY node mapping to
+    // it is OffScreen or stride-skipped this tick; otherwise we update
+    // the whole chunk (cheap — one FMA — vs. branching cost).
+    let nodes_per_chunk = 16 / POSITION_DIMS + 1; // 5 + 1 = 6: covers the straddled node
+
+    let (p_chunks, _) = positions.as_chunks_mut::<16>();
+    let (v_chunks, _) = velocities.as_chunks_mut::<16>();
+
+    for (chunk_idx, (p, v)) in p_chunks.iter_mut().zip(v_chunks.iter_mut()).enumerate() {
+        // Determine which active nodes fall into this chunk.
+        let node_lo = (chunk_idx * 16) / POSITION_DIMS;
+        let node_hi = (node_lo + nodes_per_chunk).min(priorities.len());
+
+        // Skip only if every node in the band agrees to skip THIS tick.
+        let all_skip = (node_lo..node_hi)
+            .all(|n| !priorities[n].should_update(tick));
+        if all_skip { continue; }
+
+        let pv = F32x16::from_array(*p);
+        let vv = F32x16::from_array(*v);
+        let p_new = vv.mul_add(dt_v, pv);
+        let v_new = vv * damping_v;
+        p_new.copy_to_slice(p);
+        v_new.copy_to_slice(v);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// FPS controller — adaptive frame rate under load.
+//
+// Targets 60 ticks/s by default. If the last tick took longer than the
+// budget allows, the controller drops to 30 fps; if even 30 fps overruns,
+// it drops to 15 fps. When ticks consistently come in under-budget, it
+// climbs back up. This keeps the cockpit responsive on big graphs without
+// manual rate selection.
+// ─────────────────────────────────────────────────────────────────────
+
+use std::time::Instant;
+
+/// Adaptive FPS targeting 60 → 30 → 15 with hysteresis.
+pub struct FpsController {
+    /// Active target rate in Hz (60, 30, or 15).
+    target_hz: AtomicU32,
+    /// Last tick wall-clock (ns since renderer construction).
+    last_tick_ns: AtomicU64,
+    /// Construction instant — origin for last_tick_ns.
+    origin: Instant,
+    /// Rolling mean tick duration in ns (EWMA).
+    avg_tick_ns: AtomicU64,
+    /// Consecutive under-budget ticks (used to climb back up).
+    under_budget_streak: AtomicU32,
+}
+
+use std::sync::atomic::AtomicU32;
+
+impl FpsController {
+    /// Construct with `target_hz` initial rate (clamped to {15, 30, 60}).
+    pub fn new(target_hz: u32) -> Self {
+        let clamped = match target_hz {
+            x if x >= 60 => 60,
+            x if x >= 30 => 30,
+            _ => 15,
+        };
+        Self {
+            target_hz: AtomicU32::new(clamped),
+            last_tick_ns: AtomicU64::new(0),
+            origin: Instant::now(),
+            avg_tick_ns: AtomicU64::new(0),
+            under_budget_streak: AtomicU32::new(0),
+        }
+    }
+
+    /// Currently active target rate in Hz (60, 30, or 15).
+    #[inline]
+    pub fn target_hz(&self) -> u32 {
+        self.target_hz.load(Ordering::Acquire)
+    }
+
+    /// `dt` in seconds for the active target rate.
+    #[inline]
+    pub fn dt(&self) -> f32 {
+        match self.target_hz() {
+            60 => DT_60,
+            30 => DT_30,
+            _ => DT_15,
+        }
+    }
+
+    /// Rolling mean tick duration in nanoseconds.
+    #[inline]
+    pub fn avg_tick_ns(&self) -> u64 {
+        self.avg_tick_ns.load(Ordering::Acquire)
+    }
+
+    /// Tick budget for the current target rate, in nanoseconds.
+    #[inline]
+    pub fn budget_ns(&self) -> u64 {
+        1_000_000_000u64 / self.target_hz() as u64
+    }
+
+    /// Record the duration of one tick and adapt the rate if needed.
+    ///
+    /// EWMA with α = 1/8 keeps the average responsive without flapping.
+    /// Step down: any over-budget tick → halve the target (60 → 30 → 15).
+    /// Step up: 60 consecutive under-budget ticks (~1 s at the current rate)
+    /// → double the target.
+    pub fn record_tick(&self, duration_ns: u64) {
+        // EWMA update (α = 1/8): avg = avg + (sample - avg)/8
+        let prev = self.avg_tick_ns.load(Ordering::Acquire);
+        let next = if prev == 0 {
+            duration_ns
+        } else {
+            prev + (duration_ns.saturating_sub(prev) / 8)
+                 - (prev.saturating_sub(duration_ns) / 8)
+        };
+        self.avg_tick_ns.store(next, Ordering::Release);
+
+        let budget = self.budget_ns();
+        let cur = self.target_hz();
+        if duration_ns > budget {
+            self.under_budget_streak.store(0, Ordering::Release);
+            // Step down 60 → 30 → 15. Don't go below 15.
+            let new_hz = match cur { 60 => 30, 30 => 15, _ => 15 };
+            if new_hz != cur {
+                self.target_hz.store(new_hz, Ordering::Release);
+            }
+        } else {
+            let streak = self.under_budget_streak.fetch_add(1, Ordering::AcqRel) + 1;
+            if streak >= 60 {
+                let new_hz = match cur { 15 => 30, 30 => 60, _ => 60 };
+                if new_hz != cur {
+                    self.target_hz.store(new_hz, Ordering::Release);
+                }
+                self.under_budget_streak.store(0, Ordering::Release);
+            }
+        }
+
+        let elapsed = self.origin.elapsed().as_nanos() as u64;
+        self.last_tick_ns.store(elapsed, Ordering::Release);
+    }
+}
+
+impl Default for FpsController {
+    fn default() -> Self {
+        Self::new(60)
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Renderer convenience: adaptive + foveated tick wrappers.
+// ─────────────────────────────────────────────────────────────────────
+
+impl Renderer {
+    /// Tick at the rate the `FpsController` currently targets, measuring
+    /// the duration so the controller can adapt for next call.
+    ///
+    /// This is the recommended top-level entry point for cockpit servers:
+    /// just call `r.tick_adaptive(&fps_ctl, damping)` in a loop and the
+    /// rate auto-tunes between 60 / 30 / 15 fps based on observed load.
+    pub fn tick_adaptive(&self, fps: &FpsController, damping: f32) {
+        let start = std::time::Instant::now();
+        self.tick(fps.dt(), damping);
+        fps.record_tick(start.elapsed().as_nanos() as u64);
+    }
+
+    /// Foveated tick — classify by viewport, integrate only chunks where
+    /// at least one node should update this tick.
+    ///
+    /// The classification cost is O(N) (one squared-distance per node) but
+    /// is done once per tick; the SIMD integration cost drops by the share
+    /// of off-screen / sub-rate nodes. For a typical cockpit camera, foveal
+    /// nodes are ≤ 20% of the graph → 5× speedup vs full integrate.
+    pub fn tick_foveated(&self, fps: &FpsController, damping: f32, vp: &Viewport) {
+        let start = std::time::Instant::now();
+        let dt = fps.dt();
+        let tick_now = self.tick_count.load(Ordering::Acquire) + 1;
+        {
+            let mut back = self.write_back();
+            let RenderFrame { positions, velocities, len, tick, .. } = &mut *back;
+            let priorities = classify_priorities(positions, *len, vp);
+            integrate_foveated(positions, velocities, &priorities, tick_now, dt, damping);
+            *tick = tick_now;
+        }
+        self.swap();
+        self.tick_count.fetch_add(1, Ordering::AcqRel);
+        fps.record_tick(start.elapsed().as_nanos() as u64);
+    }
+}
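Putting the pieces together, a hedged sketch of the cockpit server loop the doc comments describe (the loop shape, damping value, and sleep-based pacing are assumptions, not part of the patch):

```rust
// Illustration only: adaptive + foveated ticking. The rate self-tunes
// 60 → 30 → 15 Hz under load and climbs back after ~1 s under budget.
fn cockpit_loop(r: &Renderer) {
    let fps = FpsController::default();
    let vp = Viewport::default_at([0.0, 0.0, 0.0]);
    loop {
        r.tick_foveated(&fps, 0.95, &vp);
        // Rough pacing: sleep one full budget; a real loop would subtract
        // the time the tick itself consumed.
        std::thread::sleep(std::time::Duration::from_nanos(fps.budget_ns()));
    }
}
```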
+
+#[cfg(test)]
+mod adaptive_tests {
+    use super::*;
+
+    #[test]
+    fn cached_splat_returns_canonical_for_60fps() {
+        let v = cached_splat(DT_60);
+        // The cached vector should be byte-identical to a fresh splat.
+        let fresh = F32x16::splat(DT_60);
+        // F32x16 doesn't implement PartialEq directly; compare via copy_to_slice.
+        let mut a = [0.0f32; 16];
+        let mut b = [0.0f32; 16];
+        v.copy_to_slice(&mut a);
+        fresh.copy_to_slice(&mut b);
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn cached_splat_falls_back_for_arbitrary_dt() {
+        let v = cached_splat(0.0314);
+        let mut out = [0.0f32; 16];
+        v.copy_to_slice(&mut out);
+        for x in out { assert!((x - 0.0314).abs() < 1e-6); }
+    }
+
+    #[test]
+    fn cached_splat_within_tolerance_hits_cache() {
+        // 1/60 = 0.0166666… → adding 1e-7 s (0.1 µs, well inside the ±2 µs
+        // tolerance) should still hit the cache.
+        let v = cached_splat(DT_60 + 1e-7);
+        let mut out = [0.0f32; 16];
+        v.copy_to_slice(&mut out);
+        // Cached at exactly DT_60, not the slightly-higher input.
+        assert!((out[0] - DT_60).abs() < 1e-6);
+    }
+
+    #[test]
+    fn priority_stride_progression() {
+        assert_eq!(UpdatePriority::Foveal.tick_stride(), 1);
+        assert_eq!(UpdatePriority::Peripheral.tick_stride(), 2);
+        assert_eq!(UpdatePriority::Distant.tick_stride(), 4);
+        assert_eq!(UpdatePriority::OffScreen.tick_stride(), u64::MAX);
+    }
+
+    #[test]
+    fn priority_should_update_respects_stride() {
+        assert!(UpdatePriority::Foveal.should_update(0));
+        assert!(UpdatePriority::Foveal.should_update(7));
+        assert!(UpdatePriority::Peripheral.should_update(0));
+        assert!(!UpdatePriority::Peripheral.should_update(1));
+        assert!(UpdatePriority::Peripheral.should_update(2));
+        assert!(UpdatePriority::Distant.should_update(0));
+        assert!(!UpdatePriority::Distant.should_update(1));
+        assert!(UpdatePriority::Distant.should_update(4));
+        assert!(!UpdatePriority::OffScreen.should_update(0));
+        assert!(!UpdatePriority::OffScreen.should_update(u64::MAX - 1));
+    }
+
+    #[test]
+    fn classify_priorities_assigns_zones() {
+        // 4 nodes: at center, foveal-edge, peripheral-zone, off-screen.
+        let positions = vec![
+            0.0, 0.0, 0.0,   // node 0 — at center → Foveal
+            3.0, 0.0, 0.0,   // node 1 — within foveal radius (4)
+            8.0, 0.0, 0.0,   // node 2 — within peripheral (16)
+            70.0, 0.0, 0.0,  // node 3 — beyond cull (64)
+        ];
+        let vp = Viewport::default_at([0.0, 0.0, 0.0]);
+        let p = classify_priorities(&positions, 4, &vp);
+        assert_eq!(p[0], UpdatePriority::Foveal);
+        assert_eq!(p[1], UpdatePriority::Foveal);
+        assert_eq!(p[2], UpdatePriority::Peripheral);
+        assert_eq!(p[3], UpdatePriority::OffScreen);
+    }
+
+    #[test]
+    fn integrate_foveated_skips_offscreen_chunks() {
+        // 32 floats = 2 SIMD chunks. Mark all nodes as OffScreen → no update.
+        let mut positions = vec![1.0f32; 32];
+        let mut velocities = vec![1.0f32; 32];
+        let priorities = vec![UpdatePriority::OffScreen; 12]; // covers both chunks
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 0, 0.5, 0.9);
+        for &p in &positions { assert_eq!(p, 1.0); } // unchanged
+        for &v in &velocities { assert_eq!(v, 1.0); } // unchanged
+    }
+
+    #[test]
+    fn integrate_foveated_updates_foveal_chunks() {
+        let mut positions = vec![0.0f32; 32];
+        let mut velocities = vec![1.0f32; 32];
+        let priorities = vec![UpdatePriority::Foveal; 12];
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 0, 0.5, 0.9);
+        for &p in &positions {
+            assert!((p - 0.5).abs() < 1e-6);
+        }
+        for &v in &velocities {
+            assert!((v - 0.9).abs() < 1e-6);
+        }
+    }
+
+    #[test]
+    fn integrate_foveated_respects_peripheral_stride() {
+        let mut positions = vec![0.0f32; 32];
+        let mut velocities = vec![1.0f32; 32];
+        let priorities = vec![UpdatePriority::Peripheral; 12];
+        // Tick 1 (odd) — peripheral skips
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 1, 0.5, 0.9);
+        for &p in &positions { assert_eq!(p, 0.0); }
+        // Tick 2 (even) — peripheral updates
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 2, 0.5, 0.9);
+        for &p in &positions { assert!((p - 0.5).abs() < 1e-6); }
+    }
+
+    #[test]
+    fn fps_controller_starts_at_60() {
+        let c = FpsController::default();
+        assert_eq!(c.target_hz(), 60);
+        assert_eq!(c.dt(), DT_60);
+        assert_eq!(c.budget_ns(), 16_666_666); // 1e9 / 60
+    }
+
+    #[test]
+    fn fps_controller_steps_down_on_overrun() {
+        let c = FpsController::default();
+        // Single overrun: budget_60 = 16.67ms; record 50ms tick → step down.
+        c.record_tick(50_000_000);
+        assert_eq!(c.target_hz(), 30);
+        // Another overrun at 30 (budget = 33ms): record 100ms → step to 15.
+        c.record_tick(100_000_000);
+        assert_eq!(c.target_hz(), 15);
+    }
+
+    #[test]
+    fn fps_controller_steps_up_on_sustained_under_budget() {
+        let c = FpsController::new(15);
+        // Record 60 fast ticks → climb to 30.
+        for _ in 0..60 { c.record_tick(1_000_000); } // 1ms each
+        assert_eq!(c.target_hz(), 30);
+        // Another 60 fast → climb to 60.
+        for _ in 0..60 { c.record_tick(1_000_000); }
+        assert_eq!(c.target_hz(), 60);
+    }
+
+    #[test]
+    fn fps_controller_dt_tracks_target() {
+        let c = FpsController::new(60);
+        assert_eq!(c.dt(), DT_60);
+        c.record_tick(50_000_000); // step to 30
+        assert_eq!(c.dt(), DT_30);
+        c.record_tick(50_000_000); // step to 15
+        assert_eq!(c.dt(), DT_15);
+    }
+
+    #[test]
+    fn renderer_tick_adaptive_advances_count() {
+        let r = Renderer::with_capacity(64);
+        let fps = FpsController::default();
+        r.tick_adaptive(&fps, 0.95);
+        assert_eq!(r.tick_count(), 1);
+        // FpsController should have recorded a sample.
+        assert!(fps.avg_tick_ns() > 0);
+    }
+
+    #[test]
+    fn renderer_tick_foveated_advances_count_and_swaps() {
+        let r = Renderer::with_capacity(64);
+        {
+            let mut back = r.write_back();
+            back.len = 8;
+        }
+        let fps = FpsController::default();
+        let vp = Viewport::default_at([0.0, 0.0, 0.0]);
+        let initial_front = r.front_index();
+        r.tick_foveated(&fps, 0.95, &vp);
+        assert_eq!(r.tick_count(), 1);
+        assert_eq!(r.front_index(), 1 - initial_front);
+    }
+
+    #[test]
+    fn integrate_simd_array_chunks_have_no_tail() {
+        // After migration: 16384 % PREFERRED_F32_LANES == 0, so as_chunks_mut
+        // remainder must be empty.
+        let mut p = vec![0.0f32; 16_384];
+        let (_chunks, tail) = p.as_chunks_mut::<16>();
+        assert!(tail.is_empty(), "no scalar tail at 16384");
+    }
+}
diff --git a/src/hpc/vsa.rs b/src/hpc/vsa.rs
index b6b795a5..282342a0 100644
--- a/src/hpc/vsa.rs
+++ b/src/hpc/vsa.rs
@@ -1,4 +1,4 @@
-//! Vector Symbolic Architecture: 10,000-dimensional binary operations.
+//! Vector Symbolic Architecture: 16,384-dimensional binary operations.
 //!
 //! VSA is working memory. It fills (bundle), crystallizes (unbundle),
 //! empties (clean), repeats. Like breathing.
@@ -7,29 +7,34 @@
 //! - bundle: majority vote via i16 accumulator
 //! - clean: iterative similarity search against codebook
 //! - permute: cyclic shift for sequence encoding
+//!
+//! 16384 = 256 u64 words exactly — power of 2 SIMD-clean at every precision
+//! tier (FP16x32 / FP32x16 / F64x8). Matches the Binary16K / Vsa16k carrier
+//! shared with lance-graph-contract (`crystal::fingerprint`).
 
-/// VSA dimensionality: 10,000 bits.
-pub const VSA_DIMS: usize = 10_000;
+/// VSA dimensionality: 16,384 bits (Binary16K).
+pub const VSA_DIMS: usize = 16_384;
 
-/// VSA bytes: ceil(10000/8) = 1250.
-pub const VSA_BYTES: usize = 1250;
+/// VSA bytes: 16384/8 = 2048.
+pub const VSA_BYTES: usize = 2048;
 
-/// VSA u64 words: ceil(10000/64) = 157 (with 8 padding bits in last word).
-pub const VSA_WORDS: usize = 157;
+/// VSA u64 words: 16384/64 = 256 (exact, no padding).
+pub const VSA_WORDS: usize = 256;
 
-/// Number of meaningful bits in the last word: 10000 - 156*64 = 16.
+/// Number of meaningful bits in the last word: 16384 - 255*64 = 64 (full word).
 const TAIL_BITS: usize = VSA_DIMS - (VSA_WORDS - 1) * 64;
 
-/// Mask for the meaningful bits in the last word.
-const TAIL_MASK: u64 = (1u64 << TAIL_BITS) - 1;
+/// Mask for the meaningful bits in the last word: !0u64 since the format
+/// is power-of-2 aligned (every word is fully meaningful).
+const TAIL_MASK: u64 = u64::MAX;
 
-/// A 10,000-dimensional binary VSA vector.
+/// A 16,384-dimensional binary VSA vector (Binary16K).
 ///
-/// Stored as 157 u64 words (10048 bits total), with only the first 10,000
-/// bits meaningful. The upper 48 bits of the last word are always zero.
+/// Stored as 256 u64 words (16384 bits total), all bits meaningful — the
+/// format is SIMD-clean at every precision tier (FP16x32 / FP32x16 / F64x8).
 #[derive(Clone, PartialEq, Eq)]
 pub struct VsaVector {
-    /// 157 u64 words = 10048 bits, only first 10000 are meaningful.
+    /// 256 u64 words = 16384 bits, all meaningful.
     pub words: [u64; VSA_WORDS],
 }
@@ -93,8 +98,8 @@ impl VsaVector {
     /// Create a VSA vector from a byte slice.
     ///
-    /// If `data` is shorter than [`VSA_BYTES`] (1250), uses blake3 in XOF
-    /// mode to expand it. If longer, only the first 1250 bytes are used.
+    /// If `data` is shorter than [`VSA_BYTES`] (2048), uses blake3 in XOF
+    /// mode to expand it. If longer, only the first 2048 bytes are used.
     ///
     /// # Example
     ///
@@ -144,7 +149,7 @@ impl VsaVector {
     /// Create a VSA vector from text using blake3 hash expansion.
     ///
     /// The text is hashed with blake3, then expanded via XOF mode to fill
-    /// all 1250 bytes. Deterministic: same text always produces same vector.
+    /// all 2048 bytes. Deterministic: same text always produces same vector.
     ///
     /// # Example
     ///
@@ -165,8 +170,8 @@ impl VsaVector {
     /// Zero-copy view of the vector as a byte slice.
     ///
-    /// Returns all `VSA_WORDS * 8` bytes (1256 bytes). The last 6 bytes
-    /// contain only padding zeros.
+    /// Returns all `VSA_WORDS * 8` bytes (2048 bytes). All bytes are
+    /// meaningful — 16384 / 8 = 2048 exactly, no padding.
     ///
     /// # Safety
     ///
@@ -180,7 +185,7 @@
         }
     }
 
-    /// Population count: number of set bits (within the meaningful 10,000).
+    /// Population count: number of set bits (out of 16,384).
     #[inline]
     pub fn popcount(&self) -> u32 {
         super::bitwise::popcount_raw(self.as_bytes()) as u32
     }
@@ -292,7 +297,7 @@ pub fn vsa_similarity(a: &VsaVector, b: &VsaVector) -> f32 {
 /// Raw Hamming distance between two VSA vectors.
 ///
-/// Counts the number of bit positions (out of 10,000) that differ.
+/// Counts the number of bit positions (out of 16,384) that differ.
 /// Delegates to SIMD-accelerated bitwise operations.
 ///
 /// # Example
@@ -306,7 +311,7 @@ pub fn vsa_hamming(a: &VsaVector, b: &VsaVector) -> u32 {
     super::bitwise::hamming_distance_raw(a.as_bytes(), b.as_bytes()) as u32
 }
 
-/// Cyclic bit permutation (left shift by `shift` positions within 10,000 bits).
+/// Cyclic bit permutation (left shift by `shift` positions within 16,384 bits).
 ///
 /// Bit at position `i` moves to position `(i + shift) % VSA_DIMS`.
 /// Used for sequence encoding: `permute(item, position)`.
@@ -409,7 +414,7 @@ pub fn vsa_clean<'a>(dirty: &VsaVector, codebook: &'a [VsaVector]) -> Option<&'a VsaVector> {
 impl VsaAccumulator {
     /// Create a new zero accumulator.
     ///
-    /// All 10,000 dimension tallies start at 0.
+    /// All 16,384 dimension tallies start at 0.
     ///
     /// # Example
     ///
@@ -693,9 +698,12 @@ mod tests {
     #[test]
     fn test_constants() {
-        assert_eq!(TAIL_BITS, 16);
-        assert_eq!(TAIL_MASK, 0xFFFF);
-        assert_eq!((VSA_WORDS - 1) * 64 + TAIL_BITS, VSA_DIMS);
+        assert_eq!(VSA_DIMS, 16_384);
+        assert_eq!(VSA_WORDS, 256);
+        assert_eq!(VSA_BYTES, 2048);
+        assert_eq!(TAIL_BITS, 64);
+        assert_eq!(TAIL_MASK, u64::MAX);
+        assert_eq!(VSA_WORDS * 64, VSA_DIMS);
     }
 
     #[test]
diff --git a/src/simd.rs b/src/simd.rs
index ae35a939..d832203d 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -805,6 +805,42 @@ mod scalar {
     pub fn saturating_sub(self, other: Self) -> Self {
         let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); } Self(out)
     }
+    // ── Tier 1: seismon rasterizer primitives (scalar fallbacks) ──
+    #[inline(always)]
+    pub fn pairwise_avg(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = ((self.0[i] as u16 + other.0[i] as u16 + 1) >> 1) as u8; } Self(out)
+    }
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u64 {
+        let mut m: u64 = 0; for i in 0..64 { if self.0[i] > other.0[i] { m |= 1 << i; } } m
+    }
+    #[inline(always)]
+    pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = if mask & (1 << i) != 0 { b.0[i] } else { a.0[i] }; } Self(out)
+    }
+    #[inline(always)]
+    pub fn shl_epi16(self, imm: u32) -> Self {
+        let mut out = [0u8; 64];
+        for i in (0..64).step_by(2) {
+            let v = u16::from_le_bytes([self.0[i], self.0[i+1]]);
+            let s = if imm < 16 { v << imm } else { 0 };
+            let b = s.to_le_bytes(); out[i] = b[0]; out[i+1] = b[1];
+        }
+        Self(out)
+    }
+    // ── Tier 2: sprite blit + palette remap (scalar fallbacks) ──
+    #[inline(always)]
+    pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
+        for i in 0..64 { if mask & (1 << i) != 0 { *ptr.add(i) = self.0[i]; } }
+    }
+    #[inline(always)]
+    pub fn saturating_add(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_add(other.0[i]); } Self(out)
+    }
+    #[inline(always)]
+    pub fn permute_bytes(self, idx: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[(idx.0[i] & 63) as usize]; } Self(out)
+    }
     #[inline(always)]
     pub fn unpack_lo_epi8(self, other: Self) -> Self {
         let mut out = [0u8; 64];
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
index 62fae415..e00ff5b1 100644
--- a/src/simd_avx2.rs
+++ b/src/simd_avx2.rs
@@ -806,6 +806,43 @@ impl U8x64 {
         Self(out)
     }
 
+    // ── Tier 1+2: seismon rasterizer primitives (AVX2 scalar fallbacks) ──
+
+    #[inline(always)]
+    pub fn pairwise_avg(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = ((self.0[i] as u16 + other.0[i] as u16 + 1) >> 1) as u8; } Self(out)
+    }
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u64 {
+        let mut m: u64 = 0; for i in 0..64 { if self.0[i] > other.0[i] { m |= 1 << i; } } m
+    }
+    #[inline(always)]
+    pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = if mask & (1 << i) != 0 { b.0[i] } else { a.0[i] }; } Self(out)
+    }
+    #[inline(always)]
+    pub fn shl_epi16(self, imm: u32) -> Self {
+        let mut out = [0u8; 64];
+        for i in (0..64).step_by(2) {
+            let v = u16::from_le_bytes([self.0[i], self.0[i+1]]);
+            let s = if imm < 16 { v << imm } else { 0 };
+            let b = s.to_le_bytes(); out[i] = b[0]; out[i+1] = b[1];
+        }
+        Self(out)
+    }
+    #[inline(always)]
+    pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
+        for i in 0..64 { if mask & (1 << i) != 0 { *ptr.add(i) = self.0[i]; } }
+    }
+    #[inline(always)]
+    pub fn saturating_add(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_add(other.0[i]); } Self(out)
+    }
+    #[inline(always)]
+    pub fn permute_bytes(self, idx: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[(idx.0[i] & 63) as usize]; } Self(out)
+    }
+
+    /// Interleave low bytes within each 128-bit lane.
     #[inline(always)]
     pub fn unpack_lo_epi8(self, other: Self) -> Self {
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs
index cec3be86..947fed0e 100644
--- a/src/simd_avx512.rs
+++ b/src/simd_avx512.rs
@@ -624,6 +624,82 @@ impl U8x64 {
         Self(unsafe { _mm512_subs_epu8(self.0, other.0) })
     }
 
+    // ── Tier 1: seismon rasterizer primitives ─────────────────────────
+
+    /// Pairwise unsigned byte average: (a[i] + b[i] + 1) >> 1 per byte.
+    /// Core op for 4×4 mipmap downsample (vpavgb + horizontal pair = 2 ops).
+    #[inline(always)]
+    pub fn pairwise_avg(self, other: Self) -> Self {
+        // SAFETY: AVX-512BW instruction, operates on all 64 bytes.
+        Self(unsafe { _mm512_avg_epu8(self.0, other.0) })
+    }
+
+    /// Byte-wise unsigned greater-than comparison. Returns 64-bit mask:
+    /// bit i set if self[i] > other[i]. Symmetric to `cmpeq_mask`.
+    /// Used for threshold density fields, depth/Z-test, hit-tests.
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u64 {
+        // SAFETY: AVX-512BW instruction. Unsigned compare via _epu8.
+        unsafe { _mm512_cmpgt_epu8_mask(self.0, other.0) }
+    }
+
+    /// Masked blend: for each bit in `mask`, select from `b` if set, else `a`.
+    /// Sprite alpha blit: write atlas pixel where mask bit set, keep framebuffer otherwise.
+    #[inline(always)]
+    pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
+        // SAFETY: AVX-512BW instruction. mask selects between a and b per byte.
+        Self(unsafe { _mm512_mask_blend_epi8(mask, a.0, b.0) })
+    }
+
+    /// Shift left each 16-bit lane by immediate bits (nibble write: place high nibble).
+    /// Completes the nibble shift pair with `shr_epi16`.
+    #[inline(always)]
+    pub fn shl_epi16(self, imm: u32) -> Self {
+        // `_mm512_slli_epi16` needs a const immediate, hence the match.
+        // Arms mirror the scalar tier: shift for imm < 16, zero for imm ≥ 16.
+        Self(unsafe { match imm {
+            0 => self.0,
+            1 => _mm512_slli_epi16(self.0, 1),
+            2 => _mm512_slli_epi16(self.0, 2),
+            3 => _mm512_slli_epi16(self.0, 3),
+            4 => _mm512_slli_epi16(self.0, 4),
+            5 => _mm512_slli_epi16(self.0, 5),
+            6 => _mm512_slli_epi16(self.0, 6),
+            7 => _mm512_slli_epi16(self.0, 7),
+            8 => _mm512_slli_epi16(self.0, 8),
+            9 => _mm512_slli_epi16(self.0, 9),
+            10 => _mm512_slli_epi16(self.0, 10),
+            11 => _mm512_slli_epi16(self.0, 11),
+            12 => _mm512_slli_epi16(self.0, 12),
+            13 => _mm512_slli_epi16(self.0, 13),
+            14 => _mm512_slli_epi16(self.0, 14),
+            15 => _mm512_slli_epi16(self.0, 15),
+            _ => _mm512_setzero_si512(),
+        }})
+    }
+
+    // ── Tier 2: sprite blit + palette LUT + cross-lane shuffle ────────
+
+    /// Masked store: write only bytes where mask bit is set.
+    /// Partial-tile writes at framebuffer edges without scalar fallback.
+    ///
+    /// # Safety
+    /// `ptr` must point to at least 64 writable bytes (may be unaligned).
+    #[inline(always)]
+    pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
+        // SAFETY: AVX-512BW masked store. Caller guarantees ptr validity.
+        _mm512_mask_storeu_epi8(ptr as *mut i8, mask, self.0);
+    }
+
+    /// Saturating unsigned addition: min(a + b, 255) per byte.
+    /// Additive blend without overflow wrap. Symmetric to `saturating_sub`.
+    #[inline(always)]
+    pub fn saturating_add(self, other: Self) -> Self {
+        // SAFETY: AVX-512BW instruction.
+        Self(unsafe { _mm512_adds_epu8(self.0, other.0) })
+    }
+
+    /// Cross-lane byte permute: rearrange all 64 bytes by index vector.
+    /// `idx[i]` selects which byte of `self` appears at position `i`.
+    /// Unlike `shuffle_bytes` (within-lane), this crosses 128-bit lane boundaries.
+    /// Needed for sprite atlas reorder and palette remap > 16 entries.
+    #[inline(always)]
+    pub fn permute_bytes(self, idx: Self) -> Self {
+        // SAFETY: requires AVX-512VBMI (_mm512_permutexvar_epi8). There is no
+        // software fallback on this path, so it must only be dispatched on
+        // CPUs that report the VBMI feature.
+        Self(unsafe { _mm512_permutexvar_epi8(idx.0, self.0) })
+    }
+
+    /// Interleave low bytes: [a0,b0,a1,b1,...] from lower halves.
 #[inline(always)]
 pub fn unpack_lo_epi8(self, other: Self) -> Self {
@@ -2728,3 +2804,119 @@ mod f16_tests {
         }
     }
 }
+
+#[cfg(test)]
+mod u8x64_rasterizer_tests {
+    use super::U8x64;
+
+    #[test]
+    fn pairwise_avg_basic() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(20);
+        let avg = a.pairwise_avg(b);
+        let mut out = [0u8; 64];
+        avg.copy_to_slice(&mut out);
+        // (10 + 20 + 1) >> 1 = 15
+        assert!(out.iter().all(|&v| v == 15));
+    }
+
+    #[test]
+    fn pairwise_avg_rounding() {
+        let a = U8x64::splat(1);
+        let b = U8x64::splat(2);
+        let avg = a.pairwise_avg(b);
+        let mut out = [0u8; 64];
+        avg.copy_to_slice(&mut out);
+        // (1 + 2 + 1) >> 1 = 2 (rounds up)
+        assert!(out.iter().all(|&v| v == 2));
+    }
+
+    #[test]
+    fn cmpgt_mask_basic() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(5);
+        assert_eq!(a.cmpgt_mask(b), u64::MAX); // all greater
+        assert_eq!(b.cmpgt_mask(a), 0);        // none greater
+        assert_eq!(a.cmpgt_mask(a), 0);        // equal = not greater
+    }
+
+    #[test]
+    fn mask_blend_selects_correctly() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(20);
+        // mask = 0: all from a
+        let r0 = U8x64::mask_blend(0, a, b);
+        let mut out = [0u8; 64];
+        r0.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 10));
+        // mask = all 1s: all from b
+        let r1 = U8x64::mask_blend(u64::MAX, a, b);
+        r1.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 20));
+        // mask = bit 0 only: first byte from b, rest from a
+        let r2 = U8x64::mask_blend(1, a, b);
+        r2.copy_to_slice(&mut out);
+        assert_eq!(out[0], 20);
+        assert_eq!(out[1], 10);
+    }
+
+    #[test]
+    fn shl_epi16_shift_4() {
+        let mut data = [0u8; 64];
+        data[0] = 0x0F; data[1] = 0x00; // u16 = 0x000F
+        let v = U8x64::from_slice(&data);
+        let shifted = v.shl_epi16(4);
+        let mut out = [0u8; 64];
+        shifted.copy_to_slice(&mut out);
+        let result = u16::from_le_bytes([out[0], out[1]]);
+        assert_eq!(result, 0x00F0);
+    }
+
+    #[test]
+    fn saturating_add_clamps_at_255() {
+        let a = U8x64::splat(200);
+        let b = U8x64::splat(100);
+        let sum = a.saturating_add(b);
+        let mut out = [0u8; 64];
+        sum.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 255));
+    }
+
+    #[test]
+    fn saturating_add_no_overflow() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(20);
+        let sum = a.saturating_add(b);
+        let mut out = [0u8; 64];
+        sum.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 30));
+    }
+
+    #[test]
+    fn permute_bytes_identity() {
+        let mut data = [0u8; 64];
+        for i in 0..64 { data[i] = i as u8; }
+        let v = U8x64::from_slice(&data);
+        // Identity permutation
+        let mut idx = [0u8; 64];
+        for i in 0..64 { idx[i] = i as u8; }
+        let perm = v.permute_bytes(U8x64::from_slice(&idx));
+        let mut out = [0u8; 64];
+        perm.copy_to_slice(&mut out);
+        assert_eq!(out, data);
+    }
+
+    #[test]
+    fn permute_bytes_reverse() {
+        let mut data = [0u8; 64];
+        for i in 0..64 { data[i] = i as u8; }
+        let v = U8x64::from_slice(&data);
+        // Reverse permutation
+        let mut idx = [0u8; 64];
+        for i in 0..64 { idx[i] = (63 - i) as u8; }
+        let perm = v.permute_bytes(U8x64::from_slice(&idx));
+        let mut out = [0u8; 64];
+        perm.copy_to_slice(&mut out);
+        for i in 0..64 { assert_eq!(out[i], (63 - i) as u8); }
+    }
+}
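To close the loop on what these primitives are for, a hedged sketch of the sprite-blit idiom the Tier 1/2 comments describe (the function, key, and glow constant are illustrative; only the `U8x64` operations are from this patch):

```rust
use crate::simd::U8x64;

// Illustration only: alpha-keyed sprite blit over one 64-byte row, then an
// additive glow pass: cmpgt_mask + mask_blend + saturating_add in sequence.
fn blit_row(fb: &mut [u8; 64], sprite: &[u8; 64], alpha_key: u8) {
    let dst = U8x64::from_slice(&fb[..]);
    let src = U8x64::from_slice(&sprite[..]);
    // Opaque wherever the sprite exceeds the key; keep framebuffer elsewhere.
    let mask = src.cmpgt_mask(U8x64::splat(alpha_key));
    let blended = U8x64::mask_blend(mask, dst, src);
    // Additive glow, clamped at 255 by the saturating add.
    let glowed = blended.saturating_add(U8x64::splat(8));
    glowed.copy_to_slice(&mut fb[..]);
}
```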