diff --git a/src/hpc/arrow_bridge.rs b/src/hpc/arrow_bridge.rs
index 363c4e55..7ff1a7cf 100644
--- a/src/hpc/arrow_bridge.rs
+++ b/src/hpc/arrow_bridge.rs
@@ -17,13 +17,13 @@
 pub const PLANE_BINARY_BYTES: usize = 2048;
 pub const BINARY_BYTES: usize = PLANE_BYTES; // 2048

 /// Soaking accumulator length (i8 entries per plane).
-pub const SOAKING_DIMS: usize = 10000;
+pub const SOAKING_DIMS: usize = 16_384;

-/// Sigma attention mask width in bytes (10000-bit mask).
-pub const SIGMA_MASK_BYTES: usize = 1250;
+/// Sigma attention mask width in bytes (16384-bit mask = 2048 bytes).
+pub const SIGMA_MASK_BYTES: usize = 2048;

 /// Default soaking dimension count.
-pub const DEFAULT_SOAKING_DIM: usize = 10000;
+pub const DEFAULT_SOAKING_DIM: usize = 16_384;

 /// Schema field names for the bind_nodes_v2 three-plane layout.
 pub mod schema {
@@ -266,7 +266,7 @@ pub struct BindNodeV2 {
     pub object_soaking: Option<Vec<i8>>,
     /// Composite XOR fingerprint: S XOR P XOR O.
     pub spo_binary: [u8; PLANE_BINARY_BYTES],
-    /// 10000-bit attention mask (sigma).
+    /// 16384-bit attention mask (sigma).
     pub sigma_mask: [u8; SIGMA_MASK_BYTES],
     /// NARS frequency (u16 fixed-point, 0..65535).
     pub nars_frequency: u16,
@@ -572,7 +572,7 @@ impl BindNodeV2 {
     /// Convert a Plane accumulator (16384 i8) to a soaking vector (SOAKING_DIMS i8).
     ///
-    /// Truncates if the accumulator is longer than SOAKING_DIMS, pads with 0 if shorter.
+    /// With SOAKING_DIMS = 16384, the copy is one-to-one (no truncation, no padding).
     fn acc_to_soaking(acc: &[i8; 16384]) -> Vec<i8> {
         let mut soaking = vec![0i8; SOAKING_DIMS];
         let copy_len = SOAKING_DIMS.min(acc.len());
@@ -878,8 +878,8 @@ mod tests {
     #[test]
     fn schema_constants() {
         assert_eq!(PLANE_BINARY_BYTES, 2048);
-        assert_eq!(SOAKING_DIMS, 10000);
-        assert_eq!(SIGMA_MASK_BYTES, 1250);
+        assert_eq!(SOAKING_DIMS, 16_384);
+        assert_eq!(SIGMA_MASK_BYTES, 2048);
     }

     #[test]
@@ -1140,9 +1140,9 @@ mod tests {
         let (mut s, mut p, mut o) = make_test_planes();
         let node = BindNodeV2::new(&mut s, &mut p, &mut o, "test");
         assert_eq!(node.sigma_mask.len(), SIGMA_MASK_BYTES);
-        assert_eq!(node.sigma_mask.len(), 1250);
-        // sigma_mask * 8 = 10000 bits
-        assert_eq!(node.sigma_mask.len() * 8, 10000);
+        assert_eq!(node.sigma_mask.len(), 2048);
+        // sigma_mask * 8 = 16384 bits
+        assert_eq!(node.sigma_mask.len() * 8, 16_384);
     }

     #[test]
diff --git a/src/hpc/audio/codec_map.rs b/src/hpc/audio/codec_map.rs
index 24e2935a..df664327 100644
--- a/src/hpc/audio/codec_map.rs
+++ b/src/hpc/audio/codec_map.rs
@@ -35,11 +35,17 @@
 /// what aspect of that codec it captures, and what it replaces.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum CodecSource {
+    /// Opus / CELT.
     Opus,
+    /// OpenAI Whisper.
     Whisper,
+    /// MP3.
     Mp3,
+    /// Ogg Vorbis.
     OggVorbis,
+    /// Suno Bark.
     Bark,
+    /// ElevenLabs.
     ElevenLabs,
 }

@@ -66,11 +72,17 @@ pub enum AudioAspect {
 /// Complete provenance record for one primitive.
 pub struct Provenance {
+    /// Name of the primitive in this codebase.
     pub our_type: &'static str,
+    /// Byte size of the primitive (0 = transform/decision, not stored).
     pub byte_size: usize,
+    /// Production codec the primitive was transcoded from.
     pub source: CodecSource,
+    /// Aspect of audio the primitive captures.
     pub aspect: AudioAspect,
+    /// Concept in the source codec that this corresponds to.
     pub source_concept: &'static str,
+    /// What this primitive replaces (in the source codec or a peer).
    pub what_it_replaces: &'static str,
 }

@@ -212,6 +224,7 @@ pub const PROVENANCE: &[Provenance] = &[
 /// Bark tokens: ~128 bytes per frame
 /// Ours: 52-69 bytes per frame (complete, including phase + identity)
 pub const FRAME_BUDGET: usize = 52;
+/// Per-frame byte budget when the TTS RvqFrame (17 bytes) is also carried.
 pub const FRAME_BUDGET_WITH_TTS: usize = 69;

 /// Codec comparison: bits per second at comparable quality.
diff --git a/src/hpc/audio/modes.rs b/src/hpc/audio/modes.rs
index 9f042ca1..9464894d 100644
--- a/src/hpc/audio/modes.rs
+++ b/src/hpc/audio/modes.rs
@@ -27,12 +27,19 @@ use super::bands;
 /// and maps to a Base17 stride for spectral character.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 pub enum Mode {
+    /// Ionian (major): W-W-H-W-W-W-H — bright, resolved.
     Ionian,     // Major: W-W-H-W-W-W-H → bright, resolved
+    /// Dorian: minor with natural 6th — warm, jazz.
     Dorian,     // Minor with ♮6: warm, jazz
+    /// Phrygian: minor with flat 2nd — dark, flamenco.
     Phrygian,   // Minor with ♭2: dark, flamenco
+    /// Lydian: major with sharp 4th — dreamy, floating.
     Lydian,     // Major with ♯4: dreamy, floating
+    /// Mixolydian: major with flat 7th — dominant, bluesy.
     Mixolydian, // Major with ♭7: dominant, bluesy
+    /// Aeolian (natural minor) — sad, reflective.
     Aeolian,    // Natural minor: sad, reflective
+    /// Locrian (diminished) — unstable, tense.
     Locrian,    // Diminished: unstable, tense
 }
diff --git a/src/hpc/audio/phase.rs b/src/hpc/audio/phase.rs
index 18dd6684..348ed800 100644
--- a/src/hpc/audio/phase.rs
+++ b/src/hpc/audio/phase.rs
@@ -128,6 +128,7 @@ pub fn phase_gradient(
 /// Together: complete nonverbal vocal characterization in 52 bytes.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct PhaseDescriptor {
+    /// 4-byte packed descriptor (coherence, gradient, entropy, stability).
     pub bytes: [u8; 4],
 }
diff --git a/src/hpc/audio/voice.rs b/src/hpc/audio/voice.rs
index ff051c4c..c5cba037 100644
--- a/src/hpc/audio/voice.rs
+++ b/src/hpc/audio/voice.rs
@@ -32,10 +32,12 @@ pub const N_VOICE_CHANNELS: usize = 16;
 /// Compression: 16 bytes (vs Bark's 1024-dim semantic token embedding).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 pub struct VoiceArchetype {
+    /// 16 i8 voice-identity channels (pitch / resonance / articulation / prosody).
     pub channels: [i8; N_VOICE_CHANNELS],
 }

 impl VoiceArchetype {
+    /// Serialized size of a VoiceArchetype, in bytes.
     pub const BYTE_SIZE: usize = N_VOICE_CHANNELS;

     /// Zero archetype (neutral voice).
@@ -177,6 +179,7 @@ impl VoiceArchetype {
 /// For a 256-entry codebook: 256 × 16 bytes = 4 KB.
 #[derive(Clone, Debug)]
 pub struct VoiceCodebook {
+    /// Voice archetype prototypes; index = codebook ID.
     pub entries: Vec<VoiceArchetype>,
 }

@@ -245,6 +248,7 @@ pub struct RvqFrame {
 }

 impl RvqFrame {
+    /// Serialized size of an RvqFrame, in bytes.
     pub const BYTE_SIZE: usize = 17;

     /// Serialize to 17 bytes.
@@ -292,13 +296,17 @@ impl RvqFrame {
 /// VoiceFrame (21B) is the compressed synthesis frame.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct VoiceFrame {
+    /// Compressed RVQ codes (archetype + coarse + fine).
     pub rvq: RvqFrame,
+    /// Per-frame phase dynamics descriptor.
     pub phase: super::phase::PhaseDescriptor,
 }

 impl VoiceFrame {
+    /// Serialized size of a VoiceFrame, in bytes (RvqFrame + 4-byte phase).
     pub const BYTE_SIZE: usize = RvqFrame::BYTE_SIZE + 4; // 21 bytes

+    /// Serialize this VoiceFrame to its 21-byte wire representation.
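+    /// Layout: bytes[0..17] = RvqFrame codes, bytes[17..21] = PhaseDescriptor.
+    ///
+    /// A minimal round-trip sketch (`ignore`d doc test; the `rvq` and `phase`
+    /// values are assumed to be built elsewhere):
+    ///
+    /// ```ignore
+    /// let frame = VoiceFrame { rvq, phase };
+    /// let wire = frame.to_bytes();                      // [u8; 21]
+    /// assert_eq!(VoiceFrame::from_bytes(&wire), frame);
+    /// ```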
    pub fn to_bytes(&self) -> [u8; Self::BYTE_SIZE] {
         let mut bytes = [0u8; Self::BYTE_SIZE];
         bytes[..17].copy_from_slice(&self.rvq.to_bytes());
@@ -306,6 +314,7 @@ impl VoiceFrame {
         bytes
     }

+    /// Deserialize a VoiceFrame from its 21-byte wire representation.
     pub fn from_bytes(bytes: &[u8; Self::BYTE_SIZE]) -> Self {
         let mut rvq_bytes = [0u8; 17];
         rvq_bytes.copy_from_slice(&bytes[..17]);
diff --git a/src/hpc/deepnsm.rs b/src/hpc/deepnsm.rs
index bdacb4b0..b855cab9 100644
--- a/src/hpc/deepnsm.rs
+++ b/src/hpc/deepnsm.rs
@@ -646,14 +646,14 @@ pub fn nsm_decompose(text: &str) -> NsmDecomposition {
     NsmDecomposition { weights, dominant }
 }

-/// Encode an NSM decomposition as a 10000-bit binary vector (1250 bytes).
+/// Encode an NSM decomposition as a 16384-bit binary vector (2048 bytes).
 ///
 /// For each prime with weight > 0, hash prime_index with blake3 to produce
 /// a deterministic bit pattern, then XOR into result for primes whose
 /// normalised weight exceeds 0.5 of the max weight (or any nonzero weight
 /// when only one prime is present).
-pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 1250] {
-    let mut result = [0u8; 1250];
+pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 2048] {
+    let mut result = [0u8; 2048];

     let max_w = decomp.weights.iter().cloned().fold(0.0f32, f32::max);
     if max_w == 0.0 {
@@ -665,20 +665,20 @@ pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 1250] {
         if w < threshold {
             continue;
         }
-        // Hash the prime index to get a deterministic 1250-byte pattern
+        // Hash the prime index to get a deterministic 2048-byte pattern
         let hash_input = (i as u32).to_le_bytes();
-        let mut pattern = [0u8; 1250];
-        // Use blake3 in extended-output mode to fill 1250 bytes
+        let mut pattern = [0u8; 2048];
+        // Use blake3 in extended-output mode to fill 2048 bytes
         let mut hasher = blake3::Hasher::new();
         hasher.update(&hash_input);
         let mut reader = hasher.finalize_xof();
         reader.fill(&mut pattern);

-        // XOR 1250 bytes via crate::simd::U8x64.
-        // 1250 = 19×64 (1216) + 34 scalar remainder.
+        // XOR 2048 bytes via crate::simd::U8x64.
+        // 2048 = 32×64 exactly (no scalar remainder, SIMD-clean).
         {
             use crate::simd::U8x64;
-            let chunks = 1250 / 64; // 19
+            let chunks = 2048 / 64; // 32
             for c in 0..chunks {
                 let off = c * 64;
                 let vr = U8x64::from_slice(&result[off..off + 64]);
@@ -686,10 +686,6 @@ pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 1250] {
                 let xored = vr ^ vp;
                 xored.copy_to_slice(&mut result[off..off + 64]);
             }
-            // Scalar remainder (34 bytes).
-            for j in (chunks * 64)..1250 {
-                result[j] ^= pattern[j];
-            }
         }
     }
diff --git a/src/hpc/framebuffer.rs b/src/hpc/framebuffer.rs
new file mode 100644
index 00000000..a0d214d6
--- /dev/null
+++ b/src/hpc/framebuffer.rs
@@ -0,0 +1,1303 @@
+//! Palette-indexed framebuffer — ndarray IS the graphics card.
+//!
+//! Composes a screen as a `[u8; W*H]` palette-indexed bitmap. Wire format
+//! is palette_codec-compressed (4-bit nibble at 16 colors → 8× smaller
+//! than the RGBA8888 the canvas consumes). q2 receives a ready-made bitmap
+//! and blits with `canvas.putImageData(...)`.
+//!
+//! # Tier-adaptive palette
+//!
+//! The detected SIMD tier determines the palette depth AND foveal detail
+//! budget. Lower-capability hardware gets fewer colors and simpler sprites
+//! that compress better and process faster:
+//!
+//! | Tier        | Palette | Bits/px | Sprite | Wire KB (1024²) |
+//! |-------------|---------|---------|--------|-----------------|
+//! | AVX-512/AMX | 16      | 4       | 8×8    | 512             |
+//! | AVX2        | 8       | 3       | 6×6    | 384             |
+//! | NEON/scalar | 4       | 2       | 4×4    | 256             |
+//!
+//! # Views
+//!
+//! - **MRI view** — full-screen density heatmap, all nodes visible,
+//!   palette maps to intensity (white=hot, black=cold). Overview radar.
+//! - **Neo4j view** — nodes as dot sprites, edges as Bresenham lines,
+//!   labels as glyph sprites. Interactive-style graph display.
+//! - **Cloud view** — distant/peripheral nodes as a nibble-packed
+//!   density field at mipmap L1/L2. Foveal region sharp, periphery fog.
+
+use crate::hpc::palette_codec::{bits_for_palette_size, pack_indices};
+use crate::simd::PREFERRED_F32_LANES;
+
+// ─────────────────────────────────────────────────────────────────────
+// Tier-adaptive palette selection
+// ─────────────────────────────────────────────────────────────────────
+
+/// Palette depth based on detected SIMD tier.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum PaletteTier {
+    /// AVX-512 / AMX: 16 colors, 4 bits/pixel, 8×8 sprites.
+    Full16,
+    /// AVX2: 8 colors, 3 bits/pixel, 6×6 sprites.
+    Mid8,
+    /// NEON / scalar: 4 colors, 2 bits/pixel, 4×4 sprites.
+    Low4,
+}
+
+impl PaletteTier {
+    /// Auto-detect from the active SIMD lane width.
+    pub fn detect() -> Self {
+        match PREFERRED_F32_LANES {
+            16 => Self::Full16, // AVX-512 / AMX
+            8 => Self::Mid8,    // AVX2
+            _ => Self::Low4,    // NEON (4), scalar (≤4)
+        }
+    }
+
+    /// Number of palette entries for this tier.
+    #[inline]
+    pub fn palette_size(self) -> usize {
+        match self {
+            Self::Full16 => 16,
+            Self::Mid8 => 8,
+            Self::Low4 => 4,
+        }
+    }
+
+    /// Bits per pixel for this tier.
+    #[inline]
+    pub fn bits_per_pixel(self) -> usize {
+        bits_for_palette_size(self.palette_size())
+    }
+
+    /// Sprite edge length (square) for node dots.
+    #[inline]
+    pub fn sprite_size(self) -> usize {
+        match self {
+            Self::Full16 => 8,
+            Self::Mid8 => 6,
+            Self::Low4 => 4,
+        }
+    }
+
+    /// Wire size in bytes for a `width × height` framebuffer at this tier.
+    #[inline]
+    pub fn wire_bytes(self, width: usize, height: usize) -> usize {
+        let total_px = width * height;
+        let bpp = self.bits_per_pixel();
+        (total_px * bpp + 7) / 8
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Framebuffer + SpriteAtlas
+// ─────────────────────────────────────────────────────────────────────
+
+/// View mode — determines how the framebuffer is composed.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ViewMode {
+    /// Density heatmap — every node plots at its position, intensity =
+    /// confidence. Palette maps linearly: 0 = background, max = hottest.
+    Mri,
+    /// Nodes as dot sprites, edges as Bresenham lines. Neo4j-style.
+    Neo4j,
+    /// Foveal sharp, peripheral density fog. Hybrid.
+    Cloud,
+}
+
+/// Palette-indexed framebuffer. Each pixel is a u8 index into a palette
+/// whose size is determined by the SIMD tier.
+pub struct Framebuffer {
+    pub width: usize,
+    pub height: usize,
+    pub tier: PaletteTier,
+    /// Row-major palette indices, length = width × height.
+    pub pixels: Vec<u8>,
+    /// Dirty rectangle: (x0, y0, x1, y1). Only the region inside needs
+    /// re-encoding on the wire. Reset to (0,0,0,0) after each `pack()`.
+    pub dirty: (usize, usize, usize, usize),
+}
+
+impl Framebuffer {
+    /// Allocate a cleared framebuffer at the given resolution and auto-detected tier.
+    pub fn new(width: usize, height: usize) -> Self {
+        let tier = PaletteTier::detect();
+        Self {
+            width,
+            height,
+            tier,
+            pixels: vec![0u8; width * height],
+            dirty: (0, 0, width, height),
+        }
+    }
+
+    /// Allocate with an explicit tier (for testing or override).
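+    ///
+    /// A minimal sketch (`ignore`d doc test) of pinning the low tier in a test:
+    ///
+    /// ```ignore
+    /// let fb = Framebuffer::with_tier(64, 64, PaletteTier::Low4);
+    /// assert_eq!(fb.tier.palette_size(), 4);
+    /// assert_eq!(fb.pixels.len(), 64 * 64);
+    /// ```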
+    pub fn with_tier(width: usize, height: usize, tier: PaletteTier) -> Self {
+        Self {
+            width,
+            height,
+            tier,
+            pixels: vec![0u8; width * height],
+            dirty: (0, 0, width, height),
+        }
+    }
+
+    /// Clear the entire framebuffer to palette index 0 (background).
+    #[inline]
+    pub fn clear(&mut self) {
+        self.pixels.fill(0);
+        self.dirty = (0, 0, self.width, self.height);
+    }
+
+    /// Set a single pixel (with bounds check). Expands dirty rect.
+    #[inline]
+    pub fn set_pixel(&mut self, x: usize, y: usize, color: u8) {
+        if x < self.width && y < self.height {
+            self.pixels[y * self.width + x] = color;
+            self.expand_dirty(x, y, x + 1, y + 1);
+        }
+    }
+
+    /// Plot a filled dot (square sprite) centered at (cx, cy).
+    pub fn plot_dot(&mut self, cx: usize, cy: usize, color: u8) {
+        let r = self.tier.sprite_size() / 2;
+        let x0 = cx.saturating_sub(r);
+        let y0 = cy.saturating_sub(r);
+        let x1 = (cx + r).min(self.width);
+        let y1 = (cy + r).min(self.height);
+        for y in y0..y1 {
+            let row = y * self.width;
+            for x in x0..x1 {
+                self.pixels[row + x] = color;
+            }
+        }
+        self.expand_dirty(x0, y0, x1, y1);
+    }
+
+    /// Draw a Bresenham line from (x0,y0) to (x1,y1) with palette index.
+    pub fn draw_line(&mut self, mut x0: i32, mut y0: i32, x1: i32, y1: i32, color: u8) {
+        // Remember the starting endpoint: the loop below mutates (x0, y0)
+        // until it equals (x1, y1), and the dirty rect must span the whole
+        // line, not just its final pixel.
+        let (start_x, start_y) = (x0, y0);
+        let dx = (x1 - x0).abs();
+        let dy = -(y1 - y0).abs();
+        let sx: i32 = if x0 < x1 { 1 } else { -1 };
+        let sy: i32 = if y0 < y1 { 1 } else { -1 };
+        let mut err = dx + dy;
+
+        loop {
+            if x0 >= 0 && y0 >= 0 && (x0 as usize) < self.width && (y0 as usize) < self.height {
+                self.pixels[y0 as usize * self.width + x0 as usize] = color;
+            }
+            if x0 == x1 && y0 == y1 { break; }
+            let e2 = 2 * err;
+            if e2 >= dy { err += dy; x0 += sx; }
+            if e2 <= dx { err += dx; y0 += sy; }
+        }
+        let (lx, rx) = (start_x.min(x1).max(0) as usize, (start_x.max(x1).max(0) as usize + 1).min(self.width));
+        let (ly, ry) = (start_y.min(y1).max(0) as usize, (start_y.max(y1).max(0) as usize + 1).min(self.height));
+        self.expand_dirty(lx, ly, rx, ry);
+    }
+
+    /// MRI density blit — for each node, increment the pixel at its projected
+    /// position. Clamped to palette max so saturated regions show as hottest.
+    pub fn blit_mri_density(&mut self, screen_xs: &[usize], screen_ys: &[usize]) {
+        let max_idx = (self.tier.palette_size() - 1) as u8;
+        for (&sx, &sy) in screen_xs.iter().zip(screen_ys.iter()) {
+            if sx < self.width && sy < self.height {
+                let idx = sy * self.width + sx;
+                self.pixels[idx] = self.pixels[idx].saturating_add(1).min(max_idx);
+            }
+        }
+        self.dirty = (0, 0, self.width, self.height);
+    }
+
+    /// Pack the framebuffer into palette_codec wire format.
+    ///
+    /// Returns `(packed_u64s, bits_per_pixel)`. The consumer unpacks with
+    /// `palette_codec::unpack_indices(&packed, bpp, w*h)`.
+    pub fn pack(&mut self) -> (Vec<u64>, usize) {
+        let bpp = self.tier.bits_per_pixel();
+        let packed = pack_indices(&self.pixels, bpp);
+        self.dirty = (0, 0, 0, 0);
+        (packed, bpp)
+    }
+
+    /// Byte count of the last `pack()` output (for bandwidth estimation).
+    pub fn packed_byte_estimate(&self) -> usize {
+        self.tier.wire_bytes(self.width, self.height)
+    }
+
+    fn expand_dirty(&mut self, x0: usize, y0: usize, x1: usize, y1: usize) {
+        self.dirty.0 = self.dirty.0.min(x0);
+        self.dirty.1 = self.dirty.1.min(y0);
+        self.dirty.2 = self.dirty.2.max(x1);
+        self.dirty.3 = self.dirty.3.max(y1);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Mipmap — bitwise 4× downsampling for LOD pyramid.
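+// Each level halves both axes; `build_mipmap_pyramid` stops once both
+// dimensions fall below `min_dim`.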
+// ─────────────────────────────────────────────────────────────────────
+
+/// Downsample a framebuffer 2× in each axis (4× total pixels).
+///
+/// Each 2×2 block maps to one pixel. Strategy: max (brightest wins),
+/// matching the MRI heatmap "any signal in this region" semantic.
+pub fn downsample_2x(src: &[u8], src_w: usize, src_h: usize) -> (Vec<u8>, usize, usize) {
+    let dst_w = src_w / 2;
+    let dst_h = src_h / 2;
+    let mut dst = vec![0u8; dst_w * dst_h];
+    for dy in 0..dst_h {
+        for dx in 0..dst_w {
+            let sx = dx * 2;
+            let sy = dy * 2;
+            let a = src[sy * src_w + sx];
+            let b = src[sy * src_w + sx + 1];
+            let c = src[(sy + 1) * src_w + sx];
+            let d = src[(sy + 1) * src_w + sx + 1];
+            dst[dy * dst_w + dx] = a.max(b).max(c).max(d);
+        }
+    }
+    (dst, dst_w, dst_h)
+}
+
+/// Full mipmap pyramid from L0 (original) down to the level where
+/// both dimensions are < `min_dim`.
+pub fn build_mipmap_pyramid(fb: &Framebuffer, min_dim: usize) -> Vec<(Vec<u8>, usize, usize)> {
+    let mut levels = Vec::new();
+    let mut cur = fb.pixels.clone();
+    let mut w = fb.width;
+    let mut h = fb.height;
+    levels.push((cur.clone(), w, h));
+    while w > min_dim && h > min_dim {
+        let (down, dw, dh) = downsample_2x(&cur, w, h);
+        levels.push((down.clone(), dw, dh));
+        cur = down;
+        w = dw;
+        h = dh;
+    }
+    levels
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Compose: RenderFrame → Framebuffer (the "graphics card" pipeline).
+// ─────────────────────────────────────────────────────────────────────
+
+/// Project a 3D position to 2D screen coordinates (orthographic).
+///
+/// Simple orthographic: x → screen_x, y → screen_y (z ignored).
+/// Scale and offset are applied. This is the dumbest projection that
+/// works; replace with perspective when q2 has a camera matrix.
+#[inline]
+pub fn project_ortho(
+    pos_x: f32, pos_y: f32,
+    scale: f32, offset_x: f32, offset_y: f32,
+    screen_w: usize, screen_h: usize,
+) -> (usize, usize) {
+    let sx = ((pos_x * scale + offset_x) as usize).min(screen_w.saturating_sub(1));
+    let sy = ((pos_y * scale + offset_y) as usize).min(screen_h.saturating_sub(1));
+    (sx, sy)
+}
+
+use crate::hpc::renderer::RenderFrame;
+
+/// Compose a Neo4j-style view: dots at nodes, lines for edges.
+///
+/// `edges` is a list of (source_idx, target_idx) pairs into the frame's
+/// node arrays. `node_color` and `edge_color` are the palette indices
+/// used for dot sprites and Bresenham lines respectively.
+pub fn compose_neo4j(
+    fb: &mut Framebuffer,
+    frame: &RenderFrame,
+    edges: &[(usize, usize)],
+    scale: f32,
+    offset: (f32, f32),
+    node_color: u8,
+    edge_color: u8,
+) {
+    fb.clear();
+    let w = fb.width;
+    let h = fb.height;
+
+    // Edges first (so nodes overdraw on top).
+    for &(src, tgt) in edges {
+        if src >= frame.len || tgt >= frame.len { continue; }
+        let (sx0, sy0) = project_ortho(
+            frame.positions[src * 3], frame.positions[src * 3 + 1],
+            scale, offset.0, offset.1, w, h,
+        );
+        let (sx1, sy1) = project_ortho(
+            frame.positions[tgt * 3], frame.positions[tgt * 3 + 1],
+            scale, offset.0, offset.1, w, h,
+        );
+        fb.draw_line(sx0 as i32, sy0 as i32, sx1 as i32, sy1 as i32, edge_color);
+    }
+
+    // Nodes as dot sprites.
+    for i in 0..frame.len {
+        let (sx, sy) = project_ortho(
+            frame.positions[i * 3], frame.positions[i * 3 + 1],
+            scale, offset.0, offset.1, w, h,
+        );
+        fb.plot_dot(sx, sy, node_color);
+    }
+}
+
+/// Compose an MRI density heatmap view.
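+///
+/// A minimal usage sketch (`ignore`d doc test; assumes `frame` is a
+/// populated `RenderFrame`):
+///
+/// ```ignore
+/// let mut fb = Framebuffer::new(1024, 1024);
+/// compose_mri(&mut fb, &frame, 1.0, (0.0, 0.0));
+/// let (packed, bpp) = fb.pack(); // palette_codec wire format for q2
+/// ```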
+pub fn compose_mri( + fb: &mut Framebuffer, + frame: &RenderFrame, + scale: f32, + offset: (f32, f32), +) { + fb.clear(); + let w = fb.width; + let h = fb.height; + + let mut xs = Vec::with_capacity(frame.len); + let mut ys = Vec::with_capacity(frame.len); + for i in 0..frame.len { + let (sx, sy) = project_ortho( + frame.positions[i * 3], frame.positions[i * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + xs.push(sx); + ys.push(sy); + } + fb.blit_mri_density(&xs, &ys); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::hpc::palette_codec::unpack_indices; + + #[test] + fn tier_detect_matches_lane_width() { + let tier = PaletteTier::detect(); + match PREFERRED_F32_LANES { + 16 => assert_eq!(tier, PaletteTier::Full16), + 8 => assert_eq!(tier, PaletteTier::Mid8), + _ => assert_eq!(tier, PaletteTier::Low4), + } + } + + #[test] + fn tier_palette_sizes() { + assert_eq!(PaletteTier::Full16.palette_size(), 16); + assert_eq!(PaletteTier::Mid8.palette_size(), 8); + assert_eq!(PaletteTier::Low4.palette_size(), 4); + } + + #[test] + fn tier_bits_per_pixel() { + assert_eq!(PaletteTier::Full16.bits_per_pixel(), 4); + assert_eq!(PaletteTier::Mid8.bits_per_pixel(), 3); + assert_eq!(PaletteTier::Low4.bits_per_pixel(), 2); + } + + #[test] + fn tier_sprite_sizes() { + assert_eq!(PaletteTier::Full16.sprite_size(), 8); + assert_eq!(PaletteTier::Mid8.sprite_size(), 6); + assert_eq!(PaletteTier::Low4.sprite_size(), 4); + } + + #[test] + fn framebuffer_clear_sets_all_zero() { + let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + fb.pixels[100] = 5; + fb.clear(); + assert!(fb.pixels.iter().all(|&p| p == 0)); + } + + #[test] + fn plot_dot_size_matches_tier() { + for tier in [PaletteTier::Full16, PaletteTier::Mid8, PaletteTier::Low4] { + let mut fb = Framebuffer::with_tier(64, 64, tier); + fb.plot_dot(32, 32, 1); + let lit: usize = fb.pixels.iter().filter(|&&p| p > 0).count(); + let expected = tier.sprite_size() * tier.sprite_size(); + assert_eq!(lit, expected, "tier {:?}", tier); + } + } + + #[test] + fn bresenham_horizontal_line() { + let mut fb = Framebuffer::with_tier(32, 32, PaletteTier::Full16); + fb.draw_line(2, 5, 10, 5, 3); + for x in 2..=10 { + assert_eq!(fb.pixels[5 * 32 + x], 3); + } + } + + #[test] + fn bresenham_diagonal_line() { + let mut fb = Framebuffer::with_tier(32, 32, PaletteTier::Full16); + fb.draw_line(0, 0, 7, 7, 2); + for i in 0..=7 { + assert_eq!(fb.pixels[i * 32 + i], 2); + } + } + + #[test] + fn mri_density_accumulates() { + let mut fb = Framebuffer::with_tier(16, 16, PaletteTier::Full16); + let xs = vec![5, 5, 5]; // same pixel hit 3 times + let ys = vec![5, 5, 5]; + fb.blit_mri_density(&xs, &ys); + assert_eq!(fb.pixels[5 * 16 + 5], 3); + } + + #[test] + fn mri_density_clamps_to_palette_max() { + let mut fb = Framebuffer::with_tier(16, 16, PaletteTier::Low4); + // Low4 = palette_size 4, max index = 3. 
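+        // Ten hits at one pixel would count to 10; the saturating_add +
+        // min clamp holds it at the palette max (3).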
+        let xs = vec![2; 10];
+        let ys = vec![2; 10];
+        fb.blit_mri_density(&xs, &ys);
+        assert_eq!(fb.pixels[2 * 16 + 2], 3); // clamped
+    }
+
+    #[test]
+    fn pack_roundtrips_through_palette_codec() {
+        let mut fb = Framebuffer::with_tier(16, 16, PaletteTier::Full16);
+        fb.plot_dot(8, 8, 7);
+        let original = fb.pixels.clone();
+        let (packed, bpp) = fb.pack();
+        let recovered = unpack_indices(&packed, bpp, 16 * 16);
+        assert_eq!(original, recovered);
+    }
+
+    #[test]
+    fn downsample_2x_shrinks_dimensions() {
+        let src = vec![1u8; 64 * 64];
+        let (dst, w, h) = downsample_2x(&src, 64, 64);
+        assert_eq!(w, 32);
+        assert_eq!(h, 32);
+        assert_eq!(dst.len(), 32 * 32);
+        assert!(dst.iter().all(|&p| p == 1));
+    }
+
+    #[test]
+    fn mipmap_pyramid_has_correct_levels() {
+        let fb = Framebuffer::with_tier(256, 256, PaletteTier::Full16);
+        let pyramid = build_mipmap_pyramid(&fb, 8);
+        // 256 → 128 → 64 → 32 → 16 → 8 = 6 levels (including L0).
+        assert!(pyramid.len() >= 5);
+        assert_eq!(pyramid[0].1, 256);
+        assert_eq!(pyramid[1].1, 128);
+    }
+
+    #[test]
+    fn compose_neo4j_plots_nodes_and_edges() {
+        let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16);
+        let mut frame = RenderFrame::with_capacity(16);
+        // Two nodes
+        frame.len = 2;
+        frame.positions[0] = 10.0; frame.positions[1] = 10.0; frame.positions[2] = 0.0;
+        frame.positions[3] = 50.0; frame.positions[4] = 50.0; frame.positions[5] = 0.0;
+        let edges = vec![(0, 1)];
+        compose_neo4j(&mut fb, &frame, &edges, 1.0, (0.0, 0.0), 5, 2);
+        // Node 0 should have a dot around (10, 10).
+        assert_eq!(fb.pixels[10 * 64 + 10], 5);
+        // Edge should have at least one pixel of color 2 on the diagonal.
+        let edge_count = fb.pixels.iter().filter(|&&p| p == 2).count();
+        assert!(edge_count > 0, "edge should have drawn pixels");
+    }
+
+    #[test]
+    fn compose_mri_plots_density() {
+        let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16);
+        let mut frame = RenderFrame::with_capacity(16);
+        frame.len = 3;
+        // Three nodes at same spot → density = 3.
+        for i in 0..3 {
+            frame.positions[i * 3] = 20.0;
+            frame.positions[i * 3 + 1] = 20.0;
+        }
+        compose_mri(&mut fb, &frame, 1.0, (0.0, 0.0));
+        assert_eq!(fb.pixels[20 * 64 + 20], 3);
+    }
+
+    #[test]
+    fn wire_bytes_decrease_with_lower_tier() {
+        let full = PaletteTier::Full16.wire_bytes(1024, 768);
+        let mid = PaletteTier::Mid8.wire_bytes(1024, 768);
+        let low = PaletteTier::Low4.wire_bytes(1024, 768);
+        assert!(full > mid, "16-color > 8-color wire");
+        assert!(mid > low, "8-color > 4-color wire");
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Wobble spring — organic node displacement that masks layout jitter.
+//
+// When a node moves (velocity exceeds threshold), wobble energy is
+// injected. It decays exponentially each tick. At render time, wobble
+// is added to the projected position — the node oscillates around its
+// physics-true location. The effect: the graph feels alive, and small
+// layout inaccuracies are hidden behind spring motion.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Per-node wobble state: displacement + decay.
+#[derive(Debug, Clone)]
+pub struct WobbleState {
+    /// Per-node wobble displacement (x, y interleaved; length = 2·capacity).
+    pub displace: Vec<f32>,
+    /// Decay factor per tick [0, 1). 0.92 ≈ an 8-frame half-life.
+    pub decay: f32,
+    /// Velocity threshold: inject wobble when speed exceeds this.
+    pub inject_threshold: f32,
+    /// Injection amplitude: max wobble pixels on injection.
pub amplitude: f32,
+}
+
+impl WobbleState {
+    /// Allocate zeroed wobble state for `capacity` nodes.
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            displace: vec![0.0; capacity * 2],
+            decay: 0.92,
+            inject_threshold: 0.5,
+            amplitude: 3.0,
+        }
+    }
+
+    /// Inject wobble for nodes whose velocity exceeds the threshold,
+    /// then decay all displacements. Call once per tick.
+    pub fn tick(&mut self, velocities: &[f32], len: usize) {
+        // Inject: if |v| > threshold, add random-ish displacement
+        // (use velocity direction × amplitude for deterministic wobble).
+        for i in 0..len {
+            let vx = velocities[i * 3];
+            let vy = velocities[i * 3 + 1];
+            let speed = (vx * vx + vy * vy).sqrt();
+            if speed > self.inject_threshold {
+                // Perpendicular to velocity direction → organic wobble
+                let norm = speed.recip();
+                self.displace[i * 2] += -vy * norm * self.amplitude;
+                self.displace[i * 2 + 1] += vx * norm * self.amplitude;
+            }
+        }
+        // Decay all
+        for d in self.displace.iter_mut() {
+            *d *= self.decay;
+        }
+    }
+
+    /// Get the wobble-adjusted screen position for node `node_idx`.
+    #[inline]
+    pub fn adjust(&self, sx: usize, sy: usize, node_idx: usize) -> (usize, usize) {
+        let dx = self.displace.get(node_idx * 2).copied().unwrap_or(0.0);
+        let dy = self.displace.get(node_idx * 2 + 1).copied().unwrap_or(0.0);
+        (
+            (sx as f32 + dx).max(0.0) as usize,
+            (sy as f32 + dy).max(0.0) as usize,
+        )
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Neuron firing — nodes pulse when the cognitive shader resolves them.
+//
+// fire_intensity[i] ∈ [0, 255]. The shader sets it to 255 on Commit,
+// 200 on Epiphany, 128 on FailureTicket. Each tick it decays by
+// `decay_rate`. The framebuffer maps fire_intensity to a brighter
+// palette index (additive blend).
+// ─────────────────────────────────────────────────────────────────────
+
+/// Per-node fire intensity for visual neuron-pulse feedback.
+#[derive(Debug, Clone)]
+pub struct FireState {
+    /// Intensity per node [0, 255]. 0 = dark, 255 = just fired.
+    pub intensity: Vec<u8>,
+    /// Subtracted per tick. Higher = faster fade.
+    pub decay_rate: u8,
+}
+
+impl FireState {
+    /// Allocate dark (all-zero) fire state for `capacity` nodes.
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            intensity: vec![0u8; capacity],
+            decay_rate: 16,
+        }
+    }
+
+    /// Fire a node at the given intensity (255 = max).
+    #[inline]
+    pub fn fire(&mut self, node_idx: usize, intensity: u8) {
+        if let Some(v) = self.intensity.get_mut(node_idx) {
+            *v = (*v).max(intensity);
+        }
+    }
+
+    /// Decay all intensities by `decay_rate`. Call once per tick.
+    pub fn tick(&mut self) {
+        for v in self.intensity.iter_mut() {
+            *v = v.saturating_sub(self.decay_rate);
+        }
+    }
+
+    /// Map fire intensity to a palette color boost.
+    /// Returns between 0 (no boost) and `palette_max / 2` extra palette indices.
+    #[inline]
+    pub fn color_boost(&self, node_idx: usize, palette_max: u8) -> u8 {
+        let raw = self.intensity.get(node_idx).copied().unwrap_or(0);
+        // Scale [0,255] → [0, palette_max/2] extra indices
+        let boost = (raw as u16 * (palette_max as u16 / 2)) / 255;
+        boost as u8
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Glyph atlas — 5×7 bitmap font for node labels.
+//
+// 128 ASCII slots stored as 5-byte columns (7 rows each). Total atlas:
+// 128 × 5 = 640 bytes — comfortably fits in L1.
+// ─────────────────────────────────────────────────────────────────────
+
+/// 5×7 bitmap glyph for one character. Column-major: glyph[col] has 7 bits (rows).
+pub type Glyph = [u8; 5];
+
+/// Minimal 5×7 ASCII glyph set. Covers A-Z, 0-9, space, common punctuation.
+/// Missing chars render as a filled block. +pub static GLYPH_ATLAS: [Glyph; 128] = { + let mut atlas = [[0x7Fu8; 5]; 128]; // default = filled block + // Space + atlas[b' ' as usize] = [0, 0, 0, 0, 0]; + // Digits 0-9 + atlas[b'0' as usize] = [0x3E, 0x51, 0x49, 0x45, 0x3E]; + atlas[b'1' as usize] = [0x00, 0x42, 0x7F, 0x40, 0x00]; + atlas[b'2' as usize] = [0x62, 0x51, 0x49, 0x49, 0x46]; + atlas[b'3' as usize] = [0x22, 0x41, 0x49, 0x49, 0x36]; + atlas[b'4' as usize] = [0x18, 0x14, 0x12, 0x7F, 0x10]; + atlas[b'5' as usize] = [0x27, 0x45, 0x45, 0x45, 0x39]; + atlas[b'6' as usize] = [0x3C, 0x4A, 0x49, 0x49, 0x30]; + atlas[b'7' as usize] = [0x01, 0x71, 0x09, 0x05, 0x03]; + atlas[b'8' as usize] = [0x36, 0x49, 0x49, 0x49, 0x36]; + atlas[b'9' as usize] = [0x06, 0x49, 0x49, 0x29, 0x1E]; + // Letters A-Z + atlas[b'A' as usize] = [0x7E, 0x09, 0x09, 0x09, 0x7E]; + atlas[b'B' as usize] = [0x7F, 0x49, 0x49, 0x49, 0x36]; + atlas[b'C' as usize] = [0x3E, 0x41, 0x41, 0x41, 0x22]; + atlas[b'D' as usize] = [0x7F, 0x41, 0x41, 0x41, 0x3E]; + atlas[b'E' as usize] = [0x7F, 0x49, 0x49, 0x49, 0x41]; + atlas[b'F' as usize] = [0x7F, 0x09, 0x09, 0x09, 0x01]; + atlas[b'G' as usize] = [0x3E, 0x41, 0x49, 0x49, 0x7A]; + atlas[b'H' as usize] = [0x7F, 0x08, 0x08, 0x08, 0x7F]; + atlas[b'I' as usize] = [0x00, 0x41, 0x7F, 0x41, 0x00]; + atlas[b'J' as usize] = [0x20, 0x40, 0x41, 0x3F, 0x01]; + atlas[b'K' as usize] = [0x7F, 0x08, 0x14, 0x22, 0x41]; + atlas[b'L' as usize] = [0x7F, 0x40, 0x40, 0x40, 0x40]; + atlas[b'M' as usize] = [0x7F, 0x02, 0x0C, 0x02, 0x7F]; + atlas[b'N' as usize] = [0x7F, 0x04, 0x08, 0x10, 0x7F]; + atlas[b'O' as usize] = [0x3E, 0x41, 0x41, 0x41, 0x3E]; + atlas[b'P' as usize] = [0x7F, 0x09, 0x09, 0x09, 0x06]; + atlas[b'Q' as usize] = [0x3E, 0x41, 0x51, 0x21, 0x5E]; + atlas[b'R' as usize] = [0x7F, 0x09, 0x19, 0x29, 0x46]; + atlas[b'S' as usize] = [0x26, 0x49, 0x49, 0x49, 0x32]; + atlas[b'T' as usize] = [0x01, 0x01, 0x7F, 0x01, 0x01]; + atlas[b'U' as usize] = [0x3F, 0x40, 0x40, 0x40, 0x3F]; + atlas[b'V' as usize] = [0x1F, 0x20, 0x40, 0x20, 0x1F]; + atlas[b'W' as usize] = [0x3F, 0x40, 0x38, 0x40, 0x3F]; + atlas[b'X' as usize] = [0x63, 0x14, 0x08, 0x14, 0x63]; + atlas[b'Y' as usize] = [0x03, 0x04, 0x78, 0x04, 0x03]; + atlas[b'Z' as usize] = [0x61, 0x51, 0x49, 0x45, 0x43]; + // Punctuation + atlas[b'.' as usize] = [0x00, 0x60, 0x60, 0x00, 0x00]; + atlas[b'-' as usize] = [0x08, 0x08, 0x08, 0x08, 0x08]; + atlas[b'_' as usize] = [0x40, 0x40, 0x40, 0x40, 0x40]; + atlas[b':' as usize] = [0x00, 0x36, 0x36, 0x00, 0x00]; + atlas +}; + +impl Framebuffer { + /// Blit a text label at (x, y) using the 5×7 glyph atlas. + pub fn draw_label(&mut self, x: usize, y: usize, text: &str, color: u8) { + let mut cx = x; + for ch in text.bytes() { + let idx = (ch as usize).min(127); + let glyph = &GLYPH_ATLAS[idx]; + for col in 0..5 { + let bits = glyph[col]; + for row in 0..7 { + if bits & (1 << row) != 0 { + let px = cx + col; + let py = y + row; + if px < self.width && py < self.height { + self.pixels[py * self.width + px] = color; + } + } + } + } + cx += 6; // 5 pixels + 1 gap + } + let text_w = text.len() * 6; + self.expand_dirty(x, y, (x + text_w).min(self.width), (y + 7).min(self.height)); + } +} + +// ───────────────────────────────────────────────────────────────────── +// Flyby ring buffer — Amiga demo scene trick. +// +// Pre-render N frames of a mathematically computed satellite orbit +// around the graph. Store as a ring of palette_codec-packed framebuffers. 
// During zoom/pan transitions or when the compute budget is spent,
+// play from the ring. The loop is seamless (Lissajous orbit completes
+// one full cycle over N frames). Higher N = smoother apparent frame rate
+// at the cost of memory.
+//
+// 300 frames × 512 KB each (16-color 1024²) = 150 MB.
+// 300 frames × 128 KB each (16-color 512²) = 38 MB — fits L3.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Pre-rendered flyby frame (palette_codec-packed + camera state).
+#[derive(Clone)]
+pub struct FlybyFrame {
+    /// Packed pixel indices (via palette_codec).
+    pub packed: Vec<u64>,
+    /// Bits per pixel used for packing.
+    pub bpp: usize,
+    /// Camera position at this keyframe.
+    pub cam_x: f32,
+    pub cam_y: f32,
+    pub cam_zoom: f32,
+}
+
+/// Ring buffer of pre-rendered flyby keyframes.
+pub struct FlybyCache {
+    pub frames: Vec<FlybyFrame>,
+    /// Current playback position in [0, frames.len()).
+    pub cursor: usize,
+    /// Width/height of pre-rendered frames.
+    pub width: usize,
+    pub height: usize,
+}
+
+impl FlybyCache {
+    /// Pre-render `n_frames` of a Lissajous satellite orbit.
+    ///
+    /// The orbit traces a figure-8 around the graph center, completing
+    /// one full loop over `n_frames`. `orbit_radius` sets the orbit size
+    /// in world units; `zoom_range` controls the min/max camera zoom.
+    pub fn prerender(
+        fb_template: &Framebuffer,
+        frame: &RenderFrame,
+        edges: &[(usize, usize)],
+        n_frames: usize,
+        orbit_radius: f32,
+        zoom_range: (f32, f32),
+        node_color: u8,
+        edge_color: u8,
+    ) -> Self {
+        let mut frames = Vec::with_capacity(n_frames);
+        let w = fb_template.width;
+        let h = fb_template.height;
+        let tier = fb_template.tier;
+
+        for i in 0..n_frames {
+            let t = (i as f32 / n_frames as f32) * std::f32::consts::TAU;
+            // Lissajous: x = A·sin(t), y = A·sin(2t) → figure-8 orbit
+            let cam_x = orbit_radius * t.sin() + (w as f32 / 2.0);
+            let cam_y = orbit_radius * (2.0 * t).sin() + (h as f32 / 2.0);
+            // Zoom oscillates between min and max over the orbit
+            let zoom_t = (t.cos() + 1.0) * 0.5; // [0, 1]
+            let cam_zoom = zoom_range.0 + (zoom_range.1 - zoom_range.0) * zoom_t;
+
+            let mut fb = Framebuffer::with_tier(w, h, tier);
+            compose_neo4j(
+                &mut fb, frame, edges,
+                cam_zoom, (-cam_x * cam_zoom + w as f32 / 2.0,
+                           -cam_y * cam_zoom + h as f32 / 2.0),
+                node_color, edge_color,
+            );
+            let (packed, bpp) = fb.pack();
+            frames.push(FlybyFrame { packed, bpp, cam_x, cam_y, cam_zoom });
+        }
+        Self { frames, cursor: 0, width: w, height: h }
+    }
+
+    /// Advance the cursor and return the next keyframe (looping).
+    pub fn next_frame(&mut self) -> &FlybyFrame {
+        // Read the index before advancing: returning `&self.frames[self.cursor]`
+        // while also assigning to `self.cursor` would not pass the borrow checker.
+        let idx = self.cursor;
+        self.cursor = (self.cursor + 1) % self.frames.len();
+        &self.frames[idx]
+    }
+
+    /// Seek to the keyframe closest to the given camera position.
+    /// Used when transitioning from interactive mode back to flyby.
+    pub fn seek_nearest(&mut self, cam_x: f32, cam_y: f32) {
+        let mut best_dist = f32::MAX;
+        let mut best_idx = 0;
+        for (i, f) in self.frames.iter().enumerate() {
+            let dx = f.cam_x - cam_x;
+            let dy = f.cam_y - cam_y;
+            let d = dx * dx + dy * dy;
+            if d < best_dist {
+                best_dist = d;
+                best_idx = i;
+            }
+        }
+        self.cursor = best_idx;
+    }
+
+    /// Total memory used by the cache.
+    pub fn memory_bytes(&self) -> usize {
+        self.frames.iter().map(|f| f.packed.len() * 8).sum()
+    }
+
+    /// Frame count.
+ pub fn len(&self) -> usize { self.frames.len() } + + pub fn is_empty(&self) -> bool { self.frames.is_empty() } +} + +// ───────────────────────────────────────────────────────────────────── +// Full composition: wobble + fire + labels + Neo4j view +// ───────────────────────────────────────────────────────────────────── + +/// Full Neo4j-style compose with wobble, neuron fire, and labels. +pub fn compose_neo4j_full( + fb: &mut Framebuffer, + frame: &RenderFrame, + edges: &[(usize, usize)], + scale: f32, + offset: (f32, f32), + wobble: &WobbleState, + fire: &FireState, + labels: &[&str], + node_base_color: u8, + edge_color: u8, + label_color: u8, +) { + fb.clear(); + let w = fb.width; + let h = fb.height; + let pal_max = (fb.tier.palette_size() - 1) as u8; + + // 1. Edges (drawn first so nodes overdraw). + for &(src, tgt) in edges { + if src >= frame.len || tgt >= frame.len { continue; } + let (sx0, sy0) = project_ortho( + frame.positions[src * 3], frame.positions[src * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (sx1, sy1) = project_ortho( + frame.positions[tgt * 3], frame.positions[tgt * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (wx0, wy0) = wobble.adjust(sx0, sy0, src); + let (wx1, wy1) = wobble.adjust(sx1, sy1, tgt); + fb.draw_line(wx0 as i32, wy0 as i32, wx1 as i32, wy1 as i32, edge_color); + } + + // 2. Nodes as dot sprites with fire boost. + for i in 0..frame.len { + let (sx, sy) = project_ortho( + frame.positions[i * 3], frame.positions[i * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (wx, wy) = wobble.adjust(sx, sy, i); + let boost = fire.color_boost(i, pal_max); + let color = (node_base_color + boost).min(pal_max); + fb.plot_dot(wx, wy, color); + } + + // 3. Labels (drawn last so text is on top). + for (i, &label) in labels.iter().enumerate().take(frame.len) { + if label.is_empty() { continue; } + let (sx, sy) = project_ortho( + frame.positions[i * 3], frame.positions[i * 3 + 1], + scale, offset.0, offset.1, w, h, + ); + let (wx, wy) = wobble.adjust(sx, sy, i); + let label_y = wy + fb.tier.sprite_size() / 2 + 1; + fb.draw_label(wx.saturating_sub(label.len() * 3), label_y, label, label_color); + } +} + +#[cfg(test)] +mod visual_tests { + use super::*; + use crate::hpc::renderer::RenderFrame; + + #[test] + fn wobble_decays_toward_zero() { + let mut w = WobbleState::new(4); + w.displace[0] = 10.0; + w.displace[1] = -5.0; + let vels = vec![0.0f32; 12]; // no new injection + for _ in 0..200 { + w.tick(&vels, 4); + } + assert!(w.displace[0].abs() < 0.01, "got {}", w.displace[0]); + assert!(w.displace[1].abs() < 0.01, "got {}", w.displace[1]); + } + + #[test] + fn wobble_injects_on_high_velocity() { + let mut w = WobbleState::new(2); + let vels = vec![10.0, 0.0, 0.0, 0.0, 0.0, 0.0]; + w.tick(&vels, 1); + // Perpendicular to (10, 0) → displacement in Y + assert!(w.displace[1].abs() > 0.1); + } + + #[test] + fn fire_decays_to_zero() { + let mut f = FireState::new(4); + f.fire(0, 255); + assert_eq!(f.intensity[0], 255); + for _ in 0..20 { + f.tick(); + } + assert_eq!(f.intensity[0], 0); + } + + #[test] + fn fire_color_boost_scales_with_intensity() { + let mut f = FireState::new(4); + f.fire(0, 255); + let boost_full = f.color_boost(0, 15); + assert!(boost_full > 0); + f.intensity[0] = 0; + let boost_zero = f.color_boost(0, 15); + assert_eq!(boost_zero, 0); + } + + #[test] + fn draw_label_renders_pixels() { + let mut fb = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + fb.draw_label(4, 4, "AB", 5); + let lit: usize = fb.pixels.iter().filter(|&&p| p == 
5).count(); + assert!(lit > 10, "A+B glyphs should light at least 10 pixels"); + } + + #[test] + fn flyby_cache_loops_seamlessly() { + let mut frame = RenderFrame::with_capacity(16); + frame.len = 2; + frame.positions[0] = 10.0; frame.positions[1] = 10.0; + frame.positions[3] = 20.0; frame.positions[4] = 20.0; + let edges = vec![(0, 1)]; + let fb_template = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + let mut cache = FlybyCache::prerender( + &fb_template, &frame, &edges, 8, 10.0, (0.5, 2.0), 5, 2, + ); + assert_eq!(cache.len(), 8); + // Play through more than one loop — should not panic. + for _ in 0..20 { + let _ = cache.next_frame(); + } + // Cursor wraps around. + assert_eq!(cache.cursor, 20 % 8); + } + + #[test] + fn flyby_seek_nearest_finds_closest_frame() { + let mut frame = RenderFrame::with_capacity(16); + frame.len = 1; + frame.positions[0] = 32.0; frame.positions[1] = 32.0; + let fb_template = Framebuffer::with_tier(64, 64, PaletteTier::Full16); + let mut cache = FlybyCache::prerender( + &fb_template, &frame, &[], 16, 10.0, (1.0, 1.0), 5, 2, + ); + cache.seek_nearest(32.0, 32.0); + let f = &cache.frames[cache.cursor]; + let dx = f.cam_x - 32.0; + let dy = f.cam_y - 32.0; + assert!((dx * dx + dy * dy).sqrt() < 20.0); + } + + #[test] + fn compose_neo4j_full_with_wobble_fire_labels() { + let mut fb = Framebuffer::with_tier(128, 128, PaletteTier::Full16); + let mut frame = RenderFrame::with_capacity(16); + frame.len = 2; + frame.positions[0] = 30.0; frame.positions[1] = 30.0; + frame.positions[3] = 90.0; frame.positions[4] = 90.0; + let edges = vec![(0, 1)]; + let wobble = WobbleState::new(16); + let mut fire = FireState::new(16); + fire.fire(0, 255); + let labels = vec!["NODE0", "NODE1"]; + compose_neo4j_full( + &mut fb, &frame, &edges, 1.0, (0.0, 0.0), + &wobble, &fire, &labels, 3, 1, 7, + ); + // Node 0 should be brighter (fire boost) than base color 3. + let node0_pixel = fb.pixels[30 * 128 + 30]; + assert!(node0_pixel >= 3, "node0 should have at least base color"); + // Label pixels should exist at color 7. + let label_count = fb.pixels.iter().filter(|&&p| p == 7).count(); + assert!(label_count > 0, "labels should render"); + } +} + +// ───────────────────────────────────────────────────────────────────── +// Pyramid shader — heat diffusion through the cache-aligned pyramid. +// +// The inverse Stufenpyramide IS a GPU shader pipeline: +// L1 (64²) → 4 KB → registers / L0 cache +// L2 (256²) → 64 KB → L1 data cache +// L3 (4K²) → 2 MB → L2 cache (bit) / 16 MB (byte) +// L4 (16K²) → 32 MB → L3 cache +// +// A perturbation enters at L1, diffuses at each level, then upscales +// 4× to the next. Each level physically runs in its matching CPU cache. +// The viewer sees cognition ripple through the hardware. +// ───────────────────────────────────────────────────────────────────── + +/// 3×3 box-blur diffusion: each pixel = average of itself + 8 neighbors. +/// In-place via double buffer (src → dst, then swap pointers). +/// Palette-safe: result is clamped to [0, max_palette]. 
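+///
+/// A minimal double-buffer sketch (`ignore`d doc test):
+///
+/// ```ignore
+/// let src = vec![0u8; 64 * 64];
+/// let mut dst = vec![0u8; 64 * 64];
+/// diffuse_step(&src, &mut dst, 64, 64, 15);
+/// // next step: swap the roles of src and dst and diffuse back
+/// ```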
pub fn diffuse_step(
+    src: &[u8], dst: &mut [u8],
+    width: usize, height: usize,
+    max_palette: u8,
+) {
+    for y in 0..height {
+        for x in 0..width {
+            let mut sum: u16 = 0;
+            let mut count: u16 = 0;
+            for dy in -1i32..=1 {
+                for dx in -1i32..=1 {
+                    let nx = x as i32 + dx;
+                    let ny = y as i32 + dy;
+                    if nx >= 0 && ny >= 0 && (nx as usize) < width && (ny as usize) < height {
+                        sum += src[ny as usize * width + nx as usize] as u16;
+                        count += 1;
+                    }
+                }
+            }
+            dst[y * width + x] = ((sum / count) as u8).min(max_palette);
+        }
+    }
+}
+
+/// Upscale 2× via nearest-neighbor (L_n → L_{n+1}).
+pub fn upscale_2x(src: &[u8], src_w: usize, src_h: usize) -> (Vec<u8>, usize, usize) {
+    let dst_w = src_w * 2;
+    let dst_h = src_h * 2;
+    let mut dst = vec![0u8; dst_w * dst_h];
+    for sy in 0..src_h {
+        for sx in 0..src_w {
+            let v = src[sy * src_w + sx];
+            let dy = sy * 2;
+            let dx = sx * 2;
+            dst[dy * dst_w + dx] = v;
+            dst[dy * dst_w + dx + 1] = v;
+            dst[(dy + 1) * dst_w + dx] = v;
+            dst[(dy + 1) * dst_w + dx + 1] = v;
+        }
+    }
+    (dst, dst_w, dst_h)
+}
+
+/// Four-level pyramid shader state.
+///
+/// Each level is a framebuffer at its native resolution. `tick()` runs
+/// one diffusion step at each level, then upscales L1→L2→L3→L4.
+/// Inject heat at L1 via `inject(x, y, intensity)`.
+pub struct PyramidShader {
+    /// L1: 64×64 (4 KB).
+    pub l1: Vec<u8>,
+    /// L2: 256×256 (64 KB).
+    pub l2: Vec<u8>,
+    /// L3: 1024×1024 (1 MB) — scaled down from 4K for practical display.
+    pub l3: Vec<u8>,
+    /// L4: 2048×2048 (4 MB) — the output surface.
+    pub l4: Vec<u8>,
+    /// Scratch buffer for double-buffer diffusion (same size as L4).
+    scratch: Vec<u8>,
+    /// Palette max (from tier).
+    pub palette_max: u8,
+    /// Tick counter.
+    pub tick: u64,
+}
+
+impl PyramidShader {
+    /// Allocate a zeroed four-level pyramid clamped to `palette_max`.
+    pub fn new(palette_max: u8) -> Self {
+        Self {
+            l1: vec![0u8; 64 * 64],
+            l2: vec![0u8; 256 * 256],
+            l3: vec![0u8; 1024 * 1024],
+            l4: vec![0u8; 2048 * 2048],
+            scratch: vec![0u8; 2048 * 2048],
+            palette_max,
+            tick: 0,
+        }
+    }
+
+    /// Inject heat at L1 coordinates (0..64, 0..64).
+    pub fn inject(&mut self, x: usize, y: usize, intensity: u8) {
+        if x < 64 && y < 64 {
+            self.l1[y * 64 + x] = self.l1[y * 64 + x].saturating_add(intensity).min(self.palette_max);
+        }
+    }
+
+    /// One shader tick: diffuse each level, then cascade upward.
+    ///
+    /// This IS the cognitive shader made visible. Each level physically
+    /// fits its CPU cache tier. The 4× widening at each step IS the
+    /// cache hierarchy doubling pattern.
+    pub fn tick(&mut self) {
+        // 1. Diffuse at each level independently.
+        //    L1: 64² = 4 KB → runs in registers / L0.
+        let mut scratch_l1 = vec![0u8; 64 * 64];
+        diffuse_step(&self.l1, &mut scratch_l1, 64, 64, self.palette_max);
+        self.l1.copy_from_slice(&scratch_l1);
+
+        //    L2: 256² = 64 KB → runs in L1 data cache.
+        let mut scratch_l2 = vec![0u8; 256 * 256];
+        diffuse_step(&self.l2, &mut scratch_l2, 256, 256, self.palette_max);
+        self.l2.copy_from_slice(&scratch_l2);
+
+        //    L3: 1024² = 1 MB → runs in L2 cache.
+        let mut scratch_l3 = vec![0u8; 1024 * 1024];
+        diffuse_step(&self.l3, &mut scratch_l3, 1024, 1024, self.palette_max);
+        self.l3.copy_from_slice(&scratch_l3);
+
+        // 2. Cascade: L1 upscales into L2, L2 into L3, L3 into L4.
+        //    Additive blend (saturating) so existing diffusion + upscaled signal combine.
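+        //    L1→L2 and L2→L3 are 4× edge jumps, so each needs two 2×
+        //    nearest-neighbor upscales (64² → 128² → 256²); L3→L4
+        //    (1024² → 2048²) needs only one.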
+ let (up1, _, _) = upscale_2x(&self.l1, 64, 64); // 128² + let (up1b, _, _) = upscale_2x(&up1, 128, 128); // 256² + for (dst, src) in self.l2.iter_mut().zip(up1b.iter()) { + *dst = dst.saturating_add(*src).min(self.palette_max); + } + + let (up2, _, _) = upscale_2x(&self.l2, 256, 256); // 512² + let (up2b, _, _) = upscale_2x(&up2, 512, 512); // 1024² + for (dst, src) in self.l3.iter_mut().zip(up2b.iter()) { + *dst = dst.saturating_add(*src).min(self.palette_max); + } + + let (up3, _, _) = upscale_2x(&self.l3, 1024, 1024); // 2048² + for (dst, src) in self.l4.iter_mut().zip(up3.iter()) { + *dst = dst.saturating_add(*src).min(self.palette_max); + } + + // 3. Global decay on L4 (prevents saturation). + for v in self.l4.iter_mut() { + *v = v.saturating_sub(1); + } + + self.tick += 1; + } + + /// Compose a 2×2 panel view of all four levels into a framebuffer. + /// + /// Top-left = L1 (upscaled to panel size), top-right = L2, + /// bottom-left = L3, bottom-right = L4. Each panel is `pw × ph`. + pub fn compose_quad_view(&self, fb: &mut Framebuffer) { + let pw = fb.width / 2; + let ph = fb.height / 2; + + // L1 → top-left (upscale from 64² to pw×ph) + blit_scaled(&self.l1, 64, 64, fb, 0, 0, pw, ph); + // L2 → top-right (upscale from 256² to pw×ph) + blit_scaled(&self.l2, 256, 256, fb, pw, 0, pw, ph); + // L3 → bottom-left (downscale from 1024² to pw×ph) + blit_scaled(&self.l3, 1024, 1024, fb, 0, ph, pw, ph); + // L4 → bottom-right (downscale from 2048² to pw×ph) + blit_scaled(&self.l4, 2048, 2048, fb, pw, ph, pw, ph); + + fb.dirty = (0, 0, fb.width, fb.height); + } + + /// Memory footprint across all levels. + pub fn memory_bytes(&self) -> usize { + self.l1.len() + self.l2.len() + self.l3.len() + self.l4.len() + self.scratch.len() + } +} + +/// Nearest-neighbor scale-blit from src (src_w × src_h) into a region +/// of the framebuffer at (dst_x, dst_y) with size (dst_w × dst_h). +fn blit_scaled( + src: &[u8], src_w: usize, src_h: usize, + fb: &mut Framebuffer, + dst_x: usize, dst_y: usize, + dst_w: usize, dst_h: usize, +) { + for dy in 0..dst_h { + let sy = (dy * src_h) / dst_h; + for dx in 0..dst_w { + let sx = (dx * src_w) / dst_w; + let px = dst_x + dx; + let py = dst_y + dy; + if px < fb.width && py < fb.height && sy < src_h && sx < src_w { + fb.pixels[py * fb.width + px] = src[sy * src_w + sx]; + } + } + } +} + +#[cfg(test)] +mod pyramid_tests { + use super::*; + + #[test] + fn pyramid_shader_inject_and_tick() { + let mut ps = PyramidShader::new(15); + ps.inject(32, 32, 15); + assert_eq!(ps.l1[32 * 64 + 32], 15); + ps.tick(); + // After one tick, heat should have diffused to neighbors at L1 + // and cascaded to L2/L3/L4. + assert!(ps.l1[32 * 64 + 33] > 0, "L1 should diffuse right"); + assert!(ps.l2[128 * 256 + 128] > 0, "L2 should receive cascade"); + } + + #[test] + fn pyramid_shader_decays_to_zero() { + let mut ps = PyramidShader::new(15); + ps.inject(32, 32, 15); + for _ in 0..200 { + ps.tick(); + } + let l4_max = ps.l4.iter().copied().max().unwrap_or(0); + assert_eq!(l4_max, 0, "L4 should decay to zero after enough ticks"); + } + + #[test] + fn pyramid_shader_compose_quad_view() { + let mut ps = PyramidShader::new(15); + ps.inject(32, 32, 15); + ps.tick(); + let mut fb = Framebuffer::with_tier(128, 128, PaletteTier::Full16); + ps.compose_quad_view(&mut fb); + // Top-left panel (L1 upscaled) should have nonzero pixels. 
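+        // (The slice [..64 * 128] is the first 64 rows — the whole top
+        // half of the quad view — which is where the L1 panel lives.)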
+        let tl_sum: u32 = fb.pixels[..64 * 128].iter().map(|&v| v as u32).sum();
+        assert!(tl_sum > 0, "L1 panel should show the injection");
+    }
+
+    #[test]
+    fn pyramid_shader_memory_footprint() {
+        let ps = PyramidShader::new(15);
+        // L1=4K + L2=64K + L3=1M + L4=4M + scratch=4M ≈ 9.07 MB
+        assert!(ps.memory_bytes() > 5_000_000);
+        assert!(ps.memory_bytes() < 20_000_000);
+    }
+
+    #[test]
+    fn upscale_2x_doubles_dimensions() {
+        let src = vec![5u8; 8 * 8];
+        let (dst, w, h) = upscale_2x(&src, 8, 8);
+        assert_eq!(w, 16);
+        assert_eq!(h, 16);
+        assert!(dst.iter().all(|&v| v == 5));
+    }
+
+    #[test]
+    fn diffuse_step_smooths_spike() {
+        let mut src = vec![0u8; 16 * 16];
+        src[8 * 16 + 8] = 15; // single hot pixel
+        let mut dst = vec![0u8; 16 * 16];
+        diffuse_step(&src, &mut dst, 16, 16, 15);
+        // Center should have decreased (averaged with zero neighbors).
+        assert!(dst[8 * 16 + 8] < 15);
+        // At least one neighbor should be nonzero.
+        let neighbor_sum: u16 = [
+            dst[7 * 16 + 8], dst[9 * 16 + 8],
+            dst[8 * 16 + 7], dst[8 * 16 + 9],
+        ].iter().map(|&v| v as u16).sum();
+        assert!(neighbor_sum > 0, "diffusion should spread to neighbors");
+    }
+}
diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs
index 3d41d5f8..f7106c72 100644
--- a/src/hpc/mod.rs
+++ b/src/hpc/mod.rs
@@ -227,6 +227,8 @@ pub mod jitson;
 pub mod jitson_cranelift;
 pub mod ocr_simd;
 pub mod ocr_felt;
+pub mod renderer;
+pub mod framebuffer;
 /// Audio primitives: MDCT, band energies, PVQ, AudioFrame codec.
 /// Transcoded from Opus CELT for the HHTL cascade → waveform pipeline.
 pub mod audio;
diff --git a/src/hpc/ocr_felt.rs b/src/hpc/ocr_felt.rs
index 72e984d7..1742da6b 100644
--- a/src/hpc/ocr_felt.rs
+++ b/src/hpc/ocr_felt.rs
@@ -23,10 +23,12 @@ const SKEW_FLOOR: f64 = EULER_GAMMA / (EULER_GAMMA + 1.0);
 /// A glyph's felt identity: 17 dimensions capturing shape qualia.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub struct GlyphBase17 {
+    /// 17 i16 dimensions encoding the glyph's shape.
     pub dims: [i16; 17],
 }

 impl GlyphBase17 {
+    /// All-zero glyph (used as a sentinel / default).
     pub const ZERO: Self = Self { dims: [0i16; 17] };

     /// Project a binary glyph patch to 17D via golden-step folding.
@@ -67,6 +69,7 @@ impl GlyphBase17 {
 /// Character codebook: 256 entries mapping u8 → (char, GlyphBase17).
 pub struct CharCodebook {
+    /// 256 codebook slots; each holds a character and its Base17 fingerprint.
     pub entries: [(char, GlyphBase17); 256],
 }

@@ -268,8 +271,11 @@ pub fn fast_skew_check(bin: &BinaryImage) -> SkewResult {
 /// Skew detection result.
 #[derive(Debug, Clone, Copy)]
 pub struct SkewResult {
+    /// Detected skew angle in radians.
     pub angle: f32,
+    /// Confidence in the result (0.0 - 1.0).
     pub confidence: f32,
+    /// Whether a full search was performed (vs. fast path skipped).
     pub searched: bool,
 }
diff --git a/src/hpc/ocr_simd.rs b/src/hpc/ocr_simd.rs
index a7b87224..753b6967 100644
--- a/src/hpc/ocr_simd.rs
+++ b/src/hpc/ocr_simd.rs
@@ -17,8 +17,11 @@ use crate::simd::{F32x16, U8x64};
 /// Grayscale image as flat row-major `&[u8]`.
 /// Width × Height pixels, one byte per pixel (0=black, 255=white).
 pub struct GrayImage<'a> {
+    /// Pixel bytes, row-major.
     pub data: &'a [u8],
+    /// Image width in pixels.
     pub width: usize,
+    /// Image height in pixels.
     pub height: usize,
 }

@@ -26,8 +29,11 @@
 /// Each byte stores 8 pixels (bit-packed, MSB = leftmost).
 #[derive(Debug)]
 pub struct BinaryImage {
+    /// Bit-packed pixels (8 pixels per byte, MSB = leftmost).
     pub bits: Vec<u8>,
+    /// Image width in pixels.
    pub width: usize,
+    /// Image height in pixels.
     pub height: usize,
 }
diff --git a/src/hpc/renderer.rs b/src/hpc/renderer.rs
new file mode 100644
index 00000000..242560ff
--- /dev/null
+++ b/src/hpc/renderer.rs
@@ -0,0 +1,953 @@
+//! SIMD-accelerated double-buffer renderer for SPO graph visualization.
+//!
+//! This is the hardware-acceleration mothership for q2 cockpit / Palantir
+//! Gotham / Neo4j-style visual rendering. Per-tier dispatch via the
+//! `crate::simd` polyfill — AVX-512 / AVX2 / AMX / NEON / scalar fallback,
+//! all transparent to the consumer. Same pattern as `hpc::vsa`.
+//!
+//! # Architecture
+//!
+//! ```text
+//! front: LazyLock<RwLock<RenderFrame>> ← readers (REST/SSE) read here
+//! back:  LazyLock<RwLock<RenderFrame>> ← shader cycle writes here
+//!
+//! tick(dt):
+//!   1. integrate forces into back-buffer (F32x16 mul_add fused multiply-add)
+//!   2. atomic swap front↔back via AtomicUsize index
+//!   3. readers pick up new frame on next .read()
+//! ```
+//!
+//! # SIMD dispatch
+//!
+//! All hot-path math (force accumulation, position integration, fingerprint
+//! similarity) uses `crate::simd::{F32x16, F64x8, U8x64}` which compile-time
+//! routes to:
+//!
+//! | Tier             | F32 lanes | FMA path           |
+//! |------------------|-----------|--------------------|
+//! | x86 AVX-512      | 16        | `_mm512_fmadd_ps`  |
+//! | x86 AVX2         | 8         | `_mm256_fmadd_ps`  |
+//! | x86 AMX          | 16+tile   | `_tile_dpbf16ps`   |
+//! | aarch64 NEON     | 4         | `vfmaq_f32`        |
+//! | scalar fallback  | 16 (loop) | `f32::mul_add`     |
+//!
+//! Consumer writes `crate::simd::F32x16`. The polyfill picks the path.
+//!
+//! # Frame layout (SoA, 64-byte aligned)
+//!
+//! - `positions: Vec<f32>` — flat x0,y0,z0,x1,y1,z1,… (3·N floats)
+//! - `velocities: Vec<f32>` — same shape, integrated each tick
+//! - `charges: Vec<f32>` — repulsion strength per node (Coulomb-like)
+//! - `fingerprints: Vec<u64>` — VSA_WORDS·N (16384-bit per node)
+//!
+//! All capacities are padded to a multiple of 16 floats (≥
+//! `PREFERRED_F32_LANES`) so `F32x16` passes never hit a scalar tail.
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+use std::sync::{LazyLock, RwLock};
+
+use crate::hpc::vsa::VSA_WORDS;
+use crate::simd::{F32x16, PREFERRED_F32_LANES};
+
+/// Number of f32 components per node position (3D = x,y,z).
+pub const POSITION_DIMS: usize = 3;
+
+/// Round `n` up to the nearest multiple of `lanes` so SIMD passes never
+/// hit a scalar tail. Always returns ≥ `n`.
+#[inline]
+pub const fn pad_to_lanes(n: usize, lanes: usize) -> usize {
+    (n + lanes - 1) / lanes * lanes
+}
+
+/// One frame of render state — Structure-of-Arrays, 64-byte aligned.
+///
+/// Allocated capacity is padded so every component buffer is a multiple
+/// of `PREFERRED_F32_LANES` floats / `VSA_WORDS` u64. The active node
+/// count is tracked in `len`; trailing slots are zero-padded and ignored
+/// by the renderer but still SIMD-aligned for the loop bound.
+#[derive(Debug, Clone)]
+pub struct RenderFrame {
+    /// Active node count (≤ capacity).
+    pub len: usize,
+    /// Padded capacity (multiple of PREFERRED_F32_LANES).
+    pub capacity: usize,
+    /// Flat 3D positions: x0,y0,z0,x1,y1,z1,… length = 3·capacity.
+    pub positions: Vec<f32>,
+    /// Flat 3D velocities, same shape as positions.
+    pub velocities: Vec<f32>,
+    /// Per-node repulsion charge (length = capacity).
+    pub charges: Vec<f32>,
+    /// Per-node VSA fingerprint (length = VSA_WORDS·capacity).
+    pub fingerprints: Vec<u64>,
+    /// Logical tick number when this frame was last written.
+
+impl RenderFrame {
+    /// Allocate an empty frame with capacity for `n` nodes (rounded up
+    /// to the SIMD chunk width).
+    pub fn with_capacity(n: usize) -> Self {
+        // Pad to a multiple of at least 16 so `as_chunks_mut::<16>()` in the
+        // integrate hot path never leaves a remainder, even on tiers where
+        // PREFERRED_F32_LANES is narrower than 16.
+        let lanes = if PREFERRED_F32_LANES > 16 { PREFERRED_F32_LANES } else { 16 };
+        let capacity = pad_to_lanes(n, lanes);
+        Self {
+            len: 0,
+            capacity,
+            positions: vec![0.0; POSITION_DIMS * capacity],
+            velocities: vec![0.0; POSITION_DIMS * capacity],
+            charges: vec![0.0; capacity],
+            fingerprints: vec![0u64; VSA_WORDS * capacity],
+            tick: 0,
+        }
+    }
+
+    /// Total bytes resident for this frame (debug / health).
+    pub fn byte_footprint(&self) -> usize {
+        self.positions.len() * 4
+            + self.velocities.len() * 4
+            + self.charges.len() * 4
+            + self.fingerprints.len() * 8
+    }
+}
+
+impl Default for RenderFrame {
+    fn default() -> Self {
+        Self::with_capacity(0)
+    }
+}
+
+/// Double-buffered renderer with atomic front/back swap.
+///
+/// Two pre-allocated `RenderFrame`s live in `frames[0]` / `frames[1]`.
+/// `front_idx` (0 or 1) names the frame readers see; the back frame
+/// is `1 - front_idx`. `swap()` flips the index — atomic, no allocation.
+///
+/// Readers acquire a read lock on the FRONT frame; the shader cycle
+/// acquires a write lock on the BACK frame. They never contend.
+pub struct Renderer {
+    /// Two pre-allocated frames (front + back).
+    pub frames: [RwLock<RenderFrame>; 2],
+    /// Index of the frame currently visible to readers.
+    front_idx: AtomicUsize,
+    /// Monotonic tick counter.
+    tick_count: AtomicU64,
+}
+
+impl Renderer {
+    /// Allocate a renderer with capacity for `n` nodes per frame.
+    pub fn with_capacity(n: usize) -> Self {
+        Self {
+            frames: [
+                RwLock::new(RenderFrame::with_capacity(n)),
+                RwLock::new(RenderFrame::with_capacity(n)),
+            ],
+            front_idx: AtomicUsize::new(0),
+            tick_count: AtomicU64::new(0),
+        }
+    }
+
+    /// Index of the currently-front frame (0 or 1).
+    #[inline]
+    pub fn front_index(&self) -> usize {
+        self.front_idx.load(Ordering::Acquire)
+    }
+
+    /// Index of the currently-back frame (1 - front_idx).
+    #[inline]
+    pub fn back_index(&self) -> usize {
+        1 - self.front_index()
+    }
+
+    /// Read-lock the front frame (for REST / SSE consumers).
+    pub fn read_front(&self) -> std::sync::RwLockReadGuard<'_, RenderFrame> {
+        self.frames[self.front_index()].read().expect("front lock poisoned")
+    }
+
+    /// Write-lock the back frame (for the shader cycle to mutate).
+    pub fn write_back(&self) -> std::sync::RwLockWriteGuard<'_, RenderFrame> {
+        self.frames[self.back_index()].write().expect("back lock poisoned")
+    }
+
+    /// Atomically swap front and back. Readers acquired BEFORE the swap
+    /// keep observing the old front; subsequent readers see the new front.
+    pub fn swap(&self) {
+        // XOR-flip via fetch_xor — single atomic write.
+        self.front_idx.fetch_xor(1, Ordering::AcqRel);
+    }
+
+    /// Current tick count (monotonically increasing across `tick()` calls).
+    #[inline]
+    pub fn tick_count(&self) -> u64 {
+        self.tick_count.load(Ordering::Acquire)
+    }
+
+    /// Advance physics by `dt` seconds and swap buffers.
+    ///
+    /// Hot path: SIMD-FMA velocity integration over the BACK frame, then
+    /// atomic swap. Friction `damping ∈ [0,1]` is applied per axis.
+    pub fn tick(&self, dt: f32, damping: f32) {
+        {
+            let mut back = self.write_back();
+            let RenderFrame { positions, velocities, tick, .. } = &mut *back;
+            integrate_simd(positions, velocities, dt, damping);
+            *tick = self.tick_count.load(Ordering::Acquire) + 1;
+        }
+        self.swap();
+        self.tick_count.fetch_add(1, Ordering::AcqRel);
+    }
+}
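The intended concurrency split, as a hedged usage sketch (assumes an `Arc`-shared `Renderer`; the loop shape and names are illustrative, not from the patch):

```rust
use std::sync::Arc;

// Illustration only: one writer thread ticks the back buffer while any
// number of REST/SSE handlers read the front. tick() ends with swap(),
// so a read_front() taken after tick() observes the frame just written.
fn render_thread(r: Arc<Renderer>) {
    loop {
        r.tick(1.0 / 60.0, 0.95);   // integrate BACK, then atomic swap
        let front = r.read_front(); // cheap read lock on the new FRONT
        let _n = front.len;         // e.g. serialize front.positions[..3 * front.len]
        drop(front);                // release the guard before the next tick
    }
}
```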
+
+impl Default for Renderer {
+    fn default() -> Self {
+        Self::with_capacity(0)
+    }
+}
+
+/// Process-global default renderer — single LazyLock-initialized instance.
+///
+/// Capacity is bootstrapped at 4096 nodes (rounded up to PREFERRED_F32_LANES).
+/// Consumers wanting a different capacity should construct their own
+/// `Renderer::with_capacity(...)` in their binary, not touch this static.
+pub static GLOBAL_RENDERER: LazyLock<Renderer> =
+    LazyLock::new(|| Renderer::with_capacity(4096));
+
+// ─────────────────────────────────────────────────────────────────────
+// SIMD hot path — integrate_simd dispatches via crate::simd::F32x16
+// which compile-time routes to AVX-512 / AVX2 / AMX / NEON / scalar.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Integrate positions += velocities·dt then apply damping, in SIMD chunks.
+///
+/// Uses `slice::as_chunks_mut::<16>()` for SIMD slicing — the array-window
+/// pattern documented in `crate::simd`. Both buffers are guaranteed to be
+/// multiples of the 16-float chunk (enforced by `RenderFrame::with_capacity`),
+/// so the remainder slice is empty and there's no scalar tail.
+///
+/// One pass = one fused multiply-add per lane:
+///   `position = velocity * dt + position`
+///   `velocity = velocity * damping`
+#[inline]
+pub fn integrate_simd(positions: &mut [f32], velocities: &mut [f32], dt: f32, damping: f32) {
+    debug_assert_eq!(positions.len(), velocities.len());
+    debug_assert_eq!(positions.len() % PREFERRED_F32_LANES, 0);
+
+    let dt_v = cached_splat(dt);
+    let damping_v = F32x16::splat(damping);
+
+    // SIMD slicing via stable as_chunks_mut::<16>(). The remainder is
+    // empty by construction (capacity is padded to the 16-float chunk).
+    let (p_chunks, p_tail) = positions.as_chunks_mut::<16>();
+    let (v_chunks, v_tail) = velocities.as_chunks_mut::<16>();
+    debug_assert!(p_tail.is_empty() && v_tail.is_empty());
+
+    for (p, v) in p_chunks.iter_mut().zip(v_chunks.iter_mut()) {
+        let pv = F32x16::from_array(*p);
+        let vv = F32x16::from_array(*v);
+        // FMA: position = velocity * dt + position
+        let p_new = vv.mul_add(dt_v, pv);
+        // Damping: velocity *= damping (one mul, no FMA needed)
+        let v_new = vv * damping_v;
+        p_new.copy_to_slice(p);
+        v_new.copy_to_slice(v);
+    }
+}
+
+/// Apply a uniform per-axis force to every node's velocity (e.g. gravity).
+/// `force` is `[fx, fy, fz]` accelerated by `dt`.
+///
+/// Per element: `velocity[axis] = force[axis] * dt + velocity[axis]`.
+///
+/// Scalar for now: 16 isn't a multiple of 3, so a single F32x16 pass would
+/// need a pre-tiled [fx,fy,fz,…] force pattern with a 48-float period. A
+/// future optimisation can reshape velocities to xs/ys/zs SoA and run one
+/// SIMD-FMA pass per axis; this initial cut keeps the code simple and correct.
+#[inline]
+pub fn apply_uniform_force(velocities: &mut [f32], force: [f32; 3], dt: f32) {
+    debug_assert_eq!(velocities.len() % PREFERRED_F32_LANES, 0);
+    debug_assert_eq!(velocities.len() % POSITION_DIMS, 0);
+
+    let n_nodes = velocities.len() / POSITION_DIMS;
+    for axis in 0..POSITION_DIMS {
+        for n in 0..n_nodes {
+            let idx = n * POSITION_DIMS + axis;
+            velocities[idx] = force[axis].mul_add(dt, velocities[idx]);
+        }
+    }
+}
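A small sketch showing the intended use of `apply_uniform_force` as a gravity pass before integration; both functions are from this file, but the wrapper itself is hypothetical:

```rust
// Illustration only: pull every node along -Y, then integrate.
fn gravity_tick(r: &Renderer, dt: f32) {
    {
        let mut back = r.write_back();
        let RenderFrame { velocities, .. } = &mut *back;
        apply_uniform_force(velocities, [0.0, -9.8, 0.0], dt);
    } // drop the write guard before tick() re-acquires it
    r.tick(dt, 0.99);
}
```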
+
+/// Per-tier SIMD lane width report — for tests / diagnostics.
+#[inline]
+pub const fn active_lane_width() -> usize {
+    PREFERRED_F32_LANES
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn pad_to_lanes_rounds_up() {
+        assert_eq!(pad_to_lanes(0, 16), 0);
+        assert_eq!(pad_to_lanes(1, 16), 16);
+        assert_eq!(pad_to_lanes(15, 16), 16);
+        assert_eq!(pad_to_lanes(16, 16), 16);
+        assert_eq!(pad_to_lanes(17, 16), 32);
+        assert_eq!(pad_to_lanes(100, 16), 112);
+    }
+
+    #[test]
+    fn frame_capacity_is_simd_aligned() {
+        let f = RenderFrame::with_capacity(100);
+        assert_eq!(f.capacity % PREFERRED_F32_LANES, 0);
+        assert_eq!(f.positions.len() % PREFERRED_F32_LANES, 0);
+        assert_eq!(f.velocities.len() % PREFERRED_F32_LANES, 0);
+        assert_eq!(f.charges.len() % PREFERRED_F32_LANES, 0);
+        // fingerprints: VSA_WORDS·capacity, VSA_WORDS = 256
+        assert_eq!(f.fingerprints.len() / VSA_WORDS, f.capacity);
+    }
+
+    #[test]
+    fn frame_byte_footprint_matches_capacity() {
+        let f = RenderFrame::with_capacity(16);
+        // 16 nodes × (3·4 pos + 3·4 vel + 4 charge + 256·8 fp) = 16 · (12+12+4+2048) = 16 · 2076
+        assert_eq!(f.byte_footprint(), f.capacity * (12 + 12 + 4 + 256 * 8));
+    }
+
+    #[test]
+    fn renderer_swap_flips_index() {
+        let r = Renderer::with_capacity(16);
+        assert_eq!(r.front_index(), 0);
+        assert_eq!(r.back_index(), 1);
+        r.swap();
+        assert_eq!(r.front_index(), 1);
+        assert_eq!(r.back_index(), 0);
+        r.swap();
+        assert_eq!(r.front_index(), 0);
+    }
+
+    #[test]
+    fn integrate_simd_applies_velocity_and_damping() {
+        let mut positions = vec![0.0f32; 16];
+        let mut velocities = vec![1.0f32; 16];
+        integrate_simd(&mut positions, &mut velocities, 0.5, 0.9);
+        // position += v·dt = 0 + 1·0.5 = 0.5
+        for &p in &positions {
+            assert!((p - 0.5).abs() < 1e-6, "p = {p}");
+        }
+        // velocity *= damping = 1 · 0.9 = 0.9
+        for &v in &velocities {
+            assert!((v - 0.9).abs() < 1e-6, "v = {v}");
+        }
+    }
+
+    #[test]
+    fn integrate_simd_handles_multi_chunk() {
+        let mut positions = vec![0.0f32; 64];
+        let mut velocities = vec![2.0f32; 64];
+        integrate_simd(&mut positions, &mut velocities, 0.25, 1.0);
+        for &p in &positions {
+            assert!((p - 0.5).abs() < 1e-6);
+        }
+        for &v in &velocities {
+            assert!((v - 2.0).abs() < 1e-6);
+        }
+    }
+
+    #[test]
+    fn renderer_tick_advances_count_and_swaps() {
+        let r = Renderer::with_capacity(16);
+        let initial_front = r.front_index();
+        let initial_tick = r.tick_count();
+        r.tick(0.016, 0.99); // 60 fps, light damping
+        assert_eq!(r.tick_count(), initial_tick + 1);
+        assert_eq!(r.front_index(), 1 - initial_front);
+    }
+
+    #[test]
+    fn renderer_60_ticks_keep_simd_alignment() {
+        let r = Renderer::with_capacity(1024);
+        for _ in 0..60 {
+            r.tick(1.0 / 60.0, 0.95);
+        }
+        assert_eq!(r.tick_count(), 60);
+        let front = r.read_front();
+        assert_eq!(front.positions.len() % PREFERRED_F32_LANES, 0);
+        assert_eq!(front.velocities.len() % PREFERRED_F32_LANES, 0);
+    }
+
+    #[test]
+    fn apply_uniform_force_accelerates_velocity() {
+        // 16 nodes × 3 axes = 48 floats. 48 = 3×16 → a multiple of 16.
+        let mut velocities = vec![0.0f32; 48];
+        apply_uniform_force(&mut velocities, [1.0, 2.0, 3.0], 0.5);
+        for n in 0..16 {
+            assert!((velocities[n * 3] - 0.5).abs() < 1e-6);     // X: 1·0.5
+            assert!((velocities[n * 3 + 1] - 1.0).abs() < 1e-6); // Y: 2·0.5
+            assert!((velocities[n * 3 + 2] - 1.5).abs() < 1e-6); // Z: 3·0.5
+        }
+    }
+
+    #[test]
+    fn active_lane_width_is_simd_aligned_constant() {
+        let w = active_lane_width();
+        assert!(w == 4 || w == 8 || w == 16);
+        // VSA_DIMS (16384) is divisible by every active tier's lane width.
+        assert_eq!(crate::hpc::vsa::VSA_DIMS % w, 0);
+    }
+
+    #[test]
+    fn global_renderer_starts_at_tick_zero() {
+        let _ = &*GLOBAL_RENDERER;
+        // First-touch: tick count is 0; capacity is at least 4096
+        // (could be greater if PREFERRED_F32_LANES > 16 at some future tier).
+        assert_eq!(GLOBAL_RENDERER.tick_count(), 0);
+        let f = GLOBAL_RENDERER.read_front();
+        assert!(f.capacity >= 4096);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// LazyLock-cached splat constants for the common tick rates.
+//
+// `F32x16::splat(dt)` is one CPU instruction at AVX-512 (`_mm512_set1_ps`)
+// but the renderer ticks at fixed rates 99% of the time, so we cache the
+// three canonical splat values: 60 fps / 30 fps / 15 fps.
+//
+// `cached_splat(dt)` returns the cached vector when `dt` matches one of
+// the canonical rates, falling back to a fresh splat otherwise. Tolerance
+// ±2 µs absorbs floating-point jitter without bypassing the cache.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Tick budget for 60 fps in seconds (1/60).
+pub const DT_60: f32 = 1.0 / 60.0;
+/// Tick budget for 30 fps in seconds (1/30).
+pub const DT_30: f32 = 1.0 / 30.0;
+/// Tick budget for 15 fps in seconds (1/15).
+pub const DT_15: f32 = 1.0 / 15.0;
+
+static SPLAT_60: LazyLock<F32x16> = LazyLock::new(|| F32x16::splat(DT_60));
+static SPLAT_30: LazyLock<F32x16> = LazyLock::new(|| F32x16::splat(DT_30));
+static SPLAT_15: LazyLock<F32x16> = LazyLock::new(|| F32x16::splat(DT_15));
+
+/// Splat `dt` into an `F32x16`, returning a cached value for the canonical
+/// rates (60 / 30 / 15 fps). Falls back to a fresh splat for arbitrary `dt`.
+#[inline]
+pub fn cached_splat(dt: f32) -> F32x16 {
+    const TOL: f32 = 2e-6;
+    if (dt - DT_60).abs() < TOL { *SPLAT_60 }
+    else if (dt - DT_30).abs() < TOL { *SPLAT_30 }
+    else if (dt - DT_15).abs() < TOL { *SPLAT_15 }
+    else { F32x16::splat(dt) }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Viewport + foveated rendering — only spend SIMD cycles on what's seen.
+// ─────────────────────────────────────────────────────────────────────
+
+/// Camera + view-volume parameters for foveated rendering.
+///
+/// Nodes are classified by distance to `center` into four priority bands.
+/// The renderer ticks foveal nodes every frame, peripheral every other,
+/// distant every fourth, and skips off-screen entirely. Net effect:
+/// integration work scales with what the camera is actually looking at,
+/// not the graph total (the classification pass itself stays O(N)).
+#[derive(Debug, Clone, Copy)]
+pub struct Viewport {
+    /// Camera focus point in world coordinates.
+    pub center: [f32; 3],
+    /// Foveal radius — full-detail every-tick zone.
+    pub foveal_radius: f32,
+    /// Peripheral radius — half-rate update zone (foveal_radius < r ≤ peripheral).
+    pub peripheral_radius: f32,
+    /// Cull radius — beyond this, skip entirely (not just slow).
+    pub cull_radius: f32,
+}
+
+impl Viewport {
+    /// Default: 4.0 unit foveal, 16.0 peripheral, 64.0 cull.
+    pub fn default_at(center: [f32; 3]) -> Self {
+        Self { center, foveal_radius: 4.0, peripheral_radius: 16.0, cull_radius: 64.0 }
+    }
+}
+
+/// Update priority for one node — controls how often it's integrated.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(u8)]
+pub enum UpdatePriority {
+    /// Every tick (foveal zone).
+    Foveal = 0,
+    /// Every 2nd tick (peripheral zone).
+    Peripheral = 1,
+    /// Every 4th tick (distant but in-frustum).
+    Distant = 2,
+    /// Skip (out of cull radius).
+    OffScreen = 3,
+}
+
+impl UpdatePriority {
+    /// Stride between updates for this priority.
+    #[inline]
+    pub fn tick_stride(self) -> u64 {
+        match self {
+            Self::Foveal => 1,
+            Self::Peripheral => 2,
+            Self::Distant => 4,
+            Self::OffScreen => u64::MAX,
+        }
+    }
+
+    /// Should this node be updated on the given tick?
+    #[inline]
+    pub fn should_update(self, tick: u64) -> bool {
+        let stride = self.tick_stride();
+        if stride == u64::MAX { false } else { tick % stride == 0 }
+    }
+}
+
+/// Classify each node by distance to viewport center.
+///
+/// Returns a `Vec<UpdatePriority>` of length `len` (active nodes only).
+/// Trailing padded slots are not classified — they're never integrated
+/// regardless.
+pub fn classify_priorities(positions: &[f32], len: usize, vp: &Viewport) -> Vec<UpdatePriority> {
+    let mut out = Vec::with_capacity(len);
+    let f2 = vp.foveal_radius * vp.foveal_radius;
+    let p2 = vp.peripheral_radius * vp.peripheral_radius;
+    let c2 = vp.cull_radius * vp.cull_radius;
+    for i in 0..len {
+        let dx = positions[i * POSITION_DIMS] - vp.center[0];
+        let dy = positions[i * POSITION_DIMS + 1] - vp.center[1];
+        let dz = positions[i * POSITION_DIMS + 2] - vp.center[2];
+        let d2 = dx * dx + dy * dy + dz * dz;
+        out.push(
+            if d2 <= f2 { UpdatePriority::Foveal }
+            else if d2 <= p2 { UpdatePriority::Peripheral }
+            else if d2 <= c2 { UpdatePriority::Distant }
+            else { UpdatePriority::OffScreen }
+        );
+    }
+    out
+}
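A quick sketch of the classification step on its own (the `foveal_share` helper is hypothetical, not in the patch), useful for sizing how much work the foveated pass below will skip:

```rust
// Illustration only: fraction of active nodes in the every-tick foveal band.
fn foveal_share(frame: &RenderFrame, vp: &Viewport) -> f32 {
    let prio = classify_priorities(&frame.positions, frame.len, vp);
    let foveal = prio.iter().filter(|&&p| p == UpdatePriority::Foveal).count();
    foveal as f32 / frame.len.max(1) as f32
}
```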
+
+/// Foveated integrate — apply the SIMD-FMA update only to the 16-float
+/// chunks where at least one covered node's
+/// `priorities[node].should_update(tick)` is true.
+///
+/// Operates in 16-element chunks (the SIMD chunk size), so the smallest
+/// granularity of skipping is 16/POSITION_DIMS ≈ 5 nodes. For graphs
+/// where most nodes share a priority band this is near-optimal; for
+/// random-priority graphs, foveated savings drop toward zero (worst case
+/// is the same cost as `integrate_simd`).
+pub fn integrate_foveated(
+    positions: &mut [f32],
+    velocities: &mut [f32],
+    priorities: &[UpdatePriority],
+    tick: u64,
+    dt: f32,
+    damping: f32,
+) {
+    debug_assert_eq!(positions.len(), velocities.len());
+    debug_assert_eq!(positions.len() % PREFERRED_F32_LANES, 0);
+
+    let dt_v = cached_splat(dt);
+    let damping_v = F32x16::splat(damping);
+
+    // Each 16-float chunk covers ceil(16/3) = 6 nodes (with overlap on
+    // the chunk boundary). We skip a chunk only if EVERY node mapping to
+    // it is OffScreen or stride-skipped this tick; otherwise we update
+    // the whole chunk (cheap — one FMA — vs. branching cost).
+    let nodes_per_chunk = 16 / POSITION_DIMS + 1; // 5 + 1 = 6: covers the straddled node
+
+    let (p_chunks, _) = positions.as_chunks_mut::<16>();
+    let (v_chunks, _) = velocities.as_chunks_mut::<16>();
+
+    for (chunk_idx, (p, v)) in p_chunks.iter_mut().zip(v_chunks.iter_mut()).enumerate() {
+        // Determine which active nodes fall into this chunk.
+        let node_lo = (chunk_idx * 16) / POSITION_DIMS;
+        let node_hi = (node_lo + nodes_per_chunk).min(priorities.len());
+
+        // Skip only if every node in the band agrees to skip THIS tick.
+        let all_skip = (node_lo..node_hi)
+            .all(|n| !priorities[n].should_update(tick));
+        if all_skip { continue; }
+
+        let pv = F32x16::from_array(*p);
+        let vv = F32x16::from_array(*v);
+        let p_new = vv.mul_add(dt_v, pv);
+        let v_new = vv * damping_v;
+        p_new.copy_to_slice(p);
+        v_new.copy_to_slice(v);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// FPS controller — adaptive frame rate under load.
+//
+// Targets 60 ticks/s by default. If the last tick took longer than the
+// budget allows, the controller drops to 30 fps; if even 30 fps overruns,
+// it drops to 15 fps. When ticks consistently come in under-budget, it
+// climbs back up. This keeps the cockpit responsive on big graphs without
+// manual rate selection.
+// ─────────────────────────────────────────────────────────────────────
+
+use std::time::Instant;
+
+/// Adaptive FPS targeting 60 → 30 → 15 with hysteresis.
+pub struct FpsController {
+    /// Active target rate in Hz (60, 30, or 15).
+    target_hz: AtomicU32,
+    /// Last tick wall-clock (ns since renderer construction).
+    last_tick_ns: AtomicU64,
+    /// Construction instant — origin for last_tick_ns.
+    origin: Instant,
+    /// Rolling mean tick duration in ns (EWMA).
+    avg_tick_ns: AtomicU64,
+    /// Consecutive under-budget ticks (used to climb back up).
+    under_budget_streak: AtomicU32,
+}
+
+use std::sync::atomic::AtomicU32;
+
+impl FpsController {
+    /// Construct with `target_hz` initial rate (clamped to {15, 30, 60}).
+    pub fn new(target_hz: u32) -> Self {
+        let clamped = match target_hz {
+            x if x >= 60 => 60,
+            x if x >= 30 => 30,
+            _ => 15,
+        };
+        Self {
+            target_hz: AtomicU32::new(clamped),
+            last_tick_ns: AtomicU64::new(0),
+            origin: Instant::now(),
+            avg_tick_ns: AtomicU64::new(0),
+            under_budget_streak: AtomicU32::new(0),
+        }
+    }
+
+    /// Currently active target rate in Hz (60, 30, or 15).
+    #[inline]
+    pub fn target_hz(&self) -> u32 {
+        self.target_hz.load(Ordering::Acquire)
+    }
+
+    /// `dt` in seconds for the active target rate.
+    #[inline]
+    pub fn dt(&self) -> f32 {
+        match self.target_hz() {
+            60 => DT_60,
+            30 => DT_30,
+            _ => DT_15,
+        }
+    }
+
+    /// Rolling mean tick duration in nanoseconds.
+    #[inline]
+    pub fn avg_tick_ns(&self) -> u64 {
+        self.avg_tick_ns.load(Ordering::Acquire)
+    }
+
+    /// Tick budget for the current target rate, in nanoseconds.
+    #[inline]
+    pub fn budget_ns(&self) -> u64 {
+        1_000_000_000u64 / self.target_hz() as u64
+    }
+
+    /// Record the duration of one tick and adapt the rate if needed.
+    ///
+    /// EWMA with α = 1/8 keeps the average responsive without flapping.
+    /// Step down: any over-budget tick → halve the target (60 → 30 → 15).
+    /// Step up: 60 consecutive under-budget ticks (~1 s at the current rate)
+    /// → double the target.
+    pub fn record_tick(&self, duration_ns: u64) {
+        // EWMA update (α = 1/8): avg = avg + (sample - avg)/8
+        let prev = self.avg_tick_ns.load(Ordering::Acquire);
+        let next = if prev == 0 {
+            duration_ns
+        } else {
+            prev + (duration_ns.saturating_sub(prev) / 8)
+                 - (prev.saturating_sub(duration_ns) / 8)
+        };
+        self.avg_tick_ns.store(next, Ordering::Release);
+
+        let budget = self.budget_ns();
+        let cur = self.target_hz();
+        if duration_ns > budget {
+            self.under_budget_streak.store(0, Ordering::Release);
+            // Step down 60 → 30 → 15. Don't go below 15.
+            let new_hz = match cur { 60 => 30, 30 => 15, _ => 15 };
+            if new_hz != cur {
+                self.target_hz.store(new_hz, Ordering::Release);
+            }
+        } else {
+            let streak = self.under_budget_streak.fetch_add(1, Ordering::AcqRel) + 1;
+            if streak >= 60 {
+                let new_hz = match cur { 15 => 30, 30 => 60, _ => 60 };
+                if new_hz != cur {
+                    self.target_hz.store(new_hz, Ordering::Release);
+                }
+                self.under_budget_streak.store(0, Ordering::Release);
+            }
+        }
+
+        let elapsed = self.origin.elapsed().as_nanos() as u64;
+        self.last_tick_ns.store(elapsed, Ordering::Release);
+    }
+}
+
+impl Default for FpsController {
+    fn default() -> Self {
+        Self::new(60)
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────
+// Renderer convenience: adaptive + foveated tick wrappers.
+// ─────────────────────────────────────────────────────────────────────
+
+impl Renderer {
+    /// Tick at the rate the `FpsController` currently targets, measuring
+    /// the duration so the controller can adapt for next call.
+    ///
+    /// This is the recommended top-level entry point for cockpit servers:
+    /// just call `r.tick_adaptive(&fps_ctl, damping)` in a loop and the
+    /// rate auto-tunes between 60 / 30 / 15 fps based on observed load.
+    pub fn tick_adaptive(&self, fps: &FpsController, damping: f32) {
+        let start = std::time::Instant::now();
+        self.tick(fps.dt(), damping);
+        fps.record_tick(start.elapsed().as_nanos() as u64);
+    }
+
+    /// Foveated tick — classify by viewport, integrate only chunks where
+    /// at least one node should update this tick.
+    ///
+    /// The classification cost is O(N) (one squared-distance per node) but
+    /// is done once per tick; the SIMD integration cost drops by the share
+    /// of off-screen / sub-rate nodes. For a typical cockpit camera, foveal
+    /// nodes are ≤ 20% of the graph → 5× speedup vs full integrate.
+    pub fn tick_foveated(&self, fps: &FpsController, damping: f32, vp: &Viewport) {
+        let start = std::time::Instant::now();
+        let dt = fps.dt();
+        let tick_now = self.tick_count.load(Ordering::Acquire) + 1;
+        {
+            let mut back = self.write_back();
+            let RenderFrame { positions, velocities, len, tick, .. } = &mut *back;
+            let priorities = classify_priorities(positions, *len, vp);
+            integrate_foveated(positions, velocities, &priorities, tick_now, dt, damping);
+            *tick = tick_now;
+        }
+        self.swap();
+        self.tick_count.fetch_add(1, Ordering::AcqRel);
+        fps.record_tick(start.elapsed().as_nanos() as u64);
+    }
+}
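Putting the pieces together, a hedged sketch of the cockpit server loop the doc comments describe (the loop shape, damping value, and sleep-based pacing are assumptions, not part of the patch):

```rust
// Illustration only: adaptive + foveated ticking. The rate self-tunes
// 60 → 30 → 15 Hz under load and climbs back after ~1 s under budget.
fn cockpit_loop(r: &Renderer) {
    let fps = FpsController::default();
    let vp = Viewport::default_at([0.0, 0.0, 0.0]);
    loop {
        r.tick_foveated(&fps, 0.95, &vp);
        // Rough pacing: sleep one full budget; a real loop would subtract
        // the time the tick itself consumed.
        std::thread::sleep(std::time::Duration::from_nanos(fps.budget_ns()));
    }
}
```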
+
+#[cfg(test)]
+mod adaptive_tests {
+    use super::*;
+
+    #[test]
+    fn cached_splat_returns_canonical_for_60fps() {
+        let v = cached_splat(DT_60);
+        // The cached vector should be byte-identical to a fresh splat.
+        let fresh = F32x16::splat(DT_60);
+        // F32x16 doesn't implement PartialEq directly; compare via copy_to_slice.
+        let mut a = [0.0f32; 16];
+        let mut b = [0.0f32; 16];
+        v.copy_to_slice(&mut a);
+        fresh.copy_to_slice(&mut b);
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn cached_splat_falls_back_for_arbitrary_dt() {
+        let v = cached_splat(0.0314);
+        let mut out = [0.0f32; 16];
+        v.copy_to_slice(&mut out);
+        for x in out { assert!((x - 0.0314).abs() < 1e-6); }
+    }
+
+    #[test]
+    fn cached_splat_within_tolerance_hits_cache() {
+        // 1/60 = 0.0166666… → adding 1e-7 s (0.1 µs, well inside the ±2 µs
+        // tolerance) should still hit the cache.
+        let v = cached_splat(DT_60 + 1e-7);
+        let mut out = [0.0f32; 16];
+        v.copy_to_slice(&mut out);
+        // Cached at exactly DT_60, not the slightly-higher input.
+        assert!((out[0] - DT_60).abs() < 1e-6);
+    }
+
+    #[test]
+    fn priority_stride_progression() {
+        assert_eq!(UpdatePriority::Foveal.tick_stride(), 1);
+        assert_eq!(UpdatePriority::Peripheral.tick_stride(), 2);
+        assert_eq!(UpdatePriority::Distant.tick_stride(), 4);
+        assert_eq!(UpdatePriority::OffScreen.tick_stride(), u64::MAX);
+    }
+
+    #[test]
+    fn priority_should_update_respects_stride() {
+        assert!(UpdatePriority::Foveal.should_update(0));
+        assert!(UpdatePriority::Foveal.should_update(7));
+        assert!(UpdatePriority::Peripheral.should_update(0));
+        assert!(!UpdatePriority::Peripheral.should_update(1));
+        assert!(UpdatePriority::Peripheral.should_update(2));
+        assert!(UpdatePriority::Distant.should_update(0));
+        assert!(!UpdatePriority::Distant.should_update(1));
+        assert!(UpdatePriority::Distant.should_update(4));
+        assert!(!UpdatePriority::OffScreen.should_update(0));
+        assert!(!UpdatePriority::OffScreen.should_update(u64::MAX - 1));
+    }
+
+    #[test]
+    fn classify_priorities_assigns_zones() {
+        // 4 nodes: at center, foveal-edge, peripheral-zone, off-screen.
+        let positions = vec![
+            0.0, 0.0, 0.0,   // node 0 — at center → Foveal
+            3.0, 0.0, 0.0,   // node 1 — within foveal radius (4)
+            8.0, 0.0, 0.0,   // node 2 — within peripheral (16)
+            70.0, 0.0, 0.0,  // node 3 — beyond cull (64)
+        ];
+        let vp = Viewport::default_at([0.0, 0.0, 0.0]);
+        let p = classify_priorities(&positions, 4, &vp);
+        assert_eq!(p[0], UpdatePriority::Foveal);
+        assert_eq!(p[1], UpdatePriority::Foveal);
+        assert_eq!(p[2], UpdatePriority::Peripheral);
+        assert_eq!(p[3], UpdatePriority::OffScreen);
+    }
+
+    #[test]
+    fn integrate_foveated_skips_offscreen_chunks() {
+        // 32 floats = 2 SIMD chunks. Mark all nodes as OffScreen → no update.
+        let mut positions = vec![1.0f32; 32];
+        let mut velocities = vec![1.0f32; 32];
+        let priorities = vec![UpdatePriority::OffScreen; 12]; // covers both chunks
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 0, 0.5, 0.9);
+        for &p in &positions { assert_eq!(p, 1.0); } // unchanged
+        for &v in &velocities { assert_eq!(v, 1.0); } // unchanged
+    }
+
+    #[test]
+    fn integrate_foveated_updates_foveal_chunks() {
+        let mut positions = vec![0.0f32; 32];
+        let mut velocities = vec![1.0f32; 32];
+        let priorities = vec![UpdatePriority::Foveal; 12];
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 0, 0.5, 0.9);
+        for &p in &positions {
+            assert!((p - 0.5).abs() < 1e-6);
+        }
+        for &v in &velocities {
+            assert!((v - 0.9).abs() < 1e-6);
+        }
+    }
+
+    #[test]
+    fn integrate_foveated_respects_peripheral_stride() {
+        let mut positions = vec![0.0f32; 32];
+        let mut velocities = vec![1.0f32; 32];
+        let priorities = vec![UpdatePriority::Peripheral; 12];
+        // Tick 1 (odd) — peripheral skips
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 1, 0.5, 0.9);
+        for &p in &positions { assert_eq!(p, 0.0); }
+        // Tick 2 (even) — peripheral updates
+        integrate_foveated(&mut positions, &mut velocities, &priorities, 2, 0.5, 0.9);
+        for &p in &positions { assert!((p - 0.5).abs() < 1e-6); }
+    }
+
+    #[test]
+    fn fps_controller_starts_at_60() {
+        let c = FpsController::default();
+        assert_eq!(c.target_hz(), 60);
+        assert_eq!(c.dt(), DT_60);
+        assert_eq!(c.budget_ns(), 16_666_666); // 1e9 / 60
+    }
+
+    #[test]
+    fn fps_controller_steps_down_on_overrun() {
+        let c = FpsController::default();
+        // Single overrun: budget_60 = 16.67ms; record 50ms tick → step down.
+        c.record_tick(50_000_000);
+        assert_eq!(c.target_hz(), 30);
+        // Another overrun at 30 (budget = 33ms): record 100ms → step to 15.
+        c.record_tick(100_000_000);
+        assert_eq!(c.target_hz(), 15);
+    }
+
+    #[test]
+    fn fps_controller_steps_up_on_sustained_under_budget() {
+        let c = FpsController::new(15);
+        // Record 60 fast ticks → climb to 30.
+        for _ in 0..60 { c.record_tick(1_000_000); } // 1ms each
+        assert_eq!(c.target_hz(), 30);
+        // Another 60 fast → climb to 60.
+        for _ in 0..60 { c.record_tick(1_000_000); }
+        assert_eq!(c.target_hz(), 60);
+    }
+
+    #[test]
+    fn fps_controller_dt_tracks_target() {
+        let c = FpsController::new(60);
+        assert_eq!(c.dt(), DT_60);
+        c.record_tick(50_000_000); // step to 30
+        assert_eq!(c.dt(), DT_30);
+        c.record_tick(50_000_000); // step to 15
+        assert_eq!(c.dt(), DT_15);
+    }
+
+    #[test]
+    fn renderer_tick_adaptive_advances_count() {
+        let r = Renderer::with_capacity(64);
+        let fps = FpsController::default();
+        r.tick_adaptive(&fps, 0.95);
+        assert_eq!(r.tick_count(), 1);
+        // FpsController should have recorded a sample.
+        assert!(fps.avg_tick_ns() > 0);
+    }
+
+    #[test]
+    fn renderer_tick_foveated_advances_count_and_swaps() {
+        let r = Renderer::with_capacity(64);
+        {
+            let mut back = r.write_back();
+            back.len = 8;
+        }
+        let fps = FpsController::default();
+        let vp = Viewport::default_at([0.0, 0.0, 0.0]);
+        let initial_front = r.front_index();
+        r.tick_foveated(&fps, 0.95, &vp);
+        assert_eq!(r.tick_count(), 1);
+        assert_eq!(r.front_index(), 1 - initial_front);
+    }
+
+    #[test]
+    fn integrate_simd_array_chunks_have_no_tail() {
+        // After migration: 16384 % PREFERRED_F32_LANES == 0, so as_chunks_mut
+        // remainder must be empty.
+        let mut p = vec![0.0f32; 16_384];
+        let (_chunks, tail) = p.as_chunks_mut::<16>();
+        assert!(tail.is_empty(), "no scalar tail at 16384");
+    }
+}
diff --git a/src/hpc/vsa.rs b/src/hpc/vsa.rs
index b6b795a5..282342a0 100644
--- a/src/hpc/vsa.rs
+++ b/src/hpc/vsa.rs
@@ -1,4 +1,4 @@
-//! Vector Symbolic Architecture: 10,000-dimensional binary operations.
+//! Vector Symbolic Architecture: 16,384-dimensional binary operations.
 //!
 //! VSA is working memory. It fills (bundle), crystallizes (unbundle),
 //! empties (clean), repeats. Like breathing.
@@ -7,29 +7,34 @@
 //! - bundle: majority vote via i16 accumulator
 //! - clean: iterative similarity search against codebook
 //! - permute: cyclic shift for sequence encoding
+//!
+//! 16384 = 256 u64 words exactly — power of 2 SIMD-clean at every precision
+//! tier (FP16x32 / FP32x16 / F64x8). Matches the Binary16K / Vsa16k carrier
+//! shared with lance-graph-contract (`crystal::fingerprint`).
 
-/// VSA dimensionality: 10,000 bits.
-pub const VSA_DIMS: usize = 10_000;
+/// VSA dimensionality: 16,384 bits (Binary16K).
+pub const VSA_DIMS: usize = 16_384;
 
-/// VSA bytes: ceil(10000/8) = 1250.
-pub const VSA_BYTES: usize = 1250;
+/// VSA bytes: 16384/8 = 2048.
+pub const VSA_BYTES: usize = 2048;
 
-/// VSA u64 words: ceil(10000/64) = 157 (with 8 padding bits in last word).
-pub const VSA_WORDS: usize = 157;
+/// VSA u64 words: 16384/64 = 256 (exact, no padding).
+pub const VSA_WORDS: usize = 256;
 
-/// Number of meaningful bits in the last word: 10000 - 156*64 = 16.
+/// Number of meaningful bits in the last word: 16384 - 255*64 = 64 (full word).
 const TAIL_BITS: usize = VSA_DIMS - (VSA_WORDS - 1) * 64;
 
-/// Mask for the meaningful bits in the last word.
-const TAIL_MASK: u64 = (1u64 << TAIL_BITS) - 1;
+/// Mask for the meaningful bits in the last word: !0u64 since the format
+/// is power-of-2 aligned (every word is fully meaningful).
+const TAIL_MASK: u64 = u64::MAX;
 
-/// A 10,000-dimensional binary VSA vector.
+/// A 16,384-dimensional binary VSA vector (Binary16K).
 ///
-/// Stored as 157 u64 words (10048 bits total), with only the first 10,000
-/// bits meaningful. The upper 48 bits of the last word are always zero.
+/// Stored as 256 u64 words (16384 bits total), all bits meaningful — the
+/// format is SIMD-clean at every precision tier (FP16x32 / FP32x16 / F64x8).
 #[derive(Clone, PartialEq, Eq)]
 pub struct VsaVector {
-    /// 157 u64 words = 10048 bits, only first 10000 are meaningful.
+    /// 256 u64 words = 16384 bits, all meaningful.
     pub words: [u64; VSA_WORDS],
 }
@@ -93,8 +98,8 @@ impl VsaVector {
     /// Create a VSA vector from a byte slice.
     ///
-    /// If `data` is shorter than [`VSA_BYTES`] (1250), uses blake3 in XOF
-    /// mode to expand it. If longer, only the first 1250 bytes are used.
+    /// If `data` is shorter than [`VSA_BYTES`] (2048), uses blake3 in XOF
+    /// mode to expand it. If longer, only the first 2048 bytes are used.
     ///
     /// # Example
     ///
@@ -144,7 +149,7 @@ impl VsaVector {
     /// Create a VSA vector from text using blake3 hash expansion.
     ///
     /// The text is hashed with blake3, then expanded via XOF mode to fill
-    /// all 1250 bytes. Deterministic: same text always produces same vector.
+    /// all 2048 bytes. Deterministic: same text always produces same vector.
     ///
     /// # Example
     ///
@@ -165,8 +170,8 @@ impl VsaVector {
     /// Zero-copy view of the vector as a byte slice.
     ///
-    /// Returns all `VSA_WORDS * 8` bytes (1256 bytes). The last 6 bytes
-    /// contain only padding zeros.
+    /// Returns all `VSA_WORDS * 8` bytes (2048 bytes). All bytes are
+    /// meaningful — 16384 / 8 = 2048 exactly, no padding.
     ///
     /// # Safety
     ///
@@ -180,7 +185,7 @@
         }
     }
 
-    /// Population count: number of set bits (within the meaningful 10,000).
+    /// Population count: number of set bits (out of 16,384).
     #[inline]
     pub fn popcount(&self) -> u32 {
         super::bitwise::popcount_raw(self.as_bytes()) as u32
     }
@@ -292,7 +297,7 @@ pub fn vsa_similarity(a: &VsaVector, b: &VsaVector) -> f32 {
 /// Raw Hamming distance between two VSA vectors.
 ///
-/// Counts the number of bit positions (out of 10,000) that differ.
+/// Counts the number of bit positions (out of 16,384) that differ.
 /// Delegates to SIMD-accelerated bitwise operations.
 ///
 /// # Example
@@ -306,7 +311,7 @@ pub fn vsa_hamming(a: &VsaVector, b: &VsaVector) -> u32 {
     super::bitwise::hamming_distance_raw(a.as_bytes(), b.as_bytes()) as u32
 }
 
-/// Cyclic bit permutation (left shift by `shift` positions within 10,000 bits).
+/// Cyclic bit permutation (left shift by `shift` positions within 16,384 bits).
 ///
 /// Bit at position `i` moves to position `(i + shift) % VSA_DIMS`.
 /// Used for sequence encoding: `permute(item, position)`.
@@ -409,7 +414,7 @@ pub fn vsa_clean<'a>(dirty: &VsaVector, codebook: &'a [VsaVector]) -> Option<&'a VsaVector> {
 impl VsaAccumulator {
     /// Create a new zero accumulator.
     ///
-    /// All 10,000 dimension tallies start at 0.
+    /// All 16,384 dimension tallies start at 0.
     ///
     /// # Example
     ///
@@ -693,9 +698,12 @@ mod tests {
     #[test]
     fn test_constants() {
-        assert_eq!(TAIL_BITS, 16);
-        assert_eq!(TAIL_MASK, 0xFFFF);
-        assert_eq!((VSA_WORDS - 1) * 64 + TAIL_BITS, VSA_DIMS);
+        assert_eq!(VSA_DIMS, 16_384);
+        assert_eq!(VSA_WORDS, 256);
+        assert_eq!(VSA_BYTES, 2048);
+        assert_eq!(TAIL_BITS, 64);
+        assert_eq!(TAIL_MASK, u64::MAX);
+        assert_eq!(VSA_WORDS * 64, VSA_DIMS);
     }
 
     #[test]
diff --git a/src/simd.rs b/src/simd.rs
index ae35a939..d832203d 100644
--- a/src/simd.rs
+++ b/src/simd.rs
@@ -805,6 +805,42 @@ mod scalar {
     pub fn saturating_sub(self, other: Self) -> Self {
         let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_sub(other.0[i]); } Self(out)
     }
+    // ── Tier 1: seismon rasterizer primitives (scalar fallbacks) ──
+    #[inline(always)]
+    pub fn pairwise_avg(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = ((self.0[i] as u16 + other.0[i] as u16 + 1) >> 1) as u8; } Self(out)
+    }
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u64 {
+        let mut m: u64 = 0; for i in 0..64 { if self.0[i] > other.0[i] { m |= 1 << i; } } m
+    }
+    #[inline(always)]
+    pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = if mask & (1 << i) != 0 { b.0[i] } else { a.0[i] }; } Self(out)
+    }
+    #[inline(always)]
+    pub fn shl_epi16(self, imm: u32) -> Self {
+        let mut out = [0u8; 64];
+        for i in (0..64).step_by(2) {
+            let v = u16::from_le_bytes([self.0[i], self.0[i+1]]);
+            let s = if imm < 16 { v << imm } else { 0 };
+            let b = s.to_le_bytes(); out[i] = b[0]; out[i+1] = b[1];
+        }
+        Self(out)
+    }
+    // ── Tier 2: sprite blit + palette remap (scalar fallbacks) ──
+    #[inline(always)]
+    pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
+        for i in 0..64 { if mask & (1 << i) != 0 { *ptr.add(i) = self.0[i]; } }
+    }
+    #[inline(always)]
+    pub fn saturating_add(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_add(other.0[i]); } Self(out)
+    }
+    #[inline(always)]
+    pub fn permute_bytes(self, idx: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[(idx.0[i] & 63) as usize]; } Self(out)
+    }
     #[inline(always)]
     pub fn unpack_lo_epi8(self, other: Self) -> Self {
         let mut out = [0u8; 64];
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
index 62fae415..e00ff5b1 100644
--- a/src/simd_avx2.rs
+++ b/src/simd_avx2.rs
@@ -806,6 +806,43 @@ impl U8x64 {
         Self(out)
     }
 
+    // ── Tier 1+2: seismon rasterizer primitives (AVX2 scalar fallbacks) ──
+
+    #[inline(always)]
+    pub fn pairwise_avg(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = ((self.0[i] as u16 + other.0[i] as u16 + 1) >> 1) as u8; } Self(out)
+    }
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u64 {
+        let mut m: u64 = 0; for i in 0..64 { if self.0[i] > other.0[i] { m |= 1 << i; } } m
+    }
+    #[inline(always)]
+    pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = if mask & (1 << i) != 0 { b.0[i] } else { a.0[i] }; } Self(out)
+    }
+    #[inline(always)]
+    pub fn shl_epi16(self, imm: u32) -> Self {
+        let mut out = [0u8; 64];
+        for i in (0..64).step_by(2) {
+            let v = u16::from_le_bytes([self.0[i], self.0[i+1]]);
+            let s = if imm < 16 { v << imm } else { 0 };
+            let b = s.to_le_bytes(); out[i] = b[0]; out[i+1] = b[1];
+        }
+        Self(out)
+    }
+    #[inline(always)]
+    pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
+        for i in 0..64 { if mask & (1 << i) != 0 { *ptr.add(i) = self.0[i]; } }
+    }
+    #[inline(always)]
+    pub fn saturating_add(self, other: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[i].saturating_add(other.0[i]); } Self(out)
+    }
+    #[inline(always)]
+    pub fn permute_bytes(self, idx: Self) -> Self {
+        let mut out = [0u8; 64]; for i in 0..64 { out[i] = self.0[(idx.0[i] & 63) as usize]; } Self(out)
+    }
+
+    /// Interleave low bytes within each 128-bit lane.
     #[inline(always)]
     pub fn unpack_lo_epi8(self, other: Self) -> Self {
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs
index cec3be86..947fed0e 100644
--- a/src/simd_avx512.rs
+++ b/src/simd_avx512.rs
@@ -624,6 +624,82 @@ impl U8x64 {
         Self(unsafe { _mm512_subs_epu8(self.0, other.0) })
     }
 
+    // ── Tier 1: seismon rasterizer primitives ─────────────────────────
+
+    /// Pairwise unsigned byte average: (a[i] + b[i] + 1) >> 1 per byte.
+    /// Core op for 4×4 mipmap downsample (vpavgb + horizontal pair = 2 ops).
+    #[inline(always)]
+    pub fn pairwise_avg(self, other: Self) -> Self {
+        // SAFETY: AVX-512BW instruction, operates on all 64 bytes.
+        Self(unsafe { _mm512_avg_epu8(self.0, other.0) })
+    }
+
+    /// Byte-wise unsigned greater-than comparison. Returns 64-bit mask:
+    /// bit i set if self[i] > other[i]. Symmetric to `cmpeq_mask`.
+    /// Used for threshold density fields, depth/Z-test, hit-tests.
+    #[inline(always)]
+    pub fn cmpgt_mask(self, other: Self) -> u64 {
+        // SAFETY: AVX-512BW instruction. Unsigned compare via _epu8.
+        unsafe { _mm512_cmpgt_epu8_mask(self.0, other.0) }
+    }
+
+    /// Masked blend: for each bit in `mask`, select from `b` if set, else `a`.
+    /// Sprite alpha blit: write atlas pixel where mask bit set, keep framebuffer otherwise.
+    #[inline(always)]
+    pub fn mask_blend(mask: u64, a: Self, b: Self) -> Self {
+        // SAFETY: AVX-512BW instruction. mask selects between a and b per byte.
+        Self(unsafe { _mm512_mask_blend_epi8(mask, a.0, b.0) })
+    }
+
+    /// Shift left each 16-bit lane by immediate bits (nibble write: place high nibble).
+    /// Completes the nibble shift pair with `shr_epi16`.
+    #[inline(always)]
+    pub fn shl_epi16(self, imm: u32) -> Self {
+        // `_mm512_slli_epi16` needs a const immediate, hence the match.
+        // Arms mirror the scalar tier: shift for imm < 16, zero for imm ≥ 16.
+        Self(unsafe { match imm {
+            0 => self.0,
+            1 => _mm512_slli_epi16(self.0, 1),
+            2 => _mm512_slli_epi16(self.0, 2),
+            3 => _mm512_slli_epi16(self.0, 3),
+            4 => _mm512_slli_epi16(self.0, 4),
+            5 => _mm512_slli_epi16(self.0, 5),
+            6 => _mm512_slli_epi16(self.0, 6),
+            7 => _mm512_slli_epi16(self.0, 7),
+            8 => _mm512_slli_epi16(self.0, 8),
+            9 => _mm512_slli_epi16(self.0, 9),
+            10 => _mm512_slli_epi16(self.0, 10),
+            11 => _mm512_slli_epi16(self.0, 11),
+            12 => _mm512_slli_epi16(self.0, 12),
+            13 => _mm512_slli_epi16(self.0, 13),
+            14 => _mm512_slli_epi16(self.0, 14),
+            15 => _mm512_slli_epi16(self.0, 15),
+            _ => _mm512_setzero_si512(),
+        }})
+    }
+
+    // ── Tier 2: sprite blit + palette LUT + cross-lane shuffle ────────
+
+    /// Masked store: write only bytes where mask bit is set.
+    /// Partial-tile writes at framebuffer edges without scalar fallback.
+    ///
+    /// # Safety
+    /// `ptr` must point to at least 64 writable bytes (may be unaligned).
+    #[inline(always)]
+    pub unsafe fn mask_store(self, ptr: *mut u8, mask: u64) {
+        // SAFETY: AVX-512BW masked store. Caller guarantees ptr validity.
+        _mm512_mask_storeu_epi8(ptr as *mut i8, mask, self.0);
+    }
+
+    /// Saturating unsigned addition: min(a + b, 255) per byte.
+    /// Additive blend without overflow wrap. Symmetric to `saturating_sub`.
+    #[inline(always)]
+    pub fn saturating_add(self, other: Self) -> Self {
+        // SAFETY: AVX-512BW instruction.
+        Self(unsafe { _mm512_adds_epu8(self.0, other.0) })
+    }
+
+    /// Cross-lane byte permute: rearrange all 64 bytes by index vector.
+    /// `idx[i]` selects which byte of `self` appears at position `i`.
+    /// Unlike `shuffle_bytes` (within-lane), this crosses 128-bit lane boundaries.
+    /// Needed for sprite atlas reorder and palette remap > 16 entries.
+    #[inline(always)]
+    pub fn permute_bytes(self, idx: Self) -> Self {
+        // SAFETY: requires AVX-512VBMI (_mm512_permutexvar_epi8). There is no
+        // software fallback on this path, so it must only be dispatched on
+        // CPUs that report the VBMI feature.
+        Self(unsafe { _mm512_permutexvar_epi8(idx.0, self.0) })
+    }
+
+    /// Interleave low bytes: [a0,b0,a1,b1,...] from lower halves.
 #[inline(always)]
 pub fn unpack_lo_epi8(self, other: Self) -> Self {
@@ -2728,3 +2804,119 @@ mod f16_tests {
         }
     }
 }
+
+#[cfg(test)]
+mod u8x64_rasterizer_tests {
+    use super::U8x64;
+
+    #[test]
+    fn pairwise_avg_basic() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(20);
+        let avg = a.pairwise_avg(b);
+        let mut out = [0u8; 64];
+        avg.copy_to_slice(&mut out);
+        // (10 + 20 + 1) >> 1 = 15
+        assert!(out.iter().all(|&v| v == 15));
+    }
+
+    #[test]
+    fn pairwise_avg_rounding() {
+        let a = U8x64::splat(1);
+        let b = U8x64::splat(2);
+        let avg = a.pairwise_avg(b);
+        let mut out = [0u8; 64];
+        avg.copy_to_slice(&mut out);
+        // (1 + 2 + 1) >> 1 = 2 (rounds up)
+        assert!(out.iter().all(|&v| v == 2));
+    }
+
+    #[test]
+    fn cmpgt_mask_basic() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(5);
+        assert_eq!(a.cmpgt_mask(b), u64::MAX); // all greater
+        assert_eq!(b.cmpgt_mask(a), 0);        // none greater
+        assert_eq!(a.cmpgt_mask(a), 0);        // equal = not greater
+    }
+
+    #[test]
+    fn mask_blend_selects_correctly() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(20);
+        // mask = 0: all from a
+        let r0 = U8x64::mask_blend(0, a, b);
+        let mut out = [0u8; 64];
+        r0.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 10));
+        // mask = all 1s: all from b
+        let r1 = U8x64::mask_blend(u64::MAX, a, b);
+        r1.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 20));
+        // mask = bit 0 only: first byte from b, rest from a
+        let r2 = U8x64::mask_blend(1, a, b);
+        r2.copy_to_slice(&mut out);
+        assert_eq!(out[0], 20);
+        assert_eq!(out[1], 10);
+    }
+
+    #[test]
+    fn shl_epi16_shift_4() {
+        let mut data = [0u8; 64];
+        data[0] = 0x0F; data[1] = 0x00; // u16 = 0x000F
+        let v = U8x64::from_slice(&data);
+        let shifted = v.shl_epi16(4);
+        let mut out = [0u8; 64];
+        shifted.copy_to_slice(&mut out);
+        let result = u16::from_le_bytes([out[0], out[1]]);
+        assert_eq!(result, 0x00F0);
+    }
+
+    #[test]
+    fn saturating_add_clamps_at_255() {
+        let a = U8x64::splat(200);
+        let b = U8x64::splat(100);
+        let sum = a.saturating_add(b);
+        let mut out = [0u8; 64];
+        sum.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 255));
+    }
+
+    #[test]
+    fn saturating_add_no_overflow() {
+        let a = U8x64::splat(10);
+        let b = U8x64::splat(20);
+        let sum = a.saturating_add(b);
+        let mut out = [0u8; 64];
+        sum.copy_to_slice(&mut out);
+        assert!(out.iter().all(|&v| v == 30));
+    }
+
+    #[test]
+    fn permute_bytes_identity() {
+        let mut data = [0u8; 64];
+        for i in 0..64 { data[i] = i as u8; }
+        let v = U8x64::from_slice(&data);
+        // Identity permutation
+        let mut idx = [0u8; 64];
+        for i in 0..64 { idx[i] = i as u8; }
+        let perm = v.permute_bytes(U8x64::from_slice(&idx));
+        let mut out = [0u8; 64];
+        perm.copy_to_slice(&mut out);
+        assert_eq!(out, data);
+    }
+
+    #[test]
+    fn permute_bytes_reverse() {
+        let mut data = [0u8; 64];
+        for i in 0..64 { data[i] = i as u8; }
+        let v = U8x64::from_slice(&data);
+        // Reverse permutation
+        let mut idx = [0u8; 64];
+        for i in 0..64 { idx[i] = (63 - i) as u8; }
+        let perm = v.permute_bytes(U8x64::from_slice(&idx));
+        let mut out = [0u8; 64];
+        perm.copy_to_slice(&mut out);
+        for i in 0..64 { assert_eq!(out[i], (63 - i) as u8); }
+    }
+}
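To close the loop on what these primitives are for, a hedged sketch of the sprite-blit idiom the Tier 1/2 comments describe (the function, key, and glow constant are illustrative; only the `U8x64` operations are from this patch):

```rust
use crate::simd::U8x64;

// Illustration only: alpha-keyed sprite blit over one 64-byte row, then an
// additive glow pass: cmpgt_mask + mask_blend + saturating_add in sequence.
fn blit_row(fb: &mut [u8; 64], sprite: &[u8; 64], alpha_key: u8) {
    let dst = U8x64::from_slice(&fb[..]);
    let src = U8x64::from_slice(&sprite[..]);
    // Opaque wherever the sprite exceeds the key; keep framebuffer elsewhere.
    let mask = src.cmpgt_mask(U8x64::splat(alpha_key));
    let blended = U8x64::mask_blend(mask, dst, src);
    // Additive glow, clamped at 255 by the saturating add.
    let glowed = blended.saturating_add(U8x64::splat(8));
    glowed.copy_to_slice(&mut fb[..]);
}
```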