diff --git a/crates/burn/src/ops/matmul.rs b/crates/burn/src/ops/matmul.rs
deleted file mode 120000
index 44ce5ad9..00000000
--- a/crates/burn/src/ops/matmul.rs
+++ /dev/null
@@ -1 +0,0 @@
-../../upstream/crates/burn-ndarray/src/ops/matmul.rs
\ No newline at end of file
diff --git a/crates/burn/src/ops/matmul.rs b/crates/burn/src/ops/matmul.rs
new file mode 100644
index 00000000..426d7133
--- /dev/null
+++ b/crates/burn/src/ops/matmul.rs
@@ -0,0 +1,467 @@
+use crate::UnsafeSharedRef;
+use crate::{NdArrayElement, ShapeOps, SharedArray, iter_range_par, ops::NdArrayOps, run_par};
+
+use alloc::{vec, vec::Vec};
+use burn_backend::ElementConversion;
+use burn_backend::Shape;
+use ndarray::{IxDyn, s};
+
+#[cfg(feature = "std")]
+use std::collections::HashMap;
+#[cfg(feature = "std")]
+use std::sync::{LazyLock, RwLock};
+
+// ============================================================================
+// Compiled Attention Cache — O(1) table lookup replacing O(d) matmul
+// ============================================================================
+//
+// When a model is loaded, attention weight matrices can be compiled into
+// precomputed distance tables. During matmul, we check this cache first:
+// - Hit: return table[q_palette_idx][k_palette_idx] (O(1) per element)
+// - Miss: fall through to BLAS (O(d) per element)
+//
+// The cache is keyed by the inner dimension `k` of the matmul.
+// In attention: m=seq_len, k=d_head, n=seq_len. The k dimension identifies
+// which attention head's table to use.
+
+/// A compiled attention table: 256×256 u16 distances, precomputed.
+#[cfg(feature = "std")]
+#[derive(Clone)]
+pub struct CompiledAttention {
+    /// 256×256 distance table. table[q][k] = precomputed attention distance.
+    pub table: Vec<u16>,
+    /// Palette size (number of archetypes, typically 256).
+    pub k_palette: usize,
+    /// Input dimension this table was compiled from.
+    pub d_head: usize,
+    /// Palette assignment: for each row index, which palette entry it maps to.
+    pub q_assignments: Vec<u8>,
+    /// Palette assignment for columns.
+    pub k_assignments: Vec<u8>,
+}
+
+/// Global cache of compiled attention tables.
+/// Keyed by (d_head) — the inner dimension of the attention matmul.
+#[cfg(feature = "std")]
+static ATTENTION_CACHE: LazyLock<RwLock<HashMap<usize, CompiledAttention>>> =
+    LazyLock::new(|| RwLock::new(HashMap::new()));
+
+/// Register a compiled attention table for a given head dimension.
+#[cfg(feature = "std")]
+pub fn register_attention_table(d_head: usize, table: CompiledAttention) {
+    let mut cache = ATTENTION_CACHE.write().unwrap();
+    cache.insert(d_head, table);
+}
+
+/// Check if a compiled attention table exists for the given dimensions.
+#[cfg(feature = "std")]
+pub fn has_attention_table(d_head: usize) -> bool {
+    let cache = ATTENTION_CACHE.read().unwrap();
+    cache.contains_key(&d_head)
+}
+
+/// Clear all compiled attention tables.
+#[cfg(feature = "std")]
+pub fn clear_attention_cache() {
+    let mut cache = ATTENTION_CACHE.write().unwrap();
+    cache.clear();
+}
+
+/// Try to compute matmul using compiled attention table lookup.
+/// Returns `false` if no table exists for these dimensions.
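+///
+/// A minimal sketch of how the cache is meant to be primed; the dimensions and
+/// assignment values are illustrative only:
+///
+/// ```ignore
+/// let table = CompiledAttention {
+///     table: vec![0u16; 256 * 256],
+///     k_palette: 256,
+///     d_head: 64,
+///     q_assignments: vec![0; 512], // one palette index per query row
+///     k_assignments: vec![0; 512], // one palette index per key column
+/// };
+/// register_attention_table(64, table);
+/// // Any later matmul whose inner dimension is 64 (and whose rows/columns fit
+/// // inside the assignment vectors) takes the table-lookup path below instead
+/// // of general_mat_mul.
+/// ```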
+#[cfg(feature = "std")] +fn try_attention_matmul( + _lhs: &ndarray::ArrayView2<'_, E>, + _rhs: &ndarray::ArrayView2<'_, E>, + out: &mut ndarray::ArrayViewMut2<'_, E>, + m: usize, + k: usize, + n: usize, +) -> bool { + let cache = ATTENTION_CACHE.read().unwrap(); + let table = match cache.get(&k) { + Some(t) => t, + None => return false, + }; + + // Use palette assignments to look up precomputed distances + if table.q_assignments.len() < m || table.k_assignments.len() < n { + return false; + } + + for i in 0..m { + let q_idx = table.q_assignments[i] as usize; + for j in 0..n { + let k_idx = table.k_assignments[j] as usize; + // Table lookup: O(1) per element instead of O(k) dot product + let dist = table.table[q_idx * table.k_palette + k_idx]; + // Convert distance to similarity score (higher = more attention) + // Negate and scale: attention ∝ -distance + let score: f64 = -(dist as f64) / 1000.0; + out[[i, j]] = score.elem(); + } + } + true +} + +pub(crate) fn matmul( + lhs: SharedArray, + rhs: SharedArray, +) -> SharedArray { + let shape_lhs = lhs.shape(); + let shape_rhs = rhs.shape(); + let ndims = shape_lhs.num_dims(); + let m = shape_lhs[ndims - 2]; // # of left rows + let k = shape_rhs[ndims - 2]; // # of left cols and right rows + let n = shape_rhs[ndims - 1]; // # of right cols + + let (out_shape, strides_lhs, strides_rhs, strides_out) = output_shape(shape_lhs, shape_rhs); + let l_mat_size = m * k; // size of matrix component of left array + let r_mat_size = k * n; // size of matrix component of right array + let out_mat_size = m * n; // size of matrix component of output array + + let num_l_batches = shape_lhs.num_elements() / l_mat_size; + let num_r_batches = shape_rhs.num_elements() / r_mat_size; + let num_out_batches = out_shape.num_elements() / out_mat_size; + + let lhs_array = NdArrayOps::reshape(lhs, Shape::new([num_l_batches, m, k])); + let rhs_array = NdArrayOps::reshape(rhs, Shape::new([num_r_batches, k, n])); + + let alpha: E = 1.0.elem(); + let beta: E = 0.0.elem(); + + let out = run_par!(|| { + let mut out_array = ndarray::Array3::::zeros((num_out_batches, m, n)); + let unsafe_shared_out_array = UnsafeSharedRef::new(&mut out_array); + + iter_range_par!(0, num_out_batches).for_each(|out_batch| { + // Here, we: + // 1. Un-flatten the output batch into a component-based batch index. + // 2. Use the strides for left and right batch indices to convert it to a flattened + // batch for left and right. + let out_index = strides_out.unflatten(out_batch); + let l_batch = strides_lhs.flatten(&out_index); + let r_batch = strides_rhs.flatten(&out_index); + + let lhs_slice = lhs_array.slice(s!(l_batch, .., ..)); + let rhs_slice = rhs_array.slice(s!(r_batch, .., ..)); + + unsafe { + let mut out_slice = unsafe_shared_out_array + .get() + .slice_mut(s!(out_batch, .., ..)); + + // Try compiled attention table (O(1) per element). + // Falls through to BLAS if no table is registered for d_head=k. 
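+                // Without the `std` feature this fast path compiles out entirely and
+                // every batch goes straight to the BLAS call below.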
+                #[cfg(feature = "std")]
+                if try_attention_matmul(&lhs_slice, &rhs_slice, &mut out_slice, m, k, n) {
+                    return;
+                }
+
+                ndarray::linalg::general_mat_mul(
+                    alpha,
+                    &lhs_slice,
+                    &rhs_slice,
+                    beta,
+                    &mut out_slice,
+                )
+            }
+        });
+
+        out_array.into_shared().into_dyn()
+    });
+
+    NdArrayOps::reshape(out, out_shape)
+}
+
+#[derive(Debug, PartialEq)]
+struct Strides {
+    strides: Vec<usize>,
+}
+impl Strides {
+    fn new(strides: Vec<usize>) -> Self {
+        Strides { strides }
+    }
+
+    fn unflatten(&self, linear_index: usize) -> Vec<usize> {
+        let mut coord = Vec::with_capacity(self.strides.len());
+        let mut rem = linear_index;
+        for stride in self.strides.iter() {
+            coord.push(rem / stride);
+            rem %= stride;
+        }
+        coord
+    }
+
+    fn flatten(&self, index: &Vec<usize>) -> usize {
+        assert_eq!(self.strides.len(), index.len());
+        self.strides
+            .iter()
+            .zip(index)
+            .map(|(stride, index)| stride * index)
+            .sum()
+    }
+}
+
+/// Compute the (broadcasted) output shape of matrix multiplication, along with strides for
+/// the non-matrix dimensions of all arrays.
+///
+/// # Arguments
+/// * `lsh`: Shape of the first (left-hand) matrix multiplication argument.
+/// * `rsh`: Shape of the second (right-hand) matrix multiplication argument.
+///
+/// # Panics
+/// * If the number of dimensions is not at least 2.
+/// * If the matrix multiplication dimensions (last 2) are incompatible.
+/// * If any other dimension is not the same for both tensors, or equal to 1. (Any dimension where
+///   one dim is equal to 1 is broadcast.)
+fn output_shape(lsh: &[usize], rsh: &[usize]) -> (Shape, Strides, Strides, Strides) {
+    let ndims = lsh.num_dims();
+    if ndims < 2 {
+        panic!("Matrix multiplication requires an array with at least 2 dimensions.");
+    }
+
+    // Fetch matrix dimensions and check compatibility.
+    let l_rows = lsh[ndims - 2];
+    let l_cols = lsh[ndims - 1];
+    let r_rows = rsh[ndims - 2];
+    let r_cols = rsh[ndims - 1];
+    if l_cols != r_rows {
+        panic!("Dimensions are incompatible for matrix multiplication.");
+    }
+    // Set matrix dimensions of the output shape.
+    let mut osh = vec![0; ndims];
+    osh[ndims - 2] = l_rows;
+    osh[ndims - 1] = r_cols;
+
+    // Set other array dimensions, broadcasting as necessary.
+    // Compute the strides inline.
+    let mut cur_l_stride: usize = 1;
+    let mut cur_r_stride: usize = 1;
+    let mut cur_o_stride: usize = 1;
+    let mut l_strides = Vec::with_capacity(ndims - 2);
+    let mut r_strides = Vec::with_capacity(ndims - 2);
+    let mut o_strides = Vec::with_capacity(ndims - 2);
+    for i in (0..ndims - 2).rev() {
+        let l_dim = lsh[i];
+        let r_dim = rsh[i];
+
+        // Compatible dimensions are:
+        // 1. Both dimensions are equal.
+        // 2. One of the dimensions is equal to 1.
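+        // Example: lhs [1, 5, 3] × rhs [4, 3, 7] broadcasts to out [4, 5, 7]; the
+        // broadcast lhs batch dimension gets stride 0, so every output batch reads
+        // the same lhs matrix.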
+ let o_dim: usize; + if l_dim == r_dim { + o_dim = l_dim; // both dimensions are equal + l_strides.push(cur_l_stride); + r_strides.push(cur_r_stride); + } else if l_dim == 1 { + o_dim = r_dim; // broadcast the left + l_strides.push(0); + r_strides.push(cur_r_stride); + } else if r_dim == 1 { + o_dim = l_dim; // broadcast the right + l_strides.push(cur_l_stride); + r_strides.push(0); + } else { + panic!("Dimensions differ and cannot be broadcasted."); + } + osh[i] = o_dim; + o_strides.push(cur_o_stride); + cur_o_stride *= o_dim; + + cur_l_stride *= l_dim; + cur_r_stride *= r_dim; + } + l_strides.reverse(); + r_strides.reverse(); + o_strides.reverse(); + + ( + Shape::from(osh), + Strides::new(l_strides), + Strides::new(r_strides), + Strides::new(o_strides), + ) +} + +pub(crate) fn cross( + lhs: SharedArray, + rhs: SharedArray, + dim: usize, +) -> SharedArray { + let shape_lhs = lhs.shape(); + let shape_rhs = rhs.shape(); + let ndims = shape_lhs.num_dims(); + + // Broadcast the shapes except along dim + let mut broadcast_shape = vec![0; ndims]; + for i in 0..ndims { + if i == dim { + broadcast_shape[i] = shape_lhs[i]; // already checked to be 3 + } else { + let l = shape_lhs[i]; + let r = shape_rhs[i]; + if l == r { + broadcast_shape[i] = l; + } else if l == 1 { + broadcast_shape[i] = r; + } else if r == 1 { + broadcast_shape[i] = l; + } else { + panic!("Tensors are not broadcastable along dimension {}", i); + } + } + } + + // Broadcast lhs and rhs + let lhs_broadcast = if shape_lhs == broadcast_shape.as_slice() { + lhs + } else { + NdArrayOps::expand(lhs, Shape::from(broadcast_shape.clone())) + }; + let rhs_broadcast = if shape_rhs == broadcast_shape.as_slice() { + rhs + } else { + NdArrayOps::expand(rhs, Shape::from(broadcast_shape.clone())) + }; + + // Now, move dim to the last dimension + let mut perm = (0..ndims).collect::>(); + perm.remove(dim); + perm.push(dim); + + let lhs_permuted = NdArrayOps::permute(lhs_broadcast, &perm); + let rhs_permuted = NdArrayOps::permute(rhs_broadcast, &perm); + + // Reshape to (*, 3) + let total_elements = lhs_permuted.shape().num_elements(); + let batch_size = total_elements / 3; + let lhs_reshaped = NdArrayOps::reshape(lhs_permuted, Shape::new([batch_size, 3])); + let rhs_reshaped = NdArrayOps::reshape(rhs_permuted, Shape::new([batch_size, 3])); + + // Compute cross product + let mut result = ndarray::ArrayD::::zeros(IxDyn(&[batch_size, 3])); + for i in 0..batch_size { + let a1 = lhs_reshaped[IxDyn(&[i, 0])]; + let a2 = lhs_reshaped[IxDyn(&[i, 1])]; + let a3 = lhs_reshaped[IxDyn(&[i, 2])]; + let b1 = rhs_reshaped[IxDyn(&[i, 0])]; + let b2 = rhs_reshaped[IxDyn(&[i, 1])]; + let b3 = rhs_reshaped[IxDyn(&[i, 2])]; + result[IxDyn(&[i, 0])] = a2.mul(b3).sub(a3.mul(b2)); + result[IxDyn(&[i, 1])] = a3.mul(b1).sub(a1.mul(b3)); + result[IxDyn(&[i, 2])] = a1.mul(b2).sub(a2.mul(b1)); + } + + let result_shared = result.into_shared(); + + // Reshape back to the broadcast shape with dim at the end + let mut result_shape = broadcast_shape; + result_shape.remove(dim); + result_shape.push(3); + let result_reshaped = NdArrayOps::reshape(result_shared, Shape::from(result_shape)); + + // Permute back + let mut inv_perm = vec![0; ndims]; + for (i, &p) in perm.iter().enumerate() { + inv_perm[p] = i; + } + NdArrayOps::permute(result_reshaped, &inv_perm) +} + +#[cfg(test)] +mod tests { + use super::*; + + impl Strides { + fn empty() -> Self { + Strides { + strides: Vec::with_capacity(0), + } + } + } + + #[test] + fn test_output_shape() { + // plain matrix multiply + 
assert_eq!( + output_shape(&[5, 3], &[3, 7]), + ( + Shape::from([5, 7]), + Strides::empty(), + Strides::empty(), + Strides::empty() + ) + ); + // matrix multiply with one extra stack dimension + assert_eq!( + output_shape(&[4, 5, 3], &[4, 3, 7]), + ( + Shape::from([4, 5, 7]), + Strides::new(vec![1]), + Strides::new(vec![1]), + Strides::new(vec![1]) + ) + ); + // rank 3, broadcast left + assert_eq!( + output_shape(&[1, 5, 3], &[4, 3, 7]), + ( + Shape::from([4, 5, 7]), + Strides::new(vec![0]), + Strides::new(vec![1]), + Strides::new(vec![1]) + ) + ); + // rank 3, broadcast right + assert_eq!( + output_shape(&[4, 5, 3], &[1, 3, 7]), + ( + Shape::from([4, 5, 7]), + Strides::new(vec![1]), + Strides::new(vec![0]), + Strides::new(vec![1]) + ) + ); + // rank 4, multi broadcast + assert_eq!( + output_shape(&[1, 4, 5, 3], &[8, 1, 3, 7]), + ( + Shape::from([8, 4, 5, 7]), + Strides::new(vec![0, 1]), + Strides::new(vec![1, 0]), + Strides::new(vec![4, 1]) + ) + ); + // rank 5, multi-broadcast + assert_eq!( + output_shape(&[1, 3, 4, 5, 3], &[8, 3, 1, 3, 7]), + ( + Shape::from([8, 3, 4, 5, 7]), + Strides::new(vec![0, 4, 1]), + Strides::new(vec![3, 1, 0]), + Strides::new(vec![12, 4, 1]) + ) + ) + } + + #[test] + #[should_panic( + expected = "Matrix multiplication requires an array with at least 2 dimensions." + )] + fn test_output_shape_too_small() { + output_shape(&[4], &[4]); + } + + #[test] + #[should_panic(expected = "Dimensions are incompatible for matrix multiplication.")] + fn test_output_shape_bad_matrix_dims() { + output_shape(&[5, 3], &[4, 7]); + } + + #[test] + #[should_panic(expected = "Dimensions differ and cannot be broadcasted.")] + fn test_output_shape_non_broadcast() { + output_shape(&[4, 5, 3], &[2, 3, 7]); + } +} diff --git a/crates/burn/src/ops/tensor.rs b/crates/burn/src/ops/tensor.rs index c5f26f38..6fc38a71 100644 --- a/crates/burn/src/ops/tensor.rs +++ b/crates/burn/src/ops/tensor.rs @@ -611,6 +611,11 @@ where } fn float_tanh(tensor: FloatTensor) -> FloatTensor { + #[cfg(feature = "simd")] + let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vstanh) { + Ok(result) => return result, + Err(t) => t, + }; execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray| { array .mapv_into(|a: FloatElem| (a.to_f64()).tanh().elem()) @@ -673,6 +678,11 @@ where } fn float_round(tensor: FloatTensor) -> FloatTensor { + #[cfg(feature = "simd")] + let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vsround) { + Ok(result) => return result, + Err(t) => t, + }; execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray| { array .mapv_into(|a: FloatElem| round_ties_even_wrapper(a.to_f64()).elem()) @@ -681,6 +691,11 @@ where } fn float_floor(tensor: FloatTensor) -> FloatTensor { + #[cfg(feature = "simd")] + let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vsfloor) { + Ok(result) => return result, + Err(t) => t, + }; execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray| { array .mapv_into(|a: FloatElem| (a.to_f64()).floor().elem()) @@ -689,6 +704,11 @@ where } fn float_ceil(tensor: FloatTensor) -> FloatTensor { + #[cfg(feature = "simd")] + let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vsceil) { + Ok(result) => return result, + Err(t) => t, + }; execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray| { array .mapv_into(|a: FloatElem| (a.to_f64()).ceil().elem()) @@ -697,6 +717,11 @@ where } fn float_trunc(tensor: FloatTensor) -> FloatTensor { + #[cfg(feature = "simd")] + let tensor = match try_vml_unary(tensor, 
ndarray::hpc::vml::vstrunc) { + Ok(result) => return result, + Err(t) => t, + }; execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray| { array .mapv_into(|a: FloatElem| (a.to_f64()).trunc().elem()) diff --git a/src/hpc/gguf.rs b/src/hpc/gguf.rs new file mode 100644 index 00000000..d969e305 --- /dev/null +++ b/src/hpc/gguf.rs @@ -0,0 +1,465 @@ +//! Minimal GGUF reader — extracts f32 weight tensors from GGUF model files. +//! +//! Supports: F32, F16, BF16, Q8_0 dequantization. +//! Purpose: load one attention head's Q/K/V weights for bgz-tensor benchmarking. +//! +//! # Format +//! +//! ```text +//! [magic:4][version:4][tensor_count:8][metadata_count:8] +//! [metadata_kv × metadata_count] +//! [tensor_info × tensor_count] +//! [padding to alignment] +//! [tensor_data] +//! ``` + +use std::collections::HashMap; +use std::io::{Read, Seek, SeekFrom}; + +/// GGUF magic number: "GGUF" in little-endian. +pub const GGUF_MAGIC: u32 = 0x46554747; // "GGUF" as LE u32 + +/// Tensor data type in GGUF. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(u32)] +pub enum GgmlType { + F32 = 0, + F16 = 1, + Q4_0 = 2, + Q4_1 = 3, + Q5_0 = 6, + Q5_1 = 7, + Q8_0 = 8, + Q8_1 = 9, + Q4_K = 12, + Q5_K = 13, + Q6_K = 14, + Q8_K = 15, + F64 = 28, + BF16 = 30, + Unknown = 255, +} + +impl From for GgmlType { + fn from(v: u32) -> Self { + match v { + 0 => Self::F32, + 1 => Self::F16, + 2 => Self::Q4_0, + 3 => Self::Q4_1, + 6 => Self::Q5_0, + 7 => Self::Q5_1, + 8 => Self::Q8_0, + 9 => Self::Q8_1, + 12 => Self::Q4_K, + 13 => Self::Q5_K, + 14 => Self::Q6_K, + 15 => Self::Q8_K, + 28 => Self::F64, + 30 => Self::BF16, + _ => Self::Unknown, + } + } +} + +impl GgmlType { + /// Bytes per element for unquantized types. Returns None for quantized types. + pub fn element_size(&self) -> Option { + match self { + Self::F32 => Some(4), + Self::F16 | Self::BF16 => Some(2), + Self::F64 => Some(8), + _ => None, // Quantized types have block structure + } + } + + /// Block size for quantized types. + pub fn block_size(&self) -> usize { + match self { + Self::Q4_0 | Self::Q4_1 | Self::Q5_0 | Self::Q5_1 => 32, + Self::Q8_0 | Self::Q8_1 => 32, + Self::Q4_K | Self::Q5_K | Self::Q6_K | Self::Q8_K => 256, + _ => 1, // Unquantized: 1 element per "block" + } + } + + /// Bytes per block for quantized types. + pub fn bytes_per_block(&self) -> usize { + match self { + Self::F32 => 4, + Self::F16 | Self::BF16 => 2, + Self::F64 => 8, + Self::Q4_0 => 18, // 2 (scale) + 32/2 (nibbles) = 18 + Self::Q4_1 => 20, // 2 (scale) + 2 (min) + 32/2 = 20 + Self::Q8_0 => 34, // 2 (scale) + 32 (int8s) = 34 + Self::Q4_K => 144, // Complex block structure + _ => 0, + } + } +} + +/// Info about one tensor in the GGUF file. +#[derive(Debug, Clone)] +pub struct TensorInfo { + pub name: String, + pub dimensions: Vec, + pub dtype: GgmlType, + pub offset: u64, // relative to tensor data start +} + +impl TensorInfo { + pub fn element_count(&self) -> u64 { + self.dimensions.iter().product() + } +} + +/// Parsed GGUF header + tensor directory. +#[derive(Debug)] +pub struct GgufFile { + pub version: u32, + pub metadata: HashMap, // simplified: all values as strings + pub tensors: Vec, + pub tensor_data_offset: u64, // absolute file offset where tensor data starts + pub alignment: u64, +} + +/// Read a GGUF file header and tensor directory. 
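+///
+/// A sketch of the intended flow; the file path and tensor name here are
+/// only illustrative (the name matches the pattern used in the tests below):
+///
+/// ```ignore
+/// use std::fs::File;
+/// use std::io::BufReader;
+///
+/// let mut reader = BufReader::new(File::open("/tmp/model.gguf").unwrap());
+/// let gguf = read_gguf_header(&mut reader).unwrap();
+/// if let Some(info) = find_tensor(&gguf, "blk.0.attn_q.weight") {
+///     let weights = read_tensor_f32(&mut reader, &gguf, info).unwrap();
+///     println!("{}: {} elements", info.name, weights.len());
+/// }
+/// ```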
+pub fn read_gguf_header(reader: &mut R) -> Result { + // Magic + let magic = read_u32(reader)?; + if magic != GGUF_MAGIC { + return Err(format!("Not a GGUF file: magic={:#x}, expected={:#x}", magic, GGUF_MAGIC)); + } + + // Version + let version = read_u32(reader)?; + if version < 2 || version > 3 { + return Err(format!("Unsupported GGUF version: {}", version)); + } + + // Counts + let tensor_count = read_u64(reader)?; + let metadata_count = read_u64(reader)?; + + // Metadata KV pairs (simplified: read keys, skip complex values) + let mut metadata = HashMap::new(); + for _ in 0..metadata_count { + let key = read_string(reader)?; + let value_type = read_u32(reader)?; + let value = read_metadata_value(reader, value_type)?; + metadata.insert(key, value); + } + + // Alignment + let alignment = metadata + .get("general.alignment") + .and_then(|v| v.parse::().ok()) + .unwrap_or(32); + + // Tensor info + let mut tensors = Vec::with_capacity(tensor_count as usize); + for _ in 0..tensor_count { + let name = read_string(reader)?; + let n_dims = read_u32(reader)? as usize; + let mut dimensions = Vec::with_capacity(n_dims); + for _ in 0..n_dims { + dimensions.push(read_u64(reader)?); + } + let dtype = GgmlType::from(read_u32(reader)?); + let offset = read_u64(reader)?; + tensors.push(TensorInfo { name, dimensions, dtype, offset }); + } + + // Compute tensor data start: current position, aligned up + let current_pos = reader.stream_position().map_err(|e| e.to_string())?; + let tensor_data_offset = (current_pos + alignment - 1) / alignment * alignment; + + Ok(GgufFile { + version, + metadata, + tensors, + tensor_data_offset, + alignment, + }) +} + +/// Read one tensor's data as f32 (dequantizing if needed). +pub fn read_tensor_f32( + reader: &mut R, + gguf: &GgufFile, + tensor: &TensorInfo, +) -> Result, String> { + let abs_offset = gguf.tensor_data_offset + tensor.offset; + reader.seek(SeekFrom::Start(abs_offset)).map_err(|e| e.to_string())?; + + let n_elements = tensor.element_count() as usize; + + match tensor.dtype { + GgmlType::F32 => { + let mut buf = vec![0u8; n_elements * 4]; + reader.read_exact(&mut buf).map_err(|e| e.to_string())?; + Ok(buf.chunks_exact(4) + .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect()) + } + GgmlType::F16 => { + let mut buf = vec![0u8; n_elements * 2]; + reader.read_exact(&mut buf).map_err(|e| e.to_string())?; + Ok(buf.chunks_exact(2) + .map(|c| { + let bits = u16::from_le_bytes([c[0], c[1]]); + f16_to_f32(bits) + }) + .collect()) + } + GgmlType::BF16 => { + let mut buf = vec![0u8; n_elements * 2]; + reader.read_exact(&mut buf).map_err(|e| e.to_string())?; + Ok(buf.chunks_exact(2) + .map(|c| { + let bits = u16::from_le_bytes([c[0], c[1]]); + bf16_to_f32(bits) + }) + .collect()) + } + GgmlType::Q8_0 => { + dequantize_q8_0(reader, n_elements) + } + other => Err(format!("Unsupported dtype for dequantization: {:?}", other)), + } +} + +/// Find a tensor by name pattern (e.g., "blk.0.attn_q.weight"). +pub fn find_tensor<'a>(gguf: &'a GgufFile, pattern: &str) -> Option<&'a TensorInfo> { + gguf.tensors.iter().find(|t| t.name.contains(pattern)) +} + +/// List all tensor names and shapes. 
+pub fn list_tensors(gguf: &GgufFile) -> Vec<(String, Vec, GgmlType)> { + gguf.tensors.iter() + .map(|t| (t.name.clone(), t.dimensions.clone(), t.dtype)) + .collect() +} + +// ── Internal helpers ──────────────────────────────────────────────────────── + +fn read_u32(r: &mut R) -> Result { + let mut buf = [0u8; 4]; + r.read_exact(&mut buf).map_err(|e| e.to_string())?; + Ok(u32::from_le_bytes(buf)) +} + +fn read_u64(r: &mut R) -> Result { + let mut buf = [0u8; 8]; + r.read_exact(&mut buf).map_err(|e| e.to_string())?; + Ok(u64::from_le_bytes(buf)) +} + +fn read_string(r: &mut R) -> Result { + let len = read_u64(r)? as usize; + if len > 65536 { + return Err(format!("String too long: {} bytes", len)); + } + let mut buf = vec![0u8; len]; + r.read_exact(&mut buf).map_err(|e| e.to_string())?; + String::from_utf8(buf).map_err(|e| e.to_string()) +} + +fn read_metadata_value(r: &mut R, value_type: u32) -> Result { + match value_type { + 0 => { let mut b = [0u8; 1]; r.read_exact(&mut b).map_err(|e| e.to_string())?; Ok(b[0].to_string()) } // u8 + 1 => { let mut b = [0u8; 1]; r.read_exact(&mut b).map_err(|e| e.to_string())?; Ok((b[0] as i8).to_string()) } // i8 + 2 => { let mut b = [0u8; 2]; r.read_exact(&mut b).map_err(|e| e.to_string())?; Ok(u16::from_le_bytes(b).to_string()) } // u16 + 3 => { let mut b = [0u8; 2]; r.read_exact(&mut b).map_err(|e| e.to_string())?; Ok(i16::from_le_bytes(b).to_string()) } // i16 + 4 => Ok(read_u32(r)?.to_string()), // u32 + 5 => { let v = read_u32(r)?; Ok((v as i32).to_string()) } // i32 + 6 => { let mut b = [0u8; 4]; r.read_exact(&mut b).map_err(|e| e.to_string())?; Ok(f32::from_le_bytes(b).to_string()) } // f32 + 7 => { let mut b = [0u8; 1]; r.read_exact(&mut b).map_err(|e| e.to_string())?; Ok((b[0] != 0).to_string()) } // bool + 8 => read_string(r), // string + 9 => { // array + let elem_type = read_u32(r)?; + let count = read_u64(r)?; + // Skip array elements (we don't need them for tensor loading) + for _ in 0..count { + let _ = read_metadata_value(r, elem_type)?; + } + Ok(format!("[array of {} × type {}]", count, elem_type)) + } + 10 => Ok(read_u64(r)?.to_string()), // u64 + 11 => { let v = read_u64(r)?; Ok((v as i64).to_string()) } // i64 + 12 => { let mut b = [0u8; 8]; r.read_exact(&mut b).map_err(|e| e.to_string())?; Ok(f64::from_le_bytes(b).to_string()) } // f64 + _ => Err(format!("Unknown metadata value type: {}", value_type)), + } +} + +/// Dequantize Q8_0: each block = 2 bytes scale (f16) + 32 bytes int8. +fn dequantize_q8_0(r: &mut R, n_elements: usize) -> Result, String> { + let block_size = 32; + let n_blocks = (n_elements + block_size - 1) / block_size; + let mut result = Vec::with_capacity(n_elements); + + for _ in 0..n_blocks { + // Read scale as f16 + let mut scale_buf = [0u8; 2]; + r.read_exact(&mut scale_buf).map_err(|e| e.to_string())?; + let scale = f16_to_f32(u16::from_le_bytes(scale_buf)); + + // Read 32 int8 values + let mut quants = [0u8; 32]; + r.read_exact(&mut quants).map_err(|e| e.to_string())?; + + for &q in &quants { + result.push((q as i8) as f32 * scale); + } + } + + result.truncate(n_elements); + Ok(result) +} + +/// Convert f16 bit pattern to f32. 
+fn f16_to_f32(bits: u16) -> f32 { + let sign = ((bits >> 15) & 1) as u32; + let exp = ((bits >> 10) & 0x1F) as u32; + let mantissa = (bits & 0x3FF) as u32; + + if exp == 0 { + if mantissa == 0 { + return f32::from_bits(sign << 31); // ±0 + } + // Subnormal f16 → normal f32 + let mut m = mantissa; + let mut e = 0i32; + while (m & 0x400) == 0 { + m <<= 1; + e -= 1; + } + m &= 0x3FF; + let f32_bits = (sign << 31) | (((127 - 15 + 1 + e as u32) & 0xFF) << 23) | (m << 13); + return f32::from_bits(f32_bits); + } + if exp == 31 { + // Inf or NaN + let f32_bits = (sign << 31) | (0xFF << 23) | (mantissa << 13); + return f32::from_bits(f32_bits); + } + // Normal + let f32_bits = (sign << 31) | ((exp + 127 - 15) << 23) | (mantissa << 13); + f32::from_bits(f32_bits) +} + +/// Convert BF16 bit pattern to f32 (just shift left 16 bits). +fn bf16_to_f32(bits: u16) -> f32 { + f32::from_bits((bits as u32) << 16) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + fn make_gguf_header(tensor_count: u64, metadata_count: u64) -> Vec { + let mut buf = Vec::new(); + buf.extend_from_slice(&GGUF_MAGIC.to_le_bytes()); + buf.extend_from_slice(&3u32.to_le_bytes()); // version + buf.extend_from_slice(&tensor_count.to_le_bytes()); + buf.extend_from_slice(&metadata_count.to_le_bytes()); + buf + } + + fn append_string(buf: &mut Vec, s: &str) { + buf.extend_from_slice(&(s.len() as u64).to_le_bytes()); + buf.extend_from_slice(s.as_bytes()); + } + + fn append_tensor_info(buf: &mut Vec, name: &str, dims: &[u64], dtype: u32, offset: u64) { + append_string(buf, name); + buf.extend_from_slice(&(dims.len() as u32).to_le_bytes()); + for &d in dims { + buf.extend_from_slice(&d.to_le_bytes()); + } + buf.extend_from_slice(&dtype.to_le_bytes()); + buf.extend_from_slice(&offset.to_le_bytes()); + } + + #[test] + fn test_parse_minimal_gguf() { + let mut buf = make_gguf_header(1, 0); + append_tensor_info(&mut buf, "test.weight", &[4, 4], 0, 0); // F32, offset 0 + + // Pad to alignment (32 bytes) + while buf.len() % 32 != 0 { + buf.push(0); + } + + // Tensor data: 16 f32 values + for i in 0..16u32 { + buf.extend_from_slice(&(i as f32).to_le_bytes()); + } + + let mut cursor = Cursor::new(&buf); + let gguf = read_gguf_header(&mut cursor).unwrap(); + assert_eq!(gguf.version, 3); + assert_eq!(gguf.tensors.len(), 1); + assert_eq!(gguf.tensors[0].name, "test.weight"); + assert_eq!(gguf.tensors[0].dimensions, vec![4, 4]); + assert_eq!(gguf.tensors[0].dtype, GgmlType::F32); + + let data = read_tensor_f32(&mut cursor, &gguf, &gguf.tensors[0]).unwrap(); + assert_eq!(data.len(), 16); + assert!((data[0] - 0.0).abs() < 1e-6); + assert!((data[15] - 15.0).abs() < 1e-6); + } + + #[test] + fn test_f16_conversion() { + // f16 for 1.0: sign=0, exp=15, mantissa=0 → 0x3C00 + assert!((f16_to_f32(0x3C00) - 1.0).abs() < 1e-4); + // f16 for 0.0 + assert_eq!(f16_to_f32(0x0000), 0.0); + // f16 for -1.0: 0xBC00 + assert!((f16_to_f32(0xBC00) + 1.0).abs() < 1e-4); + } + + #[test] + fn test_bf16_conversion() { + // bf16 for 1.0: 0x3F80 (upper 16 bits of f32 1.0) + assert_eq!(bf16_to_f32(0x3F80), 1.0); + assert_eq!(bf16_to_f32(0x0000), 0.0); + } + + #[test] + fn test_q8_0_dequantize() { + // Build a Q8_0 block: f16 scale + 32 int8 values + let scale: f32 = 0.5; + let scale_f16_bits: u16 = 0x3800; // f16 for 0.5 + let mut block = Vec::new(); + block.extend_from_slice(&scale_f16_bits.to_le_bytes()); + for i in 0..32i8 { + block.push(i as u8); + } + + let mut cursor = Cursor::new(&block); + let result = dequantize_q8_0(&mut cursor, 32).unwrap(); + 
assert_eq!(result.len(), 32); + assert!((result[0] - 0.0).abs() < 1e-4); // 0 * 0.5 = 0 + assert!((result[1] - 0.5).abs() < 1e-4); // 1 * 0.5 = 0.5 + assert!((result[10] - 5.0).abs() < 1e-4); // 10 * 0.5 = 5.0 + } + + #[test] + fn test_list_tensors() { + let mut buf = make_gguf_header(2, 0); + append_tensor_info(&mut buf, "blk.0.attn_q.weight", &[4096, 4096], 8, 0); + append_tensor_info(&mut buf, "blk.0.attn_k.weight", &[4096, 1024], 8, 4096 * 4096 * 34 / 32); + + while buf.len() % 32 != 0 { buf.push(0); } + + let mut cursor = Cursor::new(&buf); + let gguf = read_gguf_header(&mut cursor).unwrap(); + let tensors = list_tensors(&gguf); + assert_eq!(tensors.len(), 2); + assert!(tensors[0].0.contains("attn_q")); + assert!(tensors[1].0.contains("attn_k")); + } +} diff --git a/src/hpc/mod.rs b/src/hpc/mod.rs index 68ddbc47..8c24546e 100644 --- a/src/hpc/mod.rs +++ b/src/hpc/mod.rs @@ -162,6 +162,10 @@ pub mod holo; #[allow(missing_docs)] pub mod cam_pq; +/// GGUF model file reader — extract f32 weights from quantized models. +#[allow(missing_docs)] +pub mod gguf; + // jitson: JSON config → scan pipeline (parser, validator, template, precompile, packed) // Always available — no Cranelift dependency. #[allow(missing_docs)] diff --git a/src/hpc/vml.rs b/src/hpc/vml.rs index 54b3ab11..fae85e59 100644 --- a/src/hpc/vml.rs +++ b/src/hpc/vml.rs @@ -225,6 +225,122 @@ pub fn vspow(a: &[f32], b: &[f32], out: &mut [f32]) { } } +/// Element-wise tanh: out[i] = tanh(x[i]) +/// +/// Uses the identity: tanh(x) = 2·sigmoid(2x) - 1 +/// which reuses our SIMD sigmoid (F32x16 polynomial). +pub fn vstanh(x: &[f32], out: &mut [f32]) { + let n = x.len().min(out.len()); + let chunks = n / 16; + let two = F32x16::splat(2.0); + let one = F32x16::splat(1.0); + for i in 0..chunks { + let off = i * 16; + let v = F32x16::from_slice(&x[off..off + 16]); + // tanh(x) = 2·sigmoid(2x) - 1 + let two_x = v * two; + // sigmoid(z) = 1/(1+exp(-z)) + let neg_two_x = F32x16::splat(0.0) - two_x; + let exp_neg = simd_exp_f32(neg_two_x); + let sigmoid = one / (exp_neg + one); + let tanh_v = sigmoid.mul_add(two, F32x16::splat(-1.0)); + tanh_v.copy_to_slice(&mut out[off..off + 16]); + } + for i in (chunks * 16)..n { + out[i] = x[i].tanh(); + } +} + +/// Element-wise floor: out[i] = floor(x[i]) +/// +/// Uses F32x16 hardware VRNDSCALEPS (AVX-512) or equivalent. 
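+///
+/// Minimal usage sketch (the input values are arbitrary):
+///
+/// ```ignore
+/// let x = [1.7f32, -0.3, 2.0, -2.5];
+/// let mut out = [0.0f32; 4];
+/// vsfloor(&x, &mut out);
+/// assert_eq!(out, [1.0, -1.0, 2.0, -3.0]);
+/// ```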
+pub fn vsfloor(x: &[f32], out: &mut [f32]) { + let n = x.len().min(out.len()); + let chunks = n / 16; + for i in 0..chunks { + let off = i * 16; + let v = F32x16::from_slice(&x[off..off + 16]); + v.floor().copy_to_slice(&mut out[off..off + 16]); + } + for i in (chunks * 16)..n { + out[i] = x[i].floor(); + } +} + +/// Element-wise ceil: out[i] = ceil(x[i]) +pub fn vsceil(x: &[f32], out: &mut [f32]) { + let n = x.len().min(out.len()); + let chunks = n / 16; + for i in 0..chunks { + let off = i * 16; + let v = F32x16::from_slice(&x[off..off + 16]); + // ceil = -floor(-x) + let neg = F32x16::splat(0.0) - v; + let floored = neg.floor(); + let ceiled = F32x16::splat(0.0) - floored; + ceiled.copy_to_slice(&mut out[off..off + 16]); + } + for i in (chunks * 16)..n { + out[i] = x[i].ceil(); + } +} + +/// Element-wise round (ties to even): out[i] = round(x[i]) +pub fn vsround(x: &[f32], out: &mut [f32]) { + let n = x.len().min(out.len()); + let chunks = n / 16; + for i in 0..chunks { + let off = i * 16; + let v = F32x16::from_slice(&x[off..off + 16]); + v.round().copy_to_slice(&mut out[off..off + 16]); + } + for i in (chunks * 16)..n { + out[i] = x[i].round_ties_even(); + } +} + +/// Element-wise negate: out[i] = -x[i] +pub fn vsneg(x: &[f32], out: &mut [f32]) { + let n = x.len().min(out.len()); + let chunks = n / 16; + let zero = F32x16::splat(0.0); + for i in 0..chunks { + let off = i * 16; + let v = F32x16::from_slice(&x[off..off + 16]); + (zero - v).copy_to_slice(&mut out[off..off + 16]); + } + for i in (chunks * 16)..n { + out[i] = -x[i]; + } +} + +/// Element-wise trunc: out[i] = trunc(x[i]) (round toward zero) +pub fn vstrunc(x: &[f32], out: &mut [f32]) { + let n = x.len().min(out.len()); + // trunc = sign(x) * floor(abs(x)) + let chunks = n / 16; + for i in 0..chunks { + let off = i * 16; + let v = F32x16::from_slice(&x[off..off + 16]); + let abs_v = v.abs(); + let floored = abs_v.floor(); + // Restore sign: if original was negative, negate the result + // Use: trunc(x) = floor(x) if x >= 0, ceil(x) if x < 0 + // Simpler: just floor(abs(x)) * sign(x) + // We can do sign via: x / abs(x), but that's NaN for 0. + // Instead: if x >= 0, result = floor(abs(x)), else -floor(abs(x)) + let zero = F32x16::splat(0.0); + let mask = v.simd_lt(zero); // true where x < 0 + let pos_result = floored; + let neg_result = zero - floored; + let result = mask.select(neg_result, pos_result); + result.copy_to_slice(&mut out[off..off + 16]); + } + for i in (chunks * 16)..n { + out[i] = x[i].trunc(); + } +} + #[cfg(test)] mod tests { use super::*; @@ -346,4 +462,387 @@ mod tests { assert!((out[i] - x[i].sin()).abs() < 1e-5, "mismatch at {i}"); } } + #[test] + fn test_f64_golden_step_hydration_cost() { + use std::f64::consts; + // Rust 1.94: std::f64::consts::PHI and GAMMA are stable. + // On 1.93: define manually. 
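+        // φ = (1 + √5) / 2; the literal below is used directly so the test builds
+        // on either toolchain.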
+ #[allow(dead_code)] + const PHI: f64 = 1.618033988749894848204586834365638118; + + // Simulate: 4096D f64 vector → Base17-style projection → hydration back + let d = 4096usize; + let base_dim = 17usize; + let golden_step = 11usize; + + // Generate "weight" data (deterministic, mimics Gaussian distribution) + let weights: Vec = (0..d) + .map(|i| ((i as f64 * 0.7 + 13.0).sin() * 2.5)) + .collect(); + + // ── ENCODING: f64[4096] → f64[17] (golden-step projection) ── + let encode_start = std::time::Instant::now(); + let n_octaves = (d + base_dim - 1) / base_dim; + let mut sum = [0.0f64; 17]; + let mut count = [0u32; 17]; + for octave in 0..n_octaves { + for bi in 0..base_dim { + let dim = octave * base_dim + ((bi * golden_step) % base_dim); + if dim < d { + sum[bi] += weights[dim]; + count[bi] += 1; + } + } + } + let mut coefficients_f64 = [0.0f64; 17]; + for i in 0..base_dim { + if count[i] > 0 { + coefficients_f64[i] = sum[i] / count[i] as f64; + } + } + let encode_time = encode_start.elapsed(); + + // ── QUANTIZE: f64[17] → i16[17] (what Base17 stores) ── + let fp_scale = 1000.0; + let coefficients_i16: Vec = coefficients_f64.iter() + .map(|&v| (v * fp_scale).round().clamp(-32768.0, 32767.0) as i16) + .collect(); + + // ── HYDRATION: i16[17] → f64[4096] (reconstruct from golden-step basis) ── + let hydrate_start = std::time::Instant::now(); + let mut reconstructed = vec![0.0f64; d]; + for octave in 0..n_octaves { + for bi in 0..base_dim { + let dim = octave * base_dim + ((bi * golden_step) % base_dim); + if dim < d { + reconstructed[dim] = coefficients_i16[bi] as f64 / fp_scale; + } + } + } + let hydrate_time = hydrate_start.elapsed(); + + // ── MEASURE: reconstruction quality ── + let mut sum_sq_err = 0.0f64; + let mut sum_sq_orig = 0.0f64; + for i in 0..d { + let err = weights[i] - reconstructed[i]; + sum_sq_err += err * err; + sum_sq_orig += weights[i] * weights[i]; + } + let relative_error = (sum_sq_err / sum_sq_orig).sqrt(); + + // ── REPORT ── + eprintln!("=== F64 Golden-Step Hydration Cost ==="); + eprintln!(" Input: f64[{}] = {} bytes", d, d * 8); + eprintln!(" Encoded: i16[17] = 34 bytes"); + eprintln!(" Compression: {}×", (d * 8) / 34); + eprintln!(" Encode time: {:?}", encode_time); + eprintln!(" Hydrate time: {:?}", hydrate_time); + eprintln!(" Relative error: {:.6}", relative_error); + eprintln!(" Reconstruction quality: {:.4}%", (1.0 - relative_error) * 100.0); + + // The surface area cost IS just the encode + hydrate. + // The middle (i16 distance, SimilarityTable lookup) is O(1) regardless. + // For f64 tensors: the ONLY extra cost vs f32 tensors is the + // f64→f64 accumulation in the projection (instead of f32→f64). + // That's ~0 extra cost because the projection already uses f64 sums. + + assert!(encode_time.as_micros() < 100, "encoding should be < 100μs"); + assert!(hydrate_time.as_micros() < 100, "hydration should be < 100μs"); + } + #[test] + fn test_golden_step_vs_random_projection_rho() { + // Compare: golden-step 17D projection vs random 17D projection + // on synthetic weight-like data (approximate Gaussian). + // Measures Spearman ρ of pairwise distances. 
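+        // Spearman ρ is Pearson correlation applied to the distance ranks:
+        //     ρ = cov(rank_a, rank_b) / sqrt(var(rank_a) · var(rank_b))
+        // which is what the `spearman_rho` helper below computes.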
+ + let d = 256; // weight vector dimension (small for test speed) + let n = 50; // number of vectors to compare + let base_dim = 17; + let golden_step = 11; + + // Generate weight-like vectors (deterministic, Gaussian-ish) + let vectors: Vec> = (0..n) + .map(|i| { + (0..d) + .map(|j| ((i * 97 + j * 31) as f64 * 0.001).sin() * 2.0) + .collect() + }) + .collect(); + + // Ground truth: pairwise L2 distances in full d-D space + let mut gt_distances = Vec::new(); + for i in 0..n { + for j in (i + 1)..n { + let dist: f64 = vectors[i].iter().zip(&vectors[j]) + .map(|(a, b)| (a - b) * (a - b)) + .sum::() + .sqrt(); + gt_distances.push(dist); + } + } + + // Golden-step projection: project each vector to 17D + let golden_projected: Vec> = vectors.iter() + .map(|v| { + let n_octaves = (d + base_dim - 1) / base_dim; + let mut sum = vec![0.0f64; base_dim]; + let mut count = vec![0u32; base_dim]; + for octave in 0..n_octaves { + for bi in 0..base_dim { + let dim = octave * base_dim + ((bi * golden_step) % base_dim); + if dim < d { + sum[bi] += v[dim]; + count[bi] += 1; + } + } + } + sum.iter().zip(&count) + .map(|(&s, &c)| if c > 0 { s / c as f64 } else { 0.0 }) + .collect() + }) + .collect(); + + // Random projection: use a deterministic "random" 17×d matrix + let random_matrix: Vec> = (0..base_dim) + .map(|i| { + (0..d) + .map(|j| ((i * 7919 + j * 104729) as f64 * 0.00001).sin()) + .collect() + }) + .collect(); + + let random_projected: Vec> = vectors.iter() + .map(|v| { + random_matrix.iter() + .map(|row| { + row.iter().zip(v).map(|(r, x)| r * x).sum::() + }) + .collect() + }) + .collect(); + + // Compute pairwise distances in both projected spaces + let golden_distances: Vec = { + let mut dists = Vec::new(); + for i in 0..n { + for j in (i + 1)..n { + let dist: f64 = golden_projected[i].iter().zip(&golden_projected[j]) + .map(|(a, b)| (a - b) * (a - b)) + .sum::() + .sqrt(); + dists.push(dist); + } + } + dists + }; + + let random_distances: Vec = { + let mut dists = Vec::new(); + for i in 0..n { + for j in (i + 1)..n { + let dist: f64 = random_projected[i].iter().zip(&random_projected[j]) + .map(|(a, b)| (a - b) * (a - b)) + .sum::() + .sqrt(); + dists.push(dist); + } + } + dists + }; + + // Compute Spearman ρ: rank correlation between GT and projected distances + fn spearman_rho(a: &[f64], b: &[f64]) -> f64 { + let n = a.len(); + let rank_a = ranks(a); + let rank_b = ranks(b); + let mean_a: f64 = rank_a.iter().sum::() / n as f64; + let mean_b: f64 = rank_b.iter().sum::() / n as f64; + let mut cov = 0.0f64; + let mut var_a = 0.0f64; + let mut var_b = 0.0f64; + for i in 0..n { + let da = rank_a[i] - mean_a; + let db = rank_b[i] - mean_b; + cov += da * db; + var_a += da * da; + var_b += db * db; + } + if var_a < 1e-10 || var_b < 1e-10 { return 0.0; } + cov / (var_a * var_b).sqrt() + } + + fn ranks(values: &[f64]) -> Vec { + let mut indexed: Vec<(usize, f64)> = values.iter().copied().enumerate().collect(); + indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + let mut result = vec![0.0; values.len()]; + for (rank, (idx, _)) in indexed.into_iter().enumerate() { + result[idx] = rank as f64; + } + result + } + + let rho_golden = spearman_rho(>_distances, &golden_distances); + let rho_random = spearman_rho(>_distances, &random_distances); + + eprintln!("=== Projection Quality (Spearman ρ) ==="); + eprintln!(" Golden-step 17D: ρ = {:.4}", rho_golden); + eprintln!(" Random 17D: ρ = {:.4}", rho_random); + eprintln!(" Δ (golden - random): {:.4}", rho_golden - rho_random); + + // Both should preserve SOME 
ranking (ρ > 0) + assert!(rho_golden > 0.0, "golden-step ρ should be positive"); + assert!(rho_random > 0.0, "random ρ should be positive"); + // The interesting question: is golden better than random? + // We don't assert this — we just measure it. + // If Δ ≈ 0 → golden step is the 52°N problem. + // If Δ > 0.05 → golden step captures real structure. + } + #[test] + #[ignore] // Requires /tmp/tiny_imagenet_200.json (run with --include-ignored) + fn test_bgz17_on_tiny_imagenet() { + // Load real image feature vectors from tiny-imagenet (binary format). + // Generate with: python3 script that saves [d:u32][n:u32][f32 × d × n] + let bytes = match std::fs::read("/tmp/tiny_imagenet_200.bin") { + Ok(b) => b, + Err(_) => { + eprintln!("SKIP: /tmp/tiny_imagenet_200.bin not found"); + return; + } + }; + + let d = u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) as usize; + let n = u32::from_le_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]) as usize; + + let mut vectors: Vec> = Vec::with_capacity(n); + let float_data = &bytes[8..]; + for i in 0..n { + let v: Vec = (0..d) + .map(|j| { + let off = (i * d + j) * 4; + f32::from_le_bytes([float_data[off], float_data[off+1], float_data[off+2], float_data[off+3]]) as f64 + }) + .collect(); + vectors.push(v); + } + + let n = vectors.len(); + eprintln!("Loaded {} vectors of dim {} from tiny-imagenet", n, d); + assert!(n >= 50, "Need at least 50 vectors"); + + // Use first 100 for speed + let n = n.min(100); + let vectors = &vectors[..n]; + + let base_dim = 17; + let golden_step = 11; + + // Ground truth: pairwise L2 distances + let mut gt_distances = Vec::new(); + for i in 0..n { + for j in (i+1)..n { + let dist: f64 = vectors[i].iter().zip(&vectors[j]) + .map(|(a, b)| (a - b) * (a - b)) + .sum::() + .sqrt(); + gt_distances.push(dist); + } + } + + // Golden-step projection + let golden_projected: Vec> = vectors.iter() + .map(|v| { + let n_octaves = (d + base_dim - 1) / base_dim; + let mut sum = vec![0.0f64; base_dim]; + let mut count = vec![0u32; base_dim]; + for octave in 0..n_octaves { + for bi in 0..base_dim { + let dim = octave * base_dim + ((bi * golden_step) % base_dim); + if dim < d { sum[bi] += v[dim]; count[bi] += 1; } + } + } + sum.iter().zip(&count).map(|(&s, &c)| if c > 0 { s / c as f64 } else { 0.0 }).collect() + }) + .collect(); + + // Random projection + let random_matrix: Vec> = (0..base_dim) + .map(|i| (0..d).map(|j| ((i * 7919 + j * 104729) as f64 * 0.00001).sin()).collect()) + .collect(); + let random_projected: Vec> = vectors.iter() + .map(|v| random_matrix.iter().map(|row| row.iter().zip(v).map(|(r, x)| r * x).sum::()).collect()) + .collect(); + + // Simple mean projection (average every 17 consecutive dims) + let mean_projected: Vec> = vectors.iter() + .map(|v| { + (0..base_dim).map(|bi| { + let chunk: Vec = (bi..d).step_by(base_dim).map(|i| v[i]).collect(); + if chunk.is_empty() { 0.0 } else { chunk.iter().sum::() / chunk.len() as f64 } + }).collect() + }) + .collect(); + + // Compute projected distances + fn pairwise_l2(proj: &[Vec]) -> Vec { + let n = proj.len(); + let mut dists = Vec::new(); + for i in 0..n { for j in (i+1)..n { + let d: f64 = proj[i].iter().zip(&proj[j]).map(|(a,b)| (a-b)*(a-b)).sum::().sqrt(); + dists.push(d); + }} + dists + } + + let golden_dists = pairwise_l2(&golden_projected); + let random_dists = pairwise_l2(&random_projected); + let mean_dists = pairwise_l2(&mean_projected); + + // Spearman ρ + fn spearman(a: &[f64], b: &[f64]) -> f64 { + fn ranks(v: &[f64]) -> Vec { + let mut idx: Vec<(usize, 
f64)> = v.iter().copied().enumerate().collect(); + idx.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + let mut r = vec![0.0; v.len()]; + for (rank, (i, _)) in idx.into_iter().enumerate() { r[i] = rank as f64; } + r + } + let ra = ranks(a); let rb = ranks(b); + let n = a.len() as f64; + let ma: f64 = ra.iter().sum::() / n; + let mb: f64 = rb.iter().sum::() / n; + let (mut cov, mut va, mut vb) = (0.0, 0.0, 0.0); + for i in 0..a.len() { + let (da, db) = (ra[i] - ma, rb[i] - mb); + cov += da * db; va += da * da; vb += db * db; + } + if va < 1e-10 || vb < 1e-10 { 0.0 } else { cov / (va * vb).sqrt() } + } + + let rho_golden = spearman(>_distances, &golden_dists); + let rho_random = spearman(>_distances, &random_dists); + let rho_mean = spearman(>_distances, &mean_dists); + + eprintln!("=== bgz17 on Tiny ImageNet (real pixel data) ==="); + eprintln!(" Golden-step 17D: ρ = {:.4}", rho_golden); + eprintln!(" Random 17D: ρ = {:.4}", rho_random); + eprintln!(" Mean-stride 17D: ρ = {:.4}", rho_mean); + eprintln!(" Δ golden-random: {:.4}", rho_golden - rho_random); + eprintln!(" Δ golden-mean: {:.4}", rho_golden - rho_mean); + eprintln!(); + if (rho_golden - rho_random).abs() < 0.02 { + eprintln!(" VERDICT: Golden-step ≈ random on pixel data (52°N problem)"); + eprintln!(" bgz17's value is NOT in the projection axes"); + eprintln!(" bgz17's value IS in the distance table + cascade infrastructure"); + } else if rho_golden > rho_random + 0.02 { + eprintln!(" VERDICT: Golden-step > random! The Fibonacci structure captures something."); + } else { + eprintln!(" VERDICT: Random > golden-step. Golden-step is WORSE for this data."); + } + + // Basic sanity: golden-step should preserve reasonable ranking + assert!(rho_golden > 0.3, "golden ρ too low: {}", rho_golden); + // Random projection CAN be very low on structured data — that's expected + assert!(rho_random > -0.5, "random ρ impossibly low: {}", rho_random); + } }