src/hpc/gguf.rs (20 additions, 2 deletions)
@@ -439,8 +439,26 @@ pub fn f16_to_f32(bits: u16) -> f32 {
return f32::from_bits(f32_bits);
}
if exp == 31 {
// Inf or NaN
let f32_bits = (sign << 31) | (0xFF << 23) | (mantissa << 13);
// Inf or NaN. IEEE 754 recommends producing a quiet NaN (QNaN) from
// F16 NaN inputs, which means setting the top mantissa bit (bit 22
// of F32 = 0x00400000) in addition to the shifted payload. The
// original implementation here left the quiet bit clear, producing
// a signaling NaN (SNaN), which is a bit-level mismatch against
// IEEE-correct references like the `half` crate. Finite-value
// upcasts were unaffected.
//
// This fix was landed alongside `examples/probe_jina_v5_safetensors.rs`
// in `lance-graph/crates/thinking-engine`, which round-trips all
// 65,536 F16 bit patterns through this method and is the regression
// test proving IEEE correctness over the full domain (±0, subnormals,
// normals, ±∞, every NaN payload).
let f32_bits = if mantissa == 0 {
// Infinity: just sign + exponent, no mantissa, no quiet bit.
(sign << 31) | 0x7f800000
} else {
// NaN: sign + exponent + quiet bit + shifted payload.
(sign << 31) | 0x7fc00000 | (mantissa << 13)
};
return f32::from_bits(f32_bits);
}
// Normal
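A hedged check of the property the new comment describes: sweep all 65,536 F16 bit patterns and compare encodings against the `half` crate. This is a sketch only; the PR's actual harness is `examples/probe_jina_v5_safetensors.rs`, and `half` is assumed to be available as a dev-dependency:

```rust
#[test]
fn f16_to_f32_matches_half_exhaustively() {
    for bits in 0u16..=u16::MAX {
        let ours = f16_to_f32(bits).to_bits();
        let reference = f32::from(half::f16::from_bits(bits)).to_bits();
        // Compare bit patterns, not values: NaN != NaN would mask payload bugs.
        assert_eq!(ours, reference, "mismatch at F16 pattern {bits:#06x}");
    }
}
```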
src/hpc/jina/runtime.rs (136 additions, 6 deletions)
@@ -13,8 +13,20 @@ use std::sync::LazyLock;

/// Embedded weight files (compiled into the binary via include_bytes!).
/// Zero file I/O at runtime — the weights ARE the binary.
static JINA_BASE17: &[u8] = include_bytes!("weights/jina_base17_20k.bin");
static JINA_PALETTE: &[u8] = include_bytes!("weights/jina_palette_20k.bin");
///
/// Naming convention: {model}_{aspect}_{vocab_size}k.bin
/// - aspect = base17 (token embeddings) or palette (256-entry lookup)
/// - vocab_size = approximate token count in thousands
static JINA_V4_BASE17: &[u8] = include_bytes!("weights/jina_base17_20k.bin");
static JINA_V4_PALETTE: &[u8] = include_bytes!("weights/jina_palette_20k.bin");

// TODO(jina-v5-bake): When the bake pipeline produces Jina v5 weights
// (151K Qwen3 BPE tokens, 1024D hidden → 34-byte Base17), add:
// static JINA_V5_BASE17: &[u8] = include_bytes!("weights/jina_v5_base17_151k.bin");
// static JINA_V5_PALETTE: &[u8] = include_bytes!("weights/jina_v5_palette_151k.bin");
// Then swap the `JINA` LazyLock load line below to use JinaV5. See the
// `JINA` / `JINA_V4` statics near the end of the file for the wiring
// (`JINA_V5` would slot in alongside them).

static GPT2_BASE17: &[u8] = include_bytes!("weights/gpt2_base17_50k.bin");
static GPT2_PALETTE: &[u8] = include_bytes!("weights/gpt2_palette_50k.bin");
static BERT_BASE17: &[u8] = include_bytes!("weights/bert_base17_30k.bin");
@@ -23,9 +35,91 @@ static BERT_PALETTE: &[u8] = include_bytes!("weights/bert_palette_30k.bin");
/// Which model's weights to use.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ModelSource {
/// Jina v4 text-retrieval (20K tokens, 2048D original).
/// Jina v4 text-retrieval (20K tokens, 2048D original, XLM-R base).
/// LEGACY route. Kept for backward compatibility and direct-access callers
/// that specifically need v4 behavior. Weights pre-baked at
/// `weights/jina_base17_20k.bin` + `weights/jina_palette_20k.bin`.
JinaV4,
/// GPT-2 small (50K tokens, 768D original). Same BPE as Jina.
/// Jina v5 small (151K tokens, 1024D hidden, Qwen 3.5 base, SiLU activation).
/// Also known as **Reader-LM v3** (same model, alternate name — BERT 3.x
/// architecture lineage; NOT the older Qwen2-based Reader-LM 1.5B/v1/v2).
///
/// **MAIN ROUTE** per AdaWorldAPI model registry (`lance-graph/CLAUDE.md`
/// → Model Registry → Production models): Jina v5 is the canonical
/// ground-truth anchor. Same Qwen 3.x BPE as Reranker v3, Qwopus.
///
/// # Storage format on disk (verified by probe)
///
/// The downloaded safetensors at
/// `lance-graph/crates/thinking-engine/data/jina-v5-onnx/model.safetensors`
/// is **BF16**, not F16. Every tensor in that 1.19 GB file is stored as
/// BF16 per the safetensors JSON header, verified by
/// `crates/thinking-engine/examples/probe_jina_v5_safetensors.rs`. The
/// embedding matrix is `embed_tokens.weight` shape `[151936, 1024]`
/// (311 MB BF16). Earlier canonical notes that said "Jina v5 is published
/// in F16 only" were incorrect for this specific export; other Jina v5
/// exports (ONNX, GGUF) may use different dtypes.
///
/// The tokenizer lives at `data/jina-v5-tokenizer.json` (flat under the
/// `data/` directory — NOT under `data/jina-v5-onnx/`). The tokenizer
/// reports vocab size = 151669, while the safetensors embedding matrix
/// has 151936 rows. Rows `[151669, 151936)` are ghost/unreachable
/// (fine-tune-trimmed vocabulary kept aligned for hardware efficiency).
/// Pair samplers MUST use `min(tokenizer_vocab, embed_rows) = 151669`.
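///
/// A hedged sketch of that clamp (names are illustrative, not the actual
/// sampler API):
///
/// ```ignore
/// // Rows [151_669, 151_936) exist in the matrix but no token id can
/// // reach them, so sampling is bounded by the tokenizer's vocabulary.
/// let effective_vocab = tokenizer_vocab.min(embed_rows); // = 151_669
/// let token_id = rng.gen_range(0..effective_vocab);
/// ```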
///
/// # Precision hierarchy (workspace-wide rule, Jina v5 specifics)
///
/// 1. **Ground truth is the source file, losslessly upcast on demand.**
/// For this file, BF16 source → F32 via the trivial shift
/// [`crate::hpc::quantized::BF16`] scalar method. No F32 Vec is
/// materialized. No F32 "buffer" persists. F32 is a *method*, not a
/// storage format — it lives in registers or a small stack window
/// during computation and is discarded with the consumer.
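/// (A bit-level sketch of this shift follows the list.)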
///
/// 2. **Atomic-clock F16 → F32 method** at
/// [`crate::hpc::gguf::f16_to_f32`] (`src/hpc/gguf.rs:417`) is proven
/// lossless bit-exact over all 65,536 F16 patterns (including
/// subnormals, ±0, ±∞, and NaN payloads with correct IEEE 754 quiet
/// bit). Used by any F16 source (other Jina exports, GGUF files,
/// reranker weights). Not on the Jina v5 safetensors path since that
/// file is BF16.
///
/// 3. **Compute precision is BF16 with fused `mul_add`** via
/// [`crate::hpc::quantized::bf16_gemm_f32`] (`src/hpc/quantized.rs:108`).
/// F32-precision accumulation is a property of the hardware FMA
/// (`VDPBF16PS` on AVX-512-BF16, `BFMMLA` on ARM SVE, AMX on Apple),
/// invisible to the caller. The `F32x16::mul_add` / `F32x8::mul_add`
/// lane types in [`crate::simd`] compile to the appropriate
/// instruction for the target CPU.
///
/// 4. **F16 → BF16 has no exponent-range issue.** BF16 has MORE exponent
/// bits than F16 (8 vs 5), so every F16 value fits inside BF16 range
/// with ~33 orders of magnitude of headroom. The lossy step of
/// F16 → BF16 is a 3-bit mantissa truncation (10 → 7 bits), not an
/// exponent-range violation. Earlier notes that said "F16 max ~65504
/// overflows before reaching BF16 range" were backwards.
///
/// 5. **F64 constants** (π, e, φ, Euler-γ from `std::f64::consts`) are
/// used for calibration math (GammaProfile log/exp), preserved at full
/// 52-bit mantissa precision, and converted to BF16 exactly once per
/// profile as a splatted value. The calibration result is 28 bytes.
///
/// 6. **Storage after calibration**: Base17 i16 fixed-point (34-byte
/// plane) or palette u8 index. Certification against the BF16 source
/// goes through a streaming harness that reads the source once per
/// pass, upcasts in registers, and reports Pearson / Spearman /
/// Cronbach α to 4 decimal places.
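///
/// A minimal sketch of item 1's shift and item 3's fused accumulation
/// (illustrative only; it assumes the [`crate::hpc::quantized::BF16`]
/// scalar method uses the standard layout in which BF16 is the high
/// 16 bits of the F32 encoding):
///
/// ```ignore
/// // BF16 → F32 is lossless: widen and place in the high half.
/// fn bf16_to_f32(bits: u16) -> f32 {
///     f32::from_bits((bits as u32) << 16)
/// }
/// // F32 → BF16 by truncation drops 16 mantissa bits (round-toward-zero);
/// // the RNE path in `crate::simd` rounds to nearest-even instead.
/// fn f32_to_bf16_truncate(x: f32) -> u16 {
///     (x.to_bits() >> 16) as u16
/// }
/// // Item 3's accumulation shape: upcast in registers, fuse multiply-add.
/// fn dot_bf16(a: &[u16], b: &[u16]) -> f32 {
///     a.iter().zip(b).fold(0.0f32, |acc, (&x, &y)| {
///         bf16_to_f32(x).mul_add(bf16_to_f32(y), acc)
///     })
/// }
/// ```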
///
/// # Weight baking status
///
/// Compile-time embedded weights at `weights/jina_v5_*.bin` are not yet
/// produced. Until they are, the `JINA` main-route LazyLock falls back
/// to v4 bytes. When the certification harness proves lab BF16 at
/// ≥ 0.9999 and bgz-hhtl-d at ≥ 0.9980 on the three metrics, the
/// Jina v5 runtime artifacts can be produced from the certified
/// derivation pipeline. See the TODO block above `JINA_V4_BASE17`.
JinaV5,
/// GPT-2 small (50K tokens, 768D original). Same BPE as Jina v4.
Gpt2,
/// BERT base uncased (30K tokens, 768D original). WordPiece tokenizer.
Bert,
@@ -190,9 +284,33 @@ fn build_similarity_table(palette: &JinaPalette) -> [f32; 256] {
// Global LazyLock runtimes — loaded once, used forever
// ============================================================================

/// Jina v4 runtime (20K tokens). LazyLock: zero cost after first access.
/// Jina **main route**. LazyLock: zero cost after first access.
///
/// Today this loads Jina v4 bytes (20K tokens) because v5 weights are not yet
/// baked into `weights/`. When the v5 bake pipeline produces
/// `weights/jina_v5_base17_151k.bin` + `weights/jina_v5_palette_151k.bin`,
/// swap the load line below to:
///
/// ```ignore
/// ModelRuntime::load(ModelSource::JinaV5, JINA_V5_BASE17, JINA_V5_PALETTE)
/// ```
///
/// Callers should use `JINA` for default behavior. Only use `JINA_V4`
/// explicitly when v4-specific behavior is required (e.g., backward-compat
/// tests).
pub static JINA: LazyLock<ModelRuntime> = LazyLock::new(|| {
ModelRuntime::load(ModelSource::JinaV4, JINA_BASE17, JINA_PALETTE)
// TODO(jina-v5-bake): swap to JinaV5 when v5 weights exist.
ModelRuntime::load(ModelSource::JinaV4, JINA_V4_BASE17, JINA_V4_PALETTE)
});

/// Jina **v4 explicit route** (20K tokens, XLM-R base). LEGACY.
///
/// Use this when a caller specifically needs v4 behavior and should NOT be
/// silently upgraded to v5 when the main route is swapped. Today this is
/// functionally identical to `JINA` (both load v4 bytes), but after the v5
/// bake `JINA` will load v5 while `JINA_V4` keeps loading v4.
pub static JINA_V4: LazyLock<ModelRuntime> = LazyLock::new(|| {
ModelRuntime::load(ModelSource::JinaV4, JINA_V4_BASE17, JINA_V4_PALETTE)
});

/// GPT-2 runtime (50K tokens). Same BPE as Jina → interoperable palettes.
@@ -211,12 +329,24 @@ mod tests {

#[test]
fn test_jina_runtime_loads() {
// Main route. Today this is v4; when v5 is baked, update this test to
// assert source == JinaV5 and vocab_size == ~151000.
let rt = &*JINA;
assert_eq!(rt.source, ModelSource::JinaV4);
assert_eq!(rt.vocab_size(), 20000);
assert!((rt.similarity[0] - 1.0).abs() < 0.01, "self-similarity should be ~1.0");
}

#[test]
fn test_jina_v4_explicit_route() {
// Legacy v4-specific accessor. After v5 bake, this test MUST still
// pass (v4 is the backward-compat guarantee — never deleted).
let rt = &*JINA_V4;
assert_eq!(rt.source, ModelSource::JinaV4);
assert_eq!(rt.vocab_size(), 20000);
assert!((rt.similarity[0] - 1.0).abs() < 0.01, "self-similarity should be ~1.0");
}

#[test]
fn test_gpt2_runtime_loads() {
let rt = &*GPT2;
src/simd.rs (14 additions, 0 deletions)
@@ -105,6 +105,20 @@ pub use crate::simd_avx512::{
bf16_to_f32_scalar, f32_to_bf16_scalar,
bf16_to_f32_batch, f32_to_bf16_batch,
};

// BF16 RNE (round-to-nearest-even) path — pure AVX-512-F, byte-exact vs
// hardware `_mm512_cvtneps_pbh` on Sapphire Rapids+ (verified on 1M inputs
// in ndarray::simd_avx512::tests). Consumer code should call
// `f32_to_bf16_batch_rne` in hot loops (500-20000× faster than the scalar
// path via AMX / AVX-512 tiles); `f32_to_bf16_scalar_rne` is exposed only
// as a unit-test reference implementation and MUST NOT be called in hot
// loops per the workspace-wide "never scalar ever" rule for F32→BF16.
// See lance-graph/CLAUDE.md § Certification Process.
#[cfg(target_arch = "x86_64")]
pub use crate::simd_avx512::{
f32_to_bf16_scalar_rne,
f32_to_bf16_batch_rne,
};
// BF16 SIMD types only available when avx512bf16 is enabled at compile time
#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))]
pub use crate::simd_avx512::{BF16x16, BF16x8};
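For reference, the bias-and-shift technique behind the RNE path looks roughly like this (a sketch of the standard trick, not the crate's actual `f32_to_bf16_scalar_rne`):

```rust
/// Round-to-nearest-even F32 → BF16, scalar sketch.
fn f32_to_bf16_rne(x: f32) -> u16 {
    let bits = x.to_bits();
    if x.is_nan() {
        // Quiet the NaN explicitly: adding the rounding bias to a NaN
        // payload could carry into the exponent and yield infinity.
        return ((bits >> 16) as u16) | 0x0040;
    }
    // Bias by 0x7FFF, plus 1 when the retained LSB is odd so that ties
    // round to even, then truncate to the high 16 bits.
    let rounding_bias = 0x7FFF + ((bits >> 16) & 1);
    (bits.wrapping_add(rounding_bias) >> 16) as u16
}
```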