Skip to content
17 changes: 11 additions & 6 deletions src/hpc/gguf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,17 @@ pub fn read_tensor_f32<R: Read + Seek>(
GgmlType::BF16 => {
let mut buf = vec![0u8; n_elements * 2];
reader.read_exact(&mut buf).map_err(|e| e.to_string())?;
Ok(buf.chunks_exact(2)
.map(|c| {
let bits = u16::from_le_bytes([c[0], c[1]]);
bf16_to_f32(bits)
})
.collect())
// Reinterpret u8 pairs as BF16 (same repr) and batch-convert via quantized.rs
// SAFETY: BF16 is #[repr(transparent)] over u16, same layout as [u8; 2] LE pairs.
let bf16_slice: &[super::quantized::BF16] = unsafe {
std::slice::from_raw_parts(
buf.as_ptr() as *const super::quantized::BF16,
n_elements,
)
};
let mut result = vec![0.0f32; n_elements];
super::quantized::bf16_to_f32_slice(bf16_slice, &mut result);
Ok(result)
}
GgmlType::Q8_0 => {
dequantize_q8_0(reader, n_elements)
Expand Down
122 changes: 122 additions & 0 deletions src/hpc/gguf_indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -574,4 +574,126 @@ mod tests {
assert!(stats.tensors_indexed > 0, "should index at least some tensors");
assert!(stats.overall_ratio() > 10.0, "ratio should be significant: {:.1}", stats.overall_ratio());
}

/// End-to-end streaming test: resolves a quantized (IQ1_S) Llama 4 Scout GGUF
/// on HuggingFace, streams it through `stream_index_gguf`, writes the
/// compressed index to /tmp, and prints per-tensor and per-type statistics.
/// Soft-skips (early return) if URL resolution fails, e.g. when offline.
#[test]
#[ignore] // Streams from HuggingFace — requires network + time
fn test_stream_index_llama4_scout_from_hf() {
    use super::super::http_reader::{HttpRangeReader, resolve_hf_url};
    use std::io::BufWriter;

    let repo = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF";
    let filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf";

    eprintln!("Resolving {} / {} ...", repo, filename);
    // Resolution failure is treated as a skip, not a test failure,
    // so the test is harmless in CI environments without network access.
    let (url, size) = match resolve_hf_url(repo, filename) {
        Ok(r) => r,
        Err(e) => { eprintln!("SKIP: {}", e); return; }
    };
    eprintln!(" URL resolved, size: {:.2} GB", size as f64 / 1e9);

    // 256 MB range-request chunks (256 * 1024 * 1024) to reduce HTTP round-trips.
    // NOTE(review): the previous comment here said "16 MB chunks", which
    // contradicts the value — confirm whether 16 MB or 256 MB was intended.
    let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);

    // NOTE(review): the file extension is ".bgz7" but the banner below labels
    // the format "bgz17" — confirm the canonical format name/extension.
    let out_path = "/tmp/llama4_scout.bgz7";
    let out = std::fs::File::create(out_path).expect("create output");
    let mut writer = BufWriter::new(out);

    eprintln!("Streaming index...");
    // Per-tensor progress callback: logs name, classified layer type, and
    // original → compressed byte counts with the resulting ratio.
    let stats = stream_index_gguf(
        &mut reader,
        &mut writer,
        Some(&|name, layer_type, orig, comp| {
            // Guard against division by zero for empty/zero-sized outputs.
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
                name, layer_type, orig, comp, ratio);
        }),
    ).expect("stream_index_gguf");

    // Drop the writer explicitly so the BufWriter flushes before we stat the file.
    drop(writer);
    let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);

    eprintln!();
    eprintln!("=== Llama 4 Scout → bgz17 (streamed from HF) ===");
    eprintln!(" Source: {:.2} GB ({})", size as f64 / 1e9, filename);
    eprintln!(" Output: {:.2} MB ({})", out_size as f64 / 1e6, out_path);
    eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
    eprintln!(" Tensors: {} indexed, {} skipped",
        stats.tensors_indexed, stats.tensors_skipped);
    eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
    eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
    eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
    eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);

    // Per-layer-type breakdown; index order must match the by_type array layout.
    // NOTE(review): this name list is duplicated in the shard-5 test below —
    // consider hoisting to a shared const to keep them in sync.
    let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
    for (i, name) in type_names.iter().enumerate() {
        let (count, orig, comp) = stats.by_type[i];
        if count > 0 {
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.1}×)",
                name, count, orig as f64 / 1e9, comp as f64 / 1e6, ratio);
        }
    }

    assert!(stats.tensors_indexed > 0);
}

/// Streams the fifth BF16 shard of Llama 4 Scout directly by URL (no HF API
/// resolution) and indexes it, verifying the BF16 dequantization path under
/// `stream_index_gguf` on a real multi-gigabyte input.
#[test]
#[ignore] // Streams BF16 shard 5 (18.2 GB) from HuggingFace
fn test_stream_index_llama4_bf16_shard5() {
    use super::super::http_reader::HttpRangeReader;
    use std::io::BufWriter;

    let repo = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF";
    let filename = "BF16/Llama-4-Scout-17B-16E-Instruct-BF16-00005-of-00005.gguf";
    // NOTE(review): shard size is hardcoded rather than resolved — if the
    // upstream file is re-uploaded with a different size, range reads will
    // silently desync; consider resolving the size like the test above.
    let size: u64 = 18_220_000_000; // ~18.2 GB from metadata

    let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, filename);
    eprintln!("Streaming shard 5: {:.2} GB", size as f64 / 1e9);
    eprintln!(" URL: {}", url);

    // 256 MB range-request chunks (256 * 1024 * 1024) for fewer HTTP round-trips.
    // NOTE(review): the previous comment said "16 MB chunks", contradicting the
    // value — confirm whether 16 MB or 256 MB was intended.
    let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);

    // NOTE(review): ".bgz7" extension vs "bgz17" in the banner below — confirm
    // the canonical format name/extension.
    let out_path = "/tmp/llama4_scout_shard5.bgz7";
    let out = std::fs::File::create(out_path).expect("create output");
    let mut writer = BufWriter::new(out);

    // Per-tensor progress callback: name, layer type, original → compressed bytes.
    let stats = stream_index_gguf(
        &mut reader,
        &mut writer,
        Some(&|name, layer_type, orig, comp| {
            // Guard against division by zero for empty/zero-sized outputs.
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
                name, layer_type, orig, comp, ratio);
        }),
    ).expect("stream_index_gguf");

    // Drop the writer explicitly so the BufWriter flushes before we stat the file.
    drop(writer);
    let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);

    eprintln!();
    eprintln!("=== Llama 4 Scout BF16 Shard 5 → bgz17 ===");
    eprintln!(" Source: {:.2} GB (BF16, streamed from HF)", size as f64 / 1e9);
    eprintln!(" Output: {:.2} MB", out_size as f64 / 1e6);
    eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
    eprintln!(" Tensors: {} indexed, {} skipped",
        stats.tensors_indexed, stats.tensors_skipped);
    eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
    eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
    eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
    eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);

    // Per-layer-type breakdown; index order must match the by_type array layout.
    let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
    for (i, name) in type_names.iter().enumerate() {
        let (count, orig, comp) = stats.by_type[i];
        if count > 0 {
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.1}×)",
                name, count, orig as f64 / 1e9, comp as f64 / 1e6, ratio);
        }
    }

    assert!(stats.tensors_indexed > 0);
    // BF16 (2 bytes/elem) dequantized to f32 (4 bytes/elem) doubles the byte
    // count, so original_bytes can exceed the source file size.
    assert!(stats.original_bytes > 0);
}
}
Loading
Loading