Skip to content
17 changes: 11 additions & 6 deletions src/hpc/gguf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,17 @@ pub fn read_tensor_f32<R: Read + Seek>(
GgmlType::BF16 => {
let mut buf = vec![0u8; n_elements * 2];
reader.read_exact(&mut buf).map_err(|e| e.to_string())?;
Ok(buf.chunks_exact(2)
.map(|c| {
let bits = u16::from_le_bytes([c[0], c[1]]);
bf16_to_f32(bits)
})
.collect())
// Reinterpret u8 pairs as BF16 (same repr) and batch-convert via quantized.rs
// SAFETY: BF16 is #[repr(transparent)] over u16, same layout as [u8; 2] LE pairs.
let bf16_slice: &[super::quantized::BF16] = unsafe {
std::slice::from_raw_parts(
buf.as_ptr() as *const super::quantized::BF16,
n_elements,
)
};
let mut result = vec![0.0f32; n_elements];
super::quantized::bf16_to_f32_slice(bf16_slice, &mut result);
Ok(result)
}
GgmlType::Q8_0 => {
dequantize_q8_0(reader, n_elements)
Expand Down
122 changes: 122 additions & 0 deletions src/hpc/gguf_indexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -574,4 +574,126 @@ mod tests {
assert!(stats.tensors_indexed > 0, "should index at least some tensors");
assert!(stats.overall_ratio() > 10.0, "ratio should be significant: {:.1}", stats.overall_ratio());
}

/// End-to-end streaming test: resolves a quantized (IQ1_S) Llama 4 Scout GGUF
/// on HuggingFace, streams it through `stream_index_gguf`, writes the
/// compressed index to /tmp, and prints per-tensor and per-type statistics.
/// Soft-skips (early return) if URL resolution fails, e.g. when offline.
#[test]
#[ignore] // Streams from HuggingFace — requires network + time
fn test_stream_index_llama4_scout_from_hf() {
    use super::super::http_reader::{HttpRangeReader, resolve_hf_url};
    use std::io::BufWriter;

    let repo = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF";
    let filename = "Llama-4-Scout-17B-16E-Instruct-UD-IQ1_S.gguf";

    eprintln!("Resolving {} / {} ...", repo, filename);
    // Resolution failure is treated as a skip, not a test failure,
    // so the test is harmless in CI environments without network access.
    let (url, size) = match resolve_hf_url(repo, filename) {
        Ok(r) => r,
        Err(e) => { eprintln!("SKIP: {}", e); return; }
    };
    eprintln!(" URL resolved, size: {:.2} GB", size as f64 / 1e9);

    // 256 MB range-request chunks (256 * 1024 * 1024) to reduce HTTP round-trips.
    // NOTE(review): the previous comment here said "16 MB chunks", which
    // contradicts the value — confirm whether 16 MB or 256 MB was intended.
    let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);

    // NOTE(review): the file extension is ".bgz7" but the banner below labels
    // the format "bgz17" — confirm the canonical format name/extension.
    let out_path = "/tmp/llama4_scout.bgz7";
    let out = std::fs::File::create(out_path).expect("create output");
    let mut writer = BufWriter::new(out);

    eprintln!("Streaming index...");
    // Per-tensor progress callback: logs name, classified layer type, and
    // original → compressed byte counts with the resulting ratio.
    let stats = stream_index_gguf(
        &mut reader,
        &mut writer,
        Some(&|name, layer_type, orig, comp| {
            // Guard against division by zero for empty/zero-sized outputs.
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
                name, layer_type, orig, comp, ratio);
        }),
    ).expect("stream_index_gguf");

    // Drop the writer explicitly so the BufWriter flushes before we stat the file.
    drop(writer);
    let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);

    eprintln!();
    eprintln!("=== Llama 4 Scout → bgz17 (streamed from HF) ===");
    eprintln!(" Source: {:.2} GB ({})", size as f64 / 1e9, filename);
    eprintln!(" Output: {:.2} MB ({})", out_size as f64 / 1e6, out_path);
    eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
    eprintln!(" Tensors: {} indexed, {} skipped",
        stats.tensors_indexed, stats.tensors_skipped);
    eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
    eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
    eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
    eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);

    // Per-layer-type breakdown; index order must match the by_type array layout.
    // NOTE(review): this name list is duplicated in the shard-5 test below —
    // consider hoisting to a shared const to keep them in sync.
    let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
    for (i, name) in type_names.iter().enumerate() {
        let (count, orig, comp) = stats.by_type[i];
        if count > 0 {
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.1}×)",
                name, count, orig as f64 / 1e9, comp as f64 / 1e6, ratio);
        }
    }

    assert!(stats.tensors_indexed > 0);
}

/// Streams the fifth BF16 shard of Llama 4 Scout directly by URL (no HF API
/// resolution) and indexes it, verifying the BF16 dequantization path under
/// `stream_index_gguf` on a real multi-gigabyte input.
#[test]
#[ignore] // Streams BF16 shard 5 (18.2 GB) from HuggingFace
fn test_stream_index_llama4_bf16_shard5() {
    use super::super::http_reader::HttpRangeReader;
    use std::io::BufWriter;

    let repo = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF";
    let filename = "BF16/Llama-4-Scout-17B-16E-Instruct-BF16-00005-of-00005.gguf";
    // NOTE(review): shard size is hardcoded rather than resolved — if the
    // upstream file is re-uploaded with a different size, range reads will
    // silently desync; consider resolving the size like the test above.
    let size: u64 = 18_220_000_000; // ~18.2 GB from metadata

    let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, filename);
    eprintln!("Streaming shard 5: {:.2} GB", size as f64 / 1e9);
    eprintln!(" URL: {}", url);

    // 256 MB range-request chunks (256 * 1024 * 1024) for fewer HTTP round-trips.
    // NOTE(review): the previous comment said "16 MB chunks", contradicting the
    // value — confirm whether 16 MB or 256 MB was intended.
    let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);

    // NOTE(review): ".bgz7" extension vs "bgz17" in the banner below — confirm
    // the canonical format name/extension.
    let out_path = "/tmp/llama4_scout_shard5.bgz7";
    let out = std::fs::File::create(out_path).expect("create output");
    let mut writer = BufWriter::new(out);

    // Per-tensor progress callback: name, layer type, original → compressed bytes.
    let stats = stream_index_gguf(
        &mut reader,
        &mut writer,
        Some(&|name, layer_type, orig, comp| {
            // Guard against division by zero for empty/zero-sized outputs.
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
                name, layer_type, orig, comp, ratio);
        }),
    ).expect("stream_index_gguf");

    // Drop the writer explicitly so the BufWriter flushes before we stat the file.
    drop(writer);
    let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);

    eprintln!();
    eprintln!("=== Llama 4 Scout BF16 Shard 5 → bgz17 ===");
    eprintln!(" Source: {:.2} GB (BF16, streamed from HF)", size as f64 / 1e9);
    eprintln!(" Output: {:.2} MB", out_size as f64 / 1e6);
    eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
    eprintln!(" Tensors: {} indexed, {} skipped",
        stats.tensors_indexed, stats.tensors_skipped);
    eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
    eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
    eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
    eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);

    // Per-layer-type breakdown; index order must match the by_type array layout.
    let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
    for (i, name) in type_names.iter().enumerate() {
        let (count, orig, comp) = stats.by_type[i];
        if count > 0 {
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.1}×)",
                name, count, orig as f64 / 1e9, comp as f64 / 1e6, ratio);
        }
    }

    assert!(stats.tensors_indexed > 0);
    // BF16 (2 bytes/elem) dequantized to f32 (4 bytes/elem) doubles the byte
    // count, so original_bytes can exceed the source file size.
    assert!(stats.original_bytes > 0);
}
}
Loading
Loading