diff --git a/.claude/prompts/SESSION_HIDREAM_DIFFUSION.md b/.claude/prompts/SESSION_HIDREAM_DIFFUSION.md
new file mode 100644
index 00000000..2cb10806
--- /dev/null
+++ b/.claude/prompts/SESSION_HIDREAM_DIFFUSION.md
@@ -0,0 +1,149 @@
+# SESSION: HiDream-I1 DiT+MoE — First Diffusion Model Indexing
+
+## MISSION
+
+Index HiDream-I1-Full (17B DiT+MoE, MIT license) through the bgz7 pipeline.
+First cross-domain validation: do image-generation MoE experts show the same
+structural redundancy as LLM MoE experts (Maverick's 123,000×)?
+
+Also diff HiDream's Llama-3.1-8B text encoder against base Llama-3.1-8B
+to see what "learning to see" does to a language model's attention patterns.
+
+## READ FIRST
+
+```bash
+cat src/hpc/safetensors.rs    # read_safetensors_header, stream_index_safetensors_bf16
+cat src/hpc/gguf_indexer.rs   # stream_index_gguf_bf16_with_header (shared core)
+cat src/hpc/causal_diff.rs    # causal_diff, find_reasoning_scaffold
+```
+
+## MODEL MAP
+
+```
+HiDream-ai/HiDream-I1-Full (MIT, ungated)
+
+Transformer (DiT + MoE):
+  transformer/diffusion_pytorch_model-{00001..00007}-of-00007.safetensors
+  Shard 1: 4.99 GB
+  Shard 2: 4.98 GB
+  Shard 3: 4.99 GB
+  Shard 4: 4.98 GB
+  Shard 5: 4.99 GB
+  Shard 6: 4.99 GB
+  Shard 7: 4.29 GB
+  Total:  34.21 GB
+
+Text Encoders:
+  text_encoder/model.safetensors           0.49 GB  (CLIP-L)
+  text_encoder_2/model.safetensors         2.77 GB  (CLIP-G/OpenCLIP ViT-bigG)
+  text_encoder_3/model-00001-of-00002      4.99 GB  (Llama-3.1-8B shard 1)
+  text_encoder_3/model-00002-of-00002      4.53 GB  (Llama-3.1-8B shard 2)
+  Total:                                  12.78 GB
+
+VAE:
+  vae/diffusion_pytorch_model.safetensors  0.16 GB
+
+Grand total: ~47.15 GB
+```
+
+## PHASE 1: Index Transformer (34 GB, ~1 hour)
+
+The DiT+MoE transformer is the main target. Architecture:
+- DiT blocks with self-attention (Q/K/V/O projections)
+- MoE expert layers (gate + expert FFN)
+- Cross-attention (text conditioning)
+- Time-step embeddings
+
+```bash
+cargo test test_stream_index_hidream_transformer \
+  --release -- --ignored --nocapture 2>&1 | tee /tmp/hidream_transformer.log
+```
+
+Expected compression:
+- MoE expert weights: 50,000-100,000× (if similar to Maverick)
+- Attention Q/K/V/O: 500-2,000×
+- Cross-attention: unknown — this is NEW (text→image conditioning)
+- Time embedding MLP: unknown — sinusoidal structure may compress differently
+
+## PHASE 2: Index Text Encoders (13 GB, ~30 min)
+
+Index all three text encoders. The Llama-3.1-8B encoder is especially
+interesting — it's a known architecture fine-tuned for image conditioning.
+
+```bash
+cargo test test_stream_index_hidream_text_encoders \
+  --release -- --ignored --nocapture
+```
+
+## PHASE 3: Diff Llama-3.1-8B (what "seeing" adds to "reading")
+
+Compare HiDream's Llama-3.1-8B (text_encoder_3) against base Llama-3.1-8B
+(unsloth/Llama-3.1-8B, ungated safetensors).
+
+```bash
+# Index base Llama-3.1-8B
+cargo test test_stream_index_llama31_8b_base \
+  --release -- --ignored --nocapture
+
+# Diff
+cargo test test_hidream_llama_diff \
+  --release -- --ignored --nocapture
+```
+
+This diff tells us which attention heads re-routed when a language model
+learned to condition image generation. The Q/K/V/O shift pattern reveals
+what "visual grounding" looks like in weight space.
+
+Cross-reference with the Qwen3.5 reasoning scaffold:
+- Qwen3.5 diff: what "structured reasoning" looks like (Claude distillation)
+- HiDream diff: what "visual grounding" looks like (image conditioning)
+- Same NARS pipeline, different capability injection
+- Do they share attention heads? If yes → multimodal reasoning is routing
+  (see the overlap sketch below)
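+
+A minimal sketch of that overlap check. This ASSUMES find_reasoning_scaffold
+yields block indices (usize) — verify against causal_diff.rs before relying
+on it:
+
+```rust
+use std::collections::HashSet;
+
+/// Hypothetical helper: which scaffold blocks appear in BOTH diffs?
+/// Inputs are the block-index lists from the two find_reasoning_scaffold runs.
+fn shared_blocks(qwen: &[usize], hidream: &[usize]) -> Vec<usize> {
+    let q: HashSet<usize> = qwen.iter().copied().collect();
+    hidream.iter().copied().filter(|b| q.contains(b)).collect()
+}
+```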
+
+## PHASE 4: Cross-Domain MoE Comparison
+
+Compare HiDream's MoE expert compression against Maverick's:
+
+```
+Maverick (LLM):      128 experts, 123,000× on gate/up_exps
+HiDream (diffusion): N experts,   ???× on expert layers
+
+If similar ratios → MoE structural redundancy is architecture-level,
+                    not domain-level. Experts are commodity everywhere.
+If different      → image generation experts specialize more than
+                    language experts (domain shapes expert identity).
+```
+
+## EXPECTED RESULTS
+
+```
+HiDream DiT+MoE transformer (34 GB):
+  Conservative:  5-10 MB (3,500-7,000×)
+  If MoE-heavy:  1-3 MB  (12,000-35,000×)
+
+CLIP-L (0.49 GB):        ~100 KB (5,000×)
+CLIP-G (2.77 GB):        ~500 KB (5,500×)
+Llama-3.1-8B (9.52 GB):  ~2 MB   (5,000×)
+
+Total ~47 GB → ~3-13 MB
+```
+
+## CRITICAL NOTES
+
+1. Use the safetensors path: stream_index_safetensors_bf16 (BF16 precision)
+2. Tensor names will differ from GGUF conventions — classify_tensor and
+   classify_projection may need HiDream-specific patterns
+3. Check the tensor names in the shard-1 header first: the naming convention
+   determines whether classify_tensor catches attention/FFN/MoE correctly
+4. If MoE expert tensors are named differently than the llama.cpp convention,
+   add patterns to classify_tensor BEFORE running (or they'll be classified
+   as generic Attention and compress at lower ratios)
+
+## DELIVERABLES
+
+1. bgz7 indexes: /tmp/hidream_transformer_shard{01-07}.bgz7
+2. bgz7 indexes: /tmp/hidream_clip_l.bgz7, hidream_clip_g.bgz7
+3. bgz7 indexes: /tmp/hidream_llama_enc_shard{01-02}.bgz7 (one per shard)
+4. bgz7 indexes: /tmp/llama31_8b_base_shard{01-04}.bgz7
+5. Diff results: .claude/knowledge/hidream_results.md
+6. Cross-domain MoE comparison: .claude/knowledge/moe_cross_domain.md
diff --git a/src/hpc/safetensors.rs b/src/hpc/safetensors.rs
index 675a0c58..a2c421b2 100644
--- a/src/hpc/safetensors.rs
+++ b/src/hpc/safetensors.rs
@@ -411,4 +411,177 @@ mod tests {
             stats.tensors_indexed);
         }
     }
+
+    // ── HiDream-I1: DiT+MoE diffusion model ──
+
+    /// Helper: index safetensors shards from a HuggingFace repo.
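+    /// Streams each file over HTTP ranges and writes one `.bgz7` index per
+    /// shard (suffixed `_shardNN` when more than one filename is given).
+    /// Shards whose output file already exists are skipped, so an
+    /// interrupted run can resume. `octave_stride` is passed through to
+    /// stream_index_safetensors_bf16 unchanged.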
+    fn index_safetensors_shards(
+        repo: &str,
+        filenames: &[&str],
+        out_prefix: &str,
+        octave_stride: usize,
+    ) -> Vec<IndexStats> {  // IndexStats: the stats struct stream_index_safetensors_bf16 returns
+        use super::super::http_reader::HttpRangeReader;
+        use std::io::BufWriter;
+
+        let mut all_stats = Vec::new();
+
+        for (i, filename) in filenames.iter().enumerate() {
+            let shard = i + 1;
+            let out_path = if filenames.len() == 1 {
+                format!("{}.bgz7", out_prefix)
+            } else {
+                format!("{}_shard{:02}.bgz7", out_prefix, shard)
+            };
+
+            if std::fs::metadata(&out_path).is_ok() {
+                eprintln!("SKIP {} (exists)", out_path);
+                continue;
+            }
+
+            let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, filename);
+            eprintln!("Indexing {}/{}: {}", shard, filenames.len(), filename);
+
+            // HEAD for size. With -L, curl prints the headers of every hop,
+            // so take the LAST Content-Length (the CDN's, not the 302's).
+            let size_str = std::process::Command::new("curl")
+                .args(&["-sI", "-L", &url])
+                .output()
+                .map(|o| String::from_utf8_lossy(&o.stdout).to_string())
+                .unwrap_or_default();
+            let size: u64 = size_str.lines()
+                .filter(|l| l.to_lowercase().starts_with("content-length:"))
+                .filter_map(|l| l.split(':').nth(1))
+                .filter_map(|s| s.trim().parse().ok())
+                .last()
+                .unwrap_or(5_500_000_000); // fallback: generous shard-size guess
+
+            let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);
+            let out = std::fs::File::create(&out_path).expect("create output");
+            let mut writer = BufWriter::new(out);
+
+            let stats = super::stream_index_safetensors_bf16(
+                &mut reader, &mut writer, octave_stride,
+                Some(&|name, _lt, orig, comp| {
+                    let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
+                    eprintln!("  {:50} {:>12} → {:>8}  ({:.0}×)", name, orig, comp, ratio);
+                }),
+            ).expect("safetensors indexing failed");
+
+            drop(writer);
+            let out_size = std::fs::metadata(&out_path).map(|m| m.len()).unwrap_or(0);
+            eprintln!("  → {:.2} MB, {} tensors, {:.0}×",
+                out_size as f64 / 1e6, stats.tensors_indexed, stats.overall_ratio());
+
+            all_stats.push(stats);
+        }
+
+        all_stats
+    }
+
+    #[test]
+    #[ignore] // Streams ~34 GB from HuggingFace
+    fn test_stream_index_hidream_transformer() {
+        let repo = "HiDream-ai/HiDream-I1-Full";
+        let shards: Vec<&str> = (1..=7).map(|i| {
+            // Leak the string so it lives long enough — test only
+            Box::leak(format!(
+                "transformer/diffusion_pytorch_model-{:05}-of-00007.safetensors", i
+            ).into_boxed_str()) as &str
+        }).collect();
+
+        let stats = index_safetensors_shards(repo, &shards, "/tmp/hidream_transformer", 16);
+
+        let total_tensors: usize = stats.iter().map(|s| s.tensors_indexed).sum();
+        let total_orig: u64 = stats.iter().map(|s| s.original_bytes).sum();
+        let total_comp: u64 = stats.iter().map(|s| s.compressed_bytes).sum();
+
+        eprintln!();
+        eprintln!("━━━ HiDream-I1 Transformer (DiT+MoE) ━━━");
+        eprintln!("  Source:     {:.2} GB", total_orig as f64 / 1e9);
+        eprintln!("  Compressed: {:.2} MB", total_comp as f64 / 1e6);
+        eprintln!("  Ratio:      {:.0}×", total_orig as f64 / total_comp.max(1) as f64);
+        eprintln!("  Tensors:    {}", total_tensors);
+
+        assert!(total_tensors > 50);
+    }
+
+    #[test]
+    #[ignore] // Streams ~13 GB
+    fn test_stream_index_hidream_text_encoders() {
+        let repo = "HiDream-ai/HiDream-I1-Full";
+
+        // CLIP-L
+        eprintln!("━━━ CLIP-L ━━━");
+        index_safetensors_shards(repo,
+            &["text_encoder/model.safetensors"],
+            "/tmp/hidream_clip_l", 16);
+
+        // CLIP-G
+        eprintln!("━━━ CLIP-G ━━━");
+        index_safetensors_shards(repo,
+            &["text_encoder_2/model.safetensors"],
+            "/tmp/hidream_clip_g", 16);
+
+        // Llama-3.1-8B text encoder (2 shards)
+        eprintln!("━━━ Llama-3.1-8B (HiDream text encoder) ━━━");
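+        // Writes /tmp/hidream_llama_enc_shard{01,02}.bgz7 — the same paths
+        // test_hidream_llama_diff reads back below.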
&["text_encoder_3/model-00001-of-00002.safetensors", + "text_encoder_3/model-00002-of-00002.safetensors"], + "/tmp/hidream_llama_enc", 16); + } + + #[test] + #[ignore] // Streams ~16 GB (base Llama-3.1-8B) + fn test_stream_index_llama31_8b_base() { + let repo = "unsloth/Llama-3.1-8B"; + let shards: Vec<&str> = (1..=4).map(|i| { + Box::leak(format!( + "model-{:05}-of-00004.safetensors", i + ).into_boxed_str()) as &str + }).collect(); + + index_safetensors_shards(repo, &shards, "/tmp/llama31_8b_base", 16); + } + + #[test] + #[ignore] // Requires: HiDream Llama enc + base Llama indexed + fn test_hidream_llama_diff() { + use super::super::causal_diff::{causal_diff, print_diff_summary, find_reasoning_scaffold}; + + // Compare HiDream's Llama-3.1-8B (image-conditioned) vs base + // Shards need to be concatenated or diffed per-shard + let pairs = [ + ("/tmp/llama31_8b_base_shard01.bgz7", "/tmp/hidream_llama_enc_shard01.bgz7", "shard 1"), + ("/tmp/llama31_8b_base_shard02.bgz7", "/tmp/hidream_llama_enc_shard02.bgz7", "shard 2"), + ]; + + let mut total_shifted = 0usize; + let mut total_compared = 0usize; + + for (base, dist, label) in &pairs { + if !std::fs::metadata(base).is_ok() || !std::fs::metadata(dist).is_ok() { + eprintln!("SKIP {} (files not found)", label); + continue; + } + + let (edges, stats) = causal_diff(base, dist, 100).expect("diff failed"); + print_diff_summary( + &format!("Llama-3.1-8B: base vs HiDream image encoder ({})", label), + &stats, edges.len()); + + let scaffold = find_reasoning_scaffold(&edges, 0.3); + eprintln!(" Visual grounding scaffold blocks: {:?}", scaffold); + + total_shifted += stats.rows_shifted; + total_compared += stats.rows_compared; + } + + if total_compared > 0 { + eprintln!(); + eprintln!("━━━ Cross-Domain Insight ━━━"); + eprintln!(" Total rows shifted: {}/{} ({:.1}%)", + total_shifted, total_compared, + total_shifted as f64 / total_compared as f64 * 100.0); + eprintln!(" → These shifts = what 'visual grounding' looks like in LLM weight space"); + } + } }