From 54ed7eedf351036ccbb4a77e817f2f3332dddf20 Mon Sep 17 00:00:00 2001 From: AdaWorldAPI Date: Mon, 30 Mar 2026 10:24:27 +0200 Subject: [PATCH] feat: MoE gate topology extraction + expert clustering + scaffold cross-ref MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extract_gate_topology() — pulls ffn_gate_inp Base17 rows from bgz7, one row per expert. Each row IS the expert's structural identity. cluster_experts() — pairwise L1 between experts within each block, connected-component grouping of structurally interchangeable experts. At threshold=500, Maverick's 123,000× compression predicts >90% redundancy. cross_reference_gate_scaffold() — links attention scaffold blocks (Q+O shifted from Qwen3.5 diff) with gate redundancy per block. Routing-dominated blocks = reasoning changes work through the router, not through the expert weights. Tests: - test_maverick_gate_topology: load all 18 Maverick bgz7 shards - test_cross_reference_gate_scaffold: full pipeline connecting Qwen3.5 attention diff with Maverick gate structure --- src/hpc/causal_diff.rs | 277 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) diff --git a/src/hpc/causal_diff.rs b/src/hpc/causal_diff.rs index a7e96c15..e82e2bf9 100644 --- a/src/hpc/causal_diff.rs +++ b/src/hpc/causal_diff.rs @@ -333,6 +333,180 @@ fn nars_revision(a: NarsTruth, b: NarsTruth) -> NarsTruth { NarsTruth::new(f.clamp(0.0, 1.0), c.clamp(0.0, 0.99)) } +// ============================================================================ +// MoE gate topology — expert clustering from router weights +// ============================================================================ + +/// One expert's structural identity from the gate projection. +#[derive(Clone, Debug)] +pub struct ExpertFingerprint { + pub block: u32, + pub expert_idx: usize, + pub base17: Base17, +} + +/// Pairwise expert similarity within a block. +#[derive(Clone, Debug)] +pub struct ExpertCluster { + pub block: u32, + pub n_experts: usize, + /// Mean pairwise L1 distance between experts (lower = more redundant). + pub mean_pairwise_l1: f64, + /// Number of expert pairs with L1 < threshold (structurally interchangeable). + pub redundant_pairs: usize, + /// Total pairs compared. + pub total_pairs: usize, + /// Groups of structurally similar experts (L1 < threshold). + pub groups: Vec>, +} + +/// Extract MoE gate topology from a bgz7 file. +/// +/// Finds all `ffn_gate_inp` tensors (the router gate projections). +/// Each row in the gate tensor = one expert's activation fingerprint. +/// Returns per-block expert fingerprints. +pub fn extract_gate_topology(bgz7_path: &str) -> Result, String> { + let tensors = read_bgz7_file(bgz7_path)?; + let mut fingerprints = Vec::new(); + + for t in &tensors { + // Match router gate tensors + if !t.name.contains("gate_inp") && !t.name.contains("gate.weight") { + continue; + } + // Skip expert FFN gates (gate_exps) — we want the ROUTER gate + if t.name.contains("_exps") { + continue; + } + + let block = extract_block(&t.name).unwrap_or(0); + + for (expert_idx, row) in t.rows.iter().enumerate() { + fingerprints.push(ExpertFingerprint { + block, + expert_idx, + base17: row.clone(), + }); + } + + eprintln!(" Gate: {} → {} experts in block {}", + t.name, t.rows.len(), block); + } + + Ok(fingerprints) +} + +/// Cluster experts within each block by Base17 L1 distance. +/// +/// `redundancy_threshold`: L1 below which two experts are "structurally interchangeable". +/// Suggested: 500 (conservative), 1000 (aggressive). +pub fn cluster_experts( + fingerprints: &[ExpertFingerprint], + redundancy_threshold: u32, +) -> Vec { + // Group by block + let mut by_block: HashMap> = HashMap::new(); + for fp in fingerprints { + by_block.entry(fp.block).or_default().push(fp); + } + + let mut clusters = Vec::new(); + + for (block, experts) in &by_block { + let n = experts.len(); + let mut total_l1 = 0u64; + let mut redundant = 0usize; + let total_pairs = n * (n - 1) / 2; + + // Pairwise L1 + let mut adjacency: Vec> = vec![vec![false; n]; n]; + for i in 0..n { + for j in (i + 1)..n { + let l1 = experts[i].base17.l1(&experts[j].base17); + total_l1 += l1 as u64; + if l1 < redundancy_threshold { + redundant += 1; + adjacency[i][j] = true; + adjacency[j][i] = true; + } + } + } + + let mean_l1 = if total_pairs > 0 { total_l1 as f64 / total_pairs as f64 } else { 0.0 }; + + // Simple connected-component grouping + let mut visited = vec![false; n]; + let mut groups = Vec::new(); + for start in 0..n { + if visited[start] { continue; } + let mut group = vec![start]; + visited[start] = true; + let mut stack = vec![start]; + while let Some(node) = stack.pop() { + for neighbor in 0..n { + if !visited[neighbor] && adjacency[node][neighbor] { + visited[neighbor] = true; + group.push(neighbor); + stack.push(neighbor); + } + } + } + if group.len() > 1 { + groups.push(group); + } + } + + eprintln!(" Block {:>2}: {} experts, mean_L1={:.0}, redundant_pairs={}/{} ({:.0}%), groups={}", + block, n, mean_l1, redundant, total_pairs, + if total_pairs > 0 { redundant as f64 / total_pairs as f64 * 100.0 } else { 0.0 }, + groups.len()); + + clusters.push(ExpertCluster { + block: *block, + n_experts: n, + mean_pairwise_l1: mean_l1, + redundant_pairs: redundant, + total_pairs, + groups, + }); + } + + clusters.sort_by_key(|c| c.block); + clusters +} + +/// Cross-reference gate topology with attention scaffold. +/// +/// For each scaffold block (where Q+O shifted), check if the gate +/// in that block has high expert redundancy. High redundancy + scaffold +/// = the reasoning change works THROUGH the router, not the experts. +pub fn cross_reference_gate_scaffold( + clusters: &[ExpertCluster], + scaffold_blocks: &[u32], +) -> Vec<(u32, bool, f64)> { + let mut results = Vec::new(); + + for block in scaffold_blocks { + if let Some(cluster) = clusters.iter().find(|c| c.block == *block) { + let redundancy_pct = if cluster.total_pairs > 0 { + cluster.redundant_pairs as f64 / cluster.total_pairs as f64 + } else { 0.0 }; + + let is_routing_dominated = redundancy_pct > 0.5; + results.push((*block, is_routing_dominated, redundancy_pct)); + + eprintln!(" Block {:>2}: scaffold={} routing_dominated={} redundancy={:.0}%", + block, true, is_routing_dominated, redundancy_pct * 100.0); + } else { + // No gate in this block (dense layer, not MoE) + results.push((*block, false, 0.0)); + eprintln!(" Block {:>2}: scaffold={} (dense, no MoE gate)", block, true); + } + } + + results +} + // ============================================================================ // Tests // ============================================================================ @@ -516,4 +690,107 @@ mod tests { if truth.frequency > 0.5 { "shifted" } else { "stable" }); } } + + #[test] + #[ignore] // Requires: Maverick bgz7 outputs from shard indexing + fn test_maverick_gate_topology() { + // Load all Maverick shard bgz7 files and extract gate tensors + let mut all_fingerprints = Vec::new(); + + for shard in 1..=18u32 { + let path = format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard); + if !std::fs::metadata(&path).is_ok() { + // Try openchat/weights path + let alt = format!("src/hpc/openchat/weights/llama4_maverick_shard{:02}.bgz7", shard); + if !std::fs::metadata(&alt).is_ok() { + eprintln!("SKIP shard {} (not found)", shard); + continue; + } + match extract_gate_topology(&alt) { + Ok(fps) => all_fingerprints.extend(fps), + Err(e) => eprintln!("WARN shard {}: {}", shard, e), + } + continue; + } + match extract_gate_topology(&path) { + Ok(fps) => all_fingerprints.extend(fps), + Err(e) => eprintln!("WARN shard {}: {}", shard, e), + } + } + + eprintln!(); + eprintln!("Total expert fingerprints: {}", all_fingerprints.len()); + + if all_fingerprints.is_empty() { + eprintln!("No gate tensors found — Maverick may not have been indexed yet"); + return; + } + + // Cluster experts + let clusters = cluster_experts(&all_fingerprints, 500); + + eprintln!(); + eprintln!("━━━ Maverick Gate Topology ━━━"); + let total_redundant: usize = clusters.iter().map(|c| c.redundant_pairs).sum(); + let total_pairs: usize = clusters.iter().map(|c| c.total_pairs).sum(); + eprintln!(" Overall redundancy: {}/{} pairs ({:.0}%)", + total_redundant, total_pairs, + if total_pairs > 0 { total_redundant as f64 / total_pairs as f64 * 100.0 } else { 0.0 }); + + // NARS truth for expert redundancy + let f = if total_pairs > 0 { total_redundant as f32 / total_pairs as f32 } else { 0.0 }; + let c = (1.0 - 1.0 / (1.0 + total_pairs as f32)).min(0.99); + eprintln!(" NARS truth: f={:.3} c={:.3}", f, c); + eprintln!(" Interpretation: {:.0}% of expert pairs are structurally interchangeable", f * 100.0); + } + + #[test] + #[ignore] // Requires: both Maverick bgz7 + Qwen3.5 diff results + fn test_cross_reference_gate_scaffold() { + // This test connects the two analyses: + // 1. Attention scaffold from Qwen3.5 diff (which blocks have Q+O shift) + // 2. Gate topology from Maverick (which blocks have redundant experts) + + // Step 1: Run the Qwen3.5 diff to find scaffold blocks + let base = "/tmp/qwen35_27b_base.bgz7"; + let dist = "/tmp/qwen35_27b_distilled_v1.bgz7"; + + if !std::fs::metadata(base).is_ok() || !std::fs::metadata(dist).is_ok() { + eprintln!("SKIP: Qwen3.5 bgz7 files not found"); + return; + } + + let (edges, _stats) = causal_diff(base, dist, 100).expect("diff failed"); + let scaffold_blocks = find_reasoning_scaffold(&edges, 0.3); + + // Step 2: Extract Maverick gate topology + let mut all_fps = Vec::new(); + for shard in 1..=18u32 { + let path = format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard); + if let Ok(fps) = extract_gate_topology(&path) { + all_fps.extend(fps); + } + } + + if all_fps.is_empty() { + eprintln!("SKIP: No Maverick gate fingerprints"); + return; + } + + let clusters = cluster_experts(&all_fps, 500); + + // Step 3: Cross-reference + eprintln!(); + eprintln!("━━━ Cross-Reference: Attention Scaffold × Gate Topology ━━━"); + let results = cross_reference_gate_scaffold(&clusters, &scaffold_blocks); + + let routing_dominated: usize = results.iter().filter(|(_, rd, _)| *rd).count(); + eprintln!(); + eprintln!(" Scaffold blocks: {}", scaffold_blocks.len()); + eprintln!(" Routing-dominated: {}/{} ({:.0}%)", + routing_dominated, results.len(), + if !results.is_empty() { routing_dominated as f64 / results.len() as f64 * 100.0 } else { 0.0 }); + eprintln!(" → {} = reasoning changes work THROUGH the router", + if routing_dominated > results.len() / 2 { "YES" } else { "PARTIAL" }); + } }