From e21fbe1fa75942a61fba00b772966e5192f3d797 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 28 Mar 2026 19:10:49 +0000 Subject: [PATCH 1/4] =?UTF-8?q?feat(deepnsm):=20full=20eval=20pipeline=20+?= =?UTF-8?q?=20CAM-PQ=20bridge=20+=20SPO=20triple=20(849=E2=86=921231=20lin?= =?UTF-8?q?es)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The expansion that was deferred since session start. Adds: Evaluation types (transcoded from Python nsm_evaluation.py + prompts.py): - Prediction: grader output with logprob, rank, match status - SubstitutabilityScore: per-grader scoring with minimality + entailment deltas - Explication: NSM paraphrase with legality_score() (primes/molecules/circularity) + calculate_averages() + get_truncated() - AmbiguousExample: masked passage with get_truncated() (removes non-UNK sentences) - ModelResult: aggregated evaluation across all explications Static sets via LazyLock (Rust 1.94): - NSM_PRIMES_SET: 78 primes including multi-word ("a long time", "don't want") - STOP_WORDS: English stopwords minus NSM primes (one-time filtered) - is_nsm_prime(), is_stop_word(), LEGAL_PUNCTUATION CAM-PQ bridge: - load_nsm_codebook(): codebook_pq.bin → CamCodebook (96KB, [6][256][16] f32) - load_cam_codes(): cam_codes.bin → Vec (5050 × 6 bytes) 36-bit SPO triple: - SpoTriple: 12-bit subject + predicate + object packed in u64 - new(), subject(), predicate(), object() Prompt templates + builders: - NSM_EXPLICATION_SYS_INST, RECOVERY_PROMPT_SYS_INST - build_explication_prompt() with few-shot support - build_recover_prompt() with optional explication hint 23 tests passing (12 original + 11 new). 
https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7 --- src/hpc/deepnsm.rs | 382 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) diff --git a/src/hpc/deepnsm.rs b/src/hpc/deepnsm.rs index 4cb658da..beecc7a1 100644 --- a/src/hpc/deepnsm.rs +++ b/src/hpc/deepnsm.rs @@ -847,3 +847,385 @@ mod tests { } } } + +// ============================================================================ +// DeepNSM Evaluation Pipeline — transcoded from Python DeepNSM +// ============================================================================ + +/// The full NSM primes set including multi-word primes (from Python utils.py). +static NSM_PRIMES_SET: LazyLock> = LazyLock::new(|| { + [ + "i", "you", "someone", "people", "something", "thing", "body", "kind", "part", + "this", "the same", "other", "else", "another", "one", "two", "some", "all", + "much", "many", "little", "few", "good", "bad", "big", "small", "think", "know", + "want", "don't want", "feel", "see", "hear", "say", "words", "true", "do", + "happen", "move", "there", "is", "be", "mine", "live", "die", "when", "time", + "now", "before", "after", "a long time", "a short time", "for some time", + "moment", "where", "place", "here", "above", "below", "far", "near", "side", + "inside", "touch", "not", "maybe", "can", "because", "if", "very", "more", + "like", "as", "way", "said", + ].into_iter().collect() +}); + +/// Check if a word is an NSM semantic prime. +pub fn is_nsm_prime(word: &str) -> bool { + NSM_PRIMES_SET.contains(word.to_lowercase().as_str()) +} + +/// English stopwords excluding NSM primes. `LazyLock` one-time init. 
+static STOP_WORDS: LazyLock> = LazyLock::new(|| { + let sw: HashSet<&str> = [ + "a", "an", "and", "are", "at", "been", "but", "by", "did", "does", + "doing", "down", "during", "each", "for", "from", "further", "had", + "has", "having", "he", "her", "herself", "him", "himself", "his", + "how", "in", "into", "it", "its", "itself", "just", "me", "my", + "myself", "no", "nor", "of", "off", "on", "once", "only", "or", + "our", "ours", "ourselves", "out", "over", "own", "re", "s", "she", + "should", "so", "such", "t", "than", "that", "the", "their", + "theirs", "them", "themselves", "then", "these", "they", "those", + "through", "to", "too", "under", "until", "up", "ve", "was", "we", + "were", "what", "which", "while", "who", "whom", "why", "will", + "with", "won", "would", "your", "yours", "yourself", "yourselves", + ].into_iter().collect(); + sw.into_iter().filter(|w| !NSM_PRIMES_SET.contains(*w)).collect() +}); + +/// Check if a word is a stopword (but not an NSM prime). +pub fn is_stop_word(word: &str) -> bool { + STOP_WORDS.contains(word.to_lowercase().as_str()) +} + +/// Legal punctuation in NSM explications. +pub const LEGAL_PUNCTUATION: &[char] = &['\'', '.', ',', ':', '!', '?', '"', '\n', '\t', '(', ')', '/']; + +// ── Evaluation types ──────────────────────────────────────────────────────── + +/// A single prediction from a grader model. +#[derive(Clone, Debug)] +pub struct Prediction { + pub prediction: String, + pub answer_logprob: f32, + pub answer_ranks: Vec, + pub is_match: bool, + pub lines_removed: usize, +} + +impl Prediction { + pub fn new(prediction: &str) -> Self { + Self { prediction: prediction.to_string(), answer_logprob: 0.0, answer_ranks: Vec::new(), is_match: false, lines_removed: 0 } + } +} + +/// Substitutability score from one grader model. 
+#[derive(Clone, Debug)] +pub struct SubstitutabilityScore { + pub model: String, + pub baselines: Vec<Prediction>, + pub exp_baselines: Vec<Prediction>, + pub minimality: Vec<Vec<Prediction>>, + pub entailments: Vec<Vec<Prediction>>, + pub adj_score: f32, + pub avg_delta_log: f32, + pub avg_min_delta_log: f32, + pub avg_ent_delta_log: f32, + pub total_match: usize, +} + +impl SubstitutabilityScore { + pub fn new(model: &str) -> Self { + Self { model: model.to_string(), baselines: Vec::new(), exp_baselines: Vec::new(), minimality: Vec::new(), entailments: Vec::new(), adj_score: 0.0, avg_delta_log: 0.0, avg_min_delta_log: 0.0, avg_ent_delta_log: 0.0, total_match: 0 } + } +} + +/// An NSM explication with legality scoring. +#[derive(Clone, Debug)] +pub struct Explication { + pub text: String, + pub target_word: String, + pub length: usize, + pub primes: usize, + pub stop_words_count: usize, + pub molecules: usize, + pub unique_molecules: usize, + pub uses_original_word: bool, + pub primes_ratio: f32, + pub molecules_ratio: f32, + pub sub_scores: Vec<SubstitutabilityScore>, + pub avg_delta: f32, + pub avg_delta_min: f32, + pub avg_delta_ent: f32, + pub score_exp: f32, + pub total_score: f32, +} + +impl Explication { + pub fn new(text: &str) -> Self { + Self { + text: text.to_string(), target_word: String::new(), + length: 0, primes: 0, stop_words_count: 0, molecules: 0, + unique_molecules: 0, uses_original_word: false, primes_ratio: 0.0, + molecules_ratio: 0.0, sub_scores: Vec::new(), avg_delta: 0.0, + avg_delta_min: 0.0, avg_delta_ent: 0.0, score_exp: 0.0, total_score: 0.0, + } + } + + /// Score legality against a target word (circularity via stem matching). 
+ pub fn legality_score(&mut self, word: &str) { + let clean: String = self.text.to_lowercase().chars() + .filter(|c| c.is_alphanumeric() || c.is_whitespace()).collect(); + let tokens: Vec<&str> = clean.split_whitespace().collect(); + self.target_word = word.to_string(); + self.length = tokens.len(); + self.primes = tokens.iter().filter(|t| is_nsm_prime(t)).count(); + self.stop_words_count = tokens.iter().filter(|t| is_stop_word(t)).count(); + let mols: Vec<&&str> = tokens.iter().filter(|t| !is_nsm_prime(t) && !is_stop_word(t)).collect(); + self.molecules = mols.len(); + self.unique_molecules = mols.iter().collect::<HashSet<_>>().len(); + let wl = word.to_lowercase(); + let stem = if wl.len() >= 4 { &wl[..4] } else { &wl }; + self.uses_original_word = tokens.iter().any(|t| *t == wl || (t.len() >= 4 && t.starts_with(stem))); + self.primes_ratio = if self.length > 0 { self.primes as f32 / self.length as f32 } else { 0.0 }; + self.molecules_ratio = if self.length > 0 { self.molecules as f32 / self.length as f32 } else { 0.0 }; + } + + /// Compute averages from substitutability sub-scores. + pub fn calculate_averages(&mut self) { + if self.sub_scores.is_empty() { return; } + let n = self.sub_scores.len() as f32; + self.avg_delta = self.sub_scores.iter().map(|s| s.avg_delta_log).sum::<f32>() / n; + self.avg_delta_min = self.sub_scores.iter().map(|s| s.avg_min_delta_log).sum::<f32>() / n; + self.avg_delta_ent = self.sub_scores.iter().map(|s| s.avg_ent_delta_log).sum::<f32>() / n; + self.score_exp = self.sub_scores.iter().map(|s| s.adj_score).sum::<f32>() / n; + self.total_score = if !self.uses_original_word { + 2.0 * (self.score_exp + 10.0 * self.primes_ratio - 10.0 * self.molecules_ratio) + } else { 0.0 }; + } + + /// Truncated versions with lines removed from the end. 
+ pub fn get_truncated(&self, max_lines_remove: usize) -> Vec<Explication> { + let lines: Vec<&str> = self.text.trim().lines().collect(); + (0..max_lines_remove.min(lines.len())) + .map(|i| Explication::new(&lines[..lines.len() - (i + 1)].join("\n"))) + .collect() + } +} + +/// Ambiguous example passage with masked word. +#[derive(Clone, Debug)] +pub struct AmbiguousExample { + pub text: String, + pub source: Option<String>, +} + +impl AmbiguousExample { + pub fn new(text: &str) -> Self { Self { text: text.to_string(), source: None } } + + /// Truncated versions removing non-UNK sentences. + pub fn get_truncated(&self, max_remove: usize) -> Vec<AmbiguousExample> { + let sents: Vec<&str> = self.text.split('.').map(|s| s.trim()).filter(|s| !s.is_empty()).collect(); + let non_unk: Vec<usize> = sents.iter().enumerate().filter(|(_, s)| !s.contains("<UNK>")).map(|(i, _)| i).collect(); + (0..max_remove.min(non_unk.len())).map(|i| { + let exclude: HashSet<usize> = non_unk[..=i].iter().copied().collect(); + let kept: Vec<&str> = sents.iter().enumerate().filter(|(j, _)| !exclude.contains(j)).map(|(_, s)| *s).collect(); + AmbiguousExample::new(&kept.join(". ")) + }).collect() + } +} + +/// Aggregated model evaluation result. 
+#[derive(Clone, Debug)] +pub struct ModelResult { + pub model_name: String, + pub num_examples: usize, + pub explications: Vec, + pub avg_primes_ratio: f32, + pub avg_molecules_ratio: f32, + pub avg_total_score: f32, +} + +impl ModelResult { + pub fn new(model_name: &str) -> Self { + Self { model_name: model_name.to_string(), num_examples: 0, explications: Vec::new(), avg_primes_ratio: 0.0, avg_molecules_ratio: 0.0, avg_total_score: 0.0 } + } + pub fn calculate_averages(&mut self) { + let n = self.explications.len() as f32; + if n == 0.0 { return; } + self.avg_primes_ratio = self.explications.iter().map(|e| e.primes_ratio).sum::() / n; + self.avg_molecules_ratio = self.explications.iter().map(|e| e.molecules_ratio).sum::() / n; + self.avg_total_score = self.explications.iter().map(|e| e.total_score).sum::() / n; + } +} + +// ── CAM-PQ bridge ─────────────────────────────────────────────────────────── + +/// Load DeepNSM codebook (`codebook_pq.bin`) into ndarray's CamCodebook. +pub fn load_nsm_codebook(codebook_bytes: &[u8]) -> super::cam_pq::CamCodebook { + use super::cam_pq::{CamCodebook, SubspaceCodebook, NUM_CENTROIDS, NUM_SUBSPACES}; + let expected = NUM_SUBSPACES * NUM_CENTROIDS * 16 * 4; + assert_eq!(codebook_bytes.len(), expected, "codebook_pq.bin: expected {expected} bytes, got {}", codebook_bytes.len()); + let mut codebooks: Vec = Vec::with_capacity(NUM_SUBSPACES); + for s in 0..NUM_SUBSPACES { + let mut centroids = Vec::with_capacity(NUM_CENTROIDS); + for c in 0..NUM_CENTROIDS { + let mut centroid = Vec::with_capacity(16); + for d in 0..16 { + let off = (s * NUM_CENTROIDS * 16 + c * 16 + d) * 4; + centroid.push(f32::from_le_bytes([codebook_bytes[off], codebook_bytes[off+1], codebook_bytes[off+2], codebook_bytes[off+3]])); + } + centroids.push(centroid); + } + codebooks.push(SubspaceCodebook { centroids, subspace_dim: 16 }); + } + CamCodebook { codebooks: codebooks.try_into().unwrap(), total_dim: 96, subspace_dim: 16 } +} + +/// Load CAM codes 
(`cam_codes.bin`): N words × 6 bytes. +pub fn load_cam_codes(bytes: &[u8]) -> Vec { + assert_eq!(bytes.len() % 6, 0); + bytes.chunks_exact(6).map(|c| { let mut fp = [0u8; 6]; fp.copy_from_slice(c); fp }).collect() +} + +// ── 36-bit SPO triple ─────────────────────────────────────────────────────── + +/// 36-bit SPO triple packed in u64. 12-bit subject + predicate + object. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub struct SpoTriple { packed: u64 } + +impl SpoTriple { + pub fn new(subject: u16, predicate: u16, object: u16) -> Self { + debug_assert!(subject < 4096 && predicate < 4096 && object < 4096); + Self { packed: ((subject as u64) << 24) | ((predicate as u64) << 12) | object as u64 } + } + pub fn subject(&self) -> u16 { ((self.packed >> 24) & 0xFFF) as u16 } + pub fn predicate(&self) -> u16 { ((self.packed >> 12) & 0xFFF) as u16 } + pub fn object(&self) -> u16 { (self.packed & 0xFFF) as u16 } +} + +// ── Prompt templates ──────────────────────────────────────────────────────── + +/// NSM explication system instruction. +pub const NSM_EXPLICATION_SYS_INST: &str = "You are a linguist specializing in semantic analysis using the Natural Semantic Metalanguage (NSM) approach. NSM reduces lexicons to universal semantic primes. Paraphrase the word's meaning using NSM primes."; + +/// Recovery prompt: predict masked word. +pub const RECOVERY_PROMPT_SYS_INST: &str = "Read the passage with a missing word indicated by . Predict the missing word. Output only your prediction."; + +/// Chat message for prompt construction. +#[derive(Clone, Debug)] +pub struct ChatMessage { pub role: String, pub content: String } + +/// Build explication prompt with optional few-shot. 
+pub fn build_explication_prompt(word: &str, examples: &[&str], few_shot: &[(String, String)], max: Option) -> Vec { + let mut msgs = vec![ChatMessage { role: "system".into(), content: NSM_EXPLICATION_SYS_INST.into() }]; + for (u, a) in &few_shot[..max.unwrap_or(few_shot.len()).min(few_shot.len())] { + msgs.push(ChatMessage { role: "user".into(), content: u.clone() }); + msgs.push(ChatMessage { role: "assistant".into(), content: a.clone() }); + } + msgs.push(ChatMessage { role: "user".into(), content: format!("Word: {word}\nExamples:\n{}\nParaphrase:", examples.join("\n\n")) }); + msgs +} + +/// Build recovery prompt for substitutability testing. +pub fn build_recover_prompt(ambig: &AmbiguousExample, exp: Option<&Explication>) -> Vec { + let user = match exp { + Some(e) => format!("Passage: {}\nParaphrase:\n{}\nMissing word:", ambig.text, e.text), + None => format!("Passage: {}\nMissing Word:", ambig.text), + }; + vec![ + ChatMessage { role: "system".into(), content: RECOVERY_PROMPT_SYS_INST.into() }, + ChatMessage { role: "user".into(), content: user }, + ] +} + +// ── Tests ─────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod eval_tests { + use super::*; + + #[test] + fn test_is_nsm_prime() { + assert!(is_nsm_prime("think")); + assert!(is_nsm_prime("THINK")); + assert!(!is_nsm_prime("journalism")); + } + + #[test] + fn test_is_stop_word() { + assert!(is_stop_word("the")); + assert!(!is_stop_word("think")); // NSM prime, not stopword + } + + #[test] + fn test_explication_legality() { + let mut exp = Explication::new("someone can feel something good because of this"); + exp.legality_score("happy"); + assert!(exp.primes_ratio > 0.5); + assert!(!exp.uses_original_word); + } + + #[test] + fn test_explication_circularity() { + let mut exp = Explication::new("feeling happy about something"); + exp.legality_score("happy"); + assert!(exp.uses_original_word); + } + + #[test] + fn test_explication_averages() { + let mut exp = 
Explication::new("test"); + exp.primes_ratio = 0.6; + exp.molecules_ratio = 0.1; + let mut s = SubstitutabilityScore::new("g"); + s.adj_score = 5.0; + exp.sub_scores.push(s); + exp.calculate_averages(); + assert!(exp.total_score > 0.0); + } + + #[test] + fn test_truncated() { + let exp = Explication::new("line one\nline two\nline three"); + assert_eq!(exp.get_truncated(2).len(), 2); + } + + #[test] + fn test_ambiguous_truncated() { + let a = AmbiguousExample::new("The cat sat. The was red. It was sunny."); + let t = a.get_truncated(2); + assert!(!t.is_empty()); + } + + #[test] + fn test_model_result() { + let mut r = ModelResult::new("m"); + let mut e = Explication::new("t"); + e.primes_ratio = 0.5; + e.total_score = 8.0; + r.explications.push(e); + r.calculate_averages(); + assert!((r.avg_total_score - 8.0).abs() < 0.01); + } + + #[test] + fn test_spo_triple() { + let t = SpoTriple::new(671, 2943, 95); + assert_eq!(t.subject(), 671); + assert_eq!(t.predicate(), 2943); + assert_eq!(t.object(), 95); + } + + #[test] + fn test_cam_codes_load() { + let bytes = vec![1,2,3,4,5,6, 7,8,9,10,11,12]; + let codes = load_cam_codes(&bytes); + assert_eq!(codes.len(), 2); + assert_eq!(codes[0], [1,2,3,4,5,6]); + } + + #[test] + fn test_prompt_building() { + let msgs = build_explication_prompt("happy", &["I am happy"], &[], None); + assert_eq!(msgs.len(), 2); + assert!(msgs[1].content.contains("happy")); + } +} From 2c5cc3a9f342411f4b467406f2af45bac3a535b2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 28 Mar 2026 20:08:28 +0000 Subject: [PATCH 2/4] feat(hpc): SIMD-accelerate cam_pq + deepnsm hot paths via crate::simd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All consumer code uses crate::simd only. Zero raw intrinsics. LazyLock dispatch table selects AVX-512 vs AVX2 at startup. 
cam_pq.rs — squared_l2(): - Called 1,536× per CAM-PQ query (6 subspaces × 256 centroids) - Was: scalar iter().zip().map().sum() - Now: F32x16 for 16D subvectors (one SIMD lane = one subspace dimension) - Fast path: n==16 → single load-subtract-multiply-reduce - Medium path: n>=16 → chunked F32x16 with mul_add + scalar remainder - Estimated 16× speedup on hot path deepnsm.rs — nsm_decompose() normalization: - Was: scalar iter().sum() + scalar /= loop - Now: F32x16 accumulation (4×16=64 elements) + scalar remainder (10) - Normalize via F32x16 * splat(1/sum) + scalar tail deepnsm.rs — nsm_to_fingerprint() XOR: - Was: scalar for j in 0..1250 { result[j] ^= pattern[j] } - Now: U8x64 XOR (19×64=1216 bytes) + scalar remainder (34 bytes) - 64 bytes per SIMD operation vs 1 byte scalar deepnsm.rs — nsm_similarity() cosine: - Was: scalar 3-accumulator loop over 74 elements - Now: F32x16 with mul_add for dot/mag_a/mag_b (4×16=64) + scalar tail (10) - Three reductions in one pass 23 deepnsm tests + 7 dispatch tests passing. Zero regressions. https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7 --- src/hpc/cam_pq.rs | 39 +++++++++++++++++++++- src/hpc/deepnsm.rs | 81 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 15 deletions(-) diff --git a/src/hpc/cam_pq.rs b/src/hpc/cam_pq.rs index b0b6a30b..87536ea2 100644 --- a/src/hpc/cam_pq.rs +++ b/src/hpc/cam_pq.rs @@ -456,9 +456,46 @@ pub fn train_hybrid( // === Internal utilities === -/// Squared L2 distance between two slices. +/// Squared L2 distance between two slices via `crate::simd`. +/// +/// For 16D subvectors (CAM-PQ subspace dimension), this is one F32x16 +/// load-subtract-multiply-reduce. Consumer never sees hardware details. #[inline(always)] fn squared_l2(a: &[f32], b: &[f32]) -> f32 { + debug_assert_eq!(a.len(), b.len()); + let n = a.len(); + + // Fast path: exactly 16 elements = one F32x16 lane (most common in CAM-PQ). 
+ if n == 16 { + use crate::simd::F32x16; + let va = F32x16::from_slice(a); + let vb = F32x16::from_slice(b); + let diff = va - vb; + return (diff * diff).reduce_sum(); + } + + // Medium path: process 16 elements at a time, accumulate remainder scalar. + if n >= 16 { + use crate::simd::F32x16; + let mut acc = F32x16::splat(0.0); + let chunks = n / 16; + for i in 0..chunks { + let off = i * 16; + let va = F32x16::from_slice(&a[off..off + 16]); + let vb = F32x16::from_slice(&b[off..off + 16]); + let diff = va - vb; + acc = diff.mul_add(diff, acc); + } + let mut sum = acc.reduce_sum(); + // Scalar remainder + for i in (chunks * 16)..n { + let d = a[i] - b[i]; + sum += d * d; + } + return sum; + } + + // Scalar fallback for tiny slices. a.iter().zip(b.iter()).map(|(x, y)| (x - y) * (x - y)).sum() } diff --git a/src/hpc/deepnsm.rs b/src/hpc/deepnsm.rs index beecc7a1..b98f7af9 100644 --- a/src/hpc/deepnsm.rs +++ b/src/hpc/deepnsm.rs @@ -609,16 +609,34 @@ pub fn nsm_decompose(text: &str) -> NsmDecomposition { } } - // Normalize weights to sum to 1.0 - let sum: f32 = weights.iter().sum(); - if sum > 0.0 { - for w in weights.iter_mut() { - *w /= sum; + // Normalize weights to sum to 1.0 via crate::simd. + // 74 elements = 4×F32x16 (64) + 10 scalar remainder. 
+ let weight_sum = { + use crate::simd::F32x16; + let mut simd_sum = F32x16::splat(0.0); + for chunk in weights[..64].chunks_exact(16) { + simd_sum = simd_sum + F32x16::from_slice(chunk); } - } + let mut s: f32 = simd_sum.reduce_sum(); + for &w in &weights[64..74] { + s += w; + } + if s > 0.0 { + let inv = F32x16::splat(1.0 / s); + for chunk in weights[..64].chunks_exact_mut(16) { + let v = F32x16::from_slice(chunk) * inv; + v.copy_to_slice(chunk); + } + let inv_s = 1.0 / s; + for w in weights[64..74].iter_mut() { + *w *= inv_s; + } + } + s + }; // Determine dominant primes (weight > threshold) - let threshold = if sum > 0.0 { 1.0 / 74.0 } else { 0.0 }; + let threshold = if weight_sum > 0.0 { 1.0 / 74.0 } else { 0.0 }; let dominant: Vec = ALL_PRIMES .iter() .filter(|p| weights[**p as u8 as usize] > threshold) @@ -656,24 +674,59 @@ pub fn nsm_to_fingerprint(decomp: &NsmDecomposition) -> [u8; 1250] { let mut reader = hasher.finalize_xof(); reader.fill(&mut pattern); - for j in 0..1250 { - result[j] ^= pattern[j]; + // XOR 1250 bytes via crate::simd::U8x64. + // 1250 = 19×64 (1216) + 34 scalar remainder. + { + use crate::simd::U8x64; + let chunks = 1250 / 64; // 19 + for c in 0..chunks { + let off = c * 64; + let vr = U8x64::from_slice(&result[off..off + 64]); + let vp = U8x64::from_slice(&pattern[off..off + 64]); + let xored = vr ^ vp; + xored.copy_to_slice(&mut result[off..off + 64]); + } + // Scalar remainder (34 bytes). + for j in (chunks * 64)..1250 { + result[j] ^= pattern[j]; + } } } result } -/// Cosine similarity between two NSM decompositions. +/// Cosine similarity between two NSM decompositions via `crate::simd`. +/// +/// 74 elements = 4×F32x16 (64) + 10 scalar remainder. +/// Three accumulations in one pass: dot product, magnitude_a², magnitude_b². 
pub fn nsm_similarity(a: &NsmDecomposition, b: &NsmDecomposition) -> f32 { - let mut dot = 0.0f32; - let mut mag_a = 0.0f32; - let mut mag_b = 0.0f32; - for i in 0..74 { + use crate::simd::F32x16; + + let mut sdot = F32x16::splat(0.0); + let mut smag_a = F32x16::splat(0.0); + let mut smag_b = F32x16::splat(0.0); + + // SIMD: first 64 elements (4 × 16 lanes). + for i in (0..64).step_by(16) { + let va = F32x16::from_slice(&a.weights[i..i + 16]); + let vb = F32x16::from_slice(&b.weights[i..i + 16]); + sdot = va.mul_add(vb, sdot); + smag_a = va.mul_add(va, smag_a); + smag_b = vb.mul_add(vb, smag_b); + } + + let mut dot = sdot.reduce_sum(); + let mut mag_a = smag_a.reduce_sum(); + let mut mag_b = smag_b.reduce_sum(); + + // Scalar: remaining 10 elements (indices 64..74). + for i in 64..74 { dot += a.weights[i] * b.weights[i]; mag_a += a.weights[i] * a.weights[i]; mag_b += b.weights[i] * b.weights[i]; } + let denom = (mag_a * mag_b).sqrt(); if denom < 1e-10 { 0.0 From d76c631b2636b0ce9098a5225267f7efe7969573 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 28 Mar 2026 21:30:43 +0000 Subject: [PATCH 3/4] docs(deepnsm): save SoA adjacency layout as future concept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the category-padded SoA layout (16 categories × 16 slots = 256 F32x16 lanes) as a future optimization concept in deepnsm.rs. Verified no overlap with existing patterns: - blasgraph CSR/CSC: graph adjacency matrix, not semantic vectors - SPO semiring: cost algebra, not vector layout - neighborhood CLAM: search scope, not decomposition format - aabb/spatial_hash SoA: spatial coords (x,y,z), not semantic categories - dn_tree SoA: HV summary layout, not NSM category decomposition The concept is clean for future implementation. 
https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7 --- src/hpc/deepnsm.rs | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/hpc/deepnsm.rs b/src/hpc/deepnsm.rs index b98f7af9..05e85615 100644 --- a/src/hpc/deepnsm.rs +++ b/src/hpc/deepnsm.rs @@ -1282,3 +1282,37 @@ mod eval_tests { assert!(msgs[1].content.contains("happy")); } } + +// ============================================================================ +// FUTURE CONCEPT: SoA adjacency layout for SIMD-native category decomposition +// ============================================================================ +// +// Current: weights[74] in prime order → 4×F32x16 + 10 scalar remainder +// Future: weights[256] in category-padded SoA → 16×F32x16, zero remainder +// +// Layout: 16 NsmCategory groups, each padded to 16 slots: +// Substantive[16]: [I, You, Someone, Something, Thing, Body, 0, 0, ..., 0] +// Relational[16]: [Kind, Part, 0, 0, ..., 0] +// Mental[16]: [Think, Know, Want, DontWant, Feel, See, Hear, 0, ..., 0] +// ... +// +// Benefits: +// - One F32x16 per category → entire category comparison in one instruction +// - Cross-category similarity = 16 parallel dot products +// - Category masking = one F32Mask16 per category (is this word Mental? Spatial?) 
+// - No scalar remainder: 16×16 = 256 elements, all lanes used +// - SIMD-friendly alignment: each category starts at 64-byte boundary +// +// Does NOT duplicate: +// - blasgraph CSR/CSC: graph adjacency matrix, not semantic vectors +// - SPO semiring: cost algebra, not vector layout +// - neighborhood CLAM: search scope, not decomposition format +// - aabb/spatial_hash SoA: spatial coords (x,y,z), not semantic categories +// +// Integration points: +// - DeepNSM encoder.rs: bind() + bundle() operate on category-aligned vectors +// - CausalEdge64: S/P/O palette indices map to category-level features +// - Thinking styles: MODULATE verb maps content categories → style weights +// +// TODO: implement NsmDecompositionSoA with category-padded [f32; 256] storage +// ============================================================================ From e1c37a519889a98a3b5a12a4c79ec296c5fdf1ba Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 28 Mar 2026 22:16:54 +0000 Subject: [PATCH 4/4] docs(deepnsm): psychometric validation framework + vertical HHTL bundling spec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two architectural concepts saved for dedicated implementation sessions: 1. Psychometric validation for DeepNSM measurement instrument: - Cronbach's α across 128 projections (2³ SPO × 2⁴ HHTL) - Split-half reliability: Strategy A vs Strategy B distance - IRT item parameters: per-word difficulty + discrimination - Factor analysis: do 74 primes factor into 16 NsmCategory? - Construct/convergent/discriminant validity across codec chain - Polysemy detection via α drop across projections - P-values with 128 independent measurements per pair 2. 
Vertical HHTL bundling (studio mixing analogy): - Leaves → bundle → Twigs → bundle → Branches → bundle → Hip - Each level = majority vote denoising (background noise removal) - Unbind bottom-up to verify reconstruction (information loss audit) - Combined SPO × HHTL = 128-way factorial decomposition - Cascade as psychometric filter: discrimination, factor analysis, composite reliability, SEM, residual analysis Key insight: NARS confidence IS measurement reliability (formalized). Every similarity judgment gets a confidence interval backed by 128 independent projection measurements. https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7 --- src/hpc/deepnsm.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/hpc/deepnsm.rs b/src/hpc/deepnsm.rs index 05e85615..bdacb4b0 100644 --- a/src/hpc/deepnsm.rs +++ b/src/hpc/deepnsm.rs @@ -1316,3 +1316,62 @@ mod eval_tests { // // TODO: implement NsmDecompositionSoA with category-padded [f32; 256] storage // ============================================================================ + +// ============================================================================ +// FUTURE CONCEPT: Psychometric validation framework for DeepNSM +// ============================================================================ +// +// The vocabulary IS a measurement instrument. Each word is a test item. +// Each prime weight is a factor loading. Psychometric theory validates +// whether the decomposition measures what it claims to measure. 
+// +// RELIABILITY: +// - Test-retest: bundle → unbundle → re-bundle → compare (bit-reproducible) +// - Cronbach's α: correlation across 2³ SPO × 2⁴ HHTL = 128 projections +// High α (>0.7) = projections agree = construct is coherent +// Low α (<0.5) = bundling destroys information at that level +// - Split-half: Strategy A distance vs Strategy B distance for same pair +// Pearson r between them = reliability of the dual encoding +// +// VALIDITY: +// - Construct: do primes factor into 16 NsmCategory groups? (PCA/FA) +// - Convergent: SpoBase17 ≈ CausalEdge64 ≈ VsaVec ≈ DeepNSM cosine +// for same pair. All should rank similarly. +// - Discriminant: "dog bites man" ≠ "man bites dog" across all encodings +// - Criterion: OSINT extraction quality against known-true datasets +// +// ITEM RESPONSE THEORY (IRT): +// - Per-word difficulty: how many primes cleanly decompose this word? +// "think" = easy (2 primes), "justice" = hard (6+ primes) +// - Per-word discrimination: does this word reliably separate concepts? +// "good" = high (separates Evaluator), "the" = zero +// - Per-prime reliability: does this prime contribute consistently? 
+// +// HHTL CASCADE AS PSYCHOMETRIC FILTER: +// HEEL: drop items with discrimination < 0.3 (bad test items) +// HIP: factor analyze → extract latent structure → compare with theory +// BRANCH: composite reliability per factor (α per NsmCategory) +// TWIG: structural equation model → path coefficients = causal relations +// LEAF: residual variance → noise OR undiscovered factor → NARS abduction +// +// VERTICAL BUNDLING (studio mixing analogy): +// Leaves → bundle → Twigs → bundle → Branches → bundle → Hip +// Each level = majority vote denoising +// Unbind bottom-up to verify reconstruction +// Hamming(unbind(Hip, branch_role), actual_branch) = information loss +// Combined with SPO: 2³ × 2⁴ = 128 projections, each an "item" +// Cronbach's α across 128 items = total measurement reliability +// +// P-VALUES: +// 128 independent measurements per pair → statistical power for p < 0.001 +// Every similarity judgment comes with a confidence interval +// NARS confidence IS measurement reliability (formalized) +// +// POLYSEMY DETECTION: +// Word with high α in context = disambiguated (reliable measurement) +// Word with low α across projections = polysemous (unreliable item) +// α drop localizes WHERE the ambiguity lives in the HHTL tree +// +// TODO: implement CronbachAlpha, SplitHalfReliability, FactorAnalysis, +// ItemDifficulty, ItemDiscrimination, MeasurementInvariance +// ============================================================================