From 69017531a7652558d1263f0718de368c720aa6af Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 4 Apr 2026 12:05:02 +0000
Subject: [PATCH] =?UTF-8?q?docs:=20clarify=20VNNI=20dispatch=20tiers=20?=
 =?UTF-8?q?=E2=80=94=20F32x16=20is=20the=20floor,=20no=20scalar=20on=20x86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

avx512vnni (64 MACs) and avxvnniint8 (32 MACs) are mutually exclusive
by hardware generation. The scalar i32 path in matvec_dispatch only
exists for non-x86 correctness. On x86, the thinking engine dispatches
to F32x16 FMA (16 MACs) when no VNNI is detected — never reaches
the scalar path.

https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp
---
 src/simd_amx.rs | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/simd_amx.rs b/src/simd_amx.rs
index ee420e78..a092f2f1 100644
--- a/src/simd_amx.rs
+++ b/src/simd_amx.rs
@@ -201,11 +201,19 @@ pub fn vnni_matvec_scalar(
     }
 }
 
-/// Runtime-dispatched MatVec: avx512vnni → avxvnniint8 (VNNI2) → scalar.
+/// Runtime-dispatched VNNI MatVec: avx512vnni → avxvnniint8 → scalar i32.
 ///
-/// Tier 2: avx512vnni — 64 MACs/instr (zmm, Cascade Lake+, Zen 4+)
-/// Tier 1: avxvnniint8 — 32 MACs/instr (ymm, Arrow Lake, NUC 14 i9-185H)
-/// Tier 0: scalar
+/// Three tiers; the two VNNI tiers are mutually exclusive by hardware generation:
+///   avx512vnni  — 64 MACs/instr (zmm, Cascade Lake+, Zen 4+)
+///   avxvnniint8 — 32 MACs/instr (ymm, Arrow Lake, NUC 14 i9-185H)
+///   scalar i32  — only for non-x86 or testing (caller should prefer F32x16 FMA)
+///
+/// NOTE: The scalar path here does i32 multiply-accumulate, NOT f32.
+/// For the thinking engine, F32x16 FMA (16 MACs/instr) is the true floor.
+/// This scalar path exists only for correctness on non-x86 targets and for testing.
+/// The thinking engine's cycle_auto() dispatches:
+///   VNNI detected → cycle_vnni() → this function
+///   No VNNI       → cycle() → F32x16 (never reaches here)
 pub fn matvec_dispatch(
     table: &[u8],
     energy_i8: &[i8],
@@ -223,6 +231,8 @@ pub fn matvec_dispatch(
             return;
         }
     }
+    // Non-x86 or no VNNI: i32 scalar accumulate.
+    // On x86, the thinking engine uses F32x16 FMA instead of reaching here.
     vnni_matvec_scalar(table, energy_i8, result, n);
 }