From 69017531a7652558d1263f0718de368c720aa6af Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 4 Apr 2026 12:05:02 +0000 Subject: [PATCH] =?UTF-8?q?docs:=20clarify=20VNNI=20dispatch=20tiers=20?= =?UTF-8?q?=E2=80=94=20F32x16=20is=20the=20floor,=20no=20scalar=20on=20x86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit avx512vnni (64 MACs) and avxvnniint8 (32 MACs) are mutually exclusive by hardware generation. The scalar i32 path in matvec_dispatch only exists for non-x86 correctness. On x86, the thinking engine dispatches to F32x16 FMA (16 MACs) when no VNNI is detected — never reaches the scalar path. https://claude.ai/code/session_01ChLvBfpJS8dQhHxRD4pYNp --- src/simd_amx.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/simd_amx.rs b/src/simd_amx.rs index ee420e78..a092f2f1 100644 --- a/src/simd_amx.rs +++ b/src/simd_amx.rs @@ -201,11 +201,19 @@ pub fn vnni_matvec_scalar( } } -/// Runtime-dispatched MatVec: avx512vnni → avxvnniint8 (VNNI2) → scalar. +/// Runtime-dispatched VNNI MatVec: avx512vnni → avxvnniint8 → scalar i32. /// -/// Tier 2: avx512vnni — 64 MACs/instr (zmm, Cascade Lake+, Zen 4+) -/// Tier 1: avxvnniint8 — 32 MACs/instr (ymm, Arrow Lake, NUC 14 i9-185H) -/// Tier 0: scalar +/// Three tiers; the two VNNI tiers are mutually exclusive by hardware generation: +/// avx512vnni — 64 MACs/instr (zmm, Cascade Lake+, Zen 4+) +/// avxvnniint8 — 32 MACs/instr (ymm, Arrow Lake, NUC 14 i9-185H) +/// scalar i32 — only for non-x86 or testing (caller should prefer F32x16 FMA) +/// +/// NOTE: The scalar path here does i32 multiply-accumulate, NOT f32. +/// For the thinking engine, F32x16 FMA (16 MACs/instr) is the true floor. +/// This scalar path exists only for correctness on non-x86 targets. 
+/// The thinking engine's cycle_auto() dispatches: +/// VNNI detected → cycle_vnni() → this function +/// No VNNI → cycle() → F32x16 (never reaches here) pub fn matvec_dispatch( table: &[u8], energy_i8: &[i8], @@ -223,6 +231,8 @@ pub fn matvec_dispatch( return; } } + // Non-x86 or no VNNI: i32 scalar accumulate. + // On x86, the thinking engine uses F32x16 FMA instead of reaching here. vnni_matvec_scalar(table, energy_i8, result, n); }