From f68fefa514eb11fe53777901de38daf663f70228 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 13 Apr 2026 17:00:24 +0000 Subject: [PATCH] =?UTF-8?q?Cherry-pick:=20fix(amx)=20prevent=20SIGILL=20?= =?UTF-8?q?=E2=80=94=20use=20=5Fxgetbv(0)=20+=20prctl=20for=20detection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause: amx_available() used CPUID leaf 0xD (what CPU supports) instead of _xgetbv(0) (what OS actually enabled). Hypervisors that advertise AMX in CPUID but don't enable tile state caused SIGILL. Fix — 4-step detection: 1. CPUID.07H bits 24+25 → CPU has AMX-TILE + AMX-INT8? 2. CPUID.01H bit 27 → OS supports XSAVE? 3. _xgetbv(0) bits 17+18 → OS ACTUALLY enabled tile state? 4. prctl(ARCH_REQ_XCOMP_PERM, 18) → process has tile permission? VNNI hierarchy documented: avx512vnni (EVEX zmm, 64 MACs) → checked first avxvnniint8 (VEX ymm, 32 MACs) → only if avx512vnni absent Before: 1612 pass + SIGILL crash After: 1612 pass, 0 fail, 36 ignored Cherry-picked from 282daf77 (claude/continue-lance-graph-ndarray-Ld786) https://claude.ai/code/session_017ZN5PNEf8boFBgorUZVrFU --- .claude/AMX_GOTCHAS.md | 39 +++++++++++++++---- src/simd_amx.rs | 87 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 107 insertions(+), 19 deletions(-) diff --git a/.claude/AMX_GOTCHAS.md b/.claude/AMX_GOTCHAS.md index 3e8dfe48..22330c91 100644 --- a/.claude/AMX_GOTCHAS.md +++ b/.claude/AMX_GOTCHAS.md @@ -66,18 +66,41 @@ For CPUID leaf 7 (AMX detection): use `__cpuid_count()`, not inline asm. --- -## Gotcha 4: OS must enable AMX via XSETBV +## Gotcha 4: OS must enable AMX via XSETBV + process must request permission -AMX tiles are large (8 KB of state). The OS must opt in via XCR0 bits 17+18. -Linux 5.19+ enables AMX by default. Older kernels: SIGILL on tile instructions. +AMX tiles are large (8 KB of state). Two levels of OS enablement required: + +1. **Kernel enables tile state in XCR0** (bits 17+18). Linux 5.19+ does this. +2. **Process requests XCOMP_PERM** via `prctl(ARCH_REQ_XCOMP_PERM, 18)`. + Without this, LDTILECFG will SIGILL even if XCR0 bits are set. **Detection (stable)**: ```rust -let xcr0 = core::arch::x86_64::__cpuid_count(0xD, 0); -let tilecfg = (xcr0.eax >> 17) & 1; // bit 17 = XTILECFG -let tiledata = (xcr0.eax >> 18) & 1; // bit 18 = XTILEDATA -// Both must be 1 -``` +// Step 1: CPUID — does CPU support AMX? +let cpuid = core::arch::x86_64::__cpuid_count(7, 0); +let amx_tile = (cpuid.edx >> 24) & 1; +let amx_int8 = (cpuid.edx >> 25) & 1; + +// Step 2: OSXSAVE — does OS support XSAVE? +let cpuid_01 = core::arch::x86_64::__cpuid(1); +let osxsave = (cpuid_01.ecx >> 27) & 1; + +// Step 3: _xgetbv(0) — did OS ACTUALLY enable tile state? +// ⚠ Do NOT use __cpuid_count(0xD, 0) — that reports what CPU SUPPORTS, +// not what the OS ENABLED. _xgetbv(0) reads the actual XCR0 register. +let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) }; +let tilecfg = (xcr0 >> 17) & 1; // bit 17 = XTILECFG +let tiledata = (xcr0 >> 18) & 1; // bit 18 = XTILEDATA + +// Step 4: prctl — request tile permission for this process +// SYS_prctl = 157, ARCH_REQ_XCOMP_PERM = 0x1023, XFEATURE_XTILEDATA = 18 +// Returns 0 on success, -errno on failure. Idempotent. +``` + +**Previous bug**: `__cpuid_count(0xD, 0)` reports XSAVE state component bitmap +(what the CPU *supports*), NOT the actual XCR0 value (what the OS *enabled*). +On hypervisors that advertise AMX in CPUID but don't enable tile state, +the old check returned `true` → SIGILL on LDTILECFG. --- diff --git a/src/simd_amx.rs b/src/simd_amx.rs index a092f2f1..9bc688af 100644 --- a/src/simd_amx.rs +++ b/src/simd_amx.rs @@ -31,17 +31,74 @@ // ═══════════════════════════════════════════════════════════════════════════ /// Check if AMX hardware is present AND OS-enabled. +/// +/// Two checks required: +/// 1. CPUID.07H.0H:EDX bits 24 (AMX-TILE) + 25 (AMX-INT8) = CPU supports it +/// 2. XCR0 bits 17 (TILECFG) + 18 (TILEDATA) = OS has enabled tile state +/// +/// The XCR0 check is critical: even if CPUID reports AMX, the hypervisor +/// may not have enabled the XSTATE for tiles. Without OS enablement, +/// LDTILECFG will SIGILL. +/// +/// Previous bug: used CPUID leaf 0xD (reports what CPU supports for XSAVE) +/// instead of _xgetbv(0) (reports what OS actually enabled). The old check +/// could return true on a hypervisor that advertises AMX in CPUID but +/// hasn't set XCR0 bits 17+18. #[cfg(target_arch = "x86_64")] pub fn amx_available() -> bool { + // Step 1: CPU supports AMX-TILE + AMX-INT8? let cpuid = core::arch::x86_64::__cpuid_count(7, 0); let amx_tile = (cpuid.edx >> 24) & 1; let amx_int8 = (cpuid.edx >> 25) & 1; if amx_tile == 0 || amx_int8 == 0 { return false; } - // Check OS enabled via XCR0 bits 17+18 - let xcr0 = core::arch::x86_64::__cpuid_count(0xD, 0); - let tilecfg = (xcr0.eax >> 17) & 1; - let tiledata = (xcr0.eax >> 18) & 1; - tilecfg == 1 && tiledata == 1 + + // Step 2: OS enabled XSAVE? (CPUID.01H:ECX bit 27 = OSXSAVE) + let cpuid_01 = core::arch::x86_64::__cpuid(1); + let osxsave = (cpuid_01.ecx >> 27) & 1; + if osxsave == 0 { return false; } + + // Step 3: OS actually enabled tile state in XCR0? + // _xgetbv(0) reads the ACTUAL XCR0 register (what the OS set), + // not the CPUID-reported capability. + // Bit 17 = TILECFG, Bit 18 = TILEDATA. Both must be set. + let xcr0: u64 = unsafe { core::arch::x86_64::_xgetbv(0) }; + let tilecfg = (xcr0 >> 17) & 1; + let tiledata = (xcr0 >> 18) & 1; + if tilecfg == 0 || tiledata == 0 { return false; } + + // Step 4: Request XCOMP_PERM for TILEDATA. + // Linux kernel 5.19+: processes must call prctl(ARCH_REQ_XCOMP_PERM, 18) + // to request permission for TILEDATA (XFEATURE 18) before using AMX. + // Without this, LDTILECFG will SIGILL even if XCR0 bits are set. + // The prctl either succeeds (0) or fails (-1) — idempotent, safe to call + // multiple times. + #[cfg(target_os = "linux")] + { + const SYS_PRCTL: i64 = 157; // x86_64 syscall number for prctl + const ARCH_REQ_XCOMP_PERM: i64 = 0x1023; + const XFEATURE_XTILEDATA: i64 = 18; + // SAFETY: syscall(prctl, ARCH_REQ_XCOMP_PERM, 18) is a simple permission + // request. It either grants tile permission (returns 0) or fails (returns + // -errno). No side effects on failure. Idempotent. + let ret: i64; + unsafe { + core::arch::asm!( + "syscall", + inlateout("rax") SYS_PRCTL => ret, + in("rdi") ARCH_REQ_XCOMP_PERM, + in("rsi") XFEATURE_XTILEDATA, + in("rdx") 0i64, + in("r10") 0i64, + in("r8") 0i64, + lateout("rcx") _, + lateout("r11") _, + options(nostack), + ); + } + if ret != 0 { return false; } + } + + true } #[cfg(not(target_arch = "x86_64"))] @@ -203,17 +260,25 @@ pub fn vnni_matvec_scalar( /// Runtime-dispatched VNNI MatVec: avx512vnni → avxvnniint8 → scalar i32. /// -/// Three tiers, mutually exclusive by hardware generation: +/// Three tiers, checked in order (first match wins): /// avx512vnni — 64 MACs/instr (zmm, Cascade Lake+, Zen 4+) /// avxvnniint8 — 32 MACs/instr (ymm, Arrow Lake, NUC 14 i9-185H) -/// scalar i32 — only for non-x86 or testing (caller should prefer F32x16 FMA) +/// scalar i32 — only for non-x86 or testing +/// +/// IMPORTANT: avxvnniint8 (VNNI2, 256-bit) is NEVER reached when +/// avx512vnni (VNNI512) is present. This is correct: +/// - CPUs with avx512vnni always have 512-bit VPDPBUSD (faster) +/// - avxvnniint8 exists ONLY for CPUs that dropped AVX-512 +/// but added 256-bit VNNI (Arrow Lake, Meteor Lake U-series) +/// - The two instructions have DIFFERENT encodings: +/// avx512vnni: EVEX-encoded VPDPBUSD zmm (512-bit) +/// avxvnniint8: VEX-encoded VPDPBUSD ymm (256-bit) +/// - Running EVEX VPDPBUSD on a VEX-only CPU = SIGILL +/// - Running VEX VPDPBUSD on an EVEX CPU = works but wastes half the width /// -/// NOTE: The scalar path here does i32 multiply-accumulate, NOT f32. -/// For the thinking engine, F32x16 FMA (16 MACs/instr) is the true floor. -/// This scalar path exists only for correctness on non-x86 targets. /// The thinking engine's cycle_auto() dispatches: /// VNNI detected → cycle_vnni() → this function -/// No VNNI → cycle() → F32x16 (never reaches here) +/// No VNNI → cycle() → F32x16 FMA (never reaches here) pub fn matvec_dispatch( table: &[u8], energy_i8: &[i8],