From 0f5dab178f30162b0a3fbe9f7ee06e0b8bb611ed Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 12 May 2026 12:07:57 -0700 Subject: [PATCH 1/2] x86: use generic x86-64-v{N} mcpu and explicit mattrs The x86 backend previously assumed AVX-512 microarchitectures form a strict feature hierarchy (Zen4 implies Cannonlake implies Skylake-AVX512 implies AVX512). It then drove `-mcpu` off the highest level in that chain, picking vendor-specific CPU names like `znver4` or `cannonlake`. That assumption isn't quite right: vendor-specific mcpu choices enable vendor-specific features for us (e.g. `-mcpu=znver4` turns on `sse4a`, which Cannonlake doesn't have). A Halide flag higher in the hierarchy was therefore not actually a strict superset of one below it -- code built for `AVX512_Zen4` could use SSE4a and fail to run on a Cannonlake CPU even though Halide treats Zen4 as "Cannonlake and above". Switch `mcpu_target()` to pick only generic `x86-64-v{N}` levels, and have `mattrs()` explicitly enable every feature Halide tracks on top of that baseline. The Halide feature flags keep their existing meaning, but a level like `AVX512_Zen4` now produces code that runs on the intersection of Zen4 and Cannonlake rather than the union, preserving the "each level is a superset of the level below" invariant the rest of the backend depends on. Per-CPU tuning via `mcpu_tune()` is untouched, so users who want znver4/znver5-specific scheduling still get it. Verified with `make correctness_simd_op_check_x86` (passes) and by running `correctness_vector_reductions` under Intel SDE's `-chip_check_die` for pnr/snb/hsw/skx/cnl/icx/spr with matching Halide targets (all pass). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/CodeGen_X86.cpp | 137 +++++++++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 51 deletions(-) diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp index 4e42ecdc0703..4ff685a7a4e2 100644 --- a/src/CodeGen_X86.cpp +++ b/src/CodeGen_X86.cpp @@ -1718,31 +1718,34 @@ void CodeGen_X86::visit(const Store *op) { } string CodeGen_X86::mcpu_target() const { - // Perform an ad-hoc guess for the -mcpu given features. - // WARNING: this is used to drive -mcpu, *NOT* -mtune! - // The CPU choice here *WILL* affect -mattrs! - if (target.has_feature(Target::AVX512_SapphireRapids)) { - return "sapphirerapids"; - } else if (target.has_feature(Target::AVX512_Zen5)) { - return "znver5"; - } else if (target.has_feature(Target::AVX512_Zen4)) { - return "znver4"; - } else if (target.has_feature(Target::AVX512_Cannonlake)) { - return "cannonlake"; - } else if (target.has_feature(Target::AVX512_Skylake)) { - return "skylake-avx512"; - } else if (target.has_feature(Target::AVX512_KNL)) { - return "knl"; + // Use generic x86-64 microarchitecture levels rather than specific + // CPU names. mattrs() turns on the actual features Halide knows + // about on top of this baseline. This avoids LLVM enabling + // vendor-specific extras for us (e.g. sse4a on AMD CPUs like + // "znver4"), which would otherwise mean that a Halide feature + // higher in the hierarchy (Zen4) failed to be a strict superset + // of one below it (Cannonlake). With this change, a Halide flag + // like AVX512_Zen4 produces code that runs on the intersection + // of Zen4 and Cannonlake hardware -- not the union. + if (target.has_feature(Target::AVX512_Skylake)) { + // x86-64-v4: SSE4.2, POPCNT, AVX, AVX2, BMI1/2, F16C, FMA, + // LZCNT, MOVBE, plus AVX512F/BW/CD/DQ/VL. + return "x86-64-v4"; } else if (target.has_feature(Target::AVX2)) { - return "haswell"; + // x86-64-v3: SSE4.2, POPCNT, AVX, AVX2, BMI1/2, F16C, FMA, + // LZCNT, MOVBE. Also covers AVX512 / AVX512_KNL, since both + // imply AVX2 (via complete_x86_target), but neither requires + // BW/DQ/VL which would come for free with v4. + return "x86-64-v3"; } else if (target.has_feature(Target::AVX)) { - return "corei7-avx"; - } else if (target.has_feature(Target::SSE41)) { - // We want SSE4.1 but not SSE4.2, hence "penryn" rather than "corei7" - return "penryn"; + // x86-64-v2: SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT. AVX is + // added on top via mattrs. + return "x86-64-v2"; } else { - // Default should not include SSSE3, hence "k8" rather than "core2" - return "k8"; + // Plain x86-64 baseline (SSE, SSE2). The Halide SSE41 level + // (which deliberately stops short of SSE4.2) adds its + // features individually in mattrs. + return "x86-64"; } } @@ -1814,10 +1817,35 @@ string CodeGen_X86::mcpu_tune() const { return mcpu_target(); // Detect "best" CPU from the enabled ISA's. } -// FIXME: we should lower everything here, instead of relying -// that -mcpu= (`mcpu_target()`) implies/sets features for us. +// All features Halide knows about are turned on individually here, +// rather than relying on `-mcpu=` (set by `mcpu_target()`) to pull +// them in. `mcpu_target()` picks only generic x86-64-v{N} levels, so +// we don't accidentally inherit vendor-specific features (e.g. sse4a) +// that aren't present across all CPUs at a given Halide feature +// level. string CodeGen_X86::mattrs() const { std::vector attrs; + + // Features added on top of the plain x86-64 baseline. These are + // already covered by x86-64-v2 (and above), but Halide's SSE41 + // feature level intentionally stops short of SSE4.2, so SSE41 + // alone uses the plain x86-64 mcpu and adds these by hand. + if (target.has_feature(Target::SSE41) && !target.has_feature(Target::AVX)) { + attrs.emplace_back("+sse3"); + attrs.emplace_back("+ssse3"); + attrs.emplace_back("+sse4.1"); + } + + // Features added on top of the x86-64-v2 baseline. AVX without + // AVX2 only needs +avx (the rest of v3 -- BMI/F16C/FMA -- aren't + // guaranteed at the Halide AVX level). + if (target.has_feature(Target::AVX) && !target.has_feature(Target::AVX2)) { + attrs.emplace_back("+avx"); + } + + // FMA / F16C are guaranteed when AVX2 is set (and thus already + // included in x86-64-v3), but are also exposed as independent + // Halide flags, so emit them explicitly when requested. if (target.has_feature(Target::FMA)) { attrs.emplace_back("+fma"); } @@ -1827,39 +1855,46 @@ string CodeGen_X86::mattrs() const { if (target.has_feature(Target::F16C)) { attrs.emplace_back("+f16c"); } + + // AVX512 features. Any AVX512 variant implies AVX2 (via + // complete_x86_target), so the mcpu baseline is at least + // x86-64-v3. Skylake-and-above selects x86-64-v4, which already + // includes F/CD/BW/DQ/VL, but we still add those features + // explicitly so the bare AVX512 / AVX512_KNL paths (which use + // x86-64-v3) also get them. if (target.has_feature(Target::AVX512) || target.has_feature(Target::AVX512_KNL) || target.has_feature(Target::AVX512_Skylake) || target.has_feature(Target::AVX512_Cannonlake)) { attrs.emplace_back("+avx512f"); attrs.emplace_back("+avx512cd"); - if (target.has_feature(Target::AVX512_KNL)) { - attrs.emplace_back("+avx512pf"); - attrs.emplace_back("+avx512er"); - } - if (target.has_feature(Target::AVX512_Skylake) || - target.has_feature(Target::AVX512_Cannonlake)) { - attrs.emplace_back("+avx512vl"); - attrs.emplace_back("+avx512bw"); - attrs.emplace_back("+avx512dq"); - } - if (target.has_feature(Target::AVX512_Cannonlake)) { - attrs.emplace_back("+avx512ifma"); - attrs.emplace_back("+avx512vbmi"); - } - if (target.has_feature(Target::AVX512_Zen4)) { - attrs.emplace_back("+avx512bf16"); - attrs.emplace_back("+avx512vnni"); - attrs.emplace_back("+avx512bitalg"); - attrs.emplace_back("+avx512vbmi2"); - } - if (target.has_feature(Target::AVXVNNI)) { - attrs.emplace_back("+avxvnni"); - } - if (target.has_feature(Target::AVX512_SapphireRapids)) { - attrs.emplace_back("+amx-int8"); - attrs.emplace_back("+amx-bf16"); - } + } + if (target.has_feature(Target::AVX512_KNL)) { + attrs.emplace_back("+avx512pf"); + attrs.emplace_back("+avx512er"); + } + if (target.has_feature(Target::AVX512_Skylake) || + target.has_feature(Target::AVX512_Cannonlake)) { + attrs.emplace_back("+avx512vl"); + attrs.emplace_back("+avx512bw"); + attrs.emplace_back("+avx512dq"); + } + if (target.has_feature(Target::AVX512_Cannonlake)) { + attrs.emplace_back("+avx512ifma"); + attrs.emplace_back("+avx512vbmi"); + } + if (target.has_feature(Target::AVX512_Zen4)) { + attrs.emplace_back("+avx512bf16"); + attrs.emplace_back("+avx512vnni"); + attrs.emplace_back("+avx512bitalg"); + attrs.emplace_back("+avx512vbmi2"); + } + if (target.has_feature(Target::AVXVNNI)) { + attrs.emplace_back("+avxvnni"); + } + if (target.has_feature(Target::AVX512_SapphireRapids)) { + attrs.emplace_back("+amx-int8"); + attrs.emplace_back("+amx-bf16"); } if (gather_might_be_slow(target)) { attrs.emplace_back("+prefer-no-gather"); From a88f977e6752672b8409ffddd2aab833a0580555 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Tue, 12 May 2026 13:39:41 -0700 Subject: [PATCH 2/2] x86: read CPUID.7.1.EDX, not CPUID.7.0.EDX, when probing for AVX10 Host CPU detection (in both `src/Target.cpp` and the runtime helper `src/runtime/x86_cpu_features.cpp`) was checking `CPUID.(EAX=7,ECX=0).EDX[19]` to decide whether AVX10 is present. The AVX10 enumeration bit actually lives at `CPUID.(EAX=7,ECX=1).EDX[19]` -- both files had read both sub-leaves into `info2` and `info3` respectively, so the fix is a one-token change from `info2.edx` to `info3.edx` in each place. The APX detection a few lines below was already reading `info3.edx`, which is what tipped me off that the AVX10 line was a slip rather than an intentional choice. Why this hadn't surfaced on real silicon, despite being shipped since early 2024: the bad path requires both `os_avx512` to be true AND `CPUID.7.0.EDX[19]` to be set on the same CPU. On Intel hardware through Sapphire Rapids, `CPUID.7.0.EDX[19]` is reserved and reads as zero, so the check is harmlessly always false. On hybrid Alder/Raptor Lake parts various hybrid-related bits in `CPUID.7.0.EDX` are set, but those CPUs don't have AVX-512 enabled by the OS, so `os_avx512` is false and the outer guard blocks the path. On real Granite Rapids (the first chip that actually has AVX10) both leaves agree, so the buggy check happens to give the right answer. Intel's SDE is what makes the bug visible: under `sde -spr`, the emulated CPUID sets `CPUID.7.0.EDX = 0xffdc4432` (bit 19 = 1) while correctly leaving `CPUID.7.1.EDX[19]` clear, and reports `os_avx512`. That combination -- impossible on shipped silicon -- triggers host-detection to push `Target::AVX10_1` even though the chip doesn't have AVX10. SDE additionally returns `CPUID.24.0.EBX = 0x00000002` (version=2, but all of the 128/256/512 width bits clear), so `vector_bits` is never set away from its default of 0, and `CodeGen_X86::mattrs()` hits `user_error << "AVX10 only supports 256 or 512 bit variants"`. The runtime version had the same typo with a milder symptom: it doesn't track `vector_bits`, so it would silently advertise `halide_target_feature_avx10_1` as available to anything that calls `halide_can_use_target_features` on a machine matching the SDE-style combination. No shipped Intel part hits this today, but it's the same latent landmine and worth fixing alongside the JIT path. Found while running the correctness suite under SDE for a variety of Intel microarchitectures, where every chip except `-spr` passed cleanly; the 9/10 SPR failures all decoded to this single detection bug. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/Target.cpp | 5 +++-- src/runtime/x86_cpu_features.cpp | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Target.cpp b/src/Target.cpp index c73215f40282..1fb7373d6e99 100644 --- a/src/Target.cpp +++ b/src/Target.cpp @@ -502,9 +502,10 @@ Target calculate_host_target() { } } - // AVX10 converged vector instructions. + // AVX10 converged vector instructions. The enumeration bit is + // CPUID.(EAX=7,ECX=1).EDX[19]. const uint32_t avx10 = 1U << 19; - if (os_avx512 && (info2.edx & avx10)) { + if (os_avx512 && (info3.edx & avx10)) { const auto info_avx10 = cpuid(0x24, 0x0); // This checks that the AVX10 version is greater than zero. diff --git a/src/runtime/x86_cpu_features.cpp b/src/runtime/x86_cpu_features.cpp index afb4ea0d3e46..0716d7f27078 100644 --- a/src/runtime/x86_cpu_features.cpp +++ b/src/runtime/x86_cpu_features.cpp @@ -197,8 +197,9 @@ extern "C" WEAK int halide_get_cpu_features(CpuFeatures *features) { // AVX10 converged vector instructions. // AVX10 uses EVEX encoding with opmask registers at all vector widths, // so it requires the same OS XSAVE support as AVX-512. + // The enumeration bit is CPUID.(EAX=7,ECX=1).EDX[19]. constexpr uint32_t avx10 = 1U << 19; - if (os_avx512 && (info2.edx & avx10)) { + if (os_avx512 && (info3.edx & avx10)) { const auto info_avx10 = cpuid(0x24, 0x0); if ((info_avx10.ebx & 0xff) >= 1) { halide_set_available_cpu_feature(features, halide_target_feature_avx10_1);