From 9b3cd45a8a2a006ac60974cb3cbdb388e02943f6 Mon Sep 17 00:00:00 2001
From: kudomcho
Date: Tue, 16 Jun 2026 19:58:03 +0000
Subject: [PATCH] Fix WARP_SIZE detection for gfx942 in kernels.hip and ops.hip

Replace broken __GFX9__ guard with __AMDGCN_WAVEFRONT_SIZE (compiler-provided)
and default to 64 for CDNA. The __GFX9__ macro is not defined at compile
time on recent ROCm, causing WARP_SIZE=32 on 64-wide wavefront gfx942
(MI300X). This broke the 4-bit GEMV inference kernel grid launch and warp
reduction, producing ~50% element mismatches in test_gemv_eye_4bit.

Co-Authored-By: Claude Opus 4 (1M context)
---
 csrc/kernels.hip | 6 +++---
 csrc/ops.hip     | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index 6c9c8a5bd..383685888 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -2853,10 +2853,10 @@ template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB)
diff --git a/csrc/ops.hip b/csrc/ops.hip
index 17d350951..3cf36b87c 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -20,10 +20,10 @@
 
 #define ERR_NOT_IMPLEMENTED 100
 
-#if defined(__GFX9__)
- #define WARP_SIZE 64
+#ifdef __AMDGCN_WAVEFRONT_SIZE
+ #define WARP_SIZE __AMDGCN_WAVEFRONT_SIZE
 #else
- #define WARP_SIZE 32
+ #define WARP_SIZE 64
 #endif
 
 using namespace BinSearch;
 