From 748e0308d41cf30ba991ad0709911e37c7fc3601 Mon Sep 17 00:00:00 2001 From: Philip Degarmo Date: Fri, 19 Jun 2026 12:41:33 -0700 Subject: [PATCH 1/3] Fix a miscompile that occurs when reading a matrix contained in a struct with no sibling fields. The expected behavior is that each load instruction is offset by 4 bytes from the previous one, but the observed behavior is that the loads are always at offset 0. This was introduced in commit 909c55245 (merged 2025-03-17), and it observably regressed between the releases v1.8.2502 and v1.8.2505. The regression was tested as still live in v1.10.2605.24 and current HEAD. This change adds a test that reproduces the problem. ClangHLSLTests shows 2 failures with this test pre-patch and 0 failures after the patch. --- lib/HLSL/HLOperationLower.cpp | 37 +++++++++++++------ ...ab_load_struct_matrix_element_offsets.hlsl | 31 ++++++++++++++++ 2 files changed, 56 insertions(+), 12 deletions(-) create mode 100644 tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/bab_load_struct_matrix_element_offsets.hlsl diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index ff30dcbf20..4264270a09 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -9222,6 +9222,13 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, assert(resultSize <= 16); std::vector<Value *> idxList(resultSize); + // For raw buffers the byte offset rides in the buffer index and the element + // offset operand stays undef, so the per-element offset must accumulate into + // bufIdx. For structured buffers the per-element offset is the element offset + // and bufIdx is the structure index. + bool isRawBuf = DXIL::IsRawBuffer(ResKind); + Value *matBaseIdx = isRawBuf ? 
bufIdx : baseOffset; + switch (subOp) { case HLSubscriptOpcode::ColMatSubscript: case HLSubscriptOpcode::RowMatSubscript: { @@ -9229,7 +9236,7 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, Value *offset = CI->getArgOperand(HLOperandIndex::kMatSubscriptSubOpIdx + i); offset = subBuilder.CreateMul(offset, EltByteSize); - idxList[i] = subBuilder.CreateAdd(baseOffset, offset); + idxList[i] = subBuilder.CreateAdd(matBaseIdx, offset); } } break; case HLSubscriptOpcode::RowMatElement: @@ -9238,7 +9245,7 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, for (unsigned i = 0; i < resultSize; i++) { Value *offset = subBuilder.CreateMul(EltIdxs->getAggregateElement(i), EltByteSize); - idxList[i] = subBuilder.CreateAdd(baseOffset, offset); + idxList[i] = subBuilder.CreateAdd(matBaseIdx, offset); } } break; default: @@ -9251,9 +9258,10 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, for (auto U = CI->user_begin(); U != CI->user_end();) { Value *subsUser = *(U++); if (resultSize == 1) { - TranslateStructBufSubscriptUser(cast<Instruction>(subsUser), handle, - ResKind, bufIdx, idxList[0], status, - hlslOP, DL); + TranslateStructBufSubscriptUser( + cast<Instruction>(subsUser), handle, ResKind, + isRawBuf ? idxList[0] : bufIdx, isRawBuf ? baseOffset : idxList[0], + status, hlslOP, DL); continue; } if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(subsUser)) { @@ -9261,8 +9269,9 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, for (auto gepU = GEP->user_begin(); gepU != GEP->user_end();) { Instruction *gepUserInst = cast<Instruction>(*(gepU++)); - TranslateStructBufSubscriptUser(gepUserInst, handle, ResKind, bufIdx, - GEPOffset, status, hlslOP, DL); + TranslateStructBufSubscriptUser( + gepUserInst, handle, ResKind, isRawBuf ? GEPOffset : bufIdx, + isRawBuf ? 
baseOffset : GEPOffset, status, hlslOP, DL); } GEP->eraseFromParent(); @@ -9276,13 +9285,15 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, for (unsigned i = 0; i < resultSize; i++) { Value *EltVal = stBuilder.CreateExtractElement(Val, i); uint8_t mask = DXIL::kCompMask_X; - GenerateStructBufSt(handle, bufIdx, idxList[i], EltTy, hlslOP, + GenerateStructBufSt(handle, isRawBuf ? idxList[i] : bufIdx, + isRawBuf ? baseOffset : idxList[i], EltTy, hlslOP, stBuilder, {EltVal, undefElt, undefElt, undefElt}, mask, alignment); } } else { uint8_t mask = DXIL::kCompMask_X; - GenerateStructBufSt(handle, bufIdx, idxList[0], EltTy, hlslOP, + GenerateStructBufSt(handle, isRawBuf ? idxList[0] : bufIdx, + isRawBuf ? baseOffset : idxList[0], EltTy, hlslOP, stBuilder, {Val, undefElt, undefElt, undefElt}, mask, alignment); } @@ -9300,14 +9311,16 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, for (unsigned i = 0; i < resultSize; i++) { Value *ResultElt; // TODO: This can be inefficient for row major matrix load - GenerateRawBufLd(handle, bufIdx, idxList[i], + GenerateRawBufLd(handle, isRawBuf ? idxList[i] : bufIdx, + isRawBuf ? baseOffset : idxList[i], /*status*/ nullptr, EltTy, ResultElt, hlslOP, ldBuilder, 1, alignment); ldData = ldBuilder.CreateInsertElement(ldData, ResultElt, i); } } else { - GenerateRawBufLd(handle, bufIdx, idxList[0], /*status*/ nullptr, EltTy, - ldData, hlslOP, ldBuilder, 4, alignment); + GenerateRawBufLd(handle, isRawBuf ? idxList[0] : bufIdx, + isRawBuf ? 
baseOffset : idxList[0], /*status*/ nullptr, + EltTy, ldData, hlslOP, ldBuilder, 4, alignment); } ldUser->replaceAllUsesWith(ldData); ldUser->eraseFromParent(); diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/bab_load_struct_matrix_element_offsets.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/bab_load_struct_matrix_element_offsets.hlsl new file mode 100644 index 0000000000..758a910182 --- /dev/null +++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/bab_load_struct_matrix_element_offsets.hlsl @@ -0,0 +1,31 @@ +// Reading individual matrix elements through the struct member of a +// ByteAddressBuffer.Load must address each element at its own +// byte offset. For raw buffers the byte offset rides in the load index operand +// and the element-offset operand stays undef, so the four first-column reads +// below must land at indices 0, 4, 8 and 12 -- they must not collapse to the +// matrix base (index 0), which is what happened before the matrix-element +// subscript path was taught the raw-buffer addressing convention. + +// RUN: %dxc -T cs_6_6 -E cs %s | FileCheck %s + +struct Box { float4x4 m; }; + +ByteAddressBuffer src : register(t0); +RWByteAddressBuffer dst : register(u0); + +[numthreads(1, 1, 1)] +void cs() +{ + Box b = src.Load<Box>(0); + + // Column-major: _m00/_m10/_m20/_m30 live at bytes 0/4/8/12. 
+ dst.Store(0, asuint(b.m._m00)); + dst.Store(4, asuint(b.m._m10)); + dst.Store(8, asuint(b.m._m20)); + dst.Store(12, asuint(b.m._m30)); +} + +// CHECK: rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 0, i32 undef, i8 1, i32 4) +// CHECK: rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 4, i32 undef, i8 1, i32 4) +// CHECK: rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 8, i32 undef, i8 1, i32 4) +// CHECK: rawBufferLoad.f32(i32 139, %dx.types.Handle %{{.*}}, i32 12, i32 undef, i8 1, i32 4) From d6a4e6b9d19952ef135d3639299a131cf5aa0e46 Mon Sep 17 00:00:00 2001 From: Philip Degarmo Date: Fri, 19 Jun 2026 13:16:05 -0700 Subject: [PATCH 2/3] clang format fix --- lib/HLSL/HLOperationLower.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index 4264270a09..73be0acf32 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -9258,10 +9258,10 @@ void TranslateStructBufMatSubscript(CallInst *CI, Value *handle, for (auto U = CI->user_begin(); U != CI->user_end();) { Value *subsUser = *(U++); if (resultSize == 1) { - TranslateStructBufSubscriptUser( - cast<Instruction>(subsUser), handle, ResKind, - isRawBuf ? idxList[0] : bufIdx, isRawBuf ? baseOffset : idxList[0], - status, hlslOP, DL); + TranslateStructBufSubscriptUser(cast<Instruction>(subsUser), handle, + ResKind, isRawBuf ? idxList[0] : bufIdx, + isRawBuf ? 
baseOffset : idxList[0], + status, hlslOP, DL); continue; } if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(subsUser)) { From 1eefc31d1447cb1ef4ca9ed9c7e4e59a90c586c7 Mon Sep 17 00:00:00 2001 From: Philip Degarmo Date: Tue, 23 Jun 2026 20:37:16 -0700 Subject: [PATCH 3/3] Additional test per feedback https://github.com/microsoft/DirectXShaderCompiler/pull/8568#pullrequestreview-4548635560 to verify the IR->IR transform of the modified pass --- .../intrinsics/buffer-struct-matrix-load.ll | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-struct-matrix-load.ll diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-struct-matrix-load.ll b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-struct-matrix-load.ll new file mode 100644 index 0000000000..78ecdff379 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/buffer-struct-matrix-load.ll @@ -0,0 +1,100 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.ByteAddressBuffer = type { i32 } +%struct.RWByteAddressBuffer = type { i32 } +%struct.Box = type { %class.matrix.float.4.4 } +%class.matrix.float.4.4 = type { [4 x <4 x float>] } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } + +@"\01?src@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?dst@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 + +; Function Attrs: nounwind +define void @main() #0 { + %src = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?src@@3UByteAddressBuffer@@A" + + ; CHECK: [[HDL:%.*]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer + ; CHECK: [[ANHDL:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, 
%dx.types.Handle [[HDL]], %dx.types.ResourceProperties { i32 11, i32 0 }) + ; Column 0 of a column-major float4x4: elements at byte offsets 0, 4, 8, 12. + ; The element-offset operand (4th arg) must be undef for raw-buffer loads. + ; CHECK: @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[ANHDL]], i32 0, i32 undef, i8 1, i32 4) + ; CHECK: @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[ANHDL]], i32 4, i32 undef, i8 1, i32 4) + ; CHECK: @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[ANHDL]], i32 8, i32 undef, i8 1, i32 4) + ; CHECK: @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle [[ANHDL]], i32 12, i32 undef, i8 1, i32 4) + + %src.hdl = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %src) + %src.ann = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %src.hdl, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer) + + ; Load(0): returns a virtual pointer at raw-buffer byte offset 0. + %box.ptr = call %struct.Box* @"dx.hl.op.ro.%struct.Box* (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %src.ann, i32 0) + + ; GEP to field 0 (the matrix m) inside Box -- byte offset within the struct = 0. + %mat.ptr = getelementptr inbounds %struct.Box, %struct.Box* %box.ptr, i32 0, i32 0 + + ; ColMatSubscript (opcode 1): extract rows 0-3 of column 0 from the 4x4 matrix. + ; Row indices 0-3 map to flat element indices 0-3 (column-major storage), + ; i.e. byte offsets 0, 4, 8, 12 relative to the matrix base. + %col0.ptr = call <4 x float>* @"dx.hl.subscript.colMajor[].rn.<4 x float>* (i32, %class.matrix.float.4.4*, i32, i32, i32, i32)"(i32 1, %class.matrix.float.4.4* %mat.ptr, i32 0, i32 1, i32 2, i32 3) + %col0 = load <4 x float>, <4 x float>* %col0.ptr + + ; Write the result to a UAV so the loads have an observable use. 
+ %dst = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?dst@@3URWByteAddressBuffer@@A" + %dst.hdl = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %dst) + %dst.ann = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %dst.hdl, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer) + %col0.x = extractelement <4 x float> %col0, i32 0 + %col0.y = extractelement <4 x float> %col0, i32 1 + %col0.z = extractelement <4 x float> %col0, i32 2 + %col0.w = extractelement <4 x float> %col0, i32 3 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %dst.ann, i32 0, float %col0.x) + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %dst.ann, i32 4, float %col0.y) + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %dst.ann, i32 8, float %col0.z) + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32 277, %dx.types.Handle %dst.ann, i32 12, float %col0.w) + + ret void +} + +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32, %struct.ByteAddressBuffer) #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer) #1 +declare %struct.Box* @"dx.hl.op.ro.%struct.Box* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #2 +declare <4 x float>* @"dx.hl.subscript.colMajor[].rn.<4 x float>* (i32, %class.matrix.float.4.4*, i32, i32, i32, i32)"(i32, %class.matrix.float.4.4*, i32, i32, i32, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, 
%struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #1 +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, float)"(i32, %dx.types.Handle, i32, float) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } + +!pauseresume = !{!1} +!dx.version = !{!0} +!dx.valver = !{!2} +!dx.shaderModel = !{!3} +!dx.typeAnnotations = !{!4} +!dx.entryPoints = !{!8} +!dx.fnprops = !{!15} +!dx.options = !{!16, !17} + +!0 = !{i32 1, i32 6} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{i32 1, i32 9} +!3 = !{!"cs", i32 6, i32 6} +; dx.typeAnnotations: function annotations only (no user struct needed for this pass). +!4 = !{i32 1, void ()* @main, !5} +!5 = !{!6} +!6 = !{i32 1, !7, !7} +!7 = !{} +; dx.entryPoints: entry = main, no signatures, resources = {SRV=!10, UAV=!12}. +!8 = !{void ()* @main, !"main", null, !9, null} +!9 = !{!10, !12, null, null} +!10 = !{!11} +!11 = !{i32 0, %struct.ByteAddressBuffer* @"\01?src@@3UByteAddressBuffer@@A", !"src", i32 0, i32 0, i32 1, i32 11, i32 0, null} +!12 = !{!13} +!13 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?dst@@3URWByteAddressBuffer@@A", !"dst", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null} +; dx.fnprops: CS kind=5, numthreads(1,1,1). +!15 = !{void ()* @main, i32 5, i32 1, i32 1, i32 1} +!16 = !{i32 64} +!17 = !{i32 -1}