From 439bf91992918df3e179bc7a7d11375b88c055c0 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 15 Jun 2026 13:06:49 +0200 Subject: [PATCH] fix(cpu-ops): lazy transpose for Q8_0 packed tensors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ops.transpose rewraps the packed bytes with a flipped shape for the K-series (Q4_K/Q5_K/Q6_K) and Q5_0/Q5_1, but Q8_0 falls through to the generic FP32 DenseTensorDataFactory path, which casts the Byte-backed buffer to Float and throws ClassCastException. Add the analogous Q8_0BlockTensorData case. This unblocks keeping a Q8_0 matmul weight packed through linearProject (matmul(x, transpose(W))) — notably FunctionGemma's tied Q8_0 lm_head, which otherwise must be dequantized to FP32 (~0.67 GB), causing an OOM on the 1.9 GB SL2610 board. Verified: SKaiNET-transformers GemmaQ5KPackedParityTest (eager load(NATIVE_OPTIMIZED)) now packs the lm_head as Q8_0 and decodes byte-identically to the FP32 baseline. See SKaiNET-transformers#178. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt index d61eb889..0e2c889a 100644 --- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt @@ -24,6 +24,7 @@ import sk.ainet.lang.tensor.data.Q5_1TensorData import sk.ainet.lang.tensor.data.Q5_1BlockTensorData import sk.ainet.lang.tensor.data.Q5_0TensorData import sk.ainet.lang.tensor.data.Q5_0BlockTensorData +import sk.ainet.lang.tensor.data.Q8_0BlockTensorData import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.UpsampleMode @@ -606,6 +607,12 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + // Q8_0 lazy transpose: rewrap the same input-block-major bytes with + // flipped shape (bytes are layout-agnostic to the [out,in] kernel + // convention) so a packed Q8_0 weight (e.g. gemma's tied lm_head) + // survives linearProject's transpose instead of hitting the generic + // FP32 path (Byte→Float ClassCastException). See transformers #178. 
+ is Q8_0TensorData -> return newTensor(Q8_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) else -> {} } }