From 439bf91992918df3e179bc7a7d11375b88c055c0 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Mon, 15 Jun 2026 13:06:49 +0200
Subject: [PATCH] fix(cpu-ops): lazy transpose for Q8_0 packed tensors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ops.transpose rewraps the packed bytes with a flipped shape for the K-series
(Q4_K/Q5_K/Q6_K) and Q5_0/Q5_1, but Q8_0 falls through to the generic FP32
DenseTensorDataFactory path, which casts the Byte-backed buffer to Float and
throws ClassCastException. Add the analogous Q8_0TensorData case, which
rewraps the packed bytes as Q8_0BlockTensorData.

This unblocks keeping a Q8_0 matmul weight packed through linearProject
(matmul(x, transpose(W))) — notably FunctionGemma's tied Q8_0 lm_head, which
otherwise has to be dequantized to FP32 (~0.67 GB), causing an OOM on the
1.9 GB SL2610 board.

Verified: SKaiNET-transformers GemmaQ5KPackedParityTest (eager load(NATIVE_OPTIMIZED))
now packs the lm_head as Q8_0 and decodes byte-identically to the FP32 baseline.
See SKaiNET-transformers#178.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt       | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt
index d61eb889..0e2c889a 100644
--- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt
+++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt
@@ -24,6 +24,7 @@ import sk.ainet.lang.tensor.data.Q5_1TensorData
 import sk.ainet.lang.tensor.data.Q5_1BlockTensorData
 import sk.ainet.lang.tensor.data.Q5_0TensorData
 import sk.ainet.lang.tensor.data.Q5_0BlockTensorData
+import sk.ainet.lang.tensor.data.Q8_0BlockTensorData
 import sk.ainet.lang.tensor.data.TensorData
 import sk.ainet.lang.tensor.data.TensorDataFactory
 import sk.ainet.lang.tensor.ops.UpsampleMode
@@ -606,6 +607,12 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
                 is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
+                // Q8_0 lazy transpose: rewrap the same input-block-major bytes with
+                // flipped shape (bytes are layout-agnostic to the [out,in] kernel
+                // convention) so a packed Q8_0 weight (e.g. gemma's tied lm_head)
+                // survives linearProject's transpose instead of hitting the generic
+                // FP32 path (Byte→Float ClassCastException). See transformers #178.
+                is Q8_0TensorData -> return newTensor(Q8_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 else -> {}
             }
         }