SKaiNET-developers · michalharakal · Jun 11, 2026 · Jun 10, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/...s/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt b/...s/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt
@@ -79,6 +79,12 @@ public interface KernelProvider {
      */
     public fun matmulQ6K(): Q6KMatmulKernel? = null
 
+    /**
+     * F32 × Q5_K matmul kernel exposed by this provider, or `null` (the default
+     * implementation) if this provider does not specialize Q5_K. Same fall-through pattern.
+     */
+    public fun matmulQ5K(): Q5KMatmulKernel? = null
+
     /**
      * F32 × Q5_1 matmul kernel exposed by this provider, or `null` if
      * this provider does not specialize Q5_1. Same fall-through pattern.
@@ -126,6 +132,7 @@ public interface KernelProvider {
             "Q8_0" -> matmulQ8_0() != null
             "Q4_0" -> matmulQ4_0() != null
             "Q6_K" -> matmulQ6K() != null
+            "Q5_K" -> matmulQ5K() != null
             "Q5_1" -> matmulQ5_1() != null
             "Q5_0" -> matmulQ5_0() != null
             else -> false

diff --git a/.../skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt b/.../skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt
@@ -0,0 +1,59 @@
+package sk.ainet.backend.api.kernel
+
+/**
+ * F32 input × Q5_K-packed weights matrix-vector multiply, in canonical
+ * ggml super-block layout.
+ *
+ *   output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
+ *     for j ∈ [0, inputDim), o ∈ [0, outputDim)
+ *
+ * Block layout (256-element super-block, 176 bytes/block; see
+ * [sk.ainet.lang.tensor.data.Q5_KTensorData] kdoc for the byte map):
+ * - bytes 0..1   : `d` (super-block scale, FP16 LE)
+ * - bytes 2..3   : `dMin` (super-block min-scale, FP16 LE)
+ * - bytes 4..15  : 12 bytes of packed (6-bit scaleIdx, 6-bit minIdx) for
+ *                  8 sub-blocks via ggml's `get_scale_min_k4` mixing
+ *                  (identical to Q4_K)
+ * - bytes 16..47 : 32 bytes `qh` high-bit plane (the 5th bit of each code)
+ * - bytes 48..175: 128 bytes of 4-bit low nibbles, *strided* in 4 groups of
+ *                  32 bytes (identical layout to Q4_K's `qs`)
+ *
+ * Per sub-block s ∈ 0..7:
+ *   `scale[s]  = d    * scaleIdx[s]`
+ *   `offset[s] = dMin * minIdx[s]`
+ *   per element: `code = lowNibble | (fifthBit << 4)` (0..31);
+ *                `dequant = code * scale[s] - offset[s]`
+ *
+ * The lazy-`dmin` accumulation trick (used by every well-tuned K-quant
+ * kernel including ggml's reference) avoids subtracting `offset` per
+ * element by tracking `Σ(input · code)` and `Σ(input)` per sub-block
+ * and combining as `scale * codeSum − offset * inputSum` once.
+ *
+ * Implementations MUST NOT mutate `input` or `weight`. They MAY assume
+ * the arrays do not alias each other or `output`. They MUST fully
+ * write the `outputDim` floats starting at `output[outputOffset]`.
+ *
+ * Packed-weight block-major contract: the block for output row `o` and
+ * input block index `blockIdx` starts at byte `(blockIdx * outputDim + o) * 176`
+ * (relative to `weightByteOffset`). This matches `Q5_KBlockTensorData.packedData`.
+ *
+ * `inputDim` MUST be a multiple of 256 (the Q5_K block size).
+ */
+public interface Q5KMatmulKernel {
+    /**
+     * @param input FP32 input vector (single row); `inputDim` floats are read, none written.
+     * @param inputOffset element offset into [input] where the row starts.
+     * @param weight packed Q5_K bytes (176 per super-block) for the full `outputDim × inputDim` weight tensor.
+     * @param weightByteOffset byte offset into [weight] where block (blockIdx = 0, o = 0) starts.
+     * @param inputDim contraction dimension (must be a multiple of 256).
+     * @param outputDim number of output cells; exactly this many floats are written.
+     * @param output FP32 output vector, overwritten in `[outputOffset, outputOffset + outputDim)`.
+     * @param outputOffset element offset into [output] where the first output cell is written.
+     */
+    public fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    )
+}
diff --git a/...ds/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt b/...ds/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt
@@ -6,6 +6,7 @@ import sk.ainet.backend.api.kernel.KernelProvider
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
 import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
 import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
 import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
 import sk.ainet.backend.api.kernel.Q6KMatmulKernel
 import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
@@ -33,6 +34,7 @@ public object ScalarKernelProvider : KernelProvider {
     override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel
     override fun matmulQ4K(): Q4KMatmulKernel = ScalarQ4_KMatmulKernel
     override fun matmulQ6K(): Q6KMatmulKernel = ScalarQ6_KMatmulKernel
+    override fun matmulQ5K(): Q5KMatmulKernel = ScalarQ5_KMatmulKernel
     override fun matmulQ5_1(): Q5_1MatmulKernel = ScalarQ5_1MatmulKernel
     override fun matmulQ5_0(): Q5_0MatmulKernel = ScalarQ5_0MatmulKernel
 }
diff --git a/.../skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt b/.../skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt
@@ -0,0 +1,91 @@
+package sk.ainet.exec.kernel
+
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
+
+/**
+ * Scalar reference [Q5KMatmulKernel] — commonMain, so Q5_K packed matmul works
+ * on Kotlin/Native / JS / WASM, not only the JVM SIMD path.
+ *
+ * Q5_K super-block: 256 elements / 176 bytes, block-major `(blockIdx*outputDim+o)*176`:
+ * `d` (f16), `dMin` (f16), 12 scale bytes (ggml `get_scale_min_k4` packing), 32 `qh`
+ * high-bit bytes, 128 `qs` low-nibble bytes. Each of the 8 sub-blocks (32 elts)
+ * contributes `codeSum*scale - inputSum*offset`, with `scale = d*scaleIdx`,
+ * `offset = dMin*minIdx`, and the 5-bit `code = lowNibble | (fifthBit << 4)`.
+ * Math mirrors `DequantOps.dequantQ5KFromBytes` and the Q4_K kernel (only the
+ * 5th-bit fold differs).
+ */
+public object ScalarQ5_KMatmulKernel : Q5KMatmulKernel {
+
+    private const val BLOCK_SIZE = 256
+    private const val SUB_BLOCK = 32
+    private const val BYTES_PER_BLOCK = 176
+    private const val QH_OFFSET = 16
+    private const val QS_OFFSET = 48
+
+    /**
+     * Writes `outputDim` dot products of `input` with the packed Q5_K weight rows into `output`.
+     * @throws IllegalArgumentException if a dimension is negative or `inputDim % 256 != 0`.
+     */
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "ScalarQ5_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        // A negative multiple of 256 passes the check above, so the sign is validated separately.
+        require(inputDim >= 0 && outputDim >= 0) {
+            "ScalarQ5_KMatmulKernel: dimensions must be non-negative; got inputDim=$inputDim, outputDim=$outputDim"
+        }
+        // Zero-sized dimensions need no special case: the loops simply do not run (acc stays 0f).
+        val blocksPerInputDim = inputDim / BLOCK_SIZE
+        val scaleIdx = IntArray(8)
+        val minIdx = IntArray(8)
+
+        for (o in 0 until outputDim) {
+            var acc = 0f
+            for (blockIdx in 0 until blocksPerInputDim) {
+                val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK
+                val d = decodeHalf(((weight[blockBase + 1].toInt() and 0xFF) shl 8) or (weight[blockBase].toInt() and 0xFF))
+                val dMin = decodeHalf(((weight[blockBase + 3].toInt() and 0xFF) shl 8) or (weight[blockBase + 2].toInt() and 0xFF))
+
+                // ggml get_scale_min_k4 over the 12 scale bytes (identical to Q4_K).
+                val sc = blockBase + 4
+                for (sb in 0 until 4) {
+                    scaleIdx[sb] = weight[sc + sb].toInt() and 0x3F
+                    minIdx[sb] = weight[sc + sb + 4].toInt() and 0x3F
+                }
+                for (sb in 4 until 8) {
+                    val low4S = weight[sc + sb + 4].toInt() and 0x0F
+                    val high2S = (weight[sc + sb - 4].toInt() and 0xFF) ushr 6
+                    scaleIdx[sb] = low4S or (high2S shl 4)
+                    val low4M = (weight[sc + sb + 4].toInt() and 0xFF) ushr 4
+                    val high2M = (weight[sc + sb].toInt() and 0xFF) ushr 6
+                    minIdx[sb] = low4M or (high2M shl 4)
+                }
+
+                val qhBase = blockBase + QH_OFFSET
+                for (sb in 0 until 8) {
+                    // Sub-blocks 2g and 2g+1 share qs bytes [32g, 32g+32): low nibbles feed the even one,
+                    // high nibbles the odd one. Bit `sb` of qh[i] supplies the 5th bit of element i.
+                    val qsRegion = blockBase + QS_OFFSET + (sb ushr 1) * SUB_BLOCK
+                    val nibbleShift = (sb and 1) * 4
+                    val inStart = inputOffset + blockIdx * BLOCK_SIZE + sb * SUB_BLOCK
+                    var codeSum = 0f
+                    var inputSum = 0f
+                    for (i in 0 until SUB_BLOCK) {
+                        val low = ((weight[qsRegion + i].toInt() and 0xFF) ushr nibbleShift) and 0x0F
+                        val fifth = ((weight[qhBase + i].toInt() and 0xFF) ushr sb) and 0x01
+                        val v = input[inStart + i]
+                        codeSum += v * (low or (fifth shl 4))
+                        inputSum += v
+                    }
+                    acc += codeSum * (d * scaleIdx[sb]) - inputSum * (dMin * minIdx[sb])
+                }
+            }
+            output[outputOffset + o] = acc
+        }
+    }
+}
diff --git a/...kends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt b/...kends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt
@@ -18,6 +18,8 @@ import sk.ainet.lang.tensor.data.Q4_KTensorData
 import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q6_KTensorData
 import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q5_KTensorData
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q5_1TensorData
 import sk.ainet.lang.tensor.data.Q5_1BlockTensorData
 import sk.ainet.lang.tensor.data.Q5_0TensorData
@@ -333,6 +335,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
     private val q4_0Kernel by lazy { resolveProvider { it.matmulQ4_0() != null }?.matmulQ4_0() }
     private val q4kKernel by lazy { resolveProvider { it.matmulQ4K() != null }?.matmulQ4K() }
     private val q6kKernel by lazy { resolveProvider { it.matmulQ6K() != null }?.matmulQ6K() }
+    private val q5kKernel by lazy { resolveProvider { it.matmulQ5K() != null }?.matmulQ5K() }
     private val q5_1Kernel by lazy { resolveProvider { it.matmulQ5_1() != null }?.matmulQ5_1() }
     private val q5_0Kernel by lazy { resolveProvider { it.matmulQ5_0() != null }?.matmulQ5_0() }
 
@@ -367,6 +370,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
             is Q5_1TensorData -> q5_1Kernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q5_0TensorData -> q5_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q4_KTensorData -> q4kKernel?.let { k -> run(bd.packedData, k::matmul) }
+            is Q5_KTensorData -> q5kKernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q6_KTensorData -> q6kKernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q8_0TensorData -> q8_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q4_0TensorData -> q4_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
@@ -598,6 +602,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
             @Suppress("UNCHECKED_CAST")
             when (val d = tensor.data) {
                 is Q4_KTensorData -> return newTensor(Q4_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
+                is Q5_KTensorData -> return newTensor(Q5_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)

diff --git a/...skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt b/...skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt
@@ -5,6 +5,7 @@ import sk.ainet.backend.api.kernel.Fp32MatmulKernel
 import sk.ainet.backend.api.kernel.KernelProvider
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
 import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
 import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
 import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
 import sk.ainet.backend.api.kernel.Q6KMatmulKernel
@@ -65,6 +66,9 @@ public object PanamaVectorKernelProvider : KernelProvider {
     override fun matmulQ6K(): Q6KMatmulKernel? =
         if (isAvailable()) PanamaVectorQ6_KMatmulKernel else null
 
+    override fun matmulQ5K(): Q5KMatmulKernel? =
+        if (isAvailable()) PanamaVectorQ5_KMatmulKernel else null
+
     private fun isVectorApiClassLoaded(): Boolean = runCatching {
         Class.forName("jdk.incubator.vector.FloatVector")
         Class.forName("jdk.incubator.vector.VectorSpecies")