From 01010416b5d19f890b92e063d54067a42654d965 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Wed, 10 Jun 2026 23:13:55 +0200
Subject: [PATCH 1/4] feat(backend): first-class Q5_K packed matmul + ARM NEON
 kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds Q5_K as a packed in-kernel dequant-matmul format (previously Q5_K was
only eagerly decoded to FP32 on load), mirroring the existing Q4_K plumbing,
and hand-written ARM NEON paths for the native CPU kernels.

Q5_K (256-elt / 176-byte super-block: d, dMin, 12 packed scales, 32-byte qh
high-bit plane, 128-byte qs low nibbles; 5-bit code = lowNibble | (5th<<4)):
- TensorEncoding.Q5_K; Q5_KTensorData / Q5_KBlockTensorData (5th-bit fold).
- Q5KMatmulKernel SPI + matmulQ5K()/"Q5_K" in KernelProvider.supports().
- ScalarQ5_KMatmulKernel (commonMain/KN), PanamaVectorQ5_KMatmulKernel (JVM),
  native C skainet_q5k_matmul + NativeQ5KMatmulKernel (FFM); all registered.
- DefaultCpuOps matmul dispatch + lazy-transpose branches.
- StreamingGgufParametersLoader: Q5_K + Q6_K packed branches (a Q5_K_M GGUF
  now loads end-to-end instead of SKIP'ing most tensors).

Tests: Q5_KBlockTensorData bit-exact vs DequantOps golden across blocks;
native<->Panama<->scalar matmul parity; KernelSupportMatrixTest gate updated.

ARM NEON (behind #if __ARM_NEON in skainet_simd.h; x86 keeps the scalar
fallback, re-verified green):
- fp32 (broadcast+vfmaq), q8_0 (widen int8->f32+vfmaq), q4k/q5k (nibble
  unpack + dual code/input accumulators; q5k folds the qh 5th bit via a
  runtime-count vshlq_u8).
- CMake aarch64 branch: -march=armv8.2-a+fp16+dotprod (no +i8mm — A55 lacks
  it). Cross toolchain-aarch64.cmake + opt-in -PcrossArm64 gradle tasks;
  default x86 build unaffected.

BOARD-VERIFY-PENDING: the NEON paths are aarch64-syntax-validated (clang
--target=aarch64) but not executed (x86 host, no QEMU). Run the parity tests
under qemu-aarch64 or on the SL2610 before relying on them.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../backend/api/kernel/KernelProvider.kt      |   7 +
 .../backend/api/kernel/Q5KMatmulKernel.kt     |  59 ++++
 .../ainet/exec/kernel/ScalarKernelProvider.kt |   2 +
 .../exec/kernel/ScalarQ5_KMatmulKernel.kt     |  91 +++++
 .../sk/ainet/exec/tensor/ops/DefaultCpuOps.kt |   5 +
 .../exec/kernel/PanamaVectorKernelProvider.kt |   4 +
 .../kernel/PanamaVectorQ5_KMatmulKernel.kt    | 193 +++++++++++
 .../PanamaVectorQ5_KMatmulKernelParityTest.kt |  71 ++++
 .../build.gradle.kts                          |  60 ++++
 .../native/CMakeLists.txt                     |  12 +
 .../native/include/skainet_kernels.h          |  25 ++
 .../native/include/skainet_simd.h             |  51 +++
 .../native/src/fp32_matmul.c                  |  14 +
 .../native/src/q4k_matmul.c                   |  26 ++
 .../native/src/q5k_matmul.c                   | 202 +++++++++++
 .../native/src/q8_0_matmul.c                  |  21 ++
 .../native/toolchain-aarch64.cmake            |  28 ++
 .../ainet/exec/kernel/NativeKernelProvider.kt |   4 +
 .../exec/kernel/NativeQ5KMatmulKernel.kt      |  91 +++++
 .../exec/kernel/KernelSupportMatrixTest.kt    |   8 +-
 .../kernel/NativeQ5KMatmulKernelParityTest.kt |  94 +++++
 .../io/gguf/StreamingGgufParametersLoader.kt  |  14 +
 .../io/gguf/Q5KBlockTensorDataParityTest.kt   | 115 +++++++
 .../ainet/lang/tensor/data/Q5_KTensorData.kt  | 320 ++++++++++++++++++
 .../lang/tensor/storage/TensorEncoding.kt     |  12 +
 25 files changed, 1525 insertions(+), 4 deletions(-)
 create mode 100644 skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt
 create mode 100644 skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt
 create mode 100644 skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt
 create mode 100644 skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt
 create mode 100644 skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h
 create mode 100644 skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c
 create mode 100644 skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake
 create mode 100644 skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt
 create mode 100644 skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt
 create mode 100644 skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt
 create mode 100644 skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt

diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt
index 09d99a9c..ad475190 100644
--- a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt
+++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt
@@ -79,6 +79,12 @@ public interface KernelProvider {
      */
     public fun matmulQ6K(): Q6KMatmulKernel? = null
 
+    /**
+     * F32 × Q5_K matmul kernel exposed by this provider, or `null` if
+     * this provider does not specialize Q5_K. Same fall-through pattern.
+     */
+    public fun matmulQ5K(): Q5KMatmulKernel? = null
+
     /**
      * F32 × Q5_1 matmul kernel exposed by this provider, or `null` if
      * this provider does not specialize Q5_1. Same fall-through pattern.
@@ -126,6 +132,7 @@ public interface KernelProvider {
             "Q8_0" -> matmulQ8_0() != null
             "Q4_0" -> matmulQ4_0() != null
             "Q6_K" -> matmulQ6K() != null
+            "Q5_K" -> matmulQ5K() != null
             "Q5_1" -> matmulQ5_1() != null
             "Q5_0" -> matmulQ5_0() != null
             else -> false
diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt
new file mode 100644
index 00000000..54398e22
--- /dev/null
+++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt
@@ -0,0 +1,59 @@
+package sk.ainet.backend.api.kernel
+
+/**
+ * F32 input × Q5_K-packed weights matrix-vector multiply, in canonical
+ * ggml super-block layout.
+ *
+ *   output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
+ *     for j ∈ [0, inputDim), o ∈ [0, outputDim)
+ *
+ * Block layout (256-element super-block, 176 bytes/block; see
+ * [sk.ainet.lang.tensor.data.Q5_KTensorData] kdoc for the byte map):
+ * - bytes 0..1   : `d` (super-block scale, FP16 LE)
+ * - bytes 2..3   : `dMin` (super-block min-scale, FP16 LE)
+ * - bytes 4..15  : 12 bytes of packed (6-bit scaleIdx, 6-bit minIdx) for
+ *                  8 sub-blocks via ggml's `get_scale_min_k4` mixing
+ *                  (identical to Q4_K)
+ * - bytes 16..47 : 32 bytes `qh` high-bit plane (the 5th bit of each code)
+ * - bytes 48..175: 128 bytes of 4-bit low nibbles, *strided* in 4 groups of
+ *                  32 bytes (identical layout to Q4_K's `qs`)
+ *
+ * Per sub-block s ∈ 0..7:
+ *   `scale[s]  = d    * scaleIdx[s]`
+ *   `offset[s] = dMin * minIdx[s]`
+ *   per element: `code = lowNibble | (fifthBit << 4)` (0..31);
+ *                `dequant = code * scale[s] - offset[s]`
+ *
+ * The lazy-`dmin` accumulation trick (used by every well-tuned K-quant
+ * kernel including ggml's reference) avoids subtracting `offset` per
+ * element by tracking `Σ(input · code)` and `Σ(input)` per sub-block
+ * and combining as `scale * codeSum − offset * inputSum` once.
+ *
+ * Implementations MUST NOT mutate `input` or `weight`. They MAY assume
+ * the arrays do not alias each other or `output`. They MUST fully
+ * write the `outputDim` floats starting at `output[outputOffset]`.
+ *
+ * Packed-weight row-major contract: `weight` holds blocks laid out
+ * `(blockIdx * outputDim + o) * 176` for output row `o` and input
+ * block index `blockIdx`. This matches `Q5_KBlockTensorData.packedData`.
+ *
+ * `inputDim` MUST be a multiple of 256 (the Q5_K block size).
+ */
+public interface Q5KMatmulKernel {
+    /**
+     * @param input FP32 input vector (single row).
+     * @param inputOffset element offset into [input] where the row starts.
+     * @param weight packed Q5_K bytes for the full `outputDim × inputDim` weight tensor.
+     * @param weightByteOffset byte offset into [weight] where block (0, 0) starts.
+     * @param inputDim contraction dimension (must be a multiple of 256).
+     * @param outputDim number of output cells.
+     * @param output FP32 output vector.
+     * @param outputOffset element offset into [output] where the row starts.
+     */
+    public fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    )
+}
diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt
index 0611ce76..31a8386b 100644
--- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt
+++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt
@@ -6,6 +6,7 @@ import sk.ainet.backend.api.kernel.KernelProvider
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
 import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
 import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
 import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
 import sk.ainet.backend.api.kernel.Q6KMatmulKernel
 import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
@@ -33,6 +34,7 @@ public object ScalarKernelProvider : KernelProvider {
     override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel
     override fun matmulQ4K(): Q4KMatmulKernel = ScalarQ4_KMatmulKernel
     override fun matmulQ6K(): Q6KMatmulKernel = ScalarQ6_KMatmulKernel
+    override fun matmulQ5K(): Q5KMatmulKernel = ScalarQ5_KMatmulKernel
     override fun matmulQ5_1(): Q5_1MatmulKernel = ScalarQ5_1MatmulKernel
     override fun matmulQ5_0(): Q5_0MatmulKernel = ScalarQ5_0MatmulKernel
 }
diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt
new file mode 100644
index 00000000..63daf5e0
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt
@@ -0,0 +1,91 @@
+package sk.ainet.exec.kernel
+
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
+
+/**
+ * Scalar reference [Q5KMatmulKernel] — commonMain, so Q5_K packed matmul works
+ * on Kotlin/Native / JS / WASM, not only the JVM SIMD path.
+ *
+ * Q5_K super-block: 256 elements / 176 bytes, block-major `(blockIdx*outputDim+o)*176`:
+ * `d`(f16) `dMin`(f16) 12 scale bytes (ggml `get_scale_min_k4` packing) 32 `qh`
+ * high-bit bytes 128 `qs` low-nibble bytes. Each of the 8 sub-blocks (32 elts)
+ * contributes `codeSum*scale - inputSum*offset`, with `scale = d*scaleIdx`,
+ * `offset = dMin*minIdx`, and the 5-bit `code = lowNibble | (fifthBit << 4)`.
+ * Math mirrors `DequantOps.dequantQ5KFromBytes` and the Q4_K kernel (only the
+ * 5th-bit fold differs).
+ */
+public object ScalarQ5_KMatmulKernel : Q5KMatmulKernel {
+
+    private const val BLOCK_SIZE = 256
+    private const val SUB_BLOCK = 32
+    private const val BYTES_PER_BLOCK = 176
+    private const val QH_OFFSET = 16
+    private const val QS_OFFSET = 48
+
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "ScalarQ5_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0) return
+        if (inputDim == 0) { for (o in 0 until outputDim) output[outputOffset + o] = 0f; return }
+        val blocksPerInputDim = inputDim / BLOCK_SIZE
+        val scaleIdx = IntArray(8)
+        val minIdx = IntArray(8)
+
+        for (o in 0 until outputDim) {
+            var acc = 0f
+            for (blockIdx in 0 until blocksPerInputDim) {
+                val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK
+                val d = decodeHalf(((weight[blockBase + 1].toInt() and 0xFF) shl 8) or (weight[blockBase].toInt() and 0xFF))
+                val dMin = decodeHalf(((weight[blockBase + 3].toInt() and 0xFF) shl 8) or (weight[blockBase + 2].toInt() and 0xFF))
+
+                // ggml get_scale_min_k4 over the 12 scale bytes (identical to Q4_K).
+                val sc = blockBase + 4
+                for (sb in 0 until 4) {
+                    scaleIdx[sb] = weight[sc + sb].toInt() and 0x3F
+                    minIdx[sb] = weight[sc + sb + 4].toInt() and 0x3F
+                }
+                for (sb in 4 until 8) {
+                    val low4S = weight[sc + sb + 4].toInt() and 0x0F
+                    val high2S = (weight[sc + sb - 4].toInt() and 0xFF) ushr 6
+                    scaleIdx[sb] = low4S or (high2S shl 4)
+                    val low4M = (weight[sc + sb + 4].toInt() and 0xFF) ushr 4
+                    val high2M = (weight[sc + sb].toInt() and 0xFF) ushr 6
+                    minIdx[sb] = low4M or (high2M shl 4)
+                }
+
+                val qhBase = blockBase + QH_OFFSET
+                val qsBase = blockBase + QS_OFFSET
+                val inBlockBase = inputOffset + blockIdx * BLOCK_SIZE
+                for (groupJ in 0 until 4) {
+                    val qsRegion = qsBase + groupJ * 32
+                    // sub-block lo (low nibbles) then hi (high nibbles) of the same 32 bytes;
+                    // the 5th bit comes from qh[i], bit (2*groupJ + half).
+                    for (half in 0 until 2) {
+                        val sb = 2 * groupJ + half
+                        val bit = 2 * groupJ + half
+                        val inStart = inBlockBase + sb * SUB_BLOCK
+                        var codeSum = 0f
+                        var inputSum = 0f
+                        for (i in 0 until 32) {
+                            val b = weight[qsRegion + i].toInt() and 0xFF
+                            val low = if (half == 0) (b and 0x0F) else (b ushr 4)
+                            val fifth = ((weight[qhBase + i].toInt() and 0xFF) ushr bit) and 0x01
+                            val code = low or (fifth shl 4)
+                            val v = input[inStart + i]
+                            codeSum += v * code
+                            inputSum += v
+                        }
+                        acc += codeSum * (d * scaleIdx[sb]) - inputSum * (dMin * minIdx[sb])
+                    }
+                }
+            }
+            output[outputOffset + o] = acc
+        }
+    }
+}
diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt
index 1a45ae7d..d61eb889 100644
--- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt
+++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt
@@ -18,6 +18,8 @@ import sk.ainet.lang.tensor.data.Q4_KTensorData
 import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q6_KTensorData
 import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q5_KTensorData
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q5_1TensorData
 import sk.ainet.lang.tensor.data.Q5_1BlockTensorData
 import sk.ainet.lang.tensor.data.Q5_0TensorData
@@ -333,6 +335,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
     private val q4_0Kernel by lazy { resolveProvider { it.matmulQ4_0() != null }?.matmulQ4_0() }
     private val q4kKernel by lazy { resolveProvider { it.matmulQ4K() != null }?.matmulQ4K() }
     private val q6kKernel by lazy { resolveProvider { it.matmulQ6K() != null }?.matmulQ6K() }
+    private val q5kKernel by lazy { resolveProvider { it.matmulQ5K() != null }?.matmulQ5K() }
     private val q5_1Kernel by lazy { resolveProvider { it.matmulQ5_1() != null }?.matmulQ5_1() }
     private val q5_0Kernel by lazy { resolveProvider { it.matmulQ5_0() != null }?.matmulQ5_0() }
 
@@ -367,6 +370,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
             is Q5_1TensorData -> q5_1Kernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q5_0TensorData -> q5_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q4_KTensorData -> q4kKernel?.let { k -> run(bd.packedData, k::matmul) }
+            is Q5_KTensorData -> q5kKernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q6_KTensorData -> q6kKernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q8_0TensorData -> q8_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
             is Q4_0TensorData -> q4_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
@@ -598,6 +602,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
             @Suppress("UNCHECKED_CAST")
             when (val d = tensor.data) {
                 is Q4_KTensorData -> return newTensor(Q4_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
+                is Q5_KTensorData -> return newTensor(Q5_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
                 is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt
index 8aee59e2..e9bb5b59 100644
--- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt
+++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt
@@ -5,6 +5,7 @@ import sk.ainet.backend.api.kernel.Fp32MatmulKernel
 import sk.ainet.backend.api.kernel.KernelProvider
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
 import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
 import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
 import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
 import sk.ainet.backend.api.kernel.Q6KMatmulKernel
@@ -65,6 +66,9 @@ public object PanamaVectorKernelProvider : KernelProvider {
     override fun matmulQ6K(): Q6KMatmulKernel? =
         if (isAvailable()) PanamaVectorQ6_KMatmulKernel else null
 
+    override fun matmulQ5K(): Q5KMatmulKernel? =
+        if (isAvailable()) PanamaVectorQ5_KMatmulKernel else null
+
     private fun isVectorApiClassLoaded(): Boolean = runCatching {
         Class.forName("jdk.incubator.vector.FloatVector")
         Class.forName("jdk.incubator.vector.VectorSpecies")
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt
new file mode 100644
index 00000000..5302a471
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt
@@ -0,0 +1,193 @@
+package sk.ainet.exec.kernel
+
+import jdk.incubator.vector.ByteVector
+import jdk.incubator.vector.FloatVector
+import jdk.incubator.vector.VectorOperators
+import jdk.incubator.vector.VectorSpecies
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
+import sk.ainet.exec.tensor.ops.parallelChunks
+
+/**
+ * SIMD-vectorized Q5_K matmul on the JDK Vector API.
+ *
+ * Identical pipeline to [PanamaVectorQ4KMatmulKernel], with one extra step:
+ * each code is 5-bit, so the low nibble (from `qs`) is OR'd with the 5th bit
+ * pulled from the `qh` high-bit plane. For super-block group `j` (0..3), the
+ * low sub-block uses `qh` bit `2j` and the high sub-block uses bit `2j + 1`,
+ * with `qh` indexed by the intra-group position (same `idx` as `qs`). See
+ * [sk.ainet.lang.tensor.data.Q5_KTensorData] / `DequantOps.dequantQ5KFromBytes`.
+ *
+ * Numerical equivalence with the scalar reference is within FMA +
+ * reordered-reduction tolerance; verified via parity tests.
+ */
+public object PanamaVectorQ5_KMatmulKernel : Q5KMatmulKernel {
+
+    private const val BLOCK_SIZE = 256
+    private const val SUB_BLOCK_SIZE = 32
+    private const val SUB_BLOCKS_PER_BLOCK = 8
+    private const val BYTES_PER_BLOCK = 176
+    private const val QH_OFFSET = 16
+    private const val QS_OFFSET = 48
+
+    private val floatSpecies: VectorSpecies<Float> = FloatVector.SPECIES_PREFERRED
+
+    private val byteSpeciesForFloat: VectorSpecies<Byte> = when (floatSpecies.length()) {
+        16 -> ByteVector.SPECIES_128
+        else -> ByteVector.SPECIES_64 // covers 4-wide (NEON) and 8-wide (AVX2)
+    }
+
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "PanamaVectorQ5_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0 || inputDim == 0) return
+        val blocksPerInputDim = inputDim / BLOCK_SIZE
+
+        parallelChunks(outputDim) { startO, endO ->
+            val scaleIdx = IntArray(SUB_BLOCKS_PER_BLOCK)
+            val minIdx = IntArray(SUB_BLOCKS_PER_BLOCK)
+            for (o in startO until endO) {
+                var acc = 0f
+                for (blockIdx in 0 until blocksPerInputDim) {
+                    val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK
+
+                    // d, dMin (FP16 LE).
+                    val dBits = (weight[blockBase + 1].toInt() and 0xFF shl 8) or
+                        (weight[blockBase].toInt() and 0xFF)
+                    val dMinBits = (weight[blockBase + 3].toInt() and 0xFF shl 8) or
+                        (weight[blockBase + 2].toInt() and 0xFF)
+                    val d = halfToFloat(dBits)
+                    val dMin = halfToFloat(dMinBits)
+
+                    // Sub-scale decode via ggml `get_scale_min_k4` (identical to Q4_K).
+                    val scalesOffset = blockBase + 4
+                    for (sb in 0 until 4) {
+                        scaleIdx[sb] = weight[scalesOffset + sb].toInt() and 0x3F
+                        minIdx[sb] = weight[scalesOffset + sb + 4].toInt() and 0x3F
+                    }
+                    for (sb in 4 until 8) {
+                        val low4S = weight[scalesOffset + sb + 4].toInt() and 0x0F
+                        val high2S = (weight[scalesOffset + sb - 4].toInt() and 0xFF) ushr 6
+                        scaleIdx[sb] = low4S or (high2S shl 4)
+                        val low4M = (weight[scalesOffset + sb + 4].toInt() and 0xFF) ushr 4
+                        val high2M = (weight[scalesOffset + sb].toInt() and 0xFF) ushr 6
+                        minIdx[sb] = low4M or (high2M shl 4)
+                    }
+
+                    // 4 strided qs groups; each carries sbLo (lo nibbles) and sbHi (hi nibbles).
+                    // The 5th bit comes from qh[idx], bit (2*groupJ) for lo, (2*groupJ+1) for hi.
+                    val qhOffset = blockBase + QH_OFFSET
+                    val qsOffset = blockBase + QS_OFFSET
+                    val inputBlockBase = inputOffset + blockIdx * BLOCK_SIZE
+                    for (groupJ in 0 until 4) {
+                        val qsRegion = qsOffset + groupJ * 32
+                        val sbLo = 2 * groupJ
+                        val sbHi = sbLo + 1
+                        val bitLo = (2 * groupJ).toByte()
+                        val bitHi = (2 * groupJ + 1).toByte()
+                        val inputStartLo = inputBlockBase + sbLo * SUB_BLOCK_SIZE
+                        val inputStartHi = inputStartLo + SUB_BLOCK_SIZE
+
+                        var codeAccLo = FloatVector.zero(floatSpecies)
+                        var inputAccLo = FloatVector.zero(floatSpecies)
+                        var codeAccHi = FloatVector.zero(floatSpecies)
+                        var inputAccHi = FloatVector.zero(floatSpecies)
+
+                        val floatStep = floatSpecies.length()
+                        val byteLoadLen = byteSpeciesForFloat.length()
+                        var idx = 0
+
+                        // SIMD body — single qs + qh load feeds both nibble vectors.
+                        while (idx + floatStep <= SUB_BLOCK_SIZE &&
+                            qsRegion + idx + byteLoadLen <= weight.size
+                        ) {
+                            val inVecLo = FloatVector.fromArray(floatSpecies, input, inputStartLo + idx)
+                            val inVecHi = FloatVector.fromArray(floatSpecies, input, inputStartHi + idx)
+                            val byteVec = ByteVector.fromArray(byteSpeciesForFloat, weight, qsRegion + idx)
+                            val qhVec = ByteVector.fromArray(byteSpeciesForFloat, weight, qhOffset + idx)
+                            val loNib = byteVec.and(0x0F.toByte())
+                            val hiNib = byteVec.lanewise(VectorOperators.LSHR, 4.toByte())
+                            val fifthLo = qhVec.lanewise(VectorOperators.LSHR, bitLo)
+                                .and(0x01.toByte()).lanewise(VectorOperators.LSHL, 4.toByte())
+                            val fifthHi = qhVec.lanewise(VectorOperators.LSHR, bitHi)
+                                .and(0x01.toByte()).lanewise(VectorOperators.LSHL, 4.toByte())
+                            val codeLoBytes = loNib.lanewise(VectorOperators.OR, fifthLo)
+                            val codeHiBytes = hiNib.lanewise(VectorOperators.OR, fifthHi)
+                            val codeVecLo = codeLoBytes.castShape(floatSpecies, 0) as FloatVector
+                            val codeVecHi = codeHiBytes.castShape(floatSpecies, 0) as FloatVector
+                            codeAccLo = inVecLo.fma(codeVecLo, codeAccLo)
+                            inputAccLo = inVecLo.add(inputAccLo)
+                            codeAccHi = inVecHi.fma(codeVecHi, codeAccHi)
+                            inputAccHi = inVecHi.add(inputAccHi)
+                            idx += floatStep
+                        }
+
+                        var codeSumLo = codeAccLo.reduceLanes(VectorOperators.ADD)
+                        var inputSumLo = inputAccLo.reduceLanes(VectorOperators.ADD)
+                        var codeSumHi = codeAccHi.reduceLanes(VectorOperators.ADD)
+                        var inputSumHi = inputAccHi.reduceLanes(VectorOperators.ADD)
+
+                        // Scalar tail — only fires if floatSpecies.length() doesn't divide 32 (rare).
+                        while (idx < SUB_BLOCK_SIZE) {
+                            val byte = weight[qsRegion + idx].toInt() and 0xFF
+                            val qh = weight[qhOffset + idx].toInt() and 0xFF
+                            val codeLo = ((byte and 0x0F) or (((qh ushr bitLo.toInt()) and 0x01) shl 4)).toFloat()
+                            val codeHi = ((byte ushr 4) or (((qh ushr bitHi.toInt()) and 0x01) shl 4)).toFloat()
+                            val vLo = input[inputStartLo + idx]
+                            val vHi = input[inputStartHi + idx]
+                            codeSumLo += vLo * codeLo
+                            inputSumLo += vLo
+                            codeSumHi += vHi * codeHi
+                            inputSumHi += vHi
+                            idx++
+                        }
+
+                        val scaleLo = d * scaleIdx[sbLo]
+                        val offsetLo = dMin * minIdx[sbLo]
+                        val scaleHi = d * scaleIdx[sbHi]
+                        val offsetHi = dMin * minIdx[sbHi]
+                        acc += codeSumLo * scaleLo - inputSumLo * offsetLo
+                        acc += codeSumHi * scaleHi - inputSumHi * offsetHi
+                    }
+                }
+                output[outputOffset + o] = acc
+            }
+        }
+    }
+
+    /**
+     * IEEE 754 binary16 → binary32 conversion. Mirrors the helper in
+     * [PanamaVectorQ4KMatmulKernel].
+     */
+    private fun halfToFloat(hbits: Int): Float {
+        val sign = (hbits ushr 15) and 0x1
+        val exp = (hbits ushr 10) and 0x1F
+        val frac = hbits and 0x3FF
+        return when {
+            exp == 0 -> {
+                if (frac == 0) {
+                    if (sign == 0) 0.0f else -0.0f
+                } else {
+                    val f = frac / 1024.0f * (1.0f / 16384.0f)
+                    if (sign == 0) f else -f
+                }
+            }
+            exp == 0x1F -> {
+                if (frac == 0) {
+                    if (sign == 0) Float.POSITIVE_INFINITY else Float.NEGATIVE_INFINITY
+                } else {
+                    Float.NaN
+                }
+            }
+            else -> {
+                val bits = (sign shl 31) or ((exp - 15 + 127) shl 23) or (frac shl 13)
+                Float.fromBits(bits)
+            }
+        }
+    }
+}
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt
new file mode 100644
index 00000000..cd269d7d
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt
@@ -0,0 +1,71 @@
+package sk.ainet.exec.kernel
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertTrue
+
+/**
+ * Numerical parity for [PanamaVectorQ5_KMatmulKernel] against
+ * [ScalarQ5_KMatmulKernel] — the commonMain reference. Both share the
+ * canonical Q5_K layout (176-byte block, `qh` 5th-bit plane) and the
+ * lazy-`dmin` accumulation, so outputs must agree within FMA +
+ * reordered-reduction tolerance.
+ *
+ * Fixture: random Q5_K bytes with `d`/`dMin` clamped to `1.0f16` (no
+ * NaN/Inf), packed input-block-major `(blockIdx * outputDim + o) * 176`.
+ * The random `qh` bytes exercise the full 5-bit code range.
+ */
+class PanamaVectorQ5_KMatmulKernelParityTest {
+
+    private val blockSize = 256
+    private val bytesPerBlock = 176
+
+    private fun randomQ5KBytes(numBlocks: Int, seed: Int): ByteArray {
+        val rng = Random(seed)
+        val bytes = ByteArray(numBlocks * bytesPerBlock)
+        rng.nextBytes(bytes)
+        for (block in 0 until numBlocks) {
+            val base = block * bytesPerBlock
+            // 0x3C00 == 1.0f16. Force d = dMin = 1.0f16 so dequant stays finite.
+            bytes[base + 0] = 0x00.toByte()
+            bytes[base + 1] = 0x3C.toByte()
+            bytes[base + 2] = 0x00.toByte()
+            bytes[base + 3] = 0x3C.toByte()
+        }
+        return bytes
+    }
+
+    private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) {
+        val numBlocks = (inputDim / blockSize) * outputDim
+        val packed = randomQ5KBytes(numBlocks, seed)
+        val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f }
+
+        val refOut = FloatArray(outputDim)
+        ScalarQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0)
+
+        val vecOut = FloatArray(outputDim)
+        PanamaVectorQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, vecOut, 0)
+
+        for (o in 0 until outputDim) {
+            val diff = abs(refOut[o] - vecOut[o])
+            val rel = diff / (abs(refOut[o]) + 1e-9f)
+            assertTrue(
+                diff <= tol || rel < 1e-4f,
+                "row $o diverged: scalar=${refOut[o]} panama=${vecOut[o]} diff=$diff rel=$rel tol=$tol",
+            )
+        }
+    }
+
+    @Test
+    fun single_block_single_row() = assertParity(256, 1, 42, 1e-2f)
+
+    @Test
+    fun single_block_multi_row() = assertParity(256, 16, 7, 1e-2f)
+
+    @Test
+    fun multi_block_multi_row() = assertParity(1024, 64, 123, 5e-2f)
+
+    @Test
+    fun llm_typical_shape() = assertParity(4096, 64, 999, 5e-1f)
+}
diff --git a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
index 9efc6865..6ab0ddd9 100644
--- a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
+++ b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
@@ -105,12 +105,72 @@ val packageNativeKernels by tasks.registering(Copy::class) {
     into(nativeResourceTargetDir)
 }
 
+// --- Cross-compile to aarch64 (opt-in) -------------------------------------
+//
+// Produces native/linux-arm64/libskainet_kernels.so with the NEON paths
+// (CMAKE_SYSTEM_PROCESSOR=aarch64 -> -march=armv8.2-a+fp16+dotprod). Gated on
+// `-PcrossArm64=true` so the default host build is unaffected on machines
+// without the `gcc-aarch64-linux-gnu` cross toolchain. The board build / CI
+// host opts in. NativeLibraryLoader resolves native/linux-arm64/ from os.arch
+// at runtime, so the consuming side needs no change once this .so is bundled.
+//
+// BOARD-VERIFY-PENDING: the NEON code is syntax-validated for aarch64 but has
+// not been executed; run the parity tests under QEMU or on the SL2610.
+val crossArm64Enabled: Boolean = (findProperty("crossArm64") as String?)?.toBoolean() == true
+val aarch64Cc: String = (findProperty("skainetAarch64Cc") as String?) ?: "aarch64-linux-gnu-gcc"
+val cmakeBuildArm64Path: String = layout.buildDirectory.dir("native/cmake-build-arm64").get().asFile.absolutePath
+val nativeResourceArm64Dir = nativeResourcesRoot.map { it.dir("native/linux-arm64") }
+val toolchainFilePath = "$nativeSourcePath/toolchain-aarch64.cmake"
+
+val configureNativeKernelsArm64 by tasks.registering(Exec::class) {
+    group = "build"
+    description = "CMake configure for the aarch64 (NEON) native kernels (cross-compile)."
+    onlyIf { crossArm64Enabled }
+    inputs.file("$nativeSourcePath/CMakeLists.txt")
+    inputs.dir("$nativeSourcePath/src")
+    inputs.dir("$nativeSourcePath/include")
+    outputs.dir(cmakeBuildArm64Path)
+    commandLine = listOf(
+        "cmake",
+        "-S", nativeSourcePath,
+        "-B", cmakeBuildArm64Path,
+        "-DCMAKE_BUILD_TYPE=Release",
+        "-DCMAKE_TOOLCHAIN_FILE=$toolchainFilePath",
+        "-DSKAINET_AARCH64_CC=$aarch64Cc",
+    )
+}
+
+val buildNativeKernelsArm64 by tasks.registering(Exec::class) {
+    group = "build"
+    description = "Cross-build the aarch64 (NEON) native kernels shared library."
+    onlyIf { crossArm64Enabled }
+    dependsOn(configureNativeKernelsArm64)
+    inputs.file("$nativeSourcePath/CMakeLists.txt")
+    inputs.dir("$nativeSourcePath/src")
+    inputs.dir("$nativeSourcePath/include")
+    outputs.dir(cmakeBuildArm64Path)
+    commandLine = listOf("cmake", "--build", cmakeBuildArm64Path, "--config", "Release")
+}
+
+val packageNativeKernelsArm64 by tasks.registering(Copy::class) {
+    group = "build"
+    description = "Stage the cross-built aarch64 native kernels into JVM resources."
+    onlyIf { crossArm64Enabled }
+    dependsOn(buildNativeKernelsArm64)
+    from(cmakeBuildArm64Path) {
+        include("libskainet_kernels.so")
+        eachFile { path = name }
+    }
+    into(nativeResourceArm64Dir)
+}
+
 kotlin.sourceSets.named("jvmMain") {
     resources.srcDir(nativeResourcesRoot)
 }
 
 tasks.named("jvmProcessResources") {
     dependsOn(packageNativeKernels)
+    if (crossArm64Enabled) dependsOn(packageNativeKernelsArm64)
 }
 
 // Forward `-Dskainet.runBench=true` from Gradle CLI to the forked test
diff --git a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
index ade06d41..f26c125c 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
+++ b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
@@ -12,6 +12,7 @@ endif()
 add_library(skainet_kernels SHARED
     src/skainet_smoke.c
     src/q4k_matmul.c
+    src/q5k_matmul.c
     src/fp32_matmul.c
     src/bf16_matmul.c
     src/q8_0_matmul.c
@@ -41,6 +42,17 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
         -ffast-math
         -funroll-loops
     )
+    # AArch64: enable the NEON paths guarded by __ARM_NEON in skainet_simd.h.
+    # The SL2610 is Cortex-A55-class (ARMv8.2-A) — it HAS NEON + fp16 + dotprod
+    # (asimddp) but NOT i8mm (that is ARMv8.6). Do NOT add +i8mm: it would
+    # SIGILL on the board. CONFIRM on-device first: `grep Features /proc/cpuinfo`
+    # must list `asimddp` and `fphp`/`asimdhp`. Explicit -march (not -mcpu=native)
+    # because this is a cross-compile from an x86 host.
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+        target_compile_options(skainet_kernels PRIVATE
+            -march=armv8.2-a+fp16+dotprod
+        )
+    endif()
     set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden)
 elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC")
     target_compile_options(skainet_kernels PRIVATE
diff --git a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h
index a0fa3ff7..167fc80b 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h
+++ b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h
@@ -60,6 +60,31 @@ SKAINET_API void skainet_q4k_matmul(
     int32_t output_offset
 );
 
+/*
+ * Q5_K matrix-vector multiply.
+ *
+ *   output[output_offset + o] = sum_j input[input_offset + j] *
+ *                                dequant(weight[block, o, j])
+ *
+ * Block layout: canonical ggml Q5_K, 256 elements per super-block, 176
+ * bytes per block (2 B d + 2 B dMin + 12 B packed scales + 32 B `qh`
+ * high-bit plane + 128 B `qs` low nibbles). Each 5-bit code is
+ * `lowNibble | (fifthBit << 4)`. Packed weights laid out as
+ *   weight + weight_byte_offset + (block_idx * output_dim + o) * 176
+ *
+ * input_dim must be a multiple of 256.
+ */
+SKAINET_API void skainet_q5k_matmul(
+    const float* input,
+    int32_t input_offset,
+    const uint8_t* weight,
+    int32_t weight_byte_offset,
+    int32_t input_dim,
+    int32_t output_dim,
+    float* output,
+    int32_t output_offset
+);
+
 /*
  * Row-major FP32 SGEMM:  C(m, n) = A(m, k) * B(k, n).
  *
diff --git a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h
new file mode 100644
index 00000000..dd2ec25b
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h
@@ -0,0 +1,51 @@
+#ifndef SKAINET_SIMD_H
+#define SKAINET_SIMD_H
+
+/*
+ * Compile-time SIMD capability detection for the native CPU kernels.
+ *
+ * The kernels keep their portable scalar bodies as the `#else` fallback,
+ * so x86_64 (which auto-vectorizes well under -O3 -ffast-math) and any
+ * pre-ARMv8.2 target keep compiling unchanged. The NEON paths are only
+ * taken when the compiler advertises `__ARM_NEON` (AArch64 always does
+ * with the right -march). `__ARM_FEATURE_DOTPROD` / `__ARM_FEATURE_MATMUL_INT8`
+ * are gated on the build flags (`-march=armv8.2-a+dotprod`, etc.).
+ *
+ * BOARD-VERIFY-PENDING: the NEON paths in this tree compile to the scalar
+ * fallback on the x86 build host and have NOT been executed on aarch64.
+ * They must be built with the cross toolchain and bit-exact-checked under
+ * QEMU or on the SL2610 before being relied on.
+ */
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#  include <arm_neon.h>
+#  define SKAINET_HAVE_NEON 1
+#endif
+
+#if defined(__ARM_FEATURE_DOTPROD)
+#  define SKAINET_HAVE_DOTPROD 1
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+#  define SKAINET_HAVE_I8MM 1
+#endif
+
+#ifdef SKAINET_HAVE_NEON
+/* Horizontal sum of a float32x4 lane vector. AArch64 has vaddvq_f32
+ * natively; this wrapper keeps call sites readable. */
+static inline float skainet_neon_hadd_f32(float32x4_t v) {
+    return vaddvq_f32(v);
+}
+
+/* Widen 16 unsigned bytes to four float32x4 lanes (out[0]=lanes 0..3, …). */
+static inline void skainet_neon_u8x16_to_f32x4x4(uint8x16_t v, float32x4_t out[4]) {
+    const uint16x8_t lo16 = vmovl_u8(vget_low_u8(v));
+    const uint16x8_t hi16 = vmovl_u8(vget_high_u8(v));
+    out[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo16)));
+    out[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo16)));
+    out[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi16)));
+    out[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi16)));
+}
+#endif
+
+#endif /* SKAINET_SIMD_H */
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c
index ce88afd8..dbd6293b 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c
+++ b/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c
@@ -1,4 +1,5 @@
 #include "skainet_kernels.h"
+#include "skainet_simd.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -53,9 +54,22 @@ SKAINET_API void skainet_fp32_matmul(
         for (int32_t p = 0; p < k; ++p) {
             const float a_ip = a_row[p];
             const float* SKAINET_RESTRICT b_row = b + b_offset + (size_t) p * b_stride;
+#ifdef SKAINET_HAVE_NEON
+            const float32x4_t va = vdupq_n_f32(a_ip);
+            int32_t j = 0;
+            for (; j + 4 <= n; j += 4) {
+                float32x4_t cv = vld1q_f32(c_row + j);
+                cv = vfmaq_f32(cv, va, vld1q_f32(b_row + j));
+                vst1q_f32(c_row + j, cv);
+            }
+            for (; j < n; ++j) {
+                c_row[j] += a_ip * b_row[j];
+            }
+#else
             for (int32_t j = 0; j < n; ++j) {
                 c_row[j] += a_ip * b_row[j];
             }
+#endif
         }
     }
 }
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
index 7c742793..8091e58a 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
+++ b/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
@@ -1,4 +1,5 @@
 #include "skainet_kernels.h"
+#include "skainet_simd.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -124,6 +125,30 @@ SKAINET_API void skainet_q4k_matmul(
                 float code_sum_lo = 0.0f, input_sum_lo = 0.0f;
                 float code_sum_hi = 0.0f, input_sum_hi = 0.0f;
 
+#ifdef SKAINET_HAVE_NEON
+                float32x4_t cacc_lo = vdupq_n_f32(0.0f), iacc_lo = vdupq_n_f32(0.0f);
+                float32x4_t cacc_hi = vdupq_n_f32(0.0f), iacc_hi = vdupq_n_f32(0.0f);
+                for (int off = 0; off < Q4K_SUB_BLOCK_SIZE; off += 16) {
+                    const uint8x16_t packed = vld1q_u8(qs_group + off);
+                    const uint8x16_t lo_nib = vandq_u8(packed, vdupq_n_u8(0x0F));
+                    const uint8x16_t hi_nib = vshrq_n_u8(packed, 4);
+                    float32x4_t cl[4], ch[4];
+                    skainet_neon_u8x16_to_f32x4x4(lo_nib, cl);
+                    skainet_neon_u8x16_to_f32x4x4(hi_nib, ch);
+                    for (int q = 0; q < 4; ++q) {
+                        const float32x4_t v_lo = vld1q_f32(in_lo + off + q * 4);
+                        const float32x4_t v_hi = vld1q_f32(in_hi + off + q * 4);
+                        cacc_lo = vfmaq_f32(cacc_lo, v_lo, cl[q]);
+                        iacc_lo = vaddq_f32(iacc_lo, v_lo);
+                        cacc_hi = vfmaq_f32(cacc_hi, v_hi, ch[q]);
+                        iacc_hi = vaddq_f32(iacc_hi, v_hi);
+                    }
+                }
+                code_sum_lo = skainet_neon_hadd_f32(cacc_lo);
+                input_sum_lo = skainet_neon_hadd_f32(iacc_lo);
+                code_sum_hi = skainet_neon_hadd_f32(cacc_hi);
+                input_sum_hi = skainet_neon_hadd_f32(iacc_hi);
+#else
                 /* 32 iterations — auto-vectorizes cleanly under -O3. */
                 for (int i = 0; i < Q4K_SUB_BLOCK_SIZE; ++i) {
                     const uint8_t b = qs_group[i];
@@ -136,6 +161,7 @@ SKAINET_API void skainet_q4k_matmul(
                     code_sum_hi  += v_hi * code_hi;
                     input_sum_hi += v_hi;
                 }
+#endif
 
                 const float scale_lo  = d     * (float) scale_idx[sb_lo];
                 const float offset_lo = d_min * (float) min_idx[sb_lo];
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c
new file mode 100644
index 00000000..de395114
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c
@@ -0,0 +1,202 @@
+#include "skainet_kernels.h"
+#include "skainet_simd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define Q5K_BLOCK_SIZE       256
+#define Q5K_SUB_BLOCK_SIZE    32
+#define Q5K_SUB_BLOCKS         8
+#define Q5K_BYTES_PER_BLOCK  176
+#define Q5K_QH_OFFSET         16
+#define Q5K_QS_OFFSET         48
+
+/*
+ * IEEE 754 binary16 (LE byte order) -> binary32 conversion.
+ * Mirrors PanamaVectorQ5_KMatmulKernel.halfToFloat / the Q4_K kernel
+ * byte-for-byte (kept scalar to preserve bit-exact FP16 parity).
+ */
+static inline float skainet_q5k_half_to_float(uint16_t hbits) {
+    const uint32_t sign = (hbits >> 15) & 0x1u;
+    const uint32_t exp  = (hbits >> 10) & 0x1Fu;
+    const uint32_t frac =  hbits        & 0x3FFu;
+
+    if (exp == 0u) {
+        if (frac == 0u) {
+            union { uint32_t u; float f; } v = { sign << 31 };
+            return v.f;
+        }
+        float f = ((float) frac) / 1024.0f * (1.0f / 16384.0f);
+        return sign ? -f : f;
+    }
+    if (exp == 0x1Fu) {
+        union { uint32_t u; float f; } v;
+        v.u = (sign << 31) | 0x7F800000u | (frac ? 0x00400000u : 0u);
+        return v.f;
+    }
+    union { uint32_t u; float f; } v;
+    v.u = (sign << 31) | ((exp - 15u + 127u) << 23) | (frac << 13);
+    return v.f;
+}
+
+/*
+ * ggml's get_scale_min_k4 unmix for the 12-byte packed sub-scale region
+ * (bytes 4..15). Identical to Q4_K.
+ */
+static inline void skainet_q5k_decode_scales(
+    const uint8_t* scales,
+    int* scale_idx,
+    int* min_idx
+) {
+    for (int sb = 0; sb < 4; ++sb) {
+        scale_idx[sb] = scales[sb]     & 0x3F;
+        min_idx[sb]   = scales[sb + 4] & 0x3F;
+    }
+    for (int sb = 4; sb < 8; ++sb) {
+        const int low4_s  = scales[sb + 4] & 0x0F;
+        const int high2_s = (scales[sb - 4] >> 6) & 0x03;
+        scale_idx[sb] = low4_s | (high2_s << 4);
+
+        const int low4_m  = (scales[sb + 4] >> 4) & 0x0F;
+        const int high2_m = (scales[sb] >> 6) & 0x03;
+        min_idx[sb] = low4_m | (high2_m << 4);
+    }
+}
+
+/*
+ * Native Q5_K matrix-vector multiply matching the
+ * sk.ainet.backend.api.kernel.Q5KMatmulKernel SPI contract. Single
+ * input row times an `outputDim x inputDim` Q5_K-packed weight tensor
+ * laid out (blockIdx * outputDim + o) * 176 bytes.
+ *
+ * Q5_K extends Q4_K with a 32-byte `qh` high-bit plane: the 5-bit code
+ * is `lowNibble | (fifthBit << 4)`, where the low nibble lives in `qs`
+ * (same strided layout as Q4_K) and the 5th bit is bit (2*group) of
+ * qh[l] for the low sub-block, (2*group + 1) for the high sub-block.
+ *
+ * Lazy-dmin pattern: per sub-block accumulate
+ *   codeSum[s] = sum_i input[i] * code[i]
+ *   inputSum[s] = sum_i input[i]
+ * and combine once via
+ *   acc += d * scaleIdx[s] * codeSum[s] - dMin * minIdx[s] * inputSum[s]
+ *
+ * Scalar single-threaded; the tight inner loop is straight-line FP
+ * arithmetic so -O3 auto-vectorizes on AVX2/NEON. A hand-written NEON
+ * path is layered on behind __ARM_NEON in a later PR.
+ */
+SKAINET_API void skainet_q5k_matmul(
+    const float* SKAINET_RESTRICT input,
+    int32_t input_offset,
+    const uint8_t* SKAINET_RESTRICT weight,
+    int32_t weight_byte_offset,
+    int32_t input_dim,
+    int32_t output_dim,
+    float* SKAINET_RESTRICT output,
+    int32_t output_offset
+) {
+    if (output_dim <= 0 || input_dim <= 0) return;
+
+    const int32_t blocks_per_input_dim = input_dim / Q5K_BLOCK_SIZE;
+    const float* in_base = input + input_offset;
+    float* out_base = output + output_offset;
+
+    int scale_idx[Q5K_SUB_BLOCKS];
+    int min_idx[Q5K_SUB_BLOCKS];
+
+    for (int32_t o = 0; o < output_dim; ++o) {
+        float acc = 0.0f;
+
+        for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
+            const uint8_t* block = weight + weight_byte_offset
+                + (size_t)(block_idx * output_dim + o) * Q5K_BYTES_PER_BLOCK;
+
+            /* d, dMin (FP16 LE -> FP32). */
+            const uint16_t d_bits     = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
+            const uint16_t d_min_bits = (uint16_t) block[2] | ((uint16_t) block[3] << 8);
+            const float d     = skainet_q5k_half_to_float(d_bits);
+            const float d_min = skainet_q5k_half_to_float(d_min_bits);
+
+            /* 12 bytes of packed (scaleIdx, minIdx) -> 8 ints each. */
+            skainet_q5k_decode_scales(block + 4, scale_idx, min_idx);
+
+            const uint8_t* qh = block + Q5K_QH_OFFSET;
+            const uint8_t* qs = block + Q5K_QS_OFFSET;
+            const float* in_block = in_base + (size_t) block_idx * Q5K_BLOCK_SIZE;
+
+            /* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */
+            for (int group_j = 0; group_j < 4; ++group_j) {
+                const uint8_t* qs_group = qs + group_j * Q5K_SUB_BLOCK_SIZE;
+                const int sb_lo = 2 * group_j;
+                const int sb_hi = sb_lo + 1;
+                const int bit_lo = 2 * group_j;
+                const int bit_hi = 2 * group_j + 1;
+                const float* in_lo = in_block + sb_lo * Q5K_SUB_BLOCK_SIZE;
+                const float* in_hi = in_block + sb_hi * Q5K_SUB_BLOCK_SIZE;
+
+                float code_sum_lo = 0.0f, input_sum_lo = 0.0f;
+                float code_sum_hi = 0.0f, input_sum_hi = 0.0f;
+
+#ifdef SKAINET_HAVE_NEON
+                /* Variable right-shift via vshlq_u8 with a negative count
+                 * (bit_lo/bit_hi are runtime values, so vshrq_n_u8's
+                 * immediate form can't be used). */
+                const int8x16_t shr_lo = vdupq_n_s8(-(int8_t) bit_lo);
+                const int8x16_t shr_hi = vdupq_n_s8(-(int8_t) bit_hi);
+                float32x4_t cacc_lo = vdupq_n_f32(0.0f), iacc_lo = vdupq_n_f32(0.0f);
+                float32x4_t cacc_hi = vdupq_n_f32(0.0f), iacc_hi = vdupq_n_f32(0.0f);
+                for (int off = 0; off < Q5K_SUB_BLOCK_SIZE; off += 16) {
+                    const uint8x16_t packed = vld1q_u8(qs_group + off);
+                    const uint8x16_t qhv = vld1q_u8(qh + off);
+                    const uint8x16_t lo_nib = vandq_u8(packed, vdupq_n_u8(0x0F));
+                    const uint8x16_t hi_nib = vshrq_n_u8(packed, 4);
+                    /* 5th bit -> bit 4 of the code byte. */
+                    const uint8x16_t fifth_lo =
+                        vshlq_n_u8(vandq_u8(vshlq_u8(qhv, shr_lo), vdupq_n_u8(0x01)), 4);
+                    const uint8x16_t fifth_hi =
+                        vshlq_n_u8(vandq_u8(vshlq_u8(qhv, shr_hi), vdupq_n_u8(0x01)), 4);
+                    const uint8x16_t code_lo = vorrq_u8(lo_nib, fifth_lo);
+                    const uint8x16_t code_hi = vorrq_u8(hi_nib, fifth_hi);
+                    float32x4_t cl[4], ch[4];
+                    skainet_neon_u8x16_to_f32x4x4(code_lo, cl);
+                    skainet_neon_u8x16_to_f32x4x4(code_hi, ch);
+                    for (int q = 0; q < 4; ++q) {
+                        const float32x4_t v_lo = vld1q_f32(in_lo + off + q * 4);
+                        const float32x4_t v_hi = vld1q_f32(in_hi + off + q * 4);
+                        cacc_lo = vfmaq_f32(cacc_lo, v_lo, cl[q]);
+                        iacc_lo = vaddq_f32(iacc_lo, v_lo);
+                        cacc_hi = vfmaq_f32(cacc_hi, v_hi, ch[q]);
+                        iacc_hi = vaddq_f32(iacc_hi, v_hi);
+                    }
+                }
+                code_sum_lo = skainet_neon_hadd_f32(cacc_lo);
+                input_sum_lo = skainet_neon_hadd_f32(iacc_lo);
+                code_sum_hi = skainet_neon_hadd_f32(cacc_hi);
+                input_sum_hi = skainet_neon_hadd_f32(iacc_hi);
+#else
+                /* 32 iterations — auto-vectorizes cleanly under -O3. */
+                for (int i = 0; i < Q5K_SUB_BLOCK_SIZE; ++i) {
+                    const uint8_t b = qs_group[i];
+                    const uint8_t h = qh[i];
+                    const float code_lo = (float)((b & 0x0F) | (((h >> bit_lo) & 0x01) << 4));
+                    const float code_hi = (float)((b >> 4)   | (((h >> bit_hi) & 0x01) << 4));
+                    const float v_lo = in_lo[i];
+                    const float v_hi = in_hi[i];
+                    code_sum_lo  += v_lo * code_lo;
+                    input_sum_lo += v_lo;
+                    code_sum_hi  += v_hi * code_hi;
+                    input_sum_hi += v_hi;
+                }
+#endif
+
+                const float scale_lo  = d     * (float) scale_idx[sb_lo];
+                const float offset_lo = d_min * (float) min_idx[sb_lo];
+                const float scale_hi  = d     * (float) scale_idx[sb_hi];
+                const float offset_hi = d_min * (float) min_idx[sb_hi];
+                acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo;
+                acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi;
+            }
+        }
+
+        out_base[o] = acc;
+    }
+}
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c
index 844a2159..cee95a21 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c
+++ b/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c
@@ -1,4 +1,5 @@
 #include "skainet_kernels.h"
+#include "skainet_simd.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -83,9 +84,29 @@ SKAINET_API void skainet_q8_0_matmul(
             const float* SKAINET_RESTRICT input_block =
                 input + input_offset + (size_t) block_idx * BLOCK_SIZE;
             float block_sum = 0.0f;
+#ifdef SKAINET_HAVE_NEON
+            /* Activations are FP32, so widen int8 codes to float and FMA
+             * (int8 dotprod would need int8 activations — see plan note). */
+            float32x4_t accv = vdupq_n_f32(0.0f);
+            for (int32_t k = 0; k < BLOCK_SIZE; k += 16) {
+                const int8x16_t c8 = vld1q_s8(codes + k);
+                const int16x8_t lo16 = vmovl_s8(vget_low_s8(c8));
+                const int16x8_t hi16 = vmovl_s8(vget_high_s8(c8));
+                const float32x4_t cf0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(lo16)));
+                const float32x4_t cf1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(lo16)));
+                const float32x4_t cf2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(hi16)));
+                const float32x4_t cf3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(hi16)));
+                accv = vfmaq_f32(accv, vld1q_f32(input_block + k),      cf0);
+                accv = vfmaq_f32(accv, vld1q_f32(input_block + k + 4),  cf1);
+                accv = vfmaq_f32(accv, vld1q_f32(input_block + k + 8),  cf2);
+                accv = vfmaq_f32(accv, vld1q_f32(input_block + k + 12), cf3);
+            }
+            block_sum = skainet_neon_hadd_f32(accv);
+#else
             for (int32_t k = 0; k < BLOCK_SIZE; ++k) {
                 block_sum += input_block[k] * (float) codes[k];
             }
+#endif
             acc += block_sum * d;
         }
         output[output_offset + o] = acc;
diff --git a/skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake b/skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake
new file mode 100644
index 00000000..65e7e246
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake
@@ -0,0 +1,28 @@
+# CMake toolchain for cross-compiling the native kernels to aarch64 Linux
+# from an x86_64 host (e.g. the SL2610 board build / CI). Requires the
+# `gcc-aarch64-linux-gnu` package (or an equivalent clang cross setup).
+#
+# Usage:
+#   cmake -S native -B build/native/cmake-build-arm64 \
+#         -DCMAKE_TOOLCHAIN_FILE=native/toolchain-aarch64.cmake \
+#         -DCMAKE_BUILD_TYPE=Release
+#
+# CMAKE_SYSTEM_PROCESSOR=aarch64 makes CMakeLists.txt take the
+# `-march=armv8.2-a+fp16+dotprod` branch that enables the __ARM_NEON paths.
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+# Allow overriding the cross compiler (e.g. a clang cross or a different
+# triple) via -DSKAINET_AARCH64_CC=... ; default to the Debian/Ubuntu GNU
+# cross toolchain.
+if(NOT DEFINED SKAINET_AARCH64_CC)
+    set(SKAINET_AARCH64_CC aarch64-linux-gnu-gcc)
+endif()
+set(CMAKE_C_COMPILER ${SKAINET_AARCH64_CC})
+
+# Search for libraries/headers only in the target sysroot, but find
+# programs (the compiler) on the host.
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt
index 60dd45e2..ba0011b2 100644
--- a/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt
+++ b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt
@@ -7,6 +7,7 @@ import sk.ainet.backend.api.kernel.MemSegKernelProvider
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
 import sk.ainet.backend.api.kernel.Q4KMemSegMatmulKernel
 import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
 import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
 
 /**
@@ -97,4 +98,7 @@ public object NativeKernelProvider : KernelProvider, MemSegKernelProvider {
 
     override fun matmulQ4_0(): Q4_0MatmulKernel? =
         if (NativeQ4_0MatmulKernel.isAvailable()) NativeQ4_0MatmulKernel else null
+
+    override fun matmulQ5K(): Q5KMatmulKernel? =
+        if (NativeQ5KMatmulKernel.isAvailable()) NativeQ5KMatmulKernel else null
 }
diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt
new file mode 100644
index 00000000..6df68d73
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt
@@ -0,0 +1,91 @@
+package sk.ainet.exec.kernel
+
+import java.lang.foreign.Arena
+import java.lang.foreign.FunctionDescriptor
+import java.lang.foreign.Linker
+import java.lang.foreign.MemorySegment
+import java.lang.foreign.ValueLayout
+import java.lang.invoke.MethodHandle
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
+
+/**
+ * Native (FFM) implementation of [Q5KMatmulKernel].
+ *
+ * Wraps the bundled C symbol
+ *
+ *   void skainet_q5k_matmul(
+ *       const float* input, int32_t input_offset,
+ *       const uint8_t* weight, int32_t weight_byte_offset,
+ *       int32_t input_dim, int32_t output_dim,
+ *       float* output, int32_t output_offset);
+ *
+ * Same lazy-`dmin` accumulation as [PanamaVectorQ5_KMatmulKernel] over
+ * the canonical 256-element / 176-byte Q5_K super-block (the 5th bit of
+ * each code comes from the `qh` plane). Numerical parity vs the Panama
+ * kernel is asserted by [NativeQ5KMatmulKernelParityTest].
+ *
+ * Single-threaded scalar C (`-O3 -ffast-math`, auto-vectorized inner
+ * loop); a hand-written NEON path is layered on behind `__ARM_NEON`.
+ */
+internal object NativeQ5KMatmulKernel : Q5KMatmulKernel {
+
+    private const val BLOCK_SIZE = 256
+    private const val BYTES_PER_BLOCK = 176
+
+    fun isAvailable(): Boolean = handle != null
+
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "NativeQ5KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0 || inputDim == 0) return
+        val mh = handle
+            ?: error("NativeQ5KMatmulKernel.matmul invoked while native library unavailable")
+
+        Arena.ofConfined().use { arena ->
+            val inSeg = arena.allocate(
+                inputDim.toLong() * java.lang.Float.BYTES,
+                ValueLayout.JAVA_FLOAT.byteAlignment(),
+            )
+            val outSeg = arena.allocate(
+                outputDim.toLong() * java.lang.Float.BYTES,
+                ValueLayout.JAVA_FLOAT.byteAlignment(),
+            )
+            val weightBytesUsed = ((inputDim / BLOCK_SIZE).toLong() * outputDim) * BYTES_PER_BLOCK.toLong()
+            val weightSeg = arena.allocate(weightBytesUsed, 1L)
+
+            MemorySegment.copy(input, inputOffset, inSeg, ValueLayout.JAVA_FLOAT, 0L, inputDim)
+            MemorySegment.copy(weight, weightByteOffset, weightSeg, ValueLayout.JAVA_BYTE, 0L, weightBytesUsed.toInt())
+
+            mh.invoke(
+                inSeg, 0,
+                weightSeg, 0,
+                inputDim, outputDim,
+                outSeg, 0,
+            )
+
+            MemorySegment.copy(outSeg, ValueLayout.JAVA_FLOAT, 0L, output, outputOffset, outputDim)
+        }
+    }
+
+    private val handle: MethodHandle? by lazy {
+        val lookup = NativeLibraryLoader.lookup() ?: return@lazy null
+        val symbol = lookup.find("skainet_q5k_matmul").orElse(null) ?: return@lazy null
+        val descriptor = FunctionDescriptor.ofVoid(
+            ValueLayout.ADDRESS,    // input
+            ValueLayout.JAVA_INT,   // input_offset
+            ValueLayout.ADDRESS,    // weight
+            ValueLayout.JAVA_INT,   // weight_byte_offset
+            ValueLayout.JAVA_INT,   // input_dim
+            ValueLayout.JAVA_INT,   // output_dim
+            ValueLayout.ADDRESS,    // output
+            ValueLayout.JAVA_INT,   // output_offset
+        )
+        runCatching { Linker.nativeLinker().downcallHandle(symbol, descriptor) }.getOrNull()
+    }
+}
diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt
index 1565c4c4..d93a0149 100644
--- a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt
+++ b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt
@@ -21,7 +21,7 @@ import sk.ainet.backend.api.kernel.KernelProvider
  */
 class KernelSupportMatrixTest {
 
-    private val formats = listOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0")
+    private val formats = listOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_K", "Q5_1", "Q5_0")
 
     // platform key (display) -> the set of providers (by source-set) reaching it.
     private val platforms = listOf("JVM", "Android", "Native·linux", "Native·apple", "JS/WASM")
@@ -36,9 +36,9 @@ class KernelSupportMatrixTest {
     private fun tiers(): List<Tier> = listOf(
         Tier("scalar", 0, platforms.toSet(), scalarFormats()),
         Tier("panama-vector", 50, setOf("JVM", "Android"),
-            setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0")),
+            setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_K", "Q5_1", "Q5_0")),
         Tier("native-ffm", 100, setOf("JVM"),
-            setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K")),
+            setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q5_K")),
     )
 
     private fun best(fmt: String, platform: String, tiers: List<Tier>): String? =
@@ -71,7 +71,7 @@ class KernelSupportMatrixTest {
         // Drift gate on the scalar floor (the all-platform baseline): the documented set
         // below must equal what the scalar provider actually carries. Update both together.
         assertEquals(
-            setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0"),
+            setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_K", "Q5_1", "Q5_0"),
             scalarFormats(),
             "ScalarKernelProvider coverage changed — update the declared sets + run ./gradlew generateKernelMatrix",
         )
diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt
new file mode 100644
index 00000000..095a1723
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt
@@ -0,0 +1,94 @@
+package sk.ainet.exec.kernel
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.BeforeTest
+import kotlin.test.Test
+import kotlin.test.assertTrue
+
+/**
+ * Numerical parity tests for [NativeQ5KMatmulKernel] against
+ * [PanamaVectorQ5_KMatmulKernel]. Both kernels share the canonical Q5_K
+ * layout (176-byte block, 32-byte `qh` 5th-bit plane) and the lazy-`dmin`
+ * accumulation, so outputs must agree element-wise within FMA +
+ * reordered-reduction tolerance.
+ *
+ * Fixture mirrors [NativeQ4KMatmulKernelParityTest]: random Q5_K bytes with
+ * `d`/`dMin` clamped to `1.0f16`, packed input-block-major
+ * `(blockIdx * outputDim + o) * 176`. Random `qh` bytes exercise the 5th bit.
+ */
+class NativeQ5KMatmulKernelParityTest {
+
+    private val blockSize = 256
+    private val bytesPerBlock = 176
+
+    @BeforeTest
+    fun checkNativeAvailable() {
+        assertTrue(
+            NativeQ5KMatmulKernel.isAvailable(),
+            "NativeQ5KMatmulKernel reports unavailable on this host — bundled libskainet_kernels " +
+                "missing or skainet_q5k_matmul symbol unresolved",
+        )
+    }
+
+    private fun randomQ5KBytes(numBlocks: Int, seed: Int): ByteArray {
+        val rng = Random(seed)
+        val bytes = ByteArray(numBlocks * bytesPerBlock)
+        rng.nextBytes(bytes)
+        for (block in 0 until numBlocks) {
+            val base = block * bytesPerBlock
+            // 0x3C00 == 1.0f16. Force d = dMin = 1.0f16 so dequant magnitudes stay finite.
+            bytes[base + 0] = 0x00.toByte()
+            bytes[base + 1] = 0x3C.toByte()
+            bytes[base + 2] = 0x00.toByte()
+            bytes[base + 3] = 0x3C.toByte()
+        }
+        return bytes
+    }
+
+    private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) {
+        val numBlocks = (inputDim / blockSize) * outputDim
+        val packed = randomQ5KBytes(numBlocks, seed)
+        val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f }
+
+        val refOut = FloatArray(outputDim)
+        PanamaVectorQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0)
+
+        val nativeOut = FloatArray(outputDim)
+        NativeQ5KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, nativeOut, 0)
+
+        for (o in 0 until outputDim) {
+            val diff = abs(refOut[o] - nativeOut[o])
+            val rel = diff / (abs(refOut[o]) + 1e-9f)
+            assertTrue(
+                diff <= tol || rel < 1e-4f,
+                "row $o diverged: panama=${refOut[o]} native=${nativeOut[o]} diff=$diff rel=$rel tol=$tol",
+            )
+        }
+    }
+
+    @Test
+    fun single_block_single_row() = assertParity(256, 1, 42, 1e-2f)
+
+    @Test
+    fun single_block_multi_row() = assertParity(256, 16, 7, 1e-2f)
+
+    @Test
+    fun multi_block_multi_row() = assertParity(1024, 64, 123, 5e-2f)
+
+    @Test
+    fun llm_typical_shape_4096_outputDim_64() = assertParity(4096, 64, 999, 5e-1f)
+
+    @Test
+    fun rejects_inputDim_not_multiple_of_block() {
+        val packed = randomQ5KBytes(numBlocks = 2, seed = 1)
+        val input = FloatArray(255)
+        val out = FloatArray(1)
+        try {
+            NativeQ5KMatmulKernel.matmul(input, 0, packed, 0, 255, 1, out, 0)
+            kotlin.test.fail("expected IllegalArgumentException for non-multiple inputDim")
+        } catch (e: IllegalArgumentException) {
+            // expected
+        }
+    }
+}
diff --git a/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt b/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt
index 324c5da7..04d32f42 100644
--- a/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt
+++ b/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt
@@ -6,6 +6,8 @@ import sk.ainet.io.RandomAccessSource
 import sk.ainet.lang.tensor.Shape
 import sk.ainet.lang.tensor.Tensor
 import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q8_0BlockTensorData
 import sk.ainet.lang.types.BF16
 import sk.ainet.lang.types.DType
@@ -88,6 +90,18 @@ public class StreamingGgufParametersLoader(
                         ctx.fromData<T, V>(packed as sk.ainet.lang.tensor.data.TensorData<T, V>, dtype)
                     }
 
+                    GGMLQuantizationType.Q5_K -> {
+                        @Suppress("UNCHECKED_CAST")
+                        val packed = Q5_KBlockTensorData.fromRawBytes(shape, rawBytes)
+                        ctx.fromData<T, V>(packed as sk.ainet.lang.tensor.data.TensorData<T, V>, dtype)
+                    }
+
+                    GGMLQuantizationType.Q6_K -> {
+                        @Suppress("UNCHECKED_CAST")
+                        val packed = Q6_KBlockTensorData.fromRawBytes(shape, rawBytes)
+                        ctx.fromData<T, V>(packed as sk.ainet.lang.tensor.data.TensorData<T, V>, dtype)
+                    }
+
                     GGMLQuantizationType.Q8_0 -> {
                         @Suppress("UNCHECKED_CAST")
                         val packed = Q8_0BlockTensorData.fromRawBytes(shape, rawBytes)
diff --git a/skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt b/skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt
new file mode 100644
index 00000000..460454f7
--- /dev/null
+++ b/skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt
@@ -0,0 +1,115 @@
+package sk.ainet.io.gguf
+
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+import sk.ainet.io.gguf.dequant.DequantOps
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
+import sk.ainet.lang.tensor.data.toFloatArray
+
+/**
+ * Ties the packed [Q5_KBlockTensorData] (used by the eager CPU matmul path)
+ * to the proven golden [DequantOps.dequantQ5KFromBytes] across a *multi-block*
+ * buffer. Multiple blocks are the case that exposed the historical `qh[idx/8]`
+ * indexing bug — a single block can pass by accident, so this builds 3 blocks
+ * with distinct codes and asserts bit-exact agreement.
+ */
+class Q5KBlockTensorDataParityTest {
+
+    private fun floatToHalf(value: Float): Int {
+        val bits = value.toRawBits()
+        val sign = (bits shr 16) and 0x8000
+        val exponent = ((bits shr 23) and 0xFF) - 127
+        val mantissa = bits and 0x7FFFFF
+        return when {
+            exponent >= 16 -> sign or 0x7C00
+            exponent >= -14 -> sign or ((exponent + 15) shl 10) or (mantissa shr 13)
+            else -> sign
+        }
+    }
+
+    /** Build a single 176-byte canonical Q5_K block (see [Q5KCanonicalLayoutTest]). */
+    private fun buildBlock(
+        d: Float,
+        dMin: Float,
+        scaleIdx: IntArray,
+        minIdx: IntArray,
+        subBlockCodes: Array<IntArray>,
+    ): ByteArray {
+        val block = ByteArray(176)
+        val dBits = floatToHalf(d)
+        block[0] = (dBits and 0xFF).toByte()
+        block[1] = ((dBits shr 8) and 0xFF).toByte()
+        val dMinBits = floatToHalf(dMin)
+        block[2] = (dMinBits and 0xFF).toByte()
+        block[3] = ((dMinBits shr 8) and 0xFF).toByte()
+
+        val scaleBytes = IntArray(12)
+        for (j in 0 until 4) {
+            scaleBytes[j] = scaleBytes[j] or (scaleIdx[j] and 0x3F)
+            scaleBytes[j + 4] = scaleBytes[j + 4] or (minIdx[j] and 0x3F)
+        }
+        for (j in 4 until 8) {
+            val sLow4 = scaleIdx[j] and 0x0F
+            val sHi2 = (scaleIdx[j] shr 4) and 0x03
+            val mLow4 = minIdx[j] and 0x0F
+            val mHi2 = (minIdx[j] shr 4) and 0x03
+            scaleBytes[j + 4] = scaleBytes[j + 4] or sLow4 or (mLow4 shl 4)
+            scaleBytes[j - 4] = scaleBytes[j - 4] or (sHi2 shl 6)
+            scaleBytes[j] = scaleBytes[j] or (mHi2 shl 6)
+        }
+        for (i in 0 until 12) block[4 + i] = (scaleBytes[i] and 0xFF).toByte()
+
+        val qhBytes = IntArray(32)
+        for (j in 0 until 4) {
+            for (l in 0 until 32) {
+                val highLo = (subBlockCodes[2 * j][l] ushr 4) and 0x01
+                val highHi = (subBlockCodes[2 * j + 1][l] ushr 4) and 0x01
+                qhBytes[l] = qhBytes[l] or (highLo shl (2 * j))
+                qhBytes[l] = qhBytes[l] or (highHi shl (2 * j + 1))
+            }
+        }
+        for (i in 0 until 32) block[16 + i] = (qhBytes[i] and 0xFF).toByte()
+
+        for (j in 0 until 4) {
+            for (l in 0 until 32) {
+                val lo = subBlockCodes[2 * j][l] and 0x0F
+                val hi = subBlockCodes[2 * j + 1][l] and 0x0F
+                block[48 + j * 32 + l] = ((hi shl 4) or lo).toByte()
+            }
+        }
+        return block
+    }
+
+    @Test
+    fun `Q5_KBlockTensorData toFloatArray matches DequantOps golden across blocks`() {
+        val nBlocks = 3
+        val buf = ByteArray(nBlocks * 176)
+        for (b in 0 until nBlocks) {
+            val d = 0.125f + 0.01f * b
+            val dMin = 0.0625f + 0.005f * b
+            val scaleIdx = IntArray(8) { (63 - (it * 7 + b * 3)) and 0x3F }
+            val minIdx = IntArray(8) { (it * 8 + b * 5) and 0x3F }
+            val codes = Array(8) { s -> IntArray(32) { j -> ((s * 11 + j * 5 + b * 13) and 0x1F) } }
+            val block = buildBlock(d, dMin, scaleIdx, minIdx, codes)
+            block.copyInto(buf, b * 176)
+        }
+
+        val golden = DequantOps.dequantFromBytes(buf, GGMLQuantizationType.Q5_K, nBlocks * 256)
+        val packed = Q5_KBlockTensorData(Shape(nBlocks * 256), buf).toFloatArray()
+
+        assertEquals(golden.size, packed.size)
+        var maxAbs = 0f
+        var firstDiff = -1
+        for (i in golden.indices) {
+            val ad = kotlin.math.abs(golden[i] - packed[i])
+            if (ad > maxAbs) maxAbs = ad
+            if (firstDiff == -1 && ad > 0f) firstDiff = i
+        }
+        assertTrue(
+            maxAbs == 0f,
+            "Q5_KBlockTensorData disagrees with DequantOps golden: maxAbs=$maxAbs firstDiff=$firstDiff",
+        )
+    }
+}
diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt
new file mode 100644
index 00000000..9aba08bd
--- /dev/null
+++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt
@@ -0,0 +1,320 @@
+package sk.ainet.lang.tensor.data
+
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.storage.PackedBlockStorage
+import sk.ainet.lang.tensor.storage.TensorEncoding
+import sk.ainet.lang.types.DType
+
+/**
+ * Tensor data interface for Q5_K quantized format (canonical ggml layout).
+ *
+ * Q5_K block format (256 elements per block, 176 bytes per block):
+ * - 2 bytes: f16 d (super-block scale)
+ * - 2 bytes: f16 dMin (super-block min-scale)
+ * - 12 bytes: packed 6-bit scaleIdx + 6-bit minIdx for each of 8 sub-blocks,
+ *             encoded with ggml's `get_scale_min_k4` bit-mixing layout —
+ *             *identical* to Q4_K (see [Q4_KTensorData]).
+ * - 32 bytes: `qh` high-bit plane. One byte per intra-group element position
+ *             `l` (0..31); the 5th bit of a code comes from a single bit of
+ *             `qh[l]` selected by `(outer-iter, low/high nibble)`.
+ * - 128 bytes: `qs` 4-bit low nibbles, laid out *strided* in 4 groups of 32
+ *              bytes, exactly as Q4_K: byte (j*32 + i) carries element
+ *              (2j*32 + i) in its lo nibble and element ((2j+1)*32 + i) in its
+ *              hi nibble.
+ *
+ * The 5th bit (per ggml-quants.c `dequantize_row_q5_K`): for outer iteration
+ * `outer` (0..3), the low-nibble sub-block uses `qh[l]` bit `2*outer` and the
+ * high-nibble sub-block uses bit `2*outer + 1`. `qh` is indexed by the
+ * intra-group position `l` (0..31), NOT by output position.
+ *
+ * Each sub-block s (s=0..7):
+ * - 6-bit scaleIdx, 6-bit minIdx (from `get_scale_min_k4`)
+ * - scale  = d    * scaleIdx
+ * - offset = dMin * minIdx
+ *
+ * Dequantization: `output[i] = code[i] * scale - offset`, where `code` is the
+ * full 5-bit value `lowNibble | (fifthBit << 4)` (0..31).
+ *
+ * Validated bit-exact against `DequantOps.dequantQ5KFromBytes`, which carries
+ * the proof and the regression note about the earlier `qh[idx/8]` bug.
+ */
+public interface Q5_KTensorData : TensorData<DType, Byte> {
+    /** Number of Q5_K blocks in the tensor. */
+    public val blockCount: Int
+
+    /** Raw packed data containing all blocks. */
+    public val packedData: ByteArray
+
+    /** Get the main scale factor (d) for a block. */
+    public fun getBlockD(blockIdx: Int): Float
+
+    /** Get the minimum scale factor (dMin) for a block. */
+    public fun getBlockDMin(blockIdx: Int): Float
+
+    /**
+     * Get the scale for a specific sub-block within a block:
+     * `scale = d * scaleIdx` (no /63 normalisation — ggml's `d1 = d * sc`).
+     */
+    public fun getSubBlockScale(blockIdx: Int, subBlockIdx: Int): Float
+
+    /**
+     * Get the offset for a specific sub-block within a block:
+     * `offset = dMin * minIdx`. Subtract this from `code * scale` for the
+     * dequantised value.
+     */
+    public fun getSubBlockMin(blockIdx: Int, subBlockIdx: Int): Float
+
+    /** Get a 5-bit quantized code value (0..31) for `elementIdx` (0..255). */
+    public fun getCode(blockIdx: Int, elementIdx: Int): Int
+
+    public companion object {
+        /** Elements per Q5_K block. */
+        public const val BLOCK_SIZE: Int = 256
+
+        /** Elements per sub-block. */
+        public const val SUB_BLOCK_SIZE: Int = 32
+
+        /** Number of sub-blocks per block. */
+        public const val SUB_BLOCKS_PER_BLOCK: Int = 8
+
+        /** Bytes per Q5_K block (2 + 2 + 12 + 32 + 128 = 176). */
+        public const val BYTES_PER_BLOCK: Int = 176
+
+        /** Byte offset of the 32-byte `qh` high-bit plane within a block. */
+        public const val QH_OFFSET: Int = 16
+
+        /** Byte offset of the 128-byte `qs` low-nibble region within a block. */
+        public const val QS_OFFSET: Int = 48
+    }
+}
+
+/**
+ * Implementation of Q5_KTensorData backed by a packed byte array (canonical
+ * ggml layout — see [Q5_KTensorData] kdoc for the full byte map).
+ *
+ * @param initialShape the logical shape of the tensor (in elements, not blocks)
+ * @param data the raw packed block data
+ */
+public class Q5_KBlockTensorData(
+    initialShape: Shape,
+    private val data: ByteArray
+) : Q5_KTensorData, PackedBlockStorage {
+
+    override val shape: Shape = Shape(initialShape.dimensions.copyOf())
+    private val strides: IntArray = shape.computeStrides()
+    override val packedData: ByteArray get() = data
+
+    override val blockCount: Int = (shape.volume + Q5_KTensorData.BLOCK_SIZE - 1) / Q5_KTensorData.BLOCK_SIZE
+
+    // PackedBlockStorage implementation
+    override val encoding: TensorEncoding get() = TensorEncoding.Q5_K
+    override val blockSize: Int get() = Q5_KTensorData.BLOCK_SIZE
+
+    override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" }
+        for (subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) {
+            val scale = getSubBlockScale(blockIdx, subBlockIdx)
+            val offset = getSubBlockMin(blockIdx, subBlockIdx)
+            val elemsStart = subBlockIdx * Q5_KTensorData.SUB_BLOCK_SIZE
+            for (j in 0 until Q5_KTensorData.SUB_BLOCK_SIZE) {
+                val elementIdx = elemsStart + j
+                val outIdx = outputOffset + elementIdx
+                if (outIdx >= output.size) return
+                val globalIdx = blockIdx * Q5_KTensorData.BLOCK_SIZE + elementIdx
+                if (globalIdx >= shape.volume) return
+                val code = getCode(blockIdx, elementIdx)
+                output[outIdx] = code * scale - offset
+            }
+        }
+    }
+
+    init {
+        val requiredBytes = blockCount * Q5_KTensorData.BYTES_PER_BLOCK
+        require(data.size >= requiredBytes) {
+            "Data size ${data.size} is less than required $requiredBytes bytes for $blockCount blocks"
+        }
+    }
+
+    override fun getBlockD(blockIdx: Int): Float {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" }
+        val offset = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK
+        val b0 = data[offset].toInt() and 0xFF
+        val b1 = data[offset + 1].toInt() and 0xFF
+        val halfBits = (b1 shl 8) or b0
+        return halfToFloat(halfBits)
+    }
+
+    override fun getBlockDMin(blockIdx: Int): Float {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" }
+        val offset = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + 2
+        val b0 = data[offset].toInt() and 0xFF
+        val b1 = data[offset + 1].toInt() and 0xFF
+        val halfBits = (b1 shl 8) or b0
+        return halfToFloat(halfBits)
+    }
+
+    override fun getSubBlockScale(blockIdx: Int, subBlockIdx: Int): Float {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" }
+        require(subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) {
+            "Sub-block index $subBlockIdx out of bounds (0..7)"
+        }
+        return getBlockD(blockIdx) * getScaleIndex(blockIdx, subBlockIdx)
+    }
+
+    override fun getSubBlockMin(blockIdx: Int, subBlockIdx: Int): Float {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" }
+        require(subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) {
+            "Sub-block index $subBlockIdx out of bounds (0..7)"
+        }
+        return getBlockDMin(blockIdx) * getMinIndex(blockIdx, subBlockIdx)
+    }
+
+    /**
+     * Port of `get_scale_min_k4` from ggml-quants.c — identical to Q4_K. The
+     * 12 scale bytes don't pack 12 bits sequentially per sub-block; sub-blocks
+     * 4..7 reuse the top 2 bits of bytes for sub-blocks 0..3.
+     */
+    private fun getScaleIndex(blockIdx: Int, subBlockIdx: Int): Int {
+        val base = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + 4
+        val j = subBlockIdx
+        return if (j < 4) {
+            data[base + j].toInt() and 0x3F
+        } else {
+            val low4 = data[base + j + 4].toInt() and 0x0F
+            val high2 = (data[base + j - 4].toInt() and 0xFF) ushr 6
+            low4 or (high2 shl 4)
+        }
+    }
+
+    private fun getMinIndex(blockIdx: Int, subBlockIdx: Int): Int {
+        val base = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + 4
+        val j = subBlockIdx
+        return if (j < 4) {
+            data[base + j + 4].toInt() and 0x3F
+        } else {
+            val low4 = (data[base + j + 4].toInt() and 0xFF) ushr 4
+            val high2 = (data[base + j].toInt() and 0xFF) ushr 6
+            low4 or (high2 shl 4)
+        }
+    }
+
+    /**
+     * Look up the 5-bit code for `elementIdx` (0..255) within block `blockIdx`.
+     * The low 4 bits come from the strided `qs` nibble layout (identical to
+     * Q4_K); the 5th bit comes from `qh[l]` where `l` is the intra-group
+     * position and the bit index is `2*group` (lo nibble) or `2*group + 1`
+     * (hi nibble). Matches `DequantOps.dequantQ5KFromBytes`.
+     */
+    override fun getCode(blockIdx: Int, elementIdx: Int): Int {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" }
+        require(elementIdx in 0 until Q5_KTensorData.BLOCK_SIZE) {
+            "Element index $elementIdx out of bounds (0..255)"
+        }
+        val base = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK
+        val groupIdx = elementIdx / 64           // 0..3 — the `outer` iteration
+        val withinGroup = elementIdx % 64        // 0..63
+        val l = withinGroup % 32                 // 0..31 — intra-group position
+        val qsByte = data[base + Q5_KTensorData.QS_OFFSET + groupIdx * 32 + l].toInt() and 0xFF
+        val low = if (withinGroup < 32) qsByte and 0x0F else qsByte ushr 4
+        val qhByte = data[base + Q5_KTensorData.QH_OFFSET + l].toInt() and 0xFF
+        val bit = if (withinGroup < 32) 2 * groupIdx else 2 * groupIdx + 1
+        val fifth = (qhByte ushr bit) and 0x01
+        return low or (fifth shl 4)
+    }
+
+    override fun get(vararg indices: Int): Byte {
+        val flatIndex = calcFlatIndex(indices)
+        val blockIdx = flatIndex / Q5_KTensorData.BLOCK_SIZE
+        val elementIdx = flatIndex % Q5_KTensorData.BLOCK_SIZE
+        return getCode(blockIdx, elementIdx).toByte()
+    }
+
+    override fun set(vararg indices: Int, value: Byte) {
+        throw UnsupportedOperationException("Q5_K packed tensor data is read-only")
+    }
+
+    private fun calcFlatIndex(indices: IntArray): Int {
+        require(indices.size == shape.dimensions.size) {
+            "Number of indices (${indices.size}) must match tensor dimensions (${shape.dimensions.size})"
+        }
+        var flatIndex = 0
+        for (i in indices.indices) {
+            val idx = indices[i]
+            require(idx >= 0 && idx < shape.dimensions[i]) {
+                "Index $idx out of bounds for dimension $i with size ${shape.dimensions[i]}"
+            }
+            flatIndex += idx * strides[i]
+        }
+        return flatIndex
+    }
+
+    public companion object {
+        /**
+         * Create Q5_KTensorData from raw GGUF bytes.
+         */
+        public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_KBlockTensorData {
+            return Q5_KBlockTensorData(shape, bytes)
+        }
+
+        /**
+         * Convert f16 bits to float32.
+         */
+        internal fun halfToFloat(hbits: Int): Float {
+            val sign = (hbits and 0x8000) shl 16
+            val exp = (hbits and 0x7C00) shr 10
+            val mant = hbits and 0x03FF
+
+            return when (exp) {
+                0 -> {
+                    if (mant == 0) {
+                        Float.fromBits(sign)
+                    } else {
+                        var m = mant
+                        var e = -14
+                        while ((m and 0x400) == 0) {
+                            m = m shl 1
+                            e--
+                        }
+                        m = m and 0x3FF
+                        val floatExp = (e + 127) shl 23
+                        val floatMant = m shl 13
+                        Float.fromBits(sign or floatExp or floatMant)
+                    }
+                }
+                31 -> {
+                    val floatExp = 0xFF shl 23
+                    val floatMant = mant shl 13
+                    Float.fromBits(sign or floatExp or floatMant)
+                }
+                else -> {
+                    val floatExp = (exp - 15 + 127) shl 23
+                    val floatMant = mant shl 13
+                    Float.fromBits(sign or floatExp or floatMant)
+                }
+            }
+        }
+    }
+}
+
+/**
+ * Dequantize Q5_K tensor data to a FloatArray (canonical ggml formula:
+ * `output[i] = code[i] * scale - offset`).
+ */
+public fun Q5_KTensorData.toFloatArray(): FloatArray {
+    val result = FloatArray(shape.volume)
+    var outIdx = 0
+    for (blockIdx in 0 until blockCount) {
+        for (subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) {
+            val scale = getSubBlockScale(blockIdx, subBlockIdx)
+            val offset = getSubBlockMin(blockIdx, subBlockIdx)
+            val elemsStart = subBlockIdx * Q5_KTensorData.SUB_BLOCK_SIZE
+            for (j in 0 until Q5_KTensorData.SUB_BLOCK_SIZE) {
+                val elementIdx = elemsStart + j
+                if (outIdx >= shape.volume) break
+                val code = getCode(blockIdx, elementIdx)
+                result[outIdx++] = code * scale - offset
+            }
+        }
+    }
+    return result
+}
diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt
index 509b6704..1f2705ce 100644
--- a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt
+++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt
@@ -40,6 +40,18 @@ public sealed interface TensorEncoding {
         }
     }
 
+    /** GGML Q5_K block quantization: 256 elements per 176-byte block. */
+    public data object Q5_K : TensorEncoding {
+        public const val BLOCK_SIZE: Int = 256
+        public const val BYTES_PER_BLOCK: Int = 176
+
+        override val name: String get() = "Q5_K"
+        override fun physicalBytes(elementCount: Long): Long {
+            val blocks = (elementCount + BLOCK_SIZE - 1) / BLOCK_SIZE
+            return blocks * BYTES_PER_BLOCK
+        }
+    }
+
     /** GGML Q6_K block quantization: 256 elements per 210-byte block. */
     public data object Q6_K : TensorEncoding {
         public const val BLOCK_SIZE: Int = 256

From 58142009b9f05b381b8e78e226ee3b7c515748b8 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Thu, 11 Jun 2026 07:24:02 +0200
Subject: [PATCH 2/4] feat(backend-native-cpu): Kotlin/Native cinterop to the
 C/NEON kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hand-written matmul kernels were JVM-only (consumed via FFM), but the
SL2610 board binary is Kotlin/Native — it can't use the FFM wrapper. Add a
K/N consumption path via cinterop so the board gets the same C (and, on
aarch64, NEON) kernels.

- CMake builds a STATIC archive (skainet_kernels_static -> libskainet_kernels.a)
  alongside the SHARED lib; same sources + flags (incl. the aarch64 NEON march).
- cinterop .def (skainet_kernels.h -> sk.ainet.kernels.cinterop).
- linuxX64 target on the (previously jvm-only) module, linking the static
  archive into K/N binaries; link tasks depend on the CMake build.
- NativeKnQ5KMatmulKernel (linuxX64Main): calls skainet_q5k_matmul via cinterop
  with pinned arrays (zero-copy).

POC verified on the host (linuxX64): NativeKnQ5KMatmulKernelParityTest — the
cinterop kernel matches the commonMain ScalarQ5_KMatmulKernel across 4 shapes
(tests=4, failures=0). JVM/FFM path unchanged (jvmTest green). linuxArm64 board
target + NEON runtime check are the remaining step.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../build.gradle.kts                          | 38 ++++++++
 .../native/CMakeLists.txt                     | 87 ++++++++++---------
 .../exec/kernel/NativeKnQ5KMatmulKernel.kt    | 56 ++++++++++++
 .../NativeKnQ5KMatmulKernelParityTest.kt      | 70 +++++++++++++++
 .../cinterop/skainet_kernels.def              |  8 ++
 5 files changed, 220 insertions(+), 39 deletions(-)
 create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt
 create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt
 create mode 100644 skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def

diff --git a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
index 6ab0ddd9..e5aa22f7 100644
--- a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
+++ b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
@@ -3,10 +3,30 @@ plugins {
     alias(libs.plugins.vanniktech.mavenPublish)
 }
 
+// Paths shared by the K/N cinterop (kotlin block) and the CMake tasks below.
+val nativeIncludeDir: String = layout.projectDirectory.dir("native/include").asFile.absolutePath
+val staticArchivePath: String =
+    layout.buildDirectory.file("native/cmake-build/libskainet_kernels.a").get().asFile.absolutePath
+
 kotlin {
     explicitApi()
     jvm()
 
+    // Kotlin/Native: POC on the host (linuxX64); linuxArm64 is the board target.
+    // Exposes the hand-written C/NEON kernels to K/N via cinterop to the static
+    // archive libskainet_kernels.a (CMake `skainet_kernels_static`). This is the
+    // board-consumption path — the JVM consumes the same kernels via FFM instead.
+    linuxX64 {
+        compilations.getByName("main").cinterops.create("skainetKernels") {
+            defFile(project.file("src/nativeInterop/cinterop/skainet_kernels.def"))
+            includeDirs(nativeIncludeDir)
+        }
+        binaries.all {
+            // Link the static C archive into every linuxX64 binary (incl. tests).
+            linkerOpts(staticArchivePath)
+        }
+    }
+
     sourceSets {
         val jvmMain by getting {
             dependencies {
@@ -24,6 +44,18 @@ kotlin {
                 implementation(libs.kotlinx.coroutines)
             }
         }
+        val linuxX64Main by getting {
+            dependencies {
+                implementation(project(":skainet-backends:skainet-backend-api"))
+            }
+        }
+        val linuxX64Test by getting {
+            dependencies {
+                implementation(libs.kotlin.test)
+                // ScalarQ5_KMatmulKernel reference for the cinterop parity test.
+                implementation(project(":skainet-backends:skainet-backend-cpu"))
+            }
+        }
     }
 }
 
@@ -89,6 +121,12 @@ val buildNativeKernels by tasks.registering(Exec::class) {
     )
 }
 
+// The linuxX64 (K/N) binaries link libskainet_kernels.a (built by CMake into
+// cmakeBuildPath), so the static archive must exist before the K/N link step.
+tasks.matching { it.name.startsWith("link") && it.name.endsWith("LinuxX64") }.configureEach {
+    dependsOn(buildNativeKernels)
+}
+
 val packageNativeKernels by tasks.registering(Copy::class) {
     group = "build"
     description = "Stage the built native kernels library into JVM resources."
diff --git a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
index f26c125c..492ac4b7 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
+++ b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
@@ -9,7 +9,7 @@ if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
 
-add_library(skainet_kernels SHARED
+set(SKAINET_KERNEL_SOURCES
     src/skainet_smoke.c
     src/q4k_matmul.c
     src/q5k_matmul.c
@@ -19,45 +19,54 @@ add_library(skainet_kernels SHARED
     src/q4_0_matmul.c
 )
 
-target_include_directories(skainet_kernels PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}/include
-)
+# SHARED: consumed by the JVM via java.lang.foreign (FFM), bundled as a JAR
+#         resource (libskainet_kernels.{so,dylib,dll}).
+# STATIC: consumed by Kotlin/Native via cinterop, linked into the K/N binary
+#         (libskainet_kernels.a). Same sources + flags; both keep -fPIC.
+add_library(skainet_kernels SHARED ${SKAINET_KERNEL_SOURCES})
+add_library(skainet_kernels_static STATIC ${SKAINET_KERNEL_SOURCES})
+set_target_properties(skainet_kernels_static PROPERTIES OUTPUT_NAME skainet_kernels)
 
-# Strip the "lib" prefix on Windows so the artifact name is consistent
-# with the resource-bundle path skainet_kernels.{dll,so,dylib}.
-if(WIN32)
-    set_target_properties(skainet_kernels PROPERTIES PREFIX "")
-endif()
+set(SKAINET_KERNEL_TARGETS skainet_kernels skainet_kernels_static)
 
-# Per-compiler tuning. The Q4_K kernel hot loop is straight-line FP
-# arithmetic that auto-vectorizes cleanly under aggressive optimization
-# (AVX2 on x86_64, NEON on ARM64). Visibility is also handled here on
-# ELF / Mach-O; on Windows the SKAINET_API macro adds dllexport so we
-# don't need /VISIBILITY flags.
-if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
-    target_compile_options(skainet_kernels PRIVATE
-        -fvisibility=hidden
-        -Wall -Wextra
-        -O3
-        -ffast-math
-        -funroll-loops
-    )
-    # AArch64: enable the NEON paths guarded by __ARM_NEON in skainet_simd.h.
-    # The SL2610 is Cortex-A55-class (ARMv8.2-A) — it HAS NEON + fp16 + dotprod
-    # (asimddp) but NOT i8mm (that is ARMv8.6). Do NOT add +i8mm: it would
-    # SIGILL on the board. CONFIRM on-device first: `grep Features /proc/cpuinfo`
-    # must list `asimddp` and `fphp`/`asimdhp`. Explicit -march (not -mcpu=native)
-    # because this is a cross-compile from an x86 host.
-    if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
-        target_compile_options(skainet_kernels PRIVATE
-            -march=armv8.2-a+fp16+dotprod
+foreach(tgt IN LISTS SKAINET_KERNEL_TARGETS)
+    target_include_directories(${tgt} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+    # Strip the "lib" prefix on Windows so the shared artifact name matches
+    # the resource-bundle path skainet_kernels.{dll,so,dylib}.
+    if(WIN32)
+        set_target_properties(${tgt} PROPERTIES PREFIX "")
+    endif()
+
+    # Per-compiler tuning. The matmul hot loops are straight-line FP arithmetic
+    # that auto-vectorizes under aggressive optimization (AVX2 on x86_64, NEON
+    # on ARM64). Visibility is handled here on ELF / Mach-O; on Windows the
+    # SKAINET_API macro adds dllexport.
+    if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
+        target_compile_options(${tgt} PRIVATE
+            -fvisibility=hidden
+            -Wall -Wextra
+            -O3
+            -ffast-math
+            -funroll-loops
+        )
+        # AArch64: enable the NEON paths guarded by __ARM_NEON in skainet_simd.h.
+        # The SL2610 is Cortex-A55-class (ARMv8.2-A) — it HAS NEON + fp16 +
+        # dotprod (asimddp) but NOT i8mm (that is ARMv8.6). Do NOT add +i8mm: it
+        # would SIGILL on the board. CONFIRM on-device first: `grep Features
+        # /proc/cpuinfo` must list `asimddp` and `fphp`/`asimdhp`. Explicit
+        # -march (not -mcpu=native) because this is a cross-compile from x86.
+        if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+            target_compile_options(${tgt} PRIVATE
+                -march=armv8.2-a+fp16+dotprod
+            )
+        endif()
+        set_target_properties(${tgt} PROPERTIES C_VISIBILITY_PRESET hidden)
+    elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+        target_compile_options(${tgt} PRIVATE
+            /O2
+            /fp:fast
+            /W3
         )
     endif()
-    set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden)
-elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC")
-    target_compile_options(skainet_kernels PRIVATE
-        /O2
-        /fp:fast
-        /W3
-    )
-endif()
+endforeach()
diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt
new file mode 100644
index 00000000..1ad62644
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt
@@ -0,0 +1,56 @@
+package sk.ainet.exec.kernel
+
+import kotlinx.cinterop.ExperimentalForeignApi
+import kotlinx.cinterop.addressOf
+import kotlinx.cinterop.reinterpret
+import kotlinx.cinterop.usePinned
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
+import sk.ainet.kernels.cinterop.skainet_q5k_matmul
+
+/**
+ * Kotlin/Native implementation of [Q5KMatmulKernel] that calls the hand-written
+ * C kernel `skainet_q5k_matmul` (the same `q5k_matmul.c` the JVM consumes via
+ * FFM) through cinterop, linked from the static archive `libskainet_kernels.a`.
+ *
+ * This is the board-consumption path: the SL2610 binary is Kotlin/Native, so it
+ * cannot use the JVM-FFM wrapper. The arrays are pinned and their base pointers
+ * passed to C; the C side reads `input + input_offset` etc., so no copy is made.
+ *
+ * On `linuxArm64` the linked archive carries the NEON paths
+ * (`-march=armv8.2-a+fp16+dotprod`); on `linuxX64` (this POC host) it's the
+ * scalar/auto-vectorized build. Correctness is identical across both.
+ */
+@OptIn(ExperimentalForeignApi::class)
+public object NativeKnQ5KMatmulKernel : Q5KMatmulKernel {
+
+    private const val BLOCK_SIZE = 256
+
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "NativeKnQ5KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0 || inputDim == 0) return
+
+        input.usePinned { inPin ->
+            weight.usePinned { wPin ->
+                output.usePinned { outPin ->
+                    skainet_q5k_matmul(
+                        inPin.addressOf(0),
+                        inputOffset,
+                        wPin.addressOf(0).reinterpret(),
+                        weightByteOffset,
+                        inputDim,
+                        outputDim,
+                        outPin.addressOf(0),
+                        outputOffset,
+                    )
+                }
+            }
+        }
+    }
+}
diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt
new file mode 100644
index 00000000..fceb2b28
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt
@@ -0,0 +1,70 @@
+package sk.ainet.exec.kernel
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertTrue
+
+/**
+ * Proves the Kotlin/Native cinterop path: [NativeKnQ5KMatmulKernel] (calling the
+ * C `skainet_q5k_matmul` via cinterop, linked from libskainet_kernels.a) must
+ * agree with the commonMain [ScalarQ5_KMatmulKernel] reference within FMA +
+ * `-ffast-math` reassociation tolerance.
+ *
+ * This is the host (linuxX64) de-risking of the board (linuxArm64) consumption:
+ * the cinterop mechanism + kernel correctness are verified here; only the NEON
+ * codegen differs on aarch64 (board-verify-pending).
+ */
+class NativeKnQ5KMatmulKernelParityTest {
+
+    private val blockSize = 256
+    private val bytesPerBlock = 176
+
+    private fun randomQ5KBytes(numBlocks: Int, seed: Int): ByteArray {
+        val rng = Random(seed)
+        val bytes = ByteArray(numBlocks * bytesPerBlock)
+        rng.nextBytes(bytes)
+        for (block in 0 until numBlocks) {
+            val base = block * bytesPerBlock
+            // 0x3C00 == 1.0f16 for d and dMin so dequant stays finite.
+            bytes[base + 0] = 0x00.toByte()
+            bytes[base + 1] = 0x3C.toByte()
+            bytes[base + 2] = 0x00.toByte()
+            bytes[base + 3] = 0x3C.toByte()
+        }
+        return bytes
+    }
+
+    private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) {
+        val numBlocks = (inputDim / blockSize) * outputDim
+        val packed = randomQ5KBytes(numBlocks, seed)
+        val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f }
+
+        val refOut = FloatArray(outputDim)
+        ScalarQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0)
+
+        val knOut = FloatArray(outputDim)
+        NativeKnQ5KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, knOut, 0)
+
+        for (o in 0 until outputDim) {
+            val diff = abs(refOut[o] - knOut[o])
+            val rel = diff / (abs(refOut[o]) + 1e-9f)
+            assertTrue(
+                diff <= tol || rel < 1e-4f,
+                "row $o diverged: scalar=${refOut[o]} cinterop=${knOut[o]} diff=$diff rel=$rel tol=$tol",
+            )
+        }
+    }
+
+    @Test
+    fun single_block_single_row() = assertParity(256, 1, 42, 1e-2f)
+
+    @Test
+    fun single_block_multi_row() = assertParity(256, 16, 7, 1e-2f)
+
+    @Test
+    fun multi_block_multi_row() = assertParity(1024, 64, 123, 5e-2f)
+
+    @Test
+    fun llm_typical_shape() = assertParity(4096, 64, 999, 5e-1f)
+}
diff --git a/skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def b/skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def
new file mode 100644
index 00000000..23c80954
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def
@@ -0,0 +1,8 @@
+# Kotlin/Native cinterop binding for the hand-written C matmul kernels
+# (skainet_kernels.h). Generates Kotlin bindings for skainet_q5k_matmul etc.;
+# the static archive libskainet_kernels.a (built by CMake) is linked into the
+# consuming K/N binary via linkerOpts (see build.gradle.kts). includeDirs for
+# the header are supplied from the Gradle cinterop block.
+headers = skainet_kernels.h
+headerFilter = skainet_kernels.h
+package = sk.ainet.kernels.cinterop

From 1b6f7f6acc10e1b79c5b0dd77eb7bbd97dce5e99 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Thu, 11 Jun 2026 07:31:56 +0200
Subject: [PATCH 3/4] feat(backend-native-cpu): K/N KernelProvider over the
 cinterop kernels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The K/N analogue of the JVM NativeKernelProvider (FFM): a KernelProvider
(priority 100) exposing the cinterop-backed Q5_K/Q4_K/Q8_0/Q4_0 matmul kernels,
plus installNativeKernels() to register it in KernelRegistry — the path the
eager runtime's DefaultCpuOps.chooseQuantizedMatmulHeap uses to resolve a
kernel. K/N has no ServiceLoader, so registration is an explicit call by the
consumer (scalar fallback for Q6_K etc. is registered separately from
skainet-backend-cpu).

Verified on linuxX64: NativeKnKernelProviderTest — installNativeKernels makes
native-cinterop the best-available provider, its Q5_K kernel is the
registry-resolved kernel, and it matches the scalar reference (6 K/N tests
green total).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../exec/kernel/NativeKnKernelProvider.kt     | 131 ++++++++++++++++++
 .../exec/kernel/NativeKnKernelProviderTest.kt |  67 +++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt
 create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt

diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt
new file mode 100644
index 00000000..83f80c36
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt
@@ -0,0 +1,131 @@
+package sk.ainet.exec.kernel
+
+import kotlinx.cinterop.ExperimentalForeignApi
+import kotlinx.cinterop.addressOf
+import kotlinx.cinterop.reinterpret
+import kotlinx.cinterop.usePinned
+import sk.ainet.backend.api.kernel.Fp32MatmulKernel
+import sk.ainet.backend.api.kernel.KernelProvider
+import sk.ainet.backend.api.kernel.KernelRegistry
+import sk.ainet.backend.api.kernel.Q4KMatmulKernel
+import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5KMatmulKernel
+import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
+import sk.ainet.kernels.cinterop.skainet_q4_0_matmul
+import sk.ainet.kernels.cinterop.skainet_q4k_matmul
+import sk.ainet.kernels.cinterop.skainet_q8_0_matmul
+
+/**
+ * Kotlin/Native [KernelProvider] backed by the hand-written C kernels via
+ * cinterop (static archive `libskainet_kernels.a`) — the K/N analogue of the
+ * JVM `NativeKernelProvider` (FFM). Priority 100, above the commonMain scalar
+ * reference (0). On `linuxArm64` the linked archive carries the NEON paths.
+ *
+ * **Registration is manual on K/N** (no `ServiceLoader`): a consumer calls
+ * [installNativeKernels] once at startup. [Q5KMatmulKernel] (the FunctionGemma
+ * Q5_K_M hot path) plus Q4_K / Q8_0 / Q4_0 are wired; the rest cascade to the
+ * scalar provider.
+ */
+@OptIn(ExperimentalForeignApi::class)
+public object NativeKnKernelProvider : KernelProvider {
+    override val name: String = "native-cinterop"
+    override val priority: Int = 100
+
+    // Statically linked — the symbols are always present once the binary links
+    // libskainet_kernels.a, so the provider is unconditionally available.
+    override fun isAvailable(): Boolean = true
+
+    // Abstract on KernelProvider (no default) — no native FP32 SGEMM wrapper yet.
+    override fun matmulFp32(): Fp32MatmulKernel? = null
+
+    override fun matmulQ5K(): Q5KMatmulKernel = NativeKnQ5KMatmulKernel
+    override fun matmulQ4K(): Q4KMatmulKernel = NativeKnQ4KMatmulKernel
+    override fun matmulQ8_0(): Q8_0MatmulKernel = NativeKnQ8_0MatmulKernel
+    override fun matmulQ4_0(): Q4_0MatmulKernel = NativeKnQ4_0MatmulKernel
+}
+
+/**
+ * Register [NativeKnKernelProvider] in the process-wide [KernelRegistry]. Idempotent
+ * (re-registering the same instance is a no-op). Call once at startup before any
+ * `ops.matmul` on quantized weights.
+ *
+ * For quant types without a C kernel (e.g. Q6_K) also register the commonMain
+ * `ScalarKernelProvider` (from `skainet-backend-cpu`) as the fallback — it lives
+ * in a different module, so the consumer wires it:
+ * `KernelRegistry.register(ScalarKernelProvider)`.
+ */
+public fun installNativeKernels() {
+    KernelRegistry.register(NativeKnKernelProvider)
+}
+
+@OptIn(ExperimentalForeignApi::class)
+public object NativeKnQ4KMatmulKernel : Q4KMatmulKernel {
+    private const val BLOCK_SIZE = 256
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "NativeKnQ4KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0 || inputDim == 0) return
+        input.usePinned { i -> weight.usePinned { w -> output.usePinned { o ->
+            skainet_q4k_matmul(
+                i.addressOf(0), inputOffset,
+                w.addressOf(0).reinterpret(), weightByteOffset,
+                inputDim, outputDim,
+                o.addressOf(0), outputOffset,
+            )
+        } } }
+    }
+}
+
+@OptIn(ExperimentalForeignApi::class)
+public object NativeKnQ8_0MatmulKernel : Q8_0MatmulKernel {
+    private const val BLOCK_SIZE = 32
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "NativeKnQ8_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0 || inputDim == 0) return
+        input.usePinned { i -> weight.usePinned { w -> output.usePinned { o ->
+            skainet_q8_0_matmul(
+                i.addressOf(0), inputOffset,
+                w.addressOf(0).reinterpret(), weightByteOffset,
+                inputDim, outputDim,
+                o.addressOf(0), outputOffset,
+            )
+        } } }
+    }
+}
+
+@OptIn(ExperimentalForeignApi::class)
+public object NativeKnQ4_0MatmulKernel : Q4_0MatmulKernel {
+    private const val BLOCK_SIZE = 32
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "NativeKnQ4_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0 || inputDim == 0) return
+        input.usePinned { i -> weight.usePinned { w -> output.usePinned { o ->
+            skainet_q4_0_matmul(
+                i.addressOf(0), inputOffset,
+                w.addressOf(0).reinterpret(), weightByteOffset,
+                inputDim, outputDim,
+                o.addressOf(0), outputOffset,
+            )
+        } } }
+    }
+}
diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt
new file mode 100644
index 00000000..99e7c614
--- /dev/null
+++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt
@@ -0,0 +1,67 @@
+package sk.ainet.exec.kernel
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.AfterTest
+import kotlin.test.BeforeTest
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertNotNull
+import kotlin.test.assertSame
+import kotlin.test.assertTrue
+import sk.ainet.backend.api.kernel.KernelRegistry
+
+/**
+ * Verifies the K/N kernel provider integrates with [KernelRegistry] the way the
+ * eager runtime's `DefaultCpuOps.chooseQuantizedMatmulHeap` resolves kernels:
+ * after [installNativeKernels], the highest-priority available provider is the
+ * cinterop one, and its Q5_K kernel is the registry-resolved kernel that runs.
+ */
+class NativeKnKernelProviderTest {
+
+    @BeforeTest
+    fun clean() = KernelRegistry.clearForTesting()
+
+    @AfterTest
+    fun reset() = KernelRegistry.clearForTesting()
+
+    @Test
+    fun installs_and_resolves_native_quant_kernels() {
+        installNativeKernels()
+
+        // Priority 100 cinterop beats the scalar (0) fallback.
+        assertEquals("native-cinterop", KernelRegistry.bestAvailable()?.name)
+
+        val provider = KernelRegistry.providers().firstOrNull { it.isAvailable() && it.matmulQ5K() != null }
+        assertNotNull(provider, "no available provider carries a Q5_K kernel")
+        assertSame(NativeKnQ5KMatmulKernel, provider.matmulQ5K())
+        assertTrue(provider.supports("matmul", listOf("Float32", "Q5_K")))
+    }
+
+    @Test
+    fun registry_resolved_q5k_kernel_is_correct() {
+        installNativeKernels()
+        val kernel = KernelRegistry.bestAvailable()!!.matmulQ5K()!!
+
+        val inputDim = 1024
+        val outputDim = 64
+        val numBlocks = (inputDim / 256) * outputDim
+        val packed = ByteArray(numBlocks * 176).also { Random(5).nextBytes(it) }
+        for (b in 0 until numBlocks) {
+            val base = b * 176
+            packed[base] = 0x00; packed[base + 1] = 0x3C // d = 1.0f16
+            packed[base + 2] = 0x00; packed[base + 3] = 0x3C // dMin = 1.0f16
+        }
+        val input = FloatArray(inputDim) { Random(it + 1).nextFloat() - 0.5f }
+
+        val ref = FloatArray(outputDim)
+        ScalarQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, ref, 0)
+        val got = FloatArray(outputDim)
+        kernel.matmul(input, 0, packed, 0, inputDim, outputDim, got, 0)
+
+        for (o in 0 until outputDim) {
+            val diff = abs(ref[o] - got[o])
+            assertTrue(diff <= 5e-2f || diff / (abs(ref[o]) + 1e-9f) < 1e-4f, "row $o: ref=${ref[o]} got=${got[o]}")
+        }
+    }
+}

From 587d59b0798847480dc51ec19b7417f9b2db02c5 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Thu, 11 Jun 2026 10:13:40 +0200
Subject: [PATCH 4/4] feat(backend-native-cpu): add linuxArm64 (board) K/N
 target

Promote the K/N cinterop path from the linuxX64 POC to the real board target:
- linuxArm64 target with the same skainet_kernels cinterop; links the aarch64
  cross-built static archive (cmake-build-arm64/libskainet_kernels.a, NEON).
- Shared `nativeMain` source set holds NativeKn*MatmulKernel + the provider, so
  linuxX64 and linuxArm64 share one implementation (cinterop bindings are
  commonized across both targets).
- linuxArm64 link tasks depend on the aarch64 cross-build only under -PcrossArm64
  (toolchain present); a plain host build still compiles linuxArm64 to a klib.

Verified on host: compileKotlinLinuxArm64 + cinteropSkainetKernelsLinuxArm64
succeed (cross-compiled from x86); linuxX64Test still green (6 tests) on the
shared nativeMain. Final aarch64 binary link + NEON runtime are board-verify-pending.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../build.gradle.kts                          | 42 ++++++++++++++-----
 .../exec/kernel/NativeKnKernelProvider.kt     |  0
 .../exec/kernel/NativeKnQ5KMatmulKernel.kt    |  0
 3 files changed, 31 insertions(+), 11 deletions(-)
 rename skainet-backends/skainet-backend-native-cpu/src/{linuxX64Main => nativeMain}/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt (100%)
 rename skainet-backends/skainet-backend-native-cpu/src/{linuxX64Main => nativeMain}/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt (100%)

diff --git a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
index e5aa22f7..7c5e95e0 100644
--- a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
+++ b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
@@ -7,25 +7,29 @@ plugins {
 val nativeIncludeDir: String = layout.projectDirectory.dir("native/include").asFile.absolutePath
 val staticArchivePath: String =
     layout.buildDirectory.file("native/cmake-build/libskainet_kernels.a").get().asFile.absolutePath
+// aarch64 cross-built static archive (produced by buildNativeKernelsArm64 with
+// -PcrossArm64; carries the NEON paths). Linked into linuxArm64 binaries.
+val staticArchiveArm64Path: String =
+    layout.buildDirectory.file("native/cmake-build-arm64/libskainet_kernels.a").get().asFile.absolutePath
 
 kotlin {
     explicitApi()
     jvm()
 
-    // Kotlin/Native: POC on the host (linuxX64); linuxArm64 is the board target.
-    // Exposes the hand-written C/NEON kernels to K/N via cinterop to the static
-    // archive libskainet_kernels.a (CMake `skainet_kernels_static`). This is the
-    // board-consumption path — the JVM consumes the same kernels via FFM instead.
-    linuxX64 {
+    // Kotlin/Native consumption of the hand-written C/NEON kernels via cinterop
+    // to the static archive libskainet_kernels.a (CMake `skainet_kernels_static`).
+    // linuxX64 = host (POC / CI-runnable); linuxArm64 = the SL2610 board target
+    // (its archive is the aarch64 cross-build with NEON). The JVM consumes the
+    // same kernels via FFM instead. Shared K/N code lives in `nativeMain`.
+    fun org.jetbrains.kotlin.gradle.plugin.mpp.KotlinNativeTarget.wireSkainetKernels(archive: String) {
         compilations.getByName("main").cinterops.create("skainetKernels") {
             defFile(project.file("src/nativeInterop/cinterop/skainet_kernels.def"))
             includeDirs(nativeIncludeDir)
         }
-        binaries.all {
-            // Link the static C archive into every linuxX64 binary (incl. tests).
-            linkerOpts(staticArchivePath)
-        }
+        binaries.all { linkerOpts(archive) }
     }
+    linuxX64 { wireSkainetKernels(staticArchivePath) }
+    linuxArm64 { wireSkainetKernels(staticArchiveArm64Path) }
 
     sourceSets {
         val jvmMain by getting {
@@ -44,11 +48,17 @@ kotlin {
                 implementation(libs.kotlinx.coroutines)
             }
         }
-        val linuxX64Main by getting {
+        // Shared K/N kernels (NativeKn*MatmulKernel + provider), consumed by both
+        // linuxX64 and linuxArm64. The cinterop bindings are commonized across the
+        // two targets so this source set can reference sk.ainet.kernels.cinterop.
+        val nativeMain by creating {
+            dependsOn(commonMain.get())
             dependencies {
                 implementation(project(":skainet-backends:skainet-backend-api"))
             }
         }
+        val linuxX64Main by getting { dependsOn(nativeMain) }
+        val linuxArm64Main by getting { dependsOn(nativeMain) }
         val linuxX64Test by getting {
             dependencies {
                 implementation(libs.kotlin.test)
@@ -122,7 +132,7 @@ val buildNativeKernels by tasks.registering(Exec::class) {
 }
 
 // The linuxX64 (K/N) binaries link libskainet_kernels.a (built by CMake into
-// cmakeBuildPath), so the static archive must exist before the K/N link step.
+// cmakeBuildPath), so the host static archive must exist before the K/N link.
 tasks.matching { it.name.startsWith("link") && it.name.endsWith("LinuxX64") }.configureEach {
     dependsOn(buildNativeKernels)
 }
@@ -211,6 +221,16 @@ tasks.named("jvmProcessResources") {
     if (crossArm64Enabled) dependsOn(packageNativeKernelsArm64)
 }
 
+// linuxArm64 binaries link the aarch64 cross-built archive. Only wired with
+// -PcrossArm64 (cross toolchain present): a plain host build still compiles
+// linuxArm64 to a klib (no archive needed) — only a final binary/test link
+// needs it, which is a board/CI concern.
+if (crossArm64Enabled) {
+    tasks.matching { it.name.startsWith("link") && it.name.endsWith("LinuxArm64") }.configureEach {
+        dependsOn(buildNativeKernelsArm64)
+    }
+}
+
 // Forward `-Dskainet.runBench=true` from Gradle CLI to the forked test
 // JVM so Q4KMatmulMicrobenchTest activates. Skipped silently otherwise.
 val runBenchProperty = providers.systemProperty("skainet.runBench")
diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt b/skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt
similarity index 100%
rename from skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt
rename to skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt
diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt b/skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt
similarity index 100%
rename from skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt
rename to skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt