SKaiNET-developers · michalharakal · Apr 29, 2026 · Apr 29, 2026
diff --git a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts
@@ -15,6 +15,12 @@ kotlin {
         val jvmTest by getting {
             dependencies {
                 implementation(libs.kotlin.test)
+                // Parity tests compare NativeQ4KMatmulKernel output
+                // against PanamaVectorQ4KMatmulKernel; the Panama
+                // kernel pulls in parallelChunks which transitively
+                // requires kotlinx-coroutines.
+                implementation(project(":skainet-backends:skainet-backend-cpu"))
+                implementation(libs.kotlinx.coroutines)
             }
         }
     }
@@ -106,10 +112,15 @@ tasks.named("jvmProcessResources") {
     dependsOn(packageNativeKernels)
 }
 
+// Forward `-Dskainet.runBench=<value>` from the Gradle CLI to the forked test JVM
+// (intended to switch on Q4KMatmulMicrobenchTest); nothing is forwarded when unset.
+val runBenchProperty = providers.systemProperty("skainet.runBench")
+
 tasks.withType<Test>().configureEach {
-    jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED")
+    jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED", "--add-modules", "jdk.incubator.vector")
+    runBenchProperty.orNull?.let { systemProperty("skainet.runBench", it) } // orNull: skip when not passed on the CLI
 }
 
 tasks.withType<JavaExec>().configureEach {
-    jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED")
+    jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED", "--add-modules", "jdk.incubator.vector")
 }
diff --git a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
@@ -11,6 +11,7 @@ endif()
 
 add_library(skainet_kernels SHARED
     src/skainet_smoke.c
+    src/q4k_matmul.c  # Q4_K matrix-vector kernel (defines skainet_q4k_matmul)
 )
 
 target_include_directories(skainet_kernels PUBLIC
@@ -23,8 +24,15 @@ if(WIN32)
     set_target_properties(skainet_kernels PROPERTIES PREFIX "")
 endif()
 
-# Hide non-exported symbols on ELF / Mach-O for a smaller surface area.
+# Hide non-exported symbols on ELF / Mach-O for a smaller surface area
+# and let the compiler auto-vectorize the Q4_K hot loop.
 if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
-    target_compile_options(skainet_kernels PRIVATE -fvisibility=hidden -Wall -Wextra)
+    target_compile_options(skainet_kernels PRIVATE
+        -fvisibility=hidden
+        -Wall -Wextra
+        -O3            # set here unconditionally, independent of CMAKE_BUILD_TYPE
+        -ffast-math    # NOTE(review): relaxes IEEE-754 (reassociation, finite-math assumptions); confirm Inf/NaN scales and the parity tolerance are acceptable
+        -funroll-loops
+    )
     set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden)
 endif()
diff --git a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h
@@ -25,6 +25,30 @@ extern "C" {
  */
 SKAINET_API void skainet_smoke_double(const float* input, float* output, int32_t length);
 
+/*
+ * Q4_K matrix-vector multiply.
+ *
+ *   output[output_offset + o] = sum_j input[input_offset + j] *
+ *                                dequant(weight[block, o, j])
+ *
+ * Block layout: canonical ggml Q4_K, 256 elements per super-block, 144
+ * bytes per block, with packed weights laid out as
+ *   weight + weight_byte_offset + (block_idx * output_dim + o) * 144
+ *
+ * Caller owns input/weight/output memory (not retained past return); the
+ * buffers must not overlap, since the definition marks them __restrict__.
+ */
+SKAINET_API void skainet_q4k_matmul(
+    const float* input,           /* input_dim floats are read starting at input + input_offset */
+    int32_t input_offset,         /* offset in float elements */
+    const uint8_t* weight,        /* packed Q4_K blocks */
+    int32_t weight_byte_offset,   /* offset in bytes */
+    int32_t input_dim,            /* must be a multiple of 256 (a trailing partial block is ignored); <= 0 is a no-op */
+    int32_t output_dim,           /* number of output rows; <= 0 is a no-op */
+    float* output,                /* output_dim floats are written starting at output + output_offset */
+    int32_t output_offset         /* offset in float elements */
+);
+
 #ifdef __cplusplus
 }
 #endif

diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
@@ -0,0 +1,151 @@
+#include "skainet_kernels.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define Q4K_BLOCK_SIZE       256
+#define Q4K_SUB_BLOCK_SIZE    32
+#define Q4K_SUB_BLOCKS         8
+#define Q4K_BYTES_PER_BLOCK  144
+
+/*
+ * IEEE 754 binary16 bits (caller assembles them from LE bytes) -> binary32.
+ * Mirrors PanamaVectorQ4KMatmulKernel.halfToFloat byte-for-byte.
+ */
+static inline float skainet_half_to_float(uint16_t hbits) {
+    const uint32_t sign = (hbits >> 15) & 0x1u;    /* bit 15 */
+    const uint32_t exp  = (hbits >> 10) & 0x1Fu;   /* bits 14..10, biased by 15 */
+    const uint32_t frac =  hbits        & 0x3FFu;  /* bits 9..0 */
+
+    if (exp == 0u) {
+        if (frac == 0u) {
+            union { uint32_t u; float f; } v = { sign << 31 };  /* signed zero */
+            return v.f;
+        }
+        float f = ((float) frac) / 1024.0f * (1.0f / 16384.0f);  /* subnormal: frac * 2^-10 * 2^-14 */
+        return sign ? -f : f;
+    }
+    if (exp == 0x1Fu) {
+        union { uint32_t u; float f; } v;
+        v.u = (sign << 31) | 0x7F800000u | (frac ? 0x00400000u : 0u);  /* Inf, or a quiet NaN (payload not preserved) */
+        return v.f;
+    }
+    union { uint32_t u; float f; } v;
+    v.u = (sign << 31) | ((exp - 15u + 127u) << 23) | (frac << 13);  /* rebias exponent 15 -> 127, widen fraction 10 -> 23 bits */
+    return v.f;
+}
+
+/*
+ * ggml's get_scale_min_k4 unmix: 12 packed bytes (bytes 4..15 of a Q4_K block)
+ * -> eight 6-bit scale / min indices. Same logic as the Kotlin reference.
+ */
+static inline void skainet_q4k_decode_scales(
+    const uint8_t* scales,   /* in: bytes scales[0..11] are read */
+    int* scale_idx,          /* out: 8 values, each in 0..63 */
+    int* min_idx             /* out: 8 values, each in 0..63 */
+) {
+    for (int sb = 0; sb < 4; ++sb) {                       /* sub-blocks 0..3: low 6 bits stored directly */
+        scale_idx[sb] = scales[sb]     & 0x3F;
+        min_idx[sb]   = scales[sb + 4] & 0x3F;
+    }
+    for (int sb = 4; sb < 8; ++sb) {                       /* sub-blocks 4..7: 4 low bits + 2 borrowed high bits */
+        const int low4_s  = scales[sb + 4] & 0x0F;         /* low nibble of bytes 8..11 */
+        const int high2_s = (scales[sb - 4] >> 6) & 0x03;  /* top 2 bits of bytes 0..3 */
+        scale_idx[sb] = low4_s | (high2_s << 4);
+
+        const int low4_m  = (scales[sb + 4] >> 4) & 0x0F;  /* high nibble of bytes 8..11 */
+        const int high2_m = (scales[sb] >> 6) & 0x03;      /* top 2 bits of bytes 4..7 */
+        min_idx[sb] = low4_m | (high2_m << 4);
+    }
+}
+
+/*
+ * Native Q4_K matrix-vector multiply matching the
+ * sk.ainet.backend.api.kernel.Q4KMatmulKernel SPI contract. Single
+ * input row times an `outputDim x inputDim` Q4_K-packed weight tensor
+ * laid out (blockIdx * outputDim + o) * 144 bytes. Returns without
+ * touching output when either dimension is <= 0.
+ * Lazy-dmin pattern: per sub-block accumulate
+ *   codeSum[s] = sum_i input[i] * code[i]
+ *   inputSum[s] = sum_i input[i]
+ * and combine once via
+ *   acc += d * scaleIdx[s] * codeSum[s] - dMin * minIdx[s] * inputSum[s]
+ *
+ * Scalar single-threaded for PR 2; the tight inner loop is
+ * straight-line FP arithmetic so -O3 auto-vectorizes the
+ * codeSum/inputSum accumulators on AVX2/NEON.
+ */
+SKAINET_API void skainet_q4k_matmul(
+    const float* __restrict__ input,
+    int32_t input_offset,
+    const uint8_t* __restrict__ weight,
+    int32_t weight_byte_offset,
+    int32_t input_dim,
+    int32_t output_dim,
+    float* __restrict__ output,
+    int32_t output_offset
+) {
+    if (output_dim <= 0 || input_dim <= 0) return;
+
+    const int32_t blocks_per_input_dim = input_dim / Q4K_BLOCK_SIZE;
+    const float* in_base = input + input_offset;
+    float* out_base = output + output_offset;
+
+    int scale_idx[Q4K_SUB_BLOCKS];
+    int min_idx[Q4K_SUB_BLOCKS];
+
+    for (int32_t o = 0; o < output_dim; ++o) {
+        float acc = 0.0f;
+        for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
+            /* Widen to size_t BEFORE multiplying: the int32 product block_idx * output_dim could overflow. */
+            const uint8_t* block = weight + weight_byte_offset
+                + ((size_t) block_idx * (size_t) output_dim + (size_t) o) * Q4K_BYTES_PER_BLOCK;
+
+            /* d, dMin (FP16 LE -> FP32). */
+            const uint16_t d_bits     = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
+            const uint16_t d_min_bits = (uint16_t) block[2] | ((uint16_t) block[3] << 8);
+            const float d     = skainet_half_to_float(d_bits);
+            const float d_min = skainet_half_to_float(d_min_bits);
+
+            /* 12 bytes of packed (scaleIdx, minIdx) -> 8 ints each. */
+            skainet_q4k_decode_scales(block + 4, scale_idx, min_idx);
+
+            const uint8_t* qs = block + 16;
+            const float* in_block = in_base + (size_t) block_idx * Q4K_BLOCK_SIZE;
+
+            /* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */
+            for (int group_j = 0; group_j < 4; ++group_j) {
+                const uint8_t* qs_group   = qs + group_j * Q4K_SUB_BLOCK_SIZE;
+                const int sb_lo = 2 * group_j;
+                const int sb_hi = sb_lo + 1;
+                const float* in_lo = in_block + sb_lo * Q4K_SUB_BLOCK_SIZE;
+                const float* in_hi = in_block + sb_hi * Q4K_SUB_BLOCK_SIZE;
+
+                float code_sum_lo = 0.0f, input_sum_lo = 0.0f;
+                float code_sum_hi = 0.0f, input_sum_hi = 0.0f;
+
+                /* 32 iterations — auto-vectorizes cleanly under -O3. */
+                for (int i = 0; i < Q4K_SUB_BLOCK_SIZE; ++i) {
+                    const uint8_t b = qs_group[i];
+                    const float code_lo = (float)(b & 0x0F);
+                    const float code_hi = (float)(b >> 4);
+                    const float v_lo = in_lo[i];
+                    const float v_hi = in_hi[i];
+                    code_sum_lo  += v_lo * code_lo;
+                    input_sum_lo += v_lo;
+                    code_sum_hi  += v_hi * code_hi;
+                    input_sum_hi += v_hi;
+                }
+
+                const float scale_lo  = d     * (float) scale_idx[sb_lo];
+                const float offset_lo = d_min * (float) min_idx[sb_lo];
+                const float scale_hi  = d     * (float) scale_idx[sb_hi];
+                const float offset_hi = d_min * (float) min_idx[sb_hi];
+                acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo;
+                acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi;
+            }
+        }
+
+        out_base[o] = acc;
+    }
+}
diff --git a/...kainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt b/...kainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt
@@ -8,26 +8,24 @@ import sk.ainet.backend.api.kernel.Q4KMatmulKernel
  * Native (FFM) [KernelProvider]. Sits at priority `100`, above
  * [PanamaVectorKernelProvider] (`50`) and the scalar reference (`0`).
  *
- * PR 1 of the staged native-FFM rollout (see the `native-ffm-plan`
- * asciidoc) only ships the module scaffolding: the Gradle ↔ CMake
- * pipeline that produces a host-arch shared library, its bundling into
- * JAR resources, and an end-to-end FFM smoke downcall test. No real
- * matmul kernel is wired into the public SPI yet.
+ * Availability is gated on [NativeQ4KMatmulKernel.isAvailable] — the
+ * bundled `libskainet_kernels` shared library has to load AND the
+ * `skainet_q4k_matmul` symbol has to resolve via FFM. When either
+ * fails (missing arch, sandbox, JDK without FFM, kill-switch),
+ * `KernelRegistry.bestAvailable()` cleanly cascades to
+ * [PanamaVectorKernelProvider] at priority 50.
  *
- * Until [NativeQ4KMatmulKernel] (or its `MemSegment`-input sibling)
- * lands in PR 2, this provider deliberately reports `isAvailable() =
- * false` and returns `null` from every kernel accessor. That keeps
- * `KernelRegistry.bestAvailable()` cleanly cascading down to the
- * Panama priority-50 provider on every shape we measure today, so
- * adding the new module to the classpath produces no behavior change.
+ * PR 2 of the staged rollout: real Q4_K matmul wired into the SPI.
+ * `matmulFp32` follows in a later PR alongside a native FP32 kernel.
  */
 public object NativeKernelProvider : KernelProvider {
     override val name: String = "native-ffm"
     override val priority: Int = 100
 
-    override fun isAvailable(): Boolean = false
+    override fun isAvailable(): Boolean = NativeQ4KMatmulKernel.isAvailable() // gated on the Q4_K kernel, the only native kernel so far
 
     override fun matmulFp32(): Fp32MatmulKernel? = null
 
-    override fun matmulQ4K(): Q4KMatmulKernel? = null
+    override fun matmulQ4K(): Q4KMatmulKernel? =
+        if (NativeQ4KMatmulKernel.isAvailable()) NativeQ4KMatmulKernel else null // never hand out a kernel whose native handle is missing
 }
diff --git a/...ainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ4KMatmulKernel.kt b/...ainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ4KMatmulKernel.kt
@@ -0,0 +1,102 @@
+package sk.ainet.exec.kernel
+
+import java.lang.foreign.Arena
+import java.lang.foreign.FunctionDescriptor
+import java.lang.foreign.Linker
+import java.lang.foreign.MemorySegment
+import java.lang.foreign.ValueLayout
+import java.lang.invoke.MethodHandle
+import sk.ainet.backend.api.kernel.Q4KMatmulKernel
+
+/**
+ * Native (FFM) implementation of [Q4KMatmulKernel].
+ *
+ * Wraps the bundled C symbol
+ *
+ *   void skainet_q4k_matmul(
+ *       const float* input, int32_t input_offset,
+ *       const uint8_t* weight, int32_t weight_byte_offset,
+ *       int32_t input_dim, int32_t output_dim,
+ *       float* output, int32_t output_offset);
+ *
+ * The C kernel implements the same lazy-`dmin` accumulation as
+ * [PanamaVectorQ4KMatmulKernel] (sum input·code and sum input per
+ * sub-block, combine via `d * scaleIdx[s] * codeSum - dMin * minIdx[s] * inputSum`)
+ * and shares the canonical 256-element / 144-byte super-block layout.
+ *
+ * Numerical parity vs the Panama kernel is asserted by
+ * [NativeQ4KMatmulKernelParityTest] within `1e-4` relative tolerance,
+ * matching the parity bar `PanamaVectorQ4KMatmulKernelTest` uses.
+ *
+ * PR 2 of the staged native-FFM rollout: ships a single-threaded
+ * scalar C kernel (`-O3 -ffast-math`, auto-vectorized inner loop).
+ * NEON / AVX2 intrinsics, `MemorySegment`-input zero-copy variant,
+ * and cross-arch CI shipping are deferred to PRs 3–5.
+ */
+internal object NativeQ4KMatmulKernel : Q4KMatmulKernel {
+
+    private const val BLOCK_SIZE = 256
+    private const val BYTES_PER_BLOCK = 144L
+
+    /** True when the bundled library loaded and `skainet_q4k_matmul` resolved via FFM. */
+    fun isAvailable(): Boolean = handle != null
+
+    /**
+     * Copies the input/weight windows off-heap, runs the native kernel, then copies
+     * `outputDim` floats back into [output]. Throws [IllegalArgumentException] on a
+     * negative or non-multiple-of-256 dimension or an over-sized weight window.
+     */
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "NativeQ4KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0 || inputDim == 0) return
+        val mh = handle
+            ?: error("NativeQ4KMatmulKernel.matmul invoked while native library unavailable")
+        require(inputDim > 0 && outputDim > 0) {
+            "NativeQ4KMatmulKernel: dimensions must be non-negative; got inputDim=$inputDim, outputDim=$outputDim"
+        }
+        // Bytes the kernel reads: (inputDim / 256) * outputDim blocks of 144 bytes. Computed in
+        // Long and range-checked so the Int count handed to MemorySegment.copy cannot wrap.
+        val weightBytesUsed = (inputDim / BLOCK_SIZE).toLong() * outputDim * BYTES_PER_BLOCK
+        require(weightBytesUsed <= Int.MAX_VALUE) {
+            "NativeQ4KMatmulKernel: weight window of $weightBytesUsed bytes exceeds ByteArray capacity"
+        }
+        // Off-heap copies sized exactly to the windows the kernel touches; this keeps the
+        // native side oblivious to the JVM heap layout (MemorySegment inputs follow in PR 3).
+        Arena.ofConfined().use { arena ->
+            val floatAlign = ValueLayout.JAVA_FLOAT.byteAlignment()
+            val inSeg = arena.allocate(inputDim.toLong() * java.lang.Float.BYTES, floatAlign)
+            val outSeg = arena.allocate(outputDim.toLong() * java.lang.Float.BYTES, floatAlign)
+            val weightSeg = arena.allocate(weightBytesUsed, 1L)
+
+            MemorySegment.copy(input, inputOffset, inSeg, ValueLayout.JAVA_FLOAT, 0L, inputDim)
+            MemorySegment.copy(weight, weightByteOffset, weightSeg, ValueLayout.JAVA_BYTE, 0L, weightBytesUsed.toInt())
+            // The segments already start at the requested windows, so every native offset is 0.
+            mh.invoke(inSeg, 0, weightSeg, 0, inputDim, outputDim, outSeg, 0)
+
+            MemorySegment.copy(outSeg, ValueLayout.JAVA_FLOAT, 0L, output, outputOffset, outputDim)
+        }
+    }
+
+    private val handle: MethodHandle? by lazy { // resolved once; null means "native kernel unavailable"
+        val lookup = NativeLibraryLoader.lookup() ?: return@lazy null
+        val symbol = lookup.find("skainet_q4k_matmul").orElse(null) ?: return@lazy null
+        val descriptor = FunctionDescriptor.ofVoid(
+            ValueLayout.ADDRESS,    // input
+            ValueLayout.JAVA_INT,   // input_offset
+            ValueLayout.ADDRESS,    // weight
+            ValueLayout.JAVA_INT,   // weight_byte_offset
+            ValueLayout.JAVA_INT,   // input_dim
+            ValueLayout.JAVA_INT,   // output_dim
+            ValueLayout.ADDRESS,    // output
+            ValueLayout.JAVA_INT,   // output_offset
+        )
+        runCatching { Linker.nativeLinker().downcallHandle(symbol, descriptor) }.getOrNull()
+    }
+}