diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt
index a5934221..fd22f37f 100644
--- a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt
+++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt
@@ -67,6 +67,12 @@ public interface KernelProvider {
      */
     public fun matmulQ8_0(): Q8_0MatmulKernel? = null
 
+    /**
+     * F32 × Q4_0 matmul kernel exposed by this provider, or `null` (the default)
+     * if this provider does not specialize Q4_0. Same fall-through pattern as [matmulQ8_0].
+     */
+    public fun matmulQ4_0(): Q4_0MatmulKernel? = null
+
     /**
      * Capability query: does this provider carry a kernel for
      * [opName] with the given [dtypeKeys]?
@@ -100,6 +106,7 @@ public interface KernelProvider {
             "BFloat16" -> matmulBf16() != null
             "Q4_K" -> matmulQ4K() != null
             "Q8_0" -> matmulQ8_0() != null
+            "Q4_0" -> matmulQ4_0() != null
             else -> false
         }
     }
diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q4_0MatmulKernel.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q4_0MatmulKernel.kt
new file mode 100644
index 00000000..fae0825b
--- /dev/null
+++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q4_0MatmulKernel.kt
@@ -0,0 +1,46 @@
+package sk.ainet.backend.api.kernel
+
+/**
+ * Matrix-vector product of an FP32 input row with Q4_0-packed weights
+ * stored in the canonical ggml block format:
+ *
+ *   output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
+ *     for j ∈ [0, inputDim), o ∈ [0, outputDim)
+ *
+ * Each block covers 32 elements in 18 bytes (see the
+ * [sk.ainet.lang.tensor.data.Q4_0BlockTensorData] kdoc):
+ * - bytes 0..1  : `d`, the block scale, as little-endian FP16
+ * - bytes 2..17 : 32 4-bit codes in the split layout — the low nibble of
+ *   code byte `j` is element `j`, the high nibble is element `j + 16`
+ *
+ * An element dequantizes to `(code - 8) * d`; subtracting 8 centres the
+ * unsigned code around zero. Q4_0 carries no per-block min / offset.
+ *
+ * Weight addressing: the block for output row `o` and input block `blockIdx`
+ * starts `(blockIdx * outputDim + o) * 18` bytes past `weightByteOffset`,
+ * matching `Q4_0BlockTensorData.packedData`. `inputDim` MUST be a multiple of 32.
+ * Implementations MUST NOT mutate `input` or `weight`, MAY assume the arrays do not
+ * alias each other or `output`, and MUST write all `outputDim` floats from `output[outputOffset]`.
+ */
+public interface Q4_0MatmulKernel {
+    /**
+     * @param input FP32 input vector (a single row).
+     * @param inputOffset index in [input] of the row's first element.
+     * @param weight packed Q4_0 bytes of the whole `outputDim × inputDim` weight tensor.
+     * @param weightByteOffset byte index in [weight] at which block (0, 0) begins.
+     * @param inputDim length of the contraction dimension; a multiple of 32.
+     * @param outputDim number of output cells to produce.
+     * @param output FP32 destination vector.
+     * @param outputOffset index in [output] of the first cell written.
+     */
+    public fun matmul(
+        input: FloatArray,
+        inputOffset: Int,
+        weight: ByteArray,
+        weightByteOffset: Int,
+        inputDim: Int,
+        outputDim: Int,
+        output: FloatArray,
+        outputOffset: Int,
+    )
+}
diff --git a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api
index 0caec153..a953d311 100644
--- a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api
+++ b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api
@@ -53,6 +53,7 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProvider : sk/ainet/ba
 	public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel;
 	public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
 	public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
+	public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
 	public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
 	public fun supports (Ljava/lang/String;Ljava/util/List;)Z
 }
@@ -65,6 +66,7 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProviderFactory : sk/a
 	public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel;
 	public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
 	public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
+	public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
 	public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
 	public fun supports (Ljava/lang/String;Ljava/util/List;)Z
 }
@@ -97,6 +99,7 @@ public final class sk/ainet/exec/kernel/ScalarKernelProvider : sk/ainet/backend/
 	public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel;
 	public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
 	public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
+	public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
 	public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
 	public fun supports (Ljava/lang/String;Ljava/util/List;)Z
 }
@@ -109,6 +112,7 @@ public final class sk/ainet/exec/kernel/ScalarKernelProviderFactory : sk/ainet/b
 	public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel;
 	public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
 	public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
+	public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
 	public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
 	public fun supports (Ljava/lang/String;Ljava/util/List;)Z
 }
@@ -118,6 +122,11 @@ public final class sk/ainet/exec/kernel/ScalarMatmulKernel : sk/ainet/backend/ap
 	public fun matmul ([FII[FII[FIIIII)V
 }
 
+public final class sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel : sk/ainet/backend/api/kernel/Q4_0MatmulKernel {
+	public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ4_0MatmulKernel;
+	public fun matmul ([FI[BIII[FI)V
+}
+
 public final class sk/ainet/exec/kernel/ScalarQ8_0MatmulKernel : sk/ainet/backend/api/kernel/Q8_0MatmulKernel {
 	public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ8_0MatmulKernel;
 	public fun matmul ([FI[BIII[FI)V
diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt
index 080377a7..a7c13ccd 100644
--- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt
+++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt
@@ -3,6 +3,7 @@ package sk.ainet.exec.kernel
 import sk.ainet.backend.api.kernel.Bf16MatmulKernel
 import sk.ainet.backend.api.kernel.Fp32MatmulKernel
 import sk.ainet.backend.api.kernel.KernelProvider
+import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
 import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
 
 /**
@@ -25,4 +26,5 @@ public object ScalarKernelProvider : KernelProvider {
     override fun matmulFp32(): Fp32MatmulKernel = ScalarMatmulKernel
     override fun matmulBf16(): Bf16MatmulKernel = ScalarBf16MatmulKernel
     override fun matmulQ8_0(): Q8_0MatmulKernel = ScalarQ8_0MatmulKernel
+    override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel
 }
diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel.kt
new file mode 100644
index 00000000..6a844e86
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel.kt
@@ -0,0 +1,96 @@
+package sk.ainet.exec.kernel
+
+import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
+
+/**
+ * Portable reference implementation of [Q4_0MatmulKernel]: blocks are
+ * dequantized on the fly and accumulated element by element, without SIMD.
+ * It lives in common code, so every KMP target can use it, both as
+ *
+ * - the baseline that accelerated kernels (Panama Vector, native FFM) are
+ *   expected to reproduce up to floating-point ordering differences, and
+ * - the fallback when no accelerated provider is registered.
+ *
+ * A block spans 32 elements in 18 bytes: bytes 0..1 hold the FP16
+ * little-endian scale `d`, bytes 2..17 pack 32 4-bit codes (split layout).
+ * Every element dequantizes to `(code - 8) * d`; there is no min / offset.
+ * Speed is not a goal: production paths should pick an accelerated kernel.
+ */
+public object ScalarQ4_0MatmulKernel : Q4_0MatmulKernel {
+
+    private const val BLOCK_SIZE = 32
+    private const val BYTES_PER_BLOCK = 18
+    private const val HALF_BLOCK = BLOCK_SIZE / 2
+
+    /**
+     * Writes `outputDim` dot products into [output]. One FP32 accumulator is
+     * carried left to right through each row; an empty contraction stores `0f`.
+     * @throws IllegalArgumentException if [inputDim] is not a multiple of 32.
+     */
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "ScalarQ4_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        val blocksPerRow = inputDim / BLOCK_SIZE
+        var row = 0
+        while (row < outputDim) {
+            var sum = 0f
+            var block = 0
+            while (block < blocksPerRow) {
+                val blockStart = weightByteOffset + (block * outputDim + row) * BYTES_PER_BLOCK
+                sum = accumulateBlock(sum, input, inputOffset + block * BLOCK_SIZE, weight, blockStart)
+                block++
+            }
+            output[outputOffset + row] = sum
+            row++
+        }
+    }
+
+    /**
+     * Adds one block's 32 products to [running] and returns the new total; the
+     * accumulator is threaded through so summation stays one left-to-right chain.
+     */
+    private fun accumulateBlock(
+        running: Float, input: FloatArray, inputStart: Int, weight: ByteArray, blockStart: Int,
+    ): Float {
+        val lowByte = weight[blockStart].toInt() and 0xFF
+        val highByte = weight[blockStart + 1].toInt() and 0xFF
+        val d = halfToFloat(lowByte or (highByte shl 8))
+        val codesStart = blockStart + 2
+        var total = running
+        for (k in 0 until HALF_BLOCK) {
+            val packed = weight[codesStart + k].toInt() and 0xFF
+            val lo = (packed and 0x0F) - 8 // element k
+            val hi = (packed ushr 4) - 8 // element k + 16
+            total += input[inputStart + k] * lo * d
+            total += input[inputStart + HALF_BLOCK + k] * hi * d
+        }
+        return total
+    }
+
+    /**
+     * Widens the IEEE-754 half in the low 16 bits of [hbits] to FP32. Kept
+     * local because the skainet-lang-core helper is internal to that module.
+     */
+    private fun halfToFloat(hbits: Int): Float {
+        val signBit = (hbits and 0x8000) shl 16
+        val exponent = (hbits and 0x7C00) shr 10
+        val fraction = hbits and 0x03FF
+        if (exponent == 31) return Float.fromBits(signBit or (0xFF shl 23) or (fraction shl 13))
+        if (exponent != 0) return Float.fromBits(signBit or ((exponent - 15 + 127) shl 23) or (fraction shl 13))
+        if (fraction == 0) return Float.fromBits(signBit)
+        // Subnormal half: shift the fraction up until its implicit bit appears.
+        var normalized = fraction
+        var power = -14
+        while ((normalized and 0x400) == 0) {
+            normalized = normalized shl 1
+            power--
+        }
+        return Float.fromBits(signBit or ((power + 127) shl 23) or ((normalized and 0x3FF) shl 13))
+    }
+}
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt
index 703beebf..b70abfd9 100644
--- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt
+++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt
@@ -9,9 +9,11 @@ import sk.ainet.backend.api.kernel.KernelRegistry
 import sk.ainet.backend.api.kernel.KernelServiceLoader
 import sk.ainet.backend.api.kernel.KernelStrictness
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
+import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
 import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
 import sk.ainet.exec.kernel.ScalarBf16MatmulKernel
 import sk.ainet.exec.kernel.ScalarMatmulKernel
+import sk.ainet.exec.kernel.ScalarQ4_0MatmulKernel
 import sk.ainet.lang.tensor.Shape
 import sk.ainet.lang.tensor.Tensor
 import sk.ainet.lang.tensor.data.DenseFloatArrayTensorData
@@ -21,6 +23,7 @@ import sk.ainet.lang.tensor.data.MemorySegmentTensorData
 import sk.ainet.lang.tensor.data.Q4MemorySegmentMarker
 import sk.ainet.lang.tensor.data.Q4MemorySegmentTensorData
 import sk.ainet.lang.tensor.data.Bf16TensorData
+import sk.ainet.lang.tensor.data.Q4_0TensorData
 import sk.ainet.lang.tensor.data.Q8_0TensorData
 import sk.ainet.lang.tensor.data.Q8MemorySegmentMarker
 import sk.ainet.lang.tensor.data.Q8MemorySegmentTensorData
@@ -113,6 +116,24 @@ internal class DefaultCpuOpsJvm(
             ?: ScalarBf16MatmulKernel
     }
 
+    /**
+     * Q4_0 matmul kernel, resolved lazily through [KernelRegistry] and never
+     * null; the lookup has the same shape as [bf16MatmulKernel]. When the
+     * registry is still empty, [KernelServiceLoader.installAll] populates it
+     * first. The kernel is taken from the first provider returned by the
+     * registry that is available and exposes a Q4_0 kernel; if there is none,
+     * [ScalarQ4_0MatmulKernel] is used. Providers may return `null` from
+     * `matmulQ4_0()`, so the scalar SPI kernel is the floor here — Q4_0 has
+     * no pre-SPI legacy fallback to thread through.
+     */
+    private val q4_0MatmulKernel: Q4_0MatmulKernel by lazy {
+        if (KernelRegistry.providers().isEmpty()) KernelServiceLoader.installAll()
+        val provider = KernelRegistry.providers().firstOrNull { candidate ->
+            candidate.isAvailable() && candidate.matmulQ4_0() != null
+        }
+        provider?.matmulQ4_0() ?: ScalarQ4_0MatmulKernel
+    }
+
     override fun <T : DType, V> add(a: Tensor<T, V>, b: Tensor<T, V>): Tensor<T, V> {
         vectorFloatBinary(a, b, { x, y -> x.add(y) }) { x, y -> x + y }?.let { return it }
         return super.add(a, b)
@@ -521,6 +542,22 @@ internal class DefaultCpuOpsJvm(
                 @Suppress("UNCHECKED_CAST")
                 CpuTensor(outData as TensorData<T, V>, this, a.dtype)
             }
+            is Q4_0TensorData -> {
+                val outBuffer = FloatArray(batchSize * outputDim)
+                for (batch in 0 until batchSize) {
+                    val batchInput = if (batchSize == 1) inputBuffer
+                    else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim)
+                    q4_0MatmulKernel.matmul(
+                        batchInput, 0,
+                        bData.packedData, 0,
+                        inputDim, outputDim,
+                        outBuffer, batch * outputDim,
+                    )
+                }
+                val outData = DenseFloatArrayTensorData<T>(Shape(batchSize, outputDim), outBuffer)
+                @Suppress("UNCHECKED_CAST")
+                CpuTensor(outData as TensorData<T, V>, this, a.dtype)
+            }
             is Q4_KTensorData -> {
                 val outBuffer = FloatArray(batchSize * outputDim)
                 val spiKernel = q4kMatmulKernel
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt
index cc68683b..07a83cc3 100644
--- a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt
+++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt
@@ -47,6 +47,11 @@ class KernelProviderSupportsTest {
             p.supports("matmul", listOf("Float32", "Q8_0")),
             "Q8_0 matmul support must mirror matmulQ8_0() != null",
         )
+        assertEquals(
+            p.matmulQ4_0() != null,
+            p.supports("matmul", listOf("Float32", "Q4_0")),
+            "Q4_0 matmul support must mirror matmulQ4_0() != null",
+        )
     }
 
     @Test
@@ -62,6 +67,9 @@ class KernelProviderSupportsTest {
             p.matmulQ4K() != null,
             p.supports("matmul", listOf("Float32", "Q4_K")),
         )
+        // Scalar carries the Q4_0 floor kernel, so the capability query
+        // must report it as supported.
+        assertTrue(p.supports("matmul", listOf("Float32", "Q4_0")))
     }
 
     @Test
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0MatmulDispatchTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0MatmulDispatchTest.kt
new file mode 100644
index 00000000..f005dc25
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0MatmulDispatchTest.kt
@@ -0,0 +1,110 @@
+package sk.ainet.exec.tensor.ops
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertTrue
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.exec.kernel.ScalarQ4_0MatmulKernel
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.data.Q4_0BlockTensorData
+import sk.ainet.lang.tensor.data.TensorData
+import sk.ainet.lang.types.FP32
+
+/**
+ * Integration tests for the FP32 × Q4_0 branch of [DefaultCpuOpsJvm.matmul].
+ * Each case multiplies an FP32 input by a Q4_0-backed weight tensor through
+ * the execution context and compares the result with [ScalarQ4_0MatmulKernel]
+ * run on the same bytes, showing that the dispatch reaches a Q4_0 kernel (a
+ * registered SPI kernel or the scalar floor). Like [Q8_0MatmulDispatchTest],
+ * this pins the wiring, not the numerical correctness of the kernel.
+ */
+class Q4_0MatmulDispatchTest {
+
+    private val ctx = DirectCpuExecutionContext()
+
+    private val blockSize = 32
+    private val bytesPerBlock = 18
+
+    /**
+     * Seeded random Q4_0 payload of `blocksPerInputDim * outputDim` blocks. Codes stay
+     * random; every FP16 scale is forced to 0x2200 (≈ 1.2e-2): finite and non-zero.
+     */
+    private fun randomQ4_0Bytes(blocksPerInputDim: Int, outputDim: Int, seed: Int): ByteArray {
+        val totalBlocks = blocksPerInputDim * outputDim
+        val packed = Random(seed).nextBytes(totalBlocks * bytesPerBlock)
+        for (scaleOffset in 0 until packed.size step bytesPerBlock) {
+            packed[scaleOffset] = 0x00.toByte() // scale, low byte
+            packed[scaleOffset + 1] = 0x22.toByte() // scale, high byte
+        }
+        return packed
+    }
+
+    /** Expected output: the scalar kernel applied to every batch row in turn. */
+    private fun scalarQ4_0Reference(
+        input: FloatArray, weight: ByteArray,
+        inputDim: Int, outputDim: Int,
+        batchSize: Int,
+    ): FloatArray = FloatArray(batchSize * outputDim).also { result ->
+        repeat(batchSize) { row ->
+            ScalarQ4_0MatmulKernel.matmul(
+                input, row * inputDim,
+                weight, 0,
+                inputDim, outputDim,
+                result, row * outputDim,
+            )
+        }
+    }
+
+    /**
+     * Runs `ctx.ops.matmul` on seeded random data and checks every output cell against
+     * the scalar reference, with an absolute tolerance that scales with the block count.
+     */
+    private fun assertDispatchMatchesScalar(
+        batchSize: Int, inputDim: Int, outputDim: Int, seed: Int,
+        tolPerBlock: Float = 1e-2f,
+    ) {
+        val rng = Random(seed)
+        val inputFloats = FloatArray(batchSize * inputDim) { rng.nextFloat() - 0.5f }
+        val blocksPerInputDim = inputDim / blockSize
+        val weightBytes = randomQ4_0Bytes(blocksPerInputDim, outputDim, seed)
+
+        // The weight tensor is declared with logical shape [inputDim, outputDim].
+        @Suppress("UNCHECKED_CAST")
+        val td = Q4_0BlockTensorData(Shape(inputDim, outputDim), weightBytes) as TensorData<FP32, Float>
+        val weight = ctx.fromData(td, FP32::class)
+        val input = ctx.fromFloatArray<FP32, Float>(Shape(batchSize, inputDim), FP32::class, inputFloats)
+
+        val outArr = ctx.ops.matmul(input, weight).data.copyToFloatArray()
+        val expected = scalarQ4_0Reference(inputFloats, weightBytes, inputDim, outputDim, batchSize)
+
+        val scaledTol = tolPerBlock * maxOf(blocksPerInputDim, 1)
+        val tol = scaledTol.coerceAtLeast(tolPerBlock)
+        expected.indices.forEach { i ->
+            val diff = abs(expected[i] - outArr[i])
+            assertTrue(
+                diff <= tol,
+                "dispatch mismatch at $i: expected=${expected[i]} got=${outArr[i]} diff=$diff tol=$tol",
+            )
+        }
+    }
+
+    @Test
+    fun single_batch_matmul_against_q4_0_weight_routes_correctly() {
+        // batchSize == 1: the dispatch hands the input buffer over without a copyOfRange.
+        assertDispatchMatchesScalar(batchSize = 1, inputDim = 128, outputDim = 64, seed = 1)
+    }
+
+    @Test
+    fun multi_batch_matmul_against_q4_0_weight_routes_correctly() {
+        // batchSize > 1: every row goes through the per-row copyOfRange branch.
+        assertDispatchMatchesScalar(batchSize = 3, inputDim = 256, outputDim = 32, seed = 2)
+    }
+
+    @Test
+    fun llm_typical_attention_proj_matmul_routes_correctly() {
+        // A dim × dim matvec at a realistic attention-projection size.
+        assertDispatchMatchesScalar(batchSize = 1, inputDim = 512, outputDim = 512, seed = 3)
+    }
+}
diff --git a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
index 40f996c8..1805010a 100644
--- a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
+++ b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
@@ -3030,6 +3030,55 @@ public final class sk/ainet/lang/tensor/data/Q4MemorySegmentTensorData$Companion
 	public static synthetic fun fromRawBytes$default (Lsk/ainet/lang/tensor/data/Q4MemorySegmentTensorData$Companion;Lsk/ainet/lang/tensor/Shape;[BLjava/lang/foreign/Arena;JILjava/lang/Object;)Lsk/ainet/lang/tensor/data/Q4MemorySegmentTensorData;
 }
 
+public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData : sk/ainet/lang/tensor/data/Q4_0TensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage {
+	public static final field Companion Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion;
+	public fun <init> (Lsk/ainet/lang/tensor/Shape;[B)V
+	public fun copyToFloatArray ()[F
+	public fun dequantizeBlock (I[FI)V
+	public fun get ([I)Ljava/lang/Byte;
+	public synthetic fun get ([I)Ljava/lang/Object;
+	public fun getBlockCount ()I
+	public fun getBlockScale (I)F
+	public fun getBlockSize ()I
+	public fun getCode (II)B
+	public fun getElementCount ()J
+	public fun getEncoding ()Lsk/ainet/lang/tensor/storage/TensorEncoding;
+	public fun getPackedData ()[B
+	public fun getPhysicalBytes ()J
+	public fun getShape ()Lsk/ainet/lang/tensor/Shape;
+	public fun set ([IB)V
+	public synthetic fun set ([ILjava/lang/Object;)V
+	public fun toFloatArray ()[F
+	public fun toTensorStorage (Lsk/ainet/lang/tensor/storage/LogicalDType;Lsk/ainet/lang/tensor/storage/Placement;)Lsk/ainet/lang/tensor/storage/TensorStorage;
+}
+
+public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion {
+	public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
+}
+
+public abstract interface class sk/ainet/lang/tensor/data/Q4_0TensorData : sk/ainet/lang/tensor/data/TensorData {
+	public static final field BLOCK_SIZE I
+	public static final field BYTES_PER_BLOCK I
+	public static final field Companion Lsk/ainet/lang/tensor/data/Q4_0TensorData$Companion;
+	public abstract fun getBlockCount ()I
+	public abstract fun getBlockScale (I)F
+	public abstract fun getCode (II)B
+	public abstract fun getPackedData ()[B
+}
+
+public final class sk/ainet/lang/tensor/data/Q4_0TensorData$Companion {
+	public static final field BLOCK_SIZE I
+	public static final field BYTES_PER_BLOCK I
+}
+
+public final class sk/ainet/lang/tensor/data/Q4_0TensorData$DefaultImpls {
+	public static fun copyToFloatArray (Lsk/ainet/lang/tensor/data/Q4_0TensorData;)[F
+}
+
+public final class sk/ainet/lang/tensor/data/Q4_0TensorDataKt {
+	public static final fun toFloatArray (Lsk/ainet/lang/tensor/data/Q4_0TensorData;)[F
+}
+
 public final class sk/ainet/lang/tensor/data/Q4_KBlockTensorData : sk/ainet/lang/tensor/data/Q4_KTensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage {
 	public static final field Companion Lsk/ainet/lang/tensor/data/Q4_KBlockTensorData$Companion;
 	public fun <init> (Lsk/ainet/lang/tensor/Shape;[B)V
@@ -5143,6 +5192,17 @@ public final class sk/ainet/lang/tensor/storage/TensorEncoding$Opaque : sk/ainet
 	public fun toString ()Ljava/lang/String;
 }
 
+public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q4_0 : sk/ainet/lang/tensor/storage/TensorEncoding {
+	public static final field BLOCK_SIZE I
+	public static final field BYTES_PER_BLOCK I
+	public static final field INSTANCE Lsk/ainet/lang/tensor/storage/TensorEncoding$Q4_0;
+	public fun equals (Ljava/lang/Object;)Z
+	public fun getName ()Ljava/lang/String;
+	public fun hashCode ()I
+	public fun physicalBytes (J)Ljava/lang/Long;
+	public fun toString ()Ljava/lang/String;
+}
+
 public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q4_K : sk/ainet/lang/tensor/storage/TensorEncoding {
 	public static final field BLOCK_SIZE I
 	public static final field BYTES_PER_BLOCK I
diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorData.kt
new file mode 100644
index 00000000..c1923721
--- /dev/null
+++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorData.kt
@@ -0,0 +1,202 @@
+package sk.ainet.lang.tensor.data
+
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.storage.PackedBlockStorage
+import sk.ainet.lang.tensor.storage.TensorEncoding
+import sk.ainet.lang.types.DType
+
+/**
+ * Tensor data interface for the Q4_0 quantized format (older GGML 4-bit).
+ *
+ * Q4_0 block format (32 elements per block, 18 bytes per block):
+ * - 2 bytes: f16 scale (`d`)
+ * - 16 bytes: 32 packed 4-bit codes (2 nibbles per byte)
+ *
+ * Canonical ggml nibble layout (the *split* layout, matching
+ * `sk.ainet.io.gguf.dequant.DequantOps.dequantQ4_0FromBytes`): for the
+ * 16 code bytes `qs[0..15]`, the low nibbles decode elements `0..15` and
+ * the high nibbles decode elements `16..31`:
+ *
+ *   element[j]      = ((qs[j] & 0x0F) - 8) * d   for j ∈ [0, 16)
+ *   element[j + 16] = ((qs[j] >>> 4) - 8) * d
+ *
+ * The `- 8` bias re-centres the unsigned 4-bit code (0..15) onto the signed
+ * range -8..7. This is the layout real GGUF Q4_0 weights are stored in.
+ *
+ * This interface enables direct quantized matmul without full
+ * dequantization, mirroring [Q8_0TensorData].
+ */
+public interface Q4_0TensorData : TensorData<DType, Byte> {
+    /** Number of Q4_0 blocks in the tensor. */
+    public val blockCount: Int
+
+    /** Raw packed bytes of all blocks, [BYTES_PER_BLOCK] bytes per block. */
+    public val packedData: ByteArray
+
+    /** Scale factor (`d`) of block [blockIdx], as an FP32 value. */
+    public fun getBlockScale(blockIdx: Int): Float
+
+    /**
+     * Get the raw unsigned 4-bit code (0..15) for [elementIdx] (0..31)
+     * within block [blockIdx]. The dequantized value is `(code - 8) * scale`.
+     */
+    public fun getCode(blockIdx: Int, elementIdx: Int): Byte
+
+    public companion object {
+        /** Elements per Q4_0 block. */
+        public const val BLOCK_SIZE: Int = 32
+
+        /** Bytes per Q4_0 block (2 bytes scale + 16 bytes packed nibbles). */
+        public const val BYTES_PER_BLOCK: Int = 18
+    }
+}
+
+/**
+ * [Q4_0TensorData] implementation that reads blocks straight out of a
+ * packed byte array. The array is held by reference: [packedData] exposes
+ * it and [set] writes into it.
+ *
+ * Every block occupies 18 bytes:
+ * - bytes [0..1]  : f16 scale, little-endian
+ * - bytes [2..17] : 32 4-bit codes in the split layout described in the
+ *   [Q4_0TensorData] kdoc
+ *
+ * @param initialShape logical shape of the tensor in elements (not blocks)
+ * @param data raw packed block data, at least `blockCount * 18` bytes long
+ */
+public class Q4_0BlockTensorData(
+    initialShape: Shape,
+    private val data: ByteArray
+) : Q4_0TensorData, PackedBlockStorage {
+
+    override val shape: Shape = Shape(initialShape.dimensions.copyOf())
+    private val strides: IntArray = shape.computeStrides()
+    override val packedData: ByteArray get() = data
+
+    override val blockCount: Int = (shape.volume + Q4_0TensorData.BLOCK_SIZE - 1) / Q4_0TensorData.BLOCK_SIZE
+
+    init {
+        val requiredBytes = blockCount * Q4_0TensorData.BYTES_PER_BLOCK
+        require(data.size >= requiredBytes) {
+            "Data size ${data.size} is less than required $requiredBytes bytes for $blockCount blocks"
+        }
+    }
+
+    // PackedBlockStorage implementation
+    override val encoding: TensorEncoding get() = TensorEncoding.Q4_0
+    override val blockSize: Int get() = Q4_0TensorData.BLOCK_SIZE
+
+    /**
+     * Dequantizes block [blockIdx] into [output] from [outputOffset]. Elements beyond the
+     * tensor's volume (partial last block) or beyond the end of [output] are skipped.
+     */
+    override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" }
+        val scale = getBlockScale(blockIdx)
+        val validElements = minOf(Q4_0TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q4_0TensorData.BLOCK_SIZE)
+        val firstCodeByte = blockIdx * Q4_0TensorData.BYTES_PER_BLOCK + 2
+        fun store(element: Int, code: Int) {
+            val target = outputOffset + element
+            if (element < validElements && target < output.size) output[target] = code.toFloat() * scale
+        }
+        repeat(HALF_BLOCK) { j ->
+            val packed = data[firstCodeByte + j].toInt() and 0xFF
+            store(j, (packed and 0x0F) - 8)
+            store(HALF_BLOCK + j, (packed ushr 4) - 8)
+        }
+    }
+
+    override fun getBlockScale(blockIdx: Int): Float {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" }
+        val scaleStart = blockIdx * Q4_0TensorData.BYTES_PER_BLOCK
+        val low = data[scaleStart].toInt() and 0xFF
+        val high = data[scaleStart + 1].toInt() and 0xFF
+        return halfToFloat(low or (high shl 8))
+    }
+
+    override fun getCode(blockIdx: Int, elementIdx: Int): Byte {
+        require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" }
+        require(elementIdx in 0 until Q4_0TensorData.BLOCK_SIZE) { "Element index $elementIdx out of bounds (0..31)" }
+        val packed = data[codeByteOffset(blockIdx, elementIdx)].toInt() and 0xFF
+        val code = if (elementIdx >= HALF_BLOCK) (packed ushr 4) else (packed and 0x0F)
+        return code.toByte()
+    }
+
+    override fun get(vararg indices: Int): Byte {
+        val flat = calcFlatIndex(indices)
+        return getCode(flat / Q4_0TensorData.BLOCK_SIZE, flat % Q4_0TensorData.BLOCK_SIZE)
+    }
+
+    /**
+     * Stores the low 4 bits of [value] as the code of the addressed element;
+     * the other nibble of the shared byte is left untouched.
+     */
+    override fun set(vararg indices: Int, value: Byte) {
+        val flat = calcFlatIndex(indices)
+        val elementIdx = flat % Q4_0TensorData.BLOCK_SIZE
+        val at = codeByteOffset(flat / Q4_0TensorData.BLOCK_SIZE, elementIdx)
+        val nibble = value.toInt() and 0x0F
+        val current = data[at].toInt() and 0xFF
+        val merged = if (elementIdx >= HALF_BLOCK) ((current and 0x0F) or (nibble shl 4)) else ((current and 0xF0) or nibble)
+        data[at] = merged.toByte()
+    }
+
+    /** Index in [data] of the byte holding the code of [elementIdx] (0..31) within block [blockIdx]. */
+    private fun codeByteOffset(blockIdx: Int, elementIdx: Int): Int =
+        blockIdx * Q4_0TensorData.BYTES_PER_BLOCK + 2 + elementIdx % HALF_BLOCK
+
+    private fun calcFlatIndex(indices: IntArray): Int {
+        require(indices.size == shape.dimensions.size) {
+            "Number of indices (${indices.size}) must match tensor dimensions (${shape.dimensions.size})"
+        }
+        return indices.foldIndexed(0) { i, offset, idx ->
+            require(idx in 0 until shape.dimensions[i]) {
+                "Index $idx out of bounds for dimension $i with size ${shape.dimensions[i]}"
+            }
+            offset + idx * strides[i]
+        }
+    }
+
+    public companion object {
+        private const val HALF_BLOCK: Int = Q4_0TensorData.BLOCK_SIZE / 2
+
+        /** Wraps raw packed Q4_0 bytes in a [Q4_0BlockTensorData] of the given [shape]. */
+        public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q4_0BlockTensorData =
+            Q4_0BlockTensorData(shape, bytes)
+
+        /** Widens IEEE-754 half-precision bits (the low 16 bits of [hbits]) to FP32. */
+        internal fun halfToFloat(hbits: Int): Float {
+            val signBit = (hbits and 0x8000) shl 16
+            val exponent = (hbits and 0x7C00) shr 10
+            val fraction = hbits and 0x03FF
+            if (exponent == 31) return Float.fromBits(signBit or (0xFF shl 23) or (fraction shl 13))
+            if (exponent != 0) return Float.fromBits(signBit or ((exponent - 15 + 127) shl 23) or (fraction shl 13))
+            if (fraction == 0) return Float.fromBits(signBit)
+            // Subnormal half: shift the fraction up until its implicit bit appears.
+            var normalized = fraction
+            var power = -14
+            while ((normalized and 0x400) == 0) {
+                normalized = normalized shl 1
+                power--
+            }
+            return Float.fromBits(signBit or ((power + 127) shl 23) or ((normalized and 0x3FF) shl 13))
+        }
+    }
+}
+
+/**
+ * Dequantizes every element to `(code - 8) * scale`, block by block, into a new FloatArray.
+ */
+public fun Q4_0TensorData.toFloatArray(): FloatArray {
+    val total = shape.volume
+    val values = FloatArray(total)
+    repeat(blockCount) { block ->
+        val scale = getBlockScale(block)
+        val start = block * Q4_0TensorData.BLOCK_SIZE
+        val end = start + minOf(Q4_0TensorData.BLOCK_SIZE, total - start)
+        for (at in start until end) {
+            values[at] = (getCode(block, at - start).toInt() - 8).toFloat() * scale
+        }
+    }
+    return values
+}
diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt
index 4a9f745f..bd781a4f 100644
--- a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt
+++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt
@@ -52,6 +52,18 @@ public sealed interface TensorEncoding {
         }
     }
 
+    /** GGML Q4_0 block quantization: every 32 elements occupy one 18-byte block. */
+    public data object Q4_0 : TensorEncoding {
+        public const val BLOCK_SIZE: Int = 32
+        public const val BYTES_PER_BLOCK: Int = 18
+
+        override val name: String get() = "Q4_0"
+
+        /** Bytes needed for [elementCount] elements; a partial trailing block counts as a full block. */
+        override fun physicalBytes(elementCount: Long): Long =
+            ((elementCount + BLOCK_SIZE - 1) / BLOCK_SIZE) * BYTES_PER_BLOCK
+    }
+
     /** GGML Q8_0 block quantization: 32 elements per 34-byte block. */
     public data object Q8_0 : TensorEncoding {
         public const val BLOCK_SIZE: Int = 32
diff --git a/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorDataTest.kt b/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorDataTest.kt
new file mode 100644
index 00000000..a2cc0c33
--- /dev/null
+++ b/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorDataTest.kt
@@ -0,0 +1,95 @@
+package sk.ainet.lang.tensor.data
+
+import sk.ainet.lang.tensor.Shape
+import kotlin.test.Test
+import kotlin.test.assertContentEquals
+import kotlin.test.assertEquals
+
+class Q4_0TensorDataTest {
+
+    /** Packs 32 four-bit codes into 16 bytes: byte j holds code j (low nibble) and code j+16 (high nibble). */
+    private fun packNibbles(codes: IntArray): ByteArray {
+        require(codes.size == 32)
+        return ByteArray(16) { j ->
+            val lo = codes[j] and 0x0F
+            val hi = codes[j + 16] and 0x0F
+            (lo or (hi shl 4)).toByte()
+        }
+    }
+
+    private fun q4Block(scaleLo: Int, scaleHi: Int, codes: IntArray): ByteArray =
+        byteArrayOf(scaleLo.toByte(), scaleHi.toByte()) + packNibbles(codes)
+
+    @Test
+    fun `constants are correct`() {
+        assertEquals(18, Q4_0TensorData.BYTES_PER_BLOCK)
+        assertEquals(32, Q4_0TensorData.BLOCK_SIZE)
+    }
+
+    @Test
+    fun `reads scale from block`() {
+        // f16 1.0 is 0x3C00, stored little-endian as bytes 0x00, 0x3C.
+        val codes = IntArray(32) { 8 }
+        val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), q4Block(0x00, 0x3C, codes))
+        assertEquals(1.0f, tensor.getBlockScale(0), 0.001f)
+    }
+
+    @Test
+    fun `split layout decodes low nibbles to first half and high nibbles to second half`() {
+        // First half counts up 0..15 (low nibbles); second half counts down 15..0 (high nibbles).
+        val codes = IntArray(32) { i -> if (i < 16) i else 31 - i }
+        val bytes = q4Block(0x00, 0x3C, codes) // scale 1.0
+        val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), bytes)
+        for (i in codes.indices) {
+            assertEquals(codes[i].toByte(), tensor.getCode(0, i), "code mismatch at $i")
+        }
+    }
+
+    @Test
+    fun `toFloatArray applies minus-eight bias and scale`() {
+        // scale = 0.5 (f16 0x3800): code 10 → (10-8)*0.5 = 1.0 ; code 6 → (6-8)*0.5 = -1.0 ; code 8 → 0.
+        // Element 0 is the low nibble of byte 0, element 16 its high nibble.
+        val codes = IntArray(32) { i ->
+            when (i) { 0 -> 10; 16 -> 6; else -> 8 }
+        }
+        val bytes = q4Block(0x00, 0x38, codes)
+        val floats = Q4_0BlockTensorData.fromRawBytes(Shape(32), bytes).toFloatArray()
+        assertEquals(1.0f, floats[0], 0.01f)
+        assertEquals(-1.0f, floats[16], 0.01f)
+        assertEquals(0.0f, floats[1], 0.01f)
+    }
+
+    @Test
+    fun `matches canonical ggml dequant for a known block`() {
+        // Mirrors DequantOps.dequantQ4_0FromBytes: out[j]=(lo-8)*d, out[j+16]=(hi-8)*d.
+        val codes = IntArray(32) { i -> (i * 7 + 3) and 0x0F } // arbitrary 0..15 pattern
+        val expected = FloatArray(32) { i -> (codes[i] - 8).toFloat() } // scale is 1.0
+        val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), q4Block(0x00, 0x3C, codes))
+        val floats = tensor.toFloatArray()
+        for (i in 0 until 32) {
+            assertEquals(expected[i], floats[i], 0.001f, "dequant mismatch at $i")
+        }
+    }
+
+    @Test
+    fun `set round-trips through nibble packing`() {
+        val bytes = q4Block(0x00, 0x3C, IntArray(32) { 8 })
+        val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), bytes)
+        tensor[3] = 5      // low nibble of byte 3
+        tensor[19] = 12    // high nibble of byte 3 (19-16=3)
+        assertEquals(5.toByte(), tensor[3])
+        assertEquals(12.toByte(), tensor[19])
+    }
+
+    @Test
+    fun `handles multiple blocks and 2D shape`() {
+        val unitScale = q4Block(0x00, 0x3C, IntArray(32) { 8 })    // scale 1.0
+        val doubleScale = q4Block(0x00, 0x40, IntArray(32) { 9 })  // scale 2.0, code 9
+        val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(8, 8), unitScale + doubleScale)
+        assertEquals(2, tensor.blockCount)
+        assertContentEquals(intArrayOf(8, 8), tensor.shape.dimensions)
+        assertEquals(1.0f, tensor.getBlockScale(0), 0.001f)
+        assertEquals(2.0f, tensor.getBlockScale(1), 0.001f)
+        assertEquals(9.toByte(), tensor.getCode(1, 0))
+    }
+}