From 01010416b5d19f890b92e063d54067a42654d965 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Wed, 10 Jun 2026 23:13:55 +0200 Subject: [PATCH 1/4] feat(backend): first-class Q5_K packed matmul + ARM NEON kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds Q5_K as a packed in-kernel dequant-matmul format (previously Q5_K was only eagerly decoded to FP32 on load), mirroring the existing Q4_K plumbing, and hand-written ARM NEON paths for the native CPU kernels. Q5_K (256-elt / 176-byte super-block: d, dMin, 12 packed scales, 32-byte qh high-bit plane, 128-byte qs low nibbles; 5-bit code = lowNibble | (5th<<4)): - TensorEncoding.Q5_K; Q5_KTensorData / Q5_KBlockTensorData (5th-bit fold). - Q5KMatmulKernel SPI + matmulQ5K()/"Q5_K" in KernelProvider.supports(). - ScalarQ5_KMatmulKernel (commonMain/KN), PanamaVectorQ5_KMatmulKernel (JVM), native C skainet_q5k_matmul + NativeQ5KMatmulKernel (FFM); all registered. - DefaultCpuOps matmul dispatch + lazy-transpose branches. - StreamingGgufParametersLoader: Q5_K + Q6_K packed branches (a Q5_K_M GGUF now loads end-to-end instead of SKIP'ing most tensors). Tests: Q5_KBlockTensorData bit-exact vs DequantOps golden across blocks; native<->Panama<->scalar matmul parity; KernelSupportMatrixTest gate updated. ARM NEON (behind #if __ARM_NEON in skainet_simd.h; x86 keeps the scalar fallback, re-verified green): - fp32 (broadcast+vfmaq), q8_0 (widen int8->f32+vfmaq), q4k/q5k (nibble unpack + dual code/input accumulators; q5k folds the qh 5th bit via a runtime-count vshlq_u8). - CMake aarch64 branch: -march=armv8.2-a+fp16+dotprod (no +i8mm — A55 lacks it). Cross toolchain-aarch64.cmake + opt-in -PcrossArm64 gradle tasks; default x86 build unaffected. BOARD-VERIFY-PENDING: the NEON paths are aarch64-syntax-validated (clang --target=aarch64) but not executed (x86 host, no QEMU). Run the parity tests under qemu-aarch64 or on the SL2610 before relying on them. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../backend/api/kernel/KernelProvider.kt | 7 + .../backend/api/kernel/Q5KMatmulKernel.kt | 59 ++++ .../ainet/exec/kernel/ScalarKernelProvider.kt | 2 + .../exec/kernel/ScalarQ5_KMatmulKernel.kt | 91 +++++ .../sk/ainet/exec/tensor/ops/DefaultCpuOps.kt | 5 + .../exec/kernel/PanamaVectorKernelProvider.kt | 4 + .../kernel/PanamaVectorQ5_KMatmulKernel.kt | 193 +++++++++++ .../PanamaVectorQ5_KMatmulKernelParityTest.kt | 71 ++++ .../build.gradle.kts | 60 ++++ .../native/CMakeLists.txt | 12 + .../native/include/skainet_kernels.h | 25 ++ .../native/include/skainet_simd.h | 51 +++ .../native/src/fp32_matmul.c | 14 + .../native/src/q4k_matmul.c | 26 ++ .../native/src/q5k_matmul.c | 202 +++++++++++ .../native/src/q8_0_matmul.c | 21 ++ .../native/toolchain-aarch64.cmake | 28 ++ .../ainet/exec/kernel/NativeKernelProvider.kt | 4 + .../exec/kernel/NativeQ5KMatmulKernel.kt | 91 +++++ .../exec/kernel/KernelSupportMatrixTest.kt | 8 +- .../kernel/NativeQ5KMatmulKernelParityTest.kt | 94 +++++ .../io/gguf/StreamingGgufParametersLoader.kt | 14 + .../io/gguf/Q5KBlockTensorDataParityTest.kt | 115 +++++++ .../ainet/lang/tensor/data/Q5_KTensorData.kt | 320 ++++++++++++++++++ .../lang/tensor/storage/TensorEncoding.kt | 12 + 25 files changed, 1525 insertions(+), 4 deletions(-) create mode 100644 skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt create mode 100644 skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt create mode 100644 skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt create mode 100644 skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt create mode 100644 skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h create mode 100644 skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c create mode 100644 skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake create mode 100644 skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt create mode 100644 skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt create mode 100644 skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt create mode 100644 skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt index 09d99a9c..ad475190 100644 --- a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt +++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt @@ -79,6 +79,12 @@ public interface KernelProvider { */ public fun matmulQ6K(): Q6KMatmulKernel? = null + /** + * F32 × Q5_K matmul kernel exposed by this provider, or `null` if + * this provider does not specialize Q5_K. Same fall-through pattern. + */ + public fun matmulQ5K(): Q5KMatmulKernel? = null + /** * F32 × Q5_1 matmul kernel exposed by this provider, or `null` if * this provider does not specialize Q5_1. Same fall-through pattern. @@ -126,6 +132,7 @@ public interface KernelProvider { "Q8_0" -> matmulQ8_0() != null "Q4_0" -> matmulQ4_0() != null "Q6_K" -> matmulQ6K() != null + "Q5_K" -> matmulQ5K() != null "Q5_1" -> matmulQ5_1() != null "Q5_0" -> matmulQ5_0() != null else -> false diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt new file mode 100644 index 00000000..54398e22 --- /dev/null +++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q5KMatmulKernel.kt @@ -0,0 +1,59 @@ +package sk.ainet.backend.api.kernel + +/** + * F32 input × Q5_K-packed weights matrix-vector multiply, in canonical + * ggml super-block layout. + * + * output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j]) + * for j ∈ [0, inputDim), o ∈ [0, outputDim) + * + * Block layout (256-element super-block, 176 bytes/block; see + * [sk.ainet.lang.tensor.data.Q5_KTensorData] kdoc for the byte map): + * - bytes 0..1 : `d` (super-block scale, FP16 LE) + * - bytes 2..3 : `dMin` (super-block min-scale, FP16 LE) + * - bytes 4..15 : 12 bytes of packed (6-bit scaleIdx, 6-bit minIdx) for + * 8 sub-blocks via ggml's `get_scale_min_k4` mixing + * (identical to Q4_K) + * - bytes 16..47 : 32 bytes `qh` high-bit plane (the 5th bit of each code) + * - bytes 48..175: 128 bytes of 4-bit low nibbles, *strided* in 4 groups of + * 32 bytes (identical layout to Q4_K's `qs`) + * + * Per sub-block s ∈ 0..7: + * `scale[s] = d * scaleIdx[s]` + * `offset[s] = dMin * minIdx[s]` + * per element: `code = lowNibble | (fifthBit << 4)` (0..31); + * `dequant = code * scale[s] - offset[s]` + * + * The lazy-`dmin` accumulation trick (used by every well-tuned K-quant + * kernel including ggml's reference) avoids subtracting `offset` per + * element by tracking `Σ(input · code)` and `Σ(input)` per sub-block + * and combining as `scale * codeSum − offset * inputSum` once. + * + * Implementations MUST NOT mutate `input` or `weight`. They MAY assume + * the arrays do not alias each other or `output`. They MUST fully + * write the `outputDim` floats starting at `output[outputOffset]`. + * + * Packed-weight row-major contract: `weight` holds blocks laid out + * `(blockIdx * outputDim + o) * 176` for output row `o` and input + * block index `blockIdx`. This matches `Q5_KBlockTensorData.packedData`. + * + * `inputDim` MUST be a multiple of 256 (the Q5_K block size). + */ +public interface Q5KMatmulKernel { + /** + * @param input FP32 input vector (single row). + * @param inputOffset element offset into [input] where the row starts. + * @param weight packed Q5_K bytes for the full `outputDim × inputDim` weight tensor. + * @param weightByteOffset byte offset into [weight] where block (0, 0) starts. + * @param inputDim contraction dimension (must be a multiple of 256). + * @param outputDim number of output cells. + * @param output FP32 output vector. + * @param outputOffset element offset into [output] where the row starts. + */ + public fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) +} diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt index 0611ce76..31a8386b 100644 --- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt @@ -6,6 +6,7 @@ import sk.ainet.backend.api.kernel.KernelProvider import sk.ainet.backend.api.kernel.Q4KMatmulKernel import sk.ainet.backend.api.kernel.Q4_0MatmulKernel import sk.ainet.backend.api.kernel.Q5_0MatmulKernel +import sk.ainet.backend.api.kernel.Q5KMatmulKernel import sk.ainet.backend.api.kernel.Q5_1MatmulKernel import sk.ainet.backend.api.kernel.Q6KMatmulKernel import sk.ainet.backend.api.kernel.Q8_0MatmulKernel @@ -33,6 +34,7 @@ public object ScalarKernelProvider : KernelProvider { override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel override fun matmulQ4K(): Q4KMatmulKernel = ScalarQ4_KMatmulKernel override fun matmulQ6K(): Q6KMatmulKernel = ScalarQ6_KMatmulKernel + override fun matmulQ5K(): Q5KMatmulKernel = ScalarQ5_KMatmulKernel override fun matmulQ5_1(): Q5_1MatmulKernel = ScalarQ5_1MatmulKernel override fun matmulQ5_0(): Q5_0MatmulKernel = ScalarQ5_0MatmulKernel } diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt new file mode 100644 index 00000000..63daf5e0 --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ5_KMatmulKernel.kt @@ -0,0 +1,91 @@ +package sk.ainet.exec.kernel + +import sk.ainet.backend.api.kernel.Q5KMatmulKernel + +/** + * Scalar reference [Q5KMatmulKernel] — commonMain, so Q5_K packed matmul works + * on Kotlin/Native / JS / WASM, not only the JVM SIMD path. + * + * Q5_K super-block: 256 elements / 176 bytes, block-major `(blockIdx*outputDim+o)*176`: + * `d`(f16) `dMin`(f16) 12 scale bytes (ggml `get_scale_min_k4` packing) 32 `qh` + * high-bit bytes 128 `qs` low-nibble bytes. Each of the 8 sub-blocks (32 elts) + * contributes `codeSum*scale - inputSum*offset`, with `scale = d*scaleIdx`, + * `offset = dMin*minIdx`, and the 5-bit `code = lowNibble | (fifthBit << 4)`. + * Math mirrors `DequantOps.dequantQ5KFromBytes` and the Q4_K kernel (only the + * 5th-bit fold differs). + */ +public object ScalarQ5_KMatmulKernel : Q5KMatmulKernel { + + private const val BLOCK_SIZE = 256 + private const val SUB_BLOCK = 32 + private const val BYTES_PER_BLOCK = 176 + private const val QH_OFFSET = 16 + private const val QS_OFFSET = 48 + + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "ScalarQ5_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0) return + if (inputDim == 0) { for (o in 0 until outputDim) output[outputOffset + o] = 0f; return } + val blocksPerInputDim = inputDim / BLOCK_SIZE + val scaleIdx = IntArray(8) + val minIdx = IntArray(8) + + for (o in 0 until outputDim) { + var acc = 0f + for (blockIdx in 0 until blocksPerInputDim) { + val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK + val d = decodeHalf(((weight[blockBase + 1].toInt() and 0xFF) shl 8) or (weight[blockBase].toInt() and 0xFF)) + val dMin = decodeHalf(((weight[blockBase + 3].toInt() and 0xFF) shl 8) or (weight[blockBase + 2].toInt() and 0xFF)) + + // ggml get_scale_min_k4 over the 12 scale bytes (identical to Q4_K). + val sc = blockBase + 4 + for (sb in 0 until 4) { + scaleIdx[sb] = weight[sc + sb].toInt() and 0x3F + minIdx[sb] = weight[sc + sb + 4].toInt() and 0x3F + } + for (sb in 4 until 8) { + val low4S = weight[sc + sb + 4].toInt() and 0x0F + val high2S = (weight[sc + sb - 4].toInt() and 0xFF) ushr 6 + scaleIdx[sb] = low4S or (high2S shl 4) + val low4M = (weight[sc + sb + 4].toInt() and 0xFF) ushr 4 + val high2M = (weight[sc + sb].toInt() and 0xFF) ushr 6 + minIdx[sb] = low4M or (high2M shl 4) + } + + val qhBase = blockBase + QH_OFFSET + val qsBase = blockBase + QS_OFFSET + val inBlockBase = inputOffset + blockIdx * BLOCK_SIZE + for (groupJ in 0 until 4) { + val qsRegion = qsBase + groupJ * 32 + // sub-block lo (low nibbles) then hi (high nibbles) of the same 32 bytes; + // the 5th bit comes from qh[i], bit (2*groupJ + half). + for (half in 0 until 2) { + val sb = 2 * groupJ + half + val bit = 2 * groupJ + half + val inStart = inBlockBase + sb * SUB_BLOCK + var codeSum = 0f + var inputSum = 0f + for (i in 0 until 32) { + val b = weight[qsRegion + i].toInt() and 0xFF + val low = if (half == 0) (b and 0x0F) else (b ushr 4) + val fifth = ((weight[qhBase + i].toInt() and 0xFF) ushr bit) and 0x01 + val code = low or (fifth shl 4) + val v = input[inStart + i] + codeSum += v * code + inputSum += v + } + acc += codeSum * (d * scaleIdx[sb]) - inputSum * (dMin * minIdx[sb]) + } + } + } + output[outputOffset + o] = acc + } + } +} diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt index 1a45ae7d..d61eb889 100644 --- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt @@ -18,6 +18,8 @@ import sk.ainet.lang.tensor.data.Q4_KTensorData import sk.ainet.lang.tensor.data.Q4_KBlockTensorData import sk.ainet.lang.tensor.data.Q6_KTensorData import sk.ainet.lang.tensor.data.Q6_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_KTensorData +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData import sk.ainet.lang.tensor.data.Q5_1TensorData import sk.ainet.lang.tensor.data.Q5_1BlockTensorData import sk.ainet.lang.tensor.data.Q5_0TensorData @@ -333,6 +335,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory private val q4_0Kernel by lazy { resolveProvider { it.matmulQ4_0() != null }?.matmulQ4_0() } private val q4kKernel by lazy { resolveProvider { it.matmulQ4K() != null }?.matmulQ4K() } private val q6kKernel by lazy { resolveProvider { it.matmulQ6K() != null }?.matmulQ6K() } + private val q5kKernel by lazy { resolveProvider { it.matmulQ5K() != null }?.matmulQ5K() } private val q5_1Kernel by lazy { resolveProvider { it.matmulQ5_1() != null }?.matmulQ5_1() } private val q5_0Kernel by lazy { resolveProvider { it.matmulQ5_0() != null }?.matmulQ5_0() } @@ -367,6 +370,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory is Q5_1TensorData -> q5_1Kernel?.let { k -> run(bd.packedData, k::matmul) } is Q5_0TensorData -> q5_0Kernel?.let { k -> run(bd.packedData, k::matmul) } is Q4_KTensorData -> q4kKernel?.let { k -> run(bd.packedData, k::matmul) } + is Q5_KTensorData -> q5kKernel?.let { k -> run(bd.packedData, k::matmul) } is Q6_KTensorData -> q6kKernel?.let { k -> run(bd.packedData, k::matmul) } is Q8_0TensorData -> q8_0Kernel?.let { k -> run(bd.packedData, k::matmul) } is Q4_0TensorData -> q4_0Kernel?.let { k -> run(bd.packedData, k::matmul) } @@ -598,6 +602,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory @Suppress("UNCHECKED_CAST") when (val d = tensor.data) { is Q4_KTensorData -> return newTensor(Q4_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + is Q5_KTensorData -> return newTensor(Q5_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt index 8aee59e2..e9bb5b59 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt @@ -5,6 +5,7 @@ import sk.ainet.backend.api.kernel.Fp32MatmulKernel import sk.ainet.backend.api.kernel.KernelProvider import sk.ainet.backend.api.kernel.Q4KMatmulKernel import sk.ainet.backend.api.kernel.Q4_0MatmulKernel +import sk.ainet.backend.api.kernel.Q5KMatmulKernel import sk.ainet.backend.api.kernel.Q5_0MatmulKernel import sk.ainet.backend.api.kernel.Q5_1MatmulKernel import sk.ainet.backend.api.kernel.Q6KMatmulKernel @@ -65,6 +66,9 @@ public object PanamaVectorKernelProvider : KernelProvider { override fun matmulQ6K(): Q6KMatmulKernel? = if (isAvailable()) PanamaVectorQ6_KMatmulKernel else null + override fun matmulQ5K(): Q5KMatmulKernel? = + if (isAvailable()) PanamaVectorQ5_KMatmulKernel else null + private fun isVectorApiClassLoaded(): Boolean = runCatching { Class.forName("jdk.incubator.vector.FloatVector") Class.forName("jdk.incubator.vector.VectorSpecies") diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt new file mode 100644 index 00000000..5302a471 --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernel.kt @@ -0,0 +1,193 @@ +package sk.ainet.exec.kernel + +import jdk.incubator.vector.ByteVector +import jdk.incubator.vector.FloatVector +import jdk.incubator.vector.VectorOperators +import jdk.incubator.vector.VectorSpecies +import sk.ainet.backend.api.kernel.Q5KMatmulKernel +import sk.ainet.exec.tensor.ops.parallelChunks + +/** + * SIMD-vectorized Q5_K matmul on the JDK Vector API. + * + * Identical pipeline to [PanamaVectorQ4KMatmulKernel], with one extra step: + * each code is 5-bit, so the low nibble (from `qs`) is OR'd with the 5th bit + * pulled from the `qh` high-bit plane. For super-block group `j` (0..3), the + * low sub-block uses `qh` bit `2j` and the high sub-block uses bit `2j + 1`, + * with `qh` indexed by the intra-group position (same `idx` as `qs`). See + * [sk.ainet.lang.tensor.data.Q5_KTensorData] / `DequantOps.dequantQ5KFromBytes`. + * + * Numerical equivalence with the scalar reference is within FMA + + * reordered-reduction tolerance; verified via parity tests. + */ +public object PanamaVectorQ5_KMatmulKernel : Q5KMatmulKernel { + + private const val BLOCK_SIZE = 256 + private const val SUB_BLOCK_SIZE = 32 + private const val SUB_BLOCKS_PER_BLOCK = 8 + private const val BYTES_PER_BLOCK = 176 + private const val QH_OFFSET = 16 + private const val QS_OFFSET = 48 + + private val floatSpecies: VectorSpecies = FloatVector.SPECIES_PREFERRED + + private val byteSpeciesForFloat: VectorSpecies = when (floatSpecies.length()) { + 16 -> ByteVector.SPECIES_128 + else -> ByteVector.SPECIES_64 // covers 4-wide (NEON) and 8-wide (AVX2) + } + + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "PanamaVectorQ5_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0 || inputDim == 0) return + val blocksPerInputDim = inputDim / BLOCK_SIZE + + parallelChunks(outputDim) { startO, endO -> + val scaleIdx = IntArray(SUB_BLOCKS_PER_BLOCK) + val minIdx = IntArray(SUB_BLOCKS_PER_BLOCK) + for (o in startO until endO) { + var acc = 0f + for (blockIdx in 0 until blocksPerInputDim) { + val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK + + // d, dMin (FP16 LE). + val dBits = (weight[blockBase + 1].toInt() and 0xFF shl 8) or + (weight[blockBase].toInt() and 0xFF) + val dMinBits = (weight[blockBase + 3].toInt() and 0xFF shl 8) or + (weight[blockBase + 2].toInt() and 0xFF) + val d = halfToFloat(dBits) + val dMin = halfToFloat(dMinBits) + + // Sub-scale decode via ggml `get_scale_min_k4` (identical to Q4_K). + val scalesOffset = blockBase + 4 + for (sb in 0 until 4) { + scaleIdx[sb] = weight[scalesOffset + sb].toInt() and 0x3F + minIdx[sb] = weight[scalesOffset + sb + 4].toInt() and 0x3F + } + for (sb in 4 until 8) { + val low4S = weight[scalesOffset + sb + 4].toInt() and 0x0F + val high2S = (weight[scalesOffset + sb - 4].toInt() and 0xFF) ushr 6 + scaleIdx[sb] = low4S or (high2S shl 4) + val low4M = (weight[scalesOffset + sb + 4].toInt() and 0xFF) ushr 4 + val high2M = (weight[scalesOffset + sb].toInt() and 0xFF) ushr 6 + minIdx[sb] = low4M or (high2M shl 4) + } + + // 4 strided qs groups; each carries sbLo (lo nibbles) and sbHi (hi nibbles). + // The 5th bit comes from qh[idx], bit (2*groupJ) for lo, (2*groupJ+1) for hi. + val qhOffset = blockBase + QH_OFFSET + val qsOffset = blockBase + QS_OFFSET + val inputBlockBase = inputOffset + blockIdx * BLOCK_SIZE + for (groupJ in 0 until 4) { + val qsRegion = qsOffset + groupJ * 32 + val sbLo = 2 * groupJ + val sbHi = sbLo + 1 + val bitLo = (2 * groupJ).toByte() + val bitHi = (2 * groupJ + 1).toByte() + val inputStartLo = inputBlockBase + sbLo * SUB_BLOCK_SIZE + val inputStartHi = inputStartLo + SUB_BLOCK_SIZE + + var codeAccLo = FloatVector.zero(floatSpecies) + var inputAccLo = FloatVector.zero(floatSpecies) + var codeAccHi = FloatVector.zero(floatSpecies) + var inputAccHi = FloatVector.zero(floatSpecies) + + val floatStep = floatSpecies.length() + val byteLoadLen = byteSpeciesForFloat.length() + var idx = 0 + + // SIMD body — single qs + qh load feeds both nibble vectors. + while (idx + floatStep <= SUB_BLOCK_SIZE && + qsRegion + idx + byteLoadLen <= weight.size + ) { + val inVecLo = FloatVector.fromArray(floatSpecies, input, inputStartLo + idx) + val inVecHi = FloatVector.fromArray(floatSpecies, input, inputStartHi + idx) + val byteVec = ByteVector.fromArray(byteSpeciesForFloat, weight, qsRegion + idx) + val qhVec = ByteVector.fromArray(byteSpeciesForFloat, weight, qhOffset + idx) + val loNib = byteVec.and(0x0F.toByte()) + val hiNib = byteVec.lanewise(VectorOperators.LSHR, 4.toByte()) + val fifthLo = qhVec.lanewise(VectorOperators.LSHR, bitLo) + .and(0x01.toByte()).lanewise(VectorOperators.LSHL, 4.toByte()) + val fifthHi = qhVec.lanewise(VectorOperators.LSHR, bitHi) + .and(0x01.toByte()).lanewise(VectorOperators.LSHL, 4.toByte()) + val codeLoBytes = loNib.lanewise(VectorOperators.OR, fifthLo) + val codeHiBytes = hiNib.lanewise(VectorOperators.OR, fifthHi) + val codeVecLo = codeLoBytes.castShape(floatSpecies, 0) as FloatVector + val codeVecHi = codeHiBytes.castShape(floatSpecies, 0) as FloatVector + codeAccLo = inVecLo.fma(codeVecLo, codeAccLo) + inputAccLo = inVecLo.add(inputAccLo) + codeAccHi = inVecHi.fma(codeVecHi, codeAccHi) + inputAccHi = inVecHi.add(inputAccHi) + idx += floatStep + } + + var codeSumLo = codeAccLo.reduceLanes(VectorOperators.ADD) + var inputSumLo = inputAccLo.reduceLanes(VectorOperators.ADD) + var codeSumHi = codeAccHi.reduceLanes(VectorOperators.ADD) + var inputSumHi = inputAccHi.reduceLanes(VectorOperators.ADD) + + // Scalar tail — only fires if floatSpecies.length() doesn't divide 32 (rare). + while (idx < SUB_BLOCK_SIZE) { + val byte = weight[qsRegion + idx].toInt() and 0xFF + val qh = weight[qhOffset + idx].toInt() and 0xFF + val codeLo = ((byte and 0x0F) or (((qh ushr bitLo.toInt()) and 0x01) shl 4)).toFloat() + val codeHi = ((byte ushr 4) or (((qh ushr bitHi.toInt()) and 0x01) shl 4)).toFloat() + val vLo = input[inputStartLo + idx] + val vHi = input[inputStartHi + idx] + codeSumLo += vLo * codeLo + inputSumLo += vLo + codeSumHi += vHi * codeHi + inputSumHi += vHi + idx++ + } + + val scaleLo = d * scaleIdx[sbLo] + val offsetLo = dMin * minIdx[sbLo] + val scaleHi = d * scaleIdx[sbHi] + val offsetHi = dMin * minIdx[sbHi] + acc += codeSumLo * scaleLo - inputSumLo * offsetLo + acc += codeSumHi * scaleHi - inputSumHi * offsetHi + } + } + output[outputOffset + o] = acc + } + } + } + + /** + * IEEE 754 binary16 → binary32 conversion. Mirrors the helper in + * [PanamaVectorQ4KMatmulKernel]. + */ + private fun halfToFloat(hbits: Int): Float { + val sign = (hbits ushr 15) and 0x1 + val exp = (hbits ushr 10) and 0x1F + val frac = hbits and 0x3FF + return when { + exp == 0 -> { + if (frac == 0) { + if (sign == 0) 0.0f else -0.0f + } else { + val f = frac / 1024.0f * (1.0f / 16384.0f) + if (sign == 0) f else -f + } + } + exp == 0x1F -> { + if (frac == 0) { + if (sign == 0) Float.POSITIVE_INFINITY else Float.NEGATIVE_INFINITY + } else { + Float.NaN + } + } + else -> { + val bits = (sign shl 31) or ((exp - 15 + 127) shl 23) or (frac shl 13) + Float.fromBits(bits) + } + } + } +} diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt new file mode 100644 index 00000000..cd269d7d --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_KMatmulKernelParityTest.kt @@ -0,0 +1,71 @@ +package sk.ainet.exec.kernel + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.Test +import kotlin.test.assertTrue + +/** + * Numerical parity for [PanamaVectorQ5_KMatmulKernel] against + * [ScalarQ5_KMatmulKernel] — the commonMain reference. Both share the + * canonical Q5_K layout (176-byte block, `qh` 5th-bit plane) and the + * lazy-`dmin` accumulation, so outputs must agree within FMA + + * reordered-reduction tolerance. + * + * Fixture: random Q5_K bytes with `d`/`dMin` clamped to `1.0f16` (no + * NaN/Inf), packed input-block-major `(blockIdx * outputDim + o) * 176`. + * The random `qh` bytes exercise the full 5-bit code range. + */ +class PanamaVectorQ5_KMatmulKernelParityTest { + + private val blockSize = 256 + private val bytesPerBlock = 176 + + private fun randomQ5KBytes(numBlocks: Int, seed: Int): ByteArray { + val rng = Random(seed) + val bytes = ByteArray(numBlocks * bytesPerBlock) + rng.nextBytes(bytes) + for (block in 0 until numBlocks) { + val base = block * bytesPerBlock + // 0x3C00 == 1.0f16. Force d = dMin = 1.0f16 so dequant stays finite. + bytes[base + 0] = 0x00.toByte() + bytes[base + 1] = 0x3C.toByte() + bytes[base + 2] = 0x00.toByte() + bytes[base + 3] = 0x3C.toByte() + } + return bytes + } + + private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) { + val numBlocks = (inputDim / blockSize) * outputDim + val packed = randomQ5KBytes(numBlocks, seed) + val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f } + + val refOut = FloatArray(outputDim) + ScalarQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0) + + val vecOut = FloatArray(outputDim) + PanamaVectorQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, vecOut, 0) + + for (o in 0 until outputDim) { + val diff = abs(refOut[o] - vecOut[o]) + val rel = diff / (abs(refOut[o]) + 1e-9f) + assertTrue( + diff <= tol || rel < 1e-4f, + "row $o diverged: scalar=${refOut[o]} panama=${vecOut[o]} diff=$diff rel=$rel tol=$tol", + ) + } + } + + @Test + fun single_block_single_row() = assertParity(256, 1, 42, 1e-2f) + + @Test + fun single_block_multi_row() = assertParity(256, 16, 7, 1e-2f) + + @Test + fun multi_block_multi_row() = assertParity(1024, 64, 123, 5e-2f) + + @Test + fun llm_typical_shape() = assertParity(4096, 64, 999, 5e-1f) +} diff --git a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts index 9efc6865..6ab0ddd9 100644 --- a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts +++ b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts @@ -105,12 +105,72 @@ val packageNativeKernels by tasks.registering(Copy::class) { into(nativeResourceTargetDir) } +// --- Cross-compile to aarch64 (opt-in) ------------------------------------- +// +// Produces native/linux-arm64/libskainet_kernels.so with the NEON paths +// (CMAKE_SYSTEM_PROCESSOR=aarch64 -> -march=armv8.2-a+fp16+dotprod). Gated on +// `-PcrossArm64=true` so the default host build is unaffected on machines +// without the `gcc-aarch64-linux-gnu` cross toolchain. The board build / CI +// host opts in. NativeLibraryLoader resolves native/linux-arm64/ from os.arch +// at runtime, so the consuming side needs no change once this .so is bundled. +// +// BOARD-VERIFY-PENDING: the NEON code is syntax-validated for aarch64 but has +// not been executed; run the parity tests under QEMU or on the SL2610. +val crossArm64Enabled: Boolean = (findProperty("crossArm64") as String?)?.toBoolean() == true +val aarch64Cc: String = (findProperty("skainetAarch64Cc") as String?) ?: "aarch64-linux-gnu-gcc" +val cmakeBuildArm64Path: String = layout.buildDirectory.dir("native/cmake-build-arm64").get().asFile.absolutePath +val nativeResourceArm64Dir = nativeResourcesRoot.map { it.dir("native/linux-arm64") } +val toolchainFilePath = "$nativeSourcePath/toolchain-aarch64.cmake" + +val configureNativeKernelsArm64 by tasks.registering(Exec::class) { + group = "build" + description = "CMake configure for the aarch64 (NEON) native kernels (cross-compile)." + onlyIf { crossArm64Enabled } + inputs.file("$nativeSourcePath/CMakeLists.txt") + inputs.dir("$nativeSourcePath/src") + inputs.dir("$nativeSourcePath/include") + outputs.dir(cmakeBuildArm64Path) + commandLine = listOf( + "cmake", + "-S", nativeSourcePath, + "-B", cmakeBuildArm64Path, + "-DCMAKE_BUILD_TYPE=Release", + "-DCMAKE_TOOLCHAIN_FILE=$toolchainFilePath", + "-DSKAINET_AARCH64_CC=$aarch64Cc", + ) +} + +val buildNativeKernelsArm64 by tasks.registering(Exec::class) { + group = "build" + description = "Cross-build the aarch64 (NEON) native kernels shared library." + onlyIf { crossArm64Enabled } + dependsOn(configureNativeKernelsArm64) + inputs.file("$nativeSourcePath/CMakeLists.txt") + inputs.dir("$nativeSourcePath/src") + inputs.dir("$nativeSourcePath/include") + outputs.dir(cmakeBuildArm64Path) + commandLine = listOf("cmake", "--build", cmakeBuildArm64Path, "--config", "Release") +} + +val packageNativeKernelsArm64 by tasks.registering(Copy::class) { + group = "build" + description = "Stage the cross-built aarch64 native kernels into JVM resources." + onlyIf { crossArm64Enabled } + dependsOn(buildNativeKernelsArm64) + from(cmakeBuildArm64Path) { + include("libskainet_kernels.so") + eachFile { path = name } + } + into(nativeResourceArm64Dir) +} + kotlin.sourceSets.named("jvmMain") { resources.srcDir(nativeResourcesRoot) } tasks.named("jvmProcessResources") { dependsOn(packageNativeKernels) + if (crossArm64Enabled) dependsOn(packageNativeKernelsArm64) } // Forward `-Dskainet.runBench=true` from Gradle CLI to the forked test diff --git a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt index ade06d41..f26c125c 100644 --- a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt +++ b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt @@ -12,6 +12,7 @@ endif() add_library(skainet_kernels SHARED src/skainet_smoke.c src/q4k_matmul.c + src/q5k_matmul.c src/fp32_matmul.c src/bf16_matmul.c src/q8_0_matmul.c @@ -41,6 +42,17 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") -ffast-math -funroll-loops ) + # AArch64: enable the NEON paths guarded by __ARM_NEON in skainet_simd.h. + # The SL2610 is Cortex-A55-class (ARMv8.2-A) — it HAS NEON + fp16 + dotprod + # (asimddp) but NOT i8mm (that is ARMv8.6). Do NOT add +i8mm: it would + # SIGILL on the board. CONFIRM on-device first: `grep Features /proc/cpuinfo` + # must list `asimddp` and `fphp`/`asimdhp`. Explicit -march (not -mcpu=native) + # because this is a cross-compile from an x86 host. + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + target_compile_options(skainet_kernels PRIVATE + -march=armv8.2-a+fp16+dotprod + ) + endif() set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden) elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC") target_compile_options(skainet_kernels PRIVATE diff --git a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h index a0fa3ff7..167fc80b 100644 --- a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h +++ b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h @@ -60,6 +60,31 @@ SKAINET_API void skainet_q4k_matmul( int32_t output_offset ); +/* + * Q5_K matrix-vector multiply. + * + * output[output_offset + o] = sum_j input[input_offset + j] * + * dequant(weight[block, o, j]) + * + * Block layout: canonical ggml Q5_K, 256 elements per super-block, 176 + * bytes per block (2 B d + 2 B dMin + 12 B packed scales + 32 B `qh` + * high-bit plane + 128 B `qs` low nibbles). Each 5-bit code is + * `lowNibble | (fifthBit << 4)`. Packed weights laid out as + * weight + weight_byte_offset + (block_idx * output_dim + o) * 176 + * + * input_dim must be a multiple of 256. + */ +SKAINET_API void skainet_q5k_matmul( + const float* input, + int32_t input_offset, + const uint8_t* weight, + int32_t weight_byte_offset, + int32_t input_dim, + int32_t output_dim, + float* output, + int32_t output_offset +); + /* * Row-major FP32 SGEMM: C(m, n) = A(m, k) * B(k, n). * diff --git a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h new file mode 100644 index 00000000..dd2ec25b --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_simd.h @@ -0,0 +1,51 @@ +#ifndef SKAINET_SIMD_H +#define SKAINET_SIMD_H + +/* + * Compile-time SIMD capability detection for the native CPU kernels. + * + * The kernels keep their portable scalar bodies as the `#else` fallback, + * so x86_64 (which auto-vectorizes well under -O3 -ffast-math) and any + * pre-ARMv8.2 target keep compiling unchanged. The NEON paths are only + * taken when the compiler advertises `__ARM_NEON` (AArch64 always does + * with the right -march). `__ARM_FEATURE_DOTPROD` / `__ARM_FEATURE_MATMUL_INT8` + * are gated on the build flags (`-march=armv8.2-a+dotprod`, etc.). + * + * BOARD-VERIFY-PENDING: the NEON paths in this tree compile to the scalar + * fallback on the x86 build host and have NOT been executed on aarch64. + * They must be built with the cross toolchain and bit-exact-checked under + * QEMU or on the SL2610 before being relied on. + */ + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +# include +# define SKAINET_HAVE_NEON 1 +#endif + +#if defined(__ARM_FEATURE_DOTPROD) +# define SKAINET_HAVE_DOTPROD 1 +#endif + +#if defined(__ARM_FEATURE_MATMUL_INT8) +# define SKAINET_HAVE_I8MM 1 +#endif + +#ifdef SKAINET_HAVE_NEON +/* Horizontal sum of a float32x4 lane vector. AArch64 has vaddvq_f32 + * natively; this wrapper keeps call sites readable. */ +static inline float skainet_neon_hadd_f32(float32x4_t v) { + return vaddvq_f32(v); +} + +/* Widen 16 unsigned bytes to four float32x4 lanes (out[0]=lanes 0..3, …). */ +static inline void skainet_neon_u8x16_to_f32x4x4(uint8x16_t v, float32x4_t out[4]) { + const uint16x8_t lo16 = vmovl_u8(vget_low_u8(v)); + const uint16x8_t hi16 = vmovl_u8(vget_high_u8(v)); + out[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo16))); + out[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo16))); + out[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi16))); + out[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi16))); +} +#endif + +#endif /* SKAINET_SIMD_H */ diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c index ce88afd8..dbd6293b 100644 --- a/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c +++ b/skainet-backends/skainet-backend-native-cpu/native/src/fp32_matmul.c @@ -1,4 +1,5 @@ #include "skainet_kernels.h" +#include "skainet_simd.h" #include #include @@ -53,9 +54,22 @@ SKAINET_API void skainet_fp32_matmul( for (int32_t p = 0; p < k; ++p) { const float a_ip = a_row[p]; const float* SKAINET_RESTRICT b_row = b + b_offset + (size_t) p * b_stride; +#ifdef SKAINET_HAVE_NEON + const float32x4_t va = vdupq_n_f32(a_ip); + int32_t j = 0; + for (; j + 4 <= n; j += 4) { + float32x4_t cv = vld1q_f32(c_row + j); + cv = vfmaq_f32(cv, va, vld1q_f32(b_row + j)); + vst1q_f32(c_row + j, cv); + } + for (; j < n; ++j) { + c_row[j] += a_ip * b_row[j]; + } +#else for (int32_t j = 0; j < n; ++j) { c_row[j] += a_ip * b_row[j]; } +#endif } } } diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c index 7c742793..8091e58a 100644 --- a/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c +++ b/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c @@ -1,4 +1,5 @@ #include "skainet_kernels.h" +#include "skainet_simd.h" #include #include @@ -124,6 +125,30 @@ SKAINET_API void skainet_q4k_matmul( float code_sum_lo = 0.0f, input_sum_lo = 0.0f; float code_sum_hi = 0.0f, input_sum_hi = 0.0f; +#ifdef SKAINET_HAVE_NEON + float32x4_t cacc_lo = vdupq_n_f32(0.0f), iacc_lo = vdupq_n_f32(0.0f); + float32x4_t cacc_hi = vdupq_n_f32(0.0f), iacc_hi = vdupq_n_f32(0.0f); + for (int off = 0; off < Q4K_SUB_BLOCK_SIZE; off += 16) { + const uint8x16_t packed = vld1q_u8(qs_group + off); + const uint8x16_t lo_nib = vandq_u8(packed, vdupq_n_u8(0x0F)); + const uint8x16_t hi_nib = vshrq_n_u8(packed, 4); + float32x4_t cl[4], ch[4]; + skainet_neon_u8x16_to_f32x4x4(lo_nib, cl); + skainet_neon_u8x16_to_f32x4x4(hi_nib, ch); + for (int q = 0; q < 4; ++q) { + const float32x4_t v_lo = vld1q_f32(in_lo + off + q * 4); + const float32x4_t v_hi = vld1q_f32(in_hi + off + q * 4); + cacc_lo = vfmaq_f32(cacc_lo, v_lo, cl[q]); + iacc_lo = vaddq_f32(iacc_lo, v_lo); + cacc_hi = vfmaq_f32(cacc_hi, v_hi, ch[q]); + iacc_hi = vaddq_f32(iacc_hi, v_hi); + } + } + code_sum_lo = skainet_neon_hadd_f32(cacc_lo); + input_sum_lo = skainet_neon_hadd_f32(iacc_lo); + code_sum_hi = skainet_neon_hadd_f32(cacc_hi); + input_sum_hi = skainet_neon_hadd_f32(iacc_hi); +#else /* 32 iterations — auto-vectorizes cleanly under -O3. */ for (int i = 0; i < Q4K_SUB_BLOCK_SIZE; ++i) { const uint8_t b = qs_group[i]; @@ -136,6 +161,7 @@ SKAINET_API void skainet_q4k_matmul( code_sum_hi += v_hi * code_hi; input_sum_hi += v_hi; } +#endif const float scale_lo = d * (float) scale_idx[sb_lo]; const float offset_lo = d_min * (float) min_idx[sb_lo]; diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c new file mode 100644 index 00000000..de395114 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/native/src/q5k_matmul.c @@ -0,0 +1,202 @@ +#include "skainet_kernels.h" +#include "skainet_simd.h" + +#include +#include + +#define Q5K_BLOCK_SIZE 256 +#define Q5K_SUB_BLOCK_SIZE 32 +#define Q5K_SUB_BLOCKS 8 +#define Q5K_BYTES_PER_BLOCK 176 +#define Q5K_QH_OFFSET 16 +#define Q5K_QS_OFFSET 48 + +/* + * IEEE 754 binary16 (LE byte order) -> binary32 conversion. + * Mirrors PanamaVectorQ5_KMatmulKernel.halfToFloat / the Q4_K kernel + * byte-for-byte (kept scalar to preserve bit-exact FP16 parity). + */ +static inline float skainet_q5k_half_to_float(uint16_t hbits) { + const uint32_t sign = (hbits >> 15) & 0x1u; + const uint32_t exp = (hbits >> 10) & 0x1Fu; + const uint32_t frac = hbits & 0x3FFu; + + if (exp == 0u) { + if (frac == 0u) { + union { uint32_t u; float f; } v = { sign << 31 }; + return v.f; + } + float f = ((float) frac) / 1024.0f * (1.0f / 16384.0f); + return sign ? -f : f; + } + if (exp == 0x1Fu) { + union { uint32_t u; float f; } v; + v.u = (sign << 31) | 0x7F800000u | (frac ? 0x00400000u : 0u); + return v.f; + } + union { uint32_t u; float f; } v; + v.u = (sign << 31) | ((exp - 15u + 127u) << 23) | (frac << 13); + return v.f; +} + +/* + * ggml's get_scale_min_k4 unmix for the 12-byte packed sub-scale region + * (bytes 4..15). Identical to Q4_K. + */ +static inline void skainet_q5k_decode_scales( + const uint8_t* scales, + int* scale_idx, + int* min_idx +) { + for (int sb = 0; sb < 4; ++sb) { + scale_idx[sb] = scales[sb] & 0x3F; + min_idx[sb] = scales[sb + 4] & 0x3F; + } + for (int sb = 4; sb < 8; ++sb) { + const int low4_s = scales[sb + 4] & 0x0F; + const int high2_s = (scales[sb - 4] >> 6) & 0x03; + scale_idx[sb] = low4_s | (high2_s << 4); + + const int low4_m = (scales[sb + 4] >> 4) & 0x0F; + const int high2_m = (scales[sb] >> 6) & 0x03; + min_idx[sb] = low4_m | (high2_m << 4); + } +} + +/* + * Native Q5_K matrix-vector multiply matching the + * sk.ainet.backend.api.kernel.Q5KMatmulKernel SPI contract. Single + * input row times an `outputDim x inputDim` Q5_K-packed weight tensor + * laid out (blockIdx * outputDim + o) * 176 bytes. + * + * Q5_K extends Q4_K with a 32-byte `qh` high-bit plane: the 5-bit code + * is `lowNibble | (fifthBit << 4)`, where the low nibble lives in `qs` + * (same strided layout as Q4_K) and the 5th bit is bit (2*group) of + * qh[l] for the low sub-block, (2*group + 1) for the high sub-block. + * + * Lazy-dmin pattern: per sub-block accumulate + * codeSum[s] = sum_i input[i] * code[i] + * inputSum[s] = sum_i input[i] + * and combine once via + * acc += d * scaleIdx[s] * codeSum[s] - dMin * minIdx[s] * inputSum[s] + * + * Scalar single-threaded; the tight inner loop is straight-line FP + * arithmetic so -O3 auto-vectorizes on AVX2/NEON. A hand-written NEON + * path is layered on behind __ARM_NEON in a later PR. + */ +SKAINET_API void skainet_q5k_matmul( + const float* SKAINET_RESTRICT input, + int32_t input_offset, + const uint8_t* SKAINET_RESTRICT weight, + int32_t weight_byte_offset, + int32_t input_dim, + int32_t output_dim, + float* SKAINET_RESTRICT output, + int32_t output_offset +) { + if (output_dim <= 0 || input_dim <= 0) return; + + const int32_t blocks_per_input_dim = input_dim / Q5K_BLOCK_SIZE; + const float* in_base = input + input_offset; + float* out_base = output + output_offset; + + int scale_idx[Q5K_SUB_BLOCKS]; + int min_idx[Q5K_SUB_BLOCKS]; + + for (int32_t o = 0; o < output_dim; ++o) { + float acc = 0.0f; + + for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) { + const uint8_t* block = weight + weight_byte_offset + + (size_t)(block_idx * output_dim + o) * Q5K_BYTES_PER_BLOCK; + + /* d, dMin (FP16 LE -> FP32). */ + const uint16_t d_bits = (uint16_t) block[0] | ((uint16_t) block[1] << 8); + const uint16_t d_min_bits = (uint16_t) block[2] | ((uint16_t) block[3] << 8); + const float d = skainet_q5k_half_to_float(d_bits); + const float d_min = skainet_q5k_half_to_float(d_min_bits); + + /* 12 bytes of packed (scaleIdx, minIdx) -> 8 ints each. */ + skainet_q5k_decode_scales(block + 4, scale_idx, min_idx); + + const uint8_t* qh = block + Q5K_QH_OFFSET; + const uint8_t* qs = block + Q5K_QS_OFFSET; + const float* in_block = in_base + (size_t) block_idx * Q5K_BLOCK_SIZE; + + /* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */ + for (int group_j = 0; group_j < 4; ++group_j) { + const uint8_t* qs_group = qs + group_j * Q5K_SUB_BLOCK_SIZE; + const int sb_lo = 2 * group_j; + const int sb_hi = sb_lo + 1; + const int bit_lo = 2 * group_j; + const int bit_hi = 2 * group_j + 1; + const float* in_lo = in_block + sb_lo * Q5K_SUB_BLOCK_SIZE; + const float* in_hi = in_block + sb_hi * Q5K_SUB_BLOCK_SIZE; + + float code_sum_lo = 0.0f, input_sum_lo = 0.0f; + float code_sum_hi = 0.0f, input_sum_hi = 0.0f; + +#ifdef SKAINET_HAVE_NEON + /* Variable right-shift via vshlq_u8 with a negative count + * (bit_lo/bit_hi are runtime values, so vshrq_n_u8's + * immediate form can't be used). */ + const int8x16_t shr_lo = vdupq_n_s8(-(int8_t) bit_lo); + const int8x16_t shr_hi = vdupq_n_s8(-(int8_t) bit_hi); + float32x4_t cacc_lo = vdupq_n_f32(0.0f), iacc_lo = vdupq_n_f32(0.0f); + float32x4_t cacc_hi = vdupq_n_f32(0.0f), iacc_hi = vdupq_n_f32(0.0f); + for (int off = 0; off < Q5K_SUB_BLOCK_SIZE; off += 16) { + const uint8x16_t packed = vld1q_u8(qs_group + off); + const uint8x16_t qhv = vld1q_u8(qh + off); + const uint8x16_t lo_nib = vandq_u8(packed, vdupq_n_u8(0x0F)); + const uint8x16_t hi_nib = vshrq_n_u8(packed, 4); + /* 5th bit -> bit 4 of the code byte. */ + const uint8x16_t fifth_lo = + vshlq_n_u8(vandq_u8(vshlq_u8(qhv, shr_lo), vdupq_n_u8(0x01)), 4); + const uint8x16_t fifth_hi = + vshlq_n_u8(vandq_u8(vshlq_u8(qhv, shr_hi), vdupq_n_u8(0x01)), 4); + const uint8x16_t code_lo = vorrq_u8(lo_nib, fifth_lo); + const uint8x16_t code_hi = vorrq_u8(hi_nib, fifth_hi); + float32x4_t cl[4], ch[4]; + skainet_neon_u8x16_to_f32x4x4(code_lo, cl); + skainet_neon_u8x16_to_f32x4x4(code_hi, ch); + for (int q = 0; q < 4; ++q) { + const float32x4_t v_lo = vld1q_f32(in_lo + off + q * 4); + const float32x4_t v_hi = vld1q_f32(in_hi + off + q * 4); + cacc_lo = vfmaq_f32(cacc_lo, v_lo, cl[q]); + iacc_lo = vaddq_f32(iacc_lo, v_lo); + cacc_hi = vfmaq_f32(cacc_hi, v_hi, ch[q]); + iacc_hi = vaddq_f32(iacc_hi, v_hi); + } + } + code_sum_lo = skainet_neon_hadd_f32(cacc_lo); + input_sum_lo = skainet_neon_hadd_f32(iacc_lo); + code_sum_hi = skainet_neon_hadd_f32(cacc_hi); + input_sum_hi = skainet_neon_hadd_f32(iacc_hi); +#else + /* 32 iterations — auto-vectorizes cleanly under -O3. */ + for (int i = 0; i < Q5K_SUB_BLOCK_SIZE; ++i) { + const uint8_t b = qs_group[i]; + const uint8_t h = qh[i]; + const float code_lo = (float)((b & 0x0F) | (((h >> bit_lo) & 0x01) << 4)); + const float code_hi = (float)((b >> 4) | (((h >> bit_hi) & 0x01) << 4)); + const float v_lo = in_lo[i]; + const float v_hi = in_hi[i]; + code_sum_lo += v_lo * code_lo; + input_sum_lo += v_lo; + code_sum_hi += v_hi * code_hi; + input_sum_hi += v_hi; + } +#endif + + const float scale_lo = d * (float) scale_idx[sb_lo]; + const float offset_lo = d_min * (float) min_idx[sb_lo]; + const float scale_hi = d * (float) scale_idx[sb_hi]; + const float offset_hi = d_min * (float) min_idx[sb_hi]; + acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo; + acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi; + } + } + + out_base[o] = acc; + } +} diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c index 844a2159..cee95a21 100644 --- a/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c +++ b/skainet-backends/skainet-backend-native-cpu/native/src/q8_0_matmul.c @@ -1,4 +1,5 @@ #include "skainet_kernels.h" +#include "skainet_simd.h" #include #include @@ -83,9 +84,29 @@ SKAINET_API void skainet_q8_0_matmul( const float* SKAINET_RESTRICT input_block = input + input_offset + (size_t) block_idx * BLOCK_SIZE; float block_sum = 0.0f; +#ifdef SKAINET_HAVE_NEON + /* Activations are FP32, so widen int8 codes to float and FMA + * (int8 dotprod would need int8 activations — see plan note). */ + float32x4_t accv = vdupq_n_f32(0.0f); + for (int32_t k = 0; k < BLOCK_SIZE; k += 16) { + const int8x16_t c8 = vld1q_s8(codes + k); + const int16x8_t lo16 = vmovl_s8(vget_low_s8(c8)); + const int16x8_t hi16 = vmovl_s8(vget_high_s8(c8)); + const float32x4_t cf0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(lo16))); + const float32x4_t cf1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(lo16))); + const float32x4_t cf2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(hi16))); + const float32x4_t cf3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(hi16))); + accv = vfmaq_f32(accv, vld1q_f32(input_block + k), cf0); + accv = vfmaq_f32(accv, vld1q_f32(input_block + k + 4), cf1); + accv = vfmaq_f32(accv, vld1q_f32(input_block + k + 8), cf2); + accv = vfmaq_f32(accv, vld1q_f32(input_block + k + 12), cf3); + } + block_sum = skainet_neon_hadd_f32(accv); +#else for (int32_t k = 0; k < BLOCK_SIZE; ++k) { block_sum += input_block[k] * (float) codes[k]; } +#endif acc += block_sum * d; } output[output_offset + o] = acc; diff --git a/skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake b/skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake new file mode 100644 index 00000000..65e7e246 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/native/toolchain-aarch64.cmake @@ -0,0 +1,28 @@ +# CMake toolchain for cross-compiling the native kernels to aarch64 Linux +# from an x86_64 host (e.g. the SL2610 board build / CI). Requires the +# `gcc-aarch64-linux-gnu` package (or an equivalent clang cross setup). +# +# Usage: +# cmake -S native -B build/native/cmake-build-arm64 \ +# -DCMAKE_TOOLCHAIN_FILE=native/toolchain-aarch64.cmake \ +# -DCMAKE_BUILD_TYPE=Release +# +# CMAKE_SYSTEM_PROCESSOR=aarch64 makes CMakeLists.txt take the +# `-march=armv8.2-a+fp16+dotprod` branch that enables the __ARM_NEON paths. + +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR aarch64) + +# Allow overriding the cross compiler (e.g. a clang cross or a different +# triple) via -DSKAINET_AARCH64_CC=... ; default to the Debian/Ubuntu GNU +# cross toolchain. +if(NOT DEFINED SKAINET_AARCH64_CC) + set(SKAINET_AARCH64_CC aarch64-linux-gnu-gcc) +endif() +set(CMAKE_C_COMPILER ${SKAINET_AARCH64_CC}) + +# Search for libraries/headers only in the target sysroot, but find +# programs (the compiler) on the host. +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt index 60dd45e2..ba0011b2 100644 --- a/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt +++ b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeKernelProvider.kt @@ -7,6 +7,7 @@ import sk.ainet.backend.api.kernel.MemSegKernelProvider import sk.ainet.backend.api.kernel.Q4KMatmulKernel import sk.ainet.backend.api.kernel.Q4KMemSegMatmulKernel import sk.ainet.backend.api.kernel.Q4_0MatmulKernel +import sk.ainet.backend.api.kernel.Q5KMatmulKernel import sk.ainet.backend.api.kernel.Q8_0MatmulKernel /** @@ -97,4 +98,7 @@ public object NativeKernelProvider : KernelProvider, MemSegKernelProvider { override fun matmulQ4_0(): Q4_0MatmulKernel? = if (NativeQ4_0MatmulKernel.isAvailable()) NativeQ4_0MatmulKernel else null + + override fun matmulQ5K(): Q5KMatmulKernel? = + if (NativeQ5KMatmulKernel.isAvailable()) NativeQ5KMatmulKernel else null } diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt new file mode 100644 index 00000000..6df68d73 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernel.kt @@ -0,0 +1,91 @@ +package sk.ainet.exec.kernel + +import java.lang.foreign.Arena +import java.lang.foreign.FunctionDescriptor +import java.lang.foreign.Linker +import java.lang.foreign.MemorySegment +import java.lang.foreign.ValueLayout +import java.lang.invoke.MethodHandle +import sk.ainet.backend.api.kernel.Q5KMatmulKernel + +/** + * Native (FFM) implementation of [Q5KMatmulKernel]. + * + * Wraps the bundled C symbol + * + * void skainet_q5k_matmul( + * const float* input, int32_t input_offset, + * const uint8_t* weight, int32_t weight_byte_offset, + * int32_t input_dim, int32_t output_dim, + * float* output, int32_t output_offset); + * + * Same lazy-`dmin` accumulation as [PanamaVectorQ5_KMatmulKernel] over + * the canonical 256-element / 176-byte Q5_K super-block (the 5th bit of + * each code comes from the `qh` plane). Numerical parity vs the Panama + * kernel is asserted by [NativeQ5KMatmulKernelParityTest]. + * + * Single-threaded scalar C (`-O3 -ffast-math`, auto-vectorized inner + * loop); a hand-written NEON path is layered on behind `__ARM_NEON`. + */ +internal object NativeQ5KMatmulKernel : Q5KMatmulKernel { + + private const val BLOCK_SIZE = 256 + private const val BYTES_PER_BLOCK = 176 + + fun isAvailable(): Boolean = handle != null + + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "NativeQ5KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0 || inputDim == 0) return + val mh = handle + ?: error("NativeQ5KMatmulKernel.matmul invoked while native library unavailable") + + Arena.ofConfined().use { arena -> + val inSeg = arena.allocate( + inputDim.toLong() * java.lang.Float.BYTES, + ValueLayout.JAVA_FLOAT.byteAlignment(), + ) + val outSeg = arena.allocate( + outputDim.toLong() * java.lang.Float.BYTES, + ValueLayout.JAVA_FLOAT.byteAlignment(), + ) + val weightBytesUsed = ((inputDim / BLOCK_SIZE).toLong() * outputDim) * BYTES_PER_BLOCK.toLong() + val weightSeg = arena.allocate(weightBytesUsed, 1L) + + MemorySegment.copy(input, inputOffset, inSeg, ValueLayout.JAVA_FLOAT, 0L, inputDim) + MemorySegment.copy(weight, weightByteOffset, weightSeg, ValueLayout.JAVA_BYTE, 0L, weightBytesUsed.toInt()) + + mh.invoke( + inSeg, 0, + weightSeg, 0, + inputDim, outputDim, + outSeg, 0, + ) + + MemorySegment.copy(outSeg, ValueLayout.JAVA_FLOAT, 0L, output, outputOffset, outputDim) + } + } + + private val handle: MethodHandle? by lazy { + val lookup = NativeLibraryLoader.lookup() ?: return@lazy null + val symbol = lookup.find("skainet_q5k_matmul").orElse(null) ?: return@lazy null + val descriptor = FunctionDescriptor.ofVoid( + ValueLayout.ADDRESS, // input + ValueLayout.JAVA_INT, // input_offset + ValueLayout.ADDRESS, // weight + ValueLayout.JAVA_INT, // weight_byte_offset + ValueLayout.JAVA_INT, // input_dim + ValueLayout.JAVA_INT, // output_dim + ValueLayout.ADDRESS, // output + ValueLayout.JAVA_INT, // output_offset + ) + runCatching { Linker.nativeLinker().downcallHandle(symbol, descriptor) }.getOrNull() + } +} diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt index 1565c4c4..d93a0149 100644 --- a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt +++ b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt @@ -21,7 +21,7 @@ import sk.ainet.backend.api.kernel.KernelProvider */ class KernelSupportMatrixTest { - private val formats = listOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0") + private val formats = listOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_K", "Q5_1", "Q5_0") // platform key (display) -> the set of providers (by source-set) reaching it. private val platforms = listOf("JVM", "Android", "Native·linux", "Native·apple", "JS/WASM") @@ -36,9 +36,9 @@ class KernelSupportMatrixTest { private fun tiers(): List = listOf( Tier("scalar", 0, platforms.toSet(), scalarFormats()), Tier("panama-vector", 50, setOf("JVM", "Android"), - setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0")), + setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_K", "Q5_1", "Q5_0")), Tier("native-ffm", 100, setOf("JVM"), - setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K")), + setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q5_K")), ) private fun best(fmt: String, platform: String, tiers: List): String? = @@ -71,7 +71,7 @@ class KernelSupportMatrixTest { // Drift gate on the scalar floor (the all-platform baseline): the documented set // below must equal what the scalar provider actually carries. Update both together. assertEquals( - setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0"), + setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_K", "Q5_1", "Q5_0"), scalarFormats(), "ScalarKernelProvider coverage changed — update the declared sets + run ./gradlew generateKernelMatrix", ) diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt new file mode 100644 index 00000000..095a1723 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/NativeQ5KMatmulKernelParityTest.kt @@ -0,0 +1,94 @@ +package sk.ainet.exec.kernel + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.BeforeTest +import kotlin.test.Test +import kotlin.test.assertTrue + +/** + * Numerical parity tests for [NativeQ5KMatmulKernel] against + * [PanamaVectorQ5_KMatmulKernel]. Both kernels share the canonical Q5_K + * layout (176-byte block, 32-byte `qh` 5th-bit plane) and the lazy-`dmin` + * accumulation, so outputs must agree element-wise within FMA + + * reordered-reduction tolerance. + * + * Fixture mirrors [NativeQ4KMatmulKernelParityTest]: random Q5_K bytes with + * `d`/`dMin` clamped to `1.0f16`, packed input-block-major + * `(blockIdx * outputDim + o) * 176`. Random `qh` bytes exercise the 5th bit. + */ +class NativeQ5KMatmulKernelParityTest { + + private val blockSize = 256 + private val bytesPerBlock = 176 + + @BeforeTest + fun checkNativeAvailable() { + assertTrue( + NativeQ5KMatmulKernel.isAvailable(), + "NativeQ5KMatmulKernel reports unavailable on this host — bundled libskainet_kernels " + + "missing or skainet_q5k_matmul symbol unresolved", + ) + } + + private fun randomQ5KBytes(numBlocks: Int, seed: Int): ByteArray { + val rng = Random(seed) + val bytes = ByteArray(numBlocks * bytesPerBlock) + rng.nextBytes(bytes) + for (block in 0 until numBlocks) { + val base = block * bytesPerBlock + // 0x3C00 == 1.0f16. Force d = dMin = 1.0f16 so dequant magnitudes stay finite. + bytes[base + 0] = 0x00.toByte() + bytes[base + 1] = 0x3C.toByte() + bytes[base + 2] = 0x00.toByte() + bytes[base + 3] = 0x3C.toByte() + } + return bytes + } + + private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) { + val numBlocks = (inputDim / blockSize) * outputDim + val packed = randomQ5KBytes(numBlocks, seed) + val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f } + + val refOut = FloatArray(outputDim) + PanamaVectorQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0) + + val nativeOut = FloatArray(outputDim) + NativeQ5KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, nativeOut, 0) + + for (o in 0 until outputDim) { + val diff = abs(refOut[o] - nativeOut[o]) + val rel = diff / (abs(refOut[o]) + 1e-9f) + assertTrue( + diff <= tol || rel < 1e-4f, + "row $o diverged: panama=${refOut[o]} native=${nativeOut[o]} diff=$diff rel=$rel tol=$tol", + ) + } + } + + @Test + fun single_block_single_row() = assertParity(256, 1, 42, 1e-2f) + + @Test + fun single_block_multi_row() = assertParity(256, 16, 7, 1e-2f) + + @Test + fun multi_block_multi_row() = assertParity(1024, 64, 123, 5e-2f) + + @Test + fun llm_typical_shape_4096_outputDim_64() = assertParity(4096, 64, 999, 5e-1f) + + @Test + fun rejects_inputDim_not_multiple_of_block() { + val packed = randomQ5KBytes(numBlocks = 2, seed = 1) + val input = FloatArray(255) + val out = FloatArray(1) + try { + NativeQ5KMatmulKernel.matmul(input, 0, packed, 0, 255, 1, out, 0) + kotlin.test.fail("expected IllegalArgumentException for non-multiple inputDim") + } catch (e: IllegalArgumentException) { + // expected + } + } +} diff --git a/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt b/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt index 324c5da7..04d32f42 100644 --- a/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt +++ b/skainet-io/skainet-io-gguf/src/commonMain/kotlin/sk/ainet/io/gguf/StreamingGgufParametersLoader.kt @@ -6,6 +6,8 @@ import sk.ainet.io.RandomAccessSource import sk.ainet.lang.tensor.Shape import sk.ainet.lang.tensor.Tensor import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData +import sk.ainet.lang.tensor.data.Q6_KBlockTensorData import sk.ainet.lang.tensor.data.Q8_0BlockTensorData import sk.ainet.lang.types.BF16 import sk.ainet.lang.types.DType @@ -88,6 +90,18 @@ public class StreamingGgufParametersLoader( ctx.fromData(packed as sk.ainet.lang.tensor.data.TensorData, dtype) } + GGMLQuantizationType.Q5_K -> { + @Suppress("UNCHECKED_CAST") + val packed = Q5_KBlockTensorData.fromRawBytes(shape, rawBytes) + ctx.fromData(packed as sk.ainet.lang.tensor.data.TensorData, dtype) + } + + GGMLQuantizationType.Q6_K -> { + @Suppress("UNCHECKED_CAST") + val packed = Q6_KBlockTensorData.fromRawBytes(shape, rawBytes) + ctx.fromData(packed as sk.ainet.lang.tensor.data.TensorData, dtype) + } + GGMLQuantizationType.Q8_0 -> { @Suppress("UNCHECKED_CAST") val packed = Q8_0BlockTensorData.fromRawBytes(shape, rawBytes) diff --git a/skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt b/skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt new file mode 100644 index 00000000..460454f7 --- /dev/null +++ b/skainet-io/skainet-io-gguf/src/jvmTest/kotlin/sk/ainet/io/gguf/Q5KBlockTensorDataParityTest.kt @@ -0,0 +1,115 @@ +package sk.ainet.io.gguf + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue +import sk.ainet.io.gguf.dequant.DequantOps +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData +import sk.ainet.lang.tensor.data.toFloatArray + +/** + * Ties the packed [Q5_KBlockTensorData] (used by the eager CPU matmul path) + * to the proven golden [DequantOps.dequantQ5KFromBytes] across a *multi-block* + * buffer. Multiple blocks are the case that exposed the historical `qh[idx/8]` + * indexing bug — a single block can pass by accident, so this builds 3 blocks + * with distinct codes and asserts bit-exact agreement. + */ +class Q5KBlockTensorDataParityTest { + + private fun floatToHalf(value: Float): Int { + val bits = value.toRawBits() + val sign = (bits shr 16) and 0x8000 + val exponent = ((bits shr 23) and 0xFF) - 127 + val mantissa = bits and 0x7FFFFF + return when { + exponent >= 16 -> sign or 0x7C00 + exponent >= -14 -> sign or ((exponent + 15) shl 10) or (mantissa shr 13) + else -> sign + } + } + + /** Build a single 176-byte canonical Q5_K block (see [Q5KCanonicalLayoutTest]). */ + private fun buildBlock( + d: Float, + dMin: Float, + scaleIdx: IntArray, + minIdx: IntArray, + subBlockCodes: Array, + ): ByteArray { + val block = ByteArray(176) + val dBits = floatToHalf(d) + block[0] = (dBits and 0xFF).toByte() + block[1] = ((dBits shr 8) and 0xFF).toByte() + val dMinBits = floatToHalf(dMin) + block[2] = (dMinBits and 0xFF).toByte() + block[3] = ((dMinBits shr 8) and 0xFF).toByte() + + val scaleBytes = IntArray(12) + for (j in 0 until 4) { + scaleBytes[j] = scaleBytes[j] or (scaleIdx[j] and 0x3F) + scaleBytes[j + 4] = scaleBytes[j + 4] or (minIdx[j] and 0x3F) + } + for (j in 4 until 8) { + val sLow4 = scaleIdx[j] and 0x0F + val sHi2 = (scaleIdx[j] shr 4) and 0x03 + val mLow4 = minIdx[j] and 0x0F + val mHi2 = (minIdx[j] shr 4) and 0x03 + scaleBytes[j + 4] = scaleBytes[j + 4] or sLow4 or (mLow4 shl 4) + scaleBytes[j - 4] = scaleBytes[j - 4] or (sHi2 shl 6) + scaleBytes[j] = scaleBytes[j] or (mHi2 shl 6) + } + for (i in 0 until 12) block[4 + i] = (scaleBytes[i] and 0xFF).toByte() + + val qhBytes = IntArray(32) + for (j in 0 until 4) { + for (l in 0 until 32) { + val highLo = (subBlockCodes[2 * j][l] ushr 4) and 0x01 + val highHi = (subBlockCodes[2 * j + 1][l] ushr 4) and 0x01 + qhBytes[l] = qhBytes[l] or (highLo shl (2 * j)) + qhBytes[l] = qhBytes[l] or (highHi shl (2 * j + 1)) + } + } + for (i in 0 until 32) block[16 + i] = (qhBytes[i] and 0xFF).toByte() + + for (j in 0 until 4) { + for (l in 0 until 32) { + val lo = subBlockCodes[2 * j][l] and 0x0F + val hi = subBlockCodes[2 * j + 1][l] and 0x0F + block[48 + j * 32 + l] = ((hi shl 4) or lo).toByte() + } + } + return block + } + + @Test + fun `Q5_KBlockTensorData toFloatArray matches DequantOps golden across blocks`() { + val nBlocks = 3 + val buf = ByteArray(nBlocks * 176) + for (b in 0 until nBlocks) { + val d = 0.125f + 0.01f * b + val dMin = 0.0625f + 0.005f * b + val scaleIdx = IntArray(8) { (63 - (it * 7 + b * 3)) and 0x3F } + val minIdx = IntArray(8) { (it * 8 + b * 5) and 0x3F } + val codes = Array(8) { s -> IntArray(32) { j -> ((s * 11 + j * 5 + b * 13) and 0x1F) } } + val block = buildBlock(d, dMin, scaleIdx, minIdx, codes) + block.copyInto(buf, b * 176) + } + + val golden = DequantOps.dequantFromBytes(buf, GGMLQuantizationType.Q5_K, nBlocks * 256) + val packed = Q5_KBlockTensorData(Shape(nBlocks * 256), buf).toFloatArray() + + assertEquals(golden.size, packed.size) + var maxAbs = 0f + var firstDiff = -1 + for (i in golden.indices) { + val ad = kotlin.math.abs(golden[i] - packed[i]) + if (ad > maxAbs) maxAbs = ad + if (firstDiff == -1 && ad > 0f) firstDiff = i + } + assertTrue( + maxAbs == 0f, + "Q5_KBlockTensorData disagrees with DequantOps golden: maxAbs=$maxAbs firstDiff=$firstDiff", + ) + } +} diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt new file mode 100644 index 00000000..9aba08bd --- /dev/null +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_KTensorData.kt @@ -0,0 +1,320 @@ +package sk.ainet.lang.tensor.data + +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.storage.PackedBlockStorage +import sk.ainet.lang.tensor.storage.TensorEncoding +import sk.ainet.lang.types.DType + +/** + * Tensor data interface for Q5_K quantized format (canonical ggml layout). + * + * Q5_K block format (256 elements per block, 176 bytes per block): + * - 2 bytes: f16 d (super-block scale) + * - 2 bytes: f16 dMin (super-block min-scale) + * - 12 bytes: packed 6-bit scaleIdx + 6-bit minIdx for each of 8 sub-blocks, + * encoded with ggml's `get_scale_min_k4` bit-mixing layout — + * *identical* to Q4_K (see [Q4_KTensorData]). + * - 32 bytes: `qh` high-bit plane. One byte per intra-group element position + * `l` (0..31); the 5th bit of a code comes from a single bit of + * `qh[l]` selected by `(outer-iter, low/high nibble)`. + * - 128 bytes: `qs` 4-bit low nibbles, laid out *strided* in 4 groups of 32 + * bytes, exactly as Q4_K: byte (j*32 + i) carries element + * (2j*32 + i) in its lo nibble and element ((2j+1)*32 + i) in its + * hi nibble. + * + * The 5th bit (per ggml-quants.c `dequantize_row_q5_K`): for outer iteration + * `outer` (0..3), the low-nibble sub-block uses `qh[l]` bit `2*outer` and the + * high-nibble sub-block uses bit `2*outer + 1`. `qh` is indexed by the + * intra-group position `l` (0..31), NOT by output position. + * + * Each sub-block s (s=0..7): + * - 6-bit scaleIdx, 6-bit minIdx (from `get_scale_min_k4`) + * - scale = d * scaleIdx + * - offset = dMin * minIdx + * + * Dequantization: `output[i] = code[i] * scale - offset`, where `code` is the + * full 5-bit value `lowNibble | (fifthBit << 4)` (0..31). + * + * Validated bit-exact against `DequantOps.dequantQ5KFromBytes`, which carries + * the proof and the regression note about the earlier `qh[idx/8]` bug. + */ +public interface Q5_KTensorData : TensorData { + /** Number of Q5_K blocks in the tensor. */ + public val blockCount: Int + + /** Raw packed data containing all blocks. */ + public val packedData: ByteArray + + /** Get the main scale factor (d) for a block. */ + public fun getBlockD(blockIdx: Int): Float + + /** Get the minimum scale factor (dMin) for a block. */ + public fun getBlockDMin(blockIdx: Int): Float + + /** + * Get the scale for a specific sub-block within a block: + * `scale = d * scaleIdx` (no /63 normalisation — ggml's `d1 = d * sc`). + */ + public fun getSubBlockScale(blockIdx: Int, subBlockIdx: Int): Float + + /** + * Get the offset for a specific sub-block within a block: + * `offset = dMin * minIdx`. Subtract this from `code * scale` for the + * dequantised value. + */ + public fun getSubBlockMin(blockIdx: Int, subBlockIdx: Int): Float + + /** Get a 5-bit quantized code value (0..31) for `elementIdx` (0..255). */ + public fun getCode(blockIdx: Int, elementIdx: Int): Int + + public companion object { + /** Elements per Q5_K block. */ + public const val BLOCK_SIZE: Int = 256 + + /** Elements per sub-block. */ + public const val SUB_BLOCK_SIZE: Int = 32 + + /** Number of sub-blocks per block. */ + public const val SUB_BLOCKS_PER_BLOCK: Int = 8 + + /** Bytes per Q5_K block (2 + 2 + 12 + 32 + 128 = 176). */ + public const val BYTES_PER_BLOCK: Int = 176 + + /** Byte offset of the 32-byte `qh` high-bit plane within a block. */ + public const val QH_OFFSET: Int = 16 + + /** Byte offset of the 128-byte `qs` low-nibble region within a block. */ + public const val QS_OFFSET: Int = 48 + } +} + +/** + * Implementation of Q5_KTensorData backed by a packed byte array (canonical + * ggml layout — see [Q5_KTensorData] kdoc for the full byte map). + * + * @param initialShape the logical shape of the tensor (in elements, not blocks) + * @param data the raw packed block data + */ +public class Q5_KBlockTensorData( + initialShape: Shape, + private val data: ByteArray +) : Q5_KTensorData, PackedBlockStorage { + + override val shape: Shape = Shape(initialShape.dimensions.copyOf()) + private val strides: IntArray = shape.computeStrides() + override val packedData: ByteArray get() = data + + override val blockCount: Int = (shape.volume + Q5_KTensorData.BLOCK_SIZE - 1) / Q5_KTensorData.BLOCK_SIZE + + // PackedBlockStorage implementation + override val encoding: TensorEncoding get() = TensorEncoding.Q5_K + override val blockSize: Int get() = Q5_KTensorData.BLOCK_SIZE + + override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" } + for (subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) { + val scale = getSubBlockScale(blockIdx, subBlockIdx) + val offset = getSubBlockMin(blockIdx, subBlockIdx) + val elemsStart = subBlockIdx * Q5_KTensorData.SUB_BLOCK_SIZE + for (j in 0 until Q5_KTensorData.SUB_BLOCK_SIZE) { + val elementIdx = elemsStart + j + val outIdx = outputOffset + elementIdx + if (outIdx >= output.size) return + val globalIdx = blockIdx * Q5_KTensorData.BLOCK_SIZE + elementIdx + if (globalIdx >= shape.volume) return + val code = getCode(blockIdx, elementIdx) + output[outIdx] = code * scale - offset + } + } + } + + init { + val requiredBytes = blockCount * Q5_KTensorData.BYTES_PER_BLOCK + require(data.size >= requiredBytes) { + "Data size ${data.size} is less than required $requiredBytes bytes for $blockCount blocks" + } + } + + override fun getBlockD(blockIdx: Int): Float { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" } + val offset = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + val b0 = data[offset].toInt() and 0xFF + val b1 = data[offset + 1].toInt() and 0xFF + val halfBits = (b1 shl 8) or b0 + return halfToFloat(halfBits) + } + + override fun getBlockDMin(blockIdx: Int): Float { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } + val offset = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + 2 + val b0 = data[offset].toInt() and 0xFF + val b1 = data[offset + 1].toInt() and 0xFF + val halfBits = (b1 shl 8) or b0 + return halfToFloat(halfBits) + } + + override fun getSubBlockScale(blockIdx: Int, subBlockIdx: Int): Float { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } + require(subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) { + "Sub-block index $subBlockIdx out of bounds (0..7)" + } + return getBlockD(blockIdx) * getScaleIndex(blockIdx, subBlockIdx) + } + + override fun getSubBlockMin(blockIdx: Int, subBlockIdx: Int): Float { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } + require(subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) { + "Sub-block index $subBlockIdx out of bounds (0..7)" + } + return getBlockDMin(blockIdx) * getMinIndex(blockIdx, subBlockIdx) + } + + /** + * Port of `get_scale_min_k4` from ggml-quants.c — identical to Q4_K. The + * 12 scale bytes don't pack 12 bits sequentially per sub-block; sub-blocks + * 4..7 reuse the top 2 bits of bytes for sub-blocks 0..3. + */ + private fun getScaleIndex(blockIdx: Int, subBlockIdx: Int): Int { + val base = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + 4 + val j = subBlockIdx + return if (j < 4) { + data[base + j].toInt() and 0x3F + } else { + val low4 = data[base + j + 4].toInt() and 0x0F + val high2 = (data[base + j - 4].toInt() and 0xFF) ushr 6 + low4 or (high2 shl 4) + } + } + + private fun getMinIndex(blockIdx: Int, subBlockIdx: Int): Int { + val base = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + 4 + val j = subBlockIdx + return if (j < 4) { + data[base + j + 4].toInt() and 0x3F + } else { + val low4 = (data[base + j + 4].toInt() and 0xFF) ushr 4 + val high2 = (data[base + j].toInt() and 0xFF) ushr 6 + low4 or (high2 shl 4) + } + } + + /** + * Look up the 5-bit code for `elementIdx` (0..255) within block `blockIdx`. + * The low 4 bits come from the strided `qs` nibble layout (identical to + * Q4_K); the 5th bit comes from `qh[l]` where `l` is the intra-group + * position and the bit index is `2*group` (lo nibble) or `2*group + 1` + * (hi nibble). Matches `DequantOps.dequantQ5KFromBytes`. + */ + override fun getCode(blockIdx: Int, elementIdx: Int): Int { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } + require(elementIdx in 0 until Q5_KTensorData.BLOCK_SIZE) { + "Element index $elementIdx out of bounds (0..255)" + } + val base = blockIdx * Q5_KTensorData.BYTES_PER_BLOCK + val groupIdx = elementIdx / 64 // 0..3 — the `outer` iteration + val withinGroup = elementIdx % 64 // 0..63 + val l = withinGroup % 32 // 0..31 — intra-group position + val qsByte = data[base + Q5_KTensorData.QS_OFFSET + groupIdx * 32 + l].toInt() and 0xFF + val low = if (withinGroup < 32) qsByte and 0x0F else qsByte ushr 4 + val qhByte = data[base + Q5_KTensorData.QH_OFFSET + l].toInt() and 0xFF + val bit = if (withinGroup < 32) 2 * groupIdx else 2 * groupIdx + 1 + val fifth = (qhByte ushr bit) and 0x01 + return low or (fifth shl 4) + } + + override fun get(vararg indices: Int): Byte { + val flatIndex = calcFlatIndex(indices) + val blockIdx = flatIndex / Q5_KTensorData.BLOCK_SIZE + val elementIdx = flatIndex % Q5_KTensorData.BLOCK_SIZE + return getCode(blockIdx, elementIdx).toByte() + } + + override fun set(vararg indices: Int, value: Byte) { + throw UnsupportedOperationException("Q5_K packed tensor data is read-only") + } + + private fun calcFlatIndex(indices: IntArray): Int { + require(indices.size == shape.dimensions.size) { + "Number of indices (${indices.size}) must match tensor dimensions (${shape.dimensions.size})" + } + var flatIndex = 0 + for (i in indices.indices) { + val idx = indices[i] + require(idx >= 0 && idx < shape.dimensions[i]) { + "Index $idx out of bounds for dimension $i with size ${shape.dimensions[i]}" + } + flatIndex += idx * strides[i] + } + return flatIndex + } + + public companion object { + /** + * Create Q5_KTensorData from raw GGUF bytes. + */ + public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_KBlockTensorData { + return Q5_KBlockTensorData(shape, bytes) + } + + /** + * Convert f16 bits to float32. + */ + internal fun halfToFloat(hbits: Int): Float { + val sign = (hbits and 0x8000) shl 16 + val exp = (hbits and 0x7C00) shr 10 + val mant = hbits and 0x03FF + + return when (exp) { + 0 -> { + if (mant == 0) { + Float.fromBits(sign) + } else { + var m = mant + var e = -14 + while ((m and 0x400) == 0) { + m = m shl 1 + e-- + } + m = m and 0x3FF + val floatExp = (e + 127) shl 23 + val floatMant = m shl 13 + Float.fromBits(sign or floatExp or floatMant) + } + } + 31 -> { + val floatExp = 0xFF shl 23 + val floatMant = mant shl 13 + Float.fromBits(sign or floatExp or floatMant) + } + else -> { + val floatExp = (exp - 15 + 127) shl 23 + val floatMant = mant shl 13 + Float.fromBits(sign or floatExp or floatMant) + } + } + } + } +} + +/** + * Dequantize Q5_K tensor data to a FloatArray (canonical ggml formula: + * `output[i] = code[i] * scale - offset`). + */ +public fun Q5_KTensorData.toFloatArray(): FloatArray { + val result = FloatArray(shape.volume) + var outIdx = 0 + for (blockIdx in 0 until blockCount) { + for (subBlockIdx in 0 until Q5_KTensorData.SUB_BLOCKS_PER_BLOCK) { + val scale = getSubBlockScale(blockIdx, subBlockIdx) + val offset = getSubBlockMin(blockIdx, subBlockIdx) + val elemsStart = subBlockIdx * Q5_KTensorData.SUB_BLOCK_SIZE + for (j in 0 until Q5_KTensorData.SUB_BLOCK_SIZE) { + val elementIdx = elemsStart + j + if (outIdx >= shape.volume) break + val code = getCode(blockIdx, elementIdx) + result[outIdx++] = code * scale - offset + } + } + } + return result +} diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt index 509b6704..1f2705ce 100644 --- a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt @@ -40,6 +40,18 @@ public sealed interface TensorEncoding { } } + /** GGML Q5_K block quantization: 256 elements per 176-byte block. */ + public data object Q5_K : TensorEncoding { + public const val BLOCK_SIZE: Int = 256 + public const val BYTES_PER_BLOCK: Int = 176 + + override val name: String get() = "Q5_K" + override fun physicalBytes(elementCount: Long): Long { + val blocks = (elementCount + BLOCK_SIZE - 1) / BLOCK_SIZE + return blocks * BYTES_PER_BLOCK + } + } + /** GGML Q6_K block quantization: 256 elements per 210-byte block. */ public data object Q6_K : TensorEncoding { public const val BLOCK_SIZE: Int = 256 From 58142009b9f05b381b8e78e226ee3b7c515748b8 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 07:24:02 +0200 Subject: [PATCH 2/4] feat(backend-native-cpu): Kotlin/Native cinterop to the C/NEON kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hand-written matmul kernels were JVM-only (consumed via FFM), but the SL2610 board binary is Kotlin/Native — it can't use the FFM wrapper. Add a K/N consumption path via cinterop so the board gets the same C (and, on aarch64, NEON) kernels. - CMake builds a STATIC archive (skainet_kernels_static -> libskainet_kernels.a) alongside the SHARED lib; same sources + flags (incl. the aarch64 NEON march). - cinterop .def (skainet_kernels.h -> sk.ainet.kernels.cinterop). - linuxX64 target on the (previously jvm-only) module, linking the static archive into K/N binaries; link tasks depend on the CMake build. - NativeKnQ5KMatmulKernel (linuxX64Main): calls skainet_q5k_matmul via cinterop with pinned arrays (zero-copy). POC verified on the host (linuxX64): NativeKnQ5KMatmulKernelParityTest — the cinterop kernel matches the commonMain ScalarQ5_KMatmulKernel across 4 shapes (tests=4, failures=0). JVM/FFM path unchanged (jvmTest green). linuxArm64 board target + NEON runtime check are the remaining step. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../build.gradle.kts | 38 ++++++++ .../native/CMakeLists.txt | 87 ++++++++++--------- .../exec/kernel/NativeKnQ5KMatmulKernel.kt | 56 ++++++++++++ .../NativeKnQ5KMatmulKernelParityTest.kt | 70 +++++++++++++++ .../cinterop/skainet_kernels.def | 8 ++ 5 files changed, 220 insertions(+), 39 deletions(-) create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt create mode 100644 skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def diff --git a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts index 6ab0ddd9..e5aa22f7 100644 --- a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts +++ b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts @@ -3,10 +3,30 @@ plugins { alias(libs.plugins.vanniktech.mavenPublish) } +// Paths shared by the K/N cinterop (kotlin block) and the CMake tasks below. +val nativeIncludeDir: String = layout.projectDirectory.dir("native/include").asFile.absolutePath +val staticArchivePath: String = + layout.buildDirectory.file("native/cmake-build/libskainet_kernels.a").get().asFile.absolutePath + kotlin { explicitApi() jvm() + // Kotlin/Native: POC on the host (linuxX64); linuxArm64 is the board target. + // Exposes the hand-written C/NEON kernels to K/N via cinterop to the static + // archive libskainet_kernels.a (CMake `skainet_kernels_static`). This is the + // board-consumption path — the JVM consumes the same kernels via FFM instead. + linuxX64 { + compilations.getByName("main").cinterops.create("skainetKernels") { + defFile(project.file("src/nativeInterop/cinterop/skainet_kernels.def")) + includeDirs(nativeIncludeDir) + } + binaries.all { + // Link the static C archive into every linuxX64 binary (incl. tests). + linkerOpts(staticArchivePath) + } + } + sourceSets { val jvmMain by getting { dependencies { @@ -24,6 +44,18 @@ kotlin { implementation(libs.kotlinx.coroutines) } } + val linuxX64Main by getting { + dependencies { + implementation(project(":skainet-backends:skainet-backend-api")) + } + } + val linuxX64Test by getting { + dependencies { + implementation(libs.kotlin.test) + // ScalarQ5_KMatmulKernel reference for the cinterop parity test. + implementation(project(":skainet-backends:skainet-backend-cpu")) + } + } } } @@ -89,6 +121,12 @@ val buildNativeKernels by tasks.registering(Exec::class) { ) } +// The linuxX64 (K/N) binaries link libskainet_kernels.a (built by CMake into +// cmakeBuildPath), so the static archive must exist before the K/N link step. +tasks.matching { it.name.startsWith("link") && it.name.endsWith("LinuxX64") }.configureEach { + dependsOn(buildNativeKernels) +} + val packageNativeKernels by tasks.registering(Copy::class) { group = "build" description = "Stage the built native kernels library into JVM resources." diff --git a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt index f26c125c..492ac4b7 100644 --- a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt +++ b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt @@ -9,7 +9,7 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() -add_library(skainet_kernels SHARED +set(SKAINET_KERNEL_SOURCES src/skainet_smoke.c src/q4k_matmul.c src/q5k_matmul.c @@ -19,45 +19,54 @@ add_library(skainet_kernels SHARED src/q4_0_matmul.c ) -target_include_directories(skainet_kernels PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/include -) +# SHARED: consumed by the JVM via java.lang.foreign (FFM), bundled as a JAR +# resource (libskainet_kernels.{so,dylib,dll}). +# STATIC: consumed by Kotlin/Native via cinterop, linked into the K/N binary +# (libskainet_kernels.a). Same sources + flags; both keep -fPIC. +add_library(skainet_kernels SHARED ${SKAINET_KERNEL_SOURCES}) +add_library(skainet_kernels_static STATIC ${SKAINET_KERNEL_SOURCES}) +set_target_properties(skainet_kernels_static PROPERTIES OUTPUT_NAME skainet_kernels) -# Strip the "lib" prefix on Windows so the artifact name is consistent -# with the resource-bundle path skainet_kernels.{dll,so,dylib}. -if(WIN32) - set_target_properties(skainet_kernels PROPERTIES PREFIX "") -endif() +set(SKAINET_KERNEL_TARGETS skainet_kernels skainet_kernels_static) -# Per-compiler tuning. The Q4_K kernel hot loop is straight-line FP -# arithmetic that auto-vectorizes cleanly under aggressive optimization -# (AVX2 on x86_64, NEON on ARM64). Visibility is also handled here on -# ELF / Mach-O; on Windows the SKAINET_API macro adds dllexport so we -# don't need /VISIBILITY flags. -if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") - target_compile_options(skainet_kernels PRIVATE - -fvisibility=hidden - -Wall -Wextra - -O3 - -ffast-math - -funroll-loops - ) - # AArch64: enable the NEON paths guarded by __ARM_NEON in skainet_simd.h. - # The SL2610 is Cortex-A55-class (ARMv8.2-A) — it HAS NEON + fp16 + dotprod - # (asimddp) but NOT i8mm (that is ARMv8.6). Do NOT add +i8mm: it would - # SIGILL on the board. CONFIRM on-device first: `grep Features /proc/cpuinfo` - # must list `asimddp` and `fphp`/`asimdhp`. Explicit -march (not -mcpu=native) - # because this is a cross-compile from an x86 host. - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") - target_compile_options(skainet_kernels PRIVATE - -march=armv8.2-a+fp16+dotprod +foreach(tgt IN LISTS SKAINET_KERNEL_TARGETS) + target_include_directories(${tgt} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) + + # Strip the "lib" prefix on Windows so the shared artifact name matches + # the resource-bundle path skainet_kernels.{dll,so,dylib}. + if(WIN32) + set_target_properties(${tgt} PROPERTIES PREFIX "") + endif() + + # Per-compiler tuning. The matmul hot loops are straight-line FP arithmetic + # that auto-vectorizes under aggressive optimization (AVX2 on x86_64, NEON + # on ARM64). Visibility is handled here on ELF / Mach-O; on Windows the + # SKAINET_API macro adds dllexport. + if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU") + target_compile_options(${tgt} PRIVATE + -fvisibility=hidden + -Wall -Wextra + -O3 + -ffast-math + -funroll-loops + ) + # AArch64: enable the NEON paths guarded by __ARM_NEON in skainet_simd.h. + # The SL2610 is Cortex-A55-class (ARMv8.2-A) — it HAS NEON + fp16 + + # dotprod (asimddp) but NOT i8mm (that is ARMv8.6). Do NOT add +i8mm: it + # would SIGILL on the board. CONFIRM on-device first: `grep Features + # /proc/cpuinfo` must list `asimddp` and `fphp`/`asimdhp`. Explicit + # -march (not -mcpu=native) because this is a cross-compile from x86. + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64") + target_compile_options(${tgt} PRIVATE + -march=armv8.2-a+fp16+dotprod + ) + endif() + set_target_properties(${tgt} PROPERTIES C_VISIBILITY_PRESET hidden) + elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC") + target_compile_options(${tgt} PRIVATE + /O2 + /fp:fast + /W3 ) endif() - set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden) -elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC") - target_compile_options(skainet_kernels PRIVATE - /O2 - /fp:fast - /W3 - ) -endif() +endforeach() diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt new file mode 100644 index 00000000..1ad62644 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt @@ -0,0 +1,56 @@ +package sk.ainet.exec.kernel + +import kotlinx.cinterop.ExperimentalForeignApi +import kotlinx.cinterop.addressOf +import kotlinx.cinterop.reinterpret +import kotlinx.cinterop.usePinned +import sk.ainet.backend.api.kernel.Q5KMatmulKernel +import sk.ainet.kernels.cinterop.skainet_q5k_matmul + +/** + * Kotlin/Native implementation of [Q5KMatmulKernel] that calls the hand-written + * C kernel `skainet_q5k_matmul` (the same `q5k_matmul.c` the JVM consumes via + * FFM) through cinterop, linked from the static archive `libskainet_kernels.a`. + * + * This is the board-consumption path: the SL2610 binary is Kotlin/Native, so it + * cannot use the JVM-FFM wrapper. The arrays are pinned and their base pointers + * passed to C; the C side reads `input + input_offset` etc., so no copy is made. + * + * On `linuxArm64` the linked archive carries the NEON paths + * (`-march=armv8.2-a+fp16+dotprod`); on `linuxX64` (this POC host) it's the + * scalar/auto-vectorized build. Correctness is identical across both. + */ +@OptIn(ExperimentalForeignApi::class) +public object NativeKnQ5KMatmulKernel : Q5KMatmulKernel { + + private const val BLOCK_SIZE = 256 + + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "NativeKnQ5KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0 || inputDim == 0) return + + input.usePinned { inPin -> + weight.usePinned { wPin -> + output.usePinned { outPin -> + skainet_q5k_matmul( + inPin.addressOf(0), + inputOffset, + wPin.addressOf(0).reinterpret(), + weightByteOffset, + inputDim, + outputDim, + outPin.addressOf(0), + outputOffset, + ) + } + } + } + } +} diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt new file mode 100644 index 00000000..fceb2b28 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernelParityTest.kt @@ -0,0 +1,70 @@ +package sk.ainet.exec.kernel + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.Test +import kotlin.test.assertTrue + +/** + * Proves the Kotlin/Native cinterop path: [NativeKnQ5KMatmulKernel] (calling the + * C `skainet_q5k_matmul` via cinterop, linked from libskainet_kernels.a) must + * agree with the commonMain [ScalarQ5_KMatmulKernel] reference within FMA + + * `-ffast-math` reassociation tolerance. + * + * This is the host (linuxX64) de-risking of the board (linuxArm64) consumption: + * the cinterop mechanism + kernel correctness are verified here; only the NEON + * codegen differs on aarch64 (board-verify-pending). + */ +class NativeKnQ5KMatmulKernelParityTest { + + private val blockSize = 256 + private val bytesPerBlock = 176 + + private fun randomQ5KBytes(numBlocks: Int, seed: Int): ByteArray { + val rng = Random(seed) + val bytes = ByteArray(numBlocks * bytesPerBlock) + rng.nextBytes(bytes) + for (block in 0 until numBlocks) { + val base = block * bytesPerBlock + // 0x3C00 == 1.0f16 for d and dMin so dequant stays finite. + bytes[base + 0] = 0x00.toByte() + bytes[base + 1] = 0x3C.toByte() + bytes[base + 2] = 0x00.toByte() + bytes[base + 3] = 0x3C.toByte() + } + return bytes + } + + private fun assertParity(inputDim: Int, outputDim: Int, seed: Int, tol: Float) { + val numBlocks = (inputDim / blockSize) * outputDim + val packed = randomQ5KBytes(numBlocks, seed) + val input = FloatArray(inputDim) { Random(seed + it).nextFloat() - 0.5f } + + val refOut = FloatArray(outputDim) + ScalarQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, refOut, 0) + + val knOut = FloatArray(outputDim) + NativeKnQ5KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, knOut, 0) + + for (o in 0 until outputDim) { + val diff = abs(refOut[o] - knOut[o]) + val rel = diff / (abs(refOut[o]) + 1e-9f) + assertTrue( + diff <= tol || rel < 1e-4f, + "row $o diverged: scalar=${refOut[o]} cinterop=${knOut[o]} diff=$diff rel=$rel tol=$tol", + ) + } + } + + @Test + fun single_block_single_row() = assertParity(256, 1, 42, 1e-2f) + + @Test + fun single_block_multi_row() = assertParity(256, 16, 7, 1e-2f) + + @Test + fun multi_block_multi_row() = assertParity(1024, 64, 123, 5e-2f) + + @Test + fun llm_typical_shape() = assertParity(4096, 64, 999, 5e-1f) +} diff --git a/skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def b/skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def new file mode 100644 index 00000000..23c80954 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/src/nativeInterop/cinterop/skainet_kernels.def @@ -0,0 +1,8 @@ +# Kotlin/Native cinterop binding for the hand-written C matmul kernels +# (skainet_kernels.h). Generates Kotlin bindings for skainet_q5k_matmul etc.; +# the static archive libskainet_kernels.a (built by CMake) is linked into the +# consuming K/N binary via linkerOpts (see build.gradle.kts). includeDirs for +# the header are supplied from the Gradle cinterop block. +headers = skainet_kernels.h +headerFilter = skainet_kernels.h +package = sk.ainet.kernels.cinterop From 1b6f7f6acc10e1b79c5b0dd77eb7bbd97dce5e99 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 07:31:56 +0200 Subject: [PATCH 3/4] feat(backend-native-cpu): K/N KernelProvider over the cinterop kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The K/N analogue of the JVM NativeKernelProvider (FFM): a KernelProvider (priority 100) exposing the cinterop-backed Q5_K/Q4_K/Q8_0/Q4_0 matmul kernels, plus installNativeKernels() to register it in KernelRegistry — the path the eager runtime's DefaultCpuOps.chooseQuantizedMatmulHeap uses to resolve a kernel. K/N has no ServiceLoader, so registration is an explicit call by the consumer (scalar fallback for Q6_K etc. is registered separately from skainet-backend-cpu). Verified on linuxX64: NativeKnKernelProviderTest — installNativeKernels makes native-cinterop the best-available provider, its Q5_K kernel is the registry-resolved kernel, and it matches the scalar reference (6 K/N tests green total). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../exec/kernel/NativeKnKernelProvider.kt | 131 ++++++++++++++++++ .../exec/kernel/NativeKnKernelProviderTest.kt | 67 +++++++++ 2 files changed, 198 insertions(+) create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt create mode 100644 skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt new file mode 100644 index 00000000..83f80c36 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt @@ -0,0 +1,131 @@ +package sk.ainet.exec.kernel + +import kotlinx.cinterop.ExperimentalForeignApi +import kotlinx.cinterop.addressOf +import kotlinx.cinterop.reinterpret +import kotlinx.cinterop.usePinned +import sk.ainet.backend.api.kernel.Fp32MatmulKernel +import sk.ainet.backend.api.kernel.KernelProvider +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.backend.api.kernel.Q4KMatmulKernel +import sk.ainet.backend.api.kernel.Q4_0MatmulKernel +import sk.ainet.backend.api.kernel.Q5KMatmulKernel +import sk.ainet.backend.api.kernel.Q8_0MatmulKernel +import sk.ainet.kernels.cinterop.skainet_q4_0_matmul +import sk.ainet.kernels.cinterop.skainet_q4k_matmul +import sk.ainet.kernels.cinterop.skainet_q8_0_matmul + +/** + * Kotlin/Native [KernelProvider] backed by the hand-written C kernels via + * cinterop (static archive `libskainet_kernels.a`) — the K/N analogue of the + * JVM `NativeKernelProvider` (FFM). Priority 100, above the commonMain scalar + * reference (0). On `linuxArm64` the linked archive carries the NEON paths. + * + * **Registration is manual on K/N** (no `ServiceLoader`): a consumer calls + * [installNativeKernels] once at startup. [Q5KMatmulKernel] (the FunctionGemma + * Q5_K_M hot path) plus Q4_K / Q8_0 / Q4_0 are wired; the rest cascade to the + * scalar provider. + */ +@OptIn(ExperimentalForeignApi::class) +public object NativeKnKernelProvider : KernelProvider { + override val name: String = "native-cinterop" + override val priority: Int = 100 + + // Statically linked — the symbols are always present once the binary links + // libskainet_kernels.a, so the provider is unconditionally available. + override fun isAvailable(): Boolean = true + + // Abstract on KernelProvider (no default) — no native FP32 SGEMM wrapper yet. + override fun matmulFp32(): Fp32MatmulKernel? = null + + override fun matmulQ5K(): Q5KMatmulKernel = NativeKnQ5KMatmulKernel + override fun matmulQ4K(): Q4KMatmulKernel = NativeKnQ4KMatmulKernel + override fun matmulQ8_0(): Q8_0MatmulKernel = NativeKnQ8_0MatmulKernel + override fun matmulQ4_0(): Q4_0MatmulKernel = NativeKnQ4_0MatmulKernel +} + +/** + * Register [NativeKnKernelProvider] in the process-wide [KernelRegistry]. Idempotent + * (re-registering the same instance is a no-op). Call once at startup before any + * `ops.matmul` on quantized weights. + * + * For quant types without a C kernel (e.g. Q6_K) also register the commonMain + * `ScalarKernelProvider` (from `skainet-backend-cpu`) as the fallback — it lives + * in a different module, so the consumer wires it: + * `KernelRegistry.register(ScalarKernelProvider)`. + */ +public fun installNativeKernels() { + KernelRegistry.register(NativeKnKernelProvider) +} + +@OptIn(ExperimentalForeignApi::class) +public object NativeKnQ4KMatmulKernel : Q4KMatmulKernel { + private const val BLOCK_SIZE = 256 + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "NativeKnQ4KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0 || inputDim == 0) return + input.usePinned { i -> weight.usePinned { w -> output.usePinned { o -> + skainet_q4k_matmul( + i.addressOf(0), inputOffset, + w.addressOf(0).reinterpret(), weightByteOffset, + inputDim, outputDim, + o.addressOf(0), outputOffset, + ) + } } } + } +} + +@OptIn(ExperimentalForeignApi::class) +public object NativeKnQ8_0MatmulKernel : Q8_0MatmulKernel { + private const val BLOCK_SIZE = 32 + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "NativeKnQ8_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0 || inputDim == 0) return + input.usePinned { i -> weight.usePinned { w -> output.usePinned { o -> + skainet_q8_0_matmul( + i.addressOf(0), inputOffset, + w.addressOf(0).reinterpret(), weightByteOffset, + inputDim, outputDim, + o.addressOf(0), outputOffset, + ) + } } } + } +} + +@OptIn(ExperimentalForeignApi::class) +public object NativeKnQ4_0MatmulKernel : Q4_0MatmulKernel { + private const val BLOCK_SIZE = 32 + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "NativeKnQ4_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0 || inputDim == 0) return + input.usePinned { i -> weight.usePinned { w -> output.usePinned { o -> + skainet_q4_0_matmul( + i.addressOf(0), inputOffset, + w.addressOf(0).reinterpret(), weightByteOffset, + inputDim, outputDim, + o.addressOf(0), outputOffset, + ) + } } } + } +} diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt new file mode 100644 index 00000000..99e7c614 --- /dev/null +++ b/skainet-backends/skainet-backend-native-cpu/src/linuxX64Test/kotlin/sk/ainet/exec/kernel/NativeKnKernelProviderTest.kt @@ -0,0 +1,67 @@ +package sk.ainet.exec.kernel + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.AfterTest +import kotlin.test.BeforeTest +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNotNull +import kotlin.test.assertSame +import kotlin.test.assertTrue +import sk.ainet.backend.api.kernel.KernelRegistry + +/** + * Verifies the K/N kernel provider integrates with [KernelRegistry] the way the + * eager runtime's `DefaultCpuOps.chooseQuantizedMatmulHeap` resolves kernels: + * after [installNativeKernels], the highest-priority available provider is the + * cinterop one, and its Q5_K kernel is the registry-resolved kernel that runs. + */ +class NativeKnKernelProviderTest { + + @BeforeTest + fun clean() = KernelRegistry.clearForTesting() + + @AfterTest + fun reset() = KernelRegistry.clearForTesting() + + @Test + fun installs_and_resolves_native_quant_kernels() { + installNativeKernels() + + // Priority 100 cinterop beats the scalar (0) fallback. + assertEquals("native-cinterop", KernelRegistry.bestAvailable()?.name) + + val provider = KernelRegistry.providers().firstOrNull { it.isAvailable() && it.matmulQ5K() != null } + assertNotNull(provider, "no available provider carries a Q5_K kernel") + assertSame(NativeKnQ5KMatmulKernel, provider.matmulQ5K()) + assertTrue(provider.supports("matmul", listOf("Float32", "Q5_K"))) + } + + @Test + fun registry_resolved_q5k_kernel_is_correct() { + installNativeKernels() + val kernel = KernelRegistry.bestAvailable()!!.matmulQ5K()!! + + val inputDim = 1024 + val outputDim = 64 + val numBlocks = (inputDim / 256) * outputDim + val packed = ByteArray(numBlocks * 176).also { Random(5).nextBytes(it) } + for (b in 0 until numBlocks) { + val base = b * 176 + packed[base] = 0x00; packed[base + 1] = 0x3C // d = 1.0f16 + packed[base + 2] = 0x00; packed[base + 3] = 0x3C // dMin = 1.0f16 + } + val input = FloatArray(inputDim) { Random(it + 1).nextFloat() - 0.5f } + + val ref = FloatArray(outputDim) + ScalarQ5_KMatmulKernel.matmul(input, 0, packed, 0, inputDim, outputDim, ref, 0) + val got = FloatArray(outputDim) + kernel.matmul(input, 0, packed, 0, inputDim, outputDim, got, 0) + + for (o in 0 until outputDim) { + val diff = abs(ref[o] - got[o]) + assertTrue(diff <= 5e-2f || diff / (abs(ref[o]) + 1e-9f) < 1e-4f, "row $o: ref=${ref[o]} got=${got[o]}") + } + } +} From 587d59b0798847480dc51ec19b7417f9b2db02c5 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 10:13:40 +0200 Subject: [PATCH 4/4] feat(backend-native-cpu): add linuxArm64 (board) K/N target Promote the K/N cinterop path from the linuxX64 POC to the real board target: - linuxArm64 target with the same skainet_kernels cinterop; links the aarch64 cross-built static archive (cmake-build-arm64/libskainet_kernels.a, NEON). - Shared `nativeMain` source set holds NativeKn*MatmulKernel + the provider, so linuxX64 and linuxArm64 share one implementation (cinterop bindings are commonized across both targets). - linuxArm64 link tasks depend on the aarch64 cross-build only under -PcrossArm64 (toolchain present); a plain host build still compiles linuxArm64 to a klib. Verified on host: compileKotlinLinuxArm64 + cinteropSkainetKernelsLinuxArm64 succeed (cross-compiled from x86); linuxX64Test still green (6 tests) on the shared nativeMain. Final aarch64 binary link + NEON runtime are board-verify-pending. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../build.gradle.kts | 42 ++++++++++++++----- .../exec/kernel/NativeKnKernelProvider.kt | 0 .../exec/kernel/NativeKnQ5KMatmulKernel.kt | 0 3 files changed, 31 insertions(+), 11 deletions(-) rename skainet-backends/skainet-backend-native-cpu/src/{linuxX64Main => nativeMain}/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt (100%) rename skainet-backends/skainet-backend-native-cpu/src/{linuxX64Main => nativeMain}/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt (100%) diff --git a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts index e5aa22f7..7c5e95e0 100644 --- a/skainet-backends/skainet-backend-native-cpu/build.gradle.kts +++ b/skainet-backends/skainet-backend-native-cpu/build.gradle.kts @@ -7,25 +7,29 @@ plugins { val nativeIncludeDir: String = layout.projectDirectory.dir("native/include").asFile.absolutePath val staticArchivePath: String = layout.buildDirectory.file("native/cmake-build/libskainet_kernels.a").get().asFile.absolutePath +// aarch64 cross-built static archive (produced by buildNativeKernelsArm64 with +// -PcrossArm64; carries the NEON paths). Linked into linuxArm64 binaries. +val staticArchiveArm64Path: String = + layout.buildDirectory.file("native/cmake-build-arm64/libskainet_kernels.a").get().asFile.absolutePath kotlin { explicitApi() jvm() - // Kotlin/Native: POC on the host (linuxX64); linuxArm64 is the board target. - // Exposes the hand-written C/NEON kernels to K/N via cinterop to the static - // archive libskainet_kernels.a (CMake `skainet_kernels_static`). This is the - // board-consumption path — the JVM consumes the same kernels via FFM instead. - linuxX64 { + // Kotlin/Native consumption of the hand-written C/NEON kernels via cinterop + // to the static archive libskainet_kernels.a (CMake `skainet_kernels_static`). + // linuxX64 = host (POC / CI-runnable); linuxArm64 = the SL2610 board target + // (its archive is the aarch64 cross-build with NEON). The JVM consumes the + // same kernels via FFM instead. Shared K/N code lives in `nativeMain`. + fun org.jetbrains.kotlin.gradle.plugin.mpp.KotlinNativeTarget.wireSkainetKernels(archive: String) { compilations.getByName("main").cinterops.create("skainetKernels") { defFile(project.file("src/nativeInterop/cinterop/skainet_kernels.def")) includeDirs(nativeIncludeDir) } - binaries.all { - // Link the static C archive into every linuxX64 binary (incl. tests). - linkerOpts(staticArchivePath) - } + binaries.all { linkerOpts(archive) } } + linuxX64 { wireSkainetKernels(staticArchivePath) } + linuxArm64 { wireSkainetKernels(staticArchiveArm64Path) } sourceSets { val jvmMain by getting { @@ -44,11 +48,17 @@ kotlin { implementation(libs.kotlinx.coroutines) } } - val linuxX64Main by getting { + // Shared K/N kernels (NativeKn*MatmulKernel + provider), consumed by both + // linuxX64 and linuxArm64. The cinterop bindings are commonized across the + // two targets so this source set can reference sk.ainet.kernels.cinterop. + val nativeMain by creating { + dependsOn(commonMain.get()) dependencies { implementation(project(":skainet-backends:skainet-backend-api")) } } + val linuxX64Main by getting { dependsOn(nativeMain) } + val linuxArm64Main by getting { dependsOn(nativeMain) } val linuxX64Test by getting { dependencies { implementation(libs.kotlin.test) @@ -122,7 +132,7 @@ val buildNativeKernels by tasks.registering(Exec::class) { } // The linuxX64 (K/N) binaries link libskainet_kernels.a (built by CMake into -// cmakeBuildPath), so the static archive must exist before the K/N link step. +// cmakeBuildPath), so the host static archive must exist before the K/N link. tasks.matching { it.name.startsWith("link") && it.name.endsWith("LinuxX64") }.configureEach { dependsOn(buildNativeKernels) } @@ -211,6 +221,16 @@ tasks.named("jvmProcessResources") { if (crossArm64Enabled) dependsOn(packageNativeKernelsArm64) } +// linuxArm64 binaries link the aarch64 cross-built archive. Only wired with +// -PcrossArm64 (cross toolchain present): a plain host build still compiles +// linuxArm64 to a klib (no archive needed) — only a final binary/test link +// needs it, which is a board/CI concern. +if (crossArm64Enabled) { + tasks.matching { it.name.startsWith("link") && it.name.endsWith("LinuxArm64") }.configureEach { + dependsOn(buildNativeKernelsArm64) + } +} + // Forward `-Dskainet.runBench=true` from Gradle CLI to the forked test // JVM so Q4KMatmulMicrobenchTest activates. Skipped silently otherwise. val runBenchProperty = providers.systemProperty("skainet.runBench") diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt b/skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt similarity index 100% rename from skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt rename to skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnKernelProvider.kt diff --git a/skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt b/skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt similarity index 100% rename from skainet-backends/skainet-backend-native-cpu/src/linuxX64Main/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt rename to skainet-backends/skainet-backend-native-cpu/src/nativeMain/kotlin/sk/ainet/exec/kernel/NativeKnQ5KMatmulKernel.kt