diff --git a/README.md b/README.md index 1b1ddc08..5f209c0e 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ SKaiNET is a modular ecosystem. While this repository contains the core engine, |---|---| | Examples and sample projects | [SKaiNET-examples](https://github.com/SKaiNET-developers/SKaiNET-examples) | | Interactive notebooks | [SKaiNET-notebook](https://github.com/SKaiNET-developers/SKaiNET-notebook) | +| Eager backends & kernels (what runs where) | [Backends & kernels mindmap](docs/eager-execution-backends-and-kernels.md) | --- diff --git a/docs/eager-execution-backends-and-kernels.md b/docs/eager-execution-backends-and-kernels.md new file mode 100644 index 00000000..0544dadf --- /dev/null +++ b/docs/eager-execution-backends-and-kernels.md @@ -0,0 +1,92 @@ +# Eager execution: backends & kernels + +A map of SKaiNET's **eager** compute path — the `TensorOps` backend and its pluggable +matmul **kernel providers** — showing what exists today (✅), what's in progress (🚧), and +what's missing (❌). The eager path is `DirectCpuExecutionContext → DefaultCpuOps* → +KernelRegistry → KernelProvider`, distinct from the StableHLO/IREE export path. + +Legend: ✅ available · 🚧 partial / works via a legacy path · ❌ missing. 
+ +```mermaid +mindmap + root((SKaiNET eager execution)) + CPU backend + Scalar floor ✅ + commonMain — all KMP targets + FP32 ✅ + BF16 ✅ + Q8_0 ✅ + Q4_0 ✅ + Q4_K ✅ new + Q6_K ✅ new + Q5_1 ✅ new + Q5_0 ✅ new + Panama Vector ✅ + JVM SIMD — jdk.incubator.vector + FP32 BF16 Q8_0 Q4_0 ✅ + Q4_K ✅ + Q5_1 Q5_0 ✅ new + Q6_K 🚧 legacy SIMD path + Native FFM ✅ + JVM only — C kernels via CMake + FP32 BF16 Q8_0 Q4_0 Q4_K ✅ + Q4_K MemSeg zero-copy ✅ + Q5_1 Q5_0 Q6_K ❌ + Apple Accelerate ✅ + Native macOS iOS — cinterop + dense FP32 matmul ✅ + elementwise reductions ✅ + packed quant via scalar + Platforms + JVM ✅ scalar + Panama + FFM + Native linux ✅ scalar only + Native apple ✅ scalar + Accelerate + JS and WASM ✅ scalar only + Gaps and roadmap + Native FFM Q5 and Q6_K ❌ issue 708 + Native SIMD on linux ❌ + Panama SPI Q6_K kernel 🚧 + Q5_K Q2_K Q3_K IQ4 packed ❌ dequant only + GPU backends IREE Metal ❌ future +``` + +## Kernel × provider (matmul, FP32 activations) + +| Weight format | Scalar (all targets) | Panama Vector (JVM SIMD) | Native FFM (JVM) | +|---|:--:|:--:|:--:| +| FP32 | ✅ | ✅ | ✅ | +| BF16 | ✅ | ✅ | ✅ | +| Q8_0 | ✅ | ✅ | ✅ | +| Q4_0 | ✅ | ✅ | ✅ | +| Q4_K | ✅ | ✅ | ✅ | +| Q6_K | ✅ | 🚧 legacy `JvmQuantizedVectorKernels` (no SPI kernel) | ❌ | +| Q5_1 | ✅ | ✅ | ❌ | +| Q5_0 | ✅ | ✅ | ❌ | +| Q5_K / Q2_K / Q3_K / Q8_K / IQ4 | ❌ (dequant-to-FP32 only) | ❌ | ❌ | + +Resolution is by priority: **Native FFM (100) → Panama (50) → Scalar (0)** — the best +*available* provider that carries the kernel wins; otherwise it cascades down. 
+ +## Platform × what runs + +| Target | Providers available | Notes | +|---|---|---| +| **JVM / Android(JVM)** | Scalar + Panama + Native-FFM | full SIMD/native acceleration | +| **Kotlin/Native — linux x64/arm64** | Scalar | no SIMD yet (scalar floor) | +| **Kotlin/Native — macOS/iOS** | Scalar + Apple Accelerate | Accelerate accelerates *dense* FP32; packed-quant via scalar | +| **JS / WASM (Js, Wasi)** | Scalar | no SIMD | + +**Packed-quant matmul now works on every target** (Q4_K/Q6_K/Q5_1/Q5_0 gained a commonMain +scalar kernel, and `DefaultCpuOpsBase` dispatches packed weights via the registry). Before, +those formats were JVM-only and broke on Native. + +## In progress / missing (with trackers) + +- 🚧 **Q6_K Panama SPI kernel** — Q6_K is SIMD on JVM via the legacy `JvmQuantizedVectorKernels.matmulQ6_KVec`, but has no `PanamaVectorQ6KMatmulKernel`/`KernelProvider.matmulQ6K()` SPI entry yet. +- ❌ **Native FFM Q5_1/Q5_0/Q6_K** — the C kernel set covers FP32/BF16/Q8_0/Q4_0/Q4_K only. Tracked by **SKaiNET#708** (core kernel) and **SKaiNET-transformers#170** (converter wiring). +- ❌ **Native SIMD on linux** — Kotlin/Native linux targets run the scalar floor; no cinterop/OpenBLAS or SIMD path. (Apple has Accelerate for dense ops.) +- ❌ **Other GGML quant formats** (Q5_K, Q2_K, Q3_K, Q8_K, IQ4_NL/XS) — loadable via dequant-to-FP32, but no packed matmul kernel. +- ❌ **Non-CPU eager backends** (IREE, Metal, GPU) — the `KernelProvider` SPI anticipates them, but none are implemented for the eager path today. + +> This overview is hand-authored. A machine-generated kernel × platform matrix +> (derived from the registered `KernelProvider`s) is a planned follow-up so this stays in sync. 
diff --git a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api
index 539540e1..add035f2 100644
--- a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api
+++ b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api
@@ -92,6 +92,16 @@ public final class sk/ainet/exec/kernel/PanamaVectorQ4_0MatmulKernel : sk/ainet/
 	public fun matmul ([FI[BIII[FI)V
 }
 
+public final class sk/ainet/exec/kernel/PanamaVectorQ5_0MatmulKernel : sk/ainet/backend/api/kernel/Q5_0MatmulKernel {
+	public static final field INSTANCE Lsk/ainet/exec/kernel/PanamaVectorQ5_0MatmulKernel;
+	public fun matmul ([FI[BIII[FI)V
+}
+
+public final class sk/ainet/exec/kernel/PanamaVectorQ5_1MatmulKernel : sk/ainet/backend/api/kernel/Q5_1MatmulKernel {
+	public static final field INSTANCE Lsk/ainet/exec/kernel/PanamaVectorQ5_1MatmulKernel;
+	public fun matmul ([FI[BIII[FI)V
+}
+
 public final class sk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel : sk/ainet/backend/api/kernel/Q8_0MatmulKernel {
 	public static final field INSTANCE Lsk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel;
 	public fun matmul ([FI[BIII[FI)V
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt
index ecc68cf5..99ec4eb0 100644
--- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt
+++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt
@@ -5,6 +5,8 @@ import sk.ainet.backend.api.kernel.Fp32MatmulKernel
 import sk.ainet.backend.api.kernel.KernelProvider
 import sk.ainet.backend.api.kernel.Q4KMatmulKernel
 import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
+import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
 import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
 import sk.ainet.exec.tensor.ops.JvmCpuBackendConfig
 
@@ -53,6 +55,12 @@ public object PanamaVectorKernelProvider : KernelProvider {
     override fun matmulQ4_0(): Q4_0MatmulKernel? =
         if (isAvailable()) PanamaVectorQ4_0MatmulKernel else null
 
+    override fun matmulQ5_1(): Q5_1MatmulKernel? =
+        if (isAvailable()) PanamaVectorQ5_1MatmulKernel else null
+
+    override fun matmulQ5_0(): Q5_0MatmulKernel? =
+        if (isAvailable()) PanamaVectorQ5_0MatmulKernel else null
+
     private fun isVectorApiClassLoaded(): Boolean = runCatching {
         Class.forName("jdk.incubator.vector.FloatVector")
         Class.forName("jdk.incubator.vector.VectorSpecies")
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_0MatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_0MatmulKernel.kt
new file mode 100644
index 00000000..5d301bf1
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_0MatmulKernel.kt
@@ -0,0 +1,93 @@
+package sk.ainet.exec.kernel
+
+import jdk.incubator.vector.FloatVector
+import jdk.incubator.vector.VectorOperators
+import jdk.incubator.vector.VectorSpecies
+import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
+
+/**
+ * SIMD-vectorized FP32 × Q5_0 matmul on the JDK Vector API (scratch-dequant then FMA).
+ * Dequant `d*(code + (highBit shl 4) - 16)` (symmetric, no per-block min). Numerically
+ * equivalent to [ScalarQ5_0MatmulKernel] within FMA + reordered-reduction tolerance.
+ * Block-major weight layout `(blockIdx*outputDim+o)*22`; each 22-byte block holds `d`
+ * (FP16, little-endian), 4 bytes of high bits (`qh`) and 16 bytes of packed nibbles.
+ */
+public object PanamaVectorQ5_0MatmulKernel : Q5_0MatmulKernel {
+
+    private const val BLOCK_SIZE = 32
+    private const val BYTES_PER_BLOCK = 22
+    private val floatSpecies: VectorSpecies<Float> = FloatVector.SPECIES_PREFERRED
+
+    /**
+     * Writes `outputDim` dot products of the `inputDim`-element input window (starting at
+     * `inputOffset`) with the dequantized weights into `output`, starting at `outputOffset`.
+     *
+     * @throws IllegalArgumentException if `inputDim` is not a multiple of 32.
+     */
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "PanamaVectorQ5_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0) return
+        if (inputDim == 0) { for (o in 0 until outputDim) output[outputOffset + o] = 0f; return }
+        val blocksPerInputDim = inputDim / BLOCK_SIZE
+        val step = floatSpecies.length()
+        val loopBound = floatSpecies.loopBound(BLOCK_SIZE)
+        val codeBuf = FloatArray(BLOCK_SIZE)
+
+        for (o in 0 until outputDim) {
+            var acc = 0f
+            for (blockIdx in 0 until blocksPerInputDim) {
+                val base = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK
+                val d = halfToFloat(((weight[base + 1].toInt() and 0xFF) shl 8) or (weight[base].toInt() and 0xFF))
+                val qh0 = weight[base + 2].toInt() and 0xFF
+                val qh1 = weight[base + 3].toInt() and 0xFF
+                val qh2 = weight[base + 4].toInt() and 0xFF
+                val qh3 = weight[base + 5].toInt() and 0xFF
+                val qsBase = base + 6
+                // Split-nibble layout: byte j carries element j (low nibble) and element 16 + j
+                // (high nibble); their fifth bits are bits j and 16 + j of the 4-byte qh field.
+                for (j in 0 until 16) {
+                    val q = weight[qsBase + j].toInt() and 0xFF
+                    val bitLo = ((if (j < 8) qh0 else qh1) ushr (j and 7)) and 1
+                    val bitHi = ((if (j < 8) qh2 else qh3) ushr (j and 7)) and 1
+                    codeBuf[j] = d * ((q and 0x0F) + (bitLo shl 4) - 16)
+                    codeBuf[16 + j] = d * ((q ushr 4) + (bitHi shl 4) - 16)
+                }
+                val inputBase = inputOffset + blockIdx * BLOCK_SIZE
+                var accVec = FloatVector.zero(floatSpecies)
+                var k = 0
+                while (k < loopBound) {
+                    accVec = FloatVector.fromArray(floatSpecies, input, inputBase + k)
+                        .fma(FloatVector.fromArray(floatSpecies, codeBuf, k), accVec)
+                    k += step
+                }
+                acc += accVec.reduceLanes(VectorOperators.ADD)
+                // Scalar tail; does no work when the species length divides BLOCK_SIZE.
+                while (k < BLOCK_SIZE) { acc += input[inputBase + k] * codeBuf[k]; k++ }
+            }
+            output[outputOffset + o] = acc
+        }
+    }
+
+    /** FP16 bit pattern in the low 16 bits of [hbits] → FP32; handles zero, subnormals, infinity and NaN. */
+    private fun halfToFloat(hbits: Int): Float {
+        val sign = (hbits and 0x8000) shl 16
+        val exp = (hbits and 0x7C00) shr 10
+        val mant = hbits and 0x03FF
+        return when (exp) {
+            0 -> if (mant == 0) Float.fromBits(sign) else {
+                var m = mant; var e = -14
+                while ((m and 0x400) == 0) { m = m shl 1; e-- }
+                Float.fromBits(sign or ((e + 127) shl 23) or ((m and 0x3FF) shl 13))
+            }
+            31 -> Float.fromBits(sign or (0xFF shl 23) or (mant shl 13))
+            else -> Float.fromBits(sign or ((exp - 15 + 127) shl 23) or (mant shl 13))
+        }
+    }
+}
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_1MatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_1MatmulKernel.kt
new file mode 100644
index 00000000..a1c22a08
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5_1MatmulKernel.kt
@@ -0,0 +1,94 @@
+package sk.ainet.exec.kernel
+
+import jdk.incubator.vector.FloatVector
+import jdk.incubator.vector.VectorOperators
+import jdk.incubator.vector.VectorSpecies
+import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
+
+/**
+ * SIMD-vectorized FP32 × Q5_1 matmul on the JDK Vector API. Per 32-element block:
+ * decode `d`/`m`/`qh`, dequant the 32 codes (`d*(code + (highBit shl 4)) + m`, split
+ * nibble layout) into a reusable scratch buffer, then SIMD-FMA against the matching
+ * input window. Numerically equivalent to [ScalarQ5_1MatmulKernel] within FMA +
+ * reordered-reduction tolerance. Block-major weight layout `(blockIdx*outputDim+o)*24`.
+ */
+public object PanamaVectorQ5_1MatmulKernel : Q5_1MatmulKernel {
+
+    private const val BLOCK_SIZE = 32
+    private const val BYTES_PER_BLOCK = 24
+    private val floatSpecies: VectorSpecies<Float> = FloatVector.SPECIES_PREFERRED
+
+    /**
+     * Writes `outputDim` dot products of the `inputDim`-element input window (starting at
+     * `inputOffset`) with the dequantized weights into `output`, starting at `outputOffset`.
+     *
+     * @throws IllegalArgumentException if `inputDim` is not a multiple of 32.
+     */
+    override fun matmul(
+        input: FloatArray, inputOffset: Int,
+        weight: ByteArray, weightByteOffset: Int,
+        inputDim: Int, outputDim: Int,
+        output: FloatArray, outputOffset: Int,
+    ) {
+        require(inputDim % BLOCK_SIZE == 0) {
+            "PanamaVectorQ5_1MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
+        }
+        if (outputDim == 0) return
+        if (inputDim == 0) { for (o in 0 until outputDim) output[outputOffset + o] = 0f; return }
+        val blocksPerInputDim = inputDim / BLOCK_SIZE
+        val step = floatSpecies.length()
+        val loopBound = floatSpecies.loopBound(BLOCK_SIZE)
+        val codeBuf = FloatArray(BLOCK_SIZE)
+
+        for (o in 0 until outputDim) {
+            var acc = 0f
+            for (blockIdx in 0 until blocksPerInputDim) {
+                val base = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK
+                val d = halfToFloat(((weight[base + 1].toInt() and 0xFF) shl 8) or (weight[base].toInt() and 0xFF))
+                val m = halfToFloat(((weight[base + 3].toInt() and 0xFF) shl 8) or (weight[base + 2].toInt() and 0xFF))
+                val qh0 = weight[base + 4].toInt() and 0xFF
+                val qh1 = weight[base + 5].toInt() and 0xFF
+                val qh2 = weight[base + 6].toInt() and 0xFF
+                val qh3 = weight[base + 7].toInt() and 0xFF
+                val qsBase = base + 8
+                // Split-nibble layout: byte j carries element j (low nibble) and element 16 + j
+                // (high nibble); their fifth bits are bits j and 16 + j of the 4-byte qh field.
+                for (j in 0 until 16) {
+                    val q = weight[qsBase + j].toInt() and 0xFF
+                    val bitLo = ((if (j < 8) qh0 else qh1) ushr (j and 7)) and 1
+                    val bitHi = ((if (j < 8) qh2 else qh3) ushr (j and 7)) and 1
+                    codeBuf[j] = d * ((q and 0x0F) + (bitLo shl 4)) + m
+                    codeBuf[16 + j] = d * ((q ushr 4) + (bitHi shl 4)) + m
+                }
+                val inputBase = inputOffset + blockIdx * BLOCK_SIZE
+                var accVec = FloatVector.zero(floatSpecies)
+                var k = 0
+                while (k < loopBound) {
+                    accVec = FloatVector.fromArray(floatSpecies, input, inputBase + k)
+                        .fma(FloatVector.fromArray(floatSpecies, codeBuf, k), accVec)
+                    k += step
+                }
+                acc += accVec.reduceLanes(VectorOperators.ADD)
+                // Scalar tail; does no work when the species length divides BLOCK_SIZE.
+                while (k < BLOCK_SIZE) { acc += input[inputBase + k] * codeBuf[k]; k++ }
+            }
+            output[outputOffset + o] = acc
+        }
+    }
+
+    /** Same FP16 → FP32 conversion as [ScalarQ5_1MatmulKernel]. */
+    private fun halfToFloat(hbits: Int): Float {
+        val sign = (hbits and 0x8000) shl 16
+        val exp = (hbits and 0x7C00) shr 10
+        val mant = hbits and 0x03FF
+        return when (exp) {
+            0 -> if (mant == 0) Float.fromBits(sign) else {
+                var m = mant; var e = -14
+                while ((m and 0x400) == 0) { m = m shl 1; e-- }
+                Float.fromBits(sign or ((e + 127) shl 23) or ((m and 0x3FF) shl 13))
+            }
+            31 -> Float.fromBits(sign or (0xFF shl 23) or (mant shl 13))
+            else -> Float.fromBits(sign or ((exp - 15 + 127) shl 23) or (mant shl 13))
+        }
+    }
+}
diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5ParityTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5ParityTest.kt
new file mode 100644
index 00000000..0ab2e6ac
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ5ParityTest.kt
@@ -0,0 +1,58 @@
+package sk.ainet.exec.kernel
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertTrue
+
+/** Panama SIMD Q5_1/Q5_0 kernels must match the scalar reference within FMA tolerance. */
+class PanamaVectorQ5ParityTest {
+
+    /** Truncating FP32 → FP16 bits: underflow flushes to signed zero, overflow becomes infinity. */
+    private fun half(v: Float): Int {
+        val b = v.toRawBits(); val s = (b ushr 16) and 0x8000
+        val e = ((b ushr 23) and 0xFF) - 127 + 15; val m = b and 0x7FFFFF
+        if (e <= 0) return s; if (e >= 31) return s or 0x7C00
+        return s or (e shl 10) or (m ushr 13)
+    }
+
+    /** Block-major packed bytes with VALID (finite) f16 scales; random qh/qs codes. */
+    private fun bytes(bpb: Int, inDim: Int, outDim: Int, rng: Random): ByteArray {
+        val out = ByteArray(outDim * (inDim / 32) * bpb)
+        var off = 0
+        while (off < out.size) {
+            val d = half(rng.nextFloat() * 0.05f + 0.01f)
+            out[off] = (d and 0xFF).toByte(); out[off + 1] = ((d ushr 8) and 0xFF).toByte()
+            var codeStart = off + 2
+            if (bpb == 24) { // Q5_1 has a per-block min `m`
+                val m = half(rng.nextFloat() - 0.5f)
+                out[off + 2] = (m and 0xFF).toByte(); out[off + 3] = ((m ushr 8) and 0xFF).toByte()
+                codeStart = off + 4
+            }
+            for (k in codeStart until off + bpb) out[k] = rng.nextInt(256).toByte()
+            off += bpb
+        }
+        return out
+    }
+
+    /** Runs the scalar and Panama kernels on the same seeded data and bounds their largest difference. */
+    private fun check(q5_1: Boolean, inDim: Int, outDim: Int, seed: Int) {
+        val rng = Random(seed)
+        val w = bytes(if (q5_1) 24 else 22, inDim, outDim, rng)
+        val input = FloatArray(inDim) { rng.nextFloat() - 0.5f }
+        val a = FloatArray(outDim); val b = FloatArray(outDim)
+        if (q5_1) {
+            ScalarQ5_1MatmulKernel.matmul(input, 0, w, 0, inDim, outDim, a, 0)
+            PanamaVectorQ5_1MatmulKernel.matmul(input, 0, w, 0, inDim, outDim, b, 0)
+        } else {
+            ScalarQ5_0MatmulKernel.matmul(input, 0, w, 0, inDim, outDim, a, 0)
+            PanamaVectorQ5_0MatmulKernel.matmul(input, 0, w, 0, inDim, outDim, b, 0)
+        }
+        var maxErr = 0f; var maxAbs = 1f
+        for (o in 0 until outDim) { maxErr = maxOf(maxErr, abs(a[o] - b[o])); maxAbs = maxOf(maxAbs, abs(a[o])) }
+        assertTrue(maxErr < 1e-4f * maxAbs + 1e-4f, "${if (q5_1) "Q5_1" else "Q5_0"} Panama≠Scalar: maxErr=$maxErr (maxAbs=$maxAbs)")
+    }
+
+    @Test fun q5_1_panama_matches_scalar() = check(true, 256, 64, 1)
+    @Test fun q5_0_panama_matches_scalar() = check(false, 256, 48, 2)
+}