From 1cc8a866b23f3ebf24ff3598ad179b21f57d3ccc Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Mon, 8 Jun 2026 11:07:22 +0200 Subject: [PATCH] feat(backend): packed-quant matmul dispatch in DefaultCpuOpsBase (works on Native) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of #708. Makes `ops.matmul(x, ops.transpose(W))` route packed-quant weights to a kernel on EVERY KMP target. Before this, the packed-quant matmul dispatch + lazy transpose lived only in DefaultCpuOpsJvm, so on Kotlin/Native/JS/WASM a packed weight fell through to matmulGeneric, which throws on Byte-packed data — packed matmul was effectively broken off-JVM. - New Q5_1/Q5_0 packed tensor-data types + TensorEncoding.Q5_0/Q5_1 (lang-core). - DefaultCpuOpsBase: `chooseQuantizedMatmulHeap` resolves the kernel via the commonMain KernelRegistry (scalar floor on Native/JS/WASM; Panama/FFM on JVM via the ensureKernelProviders() hook + ServiceLoader) and dispatches FP32 × packed {Q8_0,Q4_0,Q4_K,Q6_K,Q5_1,Q5_0}; lazy-transpose shape-swap branches for the four heap K/Q5 types. The JVM ops keep their MemSeg/SIMD fast paths and intercept Q4_K/Q6_K/Q8_0/Q4_0 before the base — zero JVM regression by construction; Q5_1/Q5_0 (and the whole set on non-JVM) resolve in the base. - Non-JVM platform factories (linux/apple/js/wasm/wasmWasi/android) register ScalarKernelProvider (no ServiceLoader off-JVM). Tests: PackedMatmulDispatchTest (commonTest) runs Q4_K + Q5_1 through ctx.ops.matmul(x, transpose(W)) and matches the dequant reference — green on jvmTest AND linuxX64Test (the Native end-to-end proof). Full backend-cpu jvmTest suite passes (no regression); apiDump regenerated for lang-core + backend-cpu. 
Co-Authored-By: Claude Opus 4.8 --- .../api/jvm/skainet-backend-cpu.api | 2 + .../ops/PlatformCpuOpsFactory.android.kt | 10 +- .../tensor/ops/PlatformCpuOpsFactory.apple.kt | 5 + .../sk/ainet/exec/tensor/ops/DefaultCpuOps.kt | 89 +++++++++++++++ .../tensor/ops/PackedMatmulDispatchTest.kt | 97 ++++++++++++++++ .../tensor/ops/PlatformCpuOpsFactory.js.kt | 9 +- .../ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt | 12 ++ .../tensor/ops/PlatformCpuOpsFactory.linux.kt | 10 +- .../tensor/ops/PlatformCpuOpsFactory.wasm.kt | 10 +- .../ops/PlatformCpuOpsFactory.wasmWasi.kt | 10 +- .../api/jvm/skainet-lang-core.api | 104 ++++++++++++++++++ .../ainet/lang/tensor/data/Q5_0TensorData.kt | 94 ++++++++++++++++ .../ainet/lang/tensor/data/Q5_1TensorData.kt | 100 +++++++++++++++++ .../lang/tensor/storage/TensorEncoding.kt | 24 ++++ 14 files changed, 566 insertions(+), 10 deletions(-) create mode 100644 skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt create mode 100644 skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt create mode 100644 skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt diff --git a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api index c9a8b256..539540e1 100644 --- a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api +++ b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api @@ -180,6 +180,7 @@ public class sk/ainet/exec/tensor/ops/DefaultCpuOpsBase : sk/ainet/lang/tensor/o public fun addScalar (Lsk/ainet/lang/tensor/Tensor;Ljava/lang/Number;)Lsk/ainet/lang/tensor/Tensor; public fun avgPool2d (Lsk/ainet/lang/tensor/Tensor;Lkotlin/Pair;Lkotlin/Pair;Lkotlin/Pair;Z)Lsk/ainet/lang/tensor/Tensor; protected final fun broadcastShapes 
(Lsk/ainet/lang/tensor/Shape;Lsk/ainet/lang/tensor/Shape;)Lsk/ainet/lang/tensor/Shape; + protected final fun chooseQuantizedMatmulHeap (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; public fun clamp (Lsk/ainet/lang/tensor/Tensor;FF)Lsk/ainet/lang/tensor/Tensor; public fun concat (Ljava/util/List;I)Lsk/ainet/lang/tensor/Tensor; public fun conv1d (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;IIII)Lsk/ainet/lang/tensor/Tensor; @@ -192,6 +193,7 @@ public class sk/ainet/exec/tensor/ops/DefaultCpuOpsBase : sk/ainet/lang/tensor/o public fun divide (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; protected final fun elementwise (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;Lkotlin/jvm/functions/Function3;)Lsk/ainet/lang/tensor/Tensor; public fun elu (Lsk/ainet/lang/tensor/Tensor;F)Lsk/ainet/lang/tensor/Tensor; + protected fun ensureKernelProviders ()V public fun exp (Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; public fun expm1 (Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; public fun flatten (Lsk/ainet/lang/tensor/Tensor;II)Lsk/ainet/lang/tensor/Tensor; diff --git a/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt b/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt +++ b/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import 
sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. + KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt b/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt index 6db5adc9..153e7112 100644 --- a/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt +++ b/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt @@ -1,9 +1,14 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { println("[SKaiNET] Using Accelerate-backed CPU operations (ARM NEON + AMX)") + // Accelerate overrides dense FP32 matmul; packed-quant weights still flow through + // DefaultCpuOpsBase, so register the scalar packed kernels (no ServiceLoader on Native). 
+ KernelRegistry.register(ScalarKernelProvider) return { factory -> AccelerateCpuOps(factory) } } diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt index 0958cd6e..1a45ae7d 100644 --- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt @@ -8,8 +8,20 @@ import sk.ainet.lang.types.DType import sk.ainet.lang.ops.Backend import sk.ainet.lang.ops.TensorOp import sk.ainet.lang.ops.InProgress +import sk.ainet.backend.api.kernel.KernelProvider +import sk.ainet.backend.api.kernel.KernelRegistry import sk.ainet.lang.tensor.data.FloatArrayTensorData import sk.ainet.lang.tensor.data.IntArrayTensorData +import sk.ainet.lang.tensor.data.Q4_0TensorData +import sk.ainet.lang.tensor.data.Q8_0TensorData +import sk.ainet.lang.tensor.data.Q4_KTensorData +import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q6_KTensorData +import sk.ainet.lang.tensor.data.Q6_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_1TensorData +import sk.ainet.lang.tensor.data.Q5_1BlockTensorData +import sk.ainet.lang.tensor.data.Q5_0TensorData +import sk.ainet.lang.tensor.data.Q5_0BlockTensorData import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.UpsampleMode @@ -304,6 +316,64 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory } + /** + * Hook to populate [KernelRegistry] before the platform-neutral packed-quant + * dispatch resolves kernels. No-op in the base (callers register providers + * directly, e.g.
the non-JVM platform factories register [ScalarKernelProvider]); + * the JVM ops override this to auto-install ServiceLoader-discovered providers. + */ + protected open fun ensureKernelProviders() {} + + private fun resolveProvider(test: (KernelProvider) -> Boolean): KernelProvider? { + ensureKernelProviders() + return KernelRegistry.providers().firstOrNull { it.isAvailable() && test(it) } + } + + private val q8_0Kernel by lazy { resolveProvider { it.matmulQ8_0() != null }?.matmulQ8_0() } + private val q4_0Kernel by lazy { resolveProvider { it.matmulQ4_0() != null }?.matmulQ4_0() } + private val q4kKernel by lazy { resolveProvider { it.matmulQ4K() != null }?.matmulQ4K() } + private val q6kKernel by lazy { resolveProvider { it.matmulQ6K() != null }?.matmulQ6K() } + private val q5_1Kernel by lazy { resolveProvider { it.matmulQ5_1() != null }?.matmulQ5_1() } + private val q5_0Kernel by lazy { resolveProvider { it.matmulQ5_0() != null }?.matmulQ5_0() } + + /** + * Platform-neutral packed-quant matmul: `FP32 input × packed-quant weight`, + * resolving the kernel via [KernelRegistry] (scalar on Native/JS/WASM, Panama/ + * native-FFM on JVM). Returns `null` when the weight isn't a heap-packed quant + * type or no provider carries a kernel, so callers fall through. The JVM ops + * intercept Q4_K/Q6_K/Q8_0/Q4_0 (+ MemSeg) before this runs; Q5_1/Q5_0 (and the + * whole set on non-JVM) resolve here. + */ + protected fun chooseQuantizedMatmulHeap(a: Tensor, b: Tensor): Tensor? { + if (a.dtype != FP32::class || a.shape.rank != 2 || b.shape.rank != 2) return null + if (a.shape[1] != b.shape[0]) return null + val inputBuffer = (a.data as?
FloatArrayTensorData<*>)?.buffer ?: return null + val batchSize = a.shape[0] + val inputDim = a.shape[1] + val outputDim = b.shape[1] + + fun run(packed: ByteArray, kernel: (FloatArray, Int, ByteArray, Int, Int, Int, FloatArray, Int) -> Unit): Tensor { + val out = FloatArray(batchSize * outputDim) + for (batch in 0 until batchSize) { + val bi = if (batchSize == 1) inputBuffer else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim) + kernel(bi, 0, packed, 0, inputDim, outputDim, out, batch * outputDim) + } + @Suppress("UNCHECKED_CAST") + val outData = dataFactory.fromFloatArray(Shape(batchSize, outputDim), a.dtype, out) as TensorData + return newTensor(outData, a.dtype, a, b) + } + + return when (val bd = b.data) { + is Q5_1TensorData -> q5_1Kernel?.let { k -> run(bd.packedData, k::matmul) } + is Q5_0TensorData -> q5_0Kernel?.let { k -> run(bd.packedData, k::matmul) } + is Q4_KTensorData -> q4kKernel?.let { k -> run(bd.packedData, k::matmul) } + is Q6_KTensorData -> q6kKernel?.let { k -> run(bd.packedData, k::matmul) } + is Q8_0TensorData -> q8_0Kernel?.let { k -> run(bd.packedData, k::matmul) } + is Q4_0TensorData -> q4_0Kernel?.let { k -> run(bd.packedData, k::matmul) } + else -> null + } + } + @TensorOp() override fun matmul( a: Tensor, b: Tensor @@ -311,6 +381,9 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory require(a.rank >= 1 && b.rank >= 1) { "Matrix multiplication requires tensors with at least 1 dimension per operand" } require(a.dtype == b.dtype) { "DType mismatch: ${a.dtype} vs ${b.dtype}" } + // Packed-quant fast path (FP32 input × packed weight), resolved via KernelRegistry.
+ chooseQuantizedMatmulHeap(a, b)?.let { return it } + // Fast path: 2D × 2D with FloatArray backing — direct buffer access, no per-element allocation if (a.rank == 2 && b.rank == 2 && (a.dtype == FP32::class) @@ -516,6 +589,22 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory val rows = tensor.shape[rank - 2] val cols = tensor.shape[rank - 1] + // Lazy transpose for heap-packed quant weights (Q4_K/Q6_K/Q5_1/Q5_0): the + // matmul kernels index the packed bytes input-block-major from the post-swap + // (inputDim, outputDim), so transpose is a pure shape swap — same bytes, no copy. + // Lets `ops.matmul(x, ops.transpose(W))` run on every platform without a dequant + // round-trip. (The JVM ops intercept Q4_K/Q6_K + MemSeg before reaching here.) + if (rank == 2) { + @Suppress("UNCHECKED_CAST") + when (val d = tensor.data) { + is Q4_KTensorData -> return newTensor(Q4_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + else -> {} + } + } + // Fast path: 2D float tensor — direct buffer swap if (rank == 2 && tensor.data is FloatArrayTensorData<*>) { val buf = (tensor.data as FloatArrayTensorData<*>).buffer diff --git a/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt b/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt new file mode 100644 index 00000000..4bb497ae --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt @@ 
-0,0 +1,97 @@ +package sk.ainet.exec.tensor.ops + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.Test +import kotlin.test.assertTrue +import sk.ainet.context.DirectCpuExecutionContext +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_1BlockTensorData +import sk.ainet.lang.tensor.data.TensorData +import sk.ainet.lang.types.FP32 + +/** + * End-to-end proof that packed-quant weights flow through `ctx.ops.matmul(x, ops.transpose(W))` + * on EVERY platform — exercising the lazy-transpose shape-swap + `chooseQuantizedMatmulHeap` in + * DefaultCpuOpsBase, resolving the registered kernel (scalar on Native/JS/WASM, Panama/FFM on JVM). + * Runs on jvmTest AND linuxX64Test; a green linuxX64 run is the headline "Native packed matmul works". + */ +class PackedMatmulDispatchTest { + + private val ctx = DirectCpuExecutionContext() + + private fun half(v: Float): Int { + val b = v.toRawBits(); val s = (b ushr 16) and 0x8000 + val e = ((b ushr 23) and 0xFF) - 127 + 15; val m = b and 0x7FFFFF + if (e <= 0) return s; if (e >= 31) return s or 0x7C00 + return s or (e shl 10) or (m ushr 13) + } + private fun le16(b: ByteArray, o: Int, h: Int) { b[o] = (h and 0xFF).toByte(); b[o + 1] = ((h ushr 8) and 0xFF).toByte() } + + /** Random block-major Q5_1 bytes for [out,in] + the FP32 weight they dequantize to (row-major). 
*/ + private fun q5_1(inDim: Int, outDim: Int, rng: Random): Pair { + val blocks = inDim / 32; val bytes = ByteArray(outDim * blocks * 24); val wf = FloatArray(outDim * inDim) + for (o in 0 until outDim) for (bI in 0 until blocks) { + val off = (bI * outDim + o) * 24; val dst = o * inDim + bI * 32 + val d = rng.nextFloat() * 0.05f + 0.01f; val m = rng.nextFloat() - 0.5f + le16(bytes, off, half(d)); le16(bytes, off + 2, half(m)) + val qh = IntArray(4) { rng.nextInt(256) }; for (k in 0 until 4) bytes[off + 4 + k] = qh[k].toByte() + for (k in 0 until 16) bytes[off + 8 + k] = rng.nextInt(256).toByte() + for (j in 0 until 16) { + val q = bytes[off + 8 + j].toInt() and 0xFF + val bl = (qh[j / 8] ushr (j % 8)) and 1; val bh = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 1 + wf[dst + j] = d * ((q and 0xF) + (bl shl 4)) + m; wf[dst + 16 + j] = d * ((q ushr 4) + (bh shl 4)) + m + } + } + return bytes to wf + } + + /** Random block-major Q4_K bytes for [out,in] + the FP32 weight. */ + private fun q4_k(inDim: Int, outDim: Int, rng: Random): Pair { + val blocks = inDim / 256; val bytes = ByteArray(outDim * blocks * 144); val wf = FloatArray(outDim * inDim) + for (o in 0 until outDim) for (bI in 0 until blocks) { + val off = (bI * outDim + o) * 144; val dst = o * inDim + bI * 256 + val d = rng.nextFloat() * 0.02f + 0.005f; val dMin = rng.nextFloat() * 0.02f + 0.005f + le16(bytes, off, half(d)); le16(bytes, off + 2, half(dMin)) + for (k in 0 until 140) bytes[off + 4 + k] = rng.nextInt(256).toByte() + val sc = off + 4; val si = IntArray(8); val mi = IntArray(8) + for (s in 0 until 4) { si[s] = bytes[sc + s].toInt() and 0x3F; mi[s] = bytes[sc + s + 4].toInt() and 0x3F } + for (s in 4 until 8) { + si[s] = (bytes[sc + s + 4].toInt() and 0x0F) or (((bytes[sc + s - 4].toInt() and 0xFF) ushr 6) shl 4) + mi[s] = ((bytes[sc + s + 4].toInt() and 0xFF) ushr 4) or (((bytes[sc + s].toInt() and 0xFF) ushr 6) shl 4) + } + val codes = off + 16 + for (g in 0 until 4) for (h in 0 until 2) { + val s 
= 2 * g + h + for (i in 0 until 32) { + val by = bytes[codes + g * 32 + i].toInt() and 0xFF + val code = if (h == 0) (by and 0x0F) else (by ushr 4) + wf[dst + s * 32 + i] = code * (d * si[s]) - dMin * mi[s] + } + } + } + return bytes to wf + } + + private fun run(fmt: String, inDim: Int, outDim: Int, seed: Int) { + val rng = Random(seed) + val (bytes, wf) = if (fmt == "Q5_1") q5_1(inDim, outDim, rng) else q4_k(inDim, outDim, rng) + @Suppress("UNCHECKED_CAST") + val w = ctx.fromData( + (if (fmt == "Q5_1") Q5_1BlockTensorData(Shape(outDim, inDim), bytes) + else Q4_KBlockTensorData(Shape(outDim, inDim), bytes)) as TensorData, + FP32::class, + ) + val xf = FloatArray(inDim) { rng.nextFloat() - 0.5f } + val x = ctx.fromFloatArray(Shape(1, inDim), FP32::class, xf) + val out = ctx.ops.matmul(x, ctx.ops.transpose(w)).data.copyToFloatArray() + val expected = FloatArray(outDim) { o -> var s = 0f; for (i in 0 until inDim) s += xf[i] * wf[o * inDim + i]; s } + var maxErr = 0f; var maxAbs = 1f + for (o in 0 until outDim) { maxErr = maxOf(maxErr, abs(expected[o] - out[o])); maxAbs = maxOf(maxAbs, abs(expected[o])) } + assertTrue(maxErr < 5e-3f * maxAbs, "$fmt e2e matmul deviates: maxErr=$maxErr (maxAbs=$maxAbs)") + } + + @Test fun q5_1_through_ops_matmul_transpose() = run("Q5_1", inDim = 128, outDim = 16, seed = 7) + @Test fun q4_k_through_ops_matmul_transpose() = run("Q4_K", inDim = 256, outDim = 12, seed = 8) +} diff --git a/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt b/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt index fe3d68a0..bbd66825 100644 --- a/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt +++ b/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt @@ -1,4 +1,9 @@ package sk.ainet.exec.tensor.ops -internal actual fun 
platformDefaultCpuOpsFactory(): (sk.ainet.lang.tensor.data.TensorDataFactory) -> sk.ainet.lang.tensor.ops.TensorOps = - { factory -> DefaultCpuOps(factory) } +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider + +internal actual fun platformDefaultCpuOpsFactory(): (sk.ainet.lang.tensor.data.TensorDataFactory) -> sk.ainet.lang.tensor.ops.TensorOps { + KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt index b70abfd9..17cc9cd3 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt @@ -45,6 +45,18 @@ internal class DefaultCpuOpsJvm( private val floatSpecies: VectorSpecies = FloatVector.SPECIES_PREFERRED + /** + * On the JVM, auto-install ServiceLoader-discovered providers (Panama Vector, + * native FFM) so the base class's platform-neutral packed-quant dispatch + * (`chooseQuantizedMatmulHeap`, used for Q5_1/Q5_0 and the non-JVM path) resolves + * the SIMD/FFM kernels rather than only the scalar floor. + */ + override fun ensureKernelProviders() { + if (KernelRegistry.providers().isEmpty()) { + KernelServiceLoader.installAll() + } + } + /** * FP32 matmul kernel resolved via [KernelRegistry]. 
First access on a * given instance auto-installs providers via [KernelServiceLoader] diff --git a/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt b/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt +++ b/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. 
+ KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt b/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt +++ b/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. 
+ KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt b/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt +++ b/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. 
+ KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api index c4ecd987..c2c2a1be 100644 --- a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api +++ b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api @@ -3144,6 +3144,88 @@ public final class sk/ainet/lang/tensor/data/Q4_KTensorDataKt { public static final fun toFloatArray (Lsk/ainet/lang/tensor/data/Q4_KTensorData;)[F } +public final class sk/ainet/lang/tensor/data/Q5_0BlockTensorData : sk/ainet/lang/tensor/data/Q5_0TensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { + public static final field Companion Lsk/ainet/lang/tensor/data/Q5_0BlockTensorData$Companion; + public fun (Lsk/ainet/lang/tensor/Shape;[B)V + public fun copyToFloatArray ()[F + public fun dequantizeBlock (I[FI)V + public fun get ([I)Ljava/lang/Byte; + public synthetic fun get ([I)Ljava/lang/Object; + public fun getBlockCount ()I + public fun getBlockSize ()I + public fun getElementCount ()J + public fun getEncoding ()Lsk/ainet/lang/tensor/storage/TensorEncoding; + public fun getPackedData ()[B + public fun getPhysicalBytes ()J + public fun getShape ()Lsk/ainet/lang/tensor/Shape; + public fun set ([IB)V + public synthetic fun set ([ILjava/lang/Object;)V + public fun toFloatArray ()[F + public fun toTensorStorage (Lsk/ainet/lang/tensor/storage/LogicalDType;Lsk/ainet/lang/tensor/storage/Placement;)Lsk/ainet/lang/tensor/storage/TensorStorage; +} + +public final class sk/ainet/lang/tensor/data/Q5_0BlockTensorData$Companion { + public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q5_0BlockTensorData; +} + +public abstract interface class sk/ainet/lang/tensor/data/Q5_0TensorData : sk/ainet/lang/tensor/data/TensorData { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public 
static final field Companion Lsk/ainet/lang/tensor/data/Q5_0TensorData$Companion; + public abstract fun getBlockCount ()I + public abstract fun getPackedData ()[B +} + +public final class sk/ainet/lang/tensor/data/Q5_0TensorData$Companion { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I +} + +public final class sk/ainet/lang/tensor/data/Q5_0TensorData$DefaultImpls { + public static fun copyToFloatArray (Lsk/ainet/lang/tensor/data/Q5_0TensorData;)[F +} + +public final class sk/ainet/lang/tensor/data/Q5_1BlockTensorData : sk/ainet/lang/tensor/data/Q5_1TensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { + public static final field Companion Lsk/ainet/lang/tensor/data/Q5_1BlockTensorData$Companion; + public fun (Lsk/ainet/lang/tensor/Shape;[B)V + public fun copyToFloatArray ()[F + public fun dequantizeBlock (I[FI)V + public fun get ([I)Ljava/lang/Byte; + public synthetic fun get ([I)Ljava/lang/Object; + public fun getBlockCount ()I + public fun getBlockSize ()I + public fun getElementCount ()J + public fun getEncoding ()Lsk/ainet/lang/tensor/storage/TensorEncoding; + public fun getPackedData ()[B + public fun getPhysicalBytes ()J + public fun getShape ()Lsk/ainet/lang/tensor/Shape; + public fun set ([IB)V + public synthetic fun set ([ILjava/lang/Object;)V + public fun toFloatArray ()[F + public fun toTensorStorage (Lsk/ainet/lang/tensor/storage/LogicalDType;Lsk/ainet/lang/tensor/storage/Placement;)Lsk/ainet/lang/tensor/storage/TensorStorage; +} + +public final class sk/ainet/lang/tensor/data/Q5_1BlockTensorData$Companion { + public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q5_1BlockTensorData; +} + +public abstract interface class sk/ainet/lang/tensor/data/Q5_1TensorData : sk/ainet/lang/tensor/data/TensorData { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field Companion 
Lsk/ainet/lang/tensor/data/Q5_1TensorData$Companion; + public abstract fun getBlockCount ()I + public abstract fun getPackedData ()[B +} + +public final class sk/ainet/lang/tensor/data/Q5_1TensorData$Companion { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I +} + +public final class sk/ainet/lang/tensor/data/Q5_1TensorData$DefaultImpls { + public static fun copyToFloatArray (Lsk/ainet/lang/tensor/data/Q5_1TensorData;)[F +} + public final class sk/ainet/lang/tensor/data/Q6_KBlockTensorData : sk/ainet/lang/tensor/data/Q6_KTensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { public static final field Companion Lsk/ainet/lang/tensor/data/Q6_KBlockTensorData$Companion; public fun (Lsk/ainet/lang/tensor/Shape;[B)V @@ -5220,6 +5302,28 @@ public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q4_K : sk/ainet/l public fun toString ()Ljava/lang/String; } +public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q5_0 : sk/ainet/lang/tensor/storage/TensorEncoding { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field INSTANCE Lsk/ainet/lang/tensor/storage/TensorEncoding$Q5_0; + public fun equals (Ljava/lang/Object;)Z + public fun getName ()Ljava/lang/String; + public fun hashCode ()I + public fun physicalBytes (J)Ljava/lang/Long; + public fun toString ()Ljava/lang/String; +} + +public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q5_1 : sk/ainet/lang/tensor/storage/TensorEncoding { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field INSTANCE Lsk/ainet/lang/tensor/storage/TensorEncoding$Q5_1; + public fun equals (Ljava/lang/Object;)Z + public fun getName ()Ljava/lang/String; + public fun hashCode ()I + public fun physicalBytes (J)Ljava/lang/Long; + public fun toString ()Ljava/lang/String; +} + public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q6_K : 
sk/ainet/lang/tensor/storage/TensorEncoding { public static final field BLOCK_SIZE I public static final field BYTES_PER_BLOCK I diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt new file mode 100644 index 00000000..61eba8d7 --- /dev/null +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt @@ -0,0 +1,94 @@ +package sk.ainet.lang.tensor.data + +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.storage.PackedBlockStorage +import sk.ainet.lang.tensor.storage.TensorEncoding +import sk.ainet.lang.types.DType + +/** + * Tensor data for the GGML **Q5_0** quantized format (5-bit, symmetric). + * + * Block format (32 elements, 22 bytes/block): + * - bytes 0..1 : `d` (f16 scale) + * - bytes 2..5 : `qh[0..3]` (5th/high bit of each of the 32 codes) + * - bytes 6..21 : `qs[0..15]` (low 4 bits, two nibbles per byte) + * + * Dequant (matches `DequantOps.dequantQ5_0FromBytes`): with `lo`, `hi`, `bitLo`, `bitHi` defined as in [Q5_1TensorData], + * + * element[j] = d * (lo + (bitLo shl 4) - 16) + * element[j + 16] = d * (hi + (bitHi shl 4) - 16) + * + * Matmul packing is **input-block-major** `(blockIdx * outputDim + o)`; see + * [Q5_1TensorData] for the layout/transpose contract. + */ +public interface Q5_0TensorData : TensorData { + public val blockCount: Int + public val packedData: ByteArray + + public companion object { + public const val BLOCK_SIZE: Int = 32 + public const val BYTES_PER_BLOCK: Int = 22 + } +} + +/** Packed-byte implementation of [Q5_0TensorData]. 
*/ +public class Q5_0BlockTensorData( + initialShape: Shape, + private val data: ByteArray, +) : Q5_0TensorData, PackedBlockStorage { + + override val shape: Shape = Shape(initialShape.dimensions.copyOf()) + private val strides: IntArray = shape.computeStrides() + override val packedData: ByteArray get() = data + override val blockCount: Int = (shape.volume + Q5_0TensorData.BLOCK_SIZE - 1) / Q5_0TensorData.BLOCK_SIZE + override val encoding: TensorEncoding get() = TensorEncoding.Q5_0 + override val blockSize: Int get() = Q5_0TensorData.BLOCK_SIZE + + init { + val required = blockCount * Q5_0TensorData.BYTES_PER_BLOCK + require(data.size >= required) { "Data size ${data.size} < required $required for $blockCount blocks" } + } + + override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } + val base = blockIdx * Q5_0TensorData.BYTES_PER_BLOCK + val d = Q4_0BlockTensorData.halfToFloat(((data[base + 1].toInt() and 0xFF) shl 8) or (data[base].toInt() and 0xFF)) + val qh = intArrayOf( + data[base + 2].toInt() and 0xFF, data[base + 3].toInt() and 0xFF, + data[base + 4].toInt() and 0xFF, data[base + 5].toInt() and 0xFF, + ) + val qs = base + 6 + val elems = minOf(Q5_0TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q5_0TensorData.BLOCK_SIZE) + for (j in 0 until 16) { + val q = data[qs + j].toInt() and 0xFF + val lo = q and 0x0F; val hi = q ushr 4 + val bitLo = (qh[j / 8] ushr (j % 8)) and 1 + val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 1 + if (j < elems) output[outputOffset + j] = d * (lo + (bitLo shl 4) - 16) + if (16 + j < elems) output[outputOffset + 16 + j] = d * (hi + (bitHi shl 4) - 16) + } + } + + override fun get(vararg indices: Int): Byte { + val flat = calcFlatIndex(indices) + val tmp = FloatArray(Q5_0TensorData.BLOCK_SIZE) + dequantizeBlock(flat / Q5_0TensorData.BLOCK_SIZE, tmp, 0) + return tmp[flat % 
Q5_0TensorData.BLOCK_SIZE].toInt().toByte() + } + + override fun set(vararg indices: Int, value: Byte): Unit = + throw UnsupportedOperationException("Q5_0BlockTensorData is read-only") + + private fun calcFlatIndex(indices: IntArray): Int { + require(indices.size == shape.dimensions.size) { + "Number of indices (${indices.size}) must match dimensions (${shape.dimensions.size})" + } + var flat = 0 + for (i in indices.indices) flat += indices[i] * strides[i] + return flat + } + + public companion object { + public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_0BlockTensorData = Q5_0BlockTensorData(shape, bytes) + } +} diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt new file mode 100644 index 00000000..52550ca3 --- /dev/null +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt @@ -0,0 +1,100 @@ +package sk.ainet.lang.tensor.data + +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.storage.PackedBlockStorage +import sk.ainet.lang.tensor.storage.TensorEncoding +import sk.ainet.lang.types.DType + +/** + * Tensor data for the GGML **Q5_1** quantized format (5-bit, per-block minimum). 
+ * + * Block format (32 elements, 24 bytes/block): + * - bytes 0..1 : `d` (f16 scale) + * - bytes 2..3 : `m` (f16 minimum) + * - bytes 4..7 : `qh[0..3]` (5th/high bit of each of the 32 codes) + * - bytes 8..23 : `qs[0..15]` (low 4 bits, two nibbles per byte) + * + * Dequant (matches `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_1FromBytes`), + * for `j ∈ [0,16)`, `lo = qs[j] & 0x0F`, `hi = qs[j] >>> 4`, + * `bitLo = (qh[j/8] >>> (j%8)) & 1`, `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`: + * + * element[j] = d * (lo + (bitLo shl 4)) + m + * element[j + 16] = d * (hi + (bitHi shl 4)) + m + * + * As packed by the GGUF converter for matmul, blocks are **input-block-major** + * `(blockIdx * outputDim + o)`; `Q5_1MatmulKernel` indexes them that way and the + * CPU-ops lazy transpose is a pure shape swap. The per-block + * [Q5_1BlockTensorData.dequantizeBlock] is layout-agnostic (it dequantizes the block at a flat index). + */ +public interface Q5_1TensorData : TensorData { + public val blockCount: Int + public val packedData: ByteArray + + public companion object { + public const val BLOCK_SIZE: Int = 32 + public const val BYTES_PER_BLOCK: Int = 24 + } +} + +/** Packed-byte implementation of [Q5_1TensorData]. 
*/ +public class Q5_1BlockTensorData( + initialShape: Shape, + private val data: ByteArray, +) : Q5_1TensorData, PackedBlockStorage { + + override val shape: Shape = Shape(initialShape.dimensions.copyOf()) + private val strides: IntArray = shape.computeStrides() + override val packedData: ByteArray get() = data + override val blockCount: Int = (shape.volume + Q5_1TensorData.BLOCK_SIZE - 1) / Q5_1TensorData.BLOCK_SIZE + override val encoding: TensorEncoding get() = TensorEncoding.Q5_1 + override val blockSize: Int get() = Q5_1TensorData.BLOCK_SIZE + + init { + val required = blockCount * Q5_1TensorData.BYTES_PER_BLOCK + require(data.size >= required) { "Data size ${data.size} < required $required for $blockCount blocks" } + } + + override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } + val base = blockIdx * Q5_1TensorData.BYTES_PER_BLOCK + val d = Q4_0BlockTensorData.halfToFloat(((data[base + 1].toInt() and 0xFF) shl 8) or (data[base].toInt() and 0xFF)) + val m = Q4_0BlockTensorData.halfToFloat(((data[base + 3].toInt() and 0xFF) shl 8) or (data[base + 2].toInt() and 0xFF)) + val qh = intArrayOf( + data[base + 4].toInt() and 0xFF, data[base + 5].toInt() and 0xFF, + data[base + 6].toInt() and 0xFF, data[base + 7].toInt() and 0xFF, + ) + val qs = base + 8 + val elems = minOf(Q5_1TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q5_1TensorData.BLOCK_SIZE) + for (j in 0 until 16) { + val q = data[qs + j].toInt() and 0xFF + val lo = q and 0x0F; val hi = q ushr 4 + val bitLo = (qh[j / 8] ushr (j % 8)) and 1 + val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 1 + if (j < elems) output[outputOffset + j] = d * (lo + (bitLo shl 4)) + m + if (16 + j < elems) output[outputOffset + 16 + j] = d * (hi + (bitHi shl 4)) + m + } + } + + override fun get(vararg indices: Int): Byte { + val flat = calcFlatIndex(indices) + val tmp = FloatArray(Q5_1TensorData.BLOCK_SIZE) + 
dequantizeBlock(flat / Q5_1TensorData.BLOCK_SIZE, tmp, 0) + return tmp[flat % Q5_1TensorData.BLOCK_SIZE].toInt().toByte() + } + + override fun set(vararg indices: Int, value: Byte): Unit = + throw UnsupportedOperationException("Q5_1BlockTensorData is read-only") + + private fun calcFlatIndex(indices: IntArray): Int { + require(indices.size == shape.dimensions.size) { + "Number of indices (${indices.size}) must match dimensions (${shape.dimensions.size})" + } + var flat = 0 + for (i in indices.indices) flat += indices[i] * strides[i] + return flat + } + + public companion object { + public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_1BlockTensorData = Q5_1BlockTensorData(shape, bytes) + } +} diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt index bd781a4f..509b6704 100644 --- a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt @@ -76,6 +76,30 @@ public sealed interface TensorEncoding { } } + /** GGML Q5_0 block quantization: 32 elements per 22-byte block. */ + public data object Q5_0 : TensorEncoding { + public const val BLOCK_SIZE: Int = 32 + public const val BYTES_PER_BLOCK: Int = 22 + + override val name: String get() = "Q5_0" + override fun physicalBytes(elementCount: Long): Long { + val blocks = (elementCount + BLOCK_SIZE - 1) / BLOCK_SIZE + return blocks * BYTES_PER_BLOCK + } + } + + /** GGML Q5_1 block quantization: 32 elements per 24-byte block. 
*/ + public data object Q5_1 : TensorEncoding { + public const val BLOCK_SIZE: Int = 32 + public const val BYTES_PER_BLOCK: Int = 24 + + override val name: String get() = "Q5_1" + override fun physicalBytes(elementCount: Long): Long { + val blocks = (elementCount + BLOCK_SIZE - 1) / BLOCK_SIZE + return blocks * BYTES_PER_BLOCK + } + } + /** Ternary encoding: 2 bits per element, packed 4 elements per byte. */ public data object TernaryPacked : TensorEncoding { override val name: String get() = "Ternary"