diff --git a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api index c9a8b256..539540e1 100644 --- a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api +++ b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api @@ -180,6 +180,7 @@ public class sk/ainet/exec/tensor/ops/DefaultCpuOpsBase : sk/ainet/lang/tensor/o public fun addScalar (Lsk/ainet/lang/tensor/Tensor;Ljava/lang/Number;)Lsk/ainet/lang/tensor/Tensor; public fun avgPool2d (Lsk/ainet/lang/tensor/Tensor;Lkotlin/Pair;Lkotlin/Pair;Lkotlin/Pair;Z)Lsk/ainet/lang/tensor/Tensor; protected final fun broadcastShapes (Lsk/ainet/lang/tensor/Shape;Lsk/ainet/lang/tensor/Shape;)Lsk/ainet/lang/tensor/Shape; + protected final fun chooseQuantizedMatmulHeap (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; public fun clamp (Lsk/ainet/lang/tensor/Tensor;FF)Lsk/ainet/lang/tensor/Tensor; public fun concat (Ljava/util/List;I)Lsk/ainet/lang/tensor/Tensor; public fun conv1d (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;IIII)Lsk/ainet/lang/tensor/Tensor; @@ -192,6 +193,7 @@ public class sk/ainet/exec/tensor/ops/DefaultCpuOpsBase : sk/ainet/lang/tensor/o public fun divide (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; protected final fun elementwise (Lsk/ainet/lang/tensor/Tensor;Lsk/ainet/lang/tensor/Tensor;Lkotlin/jvm/functions/Function3;)Lsk/ainet/lang/tensor/Tensor; public fun elu (Lsk/ainet/lang/tensor/Tensor;F)Lsk/ainet/lang/tensor/Tensor; + protected fun ensureKernelProviders ()V public fun exp (Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; public fun expm1 (Lsk/ainet/lang/tensor/Tensor;)Lsk/ainet/lang/tensor/Tensor; public fun flatten (Lsk/ainet/lang/tensor/Tensor;II)Lsk/ainet/lang/tensor/Tensor; diff --git 
a/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt b/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt +++ b/skainet-backends/skainet-backend-cpu/src/androidMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.android.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. 
+ KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt b/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt index 6db5adc9..153e7112 100644 --- a/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt +++ b/skainet-backends/skainet-backend-cpu/src/appleMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.apple.kt @@ -1,9 +1,14 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { println("[SKaiNET] Using Accelerate-backed CPU operations (ARM NEON + AMX)") + // Accelerate overrides dense FP32 matmul; packed-quant weights still flow through + // DefaultCpuOpsBase, so register the scalar packed kernels (no ServiceLoader on Native). 
+ KernelRegistry.register(ScalarKernelProvider) return { factory -> AccelerateCpuOps(factory) } } diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt index 0958cd6e..1a45ae7d 100644 --- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOps.kt @@ -8,8 +8,20 @@ import sk.ainet.lang.types.DType import sk.ainet.lang.ops.Backend import sk.ainet.lang.ops.TensorOp import sk.ainet.lang.ops.InProgress +import sk.ainet.backend.api.kernel.KernelProvider +import sk.ainet.backend.api.kernel.KernelRegistry import sk.ainet.lang.tensor.data.FloatArrayTensorData import sk.ainet.lang.tensor.data.IntArrayTensorData +import sk.ainet.lang.tensor.data.Q4_0TensorData +import sk.ainet.lang.tensor.data.Q8_0TensorData +import sk.ainet.lang.tensor.data.Q4_KTensorData +import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q6_KTensorData +import sk.ainet.lang.tensor.data.Q6_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_1TensorData +import sk.ainet.lang.tensor.data.Q5_1BlockTensorData +import sk.ainet.lang.tensor.data.Q5_0TensorData +import sk.ainet.lang.tensor.data.Q5_0BlockTensorData import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.UpsampleMode @@ -304,6 +316,64 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory } + /** + * Hook to populate [KernelRegistry] before the platform-neutral packed-quant + * dispatch resolves kernels. No-op in the base (callers register providers + * directly, e.g.
the non-JVM platform factories register [ScalarKernelProvider]); + * the JVM ops override this to auto-install ServiceLoader-discovered providers. + */ + protected open fun ensureKernelProviders() {} + + private fun resolveProvider(test: (KernelProvider) -> Boolean): KernelProvider? { + ensureKernelProviders() + return KernelRegistry.providers().firstOrNull { it.isAvailable() && test(it) } + } + + private val q8_0Kernel by lazy { resolveProvider { it.matmulQ8_0() != null }?.matmulQ8_0() } + private val q4_0Kernel by lazy { resolveProvider { it.matmulQ4_0() != null }?.matmulQ4_0() } + private val q4kKernel by lazy { resolveProvider { it.matmulQ4K() != null }?.matmulQ4K() } + private val q6kKernel by lazy { resolveProvider { it.matmulQ6K() != null }?.matmulQ6K() } + private val q5_1Kernel by lazy { resolveProvider { it.matmulQ5_1() != null }?.matmulQ5_1() } + private val q5_0Kernel by lazy { resolveProvider { it.matmulQ5_0() != null }?.matmulQ5_0() } + + /** + * Platform-neutral packed-quant matmul: `FP32 input × packed-quant weight`, + * resolving the kernel via [KernelRegistry] (scalar on Native/JS/WASM, Panama/ + * native-FFM on JVM). Returns `null` when the weight isn't a heap-packed quant + * type or no provider carries a kernel, so callers fall through. The JVM ops + * intercept Q4_K/Q6_K/Q8_0/Q4_0 (+ MemSeg) before this runs; Q5_1/Q5_0 (and the + * whole set on non-JVM) resolve here. + */ + protected fun chooseQuantizedMatmulHeap(a: Tensor, b: Tensor): Tensor? { + if (a.dtype != FP32::class || a.shape.rank != 2 || b.shape.rank != 2) return null + if (a.shape[1] != b.shape[0]) return null + val inputBuffer = (a.data as?
FloatArrayTensorData<*>)?.buffer ?: return null + val batchSize = a.shape[0] + val inputDim = a.shape[1] + val outputDim = b.shape[1] + + fun run(packed: ByteArray, kernel: (FloatArray, Int, ByteArray, Int, Int, Int, FloatArray, Int) -> Unit): Tensor { + val out = FloatArray(batchSize * outputDim) + for (batch in 0 until batchSize) { + val bi = if (batchSize == 1) inputBuffer else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim) + kernel(bi, 0, packed, 0, inputDim, outputDim, out, batch * outputDim) + } + @Suppress("UNCHECKED_CAST") + val outData = dataFactory.fromFloatArray(Shape(batchSize, outputDim), a.dtype, out) as TensorData + return newTensor(outData, a.dtype, a, b) + } + + return when (val bd = b.data) { + is Q5_1TensorData -> q5_1Kernel?.let { k -> run(bd.packedData, k::matmul) } + is Q5_0TensorData -> q5_0Kernel?.let { k -> run(bd.packedData, k::matmul) } + is Q4_KTensorData -> q4kKernel?.let { k -> run(bd.packedData, k::matmul) } + is Q6_KTensorData -> q6kKernel?.let { k -> run(bd.packedData, k::matmul) } + is Q8_0TensorData -> q8_0Kernel?.let { k -> run(bd.packedData, k::matmul) } + is Q4_0TensorData -> q4_0Kernel?.let { k -> run(bd.packedData, k::matmul) } + else -> null + } + } + @TensorOp() override fun matmul( a: Tensor, b: Tensor @@ -311,6 +381,9 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory require(a.rank >= 1 && b.rank >= 1) { "Matrix multiplication requires tensors with at least 1 dimension per operand" } require(a.dtype == b.dtype) { "DType mismatch: ${a.dtype} vs ${b.dtype}" } + // Packed-quant fast path (FP32 input × packed weight), resolved via KernelRegistry.
+ chooseQuantizedMatmulHeap(a, b)?.let { return it } + // Fast path: 2D × 2D with FloatArray backing — direct buffer access, no per-element allocation if (a.rank == 2 && b.rank == 2 && (a.dtype == FP32::class) @@ -516,6 +589,22 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory val rows = tensor.shape[rank - 2] val cols = tensor.shape[rank - 1] + // Lazy transpose for heap-packed quant weights (Q4_K/Q6_K/Q5_1/Q5_0): the + // matmul kernels index the packed bytes input-block-major from the post-swap + // (inputDim, outputDim), so transpose is a pure shape swap — same bytes, no copy. + // Lets `ops.matmul(x, ops.transpose(W))` run on every platform without a dequant + // round-trip. (The JVM ops intercept Q4_K/Q6_K + MemSeg before reaching here.) + if (rank == 2) { + @Suppress("UNCHECKED_CAST") + when (val d = tensor.data) { + is Q4_KTensorData -> return newTensor(Q4_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData, tensor.dtype, tensor) + else -> {} + } + } + // Fast path: 2D float tensor — direct buffer swap if (rank == 2 && tensor.data is FloatArrayTensorData<*>) { val buf = (tensor.data as FloatArrayTensorData<*>).buffer diff --git a/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt b/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt new file mode 100644 index 00000000..4bb497ae --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt @@ 
-0,0 +1,97 @@ +package sk.ainet.exec.tensor.ops + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.Test +import kotlin.test.assertTrue +import sk.ainet.context.DirectCpuExecutionContext +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_1BlockTensorData +import sk.ainet.lang.tensor.data.TensorData +import sk.ainet.lang.types.FP32 + +/** + * End-to-end proof that packed-quant weights flow through `ctx.ops.matmul(x, ops.transpose(W))` + * on EVERY platform — exercising the lazy-transpose shape-swap + `chooseQuantizedMatmulHeap` in + * DefaultCpuOpsBase, resolving the registered kernel (scalar on Native/JS/WASM, Panama/FFM on JVM). + * Runs on jvmTest AND linuxX64Test; a green linuxX64 run is the headline "Native packed matmul works". + */ +class PackedMatmulDispatchTest { + + private val ctx = DirectCpuExecutionContext() + + private fun half(v: Float): Int { + val b = v.toRawBits(); val s = (b ushr 16) and 0x8000 + val e = ((b ushr 23) and 0xFF) - 127 + 15; val m = b and 0x7FFFFF + if (e <= 0) return s; if (e >= 31) return s or 0x7C00 + return s or (e shl 10) or (m ushr 13) + } + private fun le16(b: ByteArray, o: Int, h: Int) { b[o] = (h and 0xFF).toByte(); b[o + 1] = ((h ushr 8) and 0xFF).toByte() } + + /** Random block-major Q5_1 bytes for [out,in] + the FP32 weight they dequantize to (row-major). 
*/ + private fun q5_1(inDim: Int, outDim: Int, rng: Random): Pair { + val blocks = inDim / 32; val bytes = ByteArray(outDim * blocks * 24); val wf = FloatArray(outDim * inDim) + for (o in 0 until outDim) for (bI in 0 until blocks) { + val off = (bI * outDim + o) * 24; val dst = o * inDim + bI * 32 + val d = rng.nextFloat() * 0.05f + 0.01f; val m = rng.nextFloat() - 0.5f + le16(bytes, off, half(d)); le16(bytes, off + 2, half(m)) + val qh = IntArray(4) { rng.nextInt(256) }; for (k in 0 until 4) bytes[off + 4 + k] = qh[k].toByte() + for (k in 0 until 16) bytes[off + 8 + k] = rng.nextInt(256).toByte() + for (j in 0 until 16) { + val q = bytes[off + 8 + j].toInt() and 0xFF + val bl = (qh[j / 8] ushr (j % 8)) and 1; val bh = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 1 + wf[dst + j] = d * ((q and 0xF) + (bl shl 4)) + m; wf[dst + 16 + j] = d * ((q ushr 4) + (bh shl 4)) + m + } + } + return bytes to wf + } + + /** Random block-major Q4_K bytes for [out,in] + the FP32 weight. */ + private fun q4_k(inDim: Int, outDim: Int, rng: Random): Pair { + val blocks = inDim / 256; val bytes = ByteArray(outDim * blocks * 144); val wf = FloatArray(outDim * inDim) + for (o in 0 until outDim) for (bI in 0 until blocks) { + val off = (bI * outDim + o) * 144; val dst = o * inDim + bI * 256 + val d = rng.nextFloat() * 0.02f + 0.005f; val dMin = rng.nextFloat() * 0.02f + 0.005f + le16(bytes, off, half(d)); le16(bytes, off + 2, half(dMin)) + for (k in 0 until 140) bytes[off + 4 + k] = rng.nextInt(256).toByte() + val sc = off + 4; val si = IntArray(8); val mi = IntArray(8) + for (s in 0 until 4) { si[s] = bytes[sc + s].toInt() and 0x3F; mi[s] = bytes[sc + s + 4].toInt() and 0x3F } + for (s in 4 until 8) { + si[s] = (bytes[sc + s + 4].toInt() and 0x0F) or (((bytes[sc + s - 4].toInt() and 0xFF) ushr 6) shl 4) + mi[s] = ((bytes[sc + s + 4].toInt() and 0xFF) ushr 4) or (((bytes[sc + s].toInt() and 0xFF) ushr 6) shl 4) + } + val codes = off + 16 + for (g in 0 until 4) for (h in 0 until 2) { + val s 
= 2 * g + h + for (i in 0 until 32) { + val by = bytes[codes + g * 32 + i].toInt() and 0xFF + val code = if (h == 0) (by and 0x0F) else (by ushr 4) + wf[dst + s * 32 + i] = code * (d * si[s]) - dMin * mi[s] + } + } + } + return bytes to wf + } + + private fun run(fmt: String, inDim: Int, outDim: Int, seed: Int) { + val rng = Random(seed) + val (bytes, wf) = if (fmt == "Q5_1") q5_1(inDim, outDim, rng) else q4_k(inDim, outDim, rng) + @Suppress("UNCHECKED_CAST") + val w = ctx.fromData( + (if (fmt == "Q5_1") Q5_1BlockTensorData(Shape(outDim, inDim), bytes) + else Q4_KBlockTensorData(Shape(outDim, inDim), bytes)) as TensorData, + FP32::class, + ) + val xf = FloatArray(inDim) { rng.nextFloat() - 0.5f } + val x = ctx.fromFloatArray(Shape(1, inDim), FP32::class, xf) + val out = ctx.ops.matmul(x, ctx.ops.transpose(w)).data.copyToFloatArray() + val expected = FloatArray(outDim) { o -> var s = 0f; for (i in 0 until inDim) s += xf[i] * wf[o * inDim + i]; s } + var maxErr = 0f; var maxAbs = 1f + for (o in 0 until outDim) { maxErr = maxOf(maxErr, abs(expected[o] - out[o])); maxAbs = maxOf(maxAbs, abs(expected[o])) } + assertTrue(maxErr < 5e-3f * maxAbs, "$fmt e2e matmul deviates: maxErr=$maxErr (maxAbs=$maxAbs)") + } + + @Test fun q5_1_through_ops_matmul_transpose() = run("Q5_1", inDim = 128, outDim = 16, seed = 7) + @Test fun q4_k_through_ops_matmul_transpose() = run("Q4_K", inDim = 256, outDim = 12, seed = 8) +} diff --git a/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt b/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt index fe3d68a0..bbd66825 100644 --- a/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt +++ b/skainet-backends/skainet-backend-cpu/src/jsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.js.kt @@ -1,4 +1,9 @@ package sk.ainet.exec.tensor.ops -internal actual fun 
platformDefaultCpuOpsFactory(): (sk.ainet.lang.tensor.data.TensorDataFactory) -> sk.ainet.lang.tensor.ops.TensorOps = - { factory -> DefaultCpuOps(factory) } +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider + +internal actual fun platformDefaultCpuOpsFactory(): (sk.ainet.lang.tensor.data.TensorDataFactory) -> sk.ainet.lang.tensor.ops.TensorOps { + KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt index 145fdbba..f48f2ac0 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt @@ -31,10 +31,6 @@ import sk.ainet.lang.tensor.data.Q4_KBlockTensorData import sk.ainet.lang.tensor.data.Q4_KTensorData import sk.ainet.lang.tensor.data.Q6_KBlockTensorData import sk.ainet.lang.tensor.data.Q6_KTensorData -import sk.ainet.lang.tensor.data.Q5_1BlockTensorData -import sk.ainet.lang.tensor.data.Q5_1TensorData -import sk.ainet.lang.tensor.data.Q5_0BlockTensorData -import sk.ainet.lang.tensor.data.Q5_0TensorData import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.types.DType import sk.ainet.lang.types.FP16 @@ -49,6 +45,18 @@ internal class DefaultCpuOpsJvm( private val floatSpecies: VectorSpecies = FloatVector.SPECIES_PREFERRED + /** + * On the JVM, auto-install ServiceLoader-discovered providers (Panama Vector, + * native FFM) so the base class's platform-neutral packed-quant dispatch + * (`chooseQuantizedMatmulHeap`, used for Q5_1/Q5_0 and the non-JVM path) resolves + * the SIMD/FFM kernels rather than only the scalar floor. 
+ */ + override fun ensureKernelProviders() { + if (KernelRegistry.providers().isEmpty()) { + KernelServiceLoader.installAll() + } + } + /** * FP32 matmul kernel resolved via [KernelRegistry]. First access on a * given instance auto-installs providers via [KernelServiceLoader] @@ -228,21 +236,8 @@ internal class DefaultCpuOpsJvm( @Suppress("UNCHECKED_CAST") return newTensor(transposed as TensorData, tensor.dtype, tensor) } - // Q5_1 / Q5_0 packed bytes use a row-major `[out, in]` layout that the - // `matmulQ5_1Vec` / `matmulQ5_0Vec` kernels index by output row, so the - // transpose is a pure shape swap — the same bytes give the right values - // under the swapped shape (lets `ops.matmul(x, ops.transpose(W))` run - // without a dequant round-trip). - if (data is Q5_1TensorData) { - val transposed = Q5_1BlockTensorData(Shape(cols, rows), data.packedData) - @Suppress("UNCHECKED_CAST") - return newTensor(transposed as TensorData, tensor.dtype, tensor) - } - if (data is Q5_0TensorData) { - val transposed = Q5_0BlockTensorData(Shape(cols, rows), data.packedData) - @Suppress("UNCHECKED_CAST") - return newTensor(transposed as TensorData, tensor.dtype, tensor) - } + // Q5_1 / Q5_0 lazy transpose is handled in DefaultCpuOpsBase (block-major, + // shared with Native); the JVM ops don't intercept Q5 here. // MemorySegment FP32 fast path: physical transpose via SIMD. // Uses Arena.ofAuto() so the result segment is reclaimed by GC // when the wrapping Tensor is no longer reachable. 
Earlier @@ -577,32 +572,8 @@ internal class DefaultCpuOpsJvm( @Suppress("UNCHECKED_CAST") CpuTensor(outData as TensorData, this, a.dtype) } - is Q5_1TensorData -> { - val outBuffer = FloatArray(batchSize * outputDim) - for (batch in 0 until batchSize) { - val batchInput = if (batchSize == 1) inputBuffer - else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim) - JvmQuantizedVectorKernels.matmulQ5_1Vec( - batchInput, bData.packedData, inputDim, outputDim, outBuffer, batch * outputDim, - ) - } - val outData = DenseFloatArrayTensorData(Shape(batchSize, outputDim), outBuffer) - @Suppress("UNCHECKED_CAST") - CpuTensor(outData as TensorData, this, a.dtype) - } - is Q5_0TensorData -> { - val outBuffer = FloatArray(batchSize * outputDim) - for (batch in 0 until batchSize) { - val batchInput = if (batchSize == 1) inputBuffer - else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim) - JvmQuantizedVectorKernels.matmulQ5_0Vec( - batchInput, bData.packedData, inputDim, outputDim, outBuffer, batch * outputDim, - ) - } - val outData = DenseFloatArrayTensorData(Shape(batchSize, outputDim), outBuffer) - @Suppress("UNCHECKED_CAST") - CpuTensor(outData as TensorData, this, a.dtype) - } + // Q5_1 / Q5_0 dispatch is handled in DefaultCpuOpsBase via the kernel + // registry (block-major, shared with Native); not intercepted here. 
is Q4_KTensorData -> { val outBuffer = FloatArray(batchSize * outputDim) val spiKernel = q4kMatmulKernel diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt index 009c188c..89a8fe9b 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt @@ -910,97 +910,4 @@ internal object JvmQuantizedVectorKernels { } } - /** - * Q5_1 matrix-vector multiply: `output = input · Wᵀ` for a packed Q5_1 weight. - * - * Packed weights are in the natural GGUF **row-major** `[outputDim, inputDim]` - * layout: output row `o`'s `inputDim` weights are `inputDim / 32` contiguous - * 24-byte blocks. Dequant matches `DequantOps.dequantQ5_1FromBytes` exactly: - * `w = d * (code + (highBit shl 4)) + m`. Scalar (keeps weights packed — the - * memory win; SIMD vectorization of the inner loop is a follow-up). 
- */ - fun matmulQ5_1Vec( - input: FloatArray, - packedWeights: ByteArray, - inputDim: Int, - outputDim: Int, - output: FloatArray, - outputOffset: Int = 0, - ) { - val bytesPerBlock = 24 - val blocksPerInputDim = (inputDim + 31) / 32 - for (o in 0 until outputDim) { - var acc = 0f - val rowBase = o * blocksPerInputDim * bytesPerBlock - for (blk in 0 until blocksPerInputDim) { - val base = rowBase + blk * bytesPerBlock - val d = halfToFloat(((packedWeights[base + 1].toInt() and 0xFF) shl 8) or (packedWeights[base].toInt() and 0xFF)) - val m = halfToFloat(((packedWeights[base + 3].toInt() and 0xFF) shl 8) or (packedWeights[base + 2].toInt() and 0xFF)) - val qh = intArrayOf( - packedWeights[base + 4].toInt() and 0xFF, - packedWeights[base + 5].toInt() and 0xFF, - packedWeights[base + 6].toInt() and 0xFF, - packedWeights[base + 7].toInt() and 0xFF, - ) - val qsBase = base + 8 - val inBase = blk * 32 - for (j in 0 until 16) { - val q = packedWeights[qsBase + j].toInt() and 0xFF - val lo = q and 0x0F - val hi = q ushr 4 - val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01 - val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01 - val wLo = d * (lo + (bitLo shl 4)) + m - val wHi = d * (hi + (bitHi shl 4)) + m - acc += input[inBase + j] * wLo + input[inBase + 16 + j] * wHi - } - } - output[outputOffset + o] = acc - } - } - - /** - * Q5_0 matrix-vector multiply: `output = input · Wᵀ` for a packed Q5_0 weight. - * - * Row-major `[outputDim, inputDim]` packing of 22-byte blocks. Dequant matches - * `DequantOps.dequantQ5_0FromBytes`: `w = d * (code + (highBit shl 4) - 16)`. 
- */ - fun matmulQ5_0Vec( - input: FloatArray, - packedWeights: ByteArray, - inputDim: Int, - outputDim: Int, - output: FloatArray, - outputOffset: Int = 0, - ) { - val bytesPerBlock = 22 - val blocksPerInputDim = (inputDim + 31) / 32 - for (o in 0 until outputDim) { - var acc = 0f - val rowBase = o * blocksPerInputDim * bytesPerBlock - for (blk in 0 until blocksPerInputDim) { - val base = rowBase + blk * bytesPerBlock - val d = halfToFloat(((packedWeights[base + 1].toInt() and 0xFF) shl 8) or (packedWeights[base].toInt() and 0xFF)) - val qh = intArrayOf( - packedWeights[base + 2].toInt() and 0xFF, - packedWeights[base + 3].toInt() and 0xFF, - packedWeights[base + 4].toInt() and 0xFF, - packedWeights[base + 5].toInt() and 0xFF, - ) - val qsBase = base + 6 - val inBase = blk * 32 - for (j in 0 until 16) { - val q = packedWeights[qsBase + j].toInt() and 0xFF - val lo = q and 0x0F - val hi = q ushr 4 - val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01 - val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01 - val wLo = d * (lo + (bitLo shl 4) - 16) - val wHi = d * (hi + (bitHi shl 4) - 16) - acc += input[inBase + j] * wLo + input[inBase + 16 + j] * wHi - } - } - output[outputOffset + o] = acc - } - } } diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q5MatmulDispatchTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q5MatmulDispatchTest.kt deleted file mode 100644 index 65d1cc27..00000000 --- a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q5MatmulDispatchTest.kt +++ /dev/null @@ -1,140 +0,0 @@ -package sk.ainet.exec.tensor.ops - -import kotlin.random.Random -import kotlin.test.Test -import kotlin.test.assertEquals -import kotlin.test.assertTrue -import sk.ainet.context.DirectCpuExecutionContext -import sk.ainet.lang.tensor.Shape -import sk.ainet.lang.tensor.Tensor -import sk.ainet.lang.tensor.data.Q5_0BlockTensorData -import 
sk.ainet.lang.tensor.data.Q5_1BlockTensorData -import sk.ainet.lang.tensor.data.TensorData -import sk.ainet.lang.types.FP32 - -/** - * Validates the packed Q5_1 / Q5_0 matmul kernels + lazy transpose: feeding a packed - * weight through `ops.matmul(x, ops.transpose(W))` must match feeding the FP32-dequantized - * weight through the same path. The FP32 reference is dequantized inline (independent of the - * `Q5_*BlockTensorData.dequantizeBlock` code under test), matching ggml / `DequantOps`. - */ -class Q5MatmulDispatchTest { - - private val ctx = DirectCpuExecutionContext() - - private fun f16(v: Float): Int { - // float -> IEEE half bits (round-to-nearest-even, good enough for test weights) - val bits = v.toRawBits() - val sign = (bits ushr 16) and 0x8000 - var expo = ((bits ushr 23) and 0xFF) - 127 + 15 - val mant = bits and 0x7FFFFF - if (expo <= 0) return sign // flush tiny to signed zero - if (expo >= 31) return sign or 0x7C00 // inf - return sign or (expo shl 10) or (mant ushr 13) - } - - private fun halfToFloat(h: Int): Float { - val sign = (h and 0x8000) shl 16 - val exp = (h and 0x7C00) shr 10 - val mant = h and 0x03FF - return when (exp) { - 0 -> Float.fromBits(sign) // (subnormals flushed by f16() above) - 31 -> Float.fromBits(sign or (0xFF shl 23) or (mant shl 13)) - else -> Float.fromBits(sign or ((exp - 15 + 127) shl 23) or (mant shl 13)) - } - } - - // --- Q5_1: 24 bytes/block (d, m, qh[4], qs[16]) --------------------------------------- - - private fun randomQ5_1Block(rng: Random, out: ByteArray, off: Int) { - val d = f16(0.02f + rng.nextFloat() * 0.05f) - val m = f16(-0.3f + rng.nextFloat() * 0.6f) - out[off] = (d and 0xFF).toByte(); out[off + 1] = ((d ushr 8) and 0xFF).toByte() - out[off + 2] = (m and 0xFF).toByte(); out[off + 3] = ((m ushr 8) and 0xFF).toByte() - for (k in 0 until 4) out[off + 4 + k] = rng.nextInt(256).toByte() // qh - for (k in 0 until 16) out[off + 8 + k] = rng.nextInt(256).toByte() // qs - } - - private fun dequantQ5_1Block(b: 
ByteArray, off: Int, dst: FloatArray, dstOff: Int) { - val d = halfToFloat(((b[off + 1].toInt() and 0xFF) shl 8) or (b[off].toInt() and 0xFF)) - val m = halfToFloat(((b[off + 3].toInt() and 0xFF) shl 8) or (b[off + 2].toInt() and 0xFF)) - val qh = intArrayOf(b[off + 4].toInt() and 0xFF, b[off + 5].toInt() and 0xFF, b[off + 6].toInt() and 0xFF, b[off + 7].toInt() and 0xFF) - for (j in 0 until 16) { - val q = b[off + 8 + j].toInt() and 0xFF - val lo = q and 0x0F; val hi = q ushr 4 - val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01 - val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01 - dst[dstOff + j] = d * (lo + (bitLo shl 4)) + m - dst[dstOff + 16 + j] = d * (hi + (bitHi shl 4)) + m - } - } - - // --- Q5_0: 22 bytes/block (d, qh[4], qs[16]), symmetric -16 -------------------------- - - private fun randomQ5_0Block(rng: Random, out: ByteArray, off: Int) { - val d = f16(0.02f + rng.nextFloat() * 0.05f) - out[off] = (d and 0xFF).toByte(); out[off + 1] = ((d ushr 8) and 0xFF).toByte() - for (k in 0 until 4) out[off + 2 + k] = rng.nextInt(256).toByte() - for (k in 0 until 16) out[off + 6 + k] = rng.nextInt(256).toByte() - } - - private fun dequantQ5_0Block(b: ByteArray, off: Int, dst: FloatArray, dstOff: Int) { - val d = halfToFloat(((b[off + 1].toInt() and 0xFF) shl 8) or (b[off].toInt() and 0xFF)) - val qh = intArrayOf(b[off + 2].toInt() and 0xFF, b[off + 3].toInt() and 0xFF, b[off + 4].toInt() and 0xFF, b[off + 5].toInt() and 0xFF) - for (j in 0 until 16) { - val q = b[off + 6 + j].toInt() and 0xFF - val lo = q and 0x0F; val hi = q ushr 4 - val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01 - val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01 - dst[dstOff + j] = d * (lo + (bitLo shl 4) - 16) - dst[dstOff + 16 + j] = d * (hi + (bitHi shl 4) - 16) - } - } - - private fun assertPackedMatchesFp32( - encoding: String, inputDim: Int, outputDim: Int, batchSize: Int, seed: Int, - ) { - val rng = Random(seed) - val blocksPerRow = inputDim / 32 - val bytesPerBlock = if 
(encoding == "Q5_1") 24 else 22 - val bytes = ByteArray(outputDim * blocksPerRow * bytesPerBlock) - val wf = FloatArray(outputDim * inputDim) // row-major [out, in] - for (o in 0 until outputDim) { - for (blk in 0 until blocksPerRow) { - val off = (o * blocksPerRow + blk) * bytesPerBlock - val dstOff = o * inputDim + blk * 32 - if (encoding == "Q5_1") { randomQ5_1Block(rng, bytes, off); dequantQ5_1Block(bytes, off, wf, dstOff) } - else { randomQ5_0Block(rng, bytes, off); dequantQ5_0Block(bytes, off, wf, dstOff) } - } - } - - val packed: Tensor = if (encoding == "Q5_1") - ctx.fromData(Q5_1BlockTensorData(Shape(outputDim, inputDim), bytes) as TensorData, FP32::class) - else - ctx.fromData(Q5_0BlockTensorData(Shape(outputDim, inputDim), bytes) as TensorData, FP32::class) - val fp32 = ctx.fromFloatArray(Shape(outputDim, inputDim), FP32::class, wf) - - val input = ctx.fromFloatArray( - Shape(batchSize, inputDim), FP32::class, FloatArray(batchSize * inputDim) { (rng.nextFloat() - 0.5f) }, - ) - val outPacked = ctx.ops.matmul(input, ctx.ops.transpose(packed)).data.copyToFloatArray() - val outFp32 = ctx.ops.matmul(input, ctx.ops.transpose(fp32)).data.copyToFloatArray() - - assertEquals(outFp32.size, outPacked.size, "$encoding output size") - var maxErr = 0f - for (i in outFp32.indices) maxErr = maxOf(maxErr, kotlin.math.abs(outFp32[i] - outPacked[i])) - assertTrue(maxErr < 1e-3f, "$encoding packed matmul deviates from FP32 dequant: maxErr=$maxErr") - } - - @Test fun q5_1_matmul_matches_fp32_dequant_single_batch() = - assertPackedMatchesFp32("Q5_1", inputDim = 128, outputDim = 64, batchSize = 1, seed = 1) - - @Test fun q5_1_matmul_matches_fp32_dequant_multi_batch() = - assertPackedMatchesFp32("Q5_1", inputDim = 256, outputDim = 96, batchSize = 3, seed = 2) - - @Test fun q5_0_matmul_matches_fp32_dequant_single_batch() = - assertPackedMatchesFp32("Q5_0", inputDim = 128, outputDim = 64, batchSize = 1, seed = 3) - - @Test fun q5_0_matmul_matches_fp32_dequant_multi_batch() = - 
assertPackedMatchesFp32("Q5_0", inputDim = 192, outputDim = 48, batchSize = 2, seed = 4) -} diff --git a/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt b/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt +++ b/skainet-backends/skainet-backend-cpu/src/linuxMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.linux.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. 
+ KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt b/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt +++ b/skainet-backends/skainet-backend-cpu/src/wasmJsMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasm.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. 
+ KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt b/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt index 425007c5..aa0ed475 100644 --- a/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt +++ b/skainet-backends/skainet-backend-cpu/src/wasmWasiMain/kotlin/sk/ainet/exec/tensor/ops/PlatformCpuOpsFactory.wasmWasi.kt @@ -1,7 +1,13 @@ package sk.ainet.exec.tensor.ops +import sk.ainet.backend.api.kernel.KernelRegistry +import sk.ainet.exec.kernel.ScalarKernelProvider import sk.ainet.lang.tensor.data.TensorDataFactory import sk.ainet.lang.tensor.ops.TensorOps -internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps = - { factory -> DefaultCpuOps(factory) } +internal actual fun platformDefaultCpuOpsFactory(): (TensorDataFactory) -> TensorOps { + // Non-JVM has no ServiceLoader; register the scalar packed-quant kernels + // (Q4_K/Q6_K/Q5_1/Q5_0/Q8_0/Q4_0) so DefaultCpuOpsBase can dispatch them. 
+ KernelRegistry.register(ScalarKernelProvider) + return { factory -> DefaultCpuOps(factory) } +} diff --git a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api index c4ecd987..c2c2a1be 100644 --- a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api +++ b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api @@ -3144,6 +3144,88 @@ public final class sk/ainet/lang/tensor/data/Q4_KTensorDataKt { public static final fun toFloatArray (Lsk/ainet/lang/tensor/data/Q4_KTensorData;)[F } +public final class sk/ainet/lang/tensor/data/Q5_0BlockTensorData : sk/ainet/lang/tensor/data/Q5_0TensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { + public static final field Companion Lsk/ainet/lang/tensor/data/Q5_0BlockTensorData$Companion; + public fun (Lsk/ainet/lang/tensor/Shape;[B)V + public fun copyToFloatArray ()[F + public fun dequantizeBlock (I[FI)V + public fun get ([I)Ljava/lang/Byte; + public synthetic fun get ([I)Ljava/lang/Object; + public fun getBlockCount ()I + public fun getBlockSize ()I + public fun getElementCount ()J + public fun getEncoding ()Lsk/ainet/lang/tensor/storage/TensorEncoding; + public fun getPackedData ()[B + public fun getPhysicalBytes ()J + public fun getShape ()Lsk/ainet/lang/tensor/Shape; + public fun set ([IB)V + public synthetic fun set ([ILjava/lang/Object;)V + public fun toFloatArray ()[F + public fun toTensorStorage (Lsk/ainet/lang/tensor/storage/LogicalDType;Lsk/ainet/lang/tensor/storage/Placement;)Lsk/ainet/lang/tensor/storage/TensorStorage; +} + +public final class sk/ainet/lang/tensor/data/Q5_0BlockTensorData$Companion { + public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q5_0BlockTensorData; +} + +public abstract interface class sk/ainet/lang/tensor/data/Q5_0TensorData : sk/ainet/lang/tensor/data/TensorData { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public 
static final field Companion Lsk/ainet/lang/tensor/data/Q5_0TensorData$Companion; + public abstract fun getBlockCount ()I + public abstract fun getPackedData ()[B +} + +public final class sk/ainet/lang/tensor/data/Q5_0TensorData$Companion { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I +} + +public final class sk/ainet/lang/tensor/data/Q5_0TensorData$DefaultImpls { + public static fun copyToFloatArray (Lsk/ainet/lang/tensor/data/Q5_0TensorData;)[F +} + +public final class sk/ainet/lang/tensor/data/Q5_1BlockTensorData : sk/ainet/lang/tensor/data/Q5_1TensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { + public static final field Companion Lsk/ainet/lang/tensor/data/Q5_1BlockTensorData$Companion; + public fun (Lsk/ainet/lang/tensor/Shape;[B)V + public fun copyToFloatArray ()[F + public fun dequantizeBlock (I[FI)V + public fun get ([I)Ljava/lang/Byte; + public synthetic fun get ([I)Ljava/lang/Object; + public fun getBlockCount ()I + public fun getBlockSize ()I + public fun getElementCount ()J + public fun getEncoding ()Lsk/ainet/lang/tensor/storage/TensorEncoding; + public fun getPackedData ()[B + public fun getPhysicalBytes ()J + public fun getShape ()Lsk/ainet/lang/tensor/Shape; + public fun set ([IB)V + public synthetic fun set ([ILjava/lang/Object;)V + public fun toFloatArray ()[F + public fun toTensorStorage (Lsk/ainet/lang/tensor/storage/LogicalDType;Lsk/ainet/lang/tensor/storage/Placement;)Lsk/ainet/lang/tensor/storage/TensorStorage; +} + +public final class sk/ainet/lang/tensor/data/Q5_1BlockTensorData$Companion { + public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q5_1BlockTensorData; +} + +public abstract interface class sk/ainet/lang/tensor/data/Q5_1TensorData : sk/ainet/lang/tensor/data/TensorData { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field Companion 
Lsk/ainet/lang/tensor/data/Q5_1TensorData$Companion; + public abstract fun getBlockCount ()I + public abstract fun getPackedData ()[B +} + +public final class sk/ainet/lang/tensor/data/Q5_1TensorData$Companion { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I +} + +public final class sk/ainet/lang/tensor/data/Q5_1TensorData$DefaultImpls { + public static fun copyToFloatArray (Lsk/ainet/lang/tensor/data/Q5_1TensorData;)[F +} + public final class sk/ainet/lang/tensor/data/Q6_KBlockTensorData : sk/ainet/lang/tensor/data/Q6_KTensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { public static final field Companion Lsk/ainet/lang/tensor/data/Q6_KBlockTensorData$Companion; public fun (Lsk/ainet/lang/tensor/Shape;[B)V @@ -5220,6 +5302,28 @@ public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q4_K : sk/ainet/l public fun toString ()Ljava/lang/String; } +public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q5_0 : sk/ainet/lang/tensor/storage/TensorEncoding { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field INSTANCE Lsk/ainet/lang/tensor/storage/TensorEncoding$Q5_0; + public fun equals (Ljava/lang/Object;)Z + public fun getName ()Ljava/lang/String; + public fun hashCode ()I + public fun physicalBytes (J)Ljava/lang/Long; + public fun toString ()Ljava/lang/String; +} + +public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q5_1 : sk/ainet/lang/tensor/storage/TensorEncoding { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field INSTANCE Lsk/ainet/lang/tensor/storage/TensorEncoding$Q5_1; + public fun equals (Ljava/lang/Object;)Z + public fun getName ()Ljava/lang/String; + public fun hashCode ()I + public fun physicalBytes (J)Ljava/lang/Long; + public fun toString ()Ljava/lang/String; +} + public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q6_K : 
sk/ainet/lang/tensor/storage/TensorEncoding { public static final field BLOCK_SIZE I public static final field BYTES_PER_BLOCK I diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt index d2795e0f..61eba8d7 100644 --- a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_0TensorData.kt @@ -6,127 +6,89 @@ import sk.ainet.lang.tensor.storage.TensorEncoding import sk.ainet.lang.types.DType /** - * Tensor data interface for the GGML **Q5_0** quantized format (5-bit, symmetric). + * Tensor data for the GGML **Q5_0** quantized format (5-bit, symmetric). * - * Q5_0 block format (32 elements per block, 22 bytes per block): - * - 2 bytes: f16 scale (`d`) - * - 4 bytes: `qh[0..3]` — the 5th (high) bit of each of the 32 codes - * - 16 bytes: `qs[0..15]` — the low 4 bits, two nibbles per byte + * Block format (32 elements, 22 bytes/block): + * - bytes 0..1 : `d` (f16 scale) + * - bytes 2..5 : `qh[0..3]` (5th/high bit of each of the 32 codes) + * - bytes 6..21 : `qs[0..15]` (low 4 bits, two nibbles per byte) * - * Dequantization (matching `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_0FromBytes`): - * for `j ∈ [0, 16)`, with `q = qs[j]`, `lo = q & 0x0F`, `hi = q >>> 4`, and the - * high bits `bitLo = (qh[j/8] >>> (j%8)) & 1`, `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`: + * Dequant (matches `DequantOps.dequantQ5_0FromBytes`): with `bitX` as in Q5_1, * * element[j] = d * (lo + (bitLo shl 4) - 16) * element[j + 16] = d * (hi + (bitHi shl 4) - 16) * - * The `- 16` bias makes the 5-bit code symmetric around zero. + * Matmul packing is **input-block-major** `(blockIdx * outputDim + o)`; see + * [Q5_1TensorData] for the layout/transpose contract. 
*/ public interface Q5_0TensorData : TensorData { - /** Number of Q5_0 blocks in the tensor. */ public val blockCount: Int - - /** Raw packed data containing all blocks. */ public val packedData: ByteArray public companion object { - /** Elements per Q5_0 block. */ public const val BLOCK_SIZE: Int = 32 - - /** Bytes per Q5_0 block (2 `d` + 4 `qh` + 16 `qs`). */ public const val BYTES_PER_BLOCK: Int = 22 } } -/** - * Implementation of [Q5_0TensorData] backed by a packed byte array, in the - * natural GGUF **row-major** `[out, in]` layout. `matmulQ5_0Vec` indexes the - * packed bytes row-major, so no block-major re-layout is needed. - */ +/** Packed-byte implementation of [Q5_0TensorData]. */ public class Q5_0BlockTensorData( initialShape: Shape, - private val data: ByteArray + private val data: ByteArray, ) : Q5_0TensorData, PackedBlockStorage { override val shape: Shape = Shape(initialShape.dimensions.copyOf()) private val strides: IntArray = shape.computeStrides() override val packedData: ByteArray get() = data - override val blockCount: Int = (shape.volume + Q5_0TensorData.BLOCK_SIZE - 1) / Q5_0TensorData.BLOCK_SIZE - override val encoding: TensorEncoding get() = TensorEncoding.Q5_0 override val blockSize: Int get() = Q5_0TensorData.BLOCK_SIZE init { - val requiredBytes = blockCount * Q5_0TensorData.BYTES_PER_BLOCK - require(data.size >= requiredBytes) { - "Data size ${data.size} is less than required $requiredBytes bytes for $blockCount blocks" - } + val required = blockCount * Q5_0TensorData.BYTES_PER_BLOCK + require(data.size >= required) { "Data size ${data.size} < required $required for $blockCount blocks" } } override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) { - require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" } + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } val base = blockIdx * Q5_0TensorData.BYTES_PER_BLOCK val d = 
Q4_0BlockTensorData.halfToFloat(((data[base + 1].toInt() and 0xFF) shl 8) or (data[base].toInt() and 0xFF)) - val qh0 = data[base + 2].toInt() and 0xFF - val qh1 = data[base + 3].toInt() and 0xFF - val qh2 = data[base + 4].toInt() and 0xFF - val qh3 = data[base + 5].toInt() and 0xFF - val qh = intArrayOf(qh0, qh1, qh2, qh3) - val qsBase = base + 6 - val elemsInBlock = minOf(Q5_0TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q5_0TensorData.BLOCK_SIZE) + val qh = intArrayOf( + data[base + 2].toInt() and 0xFF, data[base + 3].toInt() and 0xFF, + data[base + 4].toInt() and 0xFF, data[base + 5].toInt() and 0xFF, + ) + val qs = base + 6 + val elems = minOf(Q5_0TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q5_0TensorData.BLOCK_SIZE) for (j in 0 until 16) { - val q = data[qsBase + j].toInt() and 0xFF - val lo = q and 0x0F - val hi = q ushr 4 - val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01 - val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01 - val o0 = outputOffset + j - if (j < elemsInBlock && o0 < output.size) output[o0] = d * (lo + (bitLo shl 4) - 16) - val o1 = outputOffset + 16 + j - if (16 + j < elemsInBlock && o1 < output.size) output[o1] = d * (hi + (bitHi shl 4) - 16) + val q = data[qs + j].toInt() and 0xFF + val lo = q and 0x0F; val hi = q ushr 4 + val bitLo = (qh[j / 8] ushr (j % 8)) and 1 + val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 1 + if (j < elems) output[outputOffset + j] = d * (lo + (bitLo shl 4) - 16) + if (16 + j < elems) output[outputOffset + 16 + j] = d * (hi + (bitHi shl 4) - 16) } } override fun get(vararg indices: Int): Byte { - val flatIndex = calcFlatIndex(indices) + val flat = calcFlatIndex(indices) val tmp = FloatArray(Q5_0TensorData.BLOCK_SIZE) - dequantizeBlock(flatIndex / Q5_0TensorData.BLOCK_SIZE, tmp, 0) - return tmp[flatIndex % Q5_0TensorData.BLOCK_SIZE].toInt().toByte() + dequantizeBlock(flat / Q5_0TensorData.BLOCK_SIZE, tmp, 0) + return tmp[flat % Q5_0TensorData.BLOCK_SIZE].toInt().toByte() } - override fun 
set(vararg indices: Int, value: Byte) { - throw UnsupportedOperationException("Q5_0BlockTensorData is read-only (packed quantized weights)") - } + override fun set(vararg indices: Int, value: Byte): Unit = + throw UnsupportedOperationException("Q5_0BlockTensorData is read-only") private fun calcFlatIndex(indices: IntArray): Int { require(indices.size == shape.dimensions.size) { - "Number of indices (${indices.size}) must match tensor dimensions (${shape.dimensions.size})" - } - var flatIndex = 0 - for (i in indices.indices) { - val idx = indices[i] - require(idx >= 0 && idx < shape.dimensions[i]) { - "Index $idx out of bounds for dimension $i with size ${shape.dimensions[i]}" - } - flatIndex += idx * strides[i] + "Number of indices (${indices.size}) must match dimensions (${shape.dimensions.size})" } - return flatIndex + var flat = 0 + for (i in indices.indices) flat += indices[i] * strides[i] + return flat } public companion object { - /** Create [Q5_0BlockTensorData] from raw packed Q5_0 bytes (GGUF row-major). */ - public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_0BlockTensorData = - Q5_0BlockTensorData(shape, bytes) - } -} - -/** Dequantize Q5_0 tensor data to a FloatArray (row-major, matching the packed layout). 
*/ -public fun Q5_0TensorData.toFloatArray(): FloatArray { - val result = FloatArray(shape.volume) - val block = this as Q5_0BlockTensorData - for (blockIdx in 0 until blockCount) { - block.dequantizeBlock(blockIdx, result, blockIdx * Q5_0TensorData.BLOCK_SIZE) + public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_0BlockTensorData = Q5_0BlockTensorData(shape, bytes) } - return result } diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt index 1aab1b54..52550ca3 100644 --- a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q5_1TensorData.kt @@ -6,135 +6,95 @@ import sk.ainet.lang.tensor.storage.TensorEncoding import sk.ainet.lang.types.DType /** - * Tensor data interface for the GGML **Q5_1** quantized format (5-bit, with a - * per-block minimum). + * Tensor data for the GGML **Q5_1** quantized format (5-bit, per-block minimum). 
* - * Q5_1 block format (32 elements per block, 24 bytes per block): - * - 2 bytes: f16 scale (`d`) - * - 2 bytes: f16 minimum (`m`) - * - 4 bytes: `qh[0..3]` — the 5th (high) bit of each of the 32 codes - * - 16 bytes: `qs[0..15]` — the low 4 bits, two nibbles per byte + * Block format (32 elements, 24 bytes/block): + * - bytes 0..1 : `d` (f16 scale) + * - bytes 2..3 : `m` (f16 minimum) + * - bytes 4..7 : `qh[0..3]` (5th/high bit of each of the 32 codes) + * - bytes 8..23 : `qs[0..15]` (low 4 bits, two nibbles per byte) * - * Dequantization (matching `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_1FromBytes`): - * for `j ∈ [0, 16)`, with `q = qs[j]`, `lo = q & 0x0F`, `hi = q >>> 4`, and the - * high bits `bitLo = (qh[j/8] >>> (j%8)) & 1`, `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`: + * Dequant (matches `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_1FromBytes`), + * for `j ∈ [0,16)`, `lo = qs[j] & 0x0F`, `hi = qs[j] >>> 4`, + * `bitLo = (qh[j/8] >>> (j%8)) & 1`, `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`: * * element[j] = d * (lo + (bitLo shl 4)) + m * element[j + 16] = d * (hi + (bitHi shl 4)) + m * - * Enables direct quantized matmul without full dequantization, mirroring - * [Q4_0TensorData] / [Q8_0TensorData]. + * As packed by the GGUF converter for matmul, blocks are **input-block-major** + * `(blockIdx * outputDim + o)`; `Q5_1MatmulKernel` indexes them that way and the + * CPU-ops lazy transpose is a pure shape swap. The per-block [dequantizeBlock] + * below is layout-agnostic (it dequantizes the block at a flat index). */ public interface Q5_1TensorData : TensorData { - /** Number of Q5_1 blocks in the tensor. */ public val blockCount: Int - - /** Raw packed data containing all blocks. */ public val packedData: ByteArray public companion object { - /** Elements per Q5_1 block. */ public const val BLOCK_SIZE: Int = 32 - - /** Bytes per Q5_1 block (2 `d` + 2 `m` + 4 `qh` + 16 `qs`). 
*/ public const val BYTES_PER_BLOCK: Int = 24 } } -/** - * Implementation of [Q5_1TensorData] backed by a packed byte array, in the - * natural GGUF **row-major** `[out, in]` layout (each logical row's elements are - * packed sequentially as `in / 32` blocks). `matmulQ5_1Vec` indexes the packed - * bytes row-major, so no block-major re-layout is needed. - */ +/** Packed-byte implementation of [Q5_1TensorData]. */ public class Q5_1BlockTensorData( initialShape: Shape, - private val data: ByteArray + private val data: ByteArray, ) : Q5_1TensorData, PackedBlockStorage { override val shape: Shape = Shape(initialShape.dimensions.copyOf()) private val strides: IntArray = shape.computeStrides() override val packedData: ByteArray get() = data - override val blockCount: Int = (shape.volume + Q5_1TensorData.BLOCK_SIZE - 1) / Q5_1TensorData.BLOCK_SIZE - override val encoding: TensorEncoding get() = TensorEncoding.Q5_1 override val blockSize: Int get() = Q5_1TensorData.BLOCK_SIZE init { - val requiredBytes = blockCount * Q5_1TensorData.BYTES_PER_BLOCK - require(data.size >= requiredBytes) { - "Data size ${data.size} is less than required $requiredBytes bytes for $blockCount blocks" - } + val required = blockCount * Q5_1TensorData.BYTES_PER_BLOCK + require(data.size >= required) { "Data size ${data.size} < required $required for $blockCount blocks" } } override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) { - require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" } + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } val base = blockIdx * Q5_1TensorData.BYTES_PER_BLOCK val d = Q4_0BlockTensorData.halfToFloat(((data[base + 1].toInt() and 0xFF) shl 8) or (data[base].toInt() and 0xFF)) val m = Q4_0BlockTensorData.halfToFloat(((data[base + 3].toInt() and 0xFF) shl 8) or (data[base + 2].toInt() and 0xFF)) - val qh0 = data[base + 4].toInt() and 0xFF - val qh1 = data[base + 5].toInt() 
and 0xFF - val qh2 = data[base + 6].toInt() and 0xFF - val qh3 = data[base + 7].toInt() and 0xFF - val qh = intArrayOf(qh0, qh1, qh2, qh3) - val qsBase = base + 8 - val elemsInBlock = minOf(Q5_1TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q5_1TensorData.BLOCK_SIZE) + val qh = intArrayOf( + data[base + 4].toInt() and 0xFF, data[base + 5].toInt() and 0xFF, + data[base + 6].toInt() and 0xFF, data[base + 7].toInt() and 0xFF, + ) + val qs = base + 8 + val elems = minOf(Q5_1TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q5_1TensorData.BLOCK_SIZE) for (j in 0 until 16) { - val q = data[qsBase + j].toInt() and 0xFF - val lo = q and 0x0F - val hi = q ushr 4 - val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01 - val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01 - val o0 = outputOffset + j - if (j < elemsInBlock && o0 < output.size) output[o0] = d * (lo + (bitLo shl 4)) + m - val o1 = outputOffset + 16 + j - if (16 + j < elemsInBlock && o1 < output.size) output[o1] = d * (hi + (bitHi shl 4)) + m + val q = data[qs + j].toInt() and 0xFF + val lo = q and 0x0F; val hi = q ushr 4 + val bitLo = (qh[j / 8] ushr (j % 8)) and 1 + val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 1 + if (j < elems) output[outputOffset + j] = d * (lo + (bitLo shl 4)) + m + if (16 + j < elems) output[outputOffset + 16 + j] = d * (hi + (bitHi shl 4)) + m } } override fun get(vararg indices: Int): Byte { - val flatIndex = calcFlatIndex(indices) + val flat = calcFlatIndex(indices) val tmp = FloatArray(Q5_1TensorData.BLOCK_SIZE) - val blockIdx = flatIndex / Q5_1TensorData.BLOCK_SIZE - dequantizeBlock(blockIdx, tmp, 0) - // Q5_1 stores real-valued reconstructions; expose the rounded code is not - // meaningful, so this accessor is best-effort for debugging only. 
- return tmp[flatIndex % Q5_1TensorData.BLOCK_SIZE].toInt().toByte() + dequantizeBlock(flat / Q5_1TensorData.BLOCK_SIZE, tmp, 0) + return tmp[flat % Q5_1TensorData.BLOCK_SIZE].toInt().toByte() } - override fun set(vararg indices: Int, value: Byte) { - throw UnsupportedOperationException("Q5_1BlockTensorData is read-only (packed quantized weights)") - } + override fun set(vararg indices: Int, value: Byte): Unit = + throw UnsupportedOperationException("Q5_1BlockTensorData is read-only") private fun calcFlatIndex(indices: IntArray): Int { require(indices.size == shape.dimensions.size) { - "Number of indices (${indices.size}) must match tensor dimensions (${shape.dimensions.size})" - } - var flatIndex = 0 - for (i in indices.indices) { - val idx = indices[i] - require(idx >= 0 && idx < shape.dimensions[i]) { - "Index $idx out of bounds for dimension $i with size ${shape.dimensions[i]}" - } - flatIndex += idx * strides[i] + "Number of indices (${indices.size}) must match dimensions (${shape.dimensions.size})" } - return flatIndex + var flat = 0 + for (i in indices.indices) flat += indices[i] * strides[i] + return flat } public companion object { - /** Create [Q5_1BlockTensorData] from raw packed Q5_1 bytes (GGUF row-major). */ - public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_1BlockTensorData = - Q5_1BlockTensorData(shape, bytes) - } -} - -/** Dequantize Q5_1 tensor data to a FloatArray (row-major, matching the packed layout). */ -public fun Q5_1TensorData.toFloatArray(): FloatArray { - val result = FloatArray(shape.volume) - val block = this as Q5_1BlockTensorData - for (blockIdx in 0 until blockCount) { - block.dequantizeBlock(blockIdx, result, blockIdx * Q5_1TensorData.BLOCK_SIZE) + public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q5_1BlockTensorData = Q5_1BlockTensorData(shape, bytes) } - return result }