diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt index a5934221..fd22f37f 100644 --- a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt +++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/KernelProvider.kt @@ -67,6 +67,12 @@ public interface KernelProvider { */ public fun matmulQ8_0(): Q8_0MatmulKernel? = null + /** + * F32 × Q4_0 matmul kernel exposed by this provider, or `null` if + * this provider does not specialize Q4_0. Same fall-through pattern. + */ + public fun matmulQ4_0(): Q4_0MatmulKernel? = null + /** * Capability query: does this provider carry a kernel for * [opName] with the given [dtypeKeys]? @@ -100,6 +106,7 @@ public interface KernelProvider { "BFloat16" -> matmulBf16() != null "Q4_K" -> matmulQ4K() != null "Q8_0" -> matmulQ8_0() != null + "Q4_0" -> matmulQ4_0() != null else -> false } } diff --git a/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q4_0MatmulKernel.kt b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q4_0MatmulKernel.kt new file mode 100644 index 00000000..fae0825b --- /dev/null +++ b/skainet-backends/skainet-backend-api/src/commonMain/kotlin/sk/ainet/backend/api/kernel/Q4_0MatmulKernel.kt @@ -0,0 +1,46 @@ +package sk.ainet.backend.api.kernel + +/** + * F32 input × Q4_0-packed weights matrix-vector multiply, in canonical + * ggml block layout. 
+ * + * output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j]) + * for j ∈ [0, inputDim), o ∈ [0, outputDim) + * + * Block layout (32-element block, 18 bytes/block; see + * [sk.ainet.lang.tensor.data.Q4_0BlockTensorData] kdoc): + * - bytes 0..1 : `d` (block scale, FP16 LE) + * - bytes 2..17 : 16 bytes packing 32 4-bit codes (split layout — low + * nibbles decode elements 0..15, high nibbles decode elements 16..31) + * + * Per element: `dequant = (code - 8) * d` (the `- 8` bias centres the + * unsigned 4-bit code around zero). Q4_0 has no per-block min / offset. + * + * Implementations MUST NOT mutate `input` or `weight`. They MAY assume + * the arrays do not alias each other or `output`. They MUST fully + * write the `outputDim` floats starting at `output[outputOffset]`. + * + * Packed-weight row-major contract: `weight` holds blocks laid out + * `(blockIdx * outputDim + o) * 18` for output row `o` and input block + * index `blockIdx`. This matches `Q4_0BlockTensorData.packedData`. + * + * `inputDim` MUST be a multiple of 32 (the Q4_0 block size). + */ +public interface Q4_0MatmulKernel { + /** + * @param input FP32 input vector (single row). + * @param inputOffset element offset into [input] where the row starts. + * @param weight packed Q4_0 bytes for the full `outputDim × inputDim` weight tensor. + * @param weightByteOffset byte offset into [weight] where block (0, 0) starts. + * @param inputDim contraction dimension (must be a multiple of 32). + * @param outputDim number of output cells. + * @param output FP32 output vector. + * @param outputOffset element offset into [output] where the row starts. 
+ */ + public fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) +} diff --git a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api index 0caec153..a953d311 100644 --- a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api +++ b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api @@ -53,6 +53,7 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProvider : sk/ainet/ba public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel; public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel; public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel; + public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel; public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel; public fun supports (Ljava/lang/String;Ljava/util/List;)Z } @@ -65,6 +66,7 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProviderFactory : sk/a public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel; public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel; public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel; + public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel; public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel; public fun supports (Ljava/lang/String;Ljava/util/List;)Z } @@ -97,6 +99,7 @@ public final class sk/ainet/exec/kernel/ScalarKernelProvider : sk/ainet/backend/ public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel; public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel; public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel; + public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel; public fun matmulQ8_0 
()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel; public fun supports (Ljava/lang/String;Ljava/util/List;)Z } @@ -109,6 +112,7 @@ public final class sk/ainet/exec/kernel/ScalarKernelProviderFactory : sk/ainet/b public fun matmulBf16 ()Lsk/ainet/backend/api/kernel/Bf16MatmulKernel; public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel; public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel; + public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel; public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel; public fun supports (Ljava/lang/String;Ljava/util/List;)Z } @@ -118,6 +122,11 @@ public final class sk/ainet/exec/kernel/ScalarMatmulKernel : sk/ainet/backend/ap public fun matmul ([FII[FII[FIIIII)V } +public final class sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel : sk/ainet/backend/api/kernel/Q4_0MatmulKernel { + public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ4_0MatmulKernel; + public fun matmul ([FI[BIII[FI)V +} + public final class sk/ainet/exec/kernel/ScalarQ8_0MatmulKernel : sk/ainet/backend/api/kernel/Q8_0MatmulKernel { public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ8_0MatmulKernel; public fun matmul ([FI[BIII[FI)V diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt index 080377a7..a7c13ccd 100644 --- a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarKernelProvider.kt @@ -3,6 +3,7 @@ package sk.ainet.exec.kernel import sk.ainet.backend.api.kernel.Bf16MatmulKernel import sk.ainet.backend.api.kernel.Fp32MatmulKernel import sk.ainet.backend.api.kernel.KernelProvider +import sk.ainet.backend.api.kernel.Q4_0MatmulKernel import 
sk.ainet.backend.api.kernel.Q8_0MatmulKernel /** @@ -25,4 +26,5 @@ public object ScalarKernelProvider : KernelProvider { override fun matmulFp32(): Fp32MatmulKernel = ScalarMatmulKernel override fun matmulBf16(): Bf16MatmulKernel = ScalarBf16MatmulKernel override fun matmulQ8_0(): Q8_0MatmulKernel = ScalarQ8_0MatmulKernel + override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel } diff --git a/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel.kt new file mode 100644 index 00000000..6a844e86 --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/commonMain/kotlin/sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel.kt @@ -0,0 +1,96 @@ +package sk.ainet.exec.kernel + +import sk.ainet.backend.api.kernel.Q4_0MatmulKernel + +/** + * Scalar reference implementation of [Q4_0MatmulKernel] — straight + * per-block dequant + per-element FMA, no SIMD. Always available on + * every KMP target. Used as: + * + * - The correctness reference that accelerated kernels (Panama Vector, + * native FFM) must match within FP order tolerance. + * - A guaranteed fallback when no accelerated provider is registered. + * + * Block layout (32-element block, 18 bytes): + * - bytes 0..1 : FP16 little-endian scale (`d`) + * - bytes 2..17: 16 bytes packing 32 4-bit codes (split layout) + * + * Dequant per element: `(code - 8) * d`. No min / offset. + * + * Performance is intentionally modest; production paths should pick the + * Panama Vector or native variant via the kernel registry. 
+ */ +public object ScalarQ4_0MatmulKernel : Q4_0MatmulKernel { + + private const val BLOCK_SIZE = 32 + private const val BYTES_PER_BLOCK = 18 + + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "ScalarQ4_0MatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0 || inputDim == 0) { + if (outputDim > 0) { + for (o in 0 until outputDim) output[outputOffset + o] = 0f + } + return + } + val blocksPerInputDim = inputDim / BLOCK_SIZE + + for (o in 0 until outputDim) { + var acc = 0f + for (blockIdx in 0 until blocksPerInputDim) { + val blockBase = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK + // FP16 scale: two LE bytes. + val dBits = (weight[blockBase].toInt() and 0xFF) or + ((weight[blockBase + 1].toInt() and 0xFF) shl 8) + val d = halfToFloat(dBits) + // 32 codes, blockIdx-th window of the input vector. Split + // layout: low nibbles → elements 0..15, high → 16..31. + val inputBase = inputOffset + blockIdx * BLOCK_SIZE + val codesBase = blockBase + 2 + for (j in 0 until 16) { + val b = weight[codesBase + j].toInt() and 0xFF + val lo = (b and 0x0F) - 8 + val hi = (b ushr 4) - 8 + acc += input[inputBase + j] * lo * d + acc += input[inputBase + 16 + j] * hi * d + } + } + output[outputOffset + o] = acc + } + } + + /** + * Convert a 16-bit IEEE-754 half-precision value (low 16 bits of + * [hbits]) to FP32. Mirrors [ScalarQ8_0MatmulKernel]'s inlined helper + * — the skainet-lang-core dequant helper is internal to that module. 
+ */ + private fun halfToFloat(hbits: Int): Float { + val sign = (hbits and 0x8000) shl 16 + val exp = (hbits and 0x7C00) shr 10 + val mant = hbits and 0x03FF + return when (exp) { + 0 -> { + if (mant == 0) Float.fromBits(sign) + else { + var m = mant + var e = -14 + while ((m and 0x400) == 0) { + m = m shl 1 + e-- + } + m = m and 0x3FF + Float.fromBits(sign or ((e + 127) shl 23) or (m shl 13)) + } + } + 31 -> Float.fromBits(sign or (0xFF shl 23) or (mant shl 13)) + else -> Float.fromBits(sign or ((exp - 15 + 127) shl 23) or (mant shl 13)) + } + } +} diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt index 703beebf..b70abfd9 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt @@ -9,9 +9,11 @@ import sk.ainet.backend.api.kernel.KernelRegistry import sk.ainet.backend.api.kernel.KernelServiceLoader import sk.ainet.backend.api.kernel.KernelStrictness import sk.ainet.backend.api.kernel.Q4KMatmulKernel +import sk.ainet.backend.api.kernel.Q4_0MatmulKernel import sk.ainet.backend.api.kernel.Q8_0MatmulKernel import sk.ainet.exec.kernel.ScalarBf16MatmulKernel import sk.ainet.exec.kernel.ScalarMatmulKernel +import sk.ainet.exec.kernel.ScalarQ4_0MatmulKernel import sk.ainet.lang.tensor.Shape import sk.ainet.lang.tensor.Tensor import sk.ainet.lang.tensor.data.DenseFloatArrayTensorData @@ -21,6 +23,7 @@ import sk.ainet.lang.tensor.data.MemorySegmentTensorData import sk.ainet.lang.tensor.data.Q4MemorySegmentMarker import sk.ainet.lang.tensor.data.Q4MemorySegmentTensorData import sk.ainet.lang.tensor.data.Bf16TensorData +import sk.ainet.lang.tensor.data.Q4_0TensorData import sk.ainet.lang.tensor.data.Q8_0TensorData import 
sk.ainet.lang.tensor.data.Q8MemorySegmentMarker import sk.ainet.lang.tensor.data.Q8MemorySegmentTensorData @@ -113,6 +116,24 @@ internal class DefaultCpuOpsJvm( ?: ScalarBf16MatmulKernel } + /** + * Q4_0 matmul kernel resolved via [KernelRegistry]. Mirrors + * [bf16MatmulKernel]: non-null, picks the highest-priority provider + * that carries a Q4_0 kernel (native FFM at 100, Panama Vector at + * 50), falling back to [ScalarQ4_0MatmulKernel] — the scalar SPI + * kernel is the floor (always available; providers may return `null`), so Q4_0 + * has no pre-SPI legacy fallback to thread through. + */ + private val q4_0MatmulKernel: Q4_0MatmulKernel by lazy { + if (KernelRegistry.providers().isEmpty()) { + KernelServiceLoader.installAll() + } + KernelRegistry.providers() + .firstOrNull { it.isAvailable() && it.matmulQ4_0() != null } + ?.matmulQ4_0() + ?: ScalarQ4_0MatmulKernel + } + override fun add(a: Tensor, b: Tensor): Tensor { vectorFloatBinary(a, b, { x, y -> x.add(y) }) { x, y -> x + y }?.let { return it } return super.add(a, b) @@ -521,6 +542,22 @@ internal class DefaultCpuOpsJvm( @Suppress("UNCHECKED_CAST") CpuTensor(outData as TensorData, this, a.dtype) } + is Q4_0TensorData -> { + val outBuffer = FloatArray(batchSize * outputDim) + for (batch in 0 until batchSize) { + val batchInput = if (batchSize == 1) inputBuffer + else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim) + q4_0MatmulKernel.matmul( + batchInput, 0, + bData.packedData, 0, + inputDim, outputDim, + outBuffer, batch * outputDim, + ) + } + val outData = DenseFloatArrayTensorData(Shape(batchSize, outputDim), outBuffer) + @Suppress("UNCHECKED_CAST") + CpuTensor(outData as TensorData, this, a.dtype) + } is Q4_KTensorData -> { val outBuffer = FloatArray(batchSize * outputDim) val spiKernel = q4kMatmulKernel diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt 
b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt index cc68683b..07a83cc3 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelProviderSupportsTest.kt @@ -47,6 +47,11 @@ class KernelProviderSupportsTest { p.supports("matmul", listOf("Float32", "Q8_0")), "Q8_0 matmul support must mirror matmulQ8_0() != null", ) + assertEquals( + p.matmulQ4_0() != null, + p.supports("matmul", listOf("Float32", "Q4_0")), + "Q4_0 matmul support must mirror matmulQ4_0() != null", + ) } @Test @@ -62,6 +67,9 @@ class KernelProviderSupportsTest { p.matmulQ4K() != null, p.supports("matmul", listOf("Float32", "Q4_K")), ) + // Scalar carries the Q4_0 floor kernel, so the capability query + // must report it as supported. + assertTrue(p.supports("matmul", listOf("Float32", "Q4_0"))) } @Test diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0MatmulDispatchTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0MatmulDispatchTest.kt new file mode 100644 index 00000000..f005dc25 --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0MatmulDispatchTest.kt @@ -0,0 +1,110 @@ +package sk.ainet.exec.tensor.ops + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.Test +import kotlin.test.assertTrue +import sk.ainet.context.DirectCpuExecutionContext +import sk.ainet.exec.kernel.ScalarQ4_0MatmulKernel +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.Tensor +import sk.ainet.lang.tensor.data.Q4_0BlockTensorData +import sk.ainet.lang.tensor.data.TensorData +import sk.ainet.lang.types.FP32 + +/** + * Integration tests for the FP32 × Q4_0 dispatch path in + * [DefaultCpuOpsJvm.matmul]. 
Confirms that calling matmul on a + * Q4_0-backed weight tensor produces the same output as the scalar + * Q4_0 kernel — proving the dispatch actually routes through the + * registered Q4_0 SPI kernel (or the scalar floor). Mirrors + * [Q8_0MatmulDispatchTest]; pins integration, not kernel correctness. + */ +class Q4_0MatmulDispatchTest { + + private val ctx = DirectCpuExecutionContext() + + private val blockSize = 32 + private val bytesPerBlock = 18 + + private fun randomQ4_0Bytes(blocksPerInputDim: Int, outputDim: Int, seed: Int): ByteArray { + val rng = Random(seed) + val numBlocks = blocksPerInputDim * outputDim + val bytes = ByteArray(numBlocks * bytesPerBlock) + rng.nextBytes(bytes) + for (block in 0 until numBlocks) { + val base = block * bytesPerBlock + // FP16 scale ≈ 1.17e-2 (0x2200 = 1.5 * 2^-7) — safely finite, non-zero. + bytes[base + 0] = 0x00.toByte() + bytes[base + 1] = 0x22.toByte() + } + return bytes + } + + private fun scalarQ4_0Reference( + input: FloatArray, weight: ByteArray, + inputDim: Int, outputDim: Int, + batchSize: Int, + ): FloatArray { + val out = FloatArray(batchSize * outputDim) + for (b in 0 until batchSize) { + ScalarQ4_0MatmulKernel.matmul( + input, b * inputDim, + weight, 0, + inputDim, outputDim, + out, b * outputDim, + ) + } + return out + } + + private fun assertDispatchMatchesScalar( + batchSize: Int, inputDim: Int, outputDim: Int, seed: Int, + tolPerBlock: Float = 1e-2f, + ) { + val rng = Random(seed) + val inputFloats = FloatArray(batchSize * inputDim) { rng.nextFloat() - 0.5f } + val blocksPerInputDim = inputDim / blockSize + + val weightBytes = randomQ4_0Bytes(blocksPerInputDim, outputDim, seed) + // Logical shape of a Q4_0 weight tensor is [inputDim, outputDim]. 
+ @Suppress("UNCHECKED_CAST") + val td = Q4_0BlockTensorData(Shape(inputDim, outputDim), weightBytes) as TensorData + val weight = ctx.fromData(td, FP32::class) + val input = ctx.fromFloatArray( + Shape(batchSize, inputDim), FP32::class, inputFloats, + ) + + val out = ctx.ops.matmul(input, weight) + val outArr = out.data.copyToFloatArray() + + val expected = scalarQ4_0Reference(inputFloats, weightBytes, inputDim, outputDim, batchSize) + + val tol = (tolPerBlock * blocksPerInputDim.coerceAtLeast(1)).coerceAtLeast(tolPerBlock) + for (i in expected.indices) { + val diff = abs(expected[i] - outArr[i]) + assertTrue( + diff <= tol, + "dispatch mismatch at $i: expected=${expected[i]} got=${outArr[i]} diff=$diff tol=$tol", + ) + } + } + + @Test + fun single_batch_matmul_against_q4_0_weight_routes_correctly() { + // batchSize=1 hits the optimized "no copyOfRange" branch in chooseQuantizedMatmul. + assertDispatchMatchesScalar(batchSize = 1, inputDim = 128, outputDim = 64, seed = 1) + } + + @Test + fun multi_batch_matmul_against_q4_0_weight_routes_correctly() { + // batchSize>1 exercises the per-row copyOfRange branch. + assertDispatchMatchesScalar(batchSize = 3, inputDim = 256, outputDim = 32, seed = 2) + } + + @Test + fun llm_typical_attention_proj_matmul_routes_correctly() { + // Realistic attention-projection size (matvec at dim×dim). 
+ assertDispatchMatchesScalar(batchSize = 1, inputDim = 512, outputDim = 512, seed = 3) + } +} diff --git a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api index 40f996c8..1805010a 100644 --- a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api +++ b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api @@ -3030,6 +3030,55 @@ public final class sk/ainet/lang/tensor/data/Q4MemorySegmentTensorData$Companion public static synthetic fun fromRawBytes$default (Lsk/ainet/lang/tensor/data/Q4MemorySegmentTensorData$Companion;Lsk/ainet/lang/tensor/Shape;[BLjava/lang/foreign/Arena;JILjava/lang/Object;)Lsk/ainet/lang/tensor/data/Q4MemorySegmentTensorData; } +public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData : sk/ainet/lang/tensor/data/Q4_0TensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { + public static final field Companion Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion; + public fun (Lsk/ainet/lang/tensor/Shape;[B)V + public fun copyToFloatArray ()[F + public fun dequantizeBlock (I[FI)V + public fun get ([I)Ljava/lang/Byte; + public synthetic fun get ([I)Ljava/lang/Object; + public fun getBlockCount ()I + public fun getBlockScale (I)F + public fun getBlockSize ()I + public fun getCode (II)B + public fun getElementCount ()J + public fun getEncoding ()Lsk/ainet/lang/tensor/storage/TensorEncoding; + public fun getPackedData ()[B + public fun getPhysicalBytes ()J + public fun getShape ()Lsk/ainet/lang/tensor/Shape; + public fun set ([IB)V + public synthetic fun set ([ILjava/lang/Object;)V + public fun toFloatArray ()[F + public fun toTensorStorage (Lsk/ainet/lang/tensor/storage/LogicalDType;Lsk/ainet/lang/tensor/storage/Placement;)Lsk/ainet/lang/tensor/storage/TensorStorage; +} + +public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion { + public final fun fromRawBytes 
(Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData; +} + +public abstract interface class sk/ainet/lang/tensor/data/Q4_0TensorData : sk/ainet/lang/tensor/data/TensorData { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field Companion Lsk/ainet/lang/tensor/data/Q4_0TensorData$Companion; + public abstract fun getBlockCount ()I + public abstract fun getBlockScale (I)F + public abstract fun getCode (II)B + public abstract fun getPackedData ()[B +} + +public final class sk/ainet/lang/tensor/data/Q4_0TensorData$Companion { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I +} + +public final class sk/ainet/lang/tensor/data/Q4_0TensorData$DefaultImpls { + public static fun copyToFloatArray (Lsk/ainet/lang/tensor/data/Q4_0TensorData;)[F +} + +public final class sk/ainet/lang/tensor/data/Q4_0TensorDataKt { + public static final fun toFloatArray (Lsk/ainet/lang/tensor/data/Q4_0TensorData;)[F +} + public final class sk/ainet/lang/tensor/data/Q4_KBlockTensorData : sk/ainet/lang/tensor/data/Q4_KTensorData, sk/ainet/lang/tensor/storage/PackedBlockStorage { public static final field Companion Lsk/ainet/lang/tensor/data/Q4_KBlockTensorData$Companion; public fun (Lsk/ainet/lang/tensor/Shape;[B)V @@ -5143,6 +5192,17 @@ public final class sk/ainet/lang/tensor/storage/TensorEncoding$Opaque : sk/ainet public fun toString ()Ljava/lang/String; } +public final class sk/ainet/lang/tensor/storage/TensorEncoding$Q4_0 : sk/ainet/lang/tensor/storage/TensorEncoding { + public static final field BLOCK_SIZE I + public static final field BYTES_PER_BLOCK I + public static final field INSTANCE Lsk/ainet/lang/tensor/storage/TensorEncoding$Q4_0; + public fun equals (Ljava/lang/Object;)Z + public fun getName ()Ljava/lang/String; + public fun hashCode ()I + public fun physicalBytes (J)Ljava/lang/Long; + public fun toString ()Ljava/lang/String; +} + public final class 
sk/ainet/lang/tensor/storage/TensorEncoding$Q4_K : sk/ainet/lang/tensor/storage/TensorEncoding { public static final field BLOCK_SIZE I public static final field BYTES_PER_BLOCK I diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorData.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorData.kt new file mode 100644 index 00000000..c1923721 --- /dev/null +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorData.kt @@ -0,0 +1,202 @@ +package sk.ainet.lang.tensor.data + +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.storage.PackedBlockStorage +import sk.ainet.lang.tensor.storage.TensorEncoding +import sk.ainet.lang.types.DType + +/** + * Tensor data interface for the Q4_0 quantized format (older GGML 4-bit). + * + * Q4_0 block format (32 elements per block, 18 bytes per block): + * - 2 bytes: f16 scale (`d`) + * - 16 bytes: 32 packed 4-bit codes (2 nibbles per byte) + * + * Canonical ggml nibble layout (the *split* layout, matching + * `sk.ainet.io.gguf.dequant.DequantOps.dequantQ4_0FromBytes`): for the + * 16 code bytes `qs[0..15]`, the low nibbles decode elements `0..15` and + * the high nibbles decode elements `16..31`: + * + * element[j] = ((qs[j] & 0x0F) - 8) * d for j ∈ [0, 16) + * element[j + 16] = ((qs[j] >>> 4) - 8) * d + * + * The `- 8` bias makes the 4-bit code symmetric around zero. This is the + * layout real GGUF Q4_0 weights are stored in. + * + * This interface enables direct quantized matmul without full + * dequantization, mirroring [Q8_0TensorData]. + */ +public interface Q4_0TensorData : TensorData { + /** Number of Q4_0 blocks in the tensor. */ + public val blockCount: Int + + /** Raw packed data containing all blocks. */ + public val packedData: ByteArray + + /** Get the scale factor (`d`) for a specific block. 
*/ + public fun getBlockScale(blockIdx: Int): Float + + /** + * Get the raw unsigned 4-bit code (0..15) for [elementIdx] (0..31) + * within a block. The dequantized value is `(code - 8) * scale`. + */ + public fun getCode(blockIdx: Int, elementIdx: Int): Byte + + public companion object { + /** Elements per Q4_0 block. */ + public const val BLOCK_SIZE: Int = 32 + + /** Bytes per Q4_0 block (2 bytes scale + 16 bytes packed nibbles). */ + public const val BYTES_PER_BLOCK: Int = 18 + } +} + +/** + * Implementation of [Q4_0TensorData] backed by a packed byte array. + * + * Memory layout per block (18 bytes): + * - bytes [0..1] : f16 scale (little-endian) + * - bytes [2..17] : 16 bytes packing 32 4-bit codes (split layout, see + * [Q4_0TensorData] kdoc) + * + * @param initialShape the logical shape of the tensor (in elements, not blocks) + * @param data the raw packed block data + */ +public class Q4_0BlockTensorData( + initialShape: Shape, + private val data: ByteArray +) : Q4_0TensorData, PackedBlockStorage { + + override val shape: Shape = Shape(initialShape.dimensions.copyOf()) + private val strides: IntArray = shape.computeStrides() + override val packedData: ByteArray get() = data + + override val blockCount: Int = (shape.volume + Q4_0TensorData.BLOCK_SIZE - 1) / Q4_0TensorData.BLOCK_SIZE + + // PackedBlockStorage implementation + override val encoding: TensorEncoding get() = TensorEncoding.Q4_0 + override val blockSize: Int get() = Q4_0TensorData.BLOCK_SIZE + + override fun dequantizeBlock(blockIdx: Int, output: FloatArray, outputOffset: Int) { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" } + val scale = getBlockScale(blockIdx) + val elemsInBlock = minOf(Q4_0TensorData.BLOCK_SIZE, shape.volume - blockIdx * Q4_0TensorData.BLOCK_SIZE) + val codesBase = blockIdx * Q4_0TensorData.BYTES_PER_BLOCK + 2 + for (j in 0 until 16) { + val b = data[codesBase + j].toInt() and 0xFF + val lo = (b and 0x0F) - 8 + val hi = (b 
ushr 4) - 8 + val o0 = outputOffset + j + if (j < elemsInBlock && o0 < output.size) output[o0] = lo.toFloat() * scale + val o1 = outputOffset + 16 + j + if (16 + j < elemsInBlock && o1 < output.size) output[o1] = hi.toFloat() * scale + } + } + + init { + val requiredBytes = blockCount * Q4_0TensorData.BYTES_PER_BLOCK + require(data.size >= requiredBytes) { + "Data size ${data.size} is less than required $requiredBytes bytes for $blockCount blocks" + } + } + + override fun getBlockScale(blockIdx: Int): Float { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds (0..$blockCount)" } + val offset = blockIdx * Q4_0TensorData.BYTES_PER_BLOCK + val b0 = data[offset].toInt() and 0xFF + val b1 = data[offset + 1].toInt() and 0xFF + return halfToFloat((b1 shl 8) or b0) + } + + override fun getCode(blockIdx: Int, elementIdx: Int): Byte { + require(blockIdx in 0 until blockCount) { "Block index $blockIdx out of bounds" } + require(elementIdx in 0 until Q4_0TensorData.BLOCK_SIZE) { "Element index $elementIdx out of bounds (0..31)" } + val byteInBlock = if (elementIdx < 16) elementIdx else elementIdx - 16 + val b = data[blockIdx * Q4_0TensorData.BYTES_PER_BLOCK + 2 + byteInBlock].toInt() and 0xFF + val nibble = if (elementIdx < 16) (b and 0x0F) else (b ushr 4) + return nibble.toByte() + } + + override fun get(vararg indices: Int): Byte { + val flatIndex = calcFlatIndex(indices) + val blockIdx = flatIndex / Q4_0TensorData.BLOCK_SIZE + val elementIdx = flatIndex % Q4_0TensorData.BLOCK_SIZE + return getCode(blockIdx, elementIdx) + } + + override fun set(vararg indices: Int, value: Byte) { + val flatIndex = calcFlatIndex(indices) + val blockIdx = flatIndex / Q4_0TensorData.BLOCK_SIZE + val elementIdx = flatIndex % Q4_0TensorData.BLOCK_SIZE + val byteInBlock = if (elementIdx < 16) elementIdx else elementIdx - 16 + val offset = blockIdx * Q4_0TensorData.BYTES_PER_BLOCK + 2 + byteInBlock + val nib = value.toInt() and 0x0F + val cur = data[offset].toInt() and 
0xFF + data[offset] = if (elementIdx < 16) ((cur and 0xF0) or nib).toByte() + else ((cur and 0x0F) or (nib shl 4)).toByte() + } + + private fun calcFlatIndex(indices: IntArray): Int { + require(indices.size == shape.dimensions.size) { + "Number of indices (${indices.size}) must match tensor dimensions (${shape.dimensions.size})" + } + var flatIndex = 0 + for (i in indices.indices) { + val idx = indices[i] + require(idx >= 0 && idx < shape.dimensions[i]) { + "Index $idx out of bounds for dimension $i with size ${shape.dimensions[i]}" + } + flatIndex += idx * strides[i] + } + return flatIndex + } + + public companion object { + /** Create [Q4_0BlockTensorData] from raw packed Q4_0 bytes. */ + public fun fromRawBytes(shape: Shape, bytes: ByteArray): Q4_0BlockTensorData { + return Q4_0BlockTensorData(shape, bytes) + } + + /** Convert f16 bits to float32. */ + internal fun halfToFloat(hbits: Int): Float { + val sign = (hbits and 0x8000) shl 16 + val exp = (hbits and 0x7C00) shr 10 + val mant = hbits and 0x03FF + return when (exp) { + 0 -> { + if (mant == 0) { + Float.fromBits(sign) + } else { + var m = mant + var e = -14 + while ((m and 0x400) == 0) { + m = m shl 1 + e-- + } + m = m and 0x3FF + Float.fromBits(sign or ((e + 127) shl 23) or (m shl 13)) + } + } + 31 -> Float.fromBits(sign or (0xFF shl 23) or (mant shl 13)) + else -> Float.fromBits(sign or ((exp - 15 + 127) shl 23) or (mant shl 13)) + } + } + } +} + +/** + * Dequantize Q4_0 tensor data to a FloatArray. + * `element[j] = (code[j] - 8) * scale` in the canonical split layout. 
+ */ +public fun Q4_0TensorData.toFloatArray(): FloatArray { + val result = FloatArray(shape.volume) + for (blockIdx in 0 until blockCount) { + val scale = getBlockScale(blockIdx) + val base = blockIdx * Q4_0TensorData.BLOCK_SIZE + val elemsInBlock = minOf(Q4_0TensorData.BLOCK_SIZE, shape.volume - base) + for (i in 0 until elemsInBlock) { + result[base + i] = (getCode(blockIdx, i).toInt() - 8).toFloat() * scale + } + } + return result +} diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt index 4a9f745f..bd781a4f 100644 --- a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt +++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/storage/TensorEncoding.kt @@ -52,6 +52,18 @@ public sealed interface TensorEncoding { } } + /** GGML Q4_0 block quantization: 32 elements per 18-byte block. */ + public data object Q4_0 : TensorEncoding { + public const val BLOCK_SIZE: Int = 32 + public const val BYTES_PER_BLOCK: Int = 18 + + override val name: String get() = "Q4_0" + override fun physicalBytes(elementCount: Long): Long { + val blocks = (elementCount + BLOCK_SIZE - 1) / BLOCK_SIZE + return blocks * BYTES_PER_BLOCK + } + } + /** GGML Q8_0 block quantization: 32 elements per 34-byte block. 
*/ public data object Q8_0 : TensorEncoding { public const val BLOCK_SIZE: Int = 32 diff --git a/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorDataTest.kt b/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorDataTest.kt new file mode 100644 index 00000000..a2cc0c33 --- /dev/null +++ b/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0TensorDataTest.kt @@ -0,0 +1,95 @@ +package sk.ainet.lang.tensor.data + +import sk.ainet.lang.tensor.Shape +import kotlin.test.Test +import kotlin.test.assertContentEquals +import kotlin.test.assertEquals + +/** Tests for Q4_0 tensor data: block constants, scale decoding, split nibble layout, dequantization, element writes and multi-block shapes. */ class Q4_0TensorDataTest { + + /** Pack 32 unsigned 4-bit codes (0..15) into the canonical split layout: byte j carries code j in its low nibble and code j + 16 in its high nibble. */ + private fun packCodes(codes: IntArray): ByteArray { + require(codes.size == 32) + val out = ByteArray(16) + for (j in 0 until 16) { + out[j] = ((codes[j] and 0x0F) or ((codes[j + 16] and 0x0F) shl 4)).toByte() + } + return out + } + + /** Assemble one 18-byte Q4_0 block: the two scale bytes ([scaleLo] first) followed by the 16 packed code bytes. */ private fun block(scaleLo: Int, scaleHi: Int, codes: IntArray): ByteArray = + byteArrayOf(scaleLo.toByte(), scaleHi.toByte()) + packCodes(codes) + + @Test + fun `constants are correct`() { + assertEquals(32, Q4_0TensorData.BLOCK_SIZE) + assertEquals(18, Q4_0TensorData.BYTES_PER_BLOCK) + } + + @Test + fun `reads scale from block`() { + // scale = 1.0 (f16 0x3C00 little-endian) + val data = block(0x00, 0x3C, IntArray(32) { 8 }) + val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), data) + assertEquals(1.0f, tensor.getBlockScale(0), 0.001f) + } + + @Test + fun `split layout decodes low nibbles to first half and high nibbles to second half`() { + // elements 0..15 hold codes 0..15 (low nibbles); elements 16..31 hold codes 15 down to 0 (high nibbles) + val codes = IntArray(32) { i -> if (i < 16) i else 15 - (i - 16) } + val data = block(0x00, 0x3C, codes) // scale 1.0 + val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), data) + for (i in 0 until 32) { + assertEquals(codes[i].toByte(), tensor.getCode(0, 
i), "code mismatch at $i") + } + } + + @Test + fun `toFloatArray applies minus-eight bias and scale`() { + // scale = 0.5 (f16 0x3800). codes: elem0=10 → (10-8)*0.5=1.0 ; elem16=6 → (6-8)*0.5=-1.0 + val codes = IntArray(32) { 8 } + codes[0] = 10 // low nibble of byte 0 → element 0 + codes[16] = 6 // high nibble of byte 0 → element 16 + val data = block(0x00, 0x38, codes) + val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), data) + val floats = tensor.toFloatArray() + assertEquals(1.0f, floats[0], 0.01f) + assertEquals(-1.0f, floats[16], 0.01f) + assertEquals(0.0f, floats[1], 0.01f) // code 8 → (8-8)*scale = 0 + } + + @Test + fun `matches canonical ggml dequant for a known block`() { + // Mirror DequantOps.dequantQ4_0FromBytes: out[j]=(lo-8)*d, out[j+16]=(hi-8)*d. + val codes = IntArray(32) { i -> (i * 7 + 3) and 0x0F } // arbitrary 0..15 pattern + val data = block(0x00, 0x3C, codes) // scale 1.0 + val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), data) + val floats = tensor.toFloatArray() + for (i in 0 until 32) { + assertEquals((codes[i] - 8).toFloat(), floats[i], 0.001f, "dequant mismatch at $i") + } + } + + @Test + fun `set round-trips through nibble packing`() { + val data = block(0x00, 0x3C, IntArray(32) { 8 }) + val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(32), data) + tensor[3] = 5 // low nibble of byte 3 + tensor[19] = 12 // high nibble of byte 3 (19-16=3) + assertEquals(5.toByte(), tensor[3]) + assertEquals(12.toByte(), tensor[19]) + } + + @Test + fun `handles multiple blocks and 2D shape`() { + val b0 = block(0x00, 0x3C, IntArray(32) { 8 }) // scale 1.0 + val b1 = block(0x00, 0x40, IntArray(32) { 9 }) // scale 2.0, code 9 + val tensor = Q4_0BlockTensorData.fromRawBytes(Shape(8, 8), b0 + b1) + assertEquals(2, tensor.blockCount) + assertContentEquals(intArrayOf(8, 8), tensor.shape.dimensions) + assertEquals(1.0f, tensor.getBlockScale(0), 0.001f) + assertEquals(2.0f, tensor.getBlockScale(1), 0.001f) + assertEquals(9.toByte(), 
tensor.getCode(1, 0)) + } +}