diff --git a/docs/kernel-support-matrix.md b/docs/kernel-support-matrix.md index c1094a63..b39abb14 100644 --- a/docs/kernel-support-matrix.md +++ b/docs/kernel-support-matrix.md @@ -11,7 +11,7 @@ | `Q8_0` | native-ffm | panama-vector | scalar | scalar | scalar | | `Q4_0` | native-ffm | panama-vector | scalar | scalar | scalar | | `Q4_K` | native-ffm | panama-vector | scalar | scalar | scalar | -| `Q6_K` | scalar | scalar | scalar | scalar | scalar | +| `Q6_K` | panama-vector | panama-vector | scalar | scalar | scalar | | `Q5_1` | panama-vector | panama-vector | scalar | scalar | scalar | | `Q5_0` | panama-vector | panama-vector | scalar | scalar | scalar | diff --git a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api index add035f2..7f4996ba 100644 --- a/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api +++ b/skainet-backends/skainet-backend-cpu/api/jvm/skainet-backend-cpu.api @@ -102,6 +102,11 @@ public final class sk/ainet/exec/kernel/PanamaVectorQ5_1MatmulKernel : sk/ainet/ public fun matmul ([FI[BIII[FI)V } +public final class sk/ainet/exec/kernel/PanamaVectorQ6_KMatmulKernel : sk/ainet/backend/api/kernel/Q6KMatmulKernel { + public static final field INSTANCE Lsk/ainet/exec/kernel/PanamaVectorQ6_KMatmulKernel; + public fun matmul ([FI[BIII[FI)V +} + public final class sk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel : sk/ainet/backend/api/kernel/Q8_0MatmulKernel { public static final field INSTANCE Lsk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel; public fun matmul ([FI[BIII[FI)V diff --git a/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt b/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt index 4bb497ae..0c460b04 100644 --- a/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt +++ b/skainet-backends/skainet-backend-cpu/src/commonTest/kotlin/sk/ainet/exec/tensor/ops/PackedMatmulDispatchTest.kt @@ -8,6 +8,7 @@ import sk.ainet.context.DirectCpuExecutionContext import sk.ainet.lang.tensor.Shape import sk.ainet.lang.tensor.data.Q4_KBlockTensorData import sk.ainet.lang.tensor.data.Q5_1BlockTensorData +import sk.ainet.lang.tensor.data.Q6_KBlockTensorData import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.types.FP32 @@ -74,13 +75,46 @@ class PackedMatmulDispatchTest { return bytes to wf } + /** Random block-major Q6_K bytes for [out,in] + the FP32 weight. */ + private fun q6_k(inDim: Int, outDim: Int, rng: Random): Pair { + val blocks = inDim / 256; val bytes = ByteArray(outDim * blocks * 210); val wf = FloatArray(outDim * inDim) + for (o in 0 until outDim) for (bI in 0 until blocks) { + val off = (bI * outDim + o) * 210; val dst = o * inDim + bI * 256 + for (k in 0 until 208) bytes[off + k] = rng.nextInt(256).toByte() + val d = rng.nextFloat() * 0.01f + 0.002f; le16(bytes, off + 208, half(d)) + for (h in 0..1) { + val qlB = off + h * 64; val qhB = off + 128 + h * 32; val scB = off + 192 + h * 8; val ob = h * 128 + for (isIdx in 0..1) { + val sc1 = d * bytes[scB + isIdx].toInt(); val sc2 = d * bytes[scB + isIdx + 2].toInt() + val sc3 = d * bytes[scB + isIdx + 4].toInt(); val sc4 = d * bytes[scB + isIdx + 6].toInt() + for (l in isIdx * 16 until isIdx * 16 + 16) { + val ql0 = bytes[qlB + l].toInt() and 0xFF; val ql32 = bytes[qlB + l + 32].toInt() and 0xFF + val qhL = bytes[qhB + l].toInt() and 0xFF + wf[dst + ob + l + 0] = sc1 * (((ql0 and 0xF) or ((qhL and 3) shl 4)) - 32) + wf[dst + ob + l + 32] = sc2 * (((ql32 and 0xF) or (((qhL ushr 2) and 3) shl 4)) - 32) + wf[dst + ob + l + 64] = sc3 * (((ql0 ushr 4) or (((qhL ushr 4) and 3) shl 4)) - 32) + wf[dst + ob + l + 96] = sc4 * (((ql32 ushr 4) or (((qhL ushr 6) and 3) shl 4)) - 32) + } + } + } + } + return bytes to wf + } + private fun run(fmt: String, inDim: Int, outDim: Int, seed: Int) { val rng = Random(seed) - val (bytes, wf) = if (fmt == "Q5_1") q5_1(inDim, outDim, rng) else q4_k(inDim, outDim, rng) + val (bytes, wf) = when (fmt) { + "Q5_1" -> q5_1(inDim, outDim, rng) + "Q6_K" -> q6_k(inDim, outDim, rng) + else -> q4_k(inDim, outDim, rng) + } @Suppress("UNCHECKED_CAST") val w = ctx.fromData( - (if (fmt == "Q5_1") Q5_1BlockTensorData(Shape(outDim, inDim), bytes) - else Q4_KBlockTensorData(Shape(outDim, inDim), bytes)) as TensorData, + (when (fmt) { + "Q5_1" -> Q5_1BlockTensorData(Shape(outDim, inDim), bytes) + "Q6_K" -> Q6_KBlockTensorData(Shape(outDim, inDim), bytes) + else -> Q4_KBlockTensorData(Shape(outDim, inDim), bytes) + }) as TensorData, FP32::class, ) val xf = FloatArray(inDim) { rng.nextFloat() - 0.5f } @@ -94,4 +128,5 @@ class PackedMatmulDispatchTest { @Test fun q5_1_through_ops_matmul_transpose() = run("Q5_1", inDim = 128, outDim = 16, seed = 7) @Test fun q4_k_through_ops_matmul_transpose() = run("Q4_K", inDim = 256, outDim = 12, seed = 8) + @Test fun q6_k_through_ops_matmul_transpose() = run("Q6_K", inDim = 512, outDim = 8, seed = 9) } diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt index 99ec4eb0..8aee59e2 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorKernelProvider.kt @@ -7,6 +7,7 @@ import sk.ainet.backend.api.kernel.Q4KMatmulKernel import sk.ainet.backend.api.kernel.Q4_0MatmulKernel import sk.ainet.backend.api.kernel.Q5_0MatmulKernel import sk.ainet.backend.api.kernel.Q5_1MatmulKernel +import sk.ainet.backend.api.kernel.Q6KMatmulKernel import sk.ainet.backend.api.kernel.Q8_0MatmulKernel import sk.ainet.exec.tensor.ops.JvmCpuBackendConfig @@ -61,6 +62,9 @@ public object PanamaVectorKernelProvider : KernelProvider { override fun matmulQ5_0(): Q5_0MatmulKernel? = if (isAvailable()) PanamaVectorQ5_0MatmulKernel else null + override fun matmulQ6K(): Q6KMatmulKernel? = + if (isAvailable()) PanamaVectorQ6_KMatmulKernel else null + private fun isVectorApiClassLoaded(): Boolean = runCatching { Class.forName("jdk.incubator.vector.FloatVector") Class.forName("jdk.incubator.vector.VectorSpecies") diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ6_KMatmulKernel.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ6_KMatmulKernel.kt new file mode 100644 index 00000000..09fe8b86 --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/kernel/PanamaVectorQ6_KMatmulKernel.kt @@ -0,0 +1,57 @@ +package sk.ainet.exec.kernel + +import jdk.incubator.vector.FloatVector +import jdk.incubator.vector.VectorOperators +import jdk.incubator.vector.VectorSpecies +import sk.ainet.backend.api.kernel.Q6KMatmulKernel +import sk.ainet.exec.tensor.ops.JvmQuantizedVectorKernels + +/** + * SIMD-vectorized FP32 × Q6_K matmul on the JDK Vector API. Reuses the existing SIMD + * Q6_K block dequant ([JvmQuantizedVectorKernels.dequantQ6_KBlock]) into a 256-element + * scratch buffer, then a Vector-API FMA dot against the matching input window. + * Numerically equivalent to [ScalarQ6_KMatmulKernel]. Block-major layout + * `(blockIdx*outputDim+o)*210`. + */ +public object PanamaVectorQ6_KMatmulKernel : Q6KMatmulKernel { + + private const val BLOCK_SIZE = 256 + private const val BYTES_PER_BLOCK = 210 + private val floatSpecies: VectorSpecies = FloatVector.SPECIES_PREFERRED + + override fun matmul( + input: FloatArray, inputOffset: Int, + weight: ByteArray, weightByteOffset: Int, + inputDim: Int, outputDim: Int, + output: FloatArray, outputOffset: Int, + ) { + require(inputDim % BLOCK_SIZE == 0) { + "PanamaVectorQ6_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim" + } + if (outputDim == 0) return + if (inputDim == 0) { for (o in 0 until outputDim) output[outputOffset + o] = 0f; return } + val blocksPerInputDim = inputDim / BLOCK_SIZE + val step = floatSpecies.length() + val loopBound = floatSpecies.loopBound(BLOCK_SIZE) + val scratch = FloatArray(BLOCK_SIZE) + + for (o in 0 until outputDim) { + var acc = 0f + for (blockIdx in 0 until blocksPerInputDim) { + val base = weightByteOffset + (blockIdx * outputDim + o) * BYTES_PER_BLOCK + JvmQuantizedVectorKernels.dequantQ6_KBlock(weight, base, scratch, 0) + val inputBase = inputOffset + blockIdx * BLOCK_SIZE + var accVec = FloatVector.zero(floatSpecies) + var k = 0 + while (k < loopBound) { + accVec = FloatVector.fromArray(floatSpecies, input, inputBase + k) + .fma(FloatVector.fromArray(floatSpecies, scratch, k), accVec) + k += step + } + acc += accVec.reduceLanes(VectorOperators.ADD) + while (k < BLOCK_SIZE) { acc += input[inputBase + k] * scratch[k]; k++ } + } + output[outputOffset + o] = acc + } + } +} diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt index f48f2ac0..0aa070cc 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/DefaultCpuOpsJvm.kt @@ -29,8 +29,6 @@ import sk.ainet.lang.tensor.data.Q8MemorySegmentMarker import sk.ainet.lang.tensor.data.Q8MemorySegmentTensorData import sk.ainet.lang.tensor.data.Q4_KBlockTensorData import sk.ainet.lang.tensor.data.Q4_KTensorData -import sk.ainet.lang.tensor.data.Q6_KBlockTensorData -import sk.ainet.lang.tensor.data.Q6_KTensorData import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.types.DType import sk.ainet.lang.types.FP16 @@ -224,20 +222,8 @@ internal class DefaultCpuOpsJvm( @Suppress("UNCHECKED_CAST") return newTensor(transposed as TensorData, tensor.dtype, tensor) } - // Q6_K packed bytes mirror the Q4_K lazy-transpose pattern: the - // `matmulQ6_KVec` kernel reads the packed bytes in - // input-block-major order, so the shape swap is purely a metadata - // change. Unlocks running Gemma 4 E2B Q4_K_M (which uses Q6_K for - // FFN + embedding + lm_head) without the 12 GB FP32 dequant - // bloat the converter used to produce. - if (data is Q6_KTensorData) { - val packedData = data.packedData - val transposed = Q6_KBlockTensorData(Shape(cols, rows), packedData) - @Suppress("UNCHECKED_CAST") - return newTensor(transposed as TensorData, tensor.dtype, tensor) - } - // Q5_1 / Q5_0 lazy transpose is handled in DefaultCpuOpsBase (block-major, - // shared with Native); the JVM ops don't intercept Q5 here. + // Q6_K / Q5_1 / Q5_0 lazy transpose is handled in DefaultCpuOpsBase + // (block-major, shared with Native); the JVM ops don't intercept them here. // MemorySegment FP32 fast path: physical transpose via SIMD. // Uses Arena.ofAuto() so the result segment is reclaimed by GC // when the wrapping Tensor is no longer reachable. Earlier @@ -617,24 +603,8 @@ internal class DefaultCpuOpsJvm( @Suppress("UNCHECKED_CAST") CpuTensor(outData as TensorData, this, a.dtype) } - is Q6_KTensorData -> { - val outBuffer = FloatArray(batchSize * outputDim) - for (batch in 0 until batchSize) { - val batchInput = if (batchSize == 1) inputBuffer - else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim) - JvmQuantizedVectorKernels.matmulQ6_KVec( - batchInput, - bData.packedData, - inputDim, - outputDim, - outBuffer, - batch * outputDim, - ) - } - val outData = DenseFloatArrayTensorData(Shape(batchSize, outputDim), outBuffer) - @Suppress("UNCHECKED_CAST") - CpuTensor(outData as TensorData, this, a.dtype) - } + // Q6_K / Q5_1 / Q5_0 dispatch is handled in DefaultCpuOpsBase via the kernel + // registry (block-major, shared with Native); not intercepted here. // MemorySegment-backed quantized weights (Q4/Q8) — dispatch to MemorySegment kernels is MemorySegmentBackedData -> { chooseQuantizedMatmulMemSeg(inputBuffer, bData, batchSize, inputDim, outputDim, a) diff --git a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt index 89a8fe9b..e63864f6 100644 --- a/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt +++ b/skainet-backends/skainet-backend-cpu/src/jvmMain/kotlin/sk/ainet/exec/tensor/ops/JvmQuantizedVectorKernels.kt @@ -372,7 +372,7 @@ internal object JvmQuantizedVectorKernels { * stores per chunk. Scalar tail fires only when `floatStep` doesn't * divide 16 (rare). */ - private fun dequantQ6_KBlock( + internal fun dequantQ6_KBlock( packedWeights: ByteArray, blockByteOffset: Int, scratch: FloatArray, diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ6KParityTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ6KParityTest.kt new file mode 100644 index 00000000..c85a91c5 --- /dev/null +++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/PanamaVectorQ6KParityTest.kt @@ -0,0 +1,45 @@ +package sk.ainet.exec.kernel + +import kotlin.math.abs +import kotlin.random.Random +import kotlin.test.Test +import kotlin.test.assertTrue + +/** Panama SIMD Q6_K kernel must match the scalar reference within FMA tolerance. */ +class PanamaVectorQ6KParityTest { + + private fun half(v: Float): Int { + val b = v.toRawBits(); val s = (b ushr 16) and 0x8000 + val e = ((b ushr 23) and 0xFF) - 127 + 15; val m = b and 0x7FFFFF + if (e <= 0) return s; if (e >= 31) return s or 0x7C00 + return s or (e shl 10) or (m ushr 13) + } + + /** Block-major Q6_K bytes (210 B/block) with a valid finite f16 scale; random ql/qh/scales. */ + private fun bytes(inDim: Int, outDim: Int, rng: Random): ByteArray { + val out = ByteArray(outDim * (inDim / 256) * 210) + var off = 0 + while (off < out.size) { + for (k in 0 until 208) out[off + k] = rng.nextInt(256).toByte() // ql + qh + scales + val d = half(rng.nextFloat() * 0.01f + 0.002f) + out[off + 208] = (d and 0xFF).toByte(); out[off + 209] = ((d ushr 8) and 0xFF).toByte() + off += 210 + } + return out + } + + private fun check(inDim: Int, outDim: Int, seed: Int) { + val rng = Random(seed) + val w = bytes(inDim, outDim, rng) + val input = FloatArray(inDim) { rng.nextFloat() - 0.5f } + val a = FloatArray(outDim); val b = FloatArray(outDim) + ScalarQ6_KMatmulKernel.matmul(input, 0, w, 0, inDim, outDim, a, 0) + PanamaVectorQ6_KMatmulKernel.matmul(input, 0, w, 0, inDim, outDim, b, 0) + var maxErr = 0f; var maxAbs = 1f + for (o in 0 until outDim) { maxErr = maxOf(maxErr, abs(a[o] - b[o])); maxAbs = maxOf(maxAbs, abs(a[o])) } + assertTrue(maxErr < 1e-3f * maxAbs + 1e-3f, "Q6_K Panama≠Scalar: maxErr=$maxErr (maxAbs=$maxAbs)") + } + + @Test fun q6_k_panama_matches_scalar_single() = check(256, 32, 1) + @Test fun q6_k_panama_matches_scalar_multi() = check(512, 16, 2) +} diff --git a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt index c42ff611..66be594e 100644 --- a/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt +++ b/skainet-backends/skainet-backend-native-cpu/src/jvmTest/kotlin/sk/ainet/exec/kernel/KernelSupportMatrixTest.kt @@ -32,7 +32,7 @@ class KernelSupportMatrixTest { private fun tiers(): List = listOf( Tier("scalar", 0, allTargets, scalarFormats()), Tier("panama-vector", 50, setOf("jvm", "android"), - setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q5_1", "Q5_0")), + setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0")), Tier("native-ffm", 100, setOf("jvm"), setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K")), )