Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/kernel-support-matrix.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
| `Q8_0` | native-ffm | panama-vector | scalar | scalar | scalar |
| `Q4_0` | native-ffm | panama-vector | scalar | scalar | scalar |
| `Q4_K` | native-ffm | panama-vector | scalar | scalar | scalar |
| `Q6_K` | scalar | scalar | scalar | scalar | scalar |
| `Q6_K` | panama-vector | panama-vector | scalar | scalar | scalar |
| `Q5_1` | panama-vector | panama-vector | scalar | scalar | scalar |
| `Q5_0` | panama-vector | panama-vector | scalar | scalar | scalar |

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ public final class sk/ainet/exec/kernel/PanamaVectorQ5_1MatmulKernel : sk/ainet/
public fun matmul ([FI[BIII[FI)V
}

public final class sk/ainet/exec/kernel/PanamaVectorQ6_KMatmulKernel : sk/ainet/backend/api/kernel/Q6KMatmulKernel {
public static final field INSTANCE Lsk/ainet/exec/kernel/PanamaVectorQ6_KMatmulKernel;
public fun matmul ([FI[BIII[FI)V
}

public final class sk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel : sk/ainet/backend/api/kernel/Q8_0MatmulKernel {
public static final field INSTANCE Lsk/ainet/exec/kernel/PanamaVectorQ8_0MatmulKernel;
public fun matmul ([FI[BIII[FI)V
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import sk.ainet.context.DirectCpuExecutionContext
import sk.ainet.lang.tensor.Shape
import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
import sk.ainet.lang.tensor.data.Q5_1BlockTensorData
import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
import sk.ainet.lang.tensor.data.TensorData
import sk.ainet.lang.types.FP32

Expand Down Expand Up @@ -74,13 +75,46 @@ class PackedMatmulDispatchTest {
return bytes to wf
}

/**
 * Generates a random Q6_K weight for a logical `[outDim, inDim]` matrix.
 *
 * The first element of the returned pair is the packed data: 210 bytes per 256-value
 * block, with the block for `(block, row)` stored at `(block * outDim + row) * 210`.
 * The second element is a row-major FP32 array decoded from those same bytes, so a
 * packed matmul can be compared against a dense one. Only whole 256-value blocks are
 * generated; any `inDim` remainder stays zero in the FP32 array.
 */
private fun q6_k(inDim: Int, outDim: Int, rng: Random): Pair<ByteArray, FloatArray> {
    val blockCount = inDim / 256
    val packed = ByteArray(outDim * blockCount * 210)
    val dense = FloatArray(outDim * inDim)
    for (row in 0 until outDim) {
        for (block in 0 until blockCount) {
            val blockStart = (block * outDim + row) * 210
            val denseStart = row * inDim + block * 256
            // Bytes 0..207 (ql at +0, qh at +128, sub-block scales at +192) are random.
            for (i in 0 until 208) {
                packed[blockStart + i] = rng.nextInt(256).toByte()
            }
            // Bytes 208..209 receive the block scale through the class's half/le16 helpers.
            val blockScale = rng.nextFloat() * 0.01f + 0.002f
            le16(packed, blockStart + 208, half(blockScale))
            // NOTE(review): the FP32 reference below uses the unrounded float scale while the
            // packed block stores half(blockScale); presumably the caller's tolerance absorbs
            // the difference — confirm.
            for (halfIdx in 0..1) {
                val qlStart = blockStart + halfIdx * 64
                val qhStart = blockStart + 128 + halfIdx * 32
                val scaleStart = blockStart + 192 + halfIdx * 8
                val halfStart = denseStart + halfIdx * 128
                for (group in 0..1) {
                    // Sub-block scales are read as signed bytes.
                    val scaleA = blockScale * packed[scaleStart + group].toInt()
                    val scaleB = blockScale * packed[scaleStart + group + 2].toInt()
                    val scaleC = blockScale * packed[scaleStart + group + 4].toInt()
                    val scaleD = blockScale * packed[scaleStart + group + 6].toInt()
                    val firstLane = group * 16
                    for (lane in firstLane until firstLane + 16) {
                        val lowA = packed[qlStart + lane].toInt() and 0xFF
                        val lowB = packed[qlStart + lane + 32].toInt() and 0xFF
                        val high = packed[qhStart + lane].toInt() and 0xFF
                        // Each 6-bit value = 4 low bits from ql + 2 high bits from qh, biased by 32.
                        val q0 = ((lowA and 0xF) or ((high and 3) shl 4)) - 32
                        val q1 = ((lowB and 0xF) or (((high ushr 2) and 3) shl 4)) - 32
                        val q2 = ((lowA ushr 4) or (((high ushr 4) and 3) shl 4)) - 32
                        val q3 = ((lowB ushr 4) or (((high ushr 6) and 3) shl 4)) - 32
                        dense[halfStart + lane] = scaleA * q0
                        dense[halfStart + lane + 32] = scaleB * q1
                        dense[halfStart + lane + 64] = scaleC * q2
                        dense[halfStart + lane + 96] = scaleD * q3
                    }
                }
            }
        }
    }
    return packed to dense
}

private fun run(fmt: String, inDim: Int, outDim: Int, seed: Int) {
val rng = Random(seed)
val (bytes, wf) = if (fmt == "Q5_1") q5_1(inDim, outDim, rng) else q4_k(inDim, outDim, rng)
val (bytes, wf) = when (fmt) {
"Q5_1" -> q5_1(inDim, outDim, rng)
"Q6_K" -> q6_k(inDim, outDim, rng)
else -> q4_k(inDim, outDim, rng)
}
@Suppress("UNCHECKED_CAST")
val w = ctx.fromData(
(if (fmt == "Q5_1") Q5_1BlockTensorData(Shape(outDim, inDim), bytes)
else Q4_KBlockTensorData(Shape(outDim, inDim), bytes)) as TensorData<FP32, Float>,
(when (fmt) {
"Q5_1" -> Q5_1BlockTensorData(Shape(outDim, inDim), bytes)
"Q6_K" -> Q6_KBlockTensorData(Shape(outDim, inDim), bytes)
else -> Q4_KBlockTensorData(Shape(outDim, inDim), bytes)
}) as TensorData<FP32, Float>,
FP32::class,
)
val xf = FloatArray(inDim) { rng.nextFloat() - 0.5f }
Expand All @@ -94,4 +128,5 @@ class PackedMatmulDispatchTest {

// One case per packed format: each call hands `run` the format tag, the matrix
// dimensions and a fixed RNG seed so the generated data is reproducible.
@Test fun q5_1_through_ops_matmul_transpose() = run("Q5_1", inDim = 128, outDim = 16, seed = 7)
@Test fun q4_k_through_ops_matmul_transpose() = run("Q4_K", inDim = 256, outDim = 12, seed = 8)
@Test fun q6_k_through_ops_matmul_transpose() = run("Q6_K", inDim = 512, outDim = 8, seed = 9)
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import sk.ainet.backend.api.kernel.Q4KMatmulKernel
import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
import sk.ainet.backend.api.kernel.Q6KMatmulKernel
import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
import sk.ainet.exec.tensor.ops.JvmCpuBackendConfig

Expand Down Expand Up @@ -61,6 +62,9 @@ public object PanamaVectorKernelProvider : KernelProvider {
/** Returns the Q5_0 matmul kernel of this provider, or `null` when [isAvailable] is false. */
override fun matmulQ5_0(): Q5_0MatmulKernel? = when {
    // The kernel object is only referenced once availability has been confirmed.
    isAvailable() -> PanamaVectorQ5_0MatmulKernel
    else -> null
}

/** Returns the Q6_K matmul kernel of this provider, or `null` when [isAvailable] is false. */
override fun matmulQ6K(): Q6KMatmulKernel? = when {
    // The kernel object is only referenced once availability has been confirmed.
    isAvailable() -> PanamaVectorQ6_KMatmulKernel
    else -> null
}

private fun isVectorApiClassLoaded(): Boolean = runCatching {
Class.forName("jdk.incubator.vector.FloatVector")
Class.forName("jdk.incubator.vector.VectorSpecies")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package sk.ainet.exec.kernel

import jdk.incubator.vector.FloatVector
import jdk.incubator.vector.VectorOperators
import jdk.incubator.vector.VectorSpecies
import sk.ainet.backend.api.kernel.Q6KMatmulKernel
import sk.ainet.exec.tensor.ops.JvmQuantizedVectorKernels

/**
 * FP32 × Q6_K matrix-vector product built on the JDK Vector API.
 *
 * For every output row and every 256-value block, the block is decoded into a reusable
 * scratch buffer by [JvmQuantizedVectorKernels.dequantQ6_KBlock] and then combined with
 * the matching slice of the input through a vectorised fused multiply-add dot product.
 * Intended to agree with [ScalarQ6_KMatmulKernel] up to floating-point summation order.
 *
 * Weights are read block-major: the block for `(blockIdx, o)` starts at byte
 * `weightByteOffset + (blockIdx * outputDim + o) * 210`.
 */
public object PanamaVectorQ6_KMatmulKernel : Q6KMatmulKernel {

    /** Number of weights carried by one Q6_K block. */
    private const val BLOCK_SIZE = 256

    /** Size in bytes of one packed Q6_K block. */
    private const val BYTES_PER_BLOCK = 210

    private val floatSpecies: VectorSpecies<Float> = FloatVector.SPECIES_PREFERRED

    /**
     * Writes `outputDim` dot products into `output`, starting at `outputOffset`.
     *
     * Row `o` is the dot product of `input[inputOffset until inputOffset + inputDim]`
     * with the dequantized weights of that row. When `inputDim` is zero every output
     * slot is set to `0f`; when `outputDim` is zero nothing is written.
     *
     * @throws IllegalArgumentException if `inputDim` is not a multiple of 256.
     */
    override fun matmul(
        input: FloatArray, inputOffset: Int,
        weight: ByteArray, weightByteOffset: Int,
        inputDim: Int, outputDim: Int,
        output: FloatArray, outputOffset: Int,
    ) {
        require(inputDim % BLOCK_SIZE == 0) {
            "PanamaVectorQ6_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
        }
        if (outputDim == 0) return
        if (inputDim == 0) {
            repeat(outputDim) { row -> output[outputOffset + row] = 0f }
            return
        }

        val lanes = floatSpecies.length()
        val vectorEnd = floatSpecies.loopBound(BLOCK_SIZE)
        val blockCount = inputDim / BLOCK_SIZE
        // One scratch buffer per call, overwritten for every (row, block) pair.
        val decoded = FloatArray(BLOCK_SIZE)

        for (row in 0 until outputDim) {
            var rowSum = 0f
            for (block in 0 until blockCount) {
                val weightBase = weightByteOffset + (block * outputDim + row) * BYTES_PER_BLOCK
                JvmQuantizedVectorKernels.dequantQ6_KBlock(weight, weightBase, decoded, 0)

                val window = inputOffset + block * BLOCK_SIZE
                var partial = FloatVector.zero(floatSpecies)
                var pos = 0
                while (pos < vectorEnd) {
                    val x = FloatVector.fromArray(floatSpecies, input, window + pos)
                    val w = FloatVector.fromArray(floatSpecies, decoded, pos)
                    partial = x.fma(w, partial)
                    pos += lanes
                }
                // The lane sum is folded into the row total once per block.
                rowSum += partial.reduceLanes(VectorOperators.ADD)
                // Scalar tail for any elements the vector loop did not cover.
                while (pos < BLOCK_SIZE) {
                    rowSum += input[window + pos] * decoded[pos]
                    pos++
                }
            }
            output[outputOffset + row] = rowSum
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ import sk.ainet.lang.tensor.data.Q8MemorySegmentMarker
import sk.ainet.lang.tensor.data.Q8MemorySegmentTensorData
import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
import sk.ainet.lang.tensor.data.Q4_KTensorData
import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
import sk.ainet.lang.tensor.data.Q6_KTensorData
import sk.ainet.lang.tensor.data.TensorData
import sk.ainet.lang.types.DType
import sk.ainet.lang.types.FP16
Expand Down Expand Up @@ -224,20 +222,8 @@ internal class DefaultCpuOpsJvm(
@Suppress("UNCHECKED_CAST")
return newTensor(transposed as TensorData<T, V>, tensor.dtype, tensor)
}
// Q6_K packed bytes mirror the Q4_K lazy-transpose pattern: the
// `matmulQ6_KVec` kernel reads the packed bytes in
// input-block-major order, so the shape swap is purely a metadata
// change. Unlocks running Gemma 4 E2B Q4_K_M (which uses Q6_K for
// FFN + embedding + lm_head) without the 12 GB FP32 dequant
// bloat the converter used to produce.
if (data is Q6_KTensorData) {
val packedData = data.packedData
val transposed = Q6_KBlockTensorData(Shape(cols, rows), packedData)
@Suppress("UNCHECKED_CAST")
return newTensor(transposed as TensorData<T, V>, tensor.dtype, tensor)
}
// Q5_1 / Q5_0 lazy transpose is handled in DefaultCpuOpsBase (block-major,
// shared with Native); the JVM ops don't intercept Q5 here.
// Q6_K / Q5_1 / Q5_0 lazy transpose is handled in DefaultCpuOpsBase
// (block-major, shared with Native); the JVM ops don't intercept them here.
// MemorySegment FP32 fast path: physical transpose via SIMD.
// Uses Arena.ofAuto() so the result segment is reclaimed by GC
// when the wrapping Tensor is no longer reachable. Earlier
Expand Down Expand Up @@ -617,24 +603,8 @@ internal class DefaultCpuOpsJvm(
@Suppress("UNCHECKED_CAST")
CpuTensor(outData as TensorData<T, V>, this, a.dtype)
}
is Q6_KTensorData -> {
val outBuffer = FloatArray(batchSize * outputDim)
for (batch in 0 until batchSize) {
val batchInput = if (batchSize == 1) inputBuffer
else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim)
JvmQuantizedVectorKernels.matmulQ6_KVec(
batchInput,
bData.packedData,
inputDim,
outputDim,
outBuffer,
batch * outputDim,
)
}
val outData = DenseFloatArrayTensorData<T>(Shape(batchSize, outputDim), outBuffer)
@Suppress("UNCHECKED_CAST")
CpuTensor(outData as TensorData<T, V>, this, a.dtype)
}
// Q6_K / Q5_1 / Q5_0 dispatch is handled in DefaultCpuOpsBase via the kernel
// registry (block-major, shared with Native); not intercepted here.
// MemorySegment-backed quantized weights (Q4/Q8) — dispatch to MemorySegment kernels
is MemorySegmentBackedData -> {
chooseQuantizedMatmulMemSeg(inputBuffer, bData, batchSize, inputDim, outputDim, a)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ internal object JvmQuantizedVectorKernels {
* stores per chunk. Scalar tail fires only when `floatStep` doesn't
* divide 16 (rare).
*/
private fun dequantQ6_KBlock(
internal fun dequantQ6_KBlock(
packedWeights: ByteArray,
blockByteOffset: Int,
scratch: FloatArray,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package sk.ainet.exec.kernel

import kotlin.math.abs
import kotlin.random.Random
import kotlin.test.Test
import kotlin.test.assertTrue

/**
 * Parity check: the Panama SIMD Q6_K kernel has to produce the same outputs as the
 * scalar reference kernel, up to a small floating-point tolerance.
 */
class PanamaVectorQ6KParityTest {

    /**
     * Converts [v] to IEEE half-precision bits by truncating the mantissa.
     * Exponents below the normal half range collapse to signed zero, those above it to
     * signed infinity.
     */
    private fun half(v: Float): Int {
        val bits = v.toRawBits()
        val sign = (bits ushr 16) and 0x8000
        val exponent = ((bits ushr 23) and 0xFF) - 127 + 15
        val mantissa = bits and 0x7FFFFF
        return when {
            exponent <= 0 -> sign
            exponent >= 31 -> sign or 0x7C00
            else -> sign or (exponent shl 10) or (mantissa ushr 13)
        }
    }

    /**
     * Builds block-major Q6_K data (210 bytes per block): 208 random bytes for
     * ql + qh + scales, followed by a finite f16 block scale in little-endian order.
     */
    private fun bytes(inDim: Int, outDim: Int, rng: Random): ByteArray {
        val packed = ByteArray(outDim * (inDim / 256) * 210)
        for (blockStart in packed.indices step 210) {
            for (i in 0 until 208) {
                packed[blockStart + i] = rng.nextInt(256).toByte()
            }
            val scaleBits = half(rng.nextFloat() * 0.01f + 0.002f)
            packed[blockStart + 208] = (scaleBits and 0xFF).toByte()
            packed[blockStart + 209] = ((scaleBits ushr 8) and 0xFF).toByte()
        }
        return packed
    }

    /**
     * Runs both kernels on identical random weights and input, then asserts that the
     * largest absolute difference stays below `1e-3 * maxAbs + 1e-3`, where `maxAbs` is
     * the largest scalar-output magnitude (floored at 1).
     */
    private fun check(inDim: Int, outDim: Int, seed: Int) {
        val rng = Random(seed)
        val weights = bytes(inDim, outDim, rng)
        val input = FloatArray(inDim) { rng.nextFloat() - 0.5f }

        val scalarOut = FloatArray(outDim)
        val panamaOut = FloatArray(outDim)
        ScalarQ6_KMatmulKernel.matmul(input, 0, weights, 0, inDim, outDim, scalarOut, 0)
        PanamaVectorQ6_KMatmulKernel.matmul(input, 0, weights, 0, inDim, outDim, panamaOut, 0)

        var maxErr = 0f
        var maxAbs = 1f
        for (i in 0 until outDim) {
            maxErr = maxOf(maxErr, abs(scalarOut[i] - panamaOut[i]))
            maxAbs = maxOf(maxAbs, abs(scalarOut[i]))
        }
        val tolerance = 1e-3f * maxAbs + 1e-3f
        assertTrue(maxErr < tolerance, "Q6_K Panama≠Scalar: maxErr=$maxErr (maxAbs=$maxAbs)")
    }

    @Test fun q6_k_panama_matches_scalar_single() = check(256, 32, 1)

    @Test fun q6_k_panama_matches_scalar_multi() = check(512, 16, 2)
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class KernelSupportMatrixTest {
private fun tiers(): List<Tier> = listOf(
Tier("scalar", 0, allTargets, scalarFormats()),
Tier("panama-vector", 50, setOf("jvm", "android"),
setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q5_1", "Q5_0")),
setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K", "Q6_K", "Q5_1", "Q5_0")),
Tier("native-ffm", 100, setOf("jvm"),
setOf("Float32", "BFloat16", "Q8_0", "Q4_0", "Q4_K")),
)
Expand Down
Loading