Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@ import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
import sk.ainet.lang.tensor.data.Q4_KTensorData
import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
import sk.ainet.lang.tensor.data.Q6_KTensorData
import sk.ainet.lang.tensor.data.Q5_1BlockTensorData
import sk.ainet.lang.tensor.data.Q5_1TensorData
import sk.ainet.lang.tensor.data.Q5_0BlockTensorData
import sk.ainet.lang.tensor.data.Q5_0TensorData
import sk.ainet.lang.tensor.data.TensorData
import sk.ainet.lang.types.DType
import sk.ainet.lang.types.FP16
Expand Down Expand Up @@ -224,6 +228,21 @@ internal class DefaultCpuOpsJvm(
@Suppress("UNCHECKED_CAST")
return newTensor(transposed as TensorData<T, V>, tensor.dtype, tensor)
}
// Q5_1 / Q5_0 packed bytes use a row-major `[out, in]` layout that the
// `matmulQ5_1Vec` / `matmulQ5_0Vec` kernels index by output row, so the
// transpose is a pure shape swap — the same bytes give the right values
// under the swapped shape (lets `ops.matmul(x, ops.transpose(W))` run
// without a dequant round-trip).
if (data is Q5_1TensorData) {
val transposed = Q5_1BlockTensorData(Shape(cols, rows), data.packedData)
@Suppress("UNCHECKED_CAST")
return newTensor(transposed as TensorData<T, V>, tensor.dtype, tensor)
}
if (data is Q5_0TensorData) {
val transposed = Q5_0BlockTensorData(Shape(cols, rows), data.packedData)
@Suppress("UNCHECKED_CAST")
return newTensor(transposed as TensorData<T, V>, tensor.dtype, tensor)
}
// MemorySegment FP32 fast path: physical transpose via SIMD.
// Uses Arena.ofAuto() so the result segment is reclaimed by GC
// when the wrapping Tensor is no longer reachable. Earlier
Expand Down Expand Up @@ -558,6 +577,32 @@ internal class DefaultCpuOpsJvm(
@Suppress("UNCHECKED_CAST")
CpuTensor(outData as TensorData<T, V>, this, a.dtype)
}
is Q5_1TensorData -> {
val outBuffer = FloatArray(batchSize * outputDim)
for (batch in 0 until batchSize) {
val batchInput = if (batchSize == 1) inputBuffer
else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim)
JvmQuantizedVectorKernels.matmulQ5_1Vec(
batchInput, bData.packedData, inputDim, outputDim, outBuffer, batch * outputDim,
)
}
val outData = DenseFloatArrayTensorData<T>(Shape(batchSize, outputDim), outBuffer)
@Suppress("UNCHECKED_CAST")
CpuTensor(outData as TensorData<T, V>, this, a.dtype)
}
is Q5_0TensorData -> {
val outBuffer = FloatArray(batchSize * outputDim)
for (batch in 0 until batchSize) {
val batchInput = if (batchSize == 1) inputBuffer
else inputBuffer.copyOfRange(batch * inputDim, (batch + 1) * inputDim)
JvmQuantizedVectorKernels.matmulQ5_0Vec(
batchInput, bData.packedData, inputDim, outputDim, outBuffer, batch * outputDim,
)
}
val outData = DenseFloatArrayTensorData<T>(Shape(batchSize, outputDim), outBuffer)
@Suppress("UNCHECKED_CAST")
CpuTensor(outData as TensorData<T, V>, this, a.dtype)
}
is Q4_KTensorData -> {
val outBuffer = FloatArray(batchSize * outputDim)
val spiKernel = q4kMatmulKernel
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -909,4 +909,98 @@ internal object JvmQuantizedVectorKernels {
output[outputOffset + o] = accVec.reduceLanes(VectorOperators.ADD) + accScalar
}
}

/**
 * Q5_1 matrix-vector multiply: `output = input · Wᵀ` for a packed Q5_1 weight.
 *
 * Packed weights are in row-major `[outputDim, inputDim]` layout: output row `o` occupies
 * `ceil(inputDim / 32)` contiguous 24-byte blocks. Each block is read as:
 *  - bytes 0..1: little-endian fp16 scale `d`
 *  - bytes 2..3: little-endian fp16 offset `m`
 *  - bytes 4..7: `qh`, 32 high bits; bit `i` belongs to element `i` of the block
 *  - bytes 8..23: `qs`; the low nibble of byte `j` is element `j`, the high nibble is element `j + 16`
 *
 * Every weight is reconstructed as `w = d * (code + (highBit shl 4)) + m`, which is intended to
 * match `DequantOps.dequantQ5_1FromBytes`. The kernel is scalar: weights stay packed and are
 * dequantized on the fly, so no FP32 copy of the matrix is materialized.
 *
 * When `inputDim` is not a multiple of 32 the last block of each row is treated as partial:
 * only elements whose index is below `inputDim` contribute, and `input` is never read at or
 * beyond `inputDim`.
 *
 * @param input activation vector; at least `inputDim` elements are read starting at index 0
 * @param packedWeights packed Q5_1 bytes, `outputDim * ceil(inputDim / 32) * 24` bytes are read
 * @param inputDim number of weights per output row
 * @param outputDim number of output rows
 * @param output destination; `output[outputOffset + o]` receives the dot product for row `o`
 * @param outputOffset index in [output] of the first written element
 */
fun matmulQ5_1Vec(
    input: FloatArray,
    packedWeights: ByteArray,
    inputDim: Int,
    outputDim: Int,
    output: FloatArray,
    outputOffset: Int = 0,
) {
    val bytesPerBlock = 24
    val blocksPerInputDim = (inputDim + 31) / 32
    for (o in 0 until outputDim) {
        var acc = 0f
        val rowBase = o * blocksPerInputDim * bytesPerBlock
        for (blk in 0 until blocksPerInputDim) {
            val base = rowBase + blk * bytesPerBlock
            val d = halfToFloat(((packedWeights[base + 1].toInt() and 0xFF) shl 8) or (packedWeights[base].toInt() and 0xFF))
            val m = halfToFloat(((packedWeights[base + 3].toInt() and 0xFF) shl 8) or (packedWeights[base + 2].toInt() and 0xFF))
            // Assemble the four high-bit bytes into one little-endian word so that bit `i`
            // is the fifth bit of element `i`; avoids allocating an IntArray per block.
            val qh = (packedWeights[base + 4].toInt() and 0xFF) or
                ((packedWeights[base + 5].toInt() and 0xFF) shl 8) or
                ((packedWeights[base + 6].toInt() and 0xFF) shl 16) or
                ((packedWeights[base + 7].toInt() and 0xFF) shl 24)
            val qsBase = base + 8
            val inBase = blk * 32
            // Only the last block of a row can be partial (inputDim not a multiple of 32).
            val fullBlock = inBase + 32 <= inputDim
            for (j in 0 until 16) {
                val q = packedWeights[qsBase + j].toInt() and 0xFF
                val lo = q and 0x0F
                val hi = q ushr 4
                val bitLo = (qh ushr j) and 0x01
                val bitHi = (qh ushr (j + 16)) and 0x01
                val wLo = d * (lo + (bitLo shl 4)) + m
                val wHi = d * (hi + (bitHi shl 4)) + m
                val loIdx = inBase + j
                val hiIdx = loIdx + 16
                if (fullBlock) {
                    acc += input[loIdx] * wLo + input[hiIdx] * wHi
                } else {
                    // Partial trailing block: skip padding elements instead of reading past inputDim.
                    if (loIdx < inputDim) acc += input[loIdx] * wLo
                    if (hiIdx < inputDim) acc += input[hiIdx] * wHi
                }
            }
        }
        output[outputOffset + o] = acc
    }
}

/**
 * Q5_0 matrix-vector multiply: `output = input · Wᵀ` for a packed Q5_0 weight.
 *
 * Packed weights are in row-major `[outputDim, inputDim]` layout: output row `o` occupies
 * `ceil(inputDim / 32)` contiguous 22-byte blocks. Each block is read as:
 *  - bytes 0..1: little-endian fp16 scale `d`
 *  - bytes 2..5: `qh`, 32 high bits; bit `i` belongs to element `i` of the block
 *  - bytes 6..21: `qs`; the low nibble of byte `j` is element `j`, the high nibble is element `j + 16`
 *
 * Every weight is reconstructed as `w = d * (code + (highBit shl 4) - 16)`, which is intended
 * to match `DequantOps.dequantQ5_0FromBytes`. The kernel is scalar and dequantizes on the fly.
 *
 * When `inputDim` is not a multiple of 32 the last block of each row is treated as partial:
 * only elements whose index is below `inputDim` contribute, and `input` is never read at or
 * beyond `inputDim`.
 *
 * @param input activation vector; at least `inputDim` elements are read starting at index 0
 * @param packedWeights packed Q5_0 bytes, `outputDim * ceil(inputDim / 32) * 22` bytes are read
 * @param inputDim number of weights per output row
 * @param outputDim number of output rows
 * @param output destination; `output[outputOffset + o]` receives the dot product for row `o`
 * @param outputOffset index in [output] of the first written element
 */
fun matmulQ5_0Vec(
    input: FloatArray,
    packedWeights: ByteArray,
    inputDim: Int,
    outputDim: Int,
    output: FloatArray,
    outputOffset: Int = 0,
) {
    val bytesPerBlock = 22
    val blocksPerInputDim = (inputDim + 31) / 32
    for (o in 0 until outputDim) {
        var acc = 0f
        val rowBase = o * blocksPerInputDim * bytesPerBlock
        for (blk in 0 until blocksPerInputDim) {
            val base = rowBase + blk * bytesPerBlock
            val d = halfToFloat(((packedWeights[base + 1].toInt() and 0xFF) shl 8) or (packedWeights[base].toInt() and 0xFF))
            // Assemble the four high-bit bytes into one little-endian word so that bit `i`
            // is the fifth bit of element `i`; avoids allocating an IntArray per block.
            val qh = (packedWeights[base + 2].toInt() and 0xFF) or
                ((packedWeights[base + 3].toInt() and 0xFF) shl 8) or
                ((packedWeights[base + 4].toInt() and 0xFF) shl 16) or
                ((packedWeights[base + 5].toInt() and 0xFF) shl 24)
            val qsBase = base + 6
            val inBase = blk * 32
            // Only the last block of a row can be partial (inputDim not a multiple of 32).
            val fullBlock = inBase + 32 <= inputDim
            for (j in 0 until 16) {
                val q = packedWeights[qsBase + j].toInt() and 0xFF
                val lo = q and 0x0F
                val hi = q ushr 4
                val bitLo = (qh ushr j) and 0x01
                val bitHi = (qh ushr (j + 16)) and 0x01
                val wLo = d * (lo + (bitLo shl 4) - 16)
                val wHi = d * (hi + (bitHi shl 4) - 16)
                val loIdx = inBase + j
                val hiIdx = loIdx + 16
                if (fullBlock) {
                    acc += input[loIdx] * wLo + input[hiIdx] * wHi
                } else {
                    // Partial trailing block: skip padding elements instead of reading past inputDim.
                    if (loIdx < inputDim) acc += input[loIdx] * wLo
                    if (hiIdx < inputDim) acc += input[hiIdx] * wHi
                }
            }
        }
        output[outputOffset + o] = acc
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
package sk.ainet.exec.tensor.ops

import kotlin.random.Random
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertTrue
import sk.ainet.context.DirectCpuExecutionContext
import sk.ainet.lang.tensor.Shape
import sk.ainet.lang.tensor.Tensor
import sk.ainet.lang.tensor.data.Q5_0BlockTensorData
import sk.ainet.lang.tensor.data.Q5_1BlockTensorData
import sk.ainet.lang.tensor.data.TensorData
import sk.ainet.lang.types.FP32

/**
 * Validates the packed Q5_1 / Q5_0 matmul kernels + lazy transpose: feeding a packed
 * weight through `ops.matmul(x, ops.transpose(W))` must match feeding the FP32-dequantized
 * weight through the same path. The FP32 reference is dequantized inline by the helpers in
 * this class, so it does not depend on the production dequantization code.
 */
class Q5MatmulDispatchTest {

    /** Block encodings exercised by this test, with the packed size of one 32-element block. */
    private enum class Encoding(val bytesPerBlock: Int) {
        Q5_1(24),
        Q5_0(22),
    }

    private val ctx = DirectCpuExecutionContext()

    /**
     * Converts [v] to IEEE half-precision bits.
     *
     * The mantissa is truncated (not rounded), values below the half normal range are flushed
     * to signed zero and values above it become infinity. That is sufficient here because the
     * reference dequantization reads back the same half bits that were written.
     */
    private fun f16(v: Float): Int {
        val bits = v.toRawBits()
        val sign = (bits ushr 16) and 0x8000
        val expo = ((bits ushr 23) and 0xFF) - 127 + 15
        val mant = bits and 0x7FFFFF
        if (expo <= 0) return sign // flush tiny to signed zero
        if (expo >= 31) return sign or 0x7C00 // inf
        return sign or (expo shl 10) or (mant ushr 13)
    }

    /** Expands half-precision bits [h] to a Float; half subnormals decode as signed zero. */
    private fun halfToFloat(h: Int): Float {
        val sign = (h and 0x8000) shl 16
        val exp = (h and 0x7C00) shr 10
        val mant = h and 0x03FF
        return when (exp) {
            0 -> Float.fromBits(sign) // f16() never emits subnormals
            31 -> Float.fromBits(sign or (0xFF shl 23) or (mant shl 13))
            else -> Float.fromBits(sign or ((exp - 15 + 127) shl 23) or (mant shl 13))
        }
    }

    /** Stores half bits [h] little-endian at `out[off]`, `out[off + 1]`. */
    private fun writeHalf(out: ByteArray, off: Int, h: Int) {
        out[off] = (h and 0xFF).toByte()
        out[off + 1] = ((h ushr 8) and 0xFF).toByte()
    }

    /** Reads a little-endian half at `b[off]`, `b[off + 1]` and expands it to a Float. */
    private fun readHalf(b: ByteArray, off: Int): Float =
        halfToFloat(((b[off + 1].toInt() and 0xFF) shl 8) or (b[off].toInt() and 0xFF))

    /** Reads the four high-bit bytes starting at `b[off]` as unsigned values. */
    private fun readHighBits(b: ByteArray, off: Int): IntArray =
        IntArray(4) { k -> b[off + k].toInt() and 0xFF }

    // --- Q5_1: 24 bytes/block (d, m, qh[4], qs[16]) ---------------------------------------

    /** Fills one random Q5_1 block at `out[off]`: scale, offset, then random qh and qs bytes. */
    private fun randomQ5_1Block(rng: Random, out: ByteArray, off: Int) {
        val d = f16(0.02f + rng.nextFloat() * 0.05f)
        val m = f16(-0.3f + rng.nextFloat() * 0.6f)
        writeHalf(out, off, d)
        writeHalf(out, off + 2, m)
        for (k in 0 until 4) out[off + 4 + k] = rng.nextInt(256).toByte() // qh
        for (k in 0 until 16) out[off + 8 + k] = rng.nextInt(256).toByte() // qs
    }

    /** Reference dequantization of one Q5_1 block into 32 floats at `dst[dstOff]`. */
    private fun dequantQ5_1Block(b: ByteArray, off: Int, dst: FloatArray, dstOff: Int) {
        val d = readHalf(b, off)
        val m = readHalf(b, off + 2)
        val qh = readHighBits(b, off + 4)
        for (j in 0 until 16) {
            val q = b[off + 8 + j].toInt() and 0xFF
            val lo = q and 0x0F
            val hi = q ushr 4
            // High bit of element i lives in byte i / 8, bit i % 8 of qh.
            val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01
            val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01
            dst[dstOff + j] = d * (lo + (bitLo shl 4)) + m
            dst[dstOff + 16 + j] = d * (hi + (bitHi shl 4)) + m
        }
    }

    // --- Q5_0: 22 bytes/block (d, qh[4], qs[16]), symmetric -16 --------------------------

    /** Fills one random Q5_0 block at `out[off]`: scale, then random qh and qs bytes. */
    private fun randomQ5_0Block(rng: Random, out: ByteArray, off: Int) {
        val d = f16(0.02f + rng.nextFloat() * 0.05f)
        writeHalf(out, off, d)
        for (k in 0 until 4) out[off + 2 + k] = rng.nextInt(256).toByte() // qh
        for (k in 0 until 16) out[off + 6 + k] = rng.nextInt(256).toByte() // qs
    }

    /** Reference dequantization of one Q5_0 block into 32 floats at `dst[dstOff]`. */
    private fun dequantQ5_0Block(b: ByteArray, off: Int, dst: FloatArray, dstOff: Int) {
        val d = readHalf(b, off)
        val qh = readHighBits(b, off + 2)
        for (j in 0 until 16) {
            val q = b[off + 6 + j].toInt() and 0xFF
            val lo = q and 0x0F
            val hi = q ushr 4
            // High bit of element i lives in byte i / 8, bit i % 8 of qh.
            val bitLo = (qh[j / 8] ushr (j % 8)) and 0x01
            val bitHi = (qh[(j + 16) / 8] ushr ((j + 16) % 8)) and 0x01
            dst[dstOff + j] = d * (lo + (bitLo shl 4) - 16)
            dst[dstOff + 16 + j] = d * (hi + (bitHi shl 4) - 16)
        }
    }

    /**
     * Builds a random packed weight of shape `[outputDim, inputDim]` plus its FP32 reference,
     * runs both through `matmul(input, transpose(W))` and asserts the outputs agree to within
     * an absolute error of `1e-3`.
     *
     * [inputDim] must be a multiple of 32 so every row is a whole number of blocks.
     */
    private fun assertPackedMatchesFp32(
        encoding: Encoding, inputDim: Int, outputDim: Int, batchSize: Int, seed: Int,
    ) {
        require(inputDim % 32 == 0) { "inputDim must be a multiple of 32, was $inputDim" }
        val rng = Random(seed)
        val blocksPerRow = inputDim / 32
        val bytesPerBlock = encoding.bytesPerBlock
        val bytes = ByteArray(outputDim * blocksPerRow * bytesPerBlock)
        val wf = FloatArray(outputDim * inputDim) // row-major [out, in]
        for (o in 0 until outputDim) {
            for (blk in 0 until blocksPerRow) {
                val off = (o * blocksPerRow + blk) * bytesPerBlock
                val dstOff = o * inputDim + blk * 32
                when (encoding) {
                    Encoding.Q5_1 -> {
                        randomQ5_1Block(rng, bytes, off)
                        dequantQ5_1Block(bytes, off, wf, dstOff)
                    }
                    Encoding.Q5_0 -> {
                        randomQ5_0Block(rng, bytes, off)
                        dequantQ5_0Block(bytes, off, wf, dstOff)
                    }
                }
            }
        }

        val weightShape = Shape(outputDim, inputDim)
        @Suppress("UNCHECKED_CAST")
        val packed: Tensor<FP32, Float> = when (encoding) {
            Encoding.Q5_1 ->
                ctx.fromData(Q5_1BlockTensorData(weightShape, bytes) as TensorData<FP32, Float>, FP32::class)
            Encoding.Q5_0 ->
                ctx.fromData(Q5_0BlockTensorData(weightShape, bytes) as TensorData<FP32, Float>, FP32::class)
        }
        val fp32 = ctx.fromFloatArray<FP32, Float>(Shape(outputDim, inputDim), FP32::class, wf)

        val input = ctx.fromFloatArray<FP32, Float>(
            Shape(batchSize, inputDim), FP32::class, FloatArray(batchSize * inputDim) { (rng.nextFloat() - 0.5f) },
        )
        val outPacked = ctx.ops.matmul(input, ctx.ops.transpose(packed)).data.copyToFloatArray()
        val outFp32 = ctx.ops.matmul(input, ctx.ops.transpose(fp32)).data.copyToFloatArray()

        assertEquals(outFp32.size, outPacked.size, "$encoding output size")
        var maxErr = 0f
        for (i in outFp32.indices) maxErr = maxOf(maxErr, kotlin.math.abs(outFp32[i] - outPacked[i]))
        assertTrue(maxErr < 1e-3f, "$encoding packed matmul deviates from FP32 dequant: maxErr=$maxErr")
    }

    @Test fun q5_1_matmul_matches_fp32_dequant_single_batch() =
        assertPackedMatchesFp32(Encoding.Q5_1, inputDim = 128, outputDim = 64, batchSize = 1, seed = 1)

    @Test fun q5_1_matmul_matches_fp32_dequant_multi_batch() =
        assertPackedMatchesFp32(Encoding.Q5_1, inputDim = 256, outputDim = 96, batchSize = 3, seed = 2)

    @Test fun q5_0_matmul_matches_fp32_dequant_single_batch() =
        assertPackedMatchesFp32(Encoding.Q5_0, inputDim = 128, outputDim = 64, batchSize = 1, seed = 3)

    @Test fun q5_0_matmul_matches_fp32_dequant_multi_batch() =
        assertPackedMatchesFp32(Encoding.Q5_0, inputDim = 192, outputDim = 48, batchSize = 2, seed = 4)
}
Loading
Loading