Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ public interface KernelProvider {
*/
public fun matmulQ6K(): Q6KMatmulKernel? = null

/**
 * F32 × Q5_K matmul kernel exposed by this provider, or `null` if
 * this provider does not specialize Q5_K.
 *
 * The default implementation returns `null`, so a provider only has to
 * override this accessor when it actually ships a Q5_K kernel. Callers
 * are expected to test the result for `null` before use (the format
 * capability check in this interface does exactly that for `"Q5_K"`).
 *
 * NOTE(review): "fall-through" in the sibling accessors' docs presumably
 * means the caller moves on to another provider or a non-specialized path
 * when `null` is returned — confirm against the dispatch code.
 *
 * @return the provider's [Q5KMatmulKernel], or `null` when Q5_K is not specialized.
 */
public fun matmulQ5K(): Q5KMatmulKernel? = null

/**
* F32 × Q5_1 matmul kernel exposed by this provider, or `null` if
* this provider does not specialize Q5_1. Same fall-through pattern.
Expand Down Expand Up @@ -126,6 +132,7 @@ public interface KernelProvider {
"Q8_0" -> matmulQ8_0() != null
"Q4_0" -> matmulQ4_0() != null
"Q6_K" -> matmulQ6K() != null
"Q5_K" -> matmulQ5K() != null
"Q5_1" -> matmulQ5_1() != null
"Q5_0" -> matmulQ5_0() != null
else -> false
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package sk.ainet.backend.api.kernel

/**
 * F32 input × Q5_K-packed weights matrix-vector multiply, in canonical
 * ggml super-block layout.
 *
 *     output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *         for j ∈ [0, inputDim), o ∈ [0, outputDim)
 *
 * Block layout (256-element super-block, 176 bytes/block; see
 * [sk.ainet.lang.tensor.data.Q5_KTensorData] kdoc for the byte map):
 *  - bytes 0..1   : `d` (super-block scale, FP16, little-endian)
 *  - bytes 2..3   : `dMin` (super-block min-scale, FP16, little-endian)
 *  - bytes 4..15  : 12 bytes holding a 6-bit `scaleIdx` and a 6-bit `minIdx`
 *                   for each of the 8 sub-blocks, packed with ggml's
 *                   `get_scale_min_k4` scheme (identical to Q4_K)
 *  - bytes 16..47 : 32 bytes `qh`, the high-bit plane: bit `s` of `qh[i]` is
 *                   the 5th code bit of element `i` in sub-block `s`
 *  - bytes 48..175: 128 bytes `qs` of 4-bit low nibbles in 4 groups of
 *                   32 bytes; within group `g` the low nibbles belong to
 *                   sub-block `2g` and the high nibbles to sub-block `2g + 1`
 *                   (identical layout to Q4_K's `qs`)
 *
 * Per sub-block s ∈ 0..7 (32 elements each):
 *  - `scale[s]  = d * scaleIdx[s]`
 *  - `offset[s] = dMin * minIdx[s]`
 *  - per element: `code = lowNibble | (fifthBit << 4)` (0..31) and
 *    `dequant = code * scale[s] - offset[s]`
 *
 * Implementations typically avoid subtracting `offset` per element: they
 * accumulate `Σ(input · code)` and `Σ(input)` per sub-block and combine them
 * once as `scale * codeSum − offset * inputSum` (the "lazy dmin" form also
 * used by ggml's reference code).
 *
 * Implementations MUST NOT mutate `input` or `weight`. They MAY assume
 * the arrays do not alias each other or `output`. They MUST fully
 * write the `outputDim` floats starting at `output[outputOffset]`.
 *
 * Packed-weight layout contract: the super-block for output row `o` and
 * input block index `blockIdx` starts at byte
 * `weightByteOffset + (blockIdx * outputDim + o) * 176`, i.e. all output
 * rows of one input block are contiguous. This matches
 * `Q5_KBlockTensorData.packedData`.
 * NOTE(review): this contract was previously labelled "row-major"; the
 * formula above (which the scalar kernel implements) is block-major —
 * confirm the intended wording.
 *
 * `inputDim` MUST be a multiple of 256 (the Q5_K block size).
 */
public interface Q5KMatmulKernel {
/**
 * Multiplies one FP32 input row by the packed Q5_K weight matrix and
 * stores the `outputDim` results into [output].
 *
 * @param input FP32 input vector (single row).
 * @param inputOffset element offset into [input] where the row starts.
 * @param weight packed Q5_K bytes for the full `outputDim × inputDim` weight tensor.
 * @param weightByteOffset byte offset into [weight] where block (0, 0) starts.
 * @param inputDim contraction dimension (must be a multiple of 256).
 * @param outputDim number of output cells.
 * @param output FP32 output vector; cells `outputOffset until outputOffset + outputDim` are overwritten.
 * @param outputOffset element offset into [output] where the row starts.
 * @throws IllegalArgumentException implementations may reject an [inputDim]
 * that is not a multiple of 256 (the scalar reference kernel does).
 */
public fun matmul(
input: FloatArray, inputOffset: Int,
weight: ByteArray, weightByteOffset: Int,
inputDim: Int, outputDim: Int,
output: FloatArray, outputOffset: Int,
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import sk.ainet.backend.api.kernel.KernelProvider
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5KMatmulKernel
import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
import sk.ainet.backend.api.kernel.Q6KMatmulKernel
import sk.ainet.backend.api.kernel.Q8_0MatmulKernel
Expand Down Expand Up @@ -33,6 +34,7 @@ public object ScalarKernelProvider : KernelProvider {
// Quantized matmul accessors. Each override returns the matching scalar
// kernel object unconditionally and declares a non-null return type, so
// this provider always reports these formats as available (the Q5_K and
// Q6_K accessors are nullable in KernelProvider and default to null).
override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel
override fun matmulQ4K(): Q4KMatmulKernel = ScalarQ4_KMatmulKernel
override fun matmulQ6K(): Q6KMatmulKernel = ScalarQ6_KMatmulKernel
override fun matmulQ5K(): Q5KMatmulKernel = ScalarQ5_KMatmulKernel
override fun matmulQ5_1(): Q5_1MatmulKernel = ScalarQ5_1MatmulKernel
override fun matmulQ5_0(): Q5_0MatmulKernel = ScalarQ5_0MatmulKernel
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package sk.ainet.exec.kernel

import sk.ainet.backend.api.kernel.Q5KMatmulKernel

/**
 * Portable scalar [Q5KMatmulKernel]. It lives in commonMain so that Q5_K packed
 * matmul is available on Kotlin/Native, JS and WASM as well as on the JVM SIMD path.
 *
 * The kernel works directly on the packed bytes. A Q5_K super-block covers
 * 256 elements in 176 bytes and is addressed block-major at
 * `weightByteOffset + (blockIdx * outputDim + o) * 176`:
 *  - bytes 0..1   `d`    (16-bit little-endian pattern handed to `decodeHalf`)
 *  - bytes 2..3   `dMin` (same encoding)
 *  - bytes 4..15  scale/min indices in ggml `get_scale_min_k4` packing
 *  - bytes 16..47 `qh` high-bit plane
 *  - bytes 48..175 `qs` low nibbles
 *
 * Every 32-element sub-block `s` contributes
 * `codeSum * (d * scaleIdx[s]) - inputSum * (dMin * minIdx[s])`, where
 * `code = lowNibble | (fifthBit << 4)`. The math mirrors
 * `DequantOps.dequantQ5KFromBytes` and the Q4_K kernel; only the 5th-bit fold differs.
 */
public object ScalarQ5_KMatmulKernel : Q5KMatmulKernel {

    private const val BLOCK_SIZE = 256
    private const val SUB_BLOCK = 32
    private const val SUB_BLOCKS_PER_BLOCK = 8
    private const val BYTES_PER_BLOCK = 176
    private const val SCALES_OFFSET = 4
    private const val QH_OFFSET = 16
    private const val QS_OFFSET = 48

    /**
     * Computes `output[outputOffset + o]` for every `o` in `0 until outputDim`
     * as the dot product of the input row with the dequantized weight row `o`.
     *
     * @throws IllegalArgumentException if [inputDim] is not a multiple of 256.
     */
    override fun matmul(
        input: FloatArray, inputOffset: Int,
        weight: ByteArray, weightByteOffset: Int,
        inputDim: Int, outputDim: Int,
        output: FloatArray, outputOffset: Int,
    ) {
        require(inputDim % BLOCK_SIZE == 0) {
            "ScalarQ5_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
        }
        if (outputDim == 0) return
        if (inputDim == 0) {
            // Empty contraction: every output cell is an empty sum.
            repeat(outputDim) { row -> output[outputOffset + row] = 0f }
            return
        }

        val blockCount = inputDim / BLOCK_SIZE
        // Scratch buffers reused for every super-block.
        val scales = IntArray(SUB_BLOCKS_PER_BLOCK)
        val mins = IntArray(SUB_BLOCKS_PER_BLOCK)

        for (row in 0 until outputDim) {
            var total = 0f
            for (block in 0 until blockCount) {
                val base = weightByteOffset + (block * outputDim + row) * BYTES_PER_BLOCK
                val d = decodeHalf(readU16LE(weight, base))
                val dMin = decodeHalf(readU16LE(weight, base + 2))
                unpackScalesAndMins(weight, base + SCALES_OFFSET, scales, mins)

                val qh = base + QH_OFFSET
                val qs = base + QS_OFFSET
                val inputBase = inputOffset + block * BLOCK_SIZE

                for (sub in 0 until SUB_BLOCKS_PER_BLOCK) {
                    // Sub-blocks 2g and 2g+1 share the g-th 32-byte qs group:
                    // the even one takes the low nibbles, the odd one the high nibbles.
                    val qsGroup = qs + (sub ushr 1) * SUB_BLOCK
                    val nibbleShift = (sub and 1) * 4
                    val inputStart = inputBase + sub * SUB_BLOCK

                    var weighted = 0f
                    var plain = 0f
                    for (k in 0 until SUB_BLOCK) {
                        val packed = weight[qsGroup + k].toInt() and 0xFF
                        val nibble = (packed ushr nibbleShift) and 0x0F
                        // Bit `sub` of qh[k] is the 5th code bit for this sub-block.
                        val highBit = ((weight[qh + k].toInt() and 0xFF) ushr sub) and 0x01
                        val code = nibble or (highBit shl 4)
                        val x = input[inputStart + k]
                        weighted += x * code
                        plain += x
                    }
                    // Apply scale and min once per sub-block instead of once per element.
                    total += weighted * (d * scales[sub]) - plain * (dMin * mins[sub])
                }
            }
            output[outputOffset + row] = total
        }
    }

    /** Assembles the unsigned 16-bit little-endian value stored at `bytes[at]`, `bytes[at + 1]`. */
    private fun readU16LE(bytes: ByteArray, at: Int): Int =
        ((bytes[at + 1].toInt() and 0xFF) shl 8) or (bytes[at].toInt() and 0xFF)

    /**
     * ggml `get_scale_min_k4` over the 12 scale bytes starting at `packed[at]`
     * (identical to Q4_K). Sub-blocks 0..3 use the low 6 bits of bytes `s` and
     * `s + 4`; sub-blocks 4..7 take 4 bits from byte `s + 4` plus the top 2 bits
     * of byte `s - 4` (scale) or byte `s` (min).
     */
    private fun unpackScalesAndMins(packed: ByteArray, at: Int, scales: IntArray, mins: IntArray) {
        for (s in 0 until 4) {
            scales[s] = packed[at + s].toInt() and 0x3F
            mins[s] = packed[at + s + 4].toInt() and 0x3F
        }
        for (s in 4 until 8) {
            val shared = packed[at + s + 4].toInt() and 0xFF
            val scaleTop = (packed[at + s - 4].toInt() and 0xFF) ushr 6
            scales[s] = (shared and 0x0F) or (scaleTop shl 4)
            val minTop = (packed[at + s].toInt() and 0xFF) ushr 6
            mins[s] = (shared ushr 4) or (minTop shl 4)
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import sk.ainet.lang.tensor.data.Q4_KTensorData
import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
import sk.ainet.lang.tensor.data.Q6_KTensorData
import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
import sk.ainet.lang.tensor.data.Q5_KTensorData
import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
import sk.ainet.lang.tensor.data.Q5_1TensorData
import sk.ainet.lang.tensor.data.Q5_1BlockTensorData
import sk.ainet.lang.tensor.data.Q5_0TensorData
Expand Down Expand Up @@ -333,6 +335,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
// Quantized matmul kernels, one per packed format, resolved on first access.
// Each delegate asks resolveProvider for a provider whose accessor returns
// non-null and then reads that accessor; because of the safe call the
// property is null when resolveProvider yields no provider.
// NOTE(review): provider ordering/priority is decided inside resolveProvider,
// which is not visible here — confirm before relying on a specific backend.
private val q4_0Kernel by lazy { resolveProvider { it.matmulQ4_0() != null }?.matmulQ4_0() }
private val q4kKernel by lazy { resolveProvider { it.matmulQ4K() != null }?.matmulQ4K() }
private val q6kKernel by lazy { resolveProvider { it.matmulQ6K() != null }?.matmulQ6K() }
private val q5kKernel by lazy { resolveProvider { it.matmulQ5K() != null }?.matmulQ5K() }
private val q5_1Kernel by lazy { resolveProvider { it.matmulQ5_1() != null }?.matmulQ5_1() }
private val q5_0Kernel by lazy { resolveProvider { it.matmulQ5_0() != null }?.matmulQ5_0() }

Expand Down Expand Up @@ -367,6 +370,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
is Q5_1TensorData -> q5_1Kernel?.let { k -> run(bd.packedData, k::matmul) }
is Q5_0TensorData -> q5_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
is Q4_KTensorData -> q4kKernel?.let { k -> run(bd.packedData, k::matmul) }
is Q5_KTensorData -> q5kKernel?.let { k -> run(bd.packedData, k::matmul) }
is Q6_KTensorData -> q6kKernel?.let { k -> run(bd.packedData, k::matmul) }
is Q8_0TensorData -> q8_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
is Q4_0TensorData -> q4_0Kernel?.let { k -> run(bd.packedData, k::matmul) }
Expand Down Expand Up @@ -598,6 +602,7 @@ public open class DefaultCpuOpsBase(protected val dataFactory: TensorDataFactory
@Suppress("UNCHECKED_CAST")
when (val d = tensor.data) {
is Q4_KTensorData -> return newTensor(Q4_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
is Q5_KTensorData -> return newTensor(Q5_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
is Q6_KTensorData -> return newTensor(Q6_KBlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
is Q5_1TensorData -> return newTensor(Q5_1BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
is Q5_0TensorData -> return newTensor(Q5_0BlockTensorData(Shape(cols, rows), d.packedData) as TensorData<T, V>, tensor.dtype, tensor)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import sk.ainet.backend.api.kernel.Fp32MatmulKernel
import sk.ainet.backend.api.kernel.KernelProvider
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5KMatmulKernel
import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
import sk.ainet.backend.api.kernel.Q6KMatmulKernel
Expand Down Expand Up @@ -65,6 +66,9 @@ public object PanamaVectorKernelProvider : KernelProvider {
// Exposes the Panama Q6_K kernel only when isAvailable() is true; otherwise
// returns null. The kernel object is referenced solely inside the guarded
// branch — presumably so it is never touched on runtimes without the
// Vector API (TODO confirm against isAvailable()).
override fun matmulQ6K(): Q6KMatmulKernel? =
if (isAvailable()) PanamaVectorQ6_KMatmulKernel else null

// Exposes the Panama Q5_K kernel only when isAvailable() is true; otherwise
// returns null. Keep the kernel reference inside the guarded branch: naming
// the object before the check would presumably initialize it even when the
// Vector API is missing (TODO confirm against isAvailable()).
override fun matmulQ5K(): Q5KMatmulKernel? =
if (isAvailable()) PanamaVectorQ5_KMatmulKernel else null

private fun isVectorApiClassLoaded(): Boolean = runCatching {
Class.forName("jdk.incubator.vector.FloatVector")
Class.forName("jdk.incubator.vector.VectorSpecies")
Expand Down
Loading
Loading