Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,24 @@ public interface KernelProvider {
*/
public fun matmulQ4_0(): Q4_0MatmulKernel? = null

/**
 * F32 × Q6_K matmul kernel exposed by this provider, or `null` if
 * this provider does not specialize Q6_K. Same fall-through pattern.
 *
 * The default implementation returns `null`, so only providers that
 * override this method expose a Q6_K kernel.
 *
 * @return this provider's [Q6KMatmulKernel], or `null` when it offers none.
 */
public fun matmulQ6K(): Q6KMatmulKernel? = null

/**
 * F32 × Q5_1 matmul kernel exposed by this provider, or `null` if
 * this provider does not specialize Q5_1. Same fall-through pattern.
 *
 * The default implementation returns `null`, so only providers that
 * override this method expose a Q5_1 kernel.
 *
 * @return this provider's [Q5_1MatmulKernel], or `null` when it offers none.
 */
public fun matmulQ5_1(): Q5_1MatmulKernel? = null

/**
 * F32 × Q5_0 matmul kernel exposed by this provider, or `null` if
 * this provider does not specialize Q5_0. Same fall-through pattern.
 *
 * The default implementation returns `null`, so only providers that
 * override this method expose a Q5_0 kernel.
 *
 * @return this provider's [Q5_0MatmulKernel], or `null` when it offers none.
 */
public fun matmulQ5_0(): Q5_0MatmulKernel? = null

/**
* Capability query: does this provider carry a kernel for
* [opName] with the given [dtypeKeys]?
Expand Down Expand Up @@ -107,6 +125,9 @@ public interface KernelProvider {
"Q4_K" -> matmulQ4K() != null
"Q8_0" -> matmulQ8_0() != null
"Q4_0" -> matmulQ4_0() != null
"Q6_K" -> matmulQ6K() != null
"Q5_1" -> matmulQ5_1() != null
"Q5_0" -> matmulQ5_0() != null
else -> false
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package sk.ainet.backend.api.kernel

/**
 * Matrix-vector product of an F32 input row with weights stored as
 * Q5_0 blocks in the canonical ggml layout.
 *
 *     output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *         for j ∈ [0, inputDim), o ∈ [0, outputDim)
 *
 * ## Block format
 * One block encodes 32 elements in 22 bytes (see the kdoc of
 * [sk.ainet.lang.tensor.data.Q5_0BlockTensorData]):
 *  - bytes 0..1  : `d` — block scale, FP16 little-endian
 *  - bytes 2..5  : `qh[0..3]` — the 5th (high) bit of each of the 32 codes
 *  - bytes 6..21 : `qs[0..15]` — the low 4 bits, two nibbles per byte
 *
 * ## Dequantization
 * For `j ∈ [0, 16)` let `lo = qs[j] & 0x0F`, `hi = qs[j] >>> 4`,
 * `bitLo = (qh[j/8] >>> (j%8)) & 1` and
 * `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`. Then:
 *
 *     element[j]      = d * (lo + (bitLo shl 4) - 16)
 *     element[j + 16] = d * (hi + (bitHi shl 4) - 16)
 *
 * Subtracting 16 centres the unsigned 5-bit code around zero; Q5_0 carries
 * no per-block minimum. This agrees with
 * `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_0FromBytes`.
 *
 * ## Contract
 *  - Implementations MUST NOT mutate `input` or `weight`.
 *  - Implementations MUST fully write the `outputDim` floats starting at
 *    `output[outputOffset]`.
 *  - `weight` follows the **block-major** row contract: the block for input
 *    block `blockIdx` and output row `o` is located at
 *    `(blockIdx * outputDim + o) * 22`, matching
 *    `Q5_0BlockTensorData.packedData`.
 *  - `inputDim` MUST be a multiple of 32 (the Q5_0 block size).
 */
public interface Q5_0MatmulKernel {
    /**
     * @param input FP32 input vector.
     * @param inputOffset element offset into [input] of the first contracted value.
     * @param weight packed Q5_0 bytes in the block-major layout described on the interface.
     * @param weightByteOffset byte offset into [weight] at which the packed blocks start.
     * @param inputDim contraction dimension (must be a multiple of 32).
     * @param outputDim number of output values to produce.
     * @param output FP32 output vector.
     * @param outputOffset element offset into [output] of the first written value.
     */
    public fun matmul(
        input: FloatArray,
        inputOffset: Int,
        weight: ByteArray,
        weightByteOffset: Int,
        inputDim: Int,
        outputDim: Int,
        output: FloatArray,
        outputOffset: Int,
    )
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package sk.ainet.backend.api.kernel

/**
 * Matrix-vector product of an F32 input row with weights stored as
 * Q5_1 blocks in the canonical ggml layout.
 *
 *     output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *         for j ∈ [0, inputDim), o ∈ [0, outputDim)
 *
 * ## Block format
 * One block encodes 32 elements in 24 bytes (see the kdoc of
 * [sk.ainet.lang.tensor.data.Q5_1BlockTensorData]):
 *  - bytes 0..1  : `d` — block scale, FP16 little-endian
 *  - bytes 2..3  : `m` — block minimum, FP16 little-endian
 *  - bytes 4..7  : `qh[0..3]` — the 5th (high) bit of each of the 32 codes
 *  - bytes 8..23 : `qs[0..15]` — the low 4 bits, two nibbles per byte
 *
 * ## Dequantization
 * For `j ∈ [0, 16)` let `lo = qs[j] & 0x0F`, `hi = qs[j] >>> 4`,
 * `bitLo = (qh[j/8] >>> (j%8)) & 1` and
 * `bitHi = (qh[(j+16)/8] >>> ((j+16)%8)) & 1`. Then:
 *
 *     element[j]      = d * (lo + (bitLo shl 4)) + m
 *     element[j + 16] = d * (hi + (bitHi shl 4)) + m
 *
 * This agrees with `sk.ainet.io.gguf.dequant.DequantOps.dequantQ5_1FromBytes`.
 *
 * ## Contract
 *  - Implementations MUST NOT mutate `input` or `weight`.
 *  - Implementations MAY assume the arrays do not alias each other or `output`.
 *  - Implementations MUST fully write the `outputDim` floats starting at
 *    `output[outputOffset]`.
 *  - `weight` follows the **block-major** row contract: the block for output
 *    row `o` and input block index `blockIdx` is located at
 *    `(blockIdx * outputDim + o) * 24`. This matches
 *    `Q5_1BlockTensorData.packedData` after the GGUF row-major →
 *    input-block-major re-layout.
 *  - `inputDim` MUST be a multiple of 32 (the Q5_1 block size).
 */
public interface Q5_1MatmulKernel {
    /**
     * @param input FP32 input vector (single row).
     * @param inputOffset element offset into [input] where the row starts.
     * @param weight packed Q5_1 bytes for the full `outputDim × inputDim` weight tensor.
     * @param weightByteOffset byte offset into [weight] where block (0, 0) starts.
     * @param inputDim contraction dimension (must be a multiple of 32).
     * @param outputDim number of output cells.
     * @param output FP32 output vector.
     * @param outputOffset element offset into [output] where the row starts.
     */
    public fun matmul(
        input: FloatArray,
        inputOffset: Int,
        weight: ByteArray,
        weightByteOffset: Int,
        inputDim: Int,
        outputDim: Int,
        output: FloatArray,
        outputOffset: Int,
    )
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package sk.ainet.backend.api.kernel

/**
 * Matrix-vector product of an F32 input row with weights stored as
 * Q6_K super-blocks in the canonical ggml layout.
 *
 *     output[outputOffset + o] = Σ_j input[inputOffset + j] · dequant(weight[o, j])
 *         for j ∈ [0, inputDim), o ∈ [0, outputDim)
 *
 * ## Super-block format
 * One super-block encodes 256 elements in 210 bytes (see
 * [sk.ainet.lang.tensor.data.Q6_KBlockTensorData]):
 *  - bytes 0..127   : `ql[0..127]` — lower 4 bits of each code
 *  - bytes 128..191 : `qh[0..63]` — upper 2 bits of each code
 *  - bytes 192..207 : `scales[0..15]` — int8 scales, one per 16-element sub-block
 *  - bytes 208..209 : `d` — super-block scale, FP16 little-endian
 *
 * ## Dequantization
 * The 6-bit signed code is reassembled from `ql`/`qh` (see ggml
 * `dequantize_row_q6_K`), and per element
 * `dequant = d * scales[sub] * (code - 32)`.
 * `sk.ainet.io.gguf.dequant.DequantOps.dequantQ6KFromBytes` is the
 * authoritative reference; implementations MUST agree with it.
 *
 * ## Contract
 *  - Implementations MUST NOT mutate `input` or `weight`.
 *  - Implementations MUST fully write the `outputDim` floats starting at
 *    `output[outputOffset]`.
 *  - `weight` follows the **block-major** row contract: the super-block for
 *    input block `blockIdx` and output row `o` is located at
 *    `(blockIdx * outputDim + o) * 210`, matching
 *    `Q6_KBlockTensorData.packedData`.
 *  - `inputDim` MUST be a multiple of 256 (the Q6_K super-block size).
 */
public interface Q6KMatmulKernel {
    /**
     * @param input FP32 input vector.
     * @param inputOffset element offset into [input] of the first contracted value.
     * @param weight packed Q6_K bytes in the block-major layout described on the interface.
     * @param weightByteOffset byte offset into [weight] at which the packed super-blocks start.
     * @param inputDim contraction dimension (must be a multiple of 256).
     * @param outputDim number of output values to produce.
     * @param output FP32 output vector.
     * @param outputOffset element offset into [output] of the first written value.
     */
    public fun matmul(
        input: FloatArray,
        inputOffset: Int,
        weight: ByteArray,
        weightByteOffset: Int,
        inputDim: Int,
        outputDim: Int,
        output: FloatArray,
        outputOffset: Int,
    )
}
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProvider : sk/ainet/ba
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
}
Expand All @@ -67,6 +70,9 @@ public final class sk/ainet/exec/kernel/PanamaVectorKernelProviderFactory : sk/a
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
}
Expand Down Expand Up @@ -105,6 +111,9 @@ public final class sk/ainet/exec/kernel/ScalarKernelProvider : sk/ainet/backend/
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
}
Expand All @@ -118,6 +127,9 @@ public final class sk/ainet/exec/kernel/ScalarKernelProviderFactory : sk/ainet/b
public fun matmulFp32 ()Lsk/ainet/backend/api/kernel/Fp32MatmulKernel;
public fun matmulQ4K ()Lsk/ainet/backend/api/kernel/Q4KMatmulKernel;
public fun matmulQ4_0 ()Lsk/ainet/backend/api/kernel/Q4_0MatmulKernel;
public fun matmulQ5_0 ()Lsk/ainet/backend/api/kernel/Q5_0MatmulKernel;
public fun matmulQ5_1 ()Lsk/ainet/backend/api/kernel/Q5_1MatmulKernel;
public fun matmulQ6K ()Lsk/ainet/backend/api/kernel/Q6KMatmulKernel;
public fun matmulQ8_0 ()Lsk/ainet/backend/api/kernel/Q8_0MatmulKernel;
public fun supports (Ljava/lang/String;Ljava/util/List;)Z
}
Expand All @@ -132,6 +144,26 @@ public final class sk/ainet/exec/kernel/ScalarQ4_0MatmulKernel : sk/ainet/backen
public fun matmul ([FI[BIII[FI)V
}

public final class sk/ainet/exec/kernel/ScalarQ4_KMatmulKernel : sk/ainet/backend/api/kernel/Q4KMatmulKernel {
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ4_KMatmulKernel;
public fun matmul ([FI[BIII[FI)V
}

public final class sk/ainet/exec/kernel/ScalarQ5_0MatmulKernel : sk/ainet/backend/api/kernel/Q5_0MatmulKernel {
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ5_0MatmulKernel;
public fun matmul ([FI[BIII[FI)V
}

public final class sk/ainet/exec/kernel/ScalarQ5_1MatmulKernel : sk/ainet/backend/api/kernel/Q5_1MatmulKernel {
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ5_1MatmulKernel;
public fun matmul ([FI[BIII[FI)V
}

public final class sk/ainet/exec/kernel/ScalarQ6_KMatmulKernel : sk/ainet/backend/api/kernel/Q6KMatmulKernel {
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ6_KMatmulKernel;
public fun matmul ([FI[BIII[FI)V
}

public final class sk/ainet/exec/kernel/ScalarQ8_0MatmulKernel : sk/ainet/backend/api/kernel/Q8_0MatmulKernel {
public static final field INSTANCE Lsk/ainet/exec/kernel/ScalarQ8_0MatmulKernel;
public fun matmul ([FI[BIII[FI)V
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package sk.ainet.exec.kernel

/**
 * Widens an IEEE-754 half-precision (binary16) value to FP32.
 *
 * Only the low 16 bits of [hbits] are inspected, so a sign-extended
 * 16-bit value decodes the same as its zero-extended form. Signed zeros,
 * subnormals, infinities and NaNs (mantissa payload shifted into the FP32
 * mantissa) are all handled.
 *
 * Shared by the scalar packed-quant kernels in this package
 * (Q5_1/Q5_0/Q4_K/Q6_K); mirrors the inlined helpers in
 * [ScalarQ4_0MatmulKernel] / [ScalarQ8_0MatmulKernel].
 *
 * @param hbits the half-precision bit pattern in its low 16 bits.
 * @return the FP32 value with the same sign and magnitude.
 */
internal fun decodeHalf(hbits: Int): Float {
    val signBit = (hbits and 0x8000) shl 16
    val exponentField = (hbits ushr 10) and 0x1F
    val fraction = hbits and 0x03FF

    // All-ones exponent: infinity (fraction == 0) or NaN (payload preserved).
    if (exponentField == 0x1F) {
        return Float.fromBits(signBit or (0xFF shl 23) or (fraction shl 13))
    }

    // Normal number: re-bias the exponent from 15 to 127 (i.e. add 112).
    if (exponentField != 0) {
        return Float.fromBits(signBit or ((exponentField + 112) shl 23) or (fraction shl 13))
    }

    // Zero exponent with zero fraction: signed zero.
    if (fraction == 0) {
        return Float.fromBits(signBit)
    }

    // Subnormal half: shift the fraction left until the implicit leading one
    // reaches bit 10, lowering the exponent once per shift, then drop that bit.
    var normalized = fraction
    var unbiasedExponent = -14
    while ((normalized and 0x400) == 0) {
        normalized = normalized shl 1
        unbiasedExponent -= 1
    }
    val fp32Fraction = (normalized and 0x3FF) shl 13
    return Float.fromBits(signBit or ((unbiasedExponent + 127) shl 23) or fp32Fraction)
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ package sk.ainet.exec.kernel
import sk.ainet.backend.api.kernel.Bf16MatmulKernel
import sk.ainet.backend.api.kernel.Fp32MatmulKernel
import sk.ainet.backend.api.kernel.KernelProvider
import sk.ainet.backend.api.kernel.Q4KMatmulKernel
import sk.ainet.backend.api.kernel.Q4_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5_0MatmulKernel
import sk.ainet.backend.api.kernel.Q5_1MatmulKernel
import sk.ainet.backend.api.kernel.Q6KMatmulKernel
import sk.ainet.backend.api.kernel.Q8_0MatmulKernel

/**
Expand All @@ -27,4 +31,8 @@ public object ScalarKernelProvider : KernelProvider {
// Returns ScalarBf16MatmulKernel unconditionally; the declared return type is non-null.
override fun matmulBf16(): Bf16MatmulKernel = ScalarBf16MatmulKernel
// Returns ScalarQ8_0MatmulKernel unconditionally; the declared return type is non-null.
override fun matmulQ8_0(): Q8_0MatmulKernel = ScalarQ8_0MatmulKernel
// Returns ScalarQ4_0MatmulKernel unconditionally; the declared return type is non-null.
override fun matmulQ4_0(): Q4_0MatmulKernel = ScalarQ4_0MatmulKernel
// Returns the ScalarQ4_KMatmulKernel object unconditionally; the declared return type is non-null.
override fun matmulQ4K(): Q4KMatmulKernel = ScalarQ4_KMatmulKernel
// Returns ScalarQ6_KMatmulKernel unconditionally; the declared return type is non-null.
override fun matmulQ6K(): Q6KMatmulKernel = ScalarQ6_KMatmulKernel
// Returns ScalarQ5_1MatmulKernel unconditionally; the declared return type is non-null.
override fun matmulQ5_1(): Q5_1MatmulKernel = ScalarQ5_1MatmulKernel
// Returns ScalarQ5_0MatmulKernel unconditionally; the declared return type is non-null.
override fun matmulQ5_0(): Q5_0MatmulKernel = ScalarQ5_0MatmulKernel
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package sk.ainet.exec.kernel

import sk.ainet.backend.api.kernel.Q4KMatmulKernel

/**
 * Portable scalar [Q4KMatmulKernel]. It lives in commonMain so that Q4_K
 * packed matmul is available on Kotlin/Native / JS / WASM and not only
 * through the JVM SIMD path.
 *
 * A Q4_K super-block covers 256 elements in 144 bytes and is addressed
 * block-major at `(blockIdx * outputDim + o) * 144`:
 *  - bytes 0..1    : `d` (FP16)
 *  - bytes 2..3    : `dMin` (FP16)
 *  - bytes 4..15   : 12 scale bytes in ggml `get_scale_min_k4` packing
 *  - bytes 16..143 : 128 code bytes (two 4-bit codes per byte)
 *
 * Each of the 8 sub-blocks (32 elements) contributes
 * `codeSum * scale - inputSum * offset`, where `scale = d * scaleIdx` and
 * `offset = dMin * minIdx`. The math mirrors
 * `JvmQuantizedVectorKernels.matmulQ4_KVec` / `DequantOps.dequantQ4KFromBytes`.
 */
public object ScalarQ4_KMatmulKernel : Q4KMatmulKernel {

    private const val BLOCK_SIZE = 256
    private const val SUB_BLOCK = 32
    private const val BYTES_PER_BLOCK = 144

    /**
     * Computes `outputDim` dot products of the input row against the packed
     * Q4_K weight rows and stores them at `output[outputOffset ..]`.
     *
     * Returns without writing when `outputDim` is 0, and writes zeros when
     * `inputDim` is 0.
     *
     * @throws IllegalArgumentException if `inputDim` is not a multiple of 256.
     */
    override fun matmul(
        input: FloatArray, inputOffset: Int,
        weight: ByteArray, weightByteOffset: Int,
        inputDim: Int, outputDim: Int,
        output: FloatArray, outputOffset: Int,
    ) {
        require(inputDim % BLOCK_SIZE == 0) {
            "ScalarQ4_KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
        }
        if (outputDim == 0) return
        if (inputDim == 0) {
            for (row in 0 until outputDim) {
                output[outputOffset + row] = 0f
            }
            return
        }

        val superBlocks = inputDim / BLOCK_SIZE
        // Scratch buffers reused for every super-block to avoid per-block allocation.
        val scales = IntArray(8)
        val mins = IntArray(8)

        for (row in 0 until outputDim) {
            var rowSum = 0f
            for (block in 0 until superBlocks) {
                val base = weightByteOffset + (block * outputDim + row) * BYTES_PER_BLOCK
                val d = readHalf(weight, base)
                val dMin = readHalf(weight, base + 2)
                unpackScalesAndMins(weight, base + 4, scales, mins)

                val codesBase = base + 16
                val inputBase = inputOffset + block * BLOCK_SIZE
                for (sb in 0 until 8) {
                    // Sub-blocks 2g and 2g+1 share the same 32 code bytes: the even one
                    // takes the low nibbles, the odd one the high nibbles.
                    val pairBase = codesBase + (sb ushr 1) * 32
                    val nibbleShift = (sb and 1) shl 2
                    val inputStart = inputBase + sb * SUB_BLOCK
                    var codeDot = 0f
                    var inputTotal = 0f
                    for (i in 0 until 32) {
                        val packed = weight[pairBase + i].toInt() and 0xFF
                        val code = (packed ushr nibbleShift) and 0x0F
                        val x = input[inputStart + i]
                        codeDot += x * code
                        inputTotal += x
                    }
                    rowSum += codeDot * (d * scales[sb]) - inputTotal * (dMin * mins[sb])
                }
            }
            output[outputOffset + row] = rowSum
        }
    }

    /** Decodes the little-endian FP16 stored at `bytes[at]`, `bytes[at + 1]`. */
    private fun readHalf(bytes: ByteArray, at: Int): Float =
        decodeHalf(((bytes[at + 1].toInt() and 0xFF) shl 8) or (bytes[at].toInt() and 0xFF))

    /**
     * Unpacks the 12 scale bytes starting at `bytes[at]` into eight 6-bit
     * scale indices and eight 6-bit min indices (ggml `get_scale_min_k4`).
     */
    private fun unpackScalesAndMins(bytes: ByteArray, at: Int, scales: IntArray, mins: IntArray) {
        // Sub-blocks 0..3: the low 6 bits of bytes 0..3 (scale) and 4..7 (min).
        for (sb in 0 until 4) {
            scales[sb] = bytes[at + sb].toInt() and 0x3F
            mins[sb] = bytes[at + sb + 4].toInt() and 0x3F
        }
        // Sub-blocks 4..7: low 4 bits come from bytes 8..11 (scale in the low
        // nibble, min in the high nibble); the top 2 bits are borrowed from the
        // upper bits of bytes 0..3 (scale) and 4..7 (min).
        for (sb in 4 until 8) {
            val packed = bytes[at + sb + 4].toInt() and 0xFF
            val scaleTop = (bytes[at + sb - 4].toInt() and 0xFF) ushr 6
            val minTop = (bytes[at + sb].toInt() and 0xFF) ushr 6
            scales[sb] = (packed and 0x0F) or (scaleTop shl 4)
            mins[sb] = (packed ushr 4) or (minTop shl 4)
        }
    }
}
Loading
Loading