Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package sk.ainet.exec.tensor.ops

import kotlin.math.abs
import kotlin.random.Random
import kotlin.test.Test
import kotlin.test.assertTrue
import sk.ainet.context.DirectCpuExecutionContext
import sk.ainet.lang.tensor.Shape
import sk.ainet.lang.tensor.Tensor
import sk.ainet.lang.tensor.data.Q4_0Quantizer
import sk.ainet.lang.tensor.data.TensorData
import sk.ainet.lang.types.FP32

/**
 * End-to-end check that [Q4_0Quantizer] (FP32 → Q4_0) output is directly
 * consumable by the matmul dispatch, so a loader that only produces dense
 * FP32 weights can quantize them to Q4_0 and still run inference.
 *
 * Each case quantizes a dense weight, runs `ctx.ops.matmul(x, qWeight)`,
 * and checks the result tracks a plain FP32 reference matmul within
 * 4-bit quantization error.
 */
class Q4_0QuantizeRoundTripMatmulTest {

    private val ctx = DirectCpuExecutionContext()

    /**
     * Builds a random `outputDim × inputDim` weight and a random input row
     * from [seed], then asserts that the Q4_0-quantized matmul stays within
     * tolerance of the dense FP32 result for every output.
     *
     * @param inputDim number of input features; must be a positive multiple of 32.
     * @param outputDim number of output features.
     * @param seed seed for the deterministic random weight / input values.
     * @throws IllegalArgumentException if [inputDim] is not a positive multiple of 32.
     */
    @Suppress("UNCHECKED_CAST")
    private fun assertQuantizedTracksDense(inputDim: Int, outputDim: Int, seed: Int) {
        val blockSize = 32
        // Without this guard a ragged inputDim would leave the tail of `flat`
        // zero-filled and silently compare against a scrambled weight layout.
        require(inputDim > 0 && inputDim % blockSize == 0) {
            "inputDim must be a positive multiple of $blockSize; got $inputDim"
        }

        val rng = Random(seed)
        // Logical weight W[o][j] (output o, input j). Weights are drawn before
        // the input vector so the random sequence is stable per seed.
        val w = Array(outputDim) { FloatArray(inputDim) { rng.nextFloat() - 0.5f } }
        val inputV = FloatArray(inputDim) { rng.nextFloat() - 0.5f }

        // Reference: plain FP32 matmul.
        val expected = FloatArray(outputDim)
        for (o in 0 until outputDim) {
            var acc = 0f
            for (j in 0 until inputDim) acc += inputV[j] * w[o][j]
            expected[o] = acc
        }

        // Arrange weights in the packed block order this test feeds to the
        // kernel: block (blockIdx, o) holds the 32 input positions
        // [blockIdx*32 .. +31] for output o. The flat array is then quantized.
        val blocks = inputDim / blockSize
        val flat = FloatArray(inputDim * outputDim)
        var p = 0
        for (blockIdx in 0 until blocks) {
            for (o in 0 until outputDim) {
                for (k in 0 until blockSize) {
                    flat[p++] = w[o][blockIdx * blockSize + k]
                }
            }
        }
        val qData = Q4_0Quantizer.quantize(flat, Shape(inputDim, outputDim))
        val weight: Tensor<FP32, Float> = ctx.fromData(qData as TensorData<FP32, Float>, FP32::class)
        val input = ctx.fromFloatArray<FP32, Float>(Shape(1, inputDim), FP32::class, inputV)

        val out = ctx.ops.matmul(input, weight).data.copyToFloatArray()

        // Q4_0 quantization error per weight is ~step/2 (step ≈ |max|/8 per
        // block). For random-signed terms the accumulated dot-product error is
        // expected to grow roughly like √blocks; the bound used here grows
        // linearly with the block count, so it is deliberately looser than
        // that estimate.
        val tol = 0.1f + 0.1f * blocks
        for (o in 0 until outputDim) {
            val diff = abs(expected[o] - out[o])
            assertTrue(
                diff <= tol,
                "quantized matmul drifted at $o: dense=${expected[o]} q4_0=${out[o]} diff=$diff tol=$tol",
            )
        }
    }

    @Test fun single_output_tracks_dense() =
        assertQuantizedTracksDense(inputDim = 64, outputDim = 1, seed = 1)

    @Test fun attention_proj_shape_tracks_dense() =
        assertQuantizedTracksDense(inputDim = 128, outputDim = 128, seed = 2)
}
6 changes: 6 additions & 0 deletions skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
Original file line number Diff line number Diff line change
Expand Up @@ -3056,6 +3056,12 @@ public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion {
public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
}

public final class sk/ainet/lang/tensor/data/Q4_0Quantizer {
public static final field INSTANCE Lsk/ainet/lang/tensor/data/Q4_0Quantizer;
public final fun quantize ([FLsk/ainet/lang/tensor/Shape;)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
public final fun quantizeToBytes ([F)[B
}

public abstract interface class sk/ainet/lang/tensor/data/Q4_0TensorData : sk/ainet/lang/tensor/data/TensorData {
public static final field BLOCK_SIZE I
public static final field BYTES_PER_BLOCK I
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package sk.ainet.lang.tensor.data

import sk.ainet.lang.tensor.Shape
import kotlin.math.abs

/**
 * FP32 → Q4_0 quantizer — the encode-side counterpart to
 * [Q4_0TensorData]'s decode side.
 *
 * Makes Q4_0 producible from dense FP32, so any source of dense weights
 * (a loader that only carries dense tensors, an in-memory tensor, an
 * offline packing tool) can emit ggml-style Q4_0 blocks without going
 * through GGUF.
 *
 * Block layout (18 bytes per 32 elements): a 2-byte little-endian FP16
 * scale `d`, followed by 16 bytes of packed 4-bit codes where byte `j`
 * holds element `j` in its low nibble and element `j + 16` in its high
 * nibble.
 *
 * Algorithm (per 32-element block, following ggml `quantize_row_q4_0`):
 *  1. Find the element of greatest magnitude `max` (sign preserved).
 *  2. `d = max / -8` so the most-negative code (0 → `-8`) recovers it;
 *     store `d` as FP16, rounded to nearest with ties to even.
 *  3. Each element: `code = clamp((int)(x / d + 8.5), 0, 15)`.
 *
 * Round-trips through [Q4_0TensorData.toFloatArray] within 4-bit
 * quantization error.
 */
public object Q4_0Quantizer {

    private const val BLOCK_SIZE = 32
    private const val BYTES_PER_BLOCK = 18

    /**
     * Quantize [values] into packed Q4_0 bytes.
     *
     * @param values dense FP32 data; its length must be a multiple of 32.
     * @return `18 * (values.size / 32)` bytes, one 18-byte block per 32 inputs.
     * @throws IllegalArgumentException if the length is not a multiple of 32.
     */
    public fun quantizeToBytes(values: FloatArray): ByteArray {
        require(values.size % BLOCK_SIZE == 0) {
            "Q4_0 quantization requires a length that is a multiple of $BLOCK_SIZE; got ${values.size}"
        }
        val blocks = values.size / BLOCK_SIZE
        val out = ByteArray(blocks * BYTES_PER_BLOCK)

        for (b in 0 until blocks) {
            val base = b * BLOCK_SIZE
            // 1. Max-magnitude value, sign preserved. NaN elements never
            //    satisfy `a > amax`, so they cannot become the block max.
            var amax = 0f
            var max = 0f
            for (i in 0 until BLOCK_SIZE) {
                val v = values[base + i]
                val a = abs(v)
                if (a > amax) {
                    amax = a
                    max = v
                }
            }
            val d = max / -8f
            // An all-zero block has d == 0; use id = 0 so every code becomes 8
            // (the zero point) instead of dividing by zero.
            val id = if (d != 0f) 1f / d else 0f

            val outBase = b * BYTES_PER_BLOCK
            // FP16 scale, little-endian.
            val half = floatToHalf(d)
            out[outBase] = (half and 0xFF).toByte()
            out[outBase + 1] = ((half ushr 8) and 0xFF).toByte()

            // 2. Codes, split layout: byte j packs element j (low) and j+16 (high).
            for (j in 0 until 16) {
                val lo = quantCode(values[base + j], id)
                val hi = quantCode(values[base + 16 + j], id)
                out[outBase + 2 + j] = ((hi shl 4) or lo).toByte()
            }
        }
        return out
    }

    /**
     * Quantize [values] into a [Q4_0BlockTensorData] with logical [shape].
     *
     * @param values dense FP32 data; its length must be a multiple of 32.
     * @param shape logical shape; `shape.volume` must equal `values.size`.
     * @throws IllegalArgumentException if the volume and length disagree, or
     *   if the length is not a multiple of 32.
     */
    public fun quantize(values: FloatArray, shape: Shape): Q4_0BlockTensorData {
        require(shape.volume == values.size) {
            "shape volume ${shape.volume} must equal values length ${values.size}"
        }
        return Q4_0BlockTensorData(shape, quantizeToBytes(values))
    }

    /** Maps [x] to its 4-bit code given the inverse scale [id], clamped to `[0, 15]`. */
    private fun quantCode(x: Float, id: Float): Int {
        // ggml: (int)(x * id + 8.5f), clamped to [0, 15].
        val q = (x * id + 8.5f).toInt()
        return q.coerceIn(0, 15)
    }

    /**
     * Converts [value] to IEEE 754 binary16 bits (returned in the low 16 bits),
     * rounding to nearest with ties to even for both normal and subnormal
     * results.
     *
     * Inputs whose magnitude is too large for FP16 — including FP32 infinities
     * and NaNs — map to signed infinity; inputs below half of the smallest
     * FP16 subnormal map to signed zero.
     */
    private fun floatToHalf(value: Float): Int {
        val bits = value.toRawBits()
        val sign = (bits ushr 16) and 0x8000
        // FP32 exponent re-biased for FP16 (bias 127 → bias 15).
        val exp = ((bits ushr 23) and 0xFF) - 127 + 15
        val mant = bits and 0x7FFFFF
        return when {
            // Overflow (and inf/NaN inputs) → ±inf.
            exp >= 0x1F -> sign or 0x7C00
            // Smaller than half the minimum subnormal → ±0.
            exp < -10 -> sign
            // Subnormal result: restore the implicit leading 1 and shift it
            // into place. A round-up to 0x400 lands on the smallest normal.
            exp <= 0 -> sign or shiftRightRoundEven(mant or 0x800000, 14 - exp)
            // Normal result: a mantissa round-up to 0x400 carries into the
            // exponent field, which also yields ±inf correctly at the top.
            else -> sign or ((exp shl 10) + shiftRightRoundEven(mant, 13))
        }
    }

    /** Returns `value ushr shift`, rounded to nearest with ties to even (`shift` ≥ 1). */
    private fun shiftRightRoundEven(value: Int, shift: Int): Int {
        val truncated = value ushr shift
        val remainder = value and ((1 shl shift) - 1)
        val halfway = 1 shl (shift - 1)
        val roundUp = remainder > halfway || (remainder == halfway && (truncated and 1) == 1)
        return if (roundUp) truncated + 1 else truncated
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package sk.ainet.lang.tensor.data

import sk.ainet.lang.tensor.Shape
import kotlin.math.abs
import kotlin.random.Random
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFailsWith
import kotlin.test.assertTrue

/**
 * Unit tests for [Q4_0Quantizer]: output sizing, argument validation, and
 * quantize → dequantize round-trip accuracy.
 */
class Q4_0QuantizerTest {

    /** Two 32-element blocks must pack into exactly two 18-byte blocks. */
    @Test
    fun `quantizeToBytes produces 18 bytes per 32-element block`() {
        val ramp = FloatArray(64) { index -> 0.1f * index }
        val packed = Q4_0Quantizer.quantizeToBytes(ramp)
        assertEquals(2 * 18, packed.size)
    }

    /** A length that is not a multiple of 32 is rejected up front. */
    @Test
    fun `rejects non-block-aligned length`() {
        val ragged = FloatArray(31)
        assertFailsWith<IllegalArgumentException> {
            Q4_0Quantizer.quantizeToBytes(ragged)
        }
    }

    /** Every element must come back within about one quantization step of its block. */
    @Test
    fun `quantize then dequantize round-trips within 4-bit error`() {
        val blockCount = 8
        val total = 32 * blockCount
        val random = Random(7)
        val values = FloatArray(total) { (random.nextFloat() - 0.5f) * 4f }
        val back = Q4_0Quantizer.quantize(values, Shape(total)).toFloatArray()

        // Per block, max-magnitude sets the step ≈ |max| / 8. Allow ~1 step.
        for (block in 0 until blockCount) {
            val first = block * 32

            var peak = 0f
            for (offset in 0 until 32) {
                peak = maxOf(peak, abs(values[first + offset]))
            }
            val step = peak / 8f

            for (offset in 0 until 32) {
                val idx = first + offset
                val diff = abs(values[idx] - back[idx])
                assertTrue(
                    diff <= step + 1e-4f,
                    "round-trip error at $idx: orig=${values[idx]} back=${back[idx]} diff=$diff step=$step",
                )
            }
        }
    }

    /** The dominant (negative) element defines the scale and should decode almost exactly. */
    @Test
    fun `recovers the max-magnitude element closely`() {
        val sparse = FloatArray(32)
        sparse[5] = -3.7f // dominant negative
        sparse[9] = 1.2f
        val decoded = Q4_0Quantizer.quantize(sparse, Shape(32)).toFloatArray()
        // d = max / -8 with max = -3.7 → the dominant element recovers near-exactly.
        assertEquals(-3.7f, decoded[5], 0.05f)
    }

    /** A block of zeros must decode back to zeros. */
    @Test
    fun `all-zero block stays zero`() {
        val zeros = FloatArray(32)
        val decoded = Q4_0Quantizer.quantize(zeros, Shape(32)).toFloatArray()
        for (element in decoded) {
            assertEquals(0f, element, 1e-6f)
        }
    }

    /** A shape whose volume differs from the value count is rejected. */
    @Test
    fun `quantize rejects shape volume mismatch`() {
        val values = FloatArray(32)
        val wrongShape = Shape(64)
        assertFailsWith<IllegalArgumentException> {
            Q4_0Quantizer.quantize(values, wrongShape)
        }
    }
}
Loading