diff --git a/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0QuantizeRoundTripMatmulTest.kt b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0QuantizeRoundTripMatmulTest.kt
new file mode 100644
index 00000000..618faca2
--- /dev/null
+++ b/skainet-backends/skainet-backend-cpu/src/jvmTest/kotlin/sk/ainet/exec/tensor/ops/Q4_0QuantizeRoundTripMatmulTest.kt
@@ -0,0 +1,92 @@
+package sk.ainet.exec.tensor.ops
+
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertTrue
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.data.Q4_0Quantizer
+import sk.ainet.lang.tensor.data.TensorData
+import sk.ainet.lang.types.FP32
+
+/**
+ * End-to-end proof that the [Q4_0Quantizer] (FP32 → Q4_0) output is
+ * directly consumable by the matmul dispatch — i.e. *any* loader that
+ * produces dense FP32 weights can quantize them to Q4_0 and run
+ * inference through the same kernel path GGUF Q4_0 weights use.
+ *
+ * Quantizes a dense weight, runs `ctx.ops.matmul(x, qWeight)`, and
+ * checks it tracks the dense FP32 matmul within 4-bit error.
+ */
+class Q4_0QuantizeRoundTripMatmulTest {
+
+    private val ctx = DirectCpuExecutionContext()
+
+    /**
+     * Fills an `outputDim × inputDim` weight and a length-`inputDim` input with
+     * values in `[-0.5, 0.5)` drawn from `Random(seed)`, quantizes the weight to
+     * Q4_0 and asserts every matmul output stays within tolerance of the dense
+     * FP32 reference.
+     *
+     * @throws IllegalArgumentException if [inputDim] is not a multiple of 32.
+     */
+    @Suppress("UNCHECKED_CAST")
+    private fun assertQuantizedTracksDense(inputDim: Int, outputDim: Int, seed: Int) {
+        require(inputDim % 32 == 0) { "inputDim must be a multiple of 32; got $inputDim" }
+        val rng = Random(seed)
+        // Logical weight W[o][j] (output o, input j).
+        val w = Array(outputDim) { FloatArray(inputDim) { rng.nextFloat() - 0.5f } }
+        val inputV = FloatArray(inputDim) { rng.nextFloat() - 0.5f }
+
+        // Reference: plain FP32 matmul.
+        val expected = FloatArray(outputDim)
+        for (o in 0 until outputDim) {
+            var acc = 0f
+            for (j in 0 until inputDim) acc += inputV[j] * w[o][j]
+            expected[o] = acc
+        }
+
+        // Arrange weights in the kernel's packed block order — block
+        // (blockIdx, o) holds the 32 input positions [blockIdx*32 .. +31]
+        // for output o — then quantize that flat array. This is the layout
+        // a loader producing Q4_0 matmul weights must emit.
+        val blocks = inputDim / 32
+        val flat = FloatArray(inputDim * outputDim)
+        var p = 0
+        for (blockIdx in 0 until blocks) {
+            for (o in 0 until outputDim) {
+                for (k in 0 until 32) {
+                    flat[p++] = w[o][blockIdx * 32 + k]
+                }
+            }
+        }
+        val qData = Q4_0Quantizer.quantize(flat, Shape(inputDim, outputDim))
+        // NOTE(review): Tensor and TensorData appear here without type arguments; the
+        // unchecked cast suggests parameterized types — confirm against their declarations.
+        val weight: Tensor = ctx.fromData(qData as TensorData, FP32::class)
+        val input = ctx.fromFloatArray(Shape(1, inputDim), FP32::class, inputV)
+
+        val out = ctx.ops.matmul(input, weight).data.copyToFloatArray()
+
+        // Q4_0 quantization error per weight is ~step/2 (step ≈ |max|/8 per
+        // block); the dot-product error over `inputDim` random-signed terms
+        // is expected to grow ~√blocks. The tolerance below is looser than
+        // that estimate: it grows linearly with the block count.
+        val tol = 0.1f + 0.1f * blocks.coerceAtLeast(1)
+        for (o in 0 until outputDim) {
+            val diff = abs(expected[o] - out[o])
+            assertTrue(
+                diff <= tol,
+                "quantized matmul drifted at $o: dense=${expected[o]} q4_0=${out[o]} diff=$diff tol=$tol",
+            )
+        }
+    }
+
+    @Test fun single_output_tracks_dense() =
+        assertQuantizedTracksDense(inputDim = 64, outputDim = 1, seed = 1)
+
+    @Test fun attention_proj_shape_tracks_dense() =
+        assertQuantizedTracksDense(inputDim = 128, outputDim = 128, seed = 2)
+}
diff --git a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
index 1805010a..ad66191e 100644
--- a/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
+++ b/skainet-lang/skainet-lang-core/api/jvm/skainet-lang-core.api
@@ -3056,6 +3056,12 @@ public final class sk/ainet/lang/tensor/data/Q4_0BlockTensorData$Companion {
 	public final fun fromRawBytes (Lsk/ainet/lang/tensor/Shape;[B)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
 }
 
+public final class sk/ainet/lang/tensor/data/Q4_0Quantizer {
+	public static final field INSTANCE Lsk/ainet/lang/tensor/data/Q4_0Quantizer;
+	public final fun quantize ([FLsk/ainet/lang/tensor/Shape;)Lsk/ainet/lang/tensor/data/Q4_0BlockTensorData;
+	public final fun quantizeToBytes ([F)[B
+}
+
 public abstract interface class sk/ainet/lang/tensor/data/Q4_0TensorData : sk/ainet/lang/tensor/data/TensorData {
 	public static final field BLOCK_SIZE I
 	public static final field BYTES_PER_BLOCK I
diff --git a/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0Quantizer.kt b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0Quantizer.kt
new file mode 100644
index 00000000..17126ef9
--- /dev/null
+++ b/skainet-lang/skainet-lang-core/src/commonMain/kotlin/sk/ainet/lang/tensor/data/Q4_0Quantizer.kt
@@ -0,0 +1,119 @@
+package sk.ainet.lang.tensor.data
+
+import sk.ainet.lang.tensor.Shape
+import kotlin.math.abs
+
+/**
+ * FP32 → Q4_0 quantizer — the loader-agnostic counterpart to
+ * [Q4_0TensorData]'s decode side.
+ *
+ * Q4_0 was decode-only until now (GGUF files arrive pre-quantized).
+ * This makes Q4_0 *producible* from dense FP32 in pure `commonMain`, so
+ * any source — a SafeTensors / JSON loader that only carries dense
+ * weights, an in-memory tensor, an offline packing tool — can emit
+ * canonical ggml Q4_0 blocks without going through GGUF.
+ *
+ * Algorithm (per 32-element block, matching ggml `quantize_row_q4_0`):
+ *  1. Find the element of greatest magnitude `max` (sign preserved).
+ *  2. `d = max / -8` so the most-negative code (0 → `-8`) recovers it;
+ *     store `d` as the block's FP16 scale (rounded to nearest, ties to even).
+ *  3. Each element: `code = clamp(round(x / d + 8), 0, 15)`, packed in
+ *     the canonical split layout (low nibbles → elements 0..15, high →
+ *     16..31).
+ *
+ * Round-trips through [Q4_0TensorData.toFloatArray] within 4-bit
+ * quantization error.
+ */
+public object Q4_0Quantizer {
+
+    private const val BLOCK_SIZE = 32
+    private const val BYTES_PER_BLOCK = 18
+
+    /**
+     * Quantize [values] (length must be a multiple of 32) into packed
+     * Q4_0 bytes — `18 * (values.size / 32)` bytes.
+     */
+    public fun quantizeToBytes(values: FloatArray): ByteArray {
+        require(values.size % BLOCK_SIZE == 0) {
+            "Q4_0 quantization requires a length that is a multiple of $BLOCK_SIZE; got ${values.size}"
+        }
+        val blocks = values.size / BLOCK_SIZE
+        val out = ByteArray(blocks * BYTES_PER_BLOCK)
+
+        for (b in 0 until blocks) {
+            val base = b * BLOCK_SIZE
+            // 1. Max-magnitude value, sign preserved.
+            var amax = 0f
+            var max = 0f
+            for (i in 0 until BLOCK_SIZE) {
+                val v = values[base + i]
+                val a = abs(v)
+                if (a > amax) {
+                    amax = a
+                    max = v
+                }
+            }
+            val d = max / -8f
+            val id = if (d != 0f) 1f / d else 0f
+
+            val outBase = b * BYTES_PER_BLOCK
+            // FP16 scale, little-endian.
+            val half = floatToHalf(d)
+            out[outBase] = (half and 0xFF).toByte()
+            out[outBase + 1] = ((half ushr 8) and 0xFF).toByte()
+
+            // 2. Codes, split layout: byte j packs element j (low) and j+16 (high).
+            for (j in 0 until 16) {
+                val lo = quantCode(values[base + j], id)
+                val hi = quantCode(values[base + 16 + j], id)
+                out[outBase + 2 + j] = ((hi shl 4) or lo).toByte()
+            }
+        }
+        return out
+    }
+
+    /**
+     * Quantize [values] into a [Q4_0BlockTensorData] with logical
+     * [shape] (`shape.volume` must equal `values.size` and be a
+     * multiple of 32).
+     */
+    public fun quantize(values: FloatArray, shape: Shape): Q4_0BlockTensorData {
+        require(shape.volume == values.size) {
+            "shape volume ${shape.volume} must equal values length ${values.size}"
+        }
+        return Q4_0BlockTensorData(shape, quantizeToBytes(values))
+    }
+
+    private fun quantCode(x: Float, id: Float): Int {
+        // ggml: (int)(x * id + 8.5f), clamped to [0, 15].
+        val q = (x * id + 8.5f).toInt()
+        return if (q < 0) 0 else if (q > 15) 15 else q
+    }
+
+    /** Round-to-nearest-even FP32 → FP16 bits; NaN stays NaN, overflow → ±inf. */
+    private fun floatToHalf(value: Float): Int {
+        val bits = value.toRawBits()
+        val sign = (bits ushr 16) and 0x8000
+        val rawExp = (bits ushr 23) and 0xFF
+        val exp = rawExp - 127 + 15 // rebias the FP32 exponent (127) to FP16 (15)
+        val mant = bits and 0x7FFFFF
+        return when {
+            rawExp == 0xFF && mant != 0 -> sign or 0x7E00 // NaN → quiet NaN, not ±inf
+            exp >= 0x1F -> sign or 0x7C00 // overflow → ±inf
+            exp < -10 -> sign // below half the smallest FP16 subnormal → ±0
+            else -> {
+                // Normal (exp > 0): drop the low 13 mantissa bits. Subnormal
+                // (exp <= 0): restore the implicit leading 1, drop 14 - exp bits.
+                val normal = exp > 0
+                val full = if (normal) mant else mant or 0x800000
+                val shift = if (normal) 13 else 14 - exp
+                val kept = (if (normal) exp shl 10 else 0) or (full ushr shift)
+                val rem = full and ((1 shl shift) - 1)
+                val tie = 1 shl (shift - 1)
+                // Ties to even; a mantissa carry correctly bumps the exponent.
+                val roundUp = rem > tie || (rem == tie && (kept and 1) == 1)
+                sign or (kept + (if (roundUp) 1 else 0))
+            }
+        }
+    }
+}
diff --git a/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0QuantizerTest.kt b/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0QuantizerTest.kt
new file mode 100644
index 00000000..457bbdea
--- /dev/null
+++ b/skainet-lang/skainet-lang-core/src/commonTest/kotlin/sk/ainet/lang/tensor/data/Q4_0QuantizerTest.kt
@@ -0,0 +1,82 @@
+package sk.ainet.lang.tensor.data
+
+import sk.ainet.lang.tensor.Shape
+import kotlin.math.abs
+import kotlin.random.Random
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFailsWith
+import kotlin.test.assertTrue
+
+class Q4_0QuantizerTest {
+
+    @Test
+    fun `quantizeToBytes produces 18 bytes per 32-element block`() {
+        val bytes = Q4_0Quantizer.quantizeToBytes(FloatArray(64) { 0.1f * it })
+        assertEquals(2 * 18, bytes.size)
+    }
+
+    @Test
+    fun `rejects non-block-aligned length`() {
+        assertFailsWith<IllegalArgumentException> {
+            Q4_0Quantizer.quantizeToBytes(FloatArray(31))
+        }
+    }
+
+    @Test
+    fun `quantize then dequantize round-trips within 4-bit error`() {
+        val rng = Random(7)
+        val n = 32 * 8
+        val values = FloatArray(n) { (rng.nextFloat() - 0.5f) * 4f }
+        val q = Q4_0Quantizer.quantize(values, Shape(n))
+        val back = q.toFloatArray()
+
+        // Per block, max-magnitude sets the step ≈ |max| / 8. Allow ~1 step.
+        for (b in 0 until n / 32) {
+            var amax = 0f
+            for (i in 0 until 32) amax = maxOf(amax, abs(values[b * 32 + i]))
+            val step = amax / 8f
+            for (i in 0 until 32) {
+                val idx = b * 32 + i
+                val diff = abs(values[idx] - back[idx])
+                assertTrue(
+                    diff <= step + 1e-4f,
+                    "round-trip error at $idx: orig=${values[idx]} back=${back[idx]} diff=$diff step=$step",
+                )
+            }
+        }
+    }
+
+    @Test
+    fun `recovers the max-magnitude element closely`() {
+        val values = FloatArray(32) { 0f }
+        values[5] = -3.7f // dominant negative
+        values[9] = 1.2f
+        val back = Q4_0Quantizer.quantize(values, Shape(32)).toFloatArray()
+        // d = max / -8 with max = -3.7 → the dominant element recovers near-exactly.
+        assertEquals(-3.7f, back[5], 0.05f)
+    }
+
+    @Test
+    fun `all-zero block stays zero`() {
+        val back = Q4_0Quantizer.quantize(FloatArray(32), Shape(32)).toFloatArray()
+        for (v in back) assertEquals(0f, v, 1e-6f)
+    }
+
+    @Test
+    fun `scale is stored with round-to-nearest-even FP16 rounding`() {
+        val values = FloatArray(32)
+        // d = -8.00390625 / -8 = 1 + 2^-11, exactly halfway between FP16 0x3C00 and 0x3C01.
+        values[0] = -8.00390625f
+        val bytes = Q4_0Quantizer.quantizeToBytes(values)
+        assertEquals(0x00.toByte(), bytes[0])
+        assertEquals(0x3C.toByte(), bytes[1])
+    }
+
+    @Test
+    fun `quantize rejects shape volume mismatch`() {
+        assertFailsWith<IllegalArgumentException> {
+            Q4_0Quantizer.quantize(FloatArray(32), Shape(64))
+        }
+    }
+}