diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 66e7fb68..5aa078ed 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -1,5 +1,5 @@ [versions] -skainet = "0.28.1" +skainet = "0.30.0" agp = "9.2.1" jacksonDatabind = "2.22.0" jsonSchemaValidator = "3.0.3" diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt index f73b3ac8..abc8c3e3 100644 --- a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt +++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt @@ -122,7 +122,7 @@ public class GemmaNetworkLoader @PublishedApi internal constructor( public suspend inline fun load( ctx: ExecutionContext ): Module { - val weights: Gemma4Weights = when (val wp = weightsProvider) { + val rawWeights: Gemma4Weights = when (val wp = weightsProvider) { is WeightsProvider.GgufSource -> { val loader = Gemma4WeightLoader(wp.sourceProvider, quantPolicy = wp.quantPolicy) loader.loadToMap(ctx) @@ -142,6 +142,24 @@ public class GemmaNetworkLoader @PublishedApi internal constructor( } } + // NATIVE_OPTIMIZED yields raw-byte quant tensors the network mapper can't + // consume directly. Pack them (heap Q4/5/6_K + FP32 fallback) here — this + // is commonMain so it works on Kotlin/Native (the board) as well as the + // JVM, and replaces the JVM-only `convertGemmaWeightsToMemSeg` for the + // `load()` entry point. 
+ val ggufPolicy = when (val wp = weightsProvider) { + is WeightsProvider.GgufSource -> wp.quantPolicy + is WeightsProvider.GgufRandomAccess -> wp.quantPolicy + else -> null + } + val weights: Gemma4Weights = + if (ggufPolicy == QuantPolicy.NATIVE_OPTIMIZED) { + @Suppress("UNCHECKED_CAST") + convertGemmaWeightsPacked(rawWeights, ctx) as Gemma4Weights + } else { + rawWeights + } + return applyWeightsToNetwork(ctx, weights) } diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt new file mode 100644 index 00000000..ec52eb4c --- /dev/null +++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt @@ -0,0 +1,125 @@ +package sk.ainet.models.gemma + +import sk.ainet.context.ExecutionContext +import sk.ainet.io.gguf.GGMLQuantizationType +import sk.ainet.io.gguf.dequant.DequantOps +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.Tensor +import sk.ainet.lang.tensor.data.IntArrayTensorData +import sk.ainet.lang.tensor.data.TensorData +import sk.ainet.lang.types.DType +import sk.ainet.lang.types.FP32 + +/** + * commonMain (Kotlin/Native-capable) analogue of the jvmMain + * `convertGemmaWeightsToMemSeg`. Converts the raw-byte quantized tensors a + * `NATIVE_OPTIMIZED` load produces into the forms the DSL matmul path consumes: + * + * - **Q4_K / Q5_K / Q6_K matmul weights** → heap-packed `Q{4,5,6}_KBlockTensorData` + * (via [packGemmaKQuant], with the row-major→block-major relayout). These keep + * the GGUF footprint and run the in-kernel dequant matmul (NEON on the board). + * - **token_embd / output** → FP32 dequant in canonical `[vocab, embed]` order + * (the embedding is gathered, not matmul'd, so no transpose). + * - **everything else quantized** → FP32 dequant transposed to `[out, in]` + * row-major so `linearProject` (`x @ W.t()`) is correct. 
+ * + * Unlike the MemSeg converter this uses no `java.lang.foreign` — it runs on the + * SL2610 board binary (Kotlin/Native) as well as the JVM. The JVM still prefers + * the MemSeg path (lazy transpose + Q4/Q8 MemSeg); this is the board path. + */ +public fun convertGemmaWeightsPacked( + weights: Gemma4Weights<*, *>, + ctx: ExecutionContext, +): Gemma4Weights<*, *> { + @Suppress("UNCHECKED_CAST") + val typed = weights as Gemma4Weights + val quantTypes = typed.quantTypes + if (quantTypes.isEmpty()) return weights + + val logicalShapes = typed.logicalShapes + val newTensors = linkedMapOf>() + for ((name, tensor) in typed.tensors) { + val qt = quantTypes[name] + newTensors[name] = when { + qt == null -> tensor // not quantized + else -> { + val shape = logicalShapes[name] ?: logicalShapeFor(name, typed.metadata) + if (shape == null) { + tensor // unknown 2-D layout — leave as-is + } else { + val bytes = extractRawBytes(tensor.data) + val isEmbed = name == Gemma4TensorNames.TOKEN_EMBEDDINGS || + name == Gemma4TensorNames.OUTPUT_WEIGHT + val packed = if (!isEmbed) packGemmaKQuant(bytes, qt, shape) else null + when { + packed != null -> { + @Suppress("UNCHECKED_CAST") + ctx.fromData(packed as TensorData, FP32::class) as Tensor + } + isEmbed -> dequantNoTranspose(bytes, qt, shape, ctx) + else -> dequantTransposed(bytes, qt, shape, ctx) + } + } + } + } + } + @Suppress("UNCHECKED_CAST") + return Gemma4Weights(typed.metadata, newTensors, typed.quantTypes, typed.logicalShapes) as Gemma4Weights<*, *> +} + +/** Dequant to FP32 in natural `[rows, cols]` order (embeddings — gathered, not matmul'd). */ +@Suppress("UNCHECKED_CAST") +private fun dequantNoTranspose( + bytes: ByteArray, + qt: GGMLQuantizationType, + shape: Shape, + ctx: ExecutionContext, +): Tensor { + val floats = DequantOps.dequantFromBytes(bytes, qt, shape.volume) + return ctx.fromFloatArray(shape, FP32::class, floats) as Tensor +} + +/** + * Dequant to a canonical FP32 `[out, in]` row-major weight. 
GGUF stores K/legacy + * blocks column-major within a row, so the dequantized floats are transposed + * column-major → row-major to match what `linearProject` (`x @ W.t()`) expects. + */ +@Suppress("UNCHECKED_CAST") +private fun dequantTransposed( + bytes: ByteArray, + qt: GGMLQuantizationType, + shape: Shape, + ctx: ExecutionContext, +): Tensor { + val floats = DequantOps.dequantFromBytes(bytes, qt, shape.volume) + val out = shape[0] + val inDim = shape[1] + val rowMajor = DequantOps.transposeColumnMajorToRowMajor(floats, inDim, out) + return ctx.fromFloatArray(shape, FP32::class, rowMajor) as Tensor +} + +/** + * Read the raw packed bytes back from a `NATIVE_OPTIMIZED` quant tensor. The + * backing differs by platform/factory — JVM stores `IntArrayTensorData` (byte + * values widened to Int); Kotlin/Native stores a Byte-typed tensor — so handle + * both element types. + */ +internal fun extractRawBytes(data: TensorData<*, *>): ByteArray { + if (data is IntArrayTensorData<*>) { + val buf = data.buffer + return ByteArray(buf.size) { buf[it].toByte() } + } + val n = data.shape.volume + @Suppress("UNCHECKED_CAST") + val d = data as TensorData<*, Any?> + return ByteArray(n) { + when (val v = d[it]) { + is Byte -> v + is Int -> v.toByte() + else -> error( + "convertGemmaWeightsPacked: cannot read bytes from ${data::class.simpleName} " + + "(element ${v?.let { e -> e::class.simpleName }})", + ) + } + } +} diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt new file mode 100644 index 00000000..7f4e7b9f --- /dev/null +++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt @@ -0,0 +1,121 @@ +package sk.ainet.models.gemma + +import sk.ainet.io.gguf.GGMLQuantizationType +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData 
+import sk.ainet.lang.tensor.data.Q6_KBlockTensorData +import sk.ainet.lang.tensor.data.TensorData +import sk.ainet.lang.types.DType + +/** + * Platform-neutral (commonMain) layout helpers for Gemma 4 quantized weights. + * + * These were previously JVM-only (inside `GemmaMemSegConverter`), but the + * Kotlin/Native board path needs the same logic: on K/N there is no + * `java.lang.foreign` MemSeg conversion, so the eager runtime keeps K-quant + * weights as heap-packed `Q{4,5,6}_KBlockTensorData` produced here. The JVM + * MemSeg converter reuses the same relayout + shape recovery. + */ + +/** + * Recover the logical 2-D shape of a Gemma 4 weight tensor from its GGUF name + * and model metadata. `Gemma4WeightLoader` with `NATIVE_OPTIMIZED` stores + * quantized tensors as 1-D byte arrays, so converters need the original + * `[rows, cols]` shape to re-layout blocks. Returns `null` for names with no + * known 2-D layout (e.g. norms); TOKEN_EMBEDDINGS / OUTPUT_WEIGHT map to `[vocab, embed]`. + */ +internal fun logicalShapeFor(name: String, metadata: Gemma4ModelMetadata): Shape?
{ + val embed = metadata.embeddingLength + val vocab = metadata.vocabSize + return when { + name == Gemma4TensorNames.TOKEN_EMBEDDINGS -> Shape(vocab, embed) + name == Gemma4TensorNames.OUTPUT_WEIGHT -> Shape(vocab, embed) + name.startsWith("blk.") -> { + val rest = name.substringAfter("blk.") + val layer = rest.substringBefore('.').toIntOrNull() ?: return null + val headDim = metadata.getHeadDim(layer) + val qDim = metadata.headCount * headDim + val kvDim = metadata.kvHeadCount * headDim + val ffn = metadata.intermediateSize + when { + name.endsWith(".attn_q.weight") -> Shape(qDim, embed) + name.endsWith(".attn_k.weight") -> Shape(kvDim, embed) + name.endsWith(".attn_v.weight") -> Shape(kvDim, embed) + name.endsWith(".attn_output.weight") -> Shape(embed, qDim) + name.endsWith(".ffn_gate.weight") -> Shape(ffn, embed) + name.endsWith(".ffn_up.weight") -> Shape(ffn, embed) + name.endsWith(".ffn_down.weight") -> Shape(embed, ffn) + else -> null + } + } + else -> null + } +} + +/** + * Re-layout GGUF K-series bytes from row-major block order + * (`(r * blocksPerRow + b) * bytesPerBlock`) to the input-block-major order the + * `matmulQ{K}` kernels expect (`(b * outDim + r) * bytesPerBlock`). For a + * `[outDim, inDim]` weight with `inDim % 256 == 0`, this is a block-level 2-D + * transpose; bytes inside a block are untouched. + * + * @param bytesPerBlock 144 (Q4_K), 176 (Q5_K), 210 (Q6_K). 
+ */ +internal fun relayoutKSeriesRowMajorToBlockMajor( + bytes: ByteArray, + shape: Shape, + bytesPerBlock: Int, +): ByteArray { + val blockSize = 256 + require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" } + val outDim = shape[0] + val inDim = shape[1] + require(inDim % blockSize == 0) { "K-series weight inDim ($inDim) must be a multiple of $blockSize" } + val blocksPerRow = inDim / blockSize + val expected = outDim.toLong() * blocksPerRow.toLong() * bytesPerBlock.toLong() + require(bytes.size.toLong() >= expected) { + "K-series byte buffer ${bytes.size} < expected $expected for [$outDim, $inDim] @ ${bytesPerBlock}B/block" + } + val out = ByteArray(bytes.size) + for (r in 0 until outDim) { + for (b in 0 until blocksPerRow) { + val srcOff = (r * blocksPerRow + b) * bytesPerBlock + val dstOff = (b * outDim + r) * bytesPerBlock + bytes.copyInto(out, dstOff, srcOff, srcOff + bytesPerBlock) + } + } + return out +} + +/** Bytes per ggml block for the K-quant types this packer handles. */ +private fun kQuantBytesPerBlock(qt: GGMLQuantizationType): Int? = when (qt) { + GGMLQuantizationType.Q4_K -> 144 + GGMLQuantizationType.Q5_K -> 176 + GGMLQuantizationType.Q6_K -> 210 + else -> null +} + +/** + * Pack raw GGUF K-quant `bytes` of logical `[out, in]` shape into the + * heap-packed block tensor data the matmul kernels read directly (Q4_K / Q5_K / + * Q6_K). Performs the row-major → block-major relayout. Returns `null` for + * non-K-quant types (caller dequantizes those to FP32). + * + * commonMain → works on JVM and Kotlin/Native alike (no MemSeg / Arena). + */ +internal fun packGemmaKQuant( + bytes: ByteArray, + qt: GGMLQuantizationType, + shape: Shape, +): TensorData? 
{ + val bpb = kQuantBytesPerBlock(qt) ?: return null + val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb) + @Suppress("UNCHECKED_CAST") + return when (qt) { + GGMLQuantizationType.Q4_K -> Q4_KBlockTensorData(shape, relaid) as TensorData + GGMLQuantizationType.Q5_K -> Q5_KBlockTensorData(shape, relaid) as TensorData + GGMLQuantizationType.Q6_K -> Q6_KBlockTensorData(shape, relaid) as TensorData + else -> null + } +} diff --git a/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt new file mode 100644 index 00000000..52a1cdd1 --- /dev/null +++ b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt @@ -0,0 +1,73 @@ +package sk.ainet.models.gemma + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNull +import kotlin.test.assertTrue +import sk.ainet.context.DirectCpuExecutionContext +import sk.ainet.io.gguf.GGMLQuantizationType +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData +import sk.ainet.lang.types.FP32 +import sk.ainet.lang.types.Int8 + +/** + * Unit tests for the commonMain (board-shareable) Gemma quant layout helpers. + * These run on every target (JVM + Kotlin/Native), proving the K/N board path's + * relayout + packing logic without needing the full model. + */ +class GemmaQuantLayoutTest { + + @Test + fun relayout_is_block_level_transpose() { + // [outDim=2, inDim=512] -> blocksPerRow=2, 4 Q5_K blocks of 176 B. + val bpb = 176 + val outDim = 2 + val inDim = 512 + val blocksPerRow = inDim / 256 + val bytes = ByteArray(outDim * blocksPerRow * bpb) + // Tag each source block with its row-major index in its first byte. 
+ for (i in 0 until outDim * blocksPerRow) bytes[i * bpb] = i.toByte() + + val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, Shape(outDim, inDim), bpb) + + // dst block (b*outDim + r) must hold src block (r*blocksPerRow + b). + for (r in 0 until outDim) { + for (b in 0 until blocksPerRow) { + val srcIdx = r * blocksPerRow + b + val dstIdx = b * outDim + r + assertEquals(srcIdx.toByte(), relaid[dstIdx * bpb], "block ($r,$b) misplaced") + } + } + } + + @Test + fun pack_q5k_produces_block_tensor_with_relaid_bytes() { + val shape = Shape(2, 512) + val bytes = ByteArray(2 * 2 * 176) + for (i in 0 until 4) bytes[i * 176] = (i + 1).toByte() + + val td = packGemmaKQuant(bytes, GGMLQuantizationType.Q5_K, shape) + assertTrue(td is Q5_KBlockTensorData, "Q5_K should pack to Q5_KBlockTensorData") + // packedData is the block-major relayout of the input. + val expected = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 176) + assertTrue(expected.contentEquals(td.packedData)) + } + + @Test + fun pack_non_kquant_returns_null() { + assertNull(packGemmaKQuant(ByteArray(34), GGMLQuantizationType.Q8_0, Shape(1, 32))) + } + + @Test + fun extract_raw_bytes_roundtrips_on_every_platform() { + // The NATIVE_OPTIMIZED loader wraps quant bytes via ctx.fromByteArray; + // extractRawBytes must read them back regardless of the platform backing + // (JVM IntArrayTensorData vs native Byte-typed). Runs on jvm + linuxX64. 
+ val ctx = DirectCpuExecutionContext.create() + val bytes = ByteArray(176 * 3) { ((it * 31 + 7) and 0xFF).toByte() } + val t = ctx.fromByteArray(Shape(bytes.size), Int8::class, bytes) + val got = extractRawBytes(t.data) + assertTrue(bytes.contentEquals(got), "extractRawBytes round-trip mismatch") + } +} diff --git a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt index d3a4502f..191f2510 100644 --- a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt +++ b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt @@ -8,6 +8,7 @@ import sk.ainet.lang.tensor.Shape import sk.ainet.lang.tensor.Tensor import sk.ainet.lang.tensor.data.IntArrayTensorData import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData import sk.ainet.lang.tensor.data.Q6_KBlockTensorData import sk.ainet.lang.tensor.data.Q4MemorySegmentTensorData import sk.ainet.lang.tensor.data.Q8MemorySegmentTensorData @@ -15,44 +16,9 @@ import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.types.DType import sk.ainet.lang.types.FP32 -/** - * Recover the logical 2-D shape of a Gemma 4 weight tensor from its GGUF - * name and the model metadata. `Gemma4WeightLoader` with - * `NATIVE_OPTIMIZED` stores quantized tensors as 1-D byte arrays so the - * tensor-data factory accepts them; the converter needs the original - * shape to re-layout blocks and construct `Q4_KBlockTensorData` / - * `Q4/Q8MemorySegmentTensorData`. - * - * Returns `null` for tensors that don't have a 2-D matmul layout (norms, - * embeddings the converter wants to dequant anyway). - */ -internal fun logicalShapeFor(name: String, metadata: Gemma4ModelMetadata): Shape? 
{ - val embed = metadata.embeddingLength - val vocab = metadata.vocabSize - return when { - name == Gemma4TensorNames.TOKEN_EMBEDDINGS -> Shape(vocab, embed) - name == Gemma4TensorNames.OUTPUT_WEIGHT -> Shape(vocab, embed) - name.startsWith("blk.") -> { - val rest = name.substringAfter("blk.") - val layer = rest.substringBefore('.').toIntOrNull() ?: return null - val headDim = metadata.getHeadDim(layer) - val qDim = metadata.headCount * headDim - val kvDim = metadata.kvHeadCount * headDim - val ffn = metadata.intermediateSize - when { - name.endsWith(".attn_q.weight") -> Shape(qDim, embed) - name.endsWith(".attn_k.weight") -> Shape(kvDim, embed) - name.endsWith(".attn_v.weight") -> Shape(kvDim, embed) - name.endsWith(".attn_output.weight") -> Shape(embed, qDim) - name.endsWith(".ffn_gate.weight") -> Shape(ffn, embed) - name.endsWith(".ffn_up.weight") -> Shape(ffn, embed) - name.endsWith(".ffn_down.weight") -> Shape(embed, ffn) - else -> null - } - } - else -> null - } -} +// logicalShapeFor + relayoutKSeriesRowMajorToBlockMajor moved to commonMain +// (GemmaQuantLayout.kt) so the Kotlin/Native board path shares them. This +// JVM-only file keeps the MemSeg (FFM) conversion + the FP32 dequant fallbacks. /** * Convert raw-byte quantized tensors in a [Gemma4Weights] map (produced by @@ -197,8 +163,14 @@ private fun convertOne( ctx.fromData(data as TensorData, advertisedDtype) as Tensor } GGMLQuantizationType.Q5_K -> { - // No native matmul kernel yet for Q5_K. Fall back to a correct FP32 dequant. - dequantPackedToFp32(bytes, qt, shape, ctx) + // Same packed-path treatment as Q4_K/Q6_K, enabled by the Q5_K + // matmul kernel (scalar/Panama/native) + the lazy Q5_K transpose + // in DefaultCpuOps. FunctionGemma-270M Q5_K_M ships most attn/FFN + // weights as Q5_K, so keeping them packed (176 B/block) avoids the + // FP32 inflation and runs the in-kernel dequant matmul. 
+ val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 176) + val data = Q5_KBlockTensorData.fromRawBytes(shape, relaid) + ctx.fromData(data as TensorData, advertisedDtype) as Tensor } else -> { // Any other quant type without a packed SIMD kernel (Q5_0/Q5_1/Q4_1/Q2_K/…) @@ -280,53 +252,9 @@ private fun dequantToFloat( } /** - * Re-layout GGUF K-series bytes from row-major block order (block at row r, - * block index b within row → byte offset `(r * blocksPerRow + b) * bytesPerBlock`) - * to the input-block-major layout the `matmulQ{K}_Vec` kernels expect - * (block at blockIdx bI for output row r → byte offset - * `(bI * outDim + r) * bytesPerBlock`). - * - * For a weight of shape `[outDim, inDim]` with `inDim % 256 == 0` (the - * K-series block size), this is just a 2D block-level transpose of the - * `[outDim, inDim/256]` array of `bytesPerBlock`-byte blocks. Bytes - * inside a block are untouched. - * - * @param bytes packed weight bytes in row-major [outDim, blocksPerRow] order - * @param shape logical `[outDim, inDim]` shape - * @param bytesPerBlock 144 for Q4_K, 210 for Q6_K (ggml block sizes) - */ -internal fun relayoutKSeriesRowMajorToBlockMajor( - bytes: ByteArray, - shape: sk.ainet.lang.tensor.Shape, - bytesPerBlock: Int -): ByteArray { - val blockSize = 256 - require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" } - val outDim = shape[0] - val inDim = shape[1] - require(inDim % blockSize == 0) { - "K-series weight inDim ($inDim) must be a multiple of $blockSize" - } - val blocksPerRow = inDim / blockSize - val expected = outDim.toLong() * blocksPerRow.toLong() * bytesPerBlock.toLong() - require(bytes.size.toLong() >= expected) { - "K-series byte buffer size ${bytes.size} < expected $expected for shape [$outDim, $inDim] @ ${bytesPerBlock}B/block" - } - val out = ByteArray(bytes.size) - for (r in 0 until outDim) { - for (b in 0 until blocksPerRow) { - val srcOff = (r * blocksPerRow + b) * bytesPerBlock - val dstOff = (b 
* outDim + r) * bytesPerBlock - System.arraycopy(bytes, srcOff, out, dstOff, bytesPerBlock) - } - } - return out -} - -/** - * Back-compat shim that delegates to [relayoutKSeriesRowMajorToBlockMajor] - * at Q4_K's 144-byte block size. Kept for any callers outside this file - * pinned to the old name. + * Back-compat shim that delegates to the commonMain + * [relayoutKSeriesRowMajorToBlockMajor] at Q4_K's 144-byte block size. Kept for + * any callers outside this file pinned to the old name. */ internal fun relayoutQ4_KRowMajorToBlockMajor(bytes: ByteArray, shape: sk.ainet.lang.tensor.Shape): ByteArray = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 144) diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt index 406197c6..3f938609 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt @@ -31,7 +31,7 @@ import kotlin.test.assertEquals */ @Tag("integration") class GemmaBehavioralAbTest { - private val gguf = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + private val gguf = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" private fun argmax(a: FloatArray): Int { var bi = 0; var bv = a[0] diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt new file mode 100644 index 00000000..1d4a7ad4 --- /dev/null +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt @@ -0,0 +1,142 @@ +package sk.ainet.models.gemma + +import java.io.File +import java.lang.foreign.Arena +import kotlinx.coroutines.runBlocking +import 
kotlinx.io.buffered +import kotlinx.io.files.Path +import kotlinx.io.files.SystemFileSystem +import org.junit.jupiter.api.Assumptions +import org.junit.jupiter.api.Tag +import sk.ainet.apps.llm.OptimizedLLMMode +import sk.ainet.apps.llm.OptimizedLLMRuntime +import sk.ainet.apps.llm.tokenizer.GGUFTokenizer +import sk.ainet.context.DirectCpuExecutionContext +import sk.ainet.io.JvmRandomAccessSource +import sk.ainet.io.model.QuantPolicy +import sk.ainet.lang.types.FP32 +import kotlin.test.Test +import kotlin.test.assertEquals + +/** + * End-to-end check that the NEW Q5_K packed in-kernel dequant path (upstream + * SKaiNET `Q5_KBlockTensorData` + `Q5KMatmulKernel`, wired here via + * [convertGemmaWeightsToMemSeg]) decodes FunctionGemma-270M (`Q5_K_M`) + * identically to the FP32-dequant baseline, and reports tokens/sec. + * + * Before this, the converter dequantized Q5_K weights to FP32 on load ("no + * native matmul kernel yet for Q5_K"). Now Q5_K stays packed (176 B/block) + * and runs the in-kernel dequant matmul. Both paths decode the same weights, + * so greedy argmax token sequences must match. + * + * Skips when the GGUF isn't present (CI without the checkpoint). 
+ */ +@Tag("integration") +class GemmaQ5KPackedParityTest { + + private val gguf = + "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + + private fun argmax(a: FloatArray): Int { + var bi = 0; var bv = a[0] + for (i in 1 until a.size) if (a[i] > bv) { bv = a[i]; bi = i } + return bi + } + + private fun buildPrompt(u: String) = + "user\n$u\nmodel\n" + + private fun decode( + runtime: OptimizedLLMRuntime, + promptTokens: List, + maxNew: Int, + eos: Int, + eot: Int, + ): List { + runtime.reset() + var logits = FloatArray(0) + for (t in promptTokens) logits = runtime.forward(t).data.copyToFloatArray() + val gen = mutableListOf() + while (gen.size < maxNew) { + val next = argmax(logits) + gen.add(next) + if (next == eos || next == eot) break + logits = runtime.forward(next).data.copyToFloatArray() + } + return gen + } + + @Test + fun q5kPackedMatchesFp32() = runBlocking { + Assumptions.assumeTrue(File(gguf).exists(), "FunctionGemma GGUF not present — skipping") + + val ctx = DirectCpuExecutionContext.create() + val tokenizer = GGUFTokenizer.fromSource(SystemFileSystem.source(Path(gguf)).buffered()) + val eot = tokenizer.encode("").single() + val eos = tokenizer.eosTokenId + val promptTokens = + listOf(tokenizer.bosTokenId) + tokenizer.encode(buildPrompt("Turn the light on.")).toList() + val maxNew = 12 + + // --- FP32 dequant-on-load baseline --- + val wFp32 = Gemma4WeightLoader( + randomAccessProvider = { JvmRandomAccessSource.open(gguf) }, + quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32, + ).loadToMapStreaming(ctx, FP32::class) + val mFp32 = GemmaNetworkLoader.fromWeights(ctx, wFp32, FP32::class) + val rtFp32 = OptimizedLLMRuntime( + model = mFp32, ctx = ctx, mode = OptimizedLLMMode.DIRECT, + dtype = FP32::class, bos = tokenizer.bosTokenId, + ) + val genFp32 = decode(rtFp32, promptTokens, maxNew, eos, eot) + + // --- Q5_K packed in-kernel dequant path (NATIVE_OPTIMIZED + convert) --- + 
Arena.ofConfined().use { arena -> + val wNat = Gemma4WeightLoader( + randomAccessProvider = { JvmRandomAccessSource.open(gguf) }, + quantPolicy = QuantPolicy.NATIVE_OPTIMIZED, + ).loadToMapStreaming(ctx, FP32::class) + val wConv = convertGemmaWeightsToMemSeg(wNat, ctx, arena) + @Suppress("UNCHECKED_CAST") + val mNat = GemmaNetworkLoader.fromWeights( + ctx, wConv as Gemma4Weights, FP32::class, + ) + val rtNat = OptimizedLLMRuntime( + model = mNat, ctx = ctx, mode = OptimizedLLMMode.DIRECT, + dtype = FP32::class, bos = tokenizer.bosTokenId, + ) + + // Warmup one decode (JIT + kernel-provider resolution), then time. + decode(rtNat, promptTokens, 2, eos, eot) + val t0 = System.nanoTime() + val genNat = decode(rtNat, promptTokens, maxNew, eos, eot) + val ms = (System.nanoTime() - t0) / 1e6 + val toks = genNat.size + promptTokens.size + + println("Q5K-packed gen=$genNat") + println("FP32-base gen=$genFp32") + println("Q5K decoded='${tokenizer.decode(genNat.toIntArray()).replace("\n", "\\n")}'") + println( + "Q5K-packed throughput: $toks tok in ${"%.0f".format(ms)} ms " + + "(${"%.2f".format(toks * 1000.0 / ms)} tok/s incl. prefill)", + ) + + assertEquals(genFp32, genNat, "Q5_K packed decode diverged from FP32 baseline") + } + + // The wired path: GemmaNetworkLoader.load(NATIVE_OPTIMIZED) applies the + // commonMain convertGemmaWeightsPacked (the board path) — no MemSeg, no + // Arena. Must decode identically to the FP32 baseline too. 
+ val mLoad = GemmaNetworkLoader.fromGguf( + randomAccessProvider = { JvmRandomAccessSource.open(gguf) }, + quantPolicy = QuantPolicy.NATIVE_OPTIMIZED, + ).load(ctx) + val rtLoad = OptimizedLLMRuntime( + model = mLoad, ctx = ctx, mode = OptimizedLLMMode.DIRECT, + dtype = FP32::class, bos = tokenizer.bosTokenId, + ) + val genLoad = decode(rtLoad, promptTokens, maxNew, eos, eot) + println("load(NATIVE_OPTIMIZED) gen=$genLoad") + assertEquals(genFp32, genLoad, "load(NATIVE_OPTIMIZED) packed decode diverged from FP32 baseline") + } +} diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt index 227fb351..59ddc216 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt @@ -35,7 +35,7 @@ import kotlin.test.Test class RealGemmaBakeIrpaTest { @Test fun bakeRealGemmaToIrpa() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt index cbd6ebf8..af3c5e01 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt @@ -17,7 +17,7 @@ import kotlin.test.Test class RealGemmaDequantDumpTest { @Test fun dumpDequant() = runBlocking { - val path = 
"/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt index 3bfccce1..f0037477 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt @@ -24,7 +24,7 @@ import kotlin.test.Test class RealGemmaEagerAbTest { @Test fun eagerLogits() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt index f90bda23..019dcd86 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt @@ -32,7 +32,7 @@ import kotlin.test.Test class RealGemmaExternalParamTest { @Test fun externalizeRealGemmaWeights() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = 
"/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt index 28952531..2905da6c 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt @@ -21,7 +21,7 @@ import kotlin.test.Test class RealGemmaLoadTest { @Test fun loadFunctionGemmaWeights() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val loader = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/settings.gradle.kts b/settings.gradle.kts index d43ab581..ab3b9ebc 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -8,6 +8,12 @@ pluginManagement { dependencyResolutionManagement { repositories { + // mavenLocal first so a locally-published upstream SKaiNET (same + // coordinates/version, e.g. sk.ainet.core:*:0.30.0 from a sibling + // ../SKaiNET `publishToMavenLocal`) shadows Maven Central. Lets the + // transformers build consume in-progress SKaiNET changes without the + // composite build. Maven Central remains the fallback. + mavenLocal() google() mavenCentral() }