From 0ce5927a8138098a6897adc8062d96685e959aff Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Wed, 10 Jun 2026 23:41:42 +0200 Subject: [PATCH 1/6] feat(gemma): wire Q5_K packed in-kernel dequant into the eager runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FunctionGemma-270M ships as Q5_K_M, but GemmaMemSegConverter dequantized Q5_K weights to FP32 on load ("no native matmul kernel yet for Q5_K"), losing the memory savings and the in-kernel dequant. Upstream SKaiNET 0.29.1 now provides a first-class Q5_K packed matmul (Q5_KBlockTensorData + Q5KMatmulKernel: scalar/Panama/native), so keep Q5_K packed here too: relayout GGUF bytes to block-major + wrap as Q5_KBlockTensorData (176 B/block). Dispatch + lazy transpose reach it via DefaultCpuOps. - Bump skainet 0.28.1 -> 0.29.1 (source-of-truth for the llm-bom platform). - settings.gradle.kts: mavenLocal first so a locally-published SKaiNET 0.29.1 (carrying the in-progress Q5_K kernel) shadows Maven Central until it's released; Central remains the fallback. Verified (GemmaQ5KPackedParityTest, -PincludeIntegration): the Q5_K packed path decodes FunctionGemma byte-identically to the FP32 baseline — [262146, 236769, 3255, 718, 498, 1373, 262152, 106] -> `(state="on") ` for "Turn the light on." (the known-good tool call), 0.81 tok/s on the JVM host incl. prefill. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- gradle/libs.versions.toml | 2 +- .../models/gemma/GemmaMemSegConverter.kt | 11 +- .../models/gemma/GemmaQ5KPackedParityTest.kt | 127 ++++++++++++++++++ settings.gradle.kts | 6 + 4 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 66e7fb6..98b9de5 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -1,5 +1,5 @@ [versions] -skainet = "0.28.1" +skainet = "0.29.1" agp = "9.2.1" jacksonDatabind = "2.22.0" jsonSchemaValidator = "3.0.3" diff --git a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt index d3a4502..232b417 100644 --- a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt +++ b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt @@ -8,6 +8,7 @@ import sk.ainet.lang.tensor.Shape import sk.ainet.lang.tensor.Tensor import sk.ainet.lang.tensor.data.IntArrayTensorData import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData import sk.ainet.lang.tensor.data.Q6_KBlockTensorData import sk.ainet.lang.tensor.data.Q4MemorySegmentTensorData import sk.ainet.lang.tensor.data.Q8MemorySegmentTensorData @@ -197,8 +198,14 @@ private fun convertOne( ctx.fromData(data as TensorData, advertisedDtype) as Tensor } GGMLQuantizationType.Q5_K -> { - // No native matmul kernel yet for Q5_K. Fall back to a correct FP32 dequant. - dequantPackedToFp32(bytes, qt, shape, ctx) + // Same packed-path treatment as Q4_K/Q6_K, enabled by the Q5_K + // matmul kernel (scalar/Panama/native) + the lazy Q5_K transpose + // in DefaultCpuOps. 
FunctionGemma-270M Q5_K_M ships most attn/FFN + // weights as Q5_K, so keeping them packed (176 B/block) avoids the + // FP32 inflation and runs the in-kernel dequant matmul. + val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 176) + val data = Q5_KBlockTensorData.fromRawBytes(shape, relaid) + ctx.fromData(data as TensorData, advertisedDtype) as Tensor } else -> { // Any other quant type without a packed SIMD kernel (Q5_0/Q5_1/Q4_1/Q2_K/…) diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt new file mode 100644 index 0000000..1e33fd6 --- /dev/null +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt @@ -0,0 +1,127 @@ +package sk.ainet.models.gemma + +import java.io.File +import java.lang.foreign.Arena +import kotlinx.coroutines.runBlocking +import kotlinx.io.buffered +import kotlinx.io.files.Path +import kotlinx.io.files.SystemFileSystem +import org.junit.jupiter.api.Assumptions +import org.junit.jupiter.api.Tag +import sk.ainet.apps.llm.OptimizedLLMMode +import sk.ainet.apps.llm.OptimizedLLMRuntime +import sk.ainet.apps.llm.tokenizer.GGUFTokenizer +import sk.ainet.context.DirectCpuExecutionContext +import sk.ainet.io.JvmRandomAccessSource +import sk.ainet.io.model.QuantPolicy +import sk.ainet.lang.types.FP32 +import kotlin.test.Test +import kotlin.test.assertEquals + +/** + * End-to-end check that the NEW Q5_K packed in-kernel dequant path (upstream + * SKaiNET `Q5_KBlockTensorData` + `Q5KMatmulKernel`, wired here via + * [convertGemmaWeightsToMemSeg]) decodes FunctionGemma-270M (`Q5_K_M`) + * identically to the FP32-dequant baseline, and reports tokens/sec. + * + * Before this, the converter dequantized Q5_K weights to FP32 on load ("no + * native matmul kernel yet for Q5_K"). Now Q5_K stays packed (176 B/block) + * and runs the in-kernel dequant matmul. 
Both paths decode the same weights, + * so greedy argmax token sequences must match. + * + * Skips when the GGUF isn't present (CI without the checkpoint). + */ +@Tag("integration") +class GemmaQ5KPackedParityTest { + + private val gguf = + "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + + private fun argmax(a: FloatArray): Int { + var bi = 0; var bv = a[0] + for (i in 1 until a.size) if (a[i] > bv) { bv = a[i]; bi = i } + return bi + } + + private fun buildPrompt(u: String) = + "user\n$u\nmodel\n" + + private fun decode( + runtime: OptimizedLLMRuntime, + promptTokens: List, + maxNew: Int, + eos: Int, + eot: Int, + ): List { + runtime.reset() + var logits = FloatArray(0) + for (t in promptTokens) logits = runtime.forward(t).data.copyToFloatArray() + val gen = mutableListOf() + while (gen.size < maxNew) { + val next = argmax(logits) + gen.add(next) + if (next == eos || next == eot) break + logits = runtime.forward(next).data.copyToFloatArray() + } + return gen + } + + @Test + fun q5kPackedMatchesFp32() = runBlocking { + Assumptions.assumeTrue(File(gguf).exists(), "FunctionGemma GGUF not present — skipping") + + val ctx = DirectCpuExecutionContext.create() + val tokenizer = GGUFTokenizer.fromSource(SystemFileSystem.source(Path(gguf)).buffered()) + val eot = tokenizer.encode("").single() + val eos = tokenizer.eosTokenId + val promptTokens = + listOf(tokenizer.bosTokenId) + tokenizer.encode(buildPrompt("Turn the light on.")).toList() + val maxNew = 12 + + // --- FP32 dequant-on-load baseline --- + val wFp32 = Gemma4WeightLoader( + randomAccessProvider = { JvmRandomAccessSource.open(gguf) }, + quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32, + ).loadToMapStreaming(ctx, FP32::class) + val mFp32 = GemmaNetworkLoader.fromWeights(ctx, wFp32, FP32::class) + val rtFp32 = OptimizedLLMRuntime( + model = mFp32, ctx = ctx, mode = OptimizedLLMMode.DIRECT, + dtype = FP32::class, bos = tokenizer.bosTokenId, + ) + 
val genFp32 = decode(rtFp32, promptTokens, maxNew, eos, eot) + + // --- Q5_K packed in-kernel dequant path (NATIVE_OPTIMIZED + convert) --- + Arena.ofConfined().use { arena -> + val wNat = Gemma4WeightLoader( + randomAccessProvider = { JvmRandomAccessSource.open(gguf) }, + quantPolicy = QuantPolicy.NATIVE_OPTIMIZED, + ).loadToMapStreaming(ctx, FP32::class) + val wConv = convertGemmaWeightsToMemSeg(wNat, ctx, arena) + @Suppress("UNCHECKED_CAST") + val mNat = GemmaNetworkLoader.fromWeights( + ctx, wConv as Gemma4Weights, FP32::class, + ) + val rtNat = OptimizedLLMRuntime( + model = mNat, ctx = ctx, mode = OptimizedLLMMode.DIRECT, + dtype = FP32::class, bos = tokenizer.bosTokenId, + ) + + // Warmup one decode (JIT + kernel-provider resolution), then time. + decode(rtNat, promptTokens, 2, eos, eot) + val t0 = System.nanoTime() + val genNat = decode(rtNat, promptTokens, maxNew, eos, eot) + val ms = (System.nanoTime() - t0) / 1e6 + val toks = genNat.size + promptTokens.size + + println("Q5K-packed gen=$genNat") + println("FP32-base gen=$genFp32") + println("Q5K decoded='${tokenizer.decode(genNat.toIntArray()).replace("\n", "\\n")}'") + println( + "Q5K-packed throughput: $toks tok in ${"%.0f".format(ms)} ms " + + "(${"%.2f".format(toks * 1000.0 / ms)} tok/s incl. prefill)", + ) + + assertEquals(genFp32, genNat, "Q5_K packed decode diverged from FP32 baseline") + } + } +} diff --git a/settings.gradle.kts b/settings.gradle.kts index d43ab58..ab3b9eb 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -8,6 +8,12 @@ pluginManagement { dependencyResolutionManagement { repositories { + // mavenLocal first so a locally-published upstream SKaiNET (same + // coordinates/version, e.g. sk.ainet.core:*:0.29.1 from a sibling + // ../SKaiNET `publishToMavenLocal`) shadows Maven Central. Lets the + // transformers build consume in-progress SKaiNET changes without the + // composite build. Maven Central remains the fallback. 
+ mavenLocal() google() mavenCentral() } From 04585d8f01d79502ff228e8092117834e6be27d1 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 10:19:41 +0200 Subject: [PATCH 2/6] feat(gemma): commonMain quant layout helpers for the Kotlin/Native board path The board binary is Kotlin/Native, but GemmaMemSegConverter (the NATIVE_OPTIMIZED packed-weight path) is jvmMain-only (java.lang.foreign). Move the reusable, platform-neutral pieces to commonMain so K/N can keep K-quant weights packed: - GemmaQuantLayout.kt (commonMain): logicalShapeFor + relayoutKSeriesRowMajorToBlockMajor (now copyInto, KMP-safe) + packGemmaKQuant() which builds heap-packed Q4_K/Q5_K/Q6_KBlockTensorData directly (no MemSeg/Arena). - GemmaMemSegConverter (jvmMain) now shares those commonMain helpers (dup removed); MemSeg/FFM conversion + FP32 fallbacks stay JVM-only. - commonTest GemmaQuantLayoutTest: block-transpose relayout + packing, runs on every target. Verified: gemma compiles for JVM + linuxX64; layout tests green (3). Next (board integration): a commonMain convertGemmaWeightsPacked wired into the K/N load path (byte extraction differs JVM IntArrayTensorData vs native Byte-backed), then a full K/N decode on the SL2610. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../sk/ainet/models/gemma/GemmaQuantLayout.kt | 121 ++++++++++++++++++ .../models/gemma/GemmaQuantLayoutTest.kt | 59 +++++++++ .../models/gemma/GemmaMemSegConverter.kt | 91 +------------ 3 files changed, 186 insertions(+), 85 deletions(-) create mode 100644 llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt create mode 100644 llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt new file mode 100644 index 0000000..7f4e7b9 --- /dev/null +++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt @@ -0,0 +1,121 @@ +package sk.ainet.models.gemma + +import sk.ainet.io.gguf.GGMLQuantizationType +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.data.Q4_KBlockTensorData +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData +import sk.ainet.lang.tensor.data.Q6_KBlockTensorData +import sk.ainet.lang.tensor.data.TensorData +import sk.ainet.lang.types.DType + +/** + * Platform-neutral (commonMain) layout helpers for Gemma 4 quantized weights. + * + * These were previously JVM-only (inside `GemmaMemSegConverter`), but the + * Kotlin/Native board path needs the same logic: on K/N there is no + * `java.lang.foreign` MemSeg conversion, so the eager runtime keeps K-quant + * weights as heap-packed `Q{4,5,6}_KBlockTensorData` produced here. The JVM + * MemSeg converter reuses the same relayout + shape recovery. + */ + +/** + * Recover the logical 2-D shape of a Gemma 4 weight tensor from its GGUF name + * and model metadata. `Gemma4WeightLoader` with `NATIVE_OPTIMIZED` stores + * quantized tensors as 1-D byte arrays, so converters need the original + * `[rows, cols]` shape to re-layout blocks. 
Returns `null` for tensors without + * a 2-D matmul layout (norms, embeddings the converter dequantizes anyway). + */ +internal fun logicalShapeFor(name: String, metadata: Gemma4ModelMetadata): Shape? { + val embed = metadata.embeddingLength + val vocab = metadata.vocabSize + return when { + name == Gemma4TensorNames.TOKEN_EMBEDDINGS -> Shape(vocab, embed) + name == Gemma4TensorNames.OUTPUT_WEIGHT -> Shape(vocab, embed) + name.startsWith("blk.") -> { + val rest = name.substringAfter("blk.") + val layer = rest.substringBefore('.').toIntOrNull() ?: return null + val headDim = metadata.getHeadDim(layer) + val qDim = metadata.headCount * headDim + val kvDim = metadata.kvHeadCount * headDim + val ffn = metadata.intermediateSize + when { + name.endsWith(".attn_q.weight") -> Shape(qDim, embed) + name.endsWith(".attn_k.weight") -> Shape(kvDim, embed) + name.endsWith(".attn_v.weight") -> Shape(kvDim, embed) + name.endsWith(".attn_output.weight") -> Shape(embed, qDim) + name.endsWith(".ffn_gate.weight") -> Shape(ffn, embed) + name.endsWith(".ffn_up.weight") -> Shape(ffn, embed) + name.endsWith(".ffn_down.weight") -> Shape(embed, ffn) + else -> null + } + } + else -> null + } +} + +/** + * Re-layout GGUF K-series bytes from row-major block order + * (`(r * blocksPerRow + b) * bytesPerBlock`) to the input-block-major order the + * `matmulQ{K}` kernels expect (`(b * outDim + r) * bytesPerBlock`). For a + * `[outDim, inDim]` weight with `inDim % 256 == 0`, this is a block-level 2-D + * transpose; bytes inside a block are untouched. + * + * @param bytesPerBlock 144 (Q4_K), 176 (Q5_K), 210 (Q6_K). 
+ */ +internal fun relayoutKSeriesRowMajorToBlockMajor( + bytes: ByteArray, + shape: Shape, + bytesPerBlock: Int, +): ByteArray { + val blockSize = 256 + require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" } + val outDim = shape[0] + val inDim = shape[1] + require(inDim % blockSize == 0) { "K-series weight inDim ($inDim) must be a multiple of $blockSize" } + val blocksPerRow = inDim / blockSize + val expected = outDim.toLong() * blocksPerRow.toLong() * bytesPerBlock.toLong() + require(bytes.size.toLong() >= expected) { + "K-series byte buffer ${bytes.size} < expected $expected for [$outDim, $inDim] @ ${bytesPerBlock}B/block" + } + val out = ByteArray(bytes.size) + for (r in 0 until outDim) { + for (b in 0 until blocksPerRow) { + val srcOff = (r * blocksPerRow + b) * bytesPerBlock + val dstOff = (b * outDim + r) * bytesPerBlock + bytes.copyInto(out, dstOff, srcOff, srcOff + bytesPerBlock) + } + } + return out +} + +/** Bytes per ggml block for the K-quant types this packer handles. */ +private fun kQuantBytesPerBlock(qt: GGMLQuantizationType): Int? = when (qt) { + GGMLQuantizationType.Q4_K -> 144 + GGMLQuantizationType.Q5_K -> 176 + GGMLQuantizationType.Q6_K -> 210 + else -> null +} + +/** + * Pack raw GGUF K-quant `bytes` of logical `[out, in]` shape into the + * heap-packed block tensor data the matmul kernels read directly (Q4_K / Q5_K / + * Q6_K). Performs the row-major → block-major relayout. Returns `null` for + * non-K-quant types (caller dequantizes those to FP32). + * + * commonMain → works on JVM and Kotlin/Native alike (no MemSeg / Arena). + */ +internal fun packGemmaKQuant( + bytes: ByteArray, + qt: GGMLQuantizationType, + shape: Shape, +): TensorData? 
{ + val bpb = kQuantBytesPerBlock(qt) ?: return null + val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb) + @Suppress("UNCHECKED_CAST") + return when (qt) { + GGMLQuantizationType.Q4_K -> Q4_KBlockTensorData(shape, relaid) as TensorData + GGMLQuantizationType.Q5_K -> Q5_KBlockTensorData(shape, relaid) as TensorData + GGMLQuantizationType.Q6_K -> Q6_KBlockTensorData(shape, relaid) as TensorData + else -> null + } +} diff --git a/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt new file mode 100644 index 0000000..7c7a9c3 --- /dev/null +++ b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt @@ -0,0 +1,59 @@ +package sk.ainet.models.gemma + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNull +import kotlin.test.assertTrue +import sk.ainet.io.gguf.GGMLQuantizationType +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.data.Q5_KBlockTensorData +import sk.ainet.lang.types.FP32 + +/** + * Unit tests for the commonMain (board-shareable) Gemma quant layout helpers. + * These run on every target (JVM + Kotlin/Native), proving the K/N board path's + * relayout + packing logic without needing the full model. + */ +class GemmaQuantLayoutTest { + + @Test + fun relayout_is_block_level_transpose() { + // [outDim=2, inDim=512] -> blocksPerRow=2, 4 Q5_K blocks of 176 B. + val bpb = 176 + val outDim = 2 + val inDim = 512 + val blocksPerRow = inDim / 256 + val bytes = ByteArray(outDim * blocksPerRow * bpb) + // Tag each source block with its row-major index in its first byte. + for (i in 0 until outDim * blocksPerRow) bytes[i * bpb] = i.toByte() + + val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, Shape(outDim, inDim), bpb) + + // dst block (b*outDim + r) must hold src block (r*blocksPerRow + b). 
+ for (r in 0 until outDim) { + for (b in 0 until blocksPerRow) { + val srcIdx = r * blocksPerRow + b + val dstIdx = b * outDim + r + assertEquals(srcIdx.toByte(), relaid[dstIdx * bpb], "block ($r,$b) misplaced") + } + } + } + + @Test + fun pack_q5k_produces_block_tensor_with_relaid_bytes() { + val shape = Shape(2, 512) + val bytes = ByteArray(2 * 2 * 176) + for (i in 0 until 4) bytes[i * 176] = (i + 1).toByte() + + val td = packGemmaKQuant(bytes, GGMLQuantizationType.Q5_K, shape) + assertTrue(td is Q5_KBlockTensorData, "Q5_K should pack to Q5_KBlockTensorData") + // packedData is the block-major relayout of the input. + val expected = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 176) + assertTrue(expected.contentEquals(td.packedData)) + } + + @Test + fun pack_non_kquant_returns_null() { + assertNull(packGemmaKQuant(ByteArray(34), GGMLQuantizationType.Q8_0, Shape(1, 32))) + } +} diff --git a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt index 232b417..191f251 100644 --- a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt +++ b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt @@ -16,44 +16,9 @@ import sk.ainet.lang.tensor.data.TensorData import sk.ainet.lang.types.DType import sk.ainet.lang.types.FP32 -/** - * Recover the logical 2-D shape of a Gemma 4 weight tensor from its GGUF - * name and the model metadata. `Gemma4WeightLoader` with - * `NATIVE_OPTIMIZED` stores quantized tensors as 1-D byte arrays so the - * tensor-data factory accepts them; the converter needs the original - * shape to re-layout blocks and construct `Q4_KBlockTensorData` / - * `Q4/Q8MemorySegmentTensorData`. - * - * Returns `null` for tensors that don't have a 2-D matmul layout (norms, - * embeddings the converter wants to dequant anyway). 
- */ -internal fun logicalShapeFor(name: String, metadata: Gemma4ModelMetadata): Shape? { - val embed = metadata.embeddingLength - val vocab = metadata.vocabSize - return when { - name == Gemma4TensorNames.TOKEN_EMBEDDINGS -> Shape(vocab, embed) - name == Gemma4TensorNames.OUTPUT_WEIGHT -> Shape(vocab, embed) - name.startsWith("blk.") -> { - val rest = name.substringAfter("blk.") - val layer = rest.substringBefore('.').toIntOrNull() ?: return null - val headDim = metadata.getHeadDim(layer) - val qDim = metadata.headCount * headDim - val kvDim = metadata.kvHeadCount * headDim - val ffn = metadata.intermediateSize - when { - name.endsWith(".attn_q.weight") -> Shape(qDim, embed) - name.endsWith(".attn_k.weight") -> Shape(kvDim, embed) - name.endsWith(".attn_v.weight") -> Shape(kvDim, embed) - name.endsWith(".attn_output.weight") -> Shape(embed, qDim) - name.endsWith(".ffn_gate.weight") -> Shape(ffn, embed) - name.endsWith(".ffn_up.weight") -> Shape(ffn, embed) - name.endsWith(".ffn_down.weight") -> Shape(embed, ffn) - else -> null - } - } - else -> null - } -} +// logicalShapeFor + relayoutKSeriesRowMajorToBlockMajor moved to commonMain +// (GemmaQuantLayout.kt) so the Kotlin/Native board path shares them. This +// JVM-only file keeps the MemSeg (FFM) conversion + the FP32 dequant fallbacks. /** * Convert raw-byte quantized tensors in a [Gemma4Weights] map (produced by @@ -287,53 +252,9 @@ private fun dequantToFloat( } /** - * Re-layout GGUF K-series bytes from row-major block order (block at row r, - * block index b within row → byte offset `(r * blocksPerRow + b) * bytesPerBlock`) - * to the input-block-major layout the `matmulQ{K}_Vec` kernels expect - * (block at blockIdx bI for output row r → byte offset - * `(bI * outDim + r) * bytesPerBlock`). - * - * For a weight of shape `[outDim, inDim]` with `inDim % 256 == 0` (the - * K-series block size), this is just a 2D block-level transpose of the - * `[outDim, inDim/256]` array of `bytesPerBlock`-byte blocks. 
Bytes - * inside a block are untouched. - * - * @param bytes packed weight bytes in row-major [outDim, blocksPerRow] order - * @param shape logical `[outDim, inDim]` shape - * @param bytesPerBlock 144 for Q4_K, 210 for Q6_K (ggml block sizes) - */ -internal fun relayoutKSeriesRowMajorToBlockMajor( - bytes: ByteArray, - shape: sk.ainet.lang.tensor.Shape, - bytesPerBlock: Int -): ByteArray { - val blockSize = 256 - require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" } - val outDim = shape[0] - val inDim = shape[1] - require(inDim % blockSize == 0) { - "K-series weight inDim ($inDim) must be a multiple of $blockSize" - } - val blocksPerRow = inDim / blockSize - val expected = outDim.toLong() * blocksPerRow.toLong() * bytesPerBlock.toLong() - require(bytes.size.toLong() >= expected) { - "K-series byte buffer size ${bytes.size} < expected $expected for shape [$outDim, $inDim] @ ${bytesPerBlock}B/block" - } - val out = ByteArray(bytes.size) - for (r in 0 until outDim) { - for (b in 0 until blocksPerRow) { - val srcOff = (r * blocksPerRow + b) * bytesPerBlock - val dstOff = (b * outDim + r) * bytesPerBlock - System.arraycopy(bytes, srcOff, out, dstOff, bytesPerBlock) - } - } - return out -} - -/** - * Back-compat shim that delegates to [relayoutKSeriesRowMajorToBlockMajor] - * at Q4_K's 144-byte block size. Kept for any callers outside this file - * pinned to the old name. + * Back-compat shim that delegates to the commonMain + * [relayoutKSeriesRowMajorToBlockMajor] at Q4_K's 144-byte block size. Kept for + * any callers outside this file pinned to the old name. 
*/ internal fun relayoutQ4_KRowMajorToBlockMajor(bytes: ByteArray, shape: sk.ainet.lang.tensor.Shape): ByteArray = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 144) From cb96e5363579ccbebe207a1e6ff8534bafccec03 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 13:37:50 +0200 Subject: [PATCH 3/6] feat(gemma): wire convertGemmaWeightsPacked into GemmaNetworkLoader.load() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NATIVE_OPTIMIZED loads produce raw-byte quant tensors the network mapper can't consume; on JVM an external convertGemmaWeightsToMemSeg (FFM) handled that, but the Kotlin/Native board has no such path. Add a commonMain converter and make load() apply it, so load(NATIVE_OPTIMIZED) yields a runnable network on the board AND the JVM (previously it couldn't be built from raw-byte weights at all). - GemmaPackedWeights.kt (commonMain): convertGemmaWeightsPacked — packs Q4/5/6_K matmul weights to heap Q*_KBlockTensorData (packGemmaKQuant), dequants token_embd/output to FP32 (gathered, no transpose) and other quant types to FP32 [out,in]. No java.lang.foreign. Plus extractRawBytes, which reads the loader's bytes back across both backings (JVM IntArrayTensorData / native Byte-typed). - GemmaNetworkLoader.load(): for NATIVE_OPTIMIZED, run convertGemmaWeightsPacked before applyWeightsToNetwork. Verified on JVM AND linuxX64 (GemmaQuantLayoutTest, 4 tests each): relayout, packing, and the byte-extraction round-trip — so native byte extraction is executed, not just compiled. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ainet/models/gemma/GemmaNetworkLoader.kt | 20 ++- .../ainet/models/gemma/GemmaPackedWeights.kt | 125 ++++++++++++++++++ .../models/gemma/GemmaQuantLayoutTest.kt | 14 ++ 3 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt index f73b3ac..abc8c3e 100644 --- a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt +++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt @@ -122,7 +122,7 @@ public class GemmaNetworkLoader @PublishedApi internal constructor( public suspend inline fun load( ctx: ExecutionContext ): Module { - val weights: Gemma4Weights = when (val wp = weightsProvider) { + val rawWeights: Gemma4Weights = when (val wp = weightsProvider) { is WeightsProvider.GgufSource -> { val loader = Gemma4WeightLoader(wp.sourceProvider, quantPolicy = wp.quantPolicy) loader.loadToMap(ctx) @@ -142,6 +142,24 @@ public class GemmaNetworkLoader @PublishedApi internal constructor( } } + // NATIVE_OPTIMIZED yields raw-byte quant tensors the network mapper can't + // consume directly. Pack them (heap Q4/5/6_K + FP32 fallback) here — this + // is commonMain so it works on Kotlin/Native (the board) as well as the + // JVM, and replaces the JVM-only `convertGemmaWeightsToMemSeg` for the + // `load()` entry point. 
+ val ggufPolicy = when (val wp = weightsProvider) { + is WeightsProvider.GgufSource -> wp.quantPolicy + is WeightsProvider.GgufRandomAccess -> wp.quantPolicy + else -> null + } + val weights: Gemma4Weights = + if (ggufPolicy == QuantPolicy.NATIVE_OPTIMIZED) { + @Suppress("UNCHECKED_CAST") + convertGemmaWeightsPacked(rawWeights, ctx) as Gemma4Weights + } else { + rawWeights + } + return applyWeightsToNetwork(ctx, weights) } diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt new file mode 100644 index 0000000..ec52eb4 --- /dev/null +++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt @@ -0,0 +1,125 @@ +package sk.ainet.models.gemma + +import sk.ainet.context.ExecutionContext +import sk.ainet.io.gguf.GGMLQuantizationType +import sk.ainet.io.gguf.dequant.DequantOps +import sk.ainet.lang.tensor.Shape +import sk.ainet.lang.tensor.Tensor +import sk.ainet.lang.tensor.data.IntArrayTensorData +import sk.ainet.lang.tensor.data.TensorData +import sk.ainet.lang.types.DType +import sk.ainet.lang.types.FP32 + +/** + * commonMain (Kotlin/Native-capable) analogue of the jvmMain + * `convertGemmaWeightsToMemSeg`. Converts the raw-byte quantized tensors a + * `NATIVE_OPTIMIZED` load produces into the forms the DSL matmul path consumes: + * + * - **Q4_K / Q5_K / Q6_K matmul weights** → heap-packed `Q{4,5,6}_KBlockTensorData` + * (via [packGemmaKQuant], with the row-major→block-major relayout). These keep + * the GGUF footprint and run the in-kernel dequant matmul (NEON on the board). + * - **token_embd / output** → FP32 dequant in canonical `[vocab, embed]` order + * (the embedding is gathered, not matmul'd, so no transpose). + * - **everything else quantized** → FP32 dequant transposed to `[out, in]` + * row-major so `linearProject` (`x @ W.t()`) is correct. 
+ * + * Unlike the MemSeg converter this uses no `java.lang.foreign` — it runs on the + * SL2610 board binary (Kotlin/Native) as well as the JVM. The JVM still prefers + * the MemSeg path (lazy transpose + Q4/Q8 MemSeg); this is the board path. + */ +public fun convertGemmaWeightsPacked( + weights: Gemma4Weights<*, *>, + ctx: ExecutionContext, +): Gemma4Weights<*, *> { + @Suppress("UNCHECKED_CAST") + val typed = weights as Gemma4Weights + val quantTypes = typed.quantTypes + if (quantTypes.isEmpty()) return weights + + val logicalShapes = typed.logicalShapes + val newTensors = linkedMapOf>() + for ((name, tensor) in typed.tensors) { + val qt = quantTypes[name] + newTensors[name] = when { + qt == null -> tensor // not quantized + else -> { + val shape = logicalShapes[name] ?: logicalShapeFor(name, typed.metadata) + if (shape == null) { + tensor // unknown 2-D layout — leave as-is + } else { + val bytes = extractRawBytes(tensor.data) + val isEmbed = name == Gemma4TensorNames.TOKEN_EMBEDDINGS || + name == Gemma4TensorNames.OUTPUT_WEIGHT + val packed = if (!isEmbed) packGemmaKQuant(bytes, qt, shape) else null + when { + packed != null -> { + @Suppress("UNCHECKED_CAST") + ctx.fromData(packed as TensorData, FP32::class) as Tensor + } + isEmbed -> dequantNoTranspose(bytes, qt, shape, ctx) + else -> dequantTransposed(bytes, qt, shape, ctx) + } + } + } + } + } + @Suppress("UNCHECKED_CAST") + return Gemma4Weights(typed.metadata, newTensors, typed.quantTypes, typed.logicalShapes) as Gemma4Weights<*, *> +} + +/** Dequant to FP32 in natural `[rows, cols]` order (embeddings — gathered, not matmul'd). */ +@Suppress("UNCHECKED_CAST") +private fun dequantNoTranspose( + bytes: ByteArray, + qt: GGMLQuantizationType, + shape: Shape, + ctx: ExecutionContext, +): Tensor { + val floats = DequantOps.dequantFromBytes(bytes, qt, shape.volume) + return ctx.fromFloatArray(shape, FP32::class, floats) as Tensor +} + +/** + * Dequant to a canonical FP32 `[out, in]` row-major weight. 
GGUF stores K/legacy + * blocks column-major within a row, so the dequantized floats are transposed + * column-major → row-major to match what `linearProject` (`x @ W.t()`) expects. + */ +@Suppress("UNCHECKED_CAST") +private fun dequantTransposed( + bytes: ByteArray, + qt: GGMLQuantizationType, + shape: Shape, + ctx: ExecutionContext, +): Tensor { + val floats = DequantOps.dequantFromBytes(bytes, qt, shape.volume) + val out = shape[0] + val inDim = shape[1] + val rowMajor = DequantOps.transposeColumnMajorToRowMajor(floats, inDim, out) + return ctx.fromFloatArray(shape, FP32::class, rowMajor) as Tensor +} + +/** + * Read the raw packed bytes back from a `NATIVE_OPTIMIZED` quant tensor. The + * backing differs by platform/factory — JVM stores `IntArrayTensorData` (byte + * values widened to Int); Kotlin/Native stores a Byte-typed tensor — so handle + * both element types. + */ +internal fun extractRawBytes(data: TensorData<*, *>): ByteArray { + if (data is IntArrayTensorData<*>) { + val buf = data.buffer + return ByteArray(buf.size) { buf[it].toByte() } + } + val n = data.shape.volume + @Suppress("UNCHECKED_CAST") + val d = data as TensorData<*, Any?> + return ByteArray(n) { + when (val v = d[it]) { + is Byte -> v + is Int -> v.toByte() + else -> error( + "convertGemmaWeightsPacked: cannot read bytes from ${data::class.simpleName} " + + "(element ${v?.let { e -> e::class.simpleName }})", + ) + } + } +} diff --git a/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt index 7c7a9c3..52a1cdd 100644 --- a/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt +++ b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt @@ -4,10 +4,12 @@ import kotlin.test.Test import kotlin.test.assertEquals import kotlin.test.assertNull import kotlin.test.assertTrue +import 
sk.ainet.context.DirectCpuExecutionContext import sk.ainet.io.gguf.GGMLQuantizationType import sk.ainet.lang.tensor.Shape import sk.ainet.lang.tensor.data.Q5_KBlockTensorData import sk.ainet.lang.types.FP32 +import sk.ainet.lang.types.Int8 /** * Unit tests for the commonMain (board-shareable) Gemma quant layout helpers. @@ -56,4 +58,16 @@ class GemmaQuantLayoutTest { fun pack_non_kquant_returns_null() { assertNull(packGemmaKQuant(ByteArray(34), GGMLQuantizationType.Q8_0, Shape(1, 32))) } + + @Test + fun extract_raw_bytes_roundtrips_on_every_platform() { + // The NATIVE_OPTIMIZED loader wraps quant bytes via ctx.fromByteArray; + // extractRawBytes must read them back regardless of the platform backing + // (JVM IntArrayTensorData vs native Byte-typed). Runs on jvm + linuxX64. + val ctx = DirectCpuExecutionContext.create() + val bytes = ByteArray(176 * 3) { ((it * 31 + 7) and 0xFF).toByte() } + val t = ctx.fromByteArray(Shape(bytes.size), Int8::class, bytes) + val got = extractRawBytes(t.data) + assertTrue(bytes.contentEquals(got), "extractRawBytes round-trip mismatch") + } } From aaffafb8a3f234b6348387d2cb6e0a00add3a0b6 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 13:43:04 +0200 Subject: [PATCH 4/6] test(gemma): end-to-end parity for load(NATIVE_OPTIMIZED) packed path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends GemmaQ5KPackedParityTest to also decode via GemmaNetworkLoader.load(NATIVE_OPTIMIZED) — the wired commonMain convertGemmaWeightsPacked (board) path, no MemSeg/Arena. All three paths (FP32 baseline, jvmMain MemSeg-packed, load() packed) produce the identical token sequence -> `(state="on")` for "Turn the light on." 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../models/gemma/GemmaQ5KPackedParityTest.kt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt index 1e33fd6..1d4a7ad 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt @@ -123,5 +123,20 @@ class GemmaQ5KPackedParityTest { assertEquals(genFp32, genNat, "Q5_K packed decode diverged from FP32 baseline") } + + // The wired path: GemmaNetworkLoader.load(NATIVE_OPTIMIZED) applies the + // commonMain convertGemmaWeightsPacked (the board path) — no MemSeg, no + // Arena. Must decode identically to the FP32 baseline too. + val mLoad = GemmaNetworkLoader.fromGguf( + randomAccessProvider = { JvmRandomAccessSource.open(gguf) }, + quantPolicy = QuantPolicy.NATIVE_OPTIMIZED, + ).load(ctx) + val rtLoad = OptimizedLLMRuntime( + model = mLoad, ctx = ctx, mode = OptimizedLLMMode.DIRECT, + dtype = FP32::class, bos = tokenizer.bosTokenId, + ) + val genLoad = decode(rtLoad, promptTokens, maxNew, eos, eot) + println("load(NATIVE_OPTIMIZED) gen=$genLoad") + assertEquals(genFp32, genLoad, "load(NATIVE_OPTIMIZED) packed decode diverged from FP32 baseline") } } From a222b2a431e67525da41e4cb36e956838aca12d9 Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 17:46:39 +0200 Subject: [PATCH 5/6] build: consume skainet 0.30.0 (released Q5_K + NEON + K/N cinterop) Co-Authored-By: Claude Opus 4.8 (1M context) --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 98b9de5..5aa078e 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -1,5 +1,5 @@ [versions] -skainet = "0.29.1" 
+skainet = "0.30.0" agp = "9.2.1" jacksonDatabind = "2.22.0" jsonSchemaValidator = "3.0.3" From 0406dc670f5b93b3dc908a1ad0e7d50d810b88ea Mon Sep 17 00:00:00 2001 From: Michal Harakal Date: Thu, 11 Jun 2026 17:46:39 +0200 Subject: [PATCH 6/6] test(gemma): fix stale FunctionGemma GGUF path in integration tests Six real-model integration tests (RealGemmaLoad/Eager/BakeIrpa/ExternalParam/ DequantDump + GemmaBehavioralAb) pointed at an old workspace path (/home/miso/projects/coral/sl2610-voice-cc-kt/models/...) and failed with "File not found" under -PincludeIntegration. Repoint them to the actual model location (SKaiNET-embedded/sl2610-function-calling/models/), matching GemmaQ5KPackedParityTest. Verified: all 6 pass against skainet 0.30.0 (mavenLocal), -PincludeIntegration. --- .../kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt | 2 +- .../kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt | 2 +- .../kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt | 2 +- .../kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt | 2 +- .../kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt | 2 +- .../jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt index 406197c..3f93860 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt @@ -31,7 +31,7 @@ import kotlin.test.assertEquals */ @Tag("integration") class GemmaBehavioralAbTest { - private val gguf = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + private val gguf = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" 
private fun argmax(a: FloatArray): Int { var bi = 0; var bv = a[0] diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt index 227fb35..59ddc21 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt @@ -35,7 +35,7 @@ import kotlin.test.Test class RealGemmaBakeIrpaTest { @Test fun bakeRealGemmaToIrpa() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt index cbd6ebf..af3c5e0 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt @@ -17,7 +17,7 @@ import kotlin.test.Test class RealGemmaDequantDumpTest { @Test fun dumpDequant() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt 
b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt index 3bfccce..f003747 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt @@ -24,7 +24,7 @@ import kotlin.test.Test class RealGemmaEagerAbTest { @Test fun eagerLogits() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt index f90bda2..019dcd8 100644 --- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt @@ -32,7 +32,7 @@ import kotlin.test.Test class RealGemmaExternalParamTest { @Test fun externalizeRealGemmaWeights() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val weights = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) }, diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt index 2895253..2905da6 100644 --- 
a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt +++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt @@ -21,7 +21,7 @@ import kotlin.test.Test class RealGemmaLoadTest { @Test fun loadFunctionGemmaWeights() = runBlocking { - val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" + val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf" val ctx = DirectCpuExecutionContext.create() val loader = Gemma4WeightLoader( randomAccessProvider = { JvmRandomAccessSource.open(path) },