From 0ce5927a8138098a6897adc8062d96685e959aff Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@gmail.com>
Date: Wed, 10 Jun 2026 23:41:42 +0200
Subject: [PATCH 1/6] feat(gemma): wire Q5_K packed in-kernel dequant into the
 eager runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

FunctionGemma-270M ships as Q5_K_M, but GemmaMemSegConverter dequantized
Q5_K weights to FP32 on load ("no native matmul kernel yet for Q5_K"),
losing the memory savings and the in-kernel dequant. Upstream SKaiNET
0.29.1 now provides a first-class Q5_K packed matmul (Q5_KBlockTensorData
+ Q5KMatmulKernel: scalar/Panama/native), so keep Q5_K packed here too:
relayout GGUF bytes to block-major + wrap as Q5_KBlockTensorData
(176 B/block). Dispatch + lazy transpose reach it via DefaultCpuOps.

- Bump skainet 0.28.1 -> 0.29.1 (source-of-truth for the llm-bom platform).
- settings.gradle.kts: mavenLocal first so a locally-published SKaiNET
  0.29.1 (carrying the in-progress Q5_K kernel) shadows Maven Central until
  it's released; Central remains the fallback.

Verified (GemmaQ5KPackedParityTest, -PincludeIntegration): the Q5_K packed
path decodes FunctionGemma token-for-token identically to the FP32 baseline —
[262146, 236769, 3255, 718, 498, 1373, 262152, 106] -> `<tool_0>(state="on")
<end>` for "Turn the light on." (the known-good tool call), 0.81 tok/s on
the JVM host incl. prefill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 gradle/libs.versions.toml                     |   2 +-
 .../models/gemma/GemmaMemSegConverter.kt      |  11 +-
 .../models/gemma/GemmaQ5KPackedParityTest.kt  | 127 ++++++++++++++++++
 settings.gradle.kts                           |   6 +
 4 files changed, 143 insertions(+), 3 deletions(-)
 create mode 100644 llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index 66e7fb6..98b9de5 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -1,5 +1,5 @@
 [versions]
-skainet = "0.28.1"
+skainet = "0.29.1"
 agp = "9.2.1"
 jacksonDatabind = "2.22.0"
 jsonSchemaValidator = "3.0.3"
diff --git a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt
index d3a4502..232b417 100644
--- a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt
+++ b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt
@@ -8,6 +8,7 @@ import sk.ainet.lang.tensor.Shape
 import sk.ainet.lang.tensor.Tensor
 import sk.ainet.lang.tensor.data.IntArrayTensorData
 import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
 import sk.ainet.lang.tensor.data.Q4MemorySegmentTensorData
 import sk.ainet.lang.tensor.data.Q8MemorySegmentTensorData
@@ -197,8 +198,14 @@ private fun <T : DType, V> convertOne(
             ctx.fromData(data as TensorData<FP32, Float>, advertisedDtype) as Tensor<T, V>
         }
         GGMLQuantizationType.Q5_K -> {
-            // No native matmul kernel yet for Q5_K. Fall back to a correct FP32 dequant.
-            dequantPackedToFp32<T, V>(bytes, qt, shape, ctx)
+            // Same packed-path treatment as Q4_K/Q6_K, enabled by the Q5_K
+            // matmul kernel (scalar/Panama/native) + the lazy Q5_K transpose
+            // in DefaultCpuOps. FunctionGemma-270M Q5_K_M ships most attn/FFN
+            // weights as Q5_K, so keeping them packed (176 B/block) avoids the
+            // FP32 inflation and runs the in-kernel dequant matmul.
+            val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 176)
+            val data = Q5_KBlockTensorData.fromRawBytes(shape, relaid)
+            ctx.fromData(data as TensorData<FP32, Float>, advertisedDtype) as Tensor<T, V>
         }
         else -> {
             // Any other quant type without a packed SIMD kernel (Q5_0/Q5_1/Q4_1/Q2_K/…)
diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt
new file mode 100644
index 0000000..1e33fd6
--- /dev/null
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt
@@ -0,0 +1,127 @@
+package sk.ainet.models.gemma
+
+import java.io.File
+import java.lang.foreign.Arena
+import kotlinx.coroutines.runBlocking
+import kotlinx.io.buffered
+import kotlinx.io.files.Path
+import kotlinx.io.files.SystemFileSystem
+import org.junit.jupiter.api.Assumptions
+import org.junit.jupiter.api.Tag
+import sk.ainet.apps.llm.OptimizedLLMMode
+import sk.ainet.apps.llm.OptimizedLLMRuntime
+import sk.ainet.apps.llm.tokenizer.GGUFTokenizer
+import sk.ainet.context.DirectCpuExecutionContext
+import sk.ainet.io.JvmRandomAccessSource
+import sk.ainet.io.model.QuantPolicy
+import sk.ainet.lang.types.FP32
+import kotlin.test.Test
+import kotlin.test.assertEquals
+
+/**
+ * End-to-end check that the NEW Q5_K packed in-kernel dequant path (upstream
+ * SKaiNET `Q5_KBlockTensorData` + `Q5KMatmulKernel`, wired here via
+ * [convertGemmaWeightsToMemSeg]) decodes FunctionGemma-270M (`Q5_K_M`)
+ * identically to the FP32-dequant baseline, and reports tokens/sec.
+ *
+ * Before this, the converter dequantized Q5_K weights to FP32 on load ("no
+ * native matmul kernel yet for Q5_K"). Now Q5_K stays packed (176 B/block)
+ * and runs the in-kernel dequant matmul. Both paths decode the same weights,
+ * so greedy argmax token sequences must match.
+ *
+ * Skips when the GGUF isn't present (CI without the checkpoint).
+ */
+@Tag("integration")
+class GemmaQ5KPackedParityTest {
+    // Checkpoint path: FUNCTIONGEMMA_GGUF overrides the author's local default.
+    private val gguf = System.getenv("FUNCTIONGEMMA_GGUF")
+        ?: "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
+
+    private fun argmax(a: FloatArray): Int {
+        var bi = 0; var bv = a[0]
+        for (i in 1 until a.size) if (a[i] > bv) { bv = a[i]; bi = i }
+        return bi
+    }
+
+    private fun buildPrompt(u: String) =
+        "<start_of_turn>user\n$u<end_of_turn>\n<start_of_turn>model\n"
+
+    private fun decode(
+        runtime: OptimizedLLMRuntime<FP32>,
+        promptTokens: List<Int>,
+        maxNew: Int,
+        eos: Int,
+        eot: Int,
+    ): List<Int> {
+        runtime.reset()
+        var logits = FloatArray(0)
+        for (t in promptTokens) logits = runtime.forward(t).data.copyToFloatArray()
+        val gen = mutableListOf<Int>()
+        while (gen.size < maxNew) {
+            val next = argmax(logits)
+            gen.add(next)
+            if (next == eos || next == eot) break
+            logits = runtime.forward(next).data.copyToFloatArray()
+        }
+        return gen
+    }
+
+    @Test
+    fun q5kPackedMatchesFp32() = runBlocking {
+        Assumptions.assumeTrue(File(gguf).exists(), "FunctionGemma GGUF not present — skipping")
+
+        val ctx = DirectCpuExecutionContext.create()
+        val tokenizer = GGUFTokenizer.fromSource(SystemFileSystem.source(Path(gguf)).buffered())
+        val eot = tokenizer.encode("<end_of_turn>").single()
+        val eos = tokenizer.eosTokenId
+        val promptTokens =
+            listOf(tokenizer.bosTokenId) + tokenizer.encode(buildPrompt("Turn the light on.")).toList()
+        val maxNew = 12
+
+        // --- FP32 dequant-on-load baseline ---
+        val wFp32 = Gemma4WeightLoader(
+            randomAccessProvider = { JvmRandomAccessSource.open(gguf) },
+            quantPolicy = QuantPolicy.DEQUANTIZE_TO_FP32,
+        ).loadToMapStreaming<FP32, Float>(ctx, FP32::class)
+        val mFp32 = GemmaNetworkLoader.fromWeights(ctx, wFp32, FP32::class)
+        val rtFp32 = OptimizedLLMRuntime(
+            model = mFp32, ctx = ctx, mode = OptimizedLLMMode.DIRECT,
+            dtype = FP32::class, bos = tokenizer.bosTokenId,
+        )
+        val genFp32 = decode(rtFp32, promptTokens, maxNew, eos, eot)
+
+        // --- Q5_K packed in-kernel dequant path (NATIVE_OPTIMIZED + convert) ---
+        Arena.ofConfined().use { arena ->
+            val wNat = Gemma4WeightLoader(
+                randomAccessProvider = { JvmRandomAccessSource.open(gguf) },
+                quantPolicy = QuantPolicy.NATIVE_OPTIMIZED,
+            ).loadToMapStreaming<FP32, Float>(ctx, FP32::class)
+            val wConv = convertGemmaWeightsToMemSeg(wNat, ctx, arena)
+            @Suppress("UNCHECKED_CAST")
+            val mNat = GemmaNetworkLoader.fromWeights(
+                ctx, wConv as Gemma4Weights<FP32, Float>, FP32::class,
+            )
+            val rtNat = OptimizedLLMRuntime(
+                model = mNat, ctx = ctx, mode = OptimizedLLMMode.DIRECT,
+                dtype = FP32::class, bos = tokenizer.bosTokenId,
+            )
+
+            // Warmup one decode (JIT + kernel-provider resolution), then time.
+            decode(rtNat, promptTokens, 2, eos, eot)
+            val t0 = System.nanoTime()
+            val genNat = decode(rtNat, promptTokens, maxNew, eos, eot)
+            val ms = (System.nanoTime() - t0) / 1e6
+            val toks = genNat.size + promptTokens.size
+
+            println("Q5K-packed gen=$genNat")
+            println("FP32-base  gen=$genFp32")
+            println("Q5K decoded='${tokenizer.decode(genNat.toIntArray()).replace("\n", "\\n")}'")
+            println(
+                "Q5K-packed throughput: $toks tok in ${"%.0f".format(ms)} ms " +
+                    "(${"%.2f".format(toks * 1000.0 / ms)} tok/s incl. prefill)",
+            )
+
+            assertEquals(genFp32, genNat, "Q5_K packed decode diverged from FP32 baseline")
+        }
+    }
+}
diff --git a/settings.gradle.kts b/settings.gradle.kts
index d43ab58..ab3b9eb 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -8,6 +8,12 @@ pluginManagement {
 
 dependencyResolutionManagement {
     repositories {
+        // mavenLocal first so a locally-published upstream SKaiNET (same
+        // coordinates/version, e.g. sk.ainet.core:*:0.29.1 from a sibling
+        // ../SKaiNET `publishToMavenLocal`) shadows Maven Central without the
+        // composite build. Scoped to sk.ainet* groups so nothing else resolves
+        // from ~/.m2; Maven Central remains the fallback.
+        mavenLocal { content { includeGroupByRegex("sk\\.ainet.*") } }
         google()
         mavenCentral()
     }

From 04585d8f01d79502ff228e8092117834e6be27d1 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@gmail.com>
Date: Thu, 11 Jun 2026 10:19:41 +0200
Subject: [PATCH 2/6] feat(gemma): commonMain quant layout helpers for the
 Kotlin/Native board path

The board binary is Kotlin/Native, but GemmaMemSegConverter (the NATIVE_OPTIMIZED
packed-weight path) is jvmMain-only (java.lang.foreign). Move the reusable,
platform-neutral pieces to commonMain so K/N can keep K-quant weights packed:

- GemmaQuantLayout.kt (commonMain): logicalShapeFor +
  relayoutKSeriesRowMajorToBlockMajor (now copyInto, KMP-safe) +
  packGemmaKQuant<T>(), which builds heap-packed Q4_K/Q5_K/Q6_KBlockTensorData
  directly (no MemSeg/Arena).
- GemmaMemSegConverter (jvmMain) now shares those commonMain helpers (dup
  removed); MemSeg/FFM conversion + FP32 fallbacks stay JVM-only.
- commonTest GemmaQuantLayoutTest: block-transpose relayout + packing, runs on
  every target.

Verified: gemma compiles for JVM + linuxX64; layout tests green (3).

Next (board integration): a commonMain convertGemmaWeightsPacked wired into the
K/N load path (byte extraction differs: JVM IntArrayTensorData vs native
Byte-backed), then a full K/N decode on the SL2610.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../sk/ainet/models/gemma/GemmaQuantLayout.kt | 121 ++++++++++++++++++
 .../models/gemma/GemmaQuantLayoutTest.kt      |  59 +++++++++
 .../models/gemma/GemmaMemSegConverter.kt      |  91 +------------
 3 files changed, 186 insertions(+), 85 deletions(-)
 create mode 100644 llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt
 create mode 100644 llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt

diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt
new file mode 100644
index 0000000..7f4e7b9
--- /dev/null
+++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaQuantLayout.kt
@@ -0,0 +1,121 @@
+package sk.ainet.models.gemma
+
+import sk.ainet.io.gguf.GGMLQuantizationType
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
+import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
+import sk.ainet.lang.tensor.data.TensorData
+import sk.ainet.lang.types.DType
+
+/**
+ * Platform-neutral (commonMain) layout helpers for Gemma 4 quantized weights.
+ *
+ * These were previously JVM-only (inside `GemmaMemSegConverter`), but the
+ * Kotlin/Native board path needs the same logic: on K/N there is no
+ * `java.lang.foreign` MemSeg conversion, so the eager runtime keeps K-quant
+ * weights as heap-packed `Q{4,5,6}_KBlockTensorData` produced here. The JVM
+ * MemSeg converter reuses the same relayout + shape recovery.
+ */
+
+/**
+ * Recover the logical 2-D shape of a Gemma 4 weight tensor from its GGUF name
+ * and model metadata. `Gemma4WeightLoader` with `NATIVE_OPTIMIZED` stores
+ * quantized tensors as 1-D byte arrays, so converters need the original
+ * `[rows, cols]` shape to re-layout blocks. Returns `null` for tensors without
+ * a 2-D matmul layout (norms, embeddings the converter dequantizes anyway).
+ */
+internal fun logicalShapeFor(name: String, metadata: Gemma4ModelMetadata): Shape? {
+    val embed = metadata.embeddingLength
+    val vocab = metadata.vocabSize
+    return when {
+        name == Gemma4TensorNames.TOKEN_EMBEDDINGS -> Shape(vocab, embed)
+        name == Gemma4TensorNames.OUTPUT_WEIGHT -> Shape(vocab, embed)
+        name.startsWith("blk.") -> {
+            val rest = name.substringAfter("blk.")
+            val layer = rest.substringBefore('.').toIntOrNull() ?: return null
+            val headDim = metadata.getHeadDim(layer)
+            val qDim = metadata.headCount * headDim
+            val kvDim = metadata.kvHeadCount * headDim
+            val ffn = metadata.intermediateSize
+            when {
+                name.endsWith(".attn_q.weight") -> Shape(qDim, embed)
+                name.endsWith(".attn_k.weight") -> Shape(kvDim, embed)
+                name.endsWith(".attn_v.weight") -> Shape(kvDim, embed)
+                name.endsWith(".attn_output.weight") -> Shape(embed, qDim)
+                name.endsWith(".ffn_gate.weight") -> Shape(ffn, embed)
+                name.endsWith(".ffn_up.weight") -> Shape(ffn, embed)
+                name.endsWith(".ffn_down.weight") -> Shape(embed, ffn)
+                else -> null
+            }
+        }
+        else -> null
+    }
+}
+
+/**
+ * Re-layout GGUF K-series bytes from row-major block order
+ * (`(r * blocksPerRow + b) * bytesPerBlock`) to the input-block-major order the
+ * `matmulQ{K}` kernels expect (`(b * outDim + r) * bytesPerBlock`). For a
+ * `[outDim, inDim]` weight with `inDim % 256 == 0`, this is a block-level 2-D
+ * transpose; bytes inside a block are untouched.
+ *
+ * @param bytesPerBlock 144 (Q4_K), 176 (Q5_K), 210 (Q6_K).
+ */
+internal fun relayoutKSeriesRowMajorToBlockMajor(
+    bytes: ByteArray,
+    shape: Shape,
+    bytesPerBlock: Int,
+): ByteArray {
+    val blockSize = 256
+    require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" }
+    val outDim = shape[0]
+    val inDim = shape[1]
+    require(inDim % blockSize == 0) { "K-series weight inDim ($inDim) must be a multiple of $blockSize" }
+    val blocksPerRow = inDim / blockSize
+    val expected = outDim.toLong() * blocksPerRow.toLong() * bytesPerBlock.toLong()
+    require(bytes.size.toLong() >= expected) {
+        "K-series byte buffer ${bytes.size} < expected $expected for [$outDim, $inDim] @ ${bytesPerBlock}B/block"
+    }
+    val out = ByteArray(bytes.size)
+    for (r in 0 until outDim) {
+        for (b in 0 until blocksPerRow) {
+            val srcOff = (r * blocksPerRow + b) * bytesPerBlock
+            val dstOff = (b * outDim + r) * bytesPerBlock
+            bytes.copyInto(out, dstOff, srcOff, srcOff + bytesPerBlock)
+        }
+    }
+    return out
+}
+
+/** Bytes per ggml block for the K-quant types this packer handles. */
+private fun kQuantBytesPerBlock(qt: GGMLQuantizationType): Int? = when (qt) {
+    GGMLQuantizationType.Q4_K -> 144
+    GGMLQuantizationType.Q5_K -> 176
+    GGMLQuantizationType.Q6_K -> 210
+    else -> null
+}
+
+/**
+ * Pack raw GGUF K-quant `bytes` of logical `[out, in]` shape into the
+ * heap-packed block tensor data the matmul kernels read directly (Q4_K / Q5_K /
+ * Q6_K). Performs the row-major → block-major relayout. Returns `null` for
+ * non-K-quant types (caller dequantizes those to FP32).
+ *
+ * commonMain → works on JVM and Kotlin/Native alike (no MemSeg / Arena).
+ */
+internal fun <T : DType> packGemmaKQuant(
+    bytes: ByteArray,
+    qt: GGMLQuantizationType,
+    shape: Shape,
+): TensorData<T, *>? {
+    val bpb = kQuantBytesPerBlock(qt) ?: return null
+    val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb)
+    @Suppress("UNCHECKED_CAST")
+    return when (qt) {
+        GGMLQuantizationType.Q4_K -> Q4_KBlockTensorData(shape, relaid) as TensorData<T, *>
+        GGMLQuantizationType.Q5_K -> Q5_KBlockTensorData(shape, relaid) as TensorData<T, *>
+        GGMLQuantizationType.Q6_K -> Q6_KBlockTensorData(shape, relaid) as TensorData<T, *>
+        else -> null
+    }
+}
diff --git a/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt
new file mode 100644
index 0000000..7c7a9c3
--- /dev/null
+++ b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt
@@ -0,0 +1,59 @@
+package sk.ainet.models.gemma
+
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertNull
+import kotlin.test.assertTrue
+import sk.ainet.io.gguf.GGMLQuantizationType
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
+import sk.ainet.lang.types.FP32
+
+/**
+ * Unit tests for the commonMain (board-shareable) Gemma quant layout helpers.
+ * These run on every target (JVM + Kotlin/Native), proving the K/N board path's
+ * relayout + packing logic without needing the full model.
+ */
+class GemmaQuantLayoutTest {
+
+    @Test
+    fun relayout_is_block_level_transpose() {
+        // [outDim=2, inDim=512] -> blocksPerRow=2, 4 Q5_K blocks of 176 B.
+        val bpb = 176
+        val outDim = 2
+        val inDim = 512
+        val blocksPerRow = inDim / 256
+        val bytes = ByteArray(outDim * blocksPerRow * bpb)
+        // Tag each source block with its row-major index in its first byte.
+        for (i in 0 until outDim * blocksPerRow) bytes[i * bpb] = i.toByte()
+
+        val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, Shape(outDim, inDim), bpb)
+
+        // dst block (b*outDim + r) must hold src block (r*blocksPerRow + b).
+        for (r in 0 until outDim) {
+            for (b in 0 until blocksPerRow) {
+                val srcIdx = r * blocksPerRow + b
+                val dstIdx = b * outDim + r
+                assertEquals(srcIdx.toByte(), relaid[dstIdx * bpb], "block ($r,$b) misplaced")
+            }
+        }
+    }
+
+    @Test
+    fun pack_q5k_produces_block_tensor_with_relaid_bytes() {
+        val shape = Shape(2, 512)
+        val bytes = ByteArray(2 * 2 * 176)
+        for (i in 0 until 4) bytes[i * 176] = (i + 1).toByte()
+
+        val td = packGemmaKQuant<FP32>(bytes, GGMLQuantizationType.Q5_K, shape)
+        assertTrue(td is Q5_KBlockTensorData, "Q5_K should pack to Q5_KBlockTensorData")
+        // packedData is the block-major relayout of the input.
+        val expected = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 176)
+        assertTrue(expected.contentEquals(td.packedData))
+    }
+
+    @Test
+    fun pack_non_kquant_returns_null() {
+        assertNull(packGemmaKQuant<FP32>(ByteArray(34), GGMLQuantizationType.Q8_0, Shape(1, 32)))
+    }
+}
diff --git a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt
index 232b417..191f251 100644
--- a/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt
+++ b/llm-inference/gemma/src/jvmMain/kotlin/sk/ainet/models/gemma/GemmaMemSegConverter.kt
@@ -16,44 +16,9 @@ import sk.ainet.lang.tensor.data.TensorData
 import sk.ainet.lang.types.DType
 import sk.ainet.lang.types.FP32
 
-/**
- * Recover the logical 2-D shape of a Gemma 4 weight tensor from its GGUF
- * name and the model metadata. `Gemma4WeightLoader` with
- * `NATIVE_OPTIMIZED` stores quantized tensors as 1-D byte arrays so the
- * tensor-data factory accepts them; the converter needs the original
- * shape to re-layout blocks and construct `Q4_KBlockTensorData` /
- * `Q4/Q8MemorySegmentTensorData`.
- *
- * Returns `null` for tensors that don't have a 2-D matmul layout (norms,
- * embeddings the converter wants to dequant anyway).
- */
-internal fun logicalShapeFor(name: String, metadata: Gemma4ModelMetadata): Shape? {
-    val embed = metadata.embeddingLength
-    val vocab = metadata.vocabSize
-    return when {
-        name == Gemma4TensorNames.TOKEN_EMBEDDINGS -> Shape(vocab, embed)
-        name == Gemma4TensorNames.OUTPUT_WEIGHT -> Shape(vocab, embed)
-        name.startsWith("blk.") -> {
-            val rest = name.substringAfter("blk.")
-            val layer = rest.substringBefore('.').toIntOrNull() ?: return null
-            val headDim = metadata.getHeadDim(layer)
-            val qDim = metadata.headCount * headDim
-            val kvDim = metadata.kvHeadCount * headDim
-            val ffn = metadata.intermediateSize
-            when {
-                name.endsWith(".attn_q.weight") -> Shape(qDim, embed)
-                name.endsWith(".attn_k.weight") -> Shape(kvDim, embed)
-                name.endsWith(".attn_v.weight") -> Shape(kvDim, embed)
-                name.endsWith(".attn_output.weight") -> Shape(embed, qDim)
-                name.endsWith(".ffn_gate.weight") -> Shape(ffn, embed)
-                name.endsWith(".ffn_up.weight") -> Shape(ffn, embed)
-                name.endsWith(".ffn_down.weight") -> Shape(embed, ffn)
-                else -> null
-            }
-        }
-        else -> null
-    }
-}
+// logicalShapeFor + relayoutKSeriesRowMajorToBlockMajor moved to commonMain
+// (GemmaQuantLayout.kt) so the Kotlin/Native board path shares them. This
+// JVM-only file keeps the MemSeg (FFM) conversion + the FP32 dequant fallbacks.
 
 /**
  * Convert raw-byte quantized tensors in a [Gemma4Weights] map (produced by
@@ -287,53 +252,9 @@ private fun <T : DType, V> dequantToFloat(
 }
 
 /**
- * Re-layout GGUF K-series bytes from row-major block order (block at row r,
- * block index b within row → byte offset `(r * blocksPerRow + b) * bytesPerBlock`)
- * to the input-block-major layout the `matmulQ{K}_Vec` kernels expect
- * (block at blockIdx bI for output row r → byte offset
- * `(bI * outDim + r) * bytesPerBlock`).
- *
- * For a weight of shape `[outDim, inDim]` with `inDim % 256 == 0` (the
- * K-series block size), this is just a 2D block-level transpose of the
- * `[outDim, inDim/256]` array of `bytesPerBlock`-byte blocks. Bytes
- * inside a block are untouched.
- *
- * @param bytes packed weight bytes in row-major [outDim, blocksPerRow] order
- * @param shape logical `[outDim, inDim]` shape
- * @param bytesPerBlock 144 for Q4_K, 210 for Q6_K (ggml block sizes)
- */
-internal fun relayoutKSeriesRowMajorToBlockMajor(
-    bytes: ByteArray,
-    shape: sk.ainet.lang.tensor.Shape,
-    bytesPerBlock: Int
-): ByteArray {
-    val blockSize = 256
-    require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" }
-    val outDim = shape[0]
-    val inDim = shape[1]
-    require(inDim % blockSize == 0) {
-        "K-series weight inDim ($inDim) must be a multiple of $blockSize"
-    }
-    val blocksPerRow = inDim / blockSize
-    val expected = outDim.toLong() * blocksPerRow.toLong() * bytesPerBlock.toLong()
-    require(bytes.size.toLong() >= expected) {
-        "K-series byte buffer size ${bytes.size} < expected $expected for shape [$outDim, $inDim] @ ${bytesPerBlock}B/block"
-    }
-    val out = ByteArray(bytes.size)
-    for (r in 0 until outDim) {
-        for (b in 0 until blocksPerRow) {
-            val srcOff = (r * blocksPerRow + b) * bytesPerBlock
-            val dstOff = (b * outDim + r) * bytesPerBlock
-            System.arraycopy(bytes, srcOff, out, dstOff, bytesPerBlock)
-        }
-    }
-    return out
-}
-
-/**
- * Back-compat shim that delegates to [relayoutKSeriesRowMajorToBlockMajor]
- * at Q4_K's 144-byte block size. Kept for any callers outside this file
- * pinned to the old name.
+ * Back-compat shim that delegates to the commonMain
+ * [relayoutKSeriesRowMajorToBlockMajor] at Q4_K's 144-byte block size. Kept for
+ * any callers outside this file pinned to the old name.
  */
 internal fun relayoutQ4_KRowMajorToBlockMajor(bytes: ByteArray, shape: sk.ainet.lang.tensor.Shape): ByteArray =
     relayoutKSeriesRowMajorToBlockMajor(bytes, shape, 144)

From cb96e5363579ccbebe207a1e6ff8534bafccec03 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@gmail.com>
Date: Thu, 11 Jun 2026 13:37:50 +0200
Subject: [PATCH 3/6] feat(gemma): wire convertGemmaWeightsPacked into
 GemmaNetworkLoader.load()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

NATIVE_OPTIMIZED loads produce raw-byte quant tensors the network mapper can't
consume; on JVM an external convertGemmaWeightsToMemSeg (FFM) handled that, but
the Kotlin/Native board has no such path. Add a commonMain converter and make
load() apply it, so load(NATIVE_OPTIMIZED) yields a runnable network on the
board AND the JVM (previously it couldn't be built from raw-byte weights at all).

- GemmaPackedWeights.kt (commonMain): convertGemmaWeightsPacked — packs
  Q4/5/6_K matmul weights to heap Q*_KBlockTensorData (packGemmaKQuant),
  dequants token_embd/output to FP32 (gathered, no transpose) and other quant
  types to FP32 [out,in]. No java.lang.foreign. Plus extractRawBytes, which
  reads the loader's bytes back across both backings (JVM IntArrayTensorData /
  native Byte-typed).
- GemmaNetworkLoader.load(): for NATIVE_OPTIMIZED, run convertGemmaWeightsPacked
  before applyWeightsToNetwork.

Verified on JVM AND linuxX64 (GemmaQuantLayoutTest, 4 tests each): relayout,
packing, and the byte-extraction round-trip — so native byte extraction is
executed, not just compiled.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../ainet/models/gemma/GemmaNetworkLoader.kt  |  20 ++-
 .../ainet/models/gemma/GemmaPackedWeights.kt  | 125 ++++++++++++++++++
 .../models/gemma/GemmaQuantLayoutTest.kt      |  14 ++
 3 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt

diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt
index f73b3ac..abc8c3e 100644
--- a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt
+++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaNetworkLoader.kt
@@ -122,7 +122,7 @@ public class GemmaNetworkLoader @PublishedApi internal constructor(
     public suspend inline fun <reified T : DType, V> load(
         ctx: ExecutionContext
     ): Module<T, V> {
-        val weights: Gemma4Weights<T, V> = when (val wp = weightsProvider) {
+        val rawWeights: Gemma4Weights<T, V> = when (val wp = weightsProvider) {
             is WeightsProvider.GgufSource -> {
                 val loader = Gemma4WeightLoader(wp.sourceProvider, quantPolicy = wp.quantPolicy)
                 loader.loadToMap<T, V>(ctx)
@@ -142,6 +142,24 @@ public class GemmaNetworkLoader @PublishedApi internal constructor(
             }
         }
 
+        // NATIVE_OPTIMIZED yields raw-byte quant tensors the network mapper can't
+        // consume directly. Pack them (heap Q4/5/6_K + FP32 fallback) here — this
+        // is commonMain so it works on Kotlin/Native (the board) as well as the
+        // JVM, and replaces the JVM-only `convertGemmaWeightsToMemSeg` for the
+        // `load()` entry point.
+        val ggufPolicy = when (val wp = weightsProvider) {
+            is WeightsProvider.GgufSource -> wp.quantPolicy
+            is WeightsProvider.GgufRandomAccess -> wp.quantPolicy
+            else -> null
+        }
+        val weights: Gemma4Weights<T, V> =
+            if (ggufPolicy == QuantPolicy.NATIVE_OPTIMIZED) {
+                @Suppress("UNCHECKED_CAST")
+                convertGemmaWeightsPacked(rawWeights, ctx) as Gemma4Weights<T, V>
+            } else {
+                rawWeights
+            }
+
         return applyWeightsToNetwork(ctx, weights)
     }
 
diff --git a/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt
new file mode 100644
index 0000000..ec52eb4
--- /dev/null
+++ b/llm-inference/gemma/src/commonMain/kotlin/sk/ainet/models/gemma/GemmaPackedWeights.kt
@@ -0,0 +1,125 @@
+package sk.ainet.models.gemma
+
+import sk.ainet.context.ExecutionContext
+import sk.ainet.io.gguf.GGMLQuantizationType
+import sk.ainet.io.gguf.dequant.DequantOps
+import sk.ainet.lang.tensor.Shape
+import sk.ainet.lang.tensor.Tensor
+import sk.ainet.lang.tensor.data.IntArrayTensorData
+import sk.ainet.lang.tensor.data.TensorData
+import sk.ainet.lang.types.DType
+import sk.ainet.lang.types.FP32
+
+/**
+ * Converts the raw-byte quantized tensors of a `NATIVE_OPTIMIZED` load into
+ * tensors the DSL matmul path can consume. This is the commonMain counterpart
+ * of the jvmMain `convertGemmaWeightsToMemSeg` and uses no `java.lang.foreign`,
+ * so it is usable from Kotlin/Native (the SL2610 board binary) as well as the JVM.
+ *
+ * Each tensor listed in `quantTypes` is handled as follows:
+ * - **token_embd / output** → FP32 dequant in natural `[vocab, embed]` order;
+ *   the embedding is gathered, not matmul'd, so it is not transposed.
+ * - **weights [packGemmaKQuant] accepts** (Q4_K / Q5_K / Q6_K) → kept as
+ *   heap-packed `Q{4,5,6}_KBlockTensorData` for the in-kernel dequant matmul.
+ * - **any other quantized weight** → FP32 dequant transposed to `[out, in]`
+ *   row-major so `linearProject` (`x @ W.t()`) is correct.
+ *
+ * Tensors with no quant type, or with no resolvable logical shape, pass through
+ * unchanged. If nothing is quantized, [weights] itself is returned; otherwise
+ * the result is a new [Gemma4Weights] that reuses the input's metadata,
+ * `quantTypes` and `logicalShapes` (so `quantTypes` still lists converted tensors).
+ */
+public fun convertGemmaWeightsPacked(
+    weights: Gemma4Weights<*, *>,
+    ctx: ExecutionContext,
+): Gemma4Weights<*, *> {
+    @Suppress("UNCHECKED_CAST")
+    val typed = weights as Gemma4Weights<DType, Any>
+    val quantTypes = typed.quantTypes
+    // Nothing is quantized: hand the caller's instance straight back.
+    if (quantTypes.isEmpty()) return weights
+
+    val shapes = typed.logicalShapes
+    val converted = linkedMapOf<String, Tensor<DType, Any>>()
+    for ((tensorName, source) in typed.tensors) {
+        converted[tensorName] = run {
+            // Pass through anything unquantized or without a known logical layout.
+            val quantType = quantTypes[tensorName] ?: return@run source
+            val shape = shapes[tensorName]
+                ?: logicalShapeFor(tensorName, typed.metadata)
+                ?: return@run source
+            val raw = extractRawBytes(source.data)
+            val embedding = tensorName == Gemma4TensorNames.TOKEN_EMBEDDINGS ||
+                tensorName == Gemma4TensorNames.OUTPUT_WEIGHT
+            // Embeddings are never packed; other weights fall back when packing declines.
+            val packed = if (embedding) null else packGemmaKQuant<FP32>(raw, quantType, shape)
+            when {
+                embedding -> dequantNoTranspose(raw, quantType, shape, ctx)
+                packed == null -> dequantTransposed(raw, quantType, shape, ctx)
+                else -> {
+                    @Suppress("UNCHECKED_CAST")
+                    ctx.fromData(packed as TensorData<FP32, Float>, FP32::class) as Tensor<DType, Any>
+                }
+            }
+        }
+    }
+    @Suppress("UNCHECKED_CAST")
+    return Gemma4Weights(typed.metadata, converted, typed.quantTypes, typed.logicalShapes) as Gemma4Weights<*, *>
+}
+
+/** FP32 dequant in natural `[rows, cols]` order — for embeddings, which are gathered, not matmul'd. */
+@Suppress("UNCHECKED_CAST")
+private fun dequantNoTranspose(
+    bytes: ByteArray,
+    qt: GGMLQuantizationType,
+    shape: Shape,
+    ctx: ExecutionContext,
+): Tensor<DType, Any> =
+    DequantOps
+        .dequantFromBytes(bytes, qt, shape.volume)
+        .let { values -> ctx.fromFloatArray<FP32, Float>(shape, FP32::class, values) as Tensor<DType, Any> }
+
+/**
+ * FP32 dequant for matmul weights that stay unpacked: the dequantized floats are
+ * transposed column-major → row-major so the result is the canonical `[out, in]`
+ * row-major weight that `linearProject` (`x @ W.t()`) expects.
+ */
+@Suppress("UNCHECKED_CAST")
+private fun dequantTransposed(
+    bytes: ByteArray,
+    qt: GGMLQuantizationType,
+    shape: Shape,
+    ctx: ExecutionContext,
+): Tensor<DType, Any> {
+    val columnMajor = DequantOps.dequantFromBytes(bytes, qt, shape.volume)
+    val outFeatures = shape[0]
+    val inFeatures = shape[1]
+    return DequantOps.transposeColumnMajorToRowMajor(columnMajor, inFeatures, outFeatures)
+        .let { rowMajor -> ctx.fromFloatArray<FP32, Float>(shape, FP32::class, rowMajor) as Tensor<DType, Any> }
+}
+
+/**
+ * Recovers the packed quantization bytes held by a `NATIVE_OPTIMIZED` tensor.
+ *
+ * The backing depends on platform/factory: an [IntArrayTensorData] (JVM, bytes
+ * widened to Int) has its `buffer` narrowed back directly; any other backing
+ * (Byte-typed on Kotlin/Native) is read per element and must yield Byte or Int.
+ */
+internal fun extractRawBytes(data: TensorData<*, *>): ByteArray {
+    if (data is IntArrayTensorData<*>) {
+        val ints = data.buffer
+        return ByteArray(ints.size) { i -> ints[i].toByte() }
+    }
+    @Suppress("UNCHECKED_CAST")
+    val elements = data as TensorData<*, Any?>
+    return ByteArray(data.shape.volume) { i ->
+        when (val element = elements[i]) {
+            is Byte -> element
+            is Int -> element.toByte()
+            else -> error(
+                "convertGemmaWeightsPacked: cannot read bytes from ${data::class.simpleName} " +
+                    "(element ${element?.let { e -> e::class.simpleName }})",
+            )
+        }
+    }
+}
diff --git a/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt
index 7c7a9c3..52a1cdd 100644
--- a/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt
+++ b/llm-inference/gemma/src/commonTest/kotlin/sk/ainet/models/gemma/GemmaQuantLayoutTest.kt
@@ -4,10 +4,12 @@ import kotlin.test.Test
 import kotlin.test.assertEquals
 import kotlin.test.assertNull
 import kotlin.test.assertTrue
+import sk.ainet.context.DirectCpuExecutionContext
 import sk.ainet.io.gguf.GGMLQuantizationType
 import sk.ainet.lang.tensor.Shape
 import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
 import sk.ainet.lang.types.FP32
+import sk.ainet.lang.types.Int8
 
 /**
  * Unit tests for the commonMain (board-shareable) Gemma quant layout helpers.
@@ -56,4 +58,16 @@ class GemmaQuantLayoutTest {
     fun pack_non_kquant_returns_null() {
         assertNull(packGemmaKQuant<FP32>(ByteArray(34), GGMLQuantizationType.Q8_0, Shape(1, 32)))
     }
+
+    @Test
+    fun extract_raw_bytes_roundtrips_on_every_platform() {
+        // Mirror the NATIVE_OPTIMIZED loader, which wraps quant bytes via
+        // ctx.fromByteArray<Int8, Byte>: extractRawBytes must return them unchanged on
+        // either backing (JVM IntArrayTensorData, native Byte-typed) — jvm + linuxX64.
+        val expected = ByteArray(176 * 3) { i -> ((i * 31 + 7) and 0xFF).toByte() }
+        val execCtx = DirectCpuExecutionContext.create()
+        val wrapped = execCtx.fromByteArray<Int8, Byte>(Shape(expected.size), Int8::class, expected)
+        val roundTripped = extractRawBytes(wrapped.data)
+        assertTrue(expected.contentEquals(roundTripped), "extractRawBytes round-trip mismatch")
+    }
 }

From aaffafb8a3f234b6348387d2cb6e0a00add3a0b6 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@gmail.com>
Date: Thu, 11 Jun 2026 13:43:04 +0200
Subject: [PATCH 4/6] test(gemma): end-to-end parity for load(NATIVE_OPTIMIZED)
 packed path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends GemmaQ5KPackedParityTest to also decode via
GemmaNetworkLoader.load(NATIVE_OPTIMIZED) — the wired commonMain
convertGemmaWeightsPacked (board) path, no MemSeg/Arena. All three paths
(FP32 baseline, jvmMain MemSeg-packed, load() packed) produce the identical
token sequence -> `<tool_0>(state="on")<end>` for "Turn the light on."

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../models/gemma/GemmaQ5KPackedParityTest.kt      | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt
index 1e33fd6..1d4a7ad 100644
--- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaQ5KPackedParityTest.kt
@@ -123,5 +123,20 @@ class GemmaQ5KPackedParityTest {
 
             assertEquals(genFp32, genNat, "Q5_K packed decode diverged from FP32 baseline")
         }
+
+        // The wired path: GemmaNetworkLoader.load(NATIVE_OPTIMIZED) applies the
+        // commonMain convertGemmaWeightsPacked (the board path) — no MemSeg, no
+        // Arena. Must decode identically to the FP32 baseline too.
+        val mLoad = GemmaNetworkLoader.fromGguf(
+            randomAccessProvider = { JvmRandomAccessSource.open(gguf) },
+            quantPolicy = QuantPolicy.NATIVE_OPTIMIZED,
+        ).load<FP32, Float>(ctx)
+        val rtLoad = OptimizedLLMRuntime(
+            model = mLoad, ctx = ctx, mode = OptimizedLLMMode.DIRECT,
+            dtype = FP32::class, bos = tokenizer.bosTokenId,
+        )
+        val genLoad = decode(rtLoad, promptTokens, maxNew, eos, eot)
+        println("load(NATIVE_OPTIMIZED) gen=$genLoad")
+        assertEquals(genFp32, genLoad, "load(NATIVE_OPTIMIZED) packed decode diverged from FP32 baseline")
     }
 }

From a222b2a431e67525da41e4cb36e956838aca12d9 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@gmail.com>
Date: Thu, 11 Jun 2026 17:46:39 +0200
Subject: [PATCH 5/6] build: consume skainet 0.30.0 (released Q5_K + NEON + K/N
 cinterop)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 gradle/libs.versions.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index 98b9de5..5aa078e 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -1,5 +1,5 @@
 [versions]
-skainet = "0.29.1"
+skainet = "0.30.0"
 agp = "9.2.1"
 jacksonDatabind = "2.22.0"
 jsonSchemaValidator = "3.0.3"

From 0406dc670f5b93b3dc908a1ad0e7d50d810b88ea Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@gmail.com>
Date: Thu, 11 Jun 2026 17:46:39 +0200
Subject: [PATCH 6/6] test(gemma): fix stale FunctionGemma GGUF path in
 integration tests

Six real-model integration tests (RealGemmaLoad/Eager/BakeIrpa/ExternalParam/
DequantDump + GemmaBehavioralAb) pointed at an old workspace path
(/home/miso/projects/coral/sl2610-voice-cc-kt/models/...) and failed with
"File not found" under -PincludeIntegration. Repoint them to the actual model
location (SKaiNET-embedded/sl2610-function-calling/models/), matching
GemmaQ5KPackedParityTest.

Verified: all 6 pass against skainet 0.30.0 (mavenLocal), -PincludeIntegration.
---
 .../kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt       | 2 +-
 .../kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt       | 2 +-
 .../kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt    | 2 +-
 .../kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt        | 2 +-
 .../kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt  | 2 +-
 .../jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt   | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt
index 406197c..3f93860 100644
--- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/GemmaBehavioralAbTest.kt
@@ -31,7 +31,7 @@ import kotlin.test.assertEquals
  */
 @Tag("integration")
 class GemmaBehavioralAbTest {
-    private val gguf = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
+    private val gguf = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
 
     private fun argmax(a: FloatArray): Int {
         var bi = 0; var bv = a[0]
diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt
index 227fb35..59ddc21 100644
--- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaBakeIrpaTest.kt
@@ -35,7 +35,7 @@ import kotlin.test.Test
 class RealGemmaBakeIrpaTest {
     @Test
     fun bakeRealGemmaToIrpa() = runBlocking {
-        val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
+        val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
         val ctx = DirectCpuExecutionContext.create()
         val weights = Gemma4WeightLoader(
             randomAccessProvider = { JvmRandomAccessSource.open(path) },
diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt
index cbd6ebf..af3c5e0 100644
--- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaDequantDumpTest.kt
@@ -17,7 +17,7 @@ import kotlin.test.Test
 class RealGemmaDequantDumpTest {
     @Test
     fun dumpDequant() = runBlocking {
-        val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
+        val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
         val ctx = DirectCpuExecutionContext.create()
         val weights = Gemma4WeightLoader(
             randomAccessProvider = { JvmRandomAccessSource.open(path) },
diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt
index 3bfccce..f003747 100644
--- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaEagerAbTest.kt
@@ -24,7 +24,7 @@ import kotlin.test.Test
 class RealGemmaEagerAbTest {
     @Test
     fun eagerLogits() = runBlocking {
-        val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
+        val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
         val ctx = DirectCpuExecutionContext.create()
         val weights = Gemma4WeightLoader(
             randomAccessProvider = { JvmRandomAccessSource.open(path) },
diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt
index f90bda2..019dcd8 100644
--- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaExternalParamTest.kt
@@ -32,7 +32,7 @@ import kotlin.test.Test
 class RealGemmaExternalParamTest {
     @Test
     fun externalizeRealGemmaWeights() = runBlocking {
-        val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
+        val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
         val ctx = DirectCpuExecutionContext.create()
         val weights = Gemma4WeightLoader(
             randomAccessProvider = { JvmRandomAccessSource.open(path) },
diff --git a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt
index 2895253..2905da6 100644
--- a/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt
+++ b/llm-inference/gemma/src/jvmTest/kotlin/sk/ainet/models/gemma/RealGemmaLoadTest.kt
@@ -21,7 +21,7 @@ import kotlin.test.Test
 class RealGemmaLoadTest {
     @Test
     fun loadFunctionGemmaWeights() = runBlocking {
-        val path = "/home/miso/projects/coral/sl2610-voice-cc-kt/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
+        val path = "/home/miso/projects/coral/SKaiNET-embedded/sl2610-function-calling/models/functiongemma-physical-ai-v10-Q5_K_M.gguf"
         val ctx = DirectCpuExecutionContext.create()
         val loader = Gemma4WeightLoader(
             randomAccessProvider = { JvmRandomAccessSource.open(path) },