Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import sk.ainet.io.gguf.GGMLQuantizationType
import sk.ainet.io.gguf.dequant.DequantOps
import sk.ainet.lang.tensor.Shape
import sk.ainet.lang.tensor.Tensor
import sk.ainet.lang.tensor.data.DenseFloatArrayTensorData
import sk.ainet.lang.tensor.data.IntArrayTensorData
import sk.ainet.lang.tensor.data.TensorData
import sk.ainet.lang.types.DType
Expand Down Expand Up @@ -48,8 +49,12 @@ public fun convertGemmaWeightsPacked(
tensor // unknown 2-D layout — leave as-is
} else {
val bytes = extractRawBytes(tensor.data)
val isEmbed = name == Gemma4TensorNames.TOKEN_EMBEDDINGS ||
name == Gemma4TensorNames.OUTPUT_WEIGHT
// Only the token-embedding table is gathered (row lookup) and so
// must be FP32 here. `output`/lm_head is a real matmul weight —
// it stays packed (FunctionGemma's tied output is Q8_0 → NEON
// Q8_0 kernel, transposed lazily by ops.transpose) instead of a
// second ~0.67 GB FP32 copy that would OOM the 1.9 GB board.
val isEmbed = name == Gemma4TensorNames.TOKEN_EMBEDDINGS
val packed = if (!isEmbed) packGemmaKQuant<FP32>(bytes, qt, shape) else null
when {
packed != null -> {
Expand All @@ -76,7 +81,11 @@ private fun dequantNoTranspose(
ctx: ExecutionContext,
): Tensor<DType, Any> {
val floats = DequantOps.dequantFromBytes(bytes, qt, shape.volume)
return ctx.fromFloatArray<FP32, Float>(shape, FP32::class, floats) as Tensor<DType, Any>
// Wrap the dequant array directly (no-copy) rather than ctx.fromFloatArray,
// which routes through BufferHandleFactory.owned and allocates a second
// full-size buffer — for the 262k×640 FP32 token_embd (~0.67 GB) that
// transient double is itself enough to OOM the 1.9 GB board.
return ctx.fromData(DenseFloatArrayTensorData<FP32>(shape, floats), FP32::class) as Tensor<DType, Any>
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import sk.ainet.lang.tensor.Shape
import sk.ainet.lang.tensor.data.Q4_KBlockTensorData
import sk.ainet.lang.tensor.data.Q5_KBlockTensorData
import sk.ainet.lang.tensor.data.Q6_KBlockTensorData
import sk.ainet.lang.tensor.data.Q8_0BlockTensorData
import sk.ainet.lang.tensor.data.TensorData
import sk.ainet.lang.types.DType

Expand Down Expand Up @@ -66,8 +67,8 @@ internal fun relayoutKSeriesRowMajorToBlockMajor(
bytes: ByteArray,
shape: Shape,
bytesPerBlock: Int,
blockSize: Int = 256,
): ByteArray {
val blockSize = 256
require(shape.rank == 2) { "K-series weight must be 2D, got rank ${shape.rank}" }
val outDim = shape[0]
val inDim = shape[1]
Expand All @@ -88,19 +89,31 @@ internal fun relayoutKSeriesRowMajorToBlockMajor(
return out
}

/** Bytes per ggml block for the K-quant types this packer handles. */
private fun kQuantBytesPerBlock(qt: GGMLQuantizationType): Int? = when (qt) {
GGMLQuantizationType.Q4_K -> 144
GGMLQuantizationType.Q5_K -> 176
GGMLQuantizationType.Q6_K -> 210
/**
 * Looks up the block geometry of a GGUF quantization type as an
 * `(elements per block, bytes per block)` pair.
 *
 * Q4_K / Q5_K / Q6_K are 256-element super-blocks; Q8_0 is a 32-element block
 * laid out as an f16 scale followed by 32 int8 values (34 bytes). Each of these
 * four types has a first-class CPU matmul kernel and a lazy transpose in
 * `ops.transpose`, which is what allows them to stay packed instead of being
 * expanded to FP32.
 *
 * @param qt quantization type of the raw tensor bytes.
 * @return `(blockElems, bytesPerBlock)` for a supported type, or `null` for
 *   every other type.
 */
private fun quantBlockLayout(qt: GGMLQuantizationType): Pair<Int, Int>? {
    // All K-series entries share one super-block width; only the byte size differs.
    val superBlockElems = 256
    return when (qt) {
        GGMLQuantizationType.Q4_K -> Pair(superBlockElems, 144)
        GGMLQuantizationType.Q5_K -> Pair(superBlockElems, 176)
        GGMLQuantizationType.Q6_K -> Pair(superBlockElems, 210)
        // 2-byte f16 scale + 32 one-byte quants.
        GGMLQuantizationType.Q8_0 -> Pair(32, 34)
        else -> null
    }
}

/**
* Pack raw GGUF K-quant `bytes` of logical `[out, in]` shape into the
* heap-packed block tensor data the matmul kernels read directly (Q4_K / Q5_K /
* Q6_K). Performs the row-major → block-major relayout. Returns `null` for
* non-K-quant types (caller dequantizes those to FP32).
* Pack raw GGUF `bytes` of logical `[out, in]` shape into the heap-packed block
* tensor data the matmul kernels read directly (Q4_K / Q5_K / Q6_K / Q8_0).
* Performs the row-major → block-major relayout. Returns `null` for types
* without a packed kernel (caller dequantizes those to FP32).
*
* Q8_0 matters for gemma's tied `output`/lm_head: FunctionGemma's token_embd is
* Q8_0, so keeping the lm_head packed (vs ~0.67 GB FP32) is what lets the eager
* decode fit the 1.9 GB board, and it runs on the NEON Q8_0 kernel. (Requires
* the Q8_0 case in `ops.transpose` — engine — so `linearProject` can transpose
* the packed weight; see transformers #178.)
*
* commonMain → works on JVM and Kotlin/Native alike (no MemSeg / Arena).
*/
Expand All @@ -109,13 +122,14 @@ internal fun <T : DType> packGemmaKQuant(
qt: GGMLQuantizationType,
shape: Shape,
): TensorData<T, *>? {
val bpb = kQuantBytesPerBlock(qt) ?: return null
val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb)
val (blockElems, bpb) = quantBlockLayout(qt) ?: return null
val relaid = relayoutKSeriesRowMajorToBlockMajor(bytes, shape, bpb, blockElems)
@Suppress("UNCHECKED_CAST")
return when (qt) {
GGMLQuantizationType.Q4_K -> Q4_KBlockTensorData(shape, relaid) as TensorData<T, *>
GGMLQuantizationType.Q5_K -> Q5_KBlockTensorData(shape, relaid) as TensorData<T, *>
GGMLQuantizationType.Q6_K -> Q6_KBlockTensorData(shape, relaid) as TensorData<T, *>
GGMLQuantizationType.Q8_0 -> Q8_0BlockTensorData(shape, relaid) as TensorData<T, *>
else -> null
}
}
Loading