Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions skainet-backends/skainet-backend-native-cpu/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ kotlin {
val jvmTest by getting {
dependencies {
implementation(libs.kotlin.test)
// Parity tests compare NativeQ4KMatmulKernel output
// against PanamaVectorQ4KMatmulKernel; the Panama
// kernel pulls in parallelChunks which transitively
// requires kotlinx-coroutines.
implementation(project(":skainet-backends:skainet-backend-cpu"))
implementation(libs.kotlinx.coroutines)
}
}
}
Expand Down Expand Up @@ -106,10 +112,15 @@ tasks.named("jvmProcessResources") {
dependsOn(packageNativeKernels)
}

// Forward `-Dskainet.runBench=true` from the Gradle CLI to the forked test
// JVM so Q4KMatmulMicrobenchTest activates. When the property is absent
// nothing is forwarded and the benchmark is skipped silently.
val runBenchProperty = providers.systemProperty("skainet.runBench")

tasks.withType<Test>().configureEach {
    // Single jvmArgs call: the superseded variant without `--add-modules`
    // is dropped so each flag reaches the forked JVM exactly once.
    jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED", "--add-modules", "jdk.incubator.vector")
    runBenchProperty.orNull?.let { systemProperty("skainet.runBench", it) }
}

tasks.withType<JavaExec>().configureEach {
    // Single jvmArgs call: the superseded variant without `--add-modules`
    // is dropped so each flag reaches the launched JVM exactly once.
    jvmArgs("--enable-preview", "--enable-native-access=ALL-UNNAMED", "--add-modules", "jdk.incubator.vector")
}
12 changes: 10 additions & 2 deletions skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ endif()

# Single shared library carrying every native kernel translation unit:
# the smoke-test entry point and the Q4_K matmul kernel.
add_library(skainet_kernels SHARED
src/skainet_smoke.c
src/q4k_matmul.c
)

target_include_directories(skainet_kernels PUBLIC
Expand All @@ -23,8 +24,15 @@ if(WIN32)
set_target_properties(skainet_kernels PROPERTIES PREFIX "")
endif()

# Hide non-exported symbols on ELF / Mach-O for a smaller surface area
# and let the compiler auto-vectorize the Q4_K hot loop.
#
# NOTE(review): -ffast-math implies -ffinite-math-only, which allows the
# compiler to assume no NaN/Inf values. src/q4k_matmul.c contains an
# explicit Inf/NaN branch in its FP16 decoder; confirm the parity tests
# cover those inputs if they matter.
if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
  target_compile_options(skainet_kernels PRIVATE
    -fvisibility=hidden
    -Wall -Wextra
    -O3
    -ffast-math
    -funroll-loops
  )
  set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,30 @@ extern "C" {
*/
SKAINET_API void skainet_smoke_double(const float* input, float* output, int32_t length);

/*
 * Q4_K matrix-vector multiply.
 *
 *   output[output_offset + o] = sum_j input[input_offset + j] *
 *                               dequant(weight[block, o, j])
 *
 * Block layout: canonical ggml Q4_K, 256 elements per super-block, 144
 * bytes per block, with packed weights laid out as
 *   weight + weight_byte_offset + (block_idx * output_dim + o) * 144
 *
 * Parameters:
 *   input               activations, read only.
 *   input_offset        index (in floats) of the first element read.
 *   weight              packed Q4_K blocks, read only.
 *   weight_byte_offset  offset (in bytes) of the first packed block.
 *   input_dim           elements reduced per output value.
 *   output_dim          number of output values written.
 *   output              destination buffer.
 *   output_offset       index (in floats) of the first element written.
 *
 * The implementation in q4k_matmul.c returns without writing anything
 * when input_dim or output_dim is not positive.
 *
 * Caller owns input/weight/output memory; the kernel does not retain
 * pointers past return. input_dim must be a multiple of 256.
 */
SKAINET_API void skainet_q4k_matmul(
const float* input,
int32_t input_offset,
const uint8_t* weight,
int32_t weight_byte_offset,
int32_t input_dim,
int32_t output_dim,
float* output,
int32_t output_offset
);

#ifdef __cplusplus
}
#endif
Expand Down
151 changes: 151 additions & 0 deletions skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#include "skainet_kernels.h"

#include <stddef.h>
#include <stdint.h>

#define Q4K_BLOCK_SIZE 256
#define Q4K_SUB_BLOCK_SIZE 32
#define Q4K_SUB_BLOCKS 8
#define Q4K_BYTES_PER_BLOCK 144

/*
 * Widen an IEEE 754 binary16 bit pattern (already assembled from its
 * little-endian bytes) to binary32.
 *
 * Handles all four encodings: signed zero, subnormals (value is
 * frac / 1024 * 2^-14), normals (exponent re-biased from 15 to 127),
 * and Inf/NaN (NaN payloads collapse to a single quiet-NaN bit).
 * The original note states this mirrors
 * PanamaVectorQ4KMatmulKernel.halfToFloat byte-for-byte.
 */
static inline float skainet_half_to_float(uint16_t hbits) {
    const uint32_t sign_bit = ((uint32_t) hbits & 0x8000u) << 16;
    const uint32_t exponent = ((uint32_t) hbits >> 10) & 0x1Fu;
    const uint32_t mantissa = (uint32_t) hbits & 0x3FFu;
    union { uint32_t u; float f; } bits;

    if (exponent == 0x1Fu) {
        /* Inf when the mantissa is empty, otherwise a quiet NaN. */
        bits.u = sign_bit | 0x7F800000u | (mantissa != 0u ? 0x00400000u : 0u);
    } else if (exponent != 0u) {
        /* Normal number: 127 - 15 = 112 re-biases the exponent. */
        bits.u = sign_bit | ((exponent + 112u) << 23) | (mantissa << 13);
    } else if (mantissa != 0u) {
        /* Subnormal: no implicit leading one, fixed 2^-14 scale. */
        const float magnitude = ((float) mantissa) / 1024.0f * (1.0f / 16384.0f);
        return sign_bit != 0u ? -magnitude : magnitude;
    } else {
        /* Signed zero. */
        bits.u = sign_bit;
    }
    return bits.f;
}

/*
 * Unpack the 12-byte sub-scale region of a Q4_K block (bytes 4..15)
 * into eight 6-bit scale indices and eight 6-bit min indices, following
 * ggml's get_scale_min_k4 bit layout:
 *
 *   bytes 0..3   low 6 bits = scale 0..3, top 2 bits = high bits of scale 4..7
 *   bytes 4..7   low 6 bits = min 0..3,   top 2 bits = high bits of min 4..7
 *   bytes 8..11  low nibble = low bits of scale 4..7,
 *                high nibble = low bits of min 4..7
 *
 * scale_idx and min_idx must each have room for 8 ints.
 */
static inline void skainet_q4k_decode_scales(
    const uint8_t* scales,
    int* scale_idx,
    int* min_idx
) {
    for (int k = 0; k < 4; ++k) {
        const int scale_byte = scales[k];
        const int min_byte = scales[k + 4];
        const int mixed_byte = scales[k + 8];

        /* Sub-blocks 0..3 live entirely in the low 6 bits. */
        scale_idx[k] = scale_byte & 0x3F;
        min_idx[k] = min_byte & 0x3F;

        /* Sub-blocks 4..7: nibble from the mixed byte + 2 spare top bits. */
        scale_idx[k + 4] = (mixed_byte & 0x0F) | (((scale_byte >> 6) & 0x03) << 4);
        min_idx[k + 4] = ((mixed_byte >> 4) & 0x0F) | (((min_byte >> 6) & 0x03) << 4);
    }
}

/*
 * Native Q4_K matrix-vector multiply matching the
 * sk.ainet.backend.api.kernel.Q4KMatmulKernel SPI contract. Single
 * input row times an `outputDim x inputDim` Q4_K-packed weight tensor
 * laid out (blockIdx * outputDim + o) * 144 bytes.
 *
 * Parameters:
 *   input / input_offset        activations; offset counted in floats.
 *   weight / weight_byte_offset packed blocks; offset counted in bytes.
 *   input_dim                   elements reduced per output; only whole
 *                               256-element blocks are consumed.
 *   output_dim                  number of outputs written.
 *   output / output_offset      destination; offset counted in floats.
 *
 * Returns immediately, writing nothing, when either dimension is not
 * positive.
 *
 * Lazy-dmin pattern: per sub-block accumulate
 *   codeSum[s]  = sum_i input[i] * code[i]
 *   inputSum[s] = sum_i input[i]
 * and combine once via
 *   acc += d * scaleIdx[s] * codeSum[s] - dMin * minIdx[s] * inputSum[s]
 *
 * Scalar and single-threaded; the inner loop is straight-line FP
 * arithmetic intended to auto-vectorize under -O3.
 */
SKAINET_API void skainet_q4k_matmul(
    const float* __restrict__ input,
    int32_t input_offset,
    const uint8_t* __restrict__ weight,
    int32_t weight_byte_offset,
    int32_t input_dim,
    int32_t output_dim,
    float* __restrict__ output,
    int32_t output_offset
) {
    if (output_dim <= 0 || input_dim <= 0) return;

    const int32_t blocks_per_input_dim = input_dim / Q4K_BLOCK_SIZE;
    const float* in_base = input + input_offset;
    float* out_base = output + output_offset;
    const uint8_t* weight_base = weight + weight_byte_offset;
    const size_t blocks_per_row_group = (size_t) output_dim;

    int scale_idx[Q4K_SUB_BLOCKS];
    int min_idx[Q4K_SUB_BLOCKS];

    for (int32_t o = 0; o < output_dim; ++o) {
        float acc = 0.0f;

        for (int32_t block_idx = 0; block_idx < blocks_per_input_dim; ++block_idx) {
            /*
             * Widen BEFORE multiplying: block_idx * output_dim evaluated in
             * int32_t overflows (undefined behavior) once the tensor holds
             * more than INT32_MAX blocks.
             */
            const size_t packed_index =
                (size_t) block_idx * blocks_per_row_group + (size_t) o;
            const uint8_t* block = weight_base + packed_index * Q4K_BYTES_PER_BLOCK;

            /* d, dMin (FP16 LE -> FP32). */
            const uint16_t d_bits = (uint16_t) block[0] | ((uint16_t) block[1] << 8);
            const uint16_t d_min_bits = (uint16_t) block[2] | ((uint16_t) block[3] << 8);
            const float d = skainet_half_to_float(d_bits);
            const float d_min = skainet_half_to_float(d_min_bits);

            /* 12 bytes of packed (scaleIdx, minIdx) -> 8 ints each. */
            skainet_q4k_decode_scales(block + 4, scale_idx, min_idx);

            const uint8_t* qs = block + 16;
            const float* in_block = in_base + (size_t) block_idx * Q4K_BLOCK_SIZE;

            /* 4 strided qs groups; group j carries sub-blocks 2j (lo) and 2j+1 (hi). */
            for (int group_j = 0; group_j < 4; ++group_j) {
                const uint8_t* qs_group = qs + group_j * Q4K_SUB_BLOCK_SIZE;
                const int sb_lo = 2 * group_j;
                const int sb_hi = sb_lo + 1;
                const float* in_lo = in_block + sb_lo * Q4K_SUB_BLOCK_SIZE;
                const float* in_hi = in_block + sb_hi * Q4K_SUB_BLOCK_SIZE;

                float code_sum_lo = 0.0f, input_sum_lo = 0.0f;
                float code_sum_hi = 0.0f, input_sum_hi = 0.0f;

                /* Each byte holds two 4-bit codes: low nibble -> lo sub-block,
                 * high nibble -> hi sub-block. */
                for (int i = 0; i < Q4K_SUB_BLOCK_SIZE; ++i) {
                    const uint8_t b = qs_group[i];
                    const float code_lo = (float)(b & 0x0F);
                    const float code_hi = (float)(b >> 4);
                    const float v_lo = in_lo[i];
                    const float v_hi = in_hi[i];
                    code_sum_lo += v_lo * code_lo;
                    input_sum_lo += v_lo;
                    code_sum_hi += v_hi * code_hi;
                    input_sum_hi += v_hi;
                }

                /* Apply the per-sub-block scale and min once per 32 elements. */
                const float scale_lo = d * (float) scale_idx[sb_lo];
                const float offset_lo = d_min * (float) min_idx[sb_lo];
                const float scale_hi = d * (float) scale_idx[sb_hi];
                const float offset_hi = d_min * (float) min_idx[sb_hi];
                acc += code_sum_lo * scale_lo - input_sum_lo * offset_lo;
                acc += code_sum_hi * scale_hi - input_sum_hi * offset_hi;
            }
        }

        out_base[o] = acc;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,24 @@ import sk.ainet.backend.api.kernel.Q4KMatmulKernel
* Native (FFM) [KernelProvider]. Sits at priority `100`, above
* [PanamaVectorKernelProvider] (`50`) and the scalar reference (`0`).
*
* PR 1 of the staged native-FFM rollout (see the `native-ffm-plan`
* asciidoc) only ships the module scaffolding: the Gradle ↔ CMake
* pipeline that produces a host-arch shared library, its bundling into
* JAR resources, and an end-to-end FFM smoke downcall test. No real
* matmul kernel is wired into the public SPI yet.
* Availability is gated on [NativeQ4KMatmulKernel.isAvailable] — the
* bundled `libskainet_kernels` shared library has to load AND the
* `skainet_q4k_matmul` symbol has to resolve via FFM. When either
* fails (missing arch, sandbox, JDK without FFM, kill-switch),
* `KernelRegistry.bestAvailable()` cleanly cascades to
* [PanamaVectorKernelProvider] at priority 50.
*
* Until [NativeQ4KMatmulKernel] (or its `MemSegment`-input sibling)
* lands in PR 2, this provider deliberately reports `isAvailable() =
* false` and returns `null` from every kernel accessor. That keeps
* `KernelRegistry.bestAvailable()` cleanly cascading down to the
* Panama priority-50 provider on every shape we measure today, so
* adding the new module to the classpath produces no behavior change.
* PR 2 of the staged rollout: real Q4_K matmul wired into the SPI.
* `matmulFp32` follows in a later PR alongside a native FP32 kernel.
*/
public object NativeKernelProvider : KernelProvider {
    override val name: String = "native-ffm"
    override val priority: Int = 100

    // Available only when the native Q4_K kernel's downcall handle resolved.
    // The superseded hard-coded `false` override is removed so exactly one
    // declaration of this member remains.
    override fun isAvailable(): Boolean = NativeQ4KMatmulKernel.isAvailable()

    // No native FP32 kernel is wired in; callers receive null.
    override fun matmulFp32(): Fp32MatmulKernel? = null

    // Hands out the native kernel only while it is usable, otherwise null.
    override fun matmulQ4K(): Q4KMatmulKernel? =
        if (NativeQ4KMatmulKernel.isAvailable()) NativeQ4KMatmulKernel else null
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package sk.ainet.exec.kernel

import java.lang.foreign.Arena
import java.lang.foreign.FunctionDescriptor
import java.lang.foreign.Linker
import java.lang.foreign.MemorySegment
import java.lang.foreign.ValueLayout
import java.lang.invoke.MethodHandle
import sk.ainet.backend.api.kernel.Q4KMatmulKernel

/**
* Native (FFM) implementation of [Q4KMatmulKernel].
*
* Wraps the bundled C symbol
*
* void skainet_q4k_matmul(
* const float* input, int32_t input_offset,
* const uint8_t* weight, int32_t weight_byte_offset,
* int32_t input_dim, int32_t output_dim,
* float* output, int32_t output_offset);
*
* The C kernel implements the same lazy-`dmin` accumulation as
* [PanamaVectorQ4KMatmulKernel] (sum input·code and sum input per
* sub-block, combine via `d * scaleIdx[s] * codeSum - dMin * minIdx[s] * inputSum`)
* and shares the canonical 256-element / 144-byte super-block layout.
*
* Numerical parity vs the Panama kernel is asserted by
* [NativeQ4KMatmulKernelParityTest] within `1e-4` relative tolerance,
* matching the parity bar `PanamaVectorQ4KMatmulKernelTest` uses.
*
* PR 2 of the staged native-FFM rollout: ships a single-threaded
* scalar C kernel (`-O3 -ffast-math`, auto-vectorized inner loop).
* NEON / AVX2 intrinsics, `MemorySegment`-input zero-copy variant,
* and cross-arch CI shipping are deferred to PRs 3–5.
*/
internal object NativeQ4KMatmulKernel : Q4KMatmulKernel {

    /** Elements per Q4_K super-block. */
    private const val BLOCK_SIZE = 256

    /** Bytes per packed Q4_K super-block. */
    private const val BYTES_PER_BLOCK = 144L

    /** True when the `skainet_q4k_matmul` downcall handle was resolved. */
    fun isAvailable(): Boolean = handle != null

    /**
     * Copies the input window and the packed weight window into
     * arena-allocated segments, invokes the native kernel, and copies
     * [outputDim] floats back into [output] starting at [outputOffset].
     *
     * @throws IllegalArgumentException if [inputDim] is not a multiple of
     *   256, if either dimension is negative, or if the packed weight
     *   window would exceed what a [ByteArray] can address.
     * @throws IllegalStateException if the native library is unavailable.
     */
    override fun matmul(
        input: FloatArray, inputOffset: Int,
        weight: ByteArray, weightByteOffset: Int,
        inputDim: Int, outputDim: Int,
        output: FloatArray, outputOffset: Int,
    ) {
        require(inputDim % BLOCK_SIZE == 0) {
            "NativeQ4KMatmulKernel: inputDim must be a multiple of $BLOCK_SIZE; got $inputDim"
        }
        if (outputDim == 0 || inputDim == 0) return
        val mh = handle
            ?: error("NativeQ4KMatmulKernel.matmul invoked while native library unavailable")

        // Negative sizes would otherwise surface as an opaque failure
        // inside Arena.allocate; reject them with a descriptive message.
        require(inputDim > 0 && outputDim > 0) {
            "NativeQ4KMatmulKernel: dimensions must be non-negative; " +
                "got inputDim=$inputDim, outputDim=$outputDim"
        }

        // Computed in Long so the product cannot wrap; the copy below takes
        // an Int element count, so an oversized window must fail loudly
        // rather than be truncated by toInt().
        val weightBytesUsed = (inputDim / BLOCK_SIZE).toLong() * outputDim * BYTES_PER_BLOCK
        require(weightBytesUsed <= Int.MAX_VALUE) {
            "NativeQ4KMatmulKernel: packed weight window of $weightBytesUsed bytes " +
                "exceeds the maximum ByteArray size"
        }

        // The native kernel writes outputDim floats and only reads
        // inputDim floats + (inputDim/256)*outputDim*144 weight bytes,
        // so the segments size exactly to those windows. Heap-array
        // segments would also work but allocating off-heap copies keeps
        // the native side oblivious to the JVM heap layout (and lets
        // the same wrapper take MemorySegment-backed inputs in PR 3).
        Arena.ofConfined().use { arena ->
            val inSeg = arena.allocate(
                inputDim.toLong() * java.lang.Float.BYTES,
                ValueLayout.JAVA_FLOAT.byteAlignment(),
            )
            val outSeg = arena.allocate(
                outputDim.toLong() * java.lang.Float.BYTES,
                ValueLayout.JAVA_FLOAT.byteAlignment(),
            )
            val weightSeg = arena.allocate(weightBytesUsed, 1L)

            MemorySegment.copy(input, inputOffset, inSeg, ValueLayout.JAVA_FLOAT, 0L, inputDim)
            MemorySegment.copy(weight, weightByteOffset, weightSeg, ValueLayout.JAVA_BYTE, 0L, weightBytesUsed.toInt())

            // Offsets are zero on the native side because the copies above
            // already rebased each window to the start of its segment.
            mh.invoke(
                inSeg, 0,
                weightSeg, 0,
                inputDim, outputDim,
                outSeg, 0,
            )

            MemorySegment.copy(outSeg, ValueLayout.JAVA_FLOAT, 0L, output, outputOffset, outputDim)
        }
    }

    /**
     * Lazily resolved downcall handle; null when the library lookup, the
     * symbol lookup, or the linker call fails.
     */
    private val handle: MethodHandle? by lazy {
        val lookup = NativeLibraryLoader.lookup() ?: return@lazy null
        val symbol = lookup.find("skainet_q4k_matmul").orElse(null) ?: return@lazy null
        val descriptor = FunctionDescriptor.ofVoid(
            ValueLayout.ADDRESS,   // input
            ValueLayout.JAVA_INT,  // input_offset
            ValueLayout.ADDRESS,   // weight
            ValueLayout.JAVA_INT,  // weight_byte_offset
            ValueLayout.JAVA_INT,  // input_dim
            ValueLayout.JAVA_INT,  // output_dim
            ValueLayout.ADDRESS,   // output
            ValueLayout.JAVA_INT,  // output_offset
        )
        runCatching { Linker.nativeLinker().downcallHandle(symbol, descriptor) }.getOrNull()
    }
}
Loading
Loading