diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index df08a9f4..972fdd7d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,22 +1,106 @@ name: release +# Tag-triggered Maven Central release. +# +# Two-phase flow so the published JAR for skainet-backend-native-cpu +# carries every supported native lib (.so / .dylib / .dll) regardless +# of which OS hosts the publish step: +# +# 1. build-native — matrix job: each runner builds its own host's +# libskainet_kernels via CMake, uploads the resulting binary as +# an artifact named `native-{arch_label}`. fail-fast stays on so a +# missing arch aborts the release rather than shipping a partial +# fat JAR. +# +# 2. publish — runs on macOS (signing tooling is wired up there), +# downloads every native artifact, stages them into the native +# module's resources tree (`build/native/resources/native/{arch_label}/`), +# then runs `./gradlew publish`. Gradle's own CMake step rebuilds +# for the macOS host into native/macos-arm64/; the pre-staged libs +# for the other arches sit in their own subdirs and survive. +# resources.srcDir(nativeResourcesRoot) on jvmMain picks them all +# up into the published JAR. +# +# Linux ARM64 is intentionally absent: Kotlin/Native plugin 2.3.21 +# doesn't support `linux aarch64` as a HOST target ("Unknown host +# target" — see SKaiNET PR #577). Linux ARM64 consumers fall back +# cleanly to the Panama priority-50 provider. 
+ on: push: tags: - '**' jobs: + build-native: + name: native ${{ matrix.arch_label }} + strategy: + fail-fast: true + matrix: + include: + - os: ubuntu-latest + arch_label: linux-x86_64 + lib_name: libskainet_kernels.so + - os: macos-14 + arch_label: macos-arm64 + lib_name: libskainet_kernels.dylib + - os: windows-latest + arch_label: windows-x86_64 + lib_name: skainet_kernels.dll + runs-on: ${{ matrix.os }} + timeout-minutes: 30 + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set up JDK 25 + uses: actions/setup-java@v5 + with: + distribution: 'zulu' + java-version: 25 + + - name: Verify cmake + run: cmake --version + + - name: Build native lib (Unix) + if: runner.os != 'Windows' + env: + GRADLE_OPTS: -Dorg.gradle.jvmargs=-Xmx4g -Dfile.encoding=UTF-8 + run: | + ./gradlew --no-daemon --stacktrace --no-configuration-cache \ + :skainet-backends:skainet-backend-native-cpu:packageNativeKernels + + - name: Build native lib (Windows) + if: runner.os == 'Windows' + shell: pwsh + env: + GRADLE_OPTS: -Dorg.gradle.jvmargs=-Xmx4g -Dfile.encoding=UTF-8 + run: | + .\gradlew.bat --no-daemon --stacktrace --no-configuration-cache ` + :skainet-backends:skainet-backend-native-cpu:packageNativeKernels + + - name: Upload native artifact + uses: actions/upload-artifact@v7 + with: + name: native-${{ matrix.arch_label }} + path: skainet-backends/skainet-backend-native-cpu/build/native/resources/native/${{ matrix.arch_label }}/${{ matrix.lib_name }} + if-no-files-found: error + retention-days: 14 + publish: name: Release build and publish + needs: build-native runs-on: macOS-latest steps: - name: Check out code uses: actions/checkout@v6 + - name: Set up JDK 25 uses: actions/setup-java@v5 with: distribution: 'zulu' java-version: 25 + - name: Validate signing configuration run: | if ! 
grep -Eq '^[[:space:]]*signAllPublications[[:space:]]*=[[:space:]]*true[[:space:]]*$' gradle.properties; then @@ -25,10 +109,35 @@ jobs: grep -n 'signAllPublications' gradle.properties || echo "No signAllPublications property found" >&2 exit 1 fi + + - name: Download cross-arch native artifacts + uses: actions/download-artifact@v4 + with: + path: native-artifacts + # Every `native-*` artifact from the build-native matrix; the staging step below expects one native-artifacts/native-{arch}/ directory per artifact. NOTE(review): build-native uploads with upload-artifact@v7 while this step pins download-artifact@v4 — confirm the pairing is intended. + pattern: native-* + merge-multiple: false + + - name: Stage cross-arch native libs into module resources + run: | + set -euo pipefail + DEST="skainet-backends/skainet-backend-native-cpu/build/native/resources/native" + for arch in linux-x86_64 macos-arm64 windows-x86_64; do + src_dir="native-artifacts/native-${arch}" + if [ ! -d "$src_dir" ]; then + echo "Missing native artifact for ${arch}" >&2 + exit 1 + fi + mkdir -p "${DEST}/${arch}" + cp -v "${src_dir}"/* "${DEST}/${arch}/" + done + echo "--- Staged tree ---" + find "$DEST" -type f + - name: Publish to MavenCentral run: ./gradlew publish --no-configuration-cache --stacktrace env: ORG_GRADLE_PROJECT_mavenCentralUsername: ${{ secrets.MAVEN_CENTRAL_USERNAME }} ORG_GRADLE_PROJECT_mavenCentralPassword: ${{ secrets.MAVEN_CENTRAL_PASSWORD }} ORG_GRADLE_PROJECT_signingInMemoryKey: ${{ secrets.GPG_PRIVATE_KEY }} - ORG_GRADLE_PROJECT_signingInMemoryKeyPassword: ${{ secrets.SIGNING_PASSWORD }} \ No newline at end of file + ORG_GRADLE_PROJECT_signingInMemoryKeyPassword: ${{ secrets.SIGNING_PASSWORD }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 44dda453..1436693f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,36 @@ ## [Unreleased] +## [0.22.0] - 2026-04-30 + +### Added + +#### Native (FFM) CPU kernel provider — M5 milestone closed + +This release closes milestone M5 of the JVM inference performance roadmap with a priority-100 native kernel provider that wraps a bundled C shared library via Java's Foreign Function & Memory API. 
Plugs into the existing `KernelProvider` SPI so `KernelRegistry.bestAvailable()` automatically routes Q4_K and FP32 matmul through native when the lib loads, falling back cleanly to the priority-50 Panama Vector kernels otherwise. + +- **`skainet-backend-native-cpu` module** — new JVM-only KMP module wrapping a CMake-built shared library (`libskainet_kernels.{so,dylib,dll}`). Bundled into the JAR resources at `native/{os}-{arch}/`, extracted at runtime to a process-scoped temp dir, loaded via `System.load`, and accessed via `Linker.nativeLinker().downcallHandle(...)`. ServiceLoader auto-registers `NativeKernelProviderFactory` via `META-INF/services/sk.ainet.backend.api.kernel.KernelProvider`. (PR #571) +- **Native Q4_K matmul** — single-source scalar C kernel (`-O3 -ffast-math -funroll-loops`); the inner 32-iteration loop auto-vectorizes cleanly into `vfmadd231ps` (AVX2) / `fmla` (NEON). Mirrors `PanamaVectorQ4KMatmulKernel` byte-for-byte on the canonical ggml super-block layout (256 elements / 144 bytes, FP16 d/dMin, 12-byte `get_scale_min_k4` packed sub-scales, 128 bytes of strided 4-bit codes, lazy-`dmin` accumulation). Microbench (Linux x86_64, JDK 21.0.10): **5.87× / 4.71× / 4.17× faster than Panama Vector at 1024² / 2048² / 4096² Q4_K matmul shapes** — single-threaded native beating Panama's `parallelChunks` multi-threaded path on every measured shape. Numerical parity vs Panama within `1e-4` relative tolerance. (PR #572) +- **`Q4KMemSegMatmulKernel` SPI sibling + zero-copy native variant** — JVM-only sibling kernel interface in `skainet-backend-api/jvmMain` taking weights as `MemorySegment` instead of `ByteArray`, plus a JVM-only `MemSegKernelProvider` provider interface that providers can implement alongside `KernelProvider` for the smart-cast lookup pattern at the call site. Reuses the same C symbol as the heap-input kernel — the bytes just don't round-trip through the JVM heap. 
**+20% wall-clock at 4096²** vs the heap-copy path (9 MB weight transfer eliminated); noise-level at smaller shapes. Bit-identical output to the heap variant. (PR #573) +- **Cross-arch CI matrix** — new `.github/workflows/native-cpu-multiarch.yml` builds and tests the native module on `ubuntu-latest`, `macos-14` (Apple Silicon), and `windows-latest` for every push/PR that touches the native module. Catches portability regressions (linker, alignment, compiler-specific syntax) at PR time rather than after release. C portability tightened: `SKAINET_RESTRICT` macro maps to `__restrict__` on GCC/Clang and `__restrict` on MSVC; CMake grows an MSVC compile-flag branch (`/O2 /fp:fast /W3`) alongside the existing GCC/Clang one. Linux ARM64 was attempted but Kotlin/Native plugin 2.3.21 doesn't support `linux aarch64` as a HOST target ("Unknown host target") — left out for now. (PRs #574, #577) +- **Native FP32 SGEMM** — row-major `C(m,n) = A(m,k) * B(k,n)` with stride support, i-p-j outer-product order so the inner `c[j] += a*b[j]` loop streams two contiguous arrays and auto-vectorizes into FMA. Wired into the existing `matmulFp32()` SPI accessor. Microbench at 256³ / 512³ / 1024³: **1.77× / 1.58× / 1.55× faster than `PanamaVectorMatmulKernel`**. The narrower margin vs Q4_K reflects Panama's already-polished FP32 path (tile-blocking + B-pack + `parallelChunks`); native still wins on every measured shape. Numerical parity within `1e-5 * k` relative tolerance. (PR #575) +- **Multi-arch fat JAR publishing** — `.github/workflows/publish.yml` extended to a two-phase flow: a matrix `build-native` job builds `libskainet_kernels` on each supported host (linux-x86_64, macos-arm64, windows-x86_64), and the `publish` job downloads all three artifacts, stages them into the native module's resources tree, and publishes with every supported arch bundled. Consumers on any of the three arches get a working native path out of the box — no manual side-loading. 
+ +#### Module + publishing infrastructure + +- **`skainet-backend-native-cpu` registered in BOM** — `skainet-bom` now constrains the new module alongside `skainet-backend-api` and `skainet-backend-cpu`. Consumers depending on the BOM get a constrained version without a separate pin. (PR #576) +- **Publishing config wired** — `vanniktech.mavenPublish` plugin + per-module `gradle.properties` (POM_ARTIFACT_ID + POM_NAME) on the new module. Composite-build consumers (e.g. SKaiNET-transformers via `includeBuild`) substitute the published coordinates with the local project ref through the same path every other SKaiNET module uses. (PR #576) + +### Documentation + +- **`NativeKernelProvider` consumption kdoc** — covers two gotchas downstream consumers hit on first wiring: (1) the module is JVM-only (FFM has no Native/JS/Wasm equivalents) so KMP consumers must add the dep to `jvmMain.dependencies`, never `commonMain`; (2) `com.gradleup.shadow:9.4.x` `mergeServiceFiles()` silently drops the `NativeKernelProviderFactory` entry when both `skainet-backend-cpu` and `skainet-backend-native-cpu` are on a shadow JAR's classpath — workaround pointer to the `kllama-cli` `doLast` fix in SKaiNET-transformers PR #88. (PR #579) +- **`docs/.../perf/native-ffm-plan.adoc`** — design baseline for the native FFM provider (recovered from the 0.21.0-cycle PRD that was dropped from the repo root and rehomed as asciidoc). Documents module layout, FFM binding pattern, staged delivery, success metrics, and risks. + +### Limitations + +- **Linux ARM64 native lib is not in the published JAR.** Kotlin/Native plugin 2.3.21 doesn't support `linux aarch64` as a HOST target on the runners GitHub provides, so the cross-arch CI matrix excludes it. Linux ARM64 consumers (Raspberry Pi, AWS Graviton) cleanly fall back to the priority-50 Panama Vector provider — no functional regression, just no native speedup. 
Re-add when either the Kotlin/Native plugin gains the host or a self-hosted ARM64 runner is wired in. +- **Shadow-jar consumers** using `com.gradleup.shadow:9.4.x` still need a `doLast` workaround to merge the `META-INF/services/sk.ainet.backend.api.kernel.KernelProvider` entries — see SKaiNET-transformers PR #88's `kllama-cli`/`skainet-cli` fix for the canonical implementation. Spring Boot apps consuming via Maven (BOOT-INF/lib/) are unaffected. + ## [0.21.0] - 2026-04-28 ### Added diff --git a/README.md b/README.md index c9ec7a37..4c79dd6e 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,8 @@ Add the core dependencies (Gradle Kotlin DSL): ```kotlin dependencies { - implementation("sk.ainet.core:SKaiNET-lang-core:0.21.0") - implementation("sk.ainet.core:SKaiNET-backend-cpu:0.21.0") + implementation("sk.ainet.core:SKaiNET-lang-core:0.22.0") + implementation("sk.ainet.core:SKaiNET-backend-cpu:0.22.0") } ``` @@ -137,10 +137,11 @@ SKaiNET is a modular ecosystem. While this repository contains the core engine, --- -## What's New in 0.21.0 +## What's New in 0.22.0 -- **JVM CPU performance — Vector API SIMD across the board.** Pluggable `KernelProvider` SPI with priority-ordered lookup; FP32 matmul tile-blocked at **8.6×–10.8× over scalar**, Q4_K matmul fully SIMD-fused with inline dequant at **~30–73 GFLOPS** on Apple Silicon. Every quantized format we support (Q4_0, Q4_K, Q4_K MemSeg, Q6_K, Q8_0) is now SIMD'd to some degree. -- **`ScratchPool` SPI and `TensorOps.permute(axes)`** — runtime workspace allocator for transient tensors and arbitrary-axis permutation. +- **Native (FFM) CPU kernel provider — M5 milestone closed.** New `skainet-backend-native-cpu` module bundles a hand-tuned C shared library (`-O3 -ffast-math` auto-vectorized into AVX2 / NEON FMA) reachable via FFM downcalls. **4.17×–5.87× faster than Panama Vector on Q4_K matmul** at LLM-typical 1024²–4096² shapes; **1.55×–1.77× faster on FP32 SGEMM** at 256³–1024³. 
Auto-registers via ServiceLoader; `KernelRegistry.bestAvailable()` routes through native when the lib loads, falls through cleanly to the priority-50 Panama provider otherwise. +- **Zero-copy MemSeg path for mmap'd Q4_K weights** — JVM-only `Q4KMemSegMatmulKernel` SPI sibling skips the staged `ByteArray → MemorySegment` copy that costs +20% wall-clock at 4096² shapes. +- **Cross-arch shipping** — published JAR carries native libs for `linux-x86_64`, `macos-arm64`, and `windows-x86_64`. Linux ARM64 consumers cleanly fall back to Panama (Kotlin/Native host limitation tracked). See [CHANGELOG.md](CHANGELOG.md) for the full release history. diff --git a/gradle.properties b/gradle.properties index 466b8289..33f2e360 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,5 +1,5 @@ GROUP=sk.ainet.core -VERSION_NAME=0.22.0-SNAPSHOT +VERSION_NAME=0.22.0 POM_DESCRIPTION=SKaiNET POM_URL=https://github.com/SKaiNET-developers/skainet/