Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions .github/workflows/native-cpu-multiarch.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
name: Native CPU multi-arch build

# Cross-arch CI for the FFM native kernel provider
# (skainet-backends/skainet-backend-native-cpu). The local Gradle build
# only produces a host-arch .so/.dylib/.dll; this workflow proves the
# CMake + Kotlin pipeline works on every supported host so consumers
# on Apple Silicon, ARM Linux, and Windows aren't silently broken when
# they pull a published JAR built on x86_64 Linux.
#
# Each matrix job runs the native module's jvmTest end-to-end (CMake
# configure + build + bundle into JAR resources + parity tests via
# FFM downcall). The built shared library is uploaded as an artifact
# so a later "fat-JAR" aggregation step (deferred to a follow-up PR)
# can stage all four arches into one publishable artifact.

# Only run when the native module, the kernel API it implements, or this
# workflow file itself changes.
on:
  push:
    branches: [main, develop]
    paths:
      - 'skainet-backends/skainet-backend-native-cpu/**'
      - 'skainet-backends/skainet-backend-api/src/jvmMain/kotlin/sk/ainet/backend/api/kernel/**'
      - '.github/workflows/native-cpu-multiarch.yml'
  pull_request:
    paths:
      - 'skainet-backends/skainet-backend-native-cpu/**'
      - 'skainet-backends/skainet-backend-api/src/jvmMain/kotlin/sk/ainet/backend/api/kernel/**'
      - '.github/workflows/native-cpu-multiarch.yml'

# Least privilege: every step below only reads the checked-out sources and
# uploads build artifacts; nothing pushes, comments, or publishes.
permissions:
  contents: read

# One in-flight run per workflow + ref; a newer push supersedes the older run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  native-build-test:
    name: ${{ matrix.arch_label }}
    strategy:
      # Let every platform finish and report on its own instead of
      # cancelling the remaining jobs on the first failure.
      fail-fast: false
      matrix:
        # arch_label doubles as the resource sub-directory and artifact
        # suffix; lib_name is the platform-specific shared-library file name.
        include:
          - os: ubuntu-latest
            arch_label: linux-x86_64
            lib_name: libskainet_kernels.so
          - os: ubuntu-24.04-arm
            arch_label: linux-arm64
            lib_name: libskainet_kernels.so
          - os: macos-14
            arch_label: macos-arm64
            lib_name: libskainet_kernels.dylib
          - os: windows-latest
            arch_label: windows-x86_64
            lib_name: skainet_kernels.dll
    runs-on: ${{ matrix.os }}
    timeout-minutes: 30

    steps:
      - name: Checkout
        uses: actions/checkout@v6

      # The CI gradle.properties is installed into the user-level Gradle
      # home; Unix and Windows need different shell commands for that.
      - name: Copy CI gradle.properties (Unix)
        if: runner.os != 'Windows'
        run: |
          mkdir -p ~/.gradle
          cp .github/ci-gradle.properties ~/.gradle/gradle.properties

      - name: Copy CI gradle.properties (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          New-Item -ItemType Directory -Force -Path "$HOME\.gradle" | Out-Null
          Copy-Item .github\ci-gradle.properties "$HOME\.gradle\gradle.properties"

      - name: Set up JDK 25
        uses: actions/setup-java@v5
        with:
          distribution: 'zulu'
          java-version: 25

      # Fail early, with a readable log line, if the runner image has no cmake.
      - name: Verify cmake
        run: cmake --version

      - name: Build + test native module (Unix)
        if: runner.os != 'Windows'
        env:
          GRADLE_OPTS: -Dorg.gradle.jvmargs=-Xmx4g -Dfile.encoding=UTF-8
        run: |
          ./gradlew --no-daemon --stacktrace \
            :skainet-backends:skainet-backend-native-cpu:jvmTest \
            :skainet-backends:skainet-backend-native-cpu:jvmJar

      - name: Build + test native module (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        env:
          GRADLE_OPTS: -Dorg.gradle.jvmargs=-Xmx4g -Dfile.encoding=UTF-8
        run: |
          .\gradlew.bat --no-daemon --stacktrace `
            :skainet-backends:skainet-backend-native-cpu:jvmTest `
            :skainet-backends:skainet-backend-native-cpu:jvmJar

      # Only publish the library when the build and tests passed; a missing
      # file is treated as an error rather than silently skipped.
      - name: Upload native library
        if: success()
        uses: actions/upload-artifact@v7
        with:
          name: libskainet_kernels-${{ matrix.arch_label }}
          path: skainet-backends/skainet-backend-native-cpu/build/native/resources/native/${{ matrix.arch_label }}/${{ matrix.lib_name }}
          if-no-files-found: error
          retention-days: 14

      # Always upload reports so failing runs can be diagnosed.
      - name: Upload test reports
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: native-cpu-test-reports-${{ matrix.arch_label }}
          path: |
            skainet-backends/skainet-backend-native-cpu/build/reports/tests/**
            skainet-backends/skainet-backend-native-cpu/build/test-results/**
          retention-days: 14
13 changes: 11 additions & 2 deletions skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ if(WIN32)
set_target_properties(skainet_kernels PROPERTIES PREFIX "")
endif()

# Hide non-exported symbols on ELF / Mach-O for a smaller surface area
# and let the compiler auto-vectorize the Q4_K hot loop.
# Per-compiler tuning. The Q4_K kernel hot loop is straight-line FP
# arithmetic that auto-vectorizes cleanly under aggressive optimization
# (AVX2 on x86_64, NEON on ARM64). Symbol visibility is also handled
# here for ELF / Mach-O; on Windows, DLL symbols are private unless
# explicitly exported and the SKAINET_API macro adds dllexport, so no
# extra compiler flag is needed there.
if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
target_compile_options(skainet_kernels PRIVATE
-fvisibility=hidden
Expand All @@ -35,4 +38,10 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
-funroll-loops
)
set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden)
elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC")
target_compile_options(skainet_kernels PRIVATE
/O2
/fp:fast
/W3
)
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,17 @@
# define SKAINET_API
#endif

/* Portable "restrict" qualifier.
 *
 * The bare C99 keyword `restrict` cannot be spelled directly here: it is
 * not a keyword in C++ (this file carries a __cplusplus guard, so it may be
 * compiled as C++), and MSVC only accepts it in C when /std:c11 or later is
 * selected. The vendor spellings below work in every mode of their
 * respective compilers:
 *   - GNU / Clang (including clang-cl): __restrict__
 *   - MSVC:                             __restrict
 * Any other compiler gets the standard keyword when it advertises C99 or
 * newer, and otherwise an empty expansion. Dropping the qualifier is always
 * safe: it is an optimization hint and never changes the meaning of a
 * correct program. */
#if defined(__GNUC__) || defined(__clang__)
#  define SKAINET_RESTRICT __restrict__
#elif defined(_MSC_VER)
#  define SKAINET_RESTRICT __restrict
#elif !defined(__cplusplus) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
#  define SKAINET_RESTRICT restrict
#else
#  define SKAINET_RESTRICT
#endif

#ifdef __cplusplus
extern "C" {
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,13 @@ static inline void skainet_q4k_decode_scales(
* codeSum/inputSum accumulators on AVX2/NEON.
*/
SKAINET_API void skainet_q4k_matmul(
const float* __restrict__ input,
const float* SKAINET_RESTRICT input,
int32_t input_offset,
const uint8_t* __restrict__ weight,
const uint8_t* SKAINET_RESTRICT weight,
int32_t weight_byte_offset,
int32_t input_dim,
int32_t output_dim,
float* __restrict__ output,
float* SKAINET_RESTRICT output,
int32_t output_offset
) {
if (output_dim <= 0 || input_dim <= 0) return;
Expand Down
Loading