diff --git a/.github/workflows/native-cpu-multiarch.yml b/.github/workflows/native-cpu-multiarch.yml
new file mode 100644
index 00000000..e30a74cc
--- /dev/null
+++ b/.github/workflows/native-cpu-multiarch.yml
@@ -0,0 +1,117 @@
+name: Native CPU multi-arch build
+
+# Cross-arch CI for the FFM native kernel provider
+# (skainet-backends/skainet-backend-native-cpu). The local Gradle build
+# only produces a host-arch .so/.dylib/.dll; this workflow proves the
+# CMake + Kotlin pipeline works on every supported host so consumers
+# on Apple Silicon, ARM Linux, and Windows aren't silently broken when
+# they pull a published JAR built on x86_64 Linux.
+#
+# Each matrix job runs the native module's jvmTest end-to-end (CMake
+# configure + build + bundle into JAR resources + parity tests via
+# FFM downcall). The built shared library is uploaded as an artifact
+# so a later "fat-JAR" aggregation step (deferred to a follow-up PR)
+# can stage all four platform builds into one publishable artifact.
+
+on:
+  push:
+    branches: [main, develop]
+    paths:
+      - 'skainet-backends/skainet-backend-native-cpu/**'
+      - 'skainet-backends/skainet-backend-api/src/jvmMain/kotlin/sk/ainet/backend/api/kernel/**'
+      - '.github/workflows/native-cpu-multiarch.yml'
+  pull_request:
+    paths:
+      - 'skainet-backends/skainet-backend-native-cpu/**'
+      - 'skainet-backends/skainet-backend-api/src/jvmMain/kotlin/sk/ainet/backend/api/kernel/**'
+      - '.github/workflows/native-cpu-multiarch.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  native-build-test:
+    name: ${{ matrix.arch_label }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu-latest
+            arch_label: linux-x86_64
+            lib_name: libskainet_kernels.so
+          - os: ubuntu-24.04-arm
+            arch_label: linux-arm64
+            lib_name: libskainet_kernels.so
+          - os: macos-14
+            arch_label: macos-arm64
+            lib_name: libskainet_kernels.dylib
+          - os: windows-latest
+            arch_label: windows-x86_64
+            lib_name: skainet_kernels.dll
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 30
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Copy CI gradle.properties (Unix)
+        if: runner.os != 'Windows'
+        run: |
+          mkdir -p ~/.gradle
+          cp .github/ci-gradle.properties ~/.gradle/gradle.properties
+
+      - name: Copy CI gradle.properties (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |
+          New-Item -ItemType Directory -Force -Path "$HOME\.gradle" | Out-Null
+          Copy-Item .github\ci-gradle.properties "$HOME\.gradle\gradle.properties"
+
+      - name: Set up JDK 25
+        uses: actions/setup-java@v5
+        with:
+          distribution: 'zulu'
+          java-version: '25'
+
+      - name: Verify cmake
+        run: cmake --version
+
+      - name: Build + test native module (Unix)
+        if: runner.os != 'Windows'
+        env:
+          GRADLE_OPTS: -Dorg.gradle.jvmargs=-Xmx4g -Dfile.encoding=UTF-8
+        run: |
+          ./gradlew --no-daemon --stacktrace \
+            :skainet-backends:skainet-backend-native-cpu:jvmTest \
+            :skainet-backends:skainet-backend-native-cpu:jvmJar
+
+      - name: Build + test native module (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        env:
+          GRADLE_OPTS: -Dorg.gradle.jvmargs=-Xmx4g -Dfile.encoding=UTF-8
+        run: |
+          .\gradlew.bat --no-daemon --stacktrace `
+            :skainet-backends:skainet-backend-native-cpu:jvmTest `
+            :skainet-backends:skainet-backend-native-cpu:jvmJar
+
+      - name: Upload native library
+        if: success()
+        uses: actions/upload-artifact@v7
+        with:
+          name: libskainet_kernels-${{ matrix.arch_label }}
+          path: skainet-backends/skainet-backend-native-cpu/build/native/resources/native/${{ matrix.arch_label }}/${{ matrix.lib_name }}
+          if-no-files-found: error
+          retention-days: 14
+
+      - name: Upload test reports
+        if: ${{ !cancelled() }}  # upload on success or failure, but not for runs cancelled by the concurrency group
+        uses: actions/upload-artifact@v7
+        with:
+          name: native-cpu-test-reports-${{ matrix.arch_label }}
+          path: |
+            skainet-backends/skainet-backend-native-cpu/build/reports/tests/**
+            skainet-backends/skainet-backend-native-cpu/build/test-results/**
+          retention-days: 14
diff --git a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
index f881a78a..f4672e0a 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
+++ b/skainet-backends/skainet-backend-native-cpu/native/CMakeLists.txt
@@ -24,8 +24,11 @@ if(WIN32)
     set_target_properties(skainet_kernels PROPERTIES PREFIX "")
 endif()
 
-# Hide non-exported symbols on ELF / Mach-O for a smaller surface area
-# and let the compiler auto-vectorize the Q4_K hot loop.
+# Per-compiler tuning. The Q4_K kernel hot loop is straight-line FP
+# arithmetic that auto-vectorizes cleanly under aggressive optimization
+# (AVX2 on x86_64, NEON on ARM64). Visibility is also handled here on
+# ELF / Mach-O; on Windows the SKAINET_API macro adds dllexport, and
+# MSVC has no counterpart to -fvisibility, so no extra flag is needed.
 if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
     target_compile_options(skainet_kernels PRIVATE
         -fvisibility=hidden
@@ -35,4 +38,10 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
         -funroll-loops
     )
     set_target_properties(skainet_kernels PROPERTIES C_VISIBILITY_PRESET hidden)
+elseif(CMAKE_C_COMPILER_ID MATCHES "MSVC")
+    target_compile_options(skainet_kernels PRIVATE
+        /O2
+        /fp:fast
+        /W3
+    )
 endif()
diff --git a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h
index 573b2221..c4c6a36d 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h
+++ b/skainet-backends/skainet-backend-native-cpu/native/include/skainet_kernels.h
@@ -11,6 +11,17 @@
 # define SKAINET_API
 #endif
 
+/* Portable "restrict" qualifier: GNU/Clang accept __restrict__,
+ * MSVC accepts __restrict, and the C99 keyword `restrict` is
+ * unreliable across compiler modes. */
+#if defined(__GNUC__) || defined(__clang__)
+# define SKAINET_RESTRICT __restrict__
+#elif defined(_MSC_VER)
+# define SKAINET_RESTRICT __restrict
+#else
+# define SKAINET_RESTRICT
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c b/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
index 88f06fe1..7c742793 100644
--- a/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
+++ b/skainet-backends/skainet-backend-native-cpu/native/src/q4k_matmul.c
@@ -76,13 +76,13 @@ static inline void skainet_q4k_decode_scales(
  * codeSum/inputSum accumulators on AVX2/NEON.
  */
 SKAINET_API void skainet_q4k_matmul(
-    const float* __restrict__ input,
+    const float* SKAINET_RESTRICT input,
     int32_t input_offset,
-    const uint8_t* __restrict__ weight,
+    const uint8_t* SKAINET_RESTRICT weight,
     int32_t weight_byte_offset,
     int32_t input_dim,
     int32_t output_dim,
-    float* __restrict__ output,
+    float* SKAINET_RESTRICT output,
     int32_t output_offset
 ) {
     if (output_dim <= 0 || input_dim <= 0) return;