Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 43 additions & 47 deletions .github/workflows/build-llama.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: build-llama

# Vendor prebuild: builds llama-server for every supported platform,
# Vendor prebuild: builds llama-server for every supported platform
# (linux amd64/arm64 + darwin arm64; Windows support is dropped for now),
# publishes the binary to GHCR (when permissions allow), and emits an
# actions/upload-artifact so callers via workflow_call can consume the
# binary without going through GHCR. build.yml and release.yaml call
Expand All @@ -17,7 +18,11 @@ name: build-llama
# - push to main on the same paths: build AND push to GHCR.
#
# Artifacts:
# GHCR: ghcr.io/<repo-lowercase>/llama-server:<llama-cpp-ref>-<os>-<arch>
# GHCR: ghcr.io/<repo-lowercase>/llama-server:<ref>-<os-type>-<os-version>-<arch>
# e.g. ...-ubuntu-2204-amd64 / ...-mac-14-arm64.
# OS type+version are read from the running image (not the runner
# label, which can be a rolling alias), so an OS/glibc bump busts the
# cache and triggers a fresh build.
# Workflow artifact: llama-server-<os>-<arch> containing the binary.

on:
Expand Down Expand Up @@ -61,10 +66,9 @@ jobs:
fail-fast: false
matrix:
include:
- { runner: macos-14, os: darwin, arch: arm64, kind: unix }
- { runner: ubuntu-latest, os: linux, arch: amd64, kind: unix }
- { runner: ubuntu-24.04-arm, os: linux, arch: arm64, kind: unix }
- { runner: windows-latest, os: windows, arch: amd64, kind: windows }
- { runner: macos-14, os: darwin, arch: arm64 }
- { runner: ubuntu-22.04, os: linux, arch: amd64 }
- { runner: ubuntu-22.04-arm, os: linux, arch: arm64 }

steps:
- uses: actions/checkout@v4
Expand All @@ -83,8 +87,35 @@ jobs:
id: tag
shell: bash
run: |
set -euo pipefail
ref_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
echo "tag=ghcr.io/${ref_repo}/llama-server:${{ steps.ref.outputs.ref }}-${{ matrix.os }}-${{ matrix.arch }}" >> "$GITHUB_OUTPUT"
# Tag layout: <ref>-<os-type>-<os-version>-<arch>, e.g.
# b9352-ubuntu-2204-amd64 / b9352-mac-14-arm64
# OS type+version are read from the running image (not the runner
# label, which can be a rolling alias like ubuntu-latest) so the
# build baseline — most importantly the linux glibc version — is part
# of the cache key. An OS bump then busts the cache and forces a
# fresh build instead of reusing a binary built against another glibc.
case "${RUNNER_OS}" in
Linux)
. /etc/os-release
os_type="${ID}" # ubuntu
os_version="${VERSION_ID//./}" # 22.04 -> 2204
;;
macOS)
os_type="mac"
os_version="$(sw_vers -productVersion | cut -d. -f1)" # 14
;;
*)
echo "::error::unhandled RUNNER_OS=${RUNNER_OS}" >&2
exit 1
;;
esac
if [ -z "${os_type:-}" ] || [ -z "${os_version:-}" ]; then
echo "::error::could not resolve OS type/version for the tag (RUNNER_OS=${RUNNER_OS})" >&2
exit 1
fi
echo "tag=ghcr.io/${ref_repo}/llama-server:${{ steps.ref.outputs.ref }}-${os_type}-${os_version}-${{ matrix.arch }}" >> "$GITHUB_OUTPUT"

# ─── GHCR login + existence check ─────────────────────────────────
- name: Install oras
Expand Down Expand Up @@ -128,7 +159,7 @@ jobs:

# ─── Path 1: Pull existing binary from GHCR ─────────────────────────
- name: Pull llama-server from GHCR
if: steps.plan.outputs.skip_build == 'true' && matrix.kind == 'unix'
if: steps.plan.outputs.skip_build == 'true'
shell: bash
run: |
set -euo pipefail
Expand All @@ -137,60 +168,25 @@ jobs:
chmod +x bin/llama-server
bin/llama-server --version

- name: Pull llama-server.exe from GHCR
if: steps.plan.outputs.skip_build == 'true' && matrix.kind == 'windows'
shell: pwsh
run: |
New-Item -ItemType Directory -Force -Path bin | Out-Null
oras pull --output bin "${{ steps.tag.outputs.tag }}"

# ─── Path 2: Build llama-server fresh ─────────────────────────────
- name: Clone llama.cpp (windows)
if: steps.plan.outputs.skip_build != 'true' && matrix.kind == 'windows'
shell: bash
run: |
mkdir -p third_party
git clone --depth 1 --branch ${{ steps.ref.outputs.ref }} \
https://github.com/ggml-org/llama.cpp third_party/llama.cpp

- name: Build llama-server (unix)
if: steps.plan.outputs.skip_build != 'true' && matrix.kind == 'unix'
- name: Build llama-server
if: steps.plan.outputs.skip_build != 'true'
run: make llama-server

- name: Build llama-server.exe (windows)
if: steps.plan.outputs.skip_build != 'true' && matrix.kind == 'windows'
shell: pwsh
run: |
cmake -B third_party/llama.cpp/build -S third_party/llama.cpp -C ci/llama-cpp-cache.cmake
# -j 2 caps parallel compile jobs to avoid OOM on 16GB runners.
cmake --build third_party/llama.cpp/build --target llama-server -j 2 --config Release
New-Item -ItemType Directory -Force -Path bin | Out-Null
Copy-Item third_party/llama.cpp/build/bin/Release/llama-server.exe bin/

- name: Sanity check binary
if: matrix.kind == 'unix'
shell: bash
run: bin/llama-server --version

# ─── GHCR push (only when we built fresh AND we have write perms) ────
- name: Push llama-server to GHCR (unix)
if: steps.plan.outputs.skip_build != 'true' && github.event_name != 'pull_request' && matrix.kind == 'unix'
- name: Push llama-server to GHCR
if: steps.plan.outputs.skip_build != 'true' && github.event_name != 'pull_request'
shell: bash
working-directory: bin
run: |
oras push "${{ steps.tag.outputs.tag }}" \
--artifact-type "application/vnd.runed.llama-server" \
llama-server

- name: Push llama-server.exe to GHCR (windows)
if: steps.plan.outputs.skip_build != 'true' && github.event_name != 'pull_request' && matrix.kind == 'windows'
shell: pwsh
working-directory: bin
run: |
oras push "${{ steps.tag.outputs.tag }}" `
--artifact-type "application/vnd.runed.llama-server" `
llama-server.exe

# ─── Workflow artifact (always — consumed by build.yml / release.yaml) ─
- name: Upload llama-server artifact
uses: actions/upload-artifact@v4
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ jobs:
matrix:
include:
- { runner: macos-14, goos: darwin, goarch: arm64, smoke: true }
- { runner: ubuntu-latest, goos: linux, goarch: amd64, smoke: true }
- { runner: ubuntu-24.04-arm, goos: linux, goarch: arm64, smoke: true }
- { runner: ubuntu-22.04, goos: linux, goarch: amd64, smoke: true }
- { runner: ubuntu-22.04-arm, goos: linux, goarch: arm64, smoke: true }
runs-on: ${{ matrix.runner }}
steps:
- uses: actions/checkout@v4
Expand Down
67 changes: 27 additions & 40 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,10 @@ jobs:
matrix:
include:
# Full-stack tarballs: runed + rundemo + llama-server.
- runner: macos-14
goos: darwin
goarch: arm64
kind: tarball
- runner: ubuntu-latest
goos: linux
goarch: amd64
kind: tarball
- runner: ubuntu-24.04-arm
goos: linux
goarch: arm64
kind: tarball
# Windows: llama-server.exe only zip (runed Windows support deferred).
- runner: windows-latest
goos: windows
goarch: amd64
kind: llama-only
# (Windows support is dropped for now — no Windows target here.)
- { runner: macos-14, goos: darwin, goarch: arm64 }
- { runner: ubuntu-22.04, goos: linux, goarch: amd64 }
- { runner: ubuntu-22.04-arm, goos: linux, goarch: arm64 }

steps:
- uses: actions/checkout@v4
Expand All @@ -73,25 +60,40 @@ jobs:
name: llama-server-${{ matrix.goos }}-${{ matrix.goarch }}
path: bin/

- name: Ensure llama-server executable (unix)
if: matrix.kind == 'tarball'
- name: Ensure llama-server executable
shell: bash
run: chmod +x bin/llama-server

# ─── Smoke gate before packaging (tarball matrix only) ──────────
# Build OS label (ubuntu-2204 / mac-14) for the release asset name, read
# from the running image so it reflects the actual build baseline/glibc.
- name: Resolve build OS label
id: osinfo
shell: bash
run: |
set -euo pipefail
case "${RUNNER_OS}" in
Linux)
. /etc/os-release
echo "label=${ID}-${VERSION_ID//./}" >> "$GITHUB_OUTPUT" # ubuntu-2204
;;
macOS)
echo "label=mac-$(sw_vers -productVersion | cut -d. -f1)" >> "$GITHUB_OUTPUT" # mac-14
;;
*)
echo "::error::unhandled RUNNER_OS=${RUNNER_OS}" >&2; exit 1 ;;
esac

# ─── Smoke gate before packaging ────────────────────────────────
- uses: actions/setup-go@v5
if: matrix.kind == 'tarball'
with:
go-version-file: go.mod
check-latest: true

- uses: bufbuild/buf-setup-action@v1
if: matrix.kind == 'tarball'
with:
github_token: ${{ secrets.GITHUB_TOKEN }}

- name: Read embedding model pin
if: matrix.kind == 'tarball'
id: model
shell: bash
run: |
Expand All @@ -100,14 +102,12 @@ jobs:
echo "file=$(awk '/^file:/ {print $2}' .embedding-model.yaml)" >> "$GITHUB_OUTPUT"

- name: Cache embedding model
if: matrix.kind == 'tarball'
uses: actions/cache@v4
with:
path: models
key: embedding-model-${{ steps.model.outputs.sha256 }}

- name: Fetch embedding model (cache miss only)
if: matrix.kind == 'tarball'
shell: bash
env:
FILE: ${{ steps.model.outputs.file }}
Expand All @@ -122,15 +122,13 @@ jobs:
echo "${SHA} models/${FILE}" | shasum -a 256 -c -

- name: Run integration tests (release gate)
if: matrix.kind == 'tarball'
env:
RUNED_TEST_LLAMA_SERVER: ${{ github.workspace }}/bin/llama-server
RUNED_TEST_GGUF: ${{ github.workspace }}/models/${{ steps.model.outputs.file }}
run: go test -race -v ./internal/backend ./internal/server -run 'EmbedReturns|ReturnsVector'

# ─── Build runed + package tarball ──────────────────────────────
- name: Build runed binaries
if: matrix.kind == 'tarball'
env:
GOOS: ${{ matrix.goos }}
GOARCH: ${{ matrix.goarch }}
Expand All @@ -139,24 +137,13 @@ jobs:
run: make build

- name: Package tarball
if: matrix.kind == 'tarball'
env:
GOOS: ${{ matrix.goos }}
GOARCH: ${{ matrix.goarch }}
OS_LABEL: ${{ steps.osinfo.outputs.label }}
VERSION: ${{ steps.meta.outputs.version }}
run: make release-tarball

# ─── Windows: llama-server.exe zip ──────────────────────────────
- name: Package llama-server zip (windows)
if: matrix.kind == 'llama-only'
shell: pwsh
run: |
New-Item -ItemType Directory -Force -Path dist | Out-Null
$name = "llama-server-${{ steps.meta.outputs.version }}-${{ matrix.goos }}-${{ matrix.goarch }}.zip"
Compress-Archive -Path bin/llama-server.exe -DestinationPath "dist/$name"
$hash = (Get-FileHash "dist/$name" -Algorithm SHA256).Hash.ToLower()
"$hash $name" | Out-File -Encoding ascii "dist/$name.sha256"

- uses: actions/upload-artifact@v4
with:
name: release-${{ matrix.goos }}-${{ matrix.goarch }}
Expand Down Expand Up @@ -224,4 +211,4 @@ jobs:
--title "${{ steps.meta.outputs.version }}" \
--generate-notes \
--prerelease \
manifest.json *.tar.gz *.zip *.sha256
manifest.json *.tar.gz *.sha256
19 changes: 11 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ VERSION ?= v0.1.0-alpha
GOOS ?= $(shell go env GOOS)
GOARCH ?= $(shell go env GOARCH)

# OS_LABEL names the build OS in the release tarball (e.g. ubuntu-2204, mac-14)
# so the artifact records its glibc/SDK baseline. CI passes the running image's
# real identity; local `make release-tarball` falls back to GOOS.
OS_LABEL ?= $(GOOS)

LLAMA_CPP_REF := $(shell cat .llama-cpp-version)
LLAMA_CPP_DIR := third_party/llama.cpp
LLAMA_CPP_CACHE := $(CURDIR)/ci/llama-cpp-cache.cmake
Expand Down Expand Up @@ -57,18 +62,17 @@ build: proto
-o bin/rundemo ./cmd/rundemo

# Clone (shallow) and CPU-build llama-server at the pinned ref.
# Unix-only target — Windows CI invokes cmake directly because make/sh aren't
# the natural toolchain there. Reentrant: skips git clone if the directory
# already exists at the right ref, and skips cmake if the binary is fresh.
# Reentrant: skips git clone if the directory already exists at the right ref,
# and skips cmake if the binary is fresh.
llama-server:
@if [ ! -d "$(LLAMA_CPP_DIR)/.git" ]; then \
mkdir -p $(dir $(LLAMA_CPP_DIR)); \
git clone --depth 1 --branch $(LLAMA_CPP_REF) \
https://github.com/ggml-org/llama.cpp $(LLAMA_CPP_DIR); \
fi
cmake -B $(LLAMA_CPP_DIR)/build -S $(LLAMA_CPP_DIR) -C $(LLAMA_CPP_CACHE) $(LLAMA_CMAKE_EXTRA)
# -j2 caps parallel compile jobs: ubuntu-latest (16GB / 4 vCPU) OOM-killed
# the build at -j auto. ubuntu-24.04-arm and macos-14 survived but the
# -j2 caps parallel compile jobs: ubuntu-22.04 (16GB / 4 vCPU) OOM-killed
# the build at -j auto. ubuntu-22.04-arm and macos-14 survived but the
# bound is uniform across matrices for predictability.
cmake --build $(LLAMA_CPP_DIR)/build --target llama-server -j 2 --config Release
mkdir -p bin
Expand All @@ -81,11 +85,10 @@ clean:
rm -rf bin/ gen/ dist/

# Packages the Go binaries plus llama-server into a single tarball.
# Assumes `make build` and `make llama-server` (or the workflow's Windows
# equivalent) have already populated bin/.
# Assumes `make build` and `make llama-server` have already populated bin/.
release-tarball:
mkdir -p dist
TARNAME=runed-$(VERSION)-$(GOOS)-$(GOARCH).tar.gz; \
TARNAME=runed-$(VERSION)-$(OS_LABEL)-$(GOARCH).tar.gz; \
tar -czf dist/$$TARNAME -C bin runed rundemo llama-server; \
cd dist && ( \
(command -v shasum >/dev/null 2>&1 && shasum -a 256 $$TARNAME > $$TARNAME.sha256) \
Expand Down
Loading