CryptoLabInc · jh-lee-cryptolab · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.github/workflows/build-llama.yml b/.github/workflows/build-llama.yml
@@ -1,6 +1,7 @@
 name: build-llama
 
-# Vendor prebuild: builds llama-server for every supported platform,
+# Vendor prebuild: builds llama-server for every supported platform
+# (linux amd64/arm64 + darwin arm64; Windows support is dropped for now),
 # publishes the binary to GHCR (when permissions allow), and emits an
 # actions/upload-artifact so callers via workflow_call can consume the
 # binary without going through GHCR. build.yml and release.yaml call
@@ -17,7 +18,11 @@ name: build-llama
 #   - push to main on the same paths: build AND push to GHCR.
 #
 # Artifacts:
-#   GHCR: ghcr.io/<repo-lowercase>/llama-server:<llama-cpp-ref>-<os>-<arch>
+#   GHCR: ghcr.io/<repo-lowercase>/llama-server:<ref>-<os-type>-<os-version>-<arch>
+#         e.g. ...-ubuntu-2204-amd64 / ...-mac-14-arm64.
+#         OS type+version are read from the running image (not the runner
+#         label, which can be a rolling alias), so an OS/glibc bump busts the
+#         cache and triggers a fresh build.
 #   Workflow artifact: llama-server-<os>-<arch> containing the binary.
 
 on:
@@ -61,10 +66,9 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { runner: macos-14,         os: darwin,  arch: arm64, kind: unix    }
-          - { runner: ubuntu-latest,    os: linux,   arch: amd64, kind: unix    }
-          - { runner: ubuntu-24.04-arm, os: linux,   arch: arm64, kind: unix    }
-          - { runner: windows-latest,   os: windows, arch: amd64, kind: windows }
+          - { runner: macos-14,         os: darwin, arch: arm64 }
+          - { runner: ubuntu-22.04,     os: linux,  arch: amd64 }
+          - { runner: ubuntu-22.04-arm, os: linux,  arch: arm64 }
 
     steps:
       - uses: actions/checkout@v4
@@ -83,8 +87,35 @@ jobs:
         id: tag
         shell: bash
         run: |
+          set -euo pipefail
           ref_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
-          echo "tag=ghcr.io/${ref_repo}/llama-server:${{ steps.ref.outputs.ref }}-${{ matrix.os }}-${{ matrix.arch }}" >> "$GITHUB_OUTPUT"
+          # Tag layout: <ref>-<os-type>-<os-version>-<arch>, e.g.
+          #   b9352-ubuntu-2204-amd64 / b9352-mac-14-arm64
+          # OS type+version come from the running image (not the runner label,
+          # which can be a rolling alias like ubuntu-latest) so the build
+          # baseline — notably the linux glibc version — is part of the cache
+          # key: an OS bump yields a new tag and forces a fresh build. Missing
+          # os-release fields default to "" so the guard below reports them.
+          case "${RUNNER_OS}" in
+            Linux)
+              . /etc/os-release
+              os_type="${ID:-}"               # ubuntu (":-" keeps set -u from aborting before the guard)
+              os_version="${VERSION_ID:-}"; os_version="${os_version//./}"  # 22.04 -> 2204
+              ;;
+            macOS)
+              os_type="mac"
+              os_version="$(sw_vers -productVersion | cut -d. -f1)"  # 14
+              ;;
+            *)
+              echo "::error::unhandled RUNNER_OS=${RUNNER_OS}" >&2
+              exit 1
+              ;;
+          esac
+          if [ -z "${os_type:-}" ] || [ -z "${os_version:-}" ]; then
+            echo "::error::could not resolve OS type/version for the tag (RUNNER_OS=${RUNNER_OS})" >&2
+            exit 1
+          fi
+          echo "tag=ghcr.io/${ref_repo}/llama-server:${{ steps.ref.outputs.ref }}-${os_type}-${os_version}-${{ matrix.arch }}" >> "$GITHUB_OUTPUT"
 
       # ─── GHCR login + existence check ─────────────────────────────────
       - name: Install oras
@@ -128,7 +159,7 @@ jobs:
 
       # ─── Path 1: Pull existing binary from GHCR ─────────────────────────
       - name: Pull llama-server from GHCR
-        if: steps.plan.outputs.skip_build == 'true' && matrix.kind == 'unix'
+        if: steps.plan.outputs.skip_build == 'true'
         shell: bash
         run: |
           set -euo pipefail
@@ -137,60 +168,25 @@ jobs:
           chmod +x bin/llama-server
           bin/llama-server --version
 
-      - name: Pull llama-server.exe from GHCR
-        if: steps.plan.outputs.skip_build == 'true' && matrix.kind == 'windows'
-        shell: pwsh
-        run: |
-          New-Item -ItemType Directory -Force -Path bin | Out-Null
-          oras pull --output bin "${{ steps.tag.outputs.tag }}"
-
       # ─── Path 2: Build llama-server fresh ─────────────────────────────
-      - name: Clone llama.cpp (windows)
-        if: steps.plan.outputs.skip_build != 'true' && matrix.kind == 'windows'
-        shell: bash
-        run: |
-          mkdir -p third_party
-          git clone --depth 1 --branch ${{ steps.ref.outputs.ref }} \
-            https://github.com/ggml-org/llama.cpp third_party/llama.cpp
-
-      - name: Build llama-server (unix)
-        if: steps.plan.outputs.skip_build != 'true' && matrix.kind == 'unix'
+      - name: Build llama-server
+        if: steps.plan.outputs.skip_build != 'true'
         run: make llama-server
 
-      - name: Build llama-server.exe (windows)
-        if: steps.plan.outputs.skip_build != 'true' && matrix.kind == 'windows'
-        shell: pwsh
-        run: |
-          cmake -B third_party/llama.cpp/build -S third_party/llama.cpp -C ci/llama-cpp-cache.cmake
-          # -j 2 caps parallel compile jobs to avoid OOM on 16GB runners.
-          cmake --build third_party/llama.cpp/build --target llama-server -j 2 --config Release
-          New-Item -ItemType Directory -Force -Path bin | Out-Null
-          Copy-Item third_party/llama.cpp/build/bin/Release/llama-server.exe bin/
-
       - name: Sanity check binary
-        if: matrix.kind == 'unix'
         shell: bash
         run: bin/llama-server --version
 
       # ─── GHCR push (only when we built fresh AND we have write perms) ────
-      - name: Push llama-server to GHCR (unix)
-        if: steps.plan.outputs.skip_build != 'true' && github.event_name != 'pull_request' && matrix.kind == 'unix'
+      - name: Push llama-server to GHCR
+        if: steps.plan.outputs.skip_build != 'true' && github.event_name != 'pull_request'
         shell: bash
         working-directory: bin
         run: |
           oras push "${{ steps.tag.outputs.tag }}" \
             --artifact-type "application/vnd.runed.llama-server" \
             llama-server
 
-      - name: Push llama-server.exe to GHCR (windows)
-        if: steps.plan.outputs.skip_build != 'true' && github.event_name != 'pull_request' && matrix.kind == 'windows'
-        shell: pwsh
-        working-directory: bin
-        run: |
-          oras push "${{ steps.tag.outputs.tag }}" `
-            --artifact-type "application/vnd.runed.llama-server" `
-            llama-server.exe
-
       # ─── Workflow artifact (always — consumed by build.yml / release.yaml) ─
       - name: Upload llama-server artifact
         uses: actions/upload-artifact@v4

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -23,8 +23,8 @@ jobs:
       matrix:
         include:
           - { runner: macos-14,         goos: darwin, goarch: arm64, smoke: true }
-          - { runner: ubuntu-latest,    goos: linux,  goarch: amd64, smoke: true }
-          - { runner: ubuntu-24.04-arm, goos: linux,  goarch: arm64, smoke: true }
+          - { runner: ubuntu-22.04,     goos: linux,  goarch: amd64, smoke: true }
+          - { runner: ubuntu-22.04-arm, goos: linux,  goarch: arm64, smoke: true }
     runs-on: ${{ matrix.runner }}
     steps:
       - uses: actions/checkout@v4

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -34,23 +34,10 @@ jobs:
       matrix:
         include:
           # Full-stack tarballs: runed + rundemo + llama-server.
-          - runner: macos-14
-            goos: darwin
-            goarch: arm64
-            kind: tarball
-          - runner: ubuntu-latest
-            goos: linux
-            goarch: amd64
-            kind: tarball
-          - runner: ubuntu-24.04-arm
-            goos: linux
-            goarch: arm64
-            kind: tarball
-          # Windows: llama-server.exe only zip (runed Windows support deferred).
-          - runner: windows-latest
-            goos: windows
-            goarch: amd64
-            kind: llama-only
+          # (Windows support is dropped for now — no Windows target here.)
+          - { runner: macos-14,         goos: darwin, goarch: arm64 }
+          - { runner: ubuntu-22.04,     goos: linux,  goarch: amd64 }
+          - { runner: ubuntu-22.04-arm, goos: linux,  goarch: arm64 }
 
     steps:
       - uses: actions/checkout@v4
@@ -73,25 +60,40 @@ jobs:
           name: llama-server-${{ matrix.goos }}-${{ matrix.goarch }}
           path: bin/
 
-      - name: Ensure llama-server executable (unix)
-        if: matrix.kind == 'tarball'
+      - name: Ensure llama-server executable
         shell: bash
         run: chmod +x bin/llama-server
 
-      # ─── Smoke gate before packaging (tarball matrix only) ──────────
+      # Build OS label (ubuntu-2204 / mac-14) for the release asset name, read from
+      # the running image (actual build baseline/glibc). Fails on an empty part.
+      - name: Resolve build OS label
+        id: osinfo
+        shell: bash
+        run: |
+          set -euo pipefail
+          case "${RUNNER_OS}" in
+            Linux)
+              . /etc/os-release; os_type="${ID:-}"; os_version="${VERSION_ID:-}" ;;
+            macOS)
+              os_type="mac"; os_version="$(sw_vers -productVersion | cut -d. -f1)" ;;
+            *)
+              echo "::error::unhandled RUNNER_OS=${RUNNER_OS}" >&2; exit 1 ;;
+          esac
+          os_version="${os_version//./}"  # 22.04 -> 2204; mac major ("14") has no dots
+          [ -n "${os_type}" ] && [ -n "${os_version}" ] || { echo "::error::could not resolve OS type/version for the label (RUNNER_OS=${RUNNER_OS})" >&2; exit 1; }
+          echo "label=${os_type}-${os_version}" >> "$GITHUB_OUTPUT"  # ubuntu-2204 / mac-14
+
+      # ─── Smoke gate before packaging ────────────────────────────────
       - uses: actions/setup-go@v5
-        if: matrix.kind == 'tarball'
         with:
           go-version-file: go.mod
           check-latest: true
 
       - uses: bufbuild/buf-setup-action@v1
-        if: matrix.kind == 'tarball'
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Read embedding model pin
-        if: matrix.kind == 'tarball'
         id: model
         shell: bash
         run: |
@@ -100,14 +102,12 @@ jobs:
           echo "file=$(awk '/^file:/ {print $2}' .embedding-model.yaml)"     >> "$GITHUB_OUTPUT"
 
       - name: Cache embedding model
-        if: matrix.kind == 'tarball'
         uses: actions/cache@v4
         with:
           path: models
           key: embedding-model-${{ steps.model.outputs.sha256 }}
 
       - name: Fetch embedding model (cache miss only)
-        if: matrix.kind == 'tarball'
         shell: bash
         env:
           FILE: ${{ steps.model.outputs.file }}
@@ -122,15 +122,13 @@ jobs:
           echo "${SHA}  models/${FILE}" | shasum -a 256 -c -
 
       - name: Run integration tests (release gate)
-        if: matrix.kind == 'tarball'
         env:
           RUNED_TEST_LLAMA_SERVER: ${{ github.workspace }}/bin/llama-server
           RUNED_TEST_GGUF:         ${{ github.workspace }}/models/${{ steps.model.outputs.file }}
         run: go test -race -v ./internal/backend ./internal/server -run 'EmbedReturns|ReturnsVector'
 
       # ─── Build runed + package tarball ──────────────────────────────
       - name: Build runed binaries
-        if: matrix.kind == 'tarball'
         env:
           GOOS: ${{ matrix.goos }}
           GOARCH: ${{ matrix.goarch }}
@@ -139,24 +137,13 @@ jobs:
         run: make build
 
       - name: Package tarball
-        if: matrix.kind == 'tarball'
         env:
           GOOS: ${{ matrix.goos }}
           GOARCH: ${{ matrix.goarch }}
+          OS_LABEL: ${{ steps.osinfo.outputs.label }}
           VERSION: ${{ steps.meta.outputs.version }}
         run: make release-tarball
 
-      # ─── Windows: llama-server.exe zip ──────────────────────────────
-      - name: Package llama-server zip (windows)
-        if: matrix.kind == 'llama-only'
-        shell: pwsh
-        run: |
-          New-Item -ItemType Directory -Force -Path dist | Out-Null
-          $name = "llama-server-${{ steps.meta.outputs.version }}-${{ matrix.goos }}-${{ matrix.goarch }}.zip"
-          Compress-Archive -Path bin/llama-server.exe -DestinationPath "dist/$name"
-          $hash = (Get-FileHash "dist/$name" -Algorithm SHA256).Hash.ToLower()
-          "$hash  $name" | Out-File -Encoding ascii "dist/$name.sha256"
-
       - uses: actions/upload-artifact@v4
         with:
           name: release-${{ matrix.goos }}-${{ matrix.goarch }}
@@ -224,4 +211,4 @@ jobs:
             --title "${{ steps.meta.outputs.version }}" \
             --generate-notes \
             --prerelease \
-            manifest.json *.tar.gz *.zip *.sha256
+            manifest.json *.tar.gz *.sha256
diff --git a/Makefile b/Makefile
@@ -4,6 +4,11 @@ VERSION ?= v0.1.0-alpha
 GOOS ?= $(shell go env GOOS)
 GOARCH ?= $(shell go env GOARCH)
 
+# OS_LABEL: build-OS part of the release tarball name (CI: ubuntu-2204, mac-14).
+# Falls back to GOOS when unset OR empty (`?=` would keep an empty env value).
+ifeq ($(strip $(OS_LABEL)),)
+OS_LABEL = $(GOOS)
+endif
 LLAMA_CPP_REF := $(shell cat .llama-cpp-version)
 LLAMA_CPP_DIR := third_party/llama.cpp
 LLAMA_CPP_CACHE := $(CURDIR)/ci/llama-cpp-cache.cmake
@@ -57,18 +62,17 @@ build: proto
 		-o bin/rundemo ./cmd/rundemo
 
 # Clone (shallow) and CPU-build llama-server at the pinned ref.
-# Unix-only target — Windows CI invokes cmake directly because make/sh aren't
-# the natural toolchain there. Reentrant: skips git clone if the directory
-# already exists at the right ref, and skips cmake if the binary is fresh.
+# Reentrant: skips git clone if the directory already exists at the right ref,
+# and skips cmake if the binary is fresh.
 llama-server:
 	@if [ ! -d "$(LLAMA_CPP_DIR)/.git" ]; then \
 		mkdir -p $(dir $(LLAMA_CPP_DIR)); \
 		git clone --depth 1 --branch $(LLAMA_CPP_REF) \
 			https://github.com/ggml-org/llama.cpp $(LLAMA_CPP_DIR); \
 	fi
 	cmake -B $(LLAMA_CPP_DIR)/build -S $(LLAMA_CPP_DIR) -C $(LLAMA_CPP_CACHE) $(LLAMA_CMAKE_EXTRA)
-	# -j2 caps parallel compile jobs: ubuntu-latest (16GB / 4 vCPU) OOM-killed
-	# the build at -j auto. ubuntu-24.04-arm and macos-14 survived but the
+	# -j2 caps parallel compile jobs: the linux amd64 runner (16GB / 4 vCPU) was
+	# OOM-killed at -j auto on ubuntu-latest. linux arm64 and macos-14 survived but the
 	# bound is uniform across matrices for predictability.
 	cmake --build $(LLAMA_CPP_DIR)/build --target llama-server -j 2 --config Release
 	mkdir -p bin
@@ -81,11 +85,10 @@ clean:
 	rm -rf bin/ gen/ dist/
 
 # Packages the Go binaries plus llama-server into a single tarball.
-# Assumes `make build` and `make llama-server` (or the workflow's Windows
-# equivalent) have already populated bin/.
+# Assumes `make build` and `make llama-server` have already populated bin/.
 release-tarball:
 	mkdir -p dist
-	TARNAME=runed-$(VERSION)-$(GOOS)-$(GOARCH).tar.gz; \
+	TARNAME=runed-$(VERSION)-$(OS_LABEL)-$(GOARCH).tar.gz; \
 	tar -czf dist/$$TARNAME -C bin runed rundemo llama-server; \
 	cd dist && ( \
 		(command -v shasum >/dev/null 2>&1 && shasum -a 256 $$TARNAME > $$TARNAME.sha256) \