From 28c3c60700a0e64dca9d825900bfe144b1d91ced Mon Sep 17 00:00:00 2001 From: Jonas Toelke Date: Tue, 28 Apr 2026 17:19:04 -0500 Subject: [PATCH 1/4] ci(docker): use prebuilt Rust binaries by default Flip Docker image builds to consume staged native Rust artifacts, remove in-Docker Rust build stages, and publish per-arch images with a manifest merge. Add local staging support for prebuilt gateway and sandbox binaries so development image builds continue to work without CI artifacts. Signed-off-by: Jonas Toelke --- .../skills/debug-openshell-cluster/SKILL.md | 4 +- .github/workflows/docker-build.yml | 205 ++++++++++++++-- .github/workflows/shadow-docker-build.yml | 123 +++------- .../workflows/shadow-rust-native-build.yml | 223 ++++++++++++------ architecture/build-containers.md | 10 +- architecture/podman-driver.md | 4 +- crates/openshell-bootstrap/src/errors.rs | 2 +- crates/openshell-bootstrap/src/runtime.rs | 2 +- crates/openshell-vm/scripts/build-rootfs.sh | 4 +- deploy/docker/Dockerfile.images | 195 +-------------- deploy/docker/cluster-healthcheck.sh | 4 +- tasks/docker.toml | 10 + tasks/scripts/cluster-deploy-fast.sh | 41 ++-- tasks/scripts/docker-build-image.sh | 161 ++++++------- tasks/scripts/gateway-docker.sh | 6 +- tasks/scripts/stage-prebuilt-binaries.sh | 198 ++++++++++++++++ 16 files changed, 696 insertions(+), 496 deletions(-) create mode 100755 tasks/scripts/stage-prebuilt-binaries.sh diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index b883631b7..4d48cf7c5 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -184,7 +184,7 @@ Component images (server, sandbox) can reach kubelet via two paths: **Local/external pull mode** (default local via `mise run cluster`): Local images are tagged to the configured local registry base (default `127.0.0.1:5000/openshell/*`), pushed to that registry, and pulled by k3s via 
`registries.yaml` mirror endpoint (typically `host.docker.internal:5000`). The `cluster` task pushes prebuilt local tags (`openshell/*:dev`, falling back to `localhost:5000/openshell/*:dev` or `127.0.0.1:5000/openshell/*:dev`). -Gateway image builds now stage a partial Rust workspace from `deploy/docker/Dockerfile.images`. If cargo fails with a missing manifest under `/build/crates/...`, or an imported symbol exists locally but is missing in the image build, verify that every current gateway dependency crate (including `openshell-driver-docker`, `openshell-driver-kubernetes`, and `openshell-ocsf`) is copied into the staged workspace there. +Gateway and cluster image builds consume Rust binaries staged at `deploy/docker/.build/prebuilt-binaries/<arch>/`. In CI these come from the reusable Rust native build workflow; locally `tasks/scripts/docker-build-image.sh` runs `tasks/scripts/stage-prebuilt-binaries.sh` before invoking Docker unless `PREBUILT_AUTO_STAGE=0` is set. ```bash # Verify image refs currently used by openshell deployment @@ -368,7 +368,7 @@ If DNS is broken, all image pulls from the distribution registry will fail, as w | `metrics-server` errors in logs | Normal k3s noise, not the root cause | These errors are benign — look for the actual failing health check component | | Stale NotReady nodes from previous deploys | Volume reused across container recreations | The deploy flow now auto-cleans stale nodes; if it still fails, manually delete NotReady nodes (see Step 2) or choose "Recreate" when prompted | | gRPC `UNIMPLEMENTED` for newer RPCs in push mode | Helm values still point at older pulled images instead of the pushed refs | Verify rendered `openshell-helmchart.yaml` uses the expected push refs (`server`, `sandbox`, `pki-job`) and not `:latest` | -| Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without the 
`supervisor-builder` target in `deploy/docker/Dockerfile.images`. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker | +| Sandbox pods crash with `/opt/openshell/bin/openshell-sandbox: no such file or directory` | Supervisor binary missing from cluster image | The cluster image was built/published without a staged `openshell-sandbox` prebuilt binary. Rebuild with `mise run docker:build:cluster` and recreate gateway. Bootstrap auto-detects via `HEALTHCHECK_MISSING_SUPERVISOR` marker | | `HEALTHCHECK_MISSING_SUPERVISOR` in health check logs | `/opt/openshell/bin/openshell-sandbox` not found in gateway container | Rebuild cluster image: `mise run docker:build:cluster`, then `openshell gateway destroy && openshell gateway start` | | `nvidia-ctk cdi list` returns no `k8s.device-plugin.nvidia.com/gpu=` entries | CDI specs not yet generated by device plugin | Device plugin may still be starting; wait and retry, or check pod logs (Step 8) | diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 669fa9f19..c131418e5 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -8,7 +8,7 @@ on: required: true type: string timeout-minutes: - description: "Job timeout in minutes" + description: "Per-arch Docker image job timeout in minutes" required: false type: number default: 20 @@ -23,7 +23,7 @@ on: type: string default: "linux/amd64,linux/arm64" runner: - description: "GitHub Actions runner label" + description: "Deprecated; per-arch native runners are selected automatically" required: false type: string default: "build-amd64" @@ -35,17 +35,121 @@ on: env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - SCCACHE_MEMCACHED_ENDPOINT: ${{ vars.SCCACHE_MEMCACHED_ENDPOINT }} permissions: contents: read packages: write +defaults: + run: + shell: bash + jobs: + resolve: + name: Resolve build plan + runs-on: ubuntu-latest + outputs: + 
matrix: ${{ steps.resolve.outputs.matrix }} + platform_count: ${{ steps.resolve.outputs.platform_count }} + arches: ${{ steps.resolve.outputs.arches }} + binary_component: ${{ steps.resolve.outputs.binary_component }} + binary_name: ${{ steps.resolve.outputs.binary_name }} + artifact_prefix: ${{ steps.resolve.outputs.artifact_prefix }} + steps: + - name: Resolve component and platform matrix + id: resolve + run: | + set -euo pipefail + + component="${{ inputs.component }}" + case "$component" in + gateway) + binary_component=gateway + binary_name=openshell-gateway + ;; + supervisor|cluster) + binary_component=sandbox + binary_name=openshell-sandbox + ;; + *) + echo "unsupported component: $component" >&2 + exit 1 + ;; + esac + + platform_input="${{ inputs.platform }}" + platform_input="${platform_input//[[:space:]]/}" + if [[ -z "$platform_input" ]]; then + echo "platform input must not be empty" >&2 + exit 1 + fi + + IFS=',' read -r -a platforms <<< "$platform_input" + matrix='{"include":[' + arches=() + count=0 + + for platform in "${platforms[@]}"; do + case "$platform" in + linux/amd64) + arch=amd64 + runner=linux-amd64-cpu8 + ;; + linux/arm64) + arch=arm64 + runner=linux-arm64-cpu8 + ;; + *) + echo "unsupported platform: $platform" >&2 + echo "supported platforms: linux/amd64, linux/arm64" >&2 + exit 1 + ;; + esac + + if [[ $count -gt 0 ]]; then + matrix+=',' + fi + matrix+='{"platform":"'"$platform"'","arch":"'"$arch"'","runner":"'"$runner"'"}' + arches+=("$arch") + count=$((count + 1)) + done + + matrix+=']}' + { + echo "matrix=$matrix" + echo "platform_count=$count" + echo "arches=${arches[*]}" + echo "binary_component=$binary_component" + echo "binary_name=$binary_name" + echo "artifact_prefix=rust-binary-${component}-${binary_component}" + } >> "$GITHUB_OUTPUT" + + rust-binary: + name: Rust ${{ needs.resolve.outputs.binary_component }} (${{ matrix.arch }}) + needs: resolve + permissions: + contents: read + packages: read + strategy: + fail-fast: false + 
matrix: ${{ fromJSON(needs.resolve.outputs.matrix) }} + uses: ./.github/workflows/shadow-rust-native-build.yml + with: + component: ${{ needs.resolve.outputs.binary_component }} + arch: ${{ matrix.arch }} + cargo-version: ${{ inputs['cargo-version'] }} + features: openshell-core/dev-settings + artifact-name: ${{ needs.resolve.outputs.artifact_prefix }}-linux-${{ matrix.arch }} + secrets: inherit + build: - name: Build ${{ inputs.component }} - runs-on: ${{ inputs.runner }} - timeout-minutes: ${{ inputs.timeout-minutes }} + name: Build ${{ inputs.component }} (${{ matrix.arch }}) + needs: [resolve, rust-binary] + runs-on: ${{ matrix.runner }} + timeout-minutes: ${{ inputs['timeout-minutes'] }} + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.resolve.outputs.matrix) }} container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -54,11 +158,14 @@ jobs: options: --privileged volumes: - /var/run/docker.sock:/var/run/docker.sock + # Expose the nv-gha-runners buildkitd.toml registry mirror config + # inside the container so setup-buildx can read it. 
+ - /etc/buildkit:/etc/buildkit:ro env: - IMAGE_TAG: ${{ github.sha }} + IMAGE_TAG: ${{ needs.resolve.outputs.platform_count == '1' && github.sha || format('{0}-{1}', github.sha, matrix.arch) }} IMAGE_REGISTRY: ghcr.io/nvidia/openshell DOCKER_PUSH: ${{ inputs.push && '1' || '0' }} - DOCKER_PLATFORM: ${{ inputs.platform }} + DOCKER_PLATFORM: ${{ matrix.platform }} steps: - uses: actions/checkout@v4 with: @@ -67,30 +174,76 @@ jobs: - name: Mark workspace safe for git run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - name: Fetch tags - run: git fetch --tags --force - - - name: Compute cargo version - id: version - run: | - set -eu - if [[ -n "${{ inputs.cargo-version }}" ]]; then - echo "cargo_version=${{ inputs.cargo-version }}" >> "$GITHUB_OUTPUT" - else - echo "cargo_version=$(uv run python tasks/scripts/release.py get-version --cargo)" >> "$GITHUB_OUTPUT" - fi + - name: Install tools + run: mise install --locked - name: Log in to GHCR + if: ${{ inputs.push }} run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin - - name: Set up Docker Buildx + - name: Set up buildx (local driver) uses: ./.github/actions/setup-buildx + with: + driver: local + buildkitd-config: /etc/buildkit/buildkitd.toml + + - name: Download Rust binary artifact + uses: actions/download-artifact@v4 + with: + name: ${{ needs.resolve.outputs.artifact_prefix }}-linux-${{ matrix.arch }} + path: prebuilt-rust-binary + + - name: Stage Rust binary in Docker build context + run: | + set -euo pipefail + binary="${{ needs.resolve.outputs.binary_name }}" + download_dir="prebuilt-rust-binary" + stage="deploy/docker/.build/prebuilt-binaries/${{ matrix.arch }}" + found="$(find "$download_dir" -type f -name "$binary" -print -quit)" + if [[ -z "$found" ]]; then + echo "missing downloaded artifact file: $binary" >&2 + find "$download_dir" -maxdepth 4 -type f -print >&2 || true + exit 1 + fi + mkdir -p "$stage" + install -m 0755 "$found" 
"$stage/$binary" + ls -lh "$stage/" - name: Build ${{ inputs.component }} image env: DOCKER_BUILDER: openshell - OPENSHELL_CARGO_VERSION: ${{ steps.version.outputs.cargo_version }} - # Enable dev-settings feature for test settings (dummy_bool, dummy_int) - # used by e2e tests. - EXTRA_CARGO_FEATURES: openshell-core/dev-settings - run: mise run --no-deps build:docker:${{ inputs.component }} + run: | + set -euo pipefail + mise exec -- tasks/scripts/docker-build-image.sh "${{ inputs.component }}" \ + --cache-from "type=gha,scope=${{ inputs.component }}-${{ matrix.arch }}" \ + --cache-to "type=gha,mode=max,scope=${{ inputs.component }}-${{ matrix.arch }}" + + merge: + name: Merge ${{ inputs.component }} manifest + needs: [resolve, build] + if: ${{ inputs.push && needs.resolve.outputs.platform_count != '1' }} + runs-on: linux-amd64-cpu8 + timeout-minutes: 10 + container: + image: ghcr.io/nvidia/openshell/ci:latest + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + volumes: + - /var/run/docker.sock:/var/run/docker.sock + steps: + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Create multi-arch manifest + run: | + set -euo pipefail + image="ghcr.io/nvidia/openshell/${{ inputs.component }}" + refs=() + for arch in ${{ needs.resolve.outputs.arches }}; do + refs+=("${image}:${GITHUB_SHA}-${arch}") + done + docker buildx imagetools create \ + --prefer-index=false \ + -t "${image}:${GITHUB_SHA}" \ + "${refs[@]}" diff --git a/.github/workflows/shadow-docker-build.yml b/.github/workflows/shadow-docker-build.yml index 05dcc6889..62e687867 100644 --- a/.github/workflows/shadow-docker-build.yml +++ b/.github/workflows/shadow-docker-build.yml @@ -1,98 +1,43 @@ -name: Shadow — Docker Build (local driver + GHA cache) +name: Shadow Docker Build -# OS-49 Phase 3 / PR 3 — non-blocking shadow of docker-build.yml. 
-# -# Exercises buildx's local (docker-container) driver plus GHA-cache -# (type=gha, scoped per component+arch) so Docker builds no longer depend on -# the in-cluster BuildKit pods. Per-arch matrix on nv-gha-runners; each job -# builds a single platform natively (no QEMU). No multi-arch manifest -# merging — that folds into the real cut-over in Phase 6. -# -# Plan, decision thresholds, and results: OS-127 Linear issue. Dispatch -# manually 4–5 times after merge to collect cold + warm numbers. +# OS-128 Phase 4: manual non-publishing exercise of the production Docker +# image workflow. This stays off main's push surface because the image path is +# not a required signal while the prebuilt-binary rollout is being measured. on: - push: - branches: [main] workflow_dispatch: + inputs: + platform: + description: "Target platform(s)" + required: false + type: string + default: "linux/amd64,linux/arm64" permissions: contents: read - packages: read - -env: - MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + packages: write jobs: - shadow-build: - name: shadow ${{ matrix.component }} (${{ matrix.arch }}) - strategy: - fail-fast: false - matrix: - component: [gateway, supervisor, cluster] - arch: [amd64, arm64] - include: - - arch: amd64 - runner: linux-amd64-cpu8 - - arch: arm64 - runner: linux-arm64-cpu8 - runs-on: ${{ matrix.runner }} - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - volumes: - - /var/run/docker.sock:/var/run/docker.sock - # Expose the nv-gha-runners buildkitd.toml (registry-mirror config) - # inside the container so docker/setup-buildx-action can read it. 
- # The file is pre-populated on every nv-gha-runner per: - # https://docs.gha-runners.nvidia.com/platform/best-practices/#use-docker-cache-for-buildkit - - /etc/buildkit:/etc/buildkit:ro - timeout-minutes: 45 - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Mark workspace safe for git - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - - name: Install tools - run: mise install - - - name: Set up buildx (local driver) - uses: ./.github/actions/setup-buildx - with: - driver: local - # Bind-mounted above via container.volumes; without that, the file - # is on the host but invisible to the action (which runs inside - # the ci:latest container). - buildkitd-config: /etc/buildkit/buildkitd.toml - - - name: Package Helm chart (cluster only) - if: matrix.component == 'cluster' - run: | - mkdir -p deploy/docker/.build/charts - helm package deploy/helm/openshell -d deploy/docker/.build/charts/ - - - name: Build ${{ matrix.component }} (${{ matrix.arch }}) - # Matches docker-build.yml's default EXTRA_CARGO_FEATURES so CI image - # content is comparable. No --push: the shadow measures build/cache - # mechanics, not publish behavior. Multi-arch manifests are Phase 6. - run: | - docker buildx build \ - --builder openshell \ - --platform linux/${{ matrix.arch }} \ - --cache-from type=gha,scope=${{ matrix.component }}-${{ matrix.arch }} \ - --cache-to type=gha,mode=max,scope=${{ matrix.component }}-${{ matrix.arch }} \ - --build-arg EXTRA_CARGO_FEATURES=openshell-core/dev-settings \ - --load \ - --file deploy/docker/Dockerfile.images \ - --target ${{ matrix.component }} \ - . 
- - - name: buildx du - if: always() - run: docker buildx du --builder openshell || true + gateway: + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + platform: ${{ inputs.platform }} + push: false + secrets: inherit + + supervisor: + uses: ./.github/workflows/docker-build.yml + with: + component: supervisor + platform: ${{ inputs.platform }} + push: false + secrets: inherit + + cluster: + uses: ./.github/workflows/docker-build.yml + with: + component: cluster + platform: ${{ inputs.platform }} + push: false + secrets: inherit diff --git a/.github/workflows/shadow-rust-native-build.yml b/.github/workflows/shadow-rust-native-build.yml index b252777ea..1c12950ad 100644 --- a/.github/workflows/shadow-rust-native-build.yml +++ b/.github/workflows/shadow-rust-native-build.yml @@ -1,28 +1,80 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -name: Shadow — Rust Native Build (openshell-gateway / openshell-sandbox) - -# OS-49 Phase 4 / PR 4a — non-blocking shadow that builds openshell-gateway -# and openshell-sandbox natively per-arch on the nv-gha-runners shared CPU -# pool (`linux-{amd64,arm64}-cpu8`) with a GHA-backed sccache, and uploads -# the resulting binaries as artifacts. Reuses the pattern from PR #853's -# release-dev.yml "Build standalone {gateway,supervisor} binaries" jobs. -# -# The artifacts match the layout PR 4c expects when consuming `BINARY_SOURCE= -# prebuilt` (wired up in #945): one binary per (component, arch), staged to -# `deploy/docker/.build/prebuilt-binaries//openshell-{gateway,sandbox}`. -# -# Dispatch 4-5 times after merge to collect cold + warm numbers and compare -# against the Rust portion of docker-build.yml's 17.5 m ARC baseline. Success -# criteria, gotchas, and dependency graph live on the OS-128 Linear issue. 
+name: Rust Native Build (openshell-gateway / openshell-sandbox) + +# OS-128 Phase 4: build Rust binaries natively per Linux architecture before +# the Docker image build consumes them as prebuilt artifacts. on: - # workflow_dispatch only — keeps this shadow off main's required-check - # surface and avoids cluttering CI history with non-blocking failures - # while we're still collecting Phase 4 data. Dispatch from the Actions - # UI to collect cold/warm-cache numbers. + workflow_call: + inputs: + component: + description: "Binary component to build (gateway or sandbox)" + required: true + type: string + arch: + description: "Linux architecture to build (amd64 or arm64)" + required: true + type: string + cargo-version: + description: "Pre-computed cargo version (skips internal git-based computation)" + required: false + type: string + default: "" + features: + description: "Cargo features to enable" + required: false + type: string + default: "openshell-core/dev-settings" + retention-days: + description: "Artifact retention period" + required: false + type: number + default: 5 + artifact-name: + description: "Artifact name override" + required: false + type: string + default: "" workflow_dispatch: + inputs: + component: + description: "Binary component to build" + required: true + type: choice + default: gateway + options: + - gateway + - sandbox + arch: + description: "Linux architecture to build" + required: true + type: choice + default: amd64 + options: + - amd64 + - arm64 + cargo-version: + description: "Cargo version override" + required: false + type: string + default: "" + features: + description: "Cargo features to enable" + required: false + type: string + default: "openshell-core/dev-settings" + retention-days: + description: "Artifact retention period" + required: false + type: number + default: 5 + artifact-name: + description: "Artifact name override" + required: false + type: string + default: "" permissions: contents: read @@ -36,39 +88,28 @@ env: # 
backend instead of the EKS memcached used by ARC. SCCACHE_GHA_ENABLED: "true" +defaults: + run: + shell: bash + jobs: rust-native-build: - name: ${{ matrix.component }} (${{ matrix.arch }}) - strategy: - fail-fast: false - matrix: - component: [gateway, sandbox] - arch: [amd64, arm64] - include: - - component: gateway - crate: openshell-server - binary: openshell-gateway - - component: sandbox - crate: openshell-sandbox - binary: openshell-sandbox - - arch: amd64 - runner: linux-amd64-cpu8 - target: x86_64-unknown-linux-gnu - - arch: arm64 - runner: linux-arm64-cpu8 - target: aarch64-unknown-linux-gnu - runs-on: ${{ matrix.runner }} + name: ${{ inputs.component }} (${{ inputs.arch }}) + runs-on: ${{ inputs.arch == 'arm64' && 'linux-arm64-cpu8' || 'linux-amd64-cpu8' }} + timeout-minutes: 60 env: + COMPONENT: ${{ inputs.component }} + ARCH: ${{ inputs.arch }} + FEATURES: ${{ inputs.features }} # Partition the GHA sccache cache per (component, arch). Without this, - # concurrent matrix jobs collide on the same cache key and later-starting + # concurrent jobs collide on the same cache key and later-starting # writers hit 409 Conflict (PR #961 fix for shadow-shared-cpu-spike). 
- SCCACHE_GHA_VERSION: ${{ matrix.component }}-${{ matrix.arch }} + SCCACHE_GHA_VERSION: ${{ inputs.component }}-${{ inputs.arch }} container: image: ghcr.io/nvidia/openshell/ci:latest credentials: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - timeout-minutes: 60 steps: - uses: actions/checkout@v4 with: @@ -81,7 +122,46 @@ jobs: run: git fetch --tags --force - name: Install tools - run: mise install + run: mise install --locked + + - name: Resolve build target + id: target + run: | + set -euo pipefail + + case "$COMPONENT" in + gateway) + crate=openshell-server + binary=openshell-gateway + ;; + sandbox) + crate=openshell-sandbox + binary=openshell-sandbox + ;; + *) + echo "unsupported component: $COMPONENT" >&2 + exit 1 + ;; + esac + + case "$ARCH" in + amd64) + target=x86_64-unknown-linux-gnu + ;; + arm64) + target=aarch64-unknown-linux-gnu + ;; + *) + echo "unsupported arch: $ARCH" >&2 + exit 1 + ;; + esac + + { + echo "crate=$crate" + echo "binary=$binary" + echo "target=$target" + } >> "$GITHUB_OUTPUT" - name: Configure GHA sccache backend # Exposes ACTIONS_CACHE_URL / ACTIONS_RUNTIME_TOKEN so sccache (wrapped @@ -91,7 +171,7 @@ jobs: - name: Cache Rust target and registry uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 with: - shared-key: shadow-rust-native-${{ matrix.component }}-${{ matrix.arch }} + shared-key: rust-native-${{ inputs.component }}-${{ inputs.arch }} cache-directories: .cache/sccache cache-targets: "true" @@ -99,7 +179,11 @@ jobs: id: version run: | set -euo pipefail - echo "cargo_version=$(uv run python tasks/scripts/release.py get-version --cargo)" >> "$GITHUB_OUTPUT" + if [[ -n "${{ inputs['cargo-version'] }}" ]]; then + echo "cargo_version=${{ inputs['cargo-version'] }}" >> "$GITHUB_OUTPUT" + else + echo "cargo_version=$(uv run python tasks/scripts/release.py get-version --cargo)" >> "$GITHUB_OUTPUT" + fi - name: Patch workspace version if: steps.version.outputs.cargo_version != '' @@ -107,50 
+191,53 @@ jobs: set -euo pipefail sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ steps.version.outputs.cargo_version }}"'"/}' Cargo.toml - - name: Build ${{ matrix.binary }} (${{ matrix.target }}) - # Matches docker-build.yml's default EXTRA_CARGO_FEATURES so the - # binary content is byte-comparable to what Dockerfile.images produces - # today (precondition for PR 4c's drop-in swap). + - name: Build ${{ steps.target.outputs.binary }} (${{ steps.target.outputs.target }}) run: | set -euo pipefail - mise x -- cargo build \ - --release \ - --target ${{ matrix.target }} \ - -p ${{ matrix.crate }} \ - --bin ${{ matrix.binary }} \ - --features openshell-core/dev-settings + args=( + --release + --target "${{ steps.target.outputs.target }}" + -p "${{ steps.target.outputs.crate }}" + --bin "${{ steps.target.outputs.binary }}" + ) + if [[ -n "$FEATURES" ]]; then + args+=(--features "$FEATURES") + fi + if [[ -n "${{ steps.version.outputs.cargo_version }}" ]]; then + export GIT_DIR=/nonexistent + fi + mise x -- cargo build "${args[@]}" - name: Verify packaged binary run: | set -euo pipefail - BIN="target/${{ matrix.target }}/release/${{ matrix.binary }}" + BIN="target/${{ steps.target.outputs.target }}/release/${{ steps.target.outputs.binary }}" OUTPUT="$("$BIN" --version)" echo "$OUTPUT" - grep -q "^${{ matrix.binary }} " <<<"$OUTPUT" + grep -q "^${{ steps.target.outputs.binary }} " <<<"$OUTPUT" # Record glibc linkage so drift from the Ubuntu noble runtime base - # image is visible in logs (not asserted — the runtime check lands - # when PR 4c builds images on top of these artifacts). - ldd --version | head -1 - ldd "$BIN" | head -20 || true + # image is visible in logs. 
+ ldd --version + ldd "$BIN" || true - name: sccache stats if: always() run: mise x -- sccache --show-stats - name: Stage binary for prebuilt layout - # Shape mirrors `deploy/docker/.build/prebuilt-binaries//` - # so PR 4c can download the artifact directly into the build context. run: | set -euo pipefail - STAGE="prebuilt-binaries/${{ matrix.arch }}" + STAGE="prebuilt-binaries/$ARCH" mkdir -p "$STAGE" - install -m 0755 "target/${{ matrix.target }}/release/${{ matrix.binary }}" "$STAGE/${{ matrix.binary }}" + install -m 0755 \ + "target/${{ steps.target.outputs.target }}/release/${{ steps.target.outputs.binary }}" \ + "$STAGE/${{ steps.target.outputs.binary }}" ls -lh "$STAGE/" - name: Upload artifact uses: actions/upload-artifact@v4 with: - name: rust-binary-${{ matrix.component }}-linux-${{ matrix.arch }} - path: prebuilt-binaries/${{ matrix.arch }}/${{ matrix.binary }} - retention-days: 5 + name: ${{ inputs['artifact-name'] != '' && inputs['artifact-name'] || format('rust-binary-{0}-linux-{1}', inputs.component, inputs.arch) }} + path: prebuilt-binaries/${{ inputs.arch }}/${{ steps.target.outputs.binary }} + retention-days: ${{ inputs['retention-days'] }} if-no-files-found: error diff --git a/architecture/build-containers.md b/architecture/build-containers.md index e619534b2..59a749e25 100644 --- a/architecture/build-containers.md +++ b/architecture/build-containers.md @@ -19,7 +19,13 @@ The cluster image is a single-container Kubernetes distribution that bundles the - **Registry**: `ghcr.io/nvidia/openshell/cluster:latest` - **Pulled when**: `openshell gateway start` -The supervisor binary (`openshell-sandbox`) is built by the shared `supervisor-builder` stage in `deploy/docker/Dockerfile.images` and placed at `/opt/openshell/bin/openshell-sandbox`. It is exposed to sandbox pods at runtime via a read-only `hostPath` volume mount — it is not baked into sandbox images. 
+The supervisor binary (`openshell-sandbox`) is built before the image build, staged under `deploy/docker/.build/prebuilt-binaries/<arch>/`, and copied into the cluster image at `/opt/openshell/bin/openshell-sandbox`. It is exposed to sandbox pods at runtime via a read-only `hostPath` volume mount — it is not baked into sandbox images. + +## Image Build Pipeline + +`deploy/docker/Dockerfile.images` no longer compiles Rust. CI calls `.github/workflows/shadow-rust-native-build.yml` through `workflow_call` to build `openshell-gateway` or `openshell-sandbox` natively on the target architecture. `.github/workflows/docker-build.yml` downloads the resulting artifact, stages it at `deploy/docker/.build/prebuilt-binaries/<arch>/`, builds the per-arch image with the local Buildx driver, and merges multi-arch pushes with `docker buildx imagetools create`. + +Local Docker builds use `tasks/scripts/stage-prebuilt-binaries.sh` through `tasks/scripts/docker-build-image.sh` before invoking Docker, so clean checkouts do not need to create the staging directory manually. 
## Standalone Gateway Binary @@ -62,7 +68,7 @@ The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes | Changed files | Rebuild triggered | |---|---| -| Cargo manifests, proto definitions, cross-build script | Gateway + supervisor | +| Cargo manifests, proto definitions, prebuilt staging script | Gateway + supervisor | | `crates/openshell-server/*`, `crates/openshell-ocsf/*`, `deploy/docker/Dockerfile.images` | Gateway | | `crates/openshell-sandbox/*`, `crates/openshell-policy/*` | Supervisor | | `deploy/helm/openshell/*` | Helm upgrade | diff --git a/architecture/podman-driver.md b/architecture/podman-driver.md index 37b4d469e..155937a77 100644 --- a/architecture/podman-driver.md +++ b/architecture/podman-driver.md @@ -100,7 +100,7 @@ sequenceDiagram C->>C: entrypoint: /opt/openshell/bin/openshell-sandbox ``` -The supervisor image is a `FROM scratch` image containing only the `openshell-sandbox` binary. It is built by the `supervisor-output` target in `deploy/docker/Dockerfile.images`. The `image_volumes` field in the container spec mounts this image's filesystem at `/opt/openshell/bin` with `rw: false`, making it a read-only overlay that the sandbox cannot tamper with. +The supervisor image is a `FROM scratch` image containing only the prebuilt `openshell-sandbox` binary. It is built by the `supervisor-output` target in `deploy/docker/Dockerfile.images`. The `image_volumes` field in the container spec mounts this image's filesystem at `/opt/openshell/bin` with `rw: false`, making it a read-only overlay that the sandbox cannot tamper with. ## Network Model @@ -256,4 +256,4 @@ The Podman driver is designed for rootless operation. 
The following adaptations - SSRF mitigation: `crates/openshell-core/src/net.rs` (IP classification: `is_always_blocked_ip`, `is_internal_ip`), `crates/openshell-sandbox/src/proxy.rs` (runtime enforcement on CONNECT/forward proxy), `crates/openshell-server/src/grpc/policy.rs` (load-time validation via `validate_rule_not_always_blocked`) - Sandbox supervisor: `crates/openshell-sandbox/src/` (Landlock, seccomp, netns, proxy -- shared by all drivers) - Container engine abstraction: `tasks/scripts/container-engine.sh` (build/deploy support for Docker and Podman) -- Supervisor image build: `deploy/docker/Dockerfile.images` (lines 183-184, `supervisor-output` target) +- Supervisor image build: `deploy/docker/Dockerfile.images` (`supervisor-output` target) diff --git a/crates/openshell-bootstrap/src/errors.rs b/crates/openshell-bootstrap/src/errors.rs index 9e385c680..8a6ccf5bd 100644 --- a/crates/openshell-bootstrap/src/errors.rs +++ b/crates/openshell-bootstrap/src/errors.rs @@ -375,7 +375,7 @@ fn diagnose_missing_supervisor(gateway_name: &str) -> GatewayFailureDiagnosis { was not found in the gateway container. This binary is side-loaded into every \ sandbox pod via a hostPath volume mount. Without it, all sandbox pods will \ crash immediately with \"no such file or directory\". This typically means the \ - cluster image was built or published without the supervisor-builder stage." + cluster image was built or published without the staged prebuilt openshell-sandbox binary." .to_string(), recovery_steps: vec![ RecoveryStep::with_command( diff --git a/crates/openshell-bootstrap/src/runtime.rs b/crates/openshell-bootstrap/src/runtime.rs index 0f9a96e6b..5bbfb5b6c 100644 --- a/crates/openshell-bootstrap/src/runtime.rs +++ b/crates/openshell-bootstrap/src/runtime.rs @@ -136,7 +136,7 @@ where The file /opt/openshell/bin/openshell-sandbox was not found in the gateway \ container. 
Without it, sandbox pods cannot start.\n\n\ This usually means the cluster image was built or published without the \ - supervisor-builder stage.\n\n\ + staged prebuilt openshell-sandbox binary.\n\n\ To fix:\n \ 1. Rebuild the cluster image: mise run docker:build:cluster\n \ 2. Or update to a cluster image that includes the supervisor binary\n \ diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index c55c004e0..9aa67ade4 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -367,8 +367,8 @@ chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" # The supervisor binary runs inside every sandbox pod. It is side-loaded # from the node filesystem via a read-only hostPath volume mount at # /opt/openshell/bin. In the Docker-based gateway this is built in the -# Dockerfile.cluster supervisor-builder stage; here we cross-compile -# from the host using cargo-zigbuild. + # Container images consume a prebuilt supervisor binary; here we + # cross-compile from the host using cargo-zigbuild. SUPERVISOR_TARGET="${RUST_TARGET}" SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 300dd1b46..ebe5e267e 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -9,9 +9,10 @@ # gateway Final gateway image # supervisor Final supervisor image # cluster Final cluster image -# gateway-builder Release openshell-gateway binary -# supervisor-builder Release openshell-sandbox binary # supervisor-output Minimal stage exporting only the supervisor binary +# +# Rust binaries are built natively before the image build and staged at: +# deploy/docker/.build/prebuilt-binaries//openshell-{gateway,sandbox} # Pin by tag AND manifest-list digest to prevent silent upstream republishes # from breaking the build. Update both when bumping k3s versions. 
@@ -22,205 +23,21 @@ ARG K9S_VERSION=v0.50.18 ARG HELM_VERSION=v3.17.3 ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.18.2-1 -# OS-128 Phase 4: select binary source for final images. `build` (default) -# compiles Rust inside the builder stages below; `prebuilt` consumes binaries -# staged at deploy/docker/.build/prebuilt-binaries//. Declared at global -# scope so BuildKit can substitute it in `FROM *-binary-${BINARY_SOURCE}`. -ARG BINARY_SOURCE=build - -# --------------------------------------------------------------------------- -# Shared Rust build stages -# --------------------------------------------------------------------------- -FROM --platform=$BUILDPLATFORM rust:1.88-slim AS rust-builder-base -ARG TARGETARCH -ARG BUILDARCH -ARG CARGO_TARGET_CACHE_SCOPE=default -ARG SCCACHE_MEMCACHED_ENDPOINT -# CI sets this to 1 for maximum optimization; local builds leave it unset -# so cargo uses the Cargo.toml default (parallel codegen for fast linking). -ARG CARGO_CODEGEN_UNITS - -RUN apt-get update && apt-get install -y --no-install-recommends \ - cmake g++ make protobuf-compiler curl && rm -rf /var/lib/apt/lists/* - -COPY deploy/docker/cross-build.sh /usr/local/bin/ -RUN . 
cross-build.sh && install_cross_toolchain && install_sccache && add_rust_target - -WORKDIR /build - -FROM rust-builder-base AS rust-builder-skeleton - -COPY Cargo.toml Cargo.lock ./ -COPY crates/openshell-bootstrap/Cargo.toml crates/openshell-bootstrap/Cargo.toml -COPY crates/openshell-cli/Cargo.toml crates/openshell-cli/Cargo.toml -COPY crates/openshell-core/Cargo.toml crates/openshell-core/Cargo.toml -COPY crates/openshell-driver-docker/Cargo.toml crates/openshell-driver-docker/Cargo.toml -COPY crates/openshell-driver-kubernetes/Cargo.toml crates/openshell-driver-kubernetes/Cargo.toml -COPY crates/openshell-driver-podman/Cargo.toml crates/openshell-driver-podman/Cargo.toml -COPY crates/openshell-ocsf/Cargo.toml crates/openshell-ocsf/Cargo.toml -COPY crates/openshell-policy/Cargo.toml crates/openshell-policy/Cargo.toml -COPY crates/openshell-providers/Cargo.toml crates/openshell-providers/Cargo.toml -COPY crates/openshell-prover/Cargo.toml crates/openshell-prover/Cargo.toml -COPY crates/openshell-router/Cargo.toml crates/openshell-router/Cargo.toml -COPY crates/openshell-sandbox/Cargo.toml crates/openshell-sandbox/Cargo.toml -COPY crates/openshell-server/Cargo.toml crates/openshell-server/Cargo.toml -COPY crates/openshell-tui/Cargo.toml crates/openshell-tui/Cargo.toml -COPY crates/openshell-vm/Cargo.toml crates/openshell-vm/Cargo.toml -COPY crates/openshell-core/build.rs crates/openshell-core/build.rs -COPY proto/ proto/ - -RUN mkdir -p \ - crates/openshell-bootstrap/src \ - crates/openshell-cli/src \ - crates/openshell-core/src \ - crates/openshell-driver-docker/src \ - crates/openshell-driver-kubernetes/src \ - crates/openshell-driver-podman/src \ - crates/openshell-ocsf/src \ - crates/openshell-policy/src \ - crates/openshell-providers/src \ - crates/openshell-prover/src \ - crates/openshell-router/src \ - crates/openshell-sandbox/src \ - crates/openshell-server/src \ - crates/openshell-tui/src \ - crates/openshell-vm/src && \ - touch 
crates/openshell-bootstrap/src/lib.rs && \ - printf 'fn main() {}\n' > crates/openshell-cli/src/main.rs && \ - touch crates/openshell-core/src/lib.rs && \ - touch crates/openshell-driver-docker/src/lib.rs && \ - touch crates/openshell-driver-kubernetes/src/lib.rs && \ - printf 'fn main() {}\n' > crates/openshell-driver-kubernetes/src/main.rs && \ - touch crates/openshell-driver-podman/src/lib.rs && \ - printf 'fn main() {}\n' > crates/openshell-driver-podman/src/main.rs && \ - touch crates/openshell-ocsf/src/lib.rs && \ - touch crates/openshell-policy/src/lib.rs && \ - touch crates/openshell-providers/src/lib.rs && \ - touch crates/openshell-prover/src/lib.rs && \ - touch crates/openshell-router/src/lib.rs && \ - touch crates/openshell-sandbox/src/lib.rs && \ - printf 'fn main() {}\n' > crates/openshell-sandbox/src/main.rs && \ - touch crates/openshell-server/src/lib.rs && \ - printf 'fn main() {}\n' > crates/openshell-server/src/main.rs && \ - touch crates/openshell-tui/src/lib.rs && \ - touch crates/openshell-vm/src/lib.rs && \ - printf 'fn main() {}\n' > crates/openshell-vm/src/main.rs - -FROM rust-builder-skeleton AS rust-deps - -RUN --mount=type=cache,id=cargo-registry-${TARGETARCH},sharing=locked,target=/usr/local/cargo/registry \ - --mount=type=cache,id=cargo-git-${TARGETARCH},sharing=locked,target=/usr/local/cargo/git \ - --mount=type=cache,id=cargo-target-${TARGETARCH}-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \ - --mount=type=cache,id=sccache-${TARGETARCH},sharing=locked,target=/tmp/sccache \ - . cross-build.sh && cargo_cross_build --release -p openshell-server -p openshell-sandbox - # --------------------------------------------------------------------------- -# Per-target workspace stages +# Per-arch binary stages # --------------------------------------------------------------------------- -# Copy only the crates needed for each target so that a change to -# openshell-sandbox does not invalidate the gateway build and vice versa. 
-# The skeleton stage already has stub Cargo.toml + src/ for every crate, -# so cargo workspace resolution continues to work — we just overwrite the -# crates whose real source is needed for compilation. - -FROM rust-deps AS gateway-workspace -ARG OPENSHELL_CARGO_VERSION - -COPY crates/openshell-core/ crates/openshell-core/ -COPY crates/openshell-driver-docker/ crates/openshell-driver-docker/ -COPY crates/openshell-driver-kubernetes/ crates/openshell-driver-kubernetes/ -COPY crates/openshell-driver-podman/ crates/openshell-driver-podman/ -COPY crates/openshell-ocsf/ crates/openshell-ocsf/ -COPY crates/openshell-policy/ crates/openshell-policy/ -COPY crates/openshell-providers/ crates/openshell-providers/ -COPY crates/openshell-router/ crates/openshell-router/ -COPY crates/openshell-server/ crates/openshell-server/ - -RUN touch \ - crates/openshell-core/build.rs \ - crates/openshell-server/src/main.rs \ - proto/*.proto && \ - if [ -n "${OPENSHELL_CARGO_VERSION:-}" ]; then \ - sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' Cargo.toml; \ - fi - -FROM gateway-workspace AS gateway-builder -ARG CARGO_CODEGEN_UNITS -ARG EXTRA_CARGO_FEATURES="" - -RUN --mount=type=cache,id=cargo-registry-${TARGETARCH},sharing=locked,target=/usr/local/cargo/registry \ - --mount=type=cache,id=cargo-git-${TARGETARCH},sharing=locked,target=/usr/local/cargo/git \ - --mount=type=cache,id=cargo-target-${TARGETARCH}-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \ - --mount=type=cache,id=sccache-${TARGETARCH},sharing=locked,target=/tmp/sccache \ - . 
cross-build.sh && \ - cargo_cross_build --release -p openshell-server ${EXTRA_CARGO_FEATURES:+--features "$EXTRA_CARGO_FEATURES"} && \ - mkdir -p /build/out && \ - cp "$(cross_output_dir release)/openshell-gateway" /build/out/ - -FROM rust-deps AS supervisor-workspace -ARG OPENSHELL_CARGO_VERSION - -COPY crates/openshell-core/ crates/openshell-core/ -COPY crates/openshell-ocsf/ crates/openshell-ocsf/ -COPY crates/openshell-policy/ crates/openshell-policy/ -COPY crates/openshell-router/ crates/openshell-router/ -COPY crates/openshell-sandbox/ crates/openshell-sandbox/ - -RUN touch \ - crates/openshell-core/build.rs \ - crates/openshell-ocsf/src/lib.rs \ - crates/openshell-sandbox/src/main.rs \ - proto/*.proto && \ - if [ -n "${OPENSHELL_CARGO_VERSION:-}" ]; then \ - sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' Cargo.toml; \ - fi - -FROM supervisor-workspace AS supervisor-builder -ARG CARGO_CODEGEN_UNITS -ARG EXTRA_CARGO_FEATURES="" - -RUN --mount=type=cache,id=cargo-registry-${TARGETARCH},sharing=locked,target=/usr/local/cargo/registry \ - --mount=type=cache,id=cargo-git-${TARGETARCH},sharing=locked,target=/usr/local/cargo/git \ - --mount=type=cache,id=cargo-target-${TARGETARCH}-${CARGO_TARGET_CACHE_SCOPE},sharing=locked,target=/build/target \ - --mount=type=cache,id=sccache-${TARGETARCH},sharing=locked,target=/tmp/sccache \ - . cross-build.sh && \ - cargo_cross_build --release -p openshell-sandbox ${EXTRA_CARGO_FEATURES:+--features "$EXTRA_CARGO_FEATURES"} && \ - mkdir -p /build/out && \ - cp "$(cross_output_dir release)/openshell-sandbox" /build/out/ - -# --------------------------------------------------------------------------- -# Binary source selector (OS-128 Phase 4) -# --------------------------------------------------------------------------- -# `BINARY_SOURCE` is declared at global scope above (near the other version -# ARGs). 
`build` (default) routes through the Rust builder stages above; -# `prebuilt` routes through the scratch stages below, which COPY from -# deploy/docker/.build/prebuilt-binaries//openshell-{gateway,sandbox} -# in the build context. Prebuilt-artifact production + end-to-end workflow -# wiring land in later Phase 4 PRs; the `prebuilt` path is inert unless a -# caller sets BINARY_SOURCE=prebuilt and stages the binaries. - -FROM gateway-builder AS gateway-binary-build -# Inherits /build/out/openshell-gateway from the cargo build stage. - -FROM scratch AS gateway-binary-prebuilt +FROM scratch AS gateway-binary ARG TARGETARCH # --chmod=755 preserves the executable bit through actions/upload-artifact + # download-artifact, which strip exec perms during the roundtrip. COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-gateway /build/out/openshell-gateway -FROM gateway-binary-${BINARY_SOURCE} AS gateway-binary - -FROM supervisor-builder AS supervisor-binary-build -# Inherits /build/out/openshell-sandbox from the cargo build stage. - -FROM scratch AS supervisor-binary-prebuilt +FROM scratch AS supervisor-binary ARG TARGETARCH # --chmod=755 preserves the executable bit through actions/upload-artifact + # download-artifact, which strip exec perms during the roundtrip. COPY --chmod=755 deploy/docker/.build/prebuilt-binaries/${TARGETARCH}/openshell-sandbox /build/out/openshell-sandbox -FROM supervisor-binary-${BINARY_SOURCE} AS supervisor-binary - # Minimal extraction stage for fast-deploy: exports only the supervisor # binary (~20-40 MB) instead of the entire build environment (~968 MB). 
FROM scratch AS supervisor-output diff --git a/deploy/docker/cluster-healthcheck.sh b/deploy/docker/cluster-healthcheck.sh index 6766ca95a..96c326446 100644 --- a/deploy/docker/cluster-healthcheck.sh +++ b/deploy/docker/cluster-healthcheck.sh @@ -54,8 +54,8 @@ kubectl -n openshell wait --for=jsonpath='{.status.readyReplicas}'=1 statefulset # Verify the sandbox supervisor binary exists on the node filesystem. # Sandbox pods mount /opt/openshell/bin as a read-only hostPath volume and # exec /opt/openshell/bin/openshell-sandbox as their entrypoint. If the binary -# is missing (e.g. cluster image was built without the supervisor-builder -# stage), every sandbox pod will crash with "no such file or directory". +# is missing (e.g. cluster image was built without the staged prebuilt +# binary), every sandbox pod will crash with "no such file or directory". # --------------------------------------------------------------------------- if [ ! -x /opt/openshell/bin/openshell-sandbox ]; then echo "HEALTHCHECK_MISSING_SUPERVISOR: /opt/openshell/bin/openshell-sandbox not found" >&2 diff --git a/tasks/docker.toml b/tasks/docker.toml index 323b47e13..c2cd0122a 100644 --- a/tasks/docker.toml +++ b/tasks/docker.toml @@ -18,6 +18,11 @@ description = "Build the CI Docker image" run = "tasks/scripts/docker-build-ci.sh" hide = true +["build:docker:prebuilt"] +description = "Build and stage Rust binaries consumed by Docker image builds" +run = "tasks/scripts/stage-prebuilt-binaries.sh all" +hide = true + ["build:docker:gateway"] description = "Build the gateway Docker image" run = "tasks/scripts/docker-build-image.sh gateway" @@ -58,6 +63,11 @@ description = "Alias for build:docker:cluster" depends = ["build:docker:cluster"] hide = true +["docker:stage-prebuilt"] +description = "Build and stage Rust binaries consumed by Docker image builds" +depends = ["build:docker:prebuilt"] +hide = true + ["build:docker:cluster:multiarch"] description = "Build multi-arch cluster image and push to a 
registry" run = "tasks/scripts/docker-publish-multiarch.sh" diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index b4d79d4eb..08f503a92 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -148,10 +148,10 @@ fi matches_gateway() { local path=$1 - case "${path}" in - Cargo.toml|Cargo.lock|proto/*|deploy/docker/cross-build.sh) - return 0 - ;; + case "${path}" in + Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) + return 0 + ;; deploy/docker/Dockerfile.images|tasks/scripts/docker-build-image.sh) return 0 ;; @@ -169,10 +169,10 @@ matches_gateway() { matches_supervisor() { local path=$1 - case "${path}" in - Cargo.toml|Cargo.lock|proto/*|deploy/docker/cross-build.sh) - return 0 - ;; + case "${path}" in + Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) + return 0 + ;; deploy/docker/Dockerfile.images|tasks/scripts/docker-build-image.sh) return 0 ;; @@ -210,13 +210,13 @@ compute_fingerprint() { # hashes. This ensures that committed changes (e.g. after `git pull` # or amend) are detected even when there are no uncommitted edits. 
local committed_trees="" - case "${component}" in - gateway) - committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/cross-build.sh deploy/docker/Dockerfile.images tasks/scripts/docker-build-image.sh crates/openshell-core/ crates/openshell-driver-kubernetes/ crates/openshell-ocsf/ crates/openshell-policy/ crates/openshell-providers/ crates/openshell-router/ crates/openshell-server/ 2>/dev/null || true) - ;; - supervisor) - committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/cross-build.sh deploy/docker/Dockerfile.images tasks/scripts/docker-build-image.sh crates/openshell-core/ crates/openshell-policy/ crates/openshell-router/ crates/openshell-sandbox/ 2>/dev/null || true) - ;; + case "${component}" in + gateway) + committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/Dockerfile.images tasks/scripts/docker-build-image.sh tasks/scripts/stage-prebuilt-binaries.sh crates/openshell-core/ crates/openshell-driver-kubernetes/ crates/openshell-ocsf/ crates/openshell-policy/ crates/openshell-providers/ crates/openshell-router/ crates/openshell-server/ 2>/dev/null || true) + ;; + supervisor) + committed_trees=$(git ls-tree HEAD Cargo.toml Cargo.lock proto/ deploy/docker/Dockerfile.images tasks/scripts/docker-build-image.sh tasks/scripts/stage-prebuilt-binaries.sh crates/openshell-core/ crates/openshell-policy/ crates/openshell-router/ crates/openshell-sandbox/ 2>/dev/null || true) + ;; helm) committed_trees=$(git ls-tree HEAD deploy/helm/openshell/ 2>/dev/null || true) ;; @@ -329,8 +329,8 @@ if [[ "${build_supervisor}" == "1" ]]; then x86_64) HOST_ARCH=amd64 ;; esac - # Build the supervisor binary from the shared image build graph, then - # extract it via --output so fast deploys reuse the same Rust cache. + # Stage the supervisor binary through the prebuilt path, then extract it + # via --output from the minimal Docker target. 
SUPERVISOR_BUILD_DIR=$(mktemp -d) trap 'rm -rf "${SUPERVISOR_BUILD_DIR}"' EXIT @@ -340,10 +340,9 @@ if [[ "${build_supervisor}" == "1" ]]; then _cargo_version=$(uv run python tasks/scripts/release.py get-version --cargo 2>/dev/null || true) fi - # Only set DOCKER_PLATFORM when actually cross-compiling. Omitting it - # for native builds lets docker-build-image.sh pick the fast "docker" - # driver (same as gateway), which shares BuildKit cache mounts (sccache, - # cargo registry/target) and avoids docker-container IPC overhead. + # Only set DOCKER_PLATFORM when the cluster architecture differs from the + # local container engine architecture. Omitting it for native builds lets + # docker-build-image.sh pick the fast default builder. _platform_env=() if [[ "${CLUSTER_ARCH}" != "${HOST_ARCH}" ]]; then _platform_env=(DOCKER_PLATFORM="linux/${CLUSTER_ARCH}") diff --git a/tasks/scripts/docker-build-image.sh b/tasks/scripts/docker-build-image.sh index 1b30542db..537b2a2cc 100755 --- a/tasks/scripts/docker-build-image.sh +++ b/tasks/scripts/docker-build-image.sh @@ -8,40 +8,89 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/container-engine.sh" -sha256_16() { - if command -v sha256sum >/dev/null 2>&1; then - sha256sum "$1" | awk '{print substr($1, 1, 16)}' - else - shasum -a 256 "$1" | awk '{print substr($1, 1, 16)}' - fi +normalize_arch() { + case "$1" in + x86_64|amd64) echo "amd64" ;; + aarch64|arm64) echo "arm64" ;; + *) echo "$1" ;; + esac } -sha256_16_stdin() { - if command -v sha256sum >/dev/null 2>&1; then - sha256sum | awk '{print substr($1, 1, 16)}' - else - shasum -a 256 | awk '{print substr($1, 1, 16)}' +prebuilt_arches() { + if [[ -n "${DOCKER_PLATFORM:-}" ]]; then + local raw_platforms=${DOCKER_PLATFORM//[[:space:]]/} + local platform + IFS=',' read -r -a platforms <<< "${raw_platforms}" + for platform in "${platforms[@]}"; do + case "${platform}" in + linux/amd64) echo "amd64" ;; + linux/arm64) echo 
"arm64" ;; + *) + echo "Error: unsupported DOCKER_PLATFORM '${platform}'" >&2 + echo "Supported platforms: linux/amd64, linux/arm64" >&2 + exit 1 + ;; + esac + done + return fi + + normalize_arch "$(ce_info_arch)" } -detect_rust_scope() { - local dockerfile="$1" - local rust_from - rust_from=$(grep -E '^FROM --platform=\$BUILDPLATFORM rust:[^ ]+' "$dockerfile" | head -n1 | sed -E 's/^FROM --platform=\$BUILDPLATFORM rust:([^ ]+).*/\1/' || true) - if [[ -n "${rust_from}" ]]; then - echo "rust-${rust_from}" - return - fi +required_prebuilt_binaries() { + case "$1" in + gateway) + echo "openshell-gateway" + ;; + supervisor|cluster|supervisor-output) + echo "openshell-sandbox" + ;; + esac +} - if grep -q "rustup.rs" "$dockerfile"; then - echo "rustup-stable" - return +missing_prebuilt_paths() { + local target=$1 + local arch + local binary + local path + + mapfile -t arches < <(prebuilt_arches) + read -r -a binaries <<< "$(required_prebuilt_binaries "${target}")" + + for arch in "${arches[@]}"; do + for binary in "${binaries[@]}"; do + path="deploy/docker/.build/prebuilt-binaries/${arch}/${binary}" + if [[ ! -f "${path}" ]]; then + echo "${path}" + fi + done + done +} + +ensure_prebuilt_binaries() { + local target=$1 + local missing + local arch + + if [[ -z "${CI:-}" && "${PREBUILT_AUTO_STAGE:-1}" != "0" ]]; then + echo "Staging prebuilt Rust binaries for Docker target '${target}'..." + mapfile -t arches < <(prebuilt_arches) + for arch in "${arches[@]}"; do + PREBUILT_ARCH="${arch}" "${SCRIPT_DIR}/stage-prebuilt-binaries.sh" "${target}" + done fi - echo "no-rust" + missing="$(missing_prebuilt_paths "${target}")" + if [[ -n "${missing}" ]]; then + echo "Error: missing prebuilt Rust binaries required by Docker target '${target}':" >&2 + printf ' %s\n' ${missing} >&2 + echo "Stage binaries at deploy/docker/.build/prebuilt-binaries// before building." 
>&2 + exit 1 + fi } -TARGET=${1:?"Usage: docker-build-image.sh [extra-args...]"} +TARGET=${1:?"Usage: docker-build-image.sh [extra-args...]"} shift DOCKERFILE="deploy/docker/Dockerfile.images" @@ -69,9 +118,6 @@ case "${TARGET}" in IMAGE_NAME="openshell/cluster" DOCKER_TARGET="cluster" ;; - supervisor-builder) - DOCKER_TARGET="supervisor-builder" - ;; supervisor-output) IS_FINAL_IMAGE=1 IMAGE_NAME="openshell/supervisor" @@ -114,26 +160,6 @@ if [[ -z "${CI:-}" ]]; then fi fi -SCCACHE_ARGS=() -if [[ -n "${SCCACHE_MEMCACHED_ENDPOINT:-}" ]]; then - SCCACHE_ARGS=(--build-arg "SCCACHE_MEMCACHED_ENDPOINT=${SCCACHE_MEMCACHED_ENDPOINT}") -fi - -VERSION_ARGS=() -if [[ -n "${OPENSHELL_CARGO_VERSION:-}" ]]; then - VERSION_ARGS=(--build-arg "OPENSHELL_CARGO_VERSION=${OPENSHELL_CARGO_VERSION}") -elif [[ -n "${CI:-}" ]]; then - CARGO_VERSION=$(uv run python tasks/scripts/release.py get-version --cargo 2>/dev/null || true) - if [[ -n "${CARGO_VERSION}" ]]; then - VERSION_ARGS=(--build-arg "OPENSHELL_CARGO_VERSION=${CARGO_VERSION}") - fi -fi - -LOCK_HASH=$(sha256_16 Cargo.lock) -RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-$(detect_rust_scope "${DOCKERFILE}")} -CACHE_SCOPE_INPUT="v2|shared|release|${LOCK_HASH}|${RUST_SCOPE}" -CARGO_TARGET_CACHE_SCOPE=$(printf '%s' "${CACHE_SCOPE_INPUT}" | sha256_16_stdin) - # The cluster image embeds the packaged Helm chart. if [[ "${TARGET}" == "cluster" ]]; then mkdir -p deploy/docker/.build/charts @@ -145,31 +171,7 @@ if [[ "${TARGET}" == "cluster" && -n "${K3S_VERSION:-}" ]]; then K3S_ARGS=(--build-arg "K3S_VERSION=${K3S_VERSION}") fi -# CI builds use codegen-units=1 for maximum optimization; local builds omit -# the arg so cargo uses the Cargo.toml default (parallel codegen, fast links). -CODEGEN_ARGS=() -if [[ -n "${CI:-}" ]]; then - CODEGEN_ARGS=(--build-arg "CARGO_CODEGEN_UNITS=1") -fi - -# OS-128 Phase 4: opt in to consuming pre-built Rust binaries instead of -# compiling inside Docker. Default path (`build`) is unchanged. 
When -# USE_PREBUILT_BINARIES=true, the Dockerfile's BINARY_SOURCE=prebuilt stages -# are selected, which COPY from deploy/docker/.build/prebuilt-binaries// -# in the build context. Callers must stage the binaries before invoking. -BINARY_SOURCE_ARGS=() -if [[ "${USE_PREBUILT_BINARIES:-}" == "true" ]]; then - case "${TARGET}" in - gateway|supervisor|cluster|supervisor-output) - if [[ ! -d deploy/docker/.build/prebuilt-binaries ]]; then - echo "Error: USE_PREBUILT_BINARIES=true but deploy/docker/.build/prebuilt-binaries/ does not exist" >&2 - echo " Stage binaries at deploy/docker/.build/prebuilt-binaries//openshell-{gateway,sandbox}" >&2 - exit 1 - fi - BINARY_SOURCE_ARGS=(--build-arg "BINARY_SOURCE=prebuilt") - ;; - esac -fi +ensure_prebuilt_binaries "${TARGET}" TAG_ARGS=() if [[ "${IS_FINAL_IMAGE}" == "1" ]]; then @@ -192,26 +194,11 @@ else exit 1 fi -# Default to dev-settings so local builds include test-only settings -# (dummy_bool, dummy_int) that e2e tests depend on, matching CI behaviour. 
-EXTRA_CARGO_FEATURES="${EXTRA_CARGO_FEATURES:-openshell-core/dev-settings}" - -FEATURE_ARGS=() -if [[ -n "${EXTRA_CARGO_FEATURES}" ]]; then - FEATURE_ARGS=(--build-arg "EXTRA_CARGO_FEATURES=${EXTRA_CARGO_FEATURES}") -fi - ce_build \ ${BUILDER_ARGS[@]+"${BUILDER_ARGS[@]}"} \ ${DOCKER_PLATFORM:+--platform ${DOCKER_PLATFORM}} \ ${CACHE_ARGS[@]+"${CACHE_ARGS[@]}"} \ - ${SCCACHE_ARGS[@]+"${SCCACHE_ARGS[@]}"} \ - ${VERSION_ARGS[@]+"${VERSION_ARGS[@]}"} \ ${K3S_ARGS[@]+"${K3S_ARGS[@]}"} \ - ${CODEGEN_ARGS[@]+"${CODEGEN_ARGS[@]}"} \ - ${BINARY_SOURCE_ARGS[@]+"${BINARY_SOURCE_ARGS[@]}"} \ - ${FEATURE_ARGS[@]+"${FEATURE_ARGS[@]}"} \ - --build-arg "CARGO_TARGET_CACHE_SCOPE=${CARGO_TARGET_CACHE_SCOPE}" \ -f "${DOCKERFILE}" \ --target "${DOCKER_TARGET}" \ ${TAG_ARGS[@]+"${TAG_ARGS[@]}"} \ diff --git a/tasks/scripts/gateway-docker.sh b/tasks/scripts/gateway-docker.sh index 23527741f..a481692a3 100644 --- a/tasks/scripts/gateway-docker.sh +++ b/tasks/scripts/gateway-docker.sh @@ -136,10 +136,8 @@ if [[ "${HOST_OS}" == "Linux" && "${HOST_ARCH}" == "${DAEMON_ARCH}" ]]; then mkdir -p "${SUPERVISOR_OUT_DIR}" cp "${ROOT}/target/${SUPERVISOR_TARGET}/debug/openshell-sandbox" "${SUPERVISOR_BIN}" else - # Cross-compile via the existing Docker pipeline. The supervisor-output - # stage in deploy/docker/Dockerfile.images extracts just the openshell- - # sandbox binary, with the actual link happening inside Linux containers - # where FD limits are not a problem. + # Cross-compile through the prebuilt-binary staging helper, then use the + # supervisor-output stage to extract just the openshell-sandbox binary. 
# # This task is gated on a working Docker daemon above, so pin the # container-engine helper to docker — otherwise it auto-detects podman diff --git a/tasks/scripts/stage-prebuilt-binaries.sh b/tasks/scripts/stage-prebuilt-binaries.sh new file mode 100755 index 000000000..03439aa5b --- /dev/null +++ b/tasks/scripts/stage-prebuilt-binaries.sh @@ -0,0 +1,198 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +usage() { + echo "Usage: stage-prebuilt-binaries.sh " >&2 +} + +normalize_arch() { + case "$1" in + x86_64|amd64) echo "amd64" ;; + aarch64|arm64) echo "arm64" ;; + *) echo "$1" ;; + esac +} + +target_triple() { + case "$1" in + amd64) echo "x86_64-unknown-linux-gnu" ;; + arm64) echo "aarch64-unknown-linux-gnu" ;; + *) + echo "unsupported architecture: $1" >&2 + exit 1 + ;; + esac +} + +host_arch() { + normalize_arch "$(uname -m)" +} + +host_os() { + uname -s +} + +detect_arches() { + if [[ -n "${PREBUILT_ARCH:-}" ]]; then + normalize_arch "${PREBUILT_ARCH}" + return + fi + + if [[ -n "${DOCKER_PLATFORM:-}" ]]; then + local raw_platforms=${DOCKER_PLATFORM//[[:space:]]/} + local platform + IFS=',' read -r -a platforms <<< "$raw_platforms" + for platform in "${platforms[@]}"; do + case "$platform" in + linux/amd64) echo "amd64" ;; + linux/arm64) echo "arm64" ;; + *) + echo "unsupported Docker platform for prebuilt binaries: $platform" >&2 + exit 1 + ;; + esac + done + return + fi + + host_arch +} + +components_for_target() { + case "$1" in + gateway) + echo "gateway" + ;; + sandbox|supervisor|cluster|supervisor-output) + echo "sandbox" + ;; + all) + echo "gateway sandbox" + ;; + *) + usage + exit 1 + ;; + esac +} + +resolve_component() { + case "$1" in + gateway) + crate=openshell-server + binary=openshell-gateway + ;; + 
sandbox) + crate=openshell-sandbox + binary=openshell-sandbox + ;; + *) + echo "unsupported binary component: $1" >&2 + exit 1 + ;; + esac +} + +patch_workspace_version() { + if [[ -z "${OPENSHELL_CARGO_VERSION:-}" ]]; then + return + fi + + cargo_toml="${ROOT}/Cargo.toml" + cargo_toml_backup="$(mktemp)" + cp "$cargo_toml" "$cargo_toml_backup" + restore_cargo_toml=1 + sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${OPENSHELL_CARGO_VERSION}"'"/}' "$cargo_toml" +} + +restore_workspace_version() { + if [[ "${restore_cargo_toml:-0}" == "1" ]]; then + cp "$cargo_toml_backup" "$cargo_toml" + rm -f "$cargo_toml_backup" + fi +} + +build_component_for_arch() { + local component=$1 + local arch=$2 + local target + local stage + local features + local cargo_subcommand + local current_host_os + local current_host_arch + + resolve_component "$component" + target="$(target_triple "$arch")" + stage="${ROOT}/deploy/docker/.build/prebuilt-binaries/${arch}" + features="${EXTRA_CARGO_FEATURES:-openshell-core/dev-settings}" + current_host_os="$(host_os)" + current_host_arch="$(host_arch)" + + cargo_subcommand=(cargo build) + if [[ "$current_host_os" != "Linux" || "$current_host_arch" != "$arch" ]]; then + if command -v cargo-zigbuild >/dev/null 2>&1 || mise which cargo-zigbuild >/dev/null 2>&1; then + cargo_subcommand=(cargo zigbuild) + else + echo "Error: cannot build ${binary} for linux/${arch} on ${current_host_os}/${current_host_arch}." >&2 + echo "Install cargo-zigbuild + zig, build on a matching Linux host, or provide prebuilt binaries in:" >&2 + echo " deploy/docker/.build/prebuilt-binaries/${arch}/" >&2 + exit 1 + fi + fi + + echo "Building ${binary} for linux/${arch} (${target})..." 
+ mise x -- rustup target add "$target" >/dev/null 2>&1 || true + + args=( + --release + --target "$target" + -p "$crate" + --bin "$binary" + ) + if [[ -n "$features" ]]; then + args+=(--features "$features") + fi + + ( + cd "$ROOT" + if [[ -n "${OPENSHELL_CARGO_VERSION:-}" ]]; then + export GIT_DIR=/nonexistent + fi + CARGO_INCREMENTAL=0 mise x -- "${cargo_subcommand[@]}" "${args[@]}" + ) + + mkdir -p "$stage" + install -m 0755 "${ROOT}/target/${target}/release/${binary}" "${stage}/${binary}" + ls -lh "${stage}/${binary}" +} + +target=${1:-all} +if [[ "$#" -gt 0 ]]; then + shift +fi +if [[ "$#" -gt 0 ]]; then + usage + exit 1 +fi + +restore_cargo_toml=0 +trap restore_workspace_version EXIT + +patch_workspace_version + +mapfile -t arches < <(detect_arches) +read -r -a components <<< "$(components_for_target "$target")" + +for arch in "${arches[@]}"; do + for component in "${components[@]}"; do + build_component_for_arch "$component" "$arch" + done +done From 287a0567c51c3f94f14eeebcef12f4aefc95dbd6 Mon Sep 17 00:00:00 2001 From: Jonas Toelke Date: Wed, 29 Apr 2026 16:32:54 -0500 Subject: [PATCH 2/4] ci(docker): address prebuilt build review feedback --- .github/workflows/docker-build.yml | 23 +++++++++++++++++++ .github/workflows/release-vm-kernel.yml | 2 +- .../workflows/shadow-rust-native-build.yml | 4 ++++ crates/openshell-vm/scripts/build-rootfs.sh | 5 ++-- deploy/docker/Dockerfile.python-wheels | 2 +- deploy/docker/Dockerfile.python-wheels-macos | 2 +- mise.lock | 2 +- mise.toml | 2 +- tasks/docker.toml | 5 ---- tasks/python.toml | 2 +- tasks/scripts/cluster-deploy-fast.sh | 16 ++++++------- tasks/scripts/vm/build-libkrun.sh | 4 ++-- 12 files changed, 45 insertions(+), 24 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index c131418e5..586bd7a5a 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -218,6 +218,29 @@ jobs: --cache-from "type=gha,scope=${{ inputs.component 
}}-${{ matrix.arch }}" \ --cache-to "type=gha,mode=max,scope=${{ inputs.component }}-${{ matrix.arch }}" + - name: Smoke check ${{ inputs.component }} image + if: ${{ !inputs.push }} + run: | + set -euo pipefail + image="${IMAGE_REGISTRY}/${{ inputs.component }}:${IMAGE_TAG}" + case "${{ inputs.component }}" in + gateway) + output="$(docker run --rm --platform "${{ matrix.platform }}" "$image" --version)" + echo "$output" + grep -q '^openshell-gateway ' <<<"$output" + ;; + supervisor) + output="$(docker run --rm --platform "${{ matrix.platform }}" "$image" --version)" + echo "$output" + grep -q '^openshell-sandbox ' <<<"$output" + ;; + cluster) + output="$(docker run --rm --platform "${{ matrix.platform }}" --entrypoint /opt/openshell/bin/openshell-sandbox "$image" --version)" + echo "$output" + grep -q '^openshell-sandbox ' <<<"$output" + ;; + esac + merge: name: Merge ${{ inputs.component }} manifest needs: [resolve, build] diff --git a/.github/workflows/release-vm-kernel.yml b/.github/workflows/release-vm-kernel.yml index c1593da31..81712cd97 100644 --- a/.github/workflows/release-vm-kernel.yml +++ b/.github/workflows/release-vm-kernel.yml @@ -135,7 +135,7 @@ jobs: - name: Install dependencies run: | set -euo pipefail - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" brew install lld dtc xz diff --git a/.github/workflows/shadow-rust-native-build.yml b/.github/workflows/shadow-rust-native-build.yml index 1c12950ad..d71ac33fa 100644 --- a/.github/workflows/shadow-rust-native-build.yml +++ b/.github/workflows/shadow-rust-native-build.yml @@ -192,6 +192,10 @@ jobs: sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "'"${{ steps.version.outputs.cargo_version }}"'"/}' Cargo.toml - name: Build ${{ steps.target.outputs.binary }} (${{ steps.target.outputs.target }}) 
+ env: + # Preserve the release-codegen setting used by the old Dockerfile + # Rust build path so image artifacts keep the same release profile. + CARGO_PROFILE_RELEASE_CODEGEN_UNITS: "1" run: | set -euo pipefail args=( diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh index 9aa67ade4..02f72cdfe 100755 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -366,9 +366,8 @@ chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" # ── Build and inject openshell-sandbox supervisor binary ───────────── # The supervisor binary runs inside every sandbox pod. It is side-loaded # from the node filesystem via a read-only hostPath volume mount at -# /opt/openshell/bin. In the Docker-based gateway this is built in the - # Container images consume a prebuilt supervisor binary; here we - # cross-compile from the host using cargo-zigbuild. +# /opt/openshell/bin. Container images consume a prebuilt supervisor +# binary; here we cross-compile from the host using cargo-zigbuild. 
SUPERVISOR_TARGET="${RUST_TARGET}" SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" diff --git a/deploy/docker/Dockerfile.python-wheels b/deploy/docker/Dockerfile.python-wheels index 91e2223c4..1b3c72ab8 100644 --- a/deploy/docker/Dockerfile.python-wheels +++ b/deploy/docker/Dockerfile.python-wheels @@ -21,7 +21,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libssl-dev \ && rm -rf /var/lib/apt/lists/* -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 RUN pip install --no-cache-dir maturin COPY deploy/docker/cross-build.sh /usr/local/bin/ diff --git a/deploy/docker/Dockerfile.python-wheels-macos b/deploy/docker/Dockerfile.python-wheels-macos index 79cc6d9b4..7440dc13e 100644 --- a/deploy/docker/Dockerfile.python-wheels-macos +++ b/deploy/docker/Dockerfile.python-wheels-macos @@ -36,7 +36,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # arm64-apple-macosx-ld. Provide a linker alias to osxcross ld64. 
RUN ln -sf /osxcross/bin/arm64-apple-darwin25.1-ld /usr/local/bin/arm64-apple-macosx-ld -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 RUN rustup target add aarch64-apple-darwin RUN pip install --no-cache-dir maturin diff --git a/mise.lock b/mise.lock index d5d110bcc..af5f7464f 100644 --- a/mise.lock +++ b/mise.lock @@ -155,7 +155,7 @@ url = "https://github.com/astral-sh/python-build-standalone/releases/download/20 provenance = "github-attestations" [[tools.rust]] -version = "stable" +version = "1.88.0" backend = "core:rust" [[tools.uv]] diff --git a/mise.toml b/mise.toml index 236cca4f8..c47baa118 100644 --- a/mise.toml +++ b/mise.toml @@ -20,7 +20,7 @@ lockfile_platforms = ["linux-x64", "linux-arm64", "macos-arm64"] [tools] python = "3.13.13" -rust = "stable" +rust = "1.88.0" node = "24.15.0" kubectl = "1.35.4" uv = "0.10.12" diff --git a/tasks/docker.toml b/tasks/docker.toml index c2cd0122a..b952d559c 100644 --- a/tasks/docker.toml +++ b/tasks/docker.toml @@ -63,11 +63,6 @@ description = "Alias for build:docker:cluster" depends = ["build:docker:cluster"] hide = true -["docker:stage-prebuilt"] -description = "Build and stage Rust binaries consumed by Docker image builds" -depends = ["build:docker:prebuilt"] -hide = true - ["build:docker:cluster:multiarch"] description = "Build multi-arch cluster image and push to a registry" run = "tasks/scripts/docker-publish-multiarch.sh" diff --git a/tasks/python.toml b/tasks/python.toml index 368fbfc93..98f6a5b1e 100644 --- a/tasks/python.toml +++ b/tasks/python.toml @@ -168,7 +168,7 @@ if [ -z "$CARGO_VERSION" ] && [ -n "${CI:-}" ]; then fi LOCK_HASH=$(sha256_16 Cargo.lock) -RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-rustup-stable} +RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-rustup-1.88.0} CACHE_SCOPE_INPUT="v1|python-wheels-macos|base|${LOCK_HASH}|${RUST_SCOPE}" CARGO_TARGET_CACHE_SCOPE=$(printf '%s' 
"$CACHE_SCOPE_INPUT" | sha256_16_stdin) diff --git a/tasks/scripts/cluster-deploy-fast.sh b/tasks/scripts/cluster-deploy-fast.sh index 08f503a92..e94c00f7d 100755 --- a/tasks/scripts/cluster-deploy-fast.sh +++ b/tasks/scripts/cluster-deploy-fast.sh @@ -148,10 +148,10 @@ fi matches_gateway() { local path=$1 - case "${path}" in - Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) - return 0 - ;; + case "${path}" in + Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) + return 0 + ;; deploy/docker/Dockerfile.images|tasks/scripts/docker-build-image.sh) return 0 ;; @@ -169,10 +169,10 @@ matches_gateway() { matches_supervisor() { local path=$1 - case "${path}" in - Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) - return 0 - ;; + case "${path}" in + Cargo.toml|Cargo.lock|proto/*|tasks/scripts/stage-prebuilt-binaries.sh) + return 0 + ;; deploy/docker/Dockerfile.images|tasks/scripts/docker-build-image.sh) return 0 ;; diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 9e2217f50..70d28ac39 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -283,8 +283,8 @@ ensure_cargo_for_libkrun() { echo "ERROR: Cargo >= ${min_ver} is required to build libkrun (Rust edition 2024)." 
>&2 echo " Current: $(command -v cargo 2>/dev/null || echo '(no cargo in PATH)') $(cargo --version 2>/dev/null || true)" >&2 - echo " Typical fix: run vm:setup via mise from the repo so Rust stable is on PATH," >&2 - echo " or: rustup update stable && export PATH=\"\$HOME/.cargo/bin:\$PATH\"" >&2 + echo " Typical fix: run vm:setup via mise from the repo so Rust 1.88.0 is on PATH," >&2 + echo " or: rustup toolchain install 1.88.0 && rustup default 1.88.0 && export PATH=\"\$HOME/.cargo/bin:\$PATH\"" >&2 echo " Override minimum: LIBKRUN_MIN_CARGO_VERSION=…" >&2 exit 1 } From cecd028c297a707fc4b419fe719272e63a505a71 Mon Sep 17 00:00:00 2001 From: Jonas Toelke Date: Wed, 29 Apr 2026 16:48:07 -0500 Subject: [PATCH 3/4] ci(rust): allow existing vfio complexity --- crates/openshell-vfio/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/openshell-vfio/src/lib.rs b/crates/openshell-vfio/src/lib.rs index bc226a78c..946c9474d 100644 --- a/crates/openshell-vfio/src/lib.rs +++ b/crates/openshell-vfio/src/lib.rs @@ -605,6 +605,7 @@ fn bind_device_to_vfio(sysfs: &SysfsRoot, bdf: &str) -> Result /// Also binds all companion devices in the same IOMMU group (e.g. the /// HD Audio function on consumer GPUs). All bound companions are tracked /// and restored when the guard is dropped. +#[allow(clippy::cognitive_complexity)] pub fn prepare_gpu_for_passthrough( sysfs: &SysfsRoot, bdf: &str, @@ -748,6 +749,7 @@ fn restore_gpu_to_host_driver_ex( /// Removes the state file only when all bindings are resolved; rewrites it /// with the remaining entries when some restorations fail so they can be /// retried on the next process start. 
+#[allow(clippy::cognitive_complexity)] pub fn reconcile_stale_bindings(sysfs: &SysfsRoot, state_path: &Path) -> Vec { let state = match GpuBindState::load(state_path) { Ok(s) => s, From d46a3902d86489e420748395e0f70d747cfcd4e0 Mon Sep 17 00:00:00 2001 From: Jonas Toelke Date: Wed, 29 Apr 2026 18:52:57 -0500 Subject: [PATCH 4/4] ci(rust): pin toolchain to 1.95 --- .github/workflows/release-vm-kernel.yml | 2 +- crates/openshell-vfio/src/lib.rs | 2 -- deploy/docker/Dockerfile.cli-macos | 2 +- deploy/docker/Dockerfile.driver-vm-macos | 2 +- deploy/docker/Dockerfile.gateway-macos | 2 +- deploy/docker/Dockerfile.python-wheels | 2 +- deploy/docker/Dockerfile.python-wheels-macos | 2 +- deploy/docker/Dockerfile.vm-macos | 2 +- mise.lock | 2 +- mise.toml | 2 +- rust-toolchain.toml | 2 +- tasks/python.toml | 2 +- tasks/scripts/vm/build-libkrun.sh | 4 ++-- 13 files changed, 13 insertions(+), 15 deletions(-) diff --git a/.github/workflows/release-vm-kernel.yml b/.github/workflows/release-vm-kernel.yml index 81712cd97..ca773ef0f 100644 --- a/.github/workflows/release-vm-kernel.yml +++ b/.github/workflows/release-vm-kernel.yml @@ -135,7 +135,7 @@ jobs: - name: Install dependencies run: | set -euo pipefail - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.95.0 echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" brew install lld dtc xz diff --git a/crates/openshell-vfio/src/lib.rs b/crates/openshell-vfio/src/lib.rs index 946c9474d..bc226a78c 100644 --- a/crates/openshell-vfio/src/lib.rs +++ b/crates/openshell-vfio/src/lib.rs @@ -605,7 +605,6 @@ fn bind_device_to_vfio(sysfs: &SysfsRoot, bdf: &str) -> Result /// Also binds all companion devices in the same IOMMU group (e.g. the /// HD Audio function on consumer GPUs). All bound companions are tracked /// and restored when the guard is dropped. 
-#[allow(clippy::cognitive_complexity)] pub fn prepare_gpu_for_passthrough( sysfs: &SysfsRoot, bdf: &str, @@ -749,7 +748,6 @@ fn restore_gpu_to_host_driver_ex( /// Removes the state file only when all bindings are resolved; rewrites it /// with the remaining entries when some restorations fail so they can be /// retried on the next process start. -#[allow(clippy::cognitive_complexity)] pub fn reconcile_stale_bindings(sysfs: &SysfsRoot, state_path: &Path) -> Vec { let state = match GpuBindState::load(state_path) { Ok(s) => s, diff --git a/deploy/docker/Dockerfile.cli-macos b/deploy/docker/Dockerfile.cli-macos index f9370691c..7565a4a16 100644 --- a/deploy/docker/Dockerfile.cli-macos +++ b/deploy/docker/Dockerfile.cli-macos @@ -37,7 +37,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pkg-config \ && rm -rf /var/lib/apt/lists/* -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.95.0 # aws-lc-sys probes with --target=arm64-apple-macosx and clang then looks for # arm64-apple-macosx-ld. Provide a linker alias to osxcross ld64. 
diff --git a/deploy/docker/Dockerfile.driver-vm-macos b/deploy/docker/Dockerfile.driver-vm-macos index ac0aec952..47fcbd3e1 100644 --- a/deploy/docker/Dockerfile.driver-vm-macos +++ b/deploy/docker/Dockerfile.driver-vm-macos @@ -40,7 +40,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pkg-config \ && rm -rf /var/lib/apt/lists/* -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.95.0 RUN rustup target add aarch64-apple-darwin diff --git a/deploy/docker/Dockerfile.gateway-macos b/deploy/docker/Dockerfile.gateway-macos index b0ac282fc..29f72a65d 100644 --- a/deploy/docker/Dockerfile.gateway-macos +++ b/deploy/docker/Dockerfile.gateway-macos @@ -31,7 +31,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pkg-config \ && rm -rf /var/lib/apt/lists/* -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.95.0 RUN ln -sf /osxcross/bin/arm64-apple-darwin25.1-ld /usr/local/bin/arm64-apple-macosx-ld diff --git a/deploy/docker/Dockerfile.python-wheels b/deploy/docker/Dockerfile.python-wheels index 1b3c72ab8..e93bf8f22 100644 --- a/deploy/docker/Dockerfile.python-wheels +++ b/deploy/docker/Dockerfile.python-wheels @@ -21,7 +21,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libssl-dev \ && rm -rf /var/lib/apt/lists/* -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.95.0 RUN pip install --no-cache-dir maturin COPY deploy/docker/cross-build.sh /usr/local/bin/ diff --git a/deploy/docker/Dockerfile.python-wheels-macos b/deploy/docker/Dockerfile.python-wheels-macos 
index 7440dc13e..b0fc4ddfb 100644 --- a/deploy/docker/Dockerfile.python-wheels-macos +++ b/deploy/docker/Dockerfile.python-wheels-macos @@ -36,7 +36,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # arm64-apple-macosx-ld. Provide a linker alias to osxcross ld64. RUN ln -sf /osxcross/bin/arm64-apple-darwin25.1-ld /usr/local/bin/arm64-apple-macosx-ld -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.95.0 RUN rustup target add aarch64-apple-darwin RUN pip install --no-cache-dir maturin diff --git a/deploy/docker/Dockerfile.vm-macos b/deploy/docker/Dockerfile.vm-macos index c033e43e8..4527217bc 100644 --- a/deploy/docker/Dockerfile.vm-macos +++ b/deploy/docker/Dockerfile.vm-macos @@ -40,7 +40,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ pkg-config \ && rm -rf /var/lib/apt/lists/* -RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.88.0 +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.95.0 RUN rustup target add aarch64-apple-darwin diff --git a/mise.lock b/mise.lock index a6bdc45ef..e5b6ce16b 100644 --- a/mise.lock +++ b/mise.lock @@ -247,7 +247,7 @@ url = "https://github.com/astral-sh/python-build-standalone/releases/download/20 provenance = "github-attestations" [[tools.rust]] -version = "1.88.0" +version = "1.95.0" backend = "core:rust" [[tools.uv]] diff --git a/mise.toml b/mise.toml index c47baa118..fc4961db8 100644 --- a/mise.toml +++ b/mise.toml @@ -20,7 +20,7 @@ lockfile_platforms = ["linux-x64", "linux-arm64", "macos-arm64"] [tools] python = "3.13.13" -rust = "1.88.0" +rust = "1.95.0" node = "24.15.0" kubectl = "1.35.4" uv = "0.10.12" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 25f96ab68..26a307fdf 100644 --- a/rust-toolchain.toml +++ 
b/rust-toolchain.toml @@ -2,4 +2,4 @@ # SPDX-License-Identifier: Apache-2.0 [toolchain] -channel = "stable" +channel = "1.95.0" diff --git a/tasks/python.toml b/tasks/python.toml index 98f6a5b1e..b95d96671 100644 --- a/tasks/python.toml +++ b/tasks/python.toml @@ -168,7 +168,7 @@ if [ -z "$CARGO_VERSION" ] && [ -n "${CI:-}" ]; then fi LOCK_HASH=$(sha256_16 Cargo.lock) -RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-rustup-1.88.0} +RUST_SCOPE=${RUST_TOOLCHAIN_SCOPE:-rustup-1.95.0} CACHE_SCOPE_INPUT="v1|python-wheels-macos|base|${LOCK_HASH}|${RUST_SCOPE}" CARGO_TARGET_CACHE_SCOPE=$(printf '%s' "$CACHE_SCOPE_INPUT" | sha256_16_stdin) diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 70d28ac39..ec636f2a3 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -283,8 +283,8 @@ ensure_cargo_for_libkrun() { echo "ERROR: Cargo >= ${min_ver} is required to build libkrun (Rust edition 2024)." >&2 echo " Current: $(command -v cargo 2>/dev/null || echo '(no cargo in PATH)') $(cargo --version 2>/dev/null || true)" >&2 - echo " Typical fix: run vm:setup via mise from the repo so Rust 1.88.0 is on PATH," >&2 - echo " or: rustup toolchain install 1.88.0 && rustup default 1.88.0 && export PATH=\"\$HOME/.cargo/bin:\$PATH\"" >&2 + echo " Typical fix: run vm:setup via mise from the repo so Rust 1.95.0 is on PATH," >&2 + echo " or: rustup toolchain install 1.95.0 && rustup default 1.95.0 && export PATH=\"\$HOME/.cargo/bin:\$PATH\"" >&2 echo " Override minimum: LIBKRUN_MIN_CARGO_VERSION=…" >&2 exit 1 }