Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 32 additions & 26 deletions runner/.justfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
# Configuration:
# - DSTACK_SHIM_UPLOAD_VERSION: Version of the runner and shim to upload
# - DSTACK_SHIM_UPLOAD_S3_BUCKET: S3 bucket to upload binaries to
# - DSTACK_SHIM_BUILD_ARCH: Target architecture for runner and shim (defaults to amd64)
#
# Build Process:
# - Runner is always built for linux/amd64
# - Shim can be built for any platform (defaults to host platform)
# - When uploading, shim is automatically built for linux/amd64
# - Runner and shim are always built for linux (GOOS=linux is the only supported OS)
# - The target architecture is configurable via DSTACK_SHIM_BUILD_ARCH (or `just --set arch ...`)
# - CGO is enabled only for native builds (Linux host with a matching architecture);
# otherwise it is disabled and DCGM support is dropped
#
# Development Workflows:
# - Local Development:
Expand All @@ -31,13 +33,12 @@ export version := env("DSTACK_SHIM_UPLOAD_VERSION", "0.0.0")
# S3 bucket to upload binaries to
export s3_bucket := env("DSTACK_SHIM_UPLOAD_S3_BUCKET", "dstack-runner-downloads-stgn")

# Download URLs
export runner_download_url := "s3://" + s3_bucket + "/" + version + "/binaries/dstack-runner-linux-amd64"
export shim_download_url := "s3://" + s3_bucket + "/" + version + "/binaries/dstack-shim-linux-amd64"
# Target architecture for runner and shim (GOOS is always linux)
export arch := env("DSTACK_SHIM_BUILD_ARCH", "amd64")

# Shim build configuration
export shim_os := ""
export shim_arch := ""
# Download URLs
export runner_download_url := "s3://" + s3_bucket + "/" + version + "/binaries/dstack-runner-linux-" + arch
export shim_download_url := "s3://" + s3_bucket + "/" + version + "/binaries/dstack-shim-linux-" + arch

# Go toolchain image for running tests in a container (keep in sync with go.mod)
export go_version := env("DSTACK_GO_VERSION", "1.25")
Expand All @@ -47,8 +48,8 @@ export go_version := env("DSTACK_GO_VERSION", "1.25")
build-runner-binary:
#!/usr/bin/env bash
set -e
echo "Building runner for linux/amd64"
cd {{source_directory()}}/cmd/runner && CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
echo "Building runner for linux/$arch"
cd {{source_directory()}}/cmd/runner && CGO_ENABLED=0 GOOS=linux GOARCH=$arch go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
echo "Runner build complete!"

# Build shim
Expand All @@ -57,23 +58,23 @@ build-shim-binary:
#!/usr/bin/env bash
set -e
cd {{source_directory()}}/cmd/shim
if [ -n "$shim_os" ] && [ -n "$shim_arch" ]; then
echo "Building shim for $shim_os/$shim_arch"
if [ "$shim_os" = "linux" ] && [ "$(uname -s)" != "Linux" ]; then
echo "WARNING: Cross-compiling to Linux, disabling CGO (DCGM unavailable)"
CGO_ENABLED=0 GOOS=$shim_os GOARCH=$shim_arch go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
else
CGO_ENABLED=1 GOOS=$shim_os GOARCH=$shim_arch go build -ldflags "-X 'main.Version=$version'"
fi
echo "Building shim for linux/$arch"
host_arch=$(uname -m)
case "$host_arch" in
x86_64) host_arch=amd64 ;;
aarch64 | arm64) host_arch=arm64 ;;
esac
if [ "$(uname -s)" = "Linux" ] && [ "$host_arch" = "$arch" ]; then
CGO_ENABLED=1 GOOS=linux GOARCH=$arch go build -ldflags "-X 'main.Version=$version'"
else
echo "Building shim for current platform"
go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
echo "WARNING: Cross-compiling to linux/$arch, disabling CGO (DCGM unavailable)"
CGO_ENABLED=0 GOOS=linux GOARCH=$arch go build -ldflags "-X 'main.Version=$version' -extldflags '-static'"
fi
echo "Shim build (version: $version) complete!"

# Build both runner and shim
build-runner: build-runner-binary build-shim-binary
echo "Build complete! Linux AMD64 binaries are in their respective cmd directories."
echo "Build complete! linux/$arch binaries are in their respective cmd directories."

# Clean build artifacts
clean-runner:
Expand All @@ -98,13 +99,18 @@ test-runner-in-container *args="-short ./...":
golang:{{go_version}} \
go test -race {{args}}

# Validate shim is built for linux/amd64
# Validate shim is built for the configured linux architecture
[private]
validate-shim-binary:
#!/usr/bin/env bash
set -e
if ! file {{source_directory()}}/cmd/shim/shim | grep -q "ELF 64-bit LSB executable, x86-64"; then
echo "Error: Shim must be built for linux/amd64 for upload"
case "$arch" in
amd64) expected="x86-64" ;;
arm64) expected="ARM aarch64" ;;
*) echo "Error: Unsupported arch '$arch'"; exit 1 ;;
esac
if ! file {{source_directory()}}/cmd/shim/shim | grep -q "ELF 64-bit LSB executable, $expected"; then
echo "Error: Shim must be built for linux/$arch for upload"
exit 1
fi

Expand All @@ -125,7 +131,7 @@ upload-runner-binary:
upload-shim-binary:
#!/usr/bin/env bash
set -e
just --set shim_os linux --set shim_arch amd64 build-shim-binary
just build-shim-binary
just validate-shim-binary
aws s3 cp {{source_directory()}}/cmd/shim/shim "{{shim_download_url}}" --acl public-read
echo "Uploaded shim to S3"
3 changes: 2 additions & 1 deletion runner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,14 @@ These are non-exhaustive lists of external dependencies (executables, libraries)
* `umount`
* `mountpoint`
* `lsblk`
* `blkid`
* `mkfs.ext4`
* (NVIDIA GPU SSH fleet instances only) `nvidia-smi`
* (AMD SSH fleet instances only) `docker` (used for `amd-smi` container)
* (Intel Gaudi SSH fleet instances only) `hl-smi`
* ...

Debian/Ubuntu packages: `mount` (`mount`, `umount`), `util-linux` (`mountpoint`, `lsblk`), `e2fsprogs` (`mkfs.ext4`)
Debian/Ubuntu packages: `mount` (`mount`, `umount`), `util-linux` (`mountpoint`, `lsblk`, `blkid`), `e2fsprogs` (`mkfs.ext4`)

### `dstack-runner`

Expand Down
15 changes: 15 additions & 0 deletions runner/internal/shim/backends/backends.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package backends

import (
"fmt"
)

func GetBackend(backendType string) (Backend, error) {
switch backendType {
case "aws":
return NewAWSBackend(), nil
case "gcp":
return NewGCPBackend(), nil
}
return nil, fmt.Errorf("unknown backend: %q", backendType)
}
165 changes: 0 additions & 165 deletions runner/internal/shim/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"fmt"
"io"
"os"
"os/exec"
"os/user"
"path/filepath"
"strconv"
Expand All @@ -35,7 +34,6 @@ import (
"github.com/dstackai/dstack/runner/internal/common/gpu"
"github.com/dstackai/dstack/runner/internal/common/log"
"github.com/dstackai/dstack/runner/internal/common/types"
"github.com/dstackai/dstack/runner/internal/shim/backends"
"github.com/dstackai/dstack/runner/internal/shim/host"
)

Expand Down Expand Up @@ -612,169 +610,6 @@ func (d *DockerRunner) remove(ctx context.Context, task *Task) (err error) {
return nil
}

func getBackend(backendType string) (backends.Backend, error) {
switch backendType {
case "aws":
return backends.NewAWSBackend(), nil
case "gcp":
return backends.NewGCPBackend(), nil
}
return nil, fmt.Errorf("unknown backend: %q", backendType)
}

func prepareVolumes(ctx context.Context, taskConfig TaskConfig) error {
for _, volume := range taskConfig.Volumes {
err := formatAndMountVolume(ctx, volume)
if err != nil {
return fmt.Errorf("format and mount volume: %w", err)
}
}
return nil
}

func unmountVolumes(ctx context.Context, taskConfig TaskConfig) error {
if len(taskConfig.Volumes) == 0 {
return nil
}
log.Debug(ctx, "Unmounting volumes...")
var failed []string
for _, volume := range taskConfig.Volumes {
mountPoint := getVolumeMountPoint(volume.Name)
cmd := exec.CommandContext(ctx, "mountpoint", mountPoint)
if output, err := cmd.CombinedOutput(); err != nil {
log.Info(ctx, "skipping", "mountpoint", mountPoint, "output", output)
continue
}
cmd = exec.CommandContext(ctx, "umount", "-qf", mountPoint)
if output, err := cmd.CombinedOutput(); err != nil {
log.Error(ctx, "failed to unmount", "mountpoint", mountPoint, "output", output)
failed = append(failed, mountPoint)
} else {
log.Debug(ctx, "unmounted", "mountpoint", mountPoint)
}
}
if len(failed) > 0 {
return fmt.Errorf("failed to unmount volume(s): %v", failed)
}
return nil
}

func formatAndMountVolume(ctx context.Context, volume VolumeInfo) error {
backend, err := getBackend(volume.Backend)
if err != nil {
return fmt.Errorf("get backend: %w", err)
}
deviceName, err := backend.GetRealDeviceName(volume.VolumeId, volume.DeviceName)
if err != nil {
return fmt.Errorf("get real device name: %w", err)
}
fsCreated, err := initFileSystem(ctx, deviceName, !volume.InitFs)
if err != nil {
return fmt.Errorf("init file system: %w", err)
}
// Make FS root directory world-writable (0777) to give any job user
// a permission to create new files
// NOTE: mke2fs (that is, mkfs.ext4) supports `-E root_perms=0777` since 1.47.1:
// https://e2fsprogs.sourceforge.net/e2fsprogs-release.html#1.47.1
// but, as of 2024-12-04, this version is too new to rely on, for example,
// Ubuntu 24.04 LTS has only 1.47.0
// 0 means "do not chmod root directory"
var fsRootPerms os.FileMode = 0
// Change permissions only if the FS was created by us, don't mess with
// user-formatted volumes
if fsCreated {
fsRootPerms = 0o777
}
err = mountDisk(ctx, deviceName, getVolumeMountPoint(volume.Name), fsRootPerms)
if err != nil {
return fmt.Errorf("mount disk: %w", err)
}
return nil
}

func getVolumeMountPoint(volumeName string) string {
// Put volumes in dstack-specific dir to avoid clashes with host dirs.
// /mnt/disks is used since on some VM images other places may not be writable (e.g. GCP COS).
return fmt.Sprintf("/mnt/disks/dstack-volumes/%s", volumeName)
}

func prepareInstanceMountPoints(taskConfig TaskConfig) error {
// If the instance volume directory doesn't exist, create it with world-writable permissions (0777)
// to give any job user a permission to create new files
// If the directory already exists, do nothing, don't mess with already set permissions, especially
// on SSH fleets where permissions are managed by the host admin
for _, mountPoint := range taskConfig.InstanceMounts {
if _, err := os.Stat(mountPoint.InstancePath); errors.Is(err, os.ErrNotExist) {
// All missing parent dirs are created with 0755 permissions
if err = os.MkdirAll(mountPoint.InstancePath, 0o755); err != nil {
return fmt.Errorf("create instance mount directory: %w", err)
}
if err = os.Chmod(mountPoint.InstancePath, 0o777); err != nil {
return fmt.Errorf("chmod instance mount directory: %w", err)
}
} else if err != nil {
return fmt.Errorf("stat instance mount directory: %w", err)
}
}
return nil
}

// initFileSystem creates an ext4 file system on a disk only if the disk is not already has a file system.
// Returns true if the file system is created.
func initFileSystem(ctx context.Context, deviceName string, errorIfNotExists bool) (bool, error) {
// Run the lsblk command to get filesystem type
cmd := exec.CommandContext(ctx, "lsblk", "-no", "FSTYPE", deviceName)
var out bytes.Buffer
cmd.Stdout = &out
if err := cmd.Run(); err != nil {
return false, fmt.Errorf("failed to check if disk is formatted: %w", err)
}

// If the output is not empty, the disk is already formatted
fsType := strings.TrimSpace(out.String())
if fsType != "" {
return false, nil
}

if errorIfNotExists {
return false, fmt.Errorf("disk has no file system")
}

log.Debug(ctx, "formatting disk with ext4 filesystem...", "device", deviceName)
cmd = exec.CommandContext(ctx, "mkfs.ext4", "-F", deviceName)
if output, err := cmd.CombinedOutput(); err != nil {
return false, fmt.Errorf("failed to format disk: %w, output: %s", err, string(output))
}
log.Debug(ctx, "disk formatted succesfully!", "device", deviceName)
return true, nil
}

func mountDisk(ctx context.Context, deviceName, mountPoint string, fsRootPerms os.FileMode) error {
// Create the mount point directory if it doesn't exist
if _, err := os.Stat(mountPoint); os.IsNotExist(err) {
log.Debug(ctx, "creating mount point...", "mountpoint", mountPoint)
if err := os.MkdirAll(mountPoint, 0o755); err != nil {
return fmt.Errorf("failed to create mount point: %w", err)
}
}

// Mount the disk to the mount point
log.Debug(ctx, "mounting disk...", "device", deviceName, "mountpoint", mountPoint)
cmd := exec.CommandContext(ctx, "mount", deviceName, mountPoint)
if output, err := cmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to mount disk: %w, output: %s", err, string(output))
}

if fsRootPerms != 0 {
if err := os.Chmod(mountPoint, fsRootPerms); err != nil {
return fmt.Errorf("failed to chmod volume root directory %s: %w", mountPoint, err)
}
}

log.Debug(ctx, "disk mounted successfully!")
return nil
}

func pullImage(ctx context.Context, client docker.APIClient, taskConfig TaskConfig, logPath string, tracker *PullTracker) error {
if !strings.Contains(taskConfig.ImageName, ":") {
taskConfig.ImageName += ":latest"
Expand Down
Loading
Loading