Skip to content

Commit bde9b10

Browse files
committed
refactor(selfupdate): CC-style stateless self-update — runner self-decides, reports nothing
Collapse the upgrade flow to the Claude-Code model: the backend only advertises the latest version; the runner alone decides whether to apply and reports no status over the wire. The backend infers the outcome purely from the version the runner reports on its next heartbeat. - Drop the upgrade.status wire message + UpgradeStatusPayload; the phase labels are now local-log-only constants (journald), not a protocol. - Add --disable-auto-update (env FLASHDUTY_RUNNER_DISABLE_AUTO_UPDATE): a per-runner opt-out replaces any server-side policy. handleUpgrade short-circuits on it before any disk work. - CanSelfUpdate(): probe whether the binary's dir is writable and skip the swap for manual root-owned installs instead of looping a doomed upgrade. - Runner-local last-rolled-back-version guard (ShouldSkipTarget): since the backend re-advertises the same latest every heartbeat, a rollback records the bad version and the handler ignores re-advertisements of exactly that version until a newer one is published or a later upgrade commits — no more endless download -> swap -> rollback loop. - Download resilience: a 60s no-bytes stall watchdog plus bounded retry on transient failures (connection drop, stall, 5xx/429), and NO retry on deterministic ones (checksum mismatch, 4xx, bad archive). - install.sh documents the opt-out in the generated env file.
1 parent 10fadba commit bde9b10

13 files changed

Lines changed: 482 additions & 103 deletions

cmd/main.go

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import (
1010
"os"
1111
"os/signal"
1212
"path/filepath"
13-
"sync"
13+
"strings"
1414
"sync/atomic"
1515
"syscall"
1616
"time"
@@ -19,7 +19,6 @@ import (
1919

2020
"github.com/flashcatcloud/flashduty-runner/environment"
2121
"github.com/flashcatcloud/flashduty-runner/permission"
22-
"github.com/flashcatcloud/flashduty-runner/protocol"
2322
"github.com/flashcatcloud/flashduty-runner/selfupdate"
2423
"github.com/flashcatcloud/flashduty-runner/ws"
2524
)
@@ -33,11 +32,12 @@ var (
3332

3433
// Command line flags
3534
var (
36-
flagToken string
37-
flagURL string
38-
flagWorkspace string
39-
flagLogLevel string
40-
flagMaxAttempts int
35+
flagToken string
36+
flagURL string
37+
flagWorkspace string
38+
flagLogLevel string
39+
flagMaxAttempts int
40+
flagDisableAutoUpdate bool
4141
)
4242

4343
// Default values
@@ -113,6 +113,7 @@ Environment variables:
113113
cmd.Flags().StringVar(&flagWorkspace, "workspace", "", "Workspace root directory (env: FLASHDUTY_RUNNER_WORKSPACE)")
114114
cmd.Flags().StringVar(&flagLogLevel, "log-level", "", "Log level: debug, info, warn, error (env: FLASHDUTY_RUNNER_LOG_LEVEL)")
115115
cmd.Flags().IntVar(&flagMaxAttempts, "max-attempts", -1, "Max reconnect attempts (0=unlimited, default=30, env: FLASHDUTY_RUNNER_MAX_ATTEMPTS)")
116+
cmd.Flags().BoolVar(&flagDisableAutoUpdate, "disable-auto-update", false, "Opt out of automatic self-updates; this runner stays on its installed version (env: FLASHDUTY_RUNNER_DISABLE_AUTO_UPDATE)")
116117

117118
return cmd
118119
}
@@ -131,11 +132,12 @@ func versionCmd() *cobra.Command {
131132

132133
// Config holds the runtime configuration
133134
type Config struct {
134-
Token string
135-
URL string
136-
WorkspaceRoot string
137-
LogLevel string
138-
MaxAttempts int
135+
Token string
136+
URL string
137+
WorkspaceRoot string
138+
LogLevel string
139+
MaxAttempts int
140+
DisableAutoUpdate bool
139141
}
140142

141143
func loadConfig() (*Config, error) {
@@ -197,9 +199,23 @@ func loadConfig() (*Config, error) {
197199
}
198200
}
199201

202+
// Auto-update opt-out: flag OR env truthy disables it. Opt-out only, so we
203+
// never need to distinguish "explicitly false" from "unset".
204+
cfg.DisableAutoUpdate = flagDisableAutoUpdate || isTruthy(os.Getenv("FLASHDUTY_RUNNER_DISABLE_AUTO_UPDATE"))
205+
200206
return cfg, nil
201207
}
202208

209+
// isTruthy reports whether an env-var string requests an enabled boolean.
210+
func isTruthy(v string) bool {
211+
switch strings.ToLower(strings.TrimSpace(v)) {
212+
case "1", "true", "yes", "on":
213+
return true
214+
default:
215+
return false
216+
}
217+
}
218+
203219
func runRunner() error {
204220
// Load configuration
205221
cfg, err := loadConfig()
@@ -241,6 +257,10 @@ func runRunner() error {
241257

242258
// Create message handler
243259
handler := ws.NewHandler(wspace)
260+
handler.SetDisableAutoUpdate(cfg.DisableAutoUpdate)
261+
if cfg.DisableAutoUpdate {
262+
slog.Info("auto-update disabled; this runner stays on its installed version", "version", Version)
263+
}
244264

245265
// Create WebSocket client
246266
client := ws.NewClient(cfg.Token, cfg.URL, cfg.WorkspaceRoot, handler.Handle, Version, cfg.MaxAttempts)
@@ -265,27 +285,17 @@ func runRunner() error {
265285
}
266286

267287
// On the first successful handshake during probation, commit the upgrade
268-
// (clear marker + .bak) and report it; if we rolled back on boot, report
269-
// that once and clear the marker. onConnected fires only after a successful
288+
// (clear marker + .bak + failed-version memory). The backend infers success
289+
// purely from the version the runner reports on its next heartbeat — there is
290+
// no upgrade-status wire message. onConnected fires only after a successful
270291
// welcome handshake (ws.Client.Connect returns an error otherwise).
271-
var reportRolledBackOnce sync.Once
272292
client.SetOnConnected(func() {
273293
if bootOutcome.InProbation && committed.CompareAndSwap(false, true) {
274294
if probationTimer != nil {
275295
probationTimer.Stop()
276296
}
277297
pm.Commit()
278-
_ = client.SendPayload(protocol.MessageTypeUpgradeStatus, protocol.UpgradeStatusPayload{
279-
TargetVersion: Version, Phase: protocol.UpgradePhaseCommitted,
280-
})
281-
}
282-
if bootOutcome.ReportRolledBack {
283-
reportRolledBackOnce.Do(func() {
284-
_ = client.SendPayload(protocol.MessageTypeUpgradeStatus, protocol.UpgradeStatusPayload{
285-
TargetVersion: bootOutcome.RolledBackTarget, Phase: protocol.UpgradePhaseRolledBack,
286-
})
287-
pm.ClearAfterReport()
288-
})
298+
slog.Info("self-update committed after successful handshake", "version", Version)
289299
}
290300
})
291301

install.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,10 @@ write_env_file() {
419419
printf 'FLASHDUTY_RUNNER_URL=%s\n' "$URL"
420420
printf 'FLASHDUTY_RUNNER_WORKSPACE=%s\n' "$WORKSPACE_DIR"
421421
printf 'FLASHDUTY_RUNNER_LOG_LEVEL=info\n'
422+
printf '# Auto-update is on by default: the runner upgrades itself when Flashduty\n'
423+
printf '# publishes a newer version. To pin this host to its installed version,\n'
424+
printf '# uncomment the next line (then: systemctl restart flashduty-runner).\n'
425+
printf '# FLASHDUTY_RUNNER_DISABLE_AUTO_UPDATE=true\n'
422426
} >"$ENV_FILE"
423427
chown 0:0 "$ENV_FILE" 2>/dev/null || true
424428
chmod 0600 "$ENV_FILE"

protocol/messages.go

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ const (
3131
// unregister timeout.
3232
MessageTypeShutdown MessageType = "shutdown"
3333

34-
// Flashduty -> Runner
34+
// Flashduty -> Runner. Advertises the target version in response to a
35+
// heartbeat; the runner self-decides whether to apply (it reports only its
36+
// own version back — there is no upgrade-status channel).
3537
MessageTypeUpgrade MessageType = "upgrade"
36-
// Runner -> Flashduty
37-
MessageTypeUpgradeStatus MessageType = "upgrade.status"
3838
)
3939

4040
// Message is the base WebSocket message structure.
@@ -488,28 +488,21 @@ type ReconcileKnowledgeManifestResult struct {
488488
NeedsStage []string `json:"needs_stage,omitempty"`
489489
}
490490

491-
// Upgrade phases (UpgradeStatusPayload.Phase values).
491+
// Upgrade phase labels for the runner's own local logging (journald). They are
492+
// not reported over the wire — the backend infers progress from the version the
493+
// runner reports on its next heartbeat.
492494
const (
493-
UpgradePhaseReceived = "received"
494-
UpgradePhaseDeferred = "deferred"
495495
UpgradePhaseDownloading = "downloading"
496496
UpgradePhaseVerifying = "verifying"
497497
UpgradePhaseSwapping = "swapping"
498-
UpgradePhaseCommitted = "committed"
499-
UpgradePhaseRolledBack = "rolled_back"
500498
UpgradePhaseFailed = "failed"
501499
)
502500

503-
// UpgradePayload is sent by Flashduty to trigger an in-process self-update.
501+
// UpgradePayload is sent by Flashduty to advertise the target version (with its
502+
// download URL + checksum) in response to a heartbeat. The runner self-decides
503+
// whether to apply it.
504504
type UpgradePayload struct {
505505
TargetVersion string `json:"target_version"`
506506
DownloadURL string `json:"download_url"`
507507
SHA256 string `json:"sha256"`
508508
}
509-
510-
// UpgradeStatusPayload reports self-update progress back to Flashduty.
511-
type UpgradeStatusPayload struct {
512-
TargetVersion string `json:"target_version"`
513-
Phase string `json:"phase"`
514-
Error string `json:"error,omitempty"`
515-
}

selfupdate/capability.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package selfupdate
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
)
7+
8+
// CanSelfUpdate reports whether the runner can replace its own binary in place.
9+
// The atomic swap copies the current binary to <dir>/.bak and renames the new
10+
// one over the canonical path, so the executable's *directory* must be writable
11+
// by the service user. A manual install that drops the binary in root-owned
12+
// /usr/local/bin and runs as a non-root user cannot — and must not attempt to —
13+
// self-update; the handler checks this before a doomed download ever begins, so
14+
// such runners simply stay on their version instead of failing in a loop.
15+
func CanSelfUpdate(exePath string) bool {
16+
f, err := os.CreateTemp(filepath.Dir(exePath), ".swcheck-*")
17+
if err != nil {
18+
return false
19+
}
20+
name := f.Name()
21+
_ = f.Close()
22+
_ = os.Remove(name)
23+
return true
24+
}

selfupdate/capability_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package selfupdate
2+
3+
import (
4+
"os"
5+
"path/filepath"
6+
"testing"
7+
)
8+
9+
// TestCanSelfUpdateWritableDir verifies that a runner whose executable lives in
10+
// a writable directory can self-update, and that the writability probe leaves no
11+
// litter behind.
12+
func TestCanSelfUpdateWritableDir(t *testing.T) {
13+
dir := t.TempDir()
14+
exe := filepath.Join(dir, "flashduty-runner")
15+
if err := os.WriteFile(exe, []byte("x"), 0o755); err != nil { //nolint:gosec // test fixture
16+
t.Fatal(err)
17+
}
18+
if !CanSelfUpdate(exe) {
19+
t.Fatal("expected CanSelfUpdate true for a writable exe dir")
20+
}
21+
entries, _ := os.ReadDir(dir)
22+
for _, e := range entries {
23+
if e.Name() != "flashduty-runner" {
24+
t.Fatalf("writability probe left a stray file: %s", e.Name())
25+
}
26+
}
27+
}
28+
29+
// TestCanSelfUpdateNonWritableDir verifies that a manual install whose binary
30+
// sits in a directory the runner cannot write (modelled here as a missing
31+
// parent) reports it cannot self-update, so the handler skips the upgrade
32+
// locally instead of looping on a swap it can never complete.
33+
func TestCanSelfUpdateNonWritableDir(t *testing.T) {
34+
exe := filepath.Join(t.TempDir(), "missing-subdir", "flashduty-runner")
35+
if CanSelfUpdate(exe) {
36+
t.Fatal("expected CanSelfUpdate false when the exe dir is not writable")
37+
}
38+
}

selfupdate/download_retry_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package selfupdate
2+
3+
import (
4+
"context"
5+
"net/http"
6+
"net/http/httptest"
7+
"path/filepath"
8+
"sync/atomic"
9+
"testing"
10+
"time"
11+
)
12+
13+
// TestDownloadRetriesTransientThenSucceeds verifies the borrowed download
14+
// resilience: two 503s are retried and the third (200) succeeds.
15+
func TestDownloadRetriesTransientThenSucceeds(t *testing.T) {
16+
binBytes := []byte("hi")
17+
targz := makeTarGz(t, "flashduty-runner", binBytes)
18+
sum := sha256Hex(targz)
19+
20+
var calls int32
21+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
22+
if atomic.AddInt32(&calls, 1) < 3 {
23+
w.WriteHeader(http.StatusServiceUnavailable)
24+
return
25+
}
26+
_, _ = w.Write(targz)
27+
}))
28+
defer srv.Close()
29+
30+
// Shrink the backoff so the test doesn't actually sleep seconds between tries.
31+
origBackoff := downloadBackoffBase
32+
downloadBackoffBase = time.Millisecond
33+
defer func() { downloadBackoffBase = origBackoff }()
34+
35+
dest := filepath.Join(t.TempDir(), "flashduty-runner.new")
36+
if err := downloadVerifyExtract(context.Background(), srv.URL, sum, dest); err != nil {
37+
t.Fatalf("expected success after retries, got %v", err)
38+
}
39+
if got := atomic.LoadInt32(&calls); got != 3 {
40+
t.Fatalf("expected 3 attempts, got %d", got)
41+
}
42+
}
43+
44+
// TestDownloadDoesNotRetry4xx verifies a deterministic failure (404 wrong URL)
45+
// is not retried — another attempt would fail identically and burn the window.
46+
func TestDownloadDoesNotRetry4xx(t *testing.T) {
47+
var calls int32
48+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
49+
atomic.AddInt32(&calls, 1)
50+
w.WriteHeader(http.StatusNotFound)
51+
}))
52+
defer srv.Close()
53+
54+
dest := filepath.Join(t.TempDir(), "flashduty-runner.new")
55+
if err := downloadVerifyExtract(context.Background(), srv.URL, "abc", dest); err == nil {
56+
t.Fatal("expected error on 404")
57+
}
58+
if got := atomic.LoadInt32(&calls); got != 1 {
59+
t.Fatalf("expected exactly 1 attempt (no retry on 4xx), got %d", got)
60+
}
61+
}
62+
63+
// TestDownloadDoesNotRetryChecksumMismatch verifies a checksum mismatch is
64+
// deterministic: a re-download of the same artifact can't fix it, so it fails
65+
// after a single attempt.
66+
func TestDownloadDoesNotRetryChecksumMismatch(t *testing.T) {
67+
targz := makeTarGz(t, "flashduty-runner", []byte("hi"))
68+
69+
var calls int32
70+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
71+
atomic.AddInt32(&calls, 1)
72+
_, _ = w.Write(targz)
73+
}))
74+
defer srv.Close()
75+
76+
dest := filepath.Join(t.TempDir(), "flashduty-runner.new")
77+
if err := downloadVerifyExtract(context.Background(), srv.URL, "deadbeef", dest); err == nil {
78+
t.Fatal("expected checksum error")
79+
}
80+
if got := atomic.LoadInt32(&calls); got != 1 {
81+
t.Fatalf("expected exactly 1 attempt (no retry on checksum mismatch), got %d", got)
82+
}
83+
}

selfupdate/failed.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
package selfupdate
2+
3+
import (
4+
"os"
5+
"strings"
6+
)
7+
8+
// Failed-version memory. The backend is stateless: it keeps advertising the
9+
// channel's latest version on every heartbeat. Without a local guard, a runner
10+
// that rolls back from a bad latest would re-download → swap → roll back on a
11+
// loop. So a rollback records the version it backed out of; ShouldSkipTarget
12+
// then ignores re-advertisements of exactly that version until a *newer* one is
13+
// published (different string) or a later upgrade commits (cleared on commit).
14+
15+
func recordFailedVersion(path, version string) {
16+
if version == "" {
17+
return
18+
}
19+
_ = os.WriteFile(path, []byte(version), 0o600)
20+
}
21+
22+
func lastFailedVersion(path string) string {
23+
data, err := os.ReadFile(path)
24+
if err != nil {
25+
return ""
26+
}
27+
return strings.TrimSpace(string(data))
28+
}
29+
30+
func clearFailedVersion(path string) {
31+
_ = os.Remove(path)
32+
}
33+
34+
// ShouldSkipTarget reports whether an advertised target should be ignored
35+
// because this host already rolled back from exactly that version.
36+
func ShouldSkipTarget(exePath, target string) bool {
37+
return target != "" && lastFailedVersion(layoutFor(exePath).Failed) == target
38+
}

0 commit comments

Comments
 (0)