diff --git a/cmd/serve.go b/cmd/serve.go index 0b4c3d3..7096133 100644 --- a/cmd/serve.go +++ b/cmd/serve.go @@ -49,6 +49,12 @@ func runServe() error { workspaceRoot = filepath.Join(homeDir, ".flashduty") } + // Surface the resolved workspace root at startup. When the cloud image and the + // runner disagree on where the workspace lives (the class of root-divergence + // bug that previously cost hours), this single line makes the truth obvious in + // the logs instead of leaving us to infer it from downstream path errors. + slog.Info("envd workspace root resolved", "workspace_root", workspaceRoot) + // Pin the runner home into the environment so environment.BundledToolsDir // resolves the SAME /bin the workspace uses (keeps install dir and bash // PATH dir from diverging). No-op when FLASHDUTY_RUNNER_HOME was already set. diff --git a/envd/connect.go b/envd/connect.go index 67ccfaa..76a4633 100644 --- a/envd/connect.go +++ b/envd/connect.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "log/slog" "net/http" ) @@ -17,17 +18,97 @@ type connectError struct { // writeUnary writes a Connect-RPC unary response (JSON body + 200 on success, // 4xx/5xx + error envelope on failure). +// +// Invariant: the response body is ALWAYS a complete JSON value, never empty. +// The safari client decodes every unary response with json.Decode, and an empty +// body decodes to io.EOF — which surfaced to the agent as the opaque +// "remote write operation failed: EOF", hiding the real failure (see +// fix/envd-write-error-opacity). On success with no payload we emit `{}` so the +// client always gets a decodable envelope; on the error path we never drop the +// encode error silently. func writeUnary(w http.ResponseWriter, v any, err error) { w.Header().Set("Content-Type", "application/json") if err != nil { code, status := mapError(err) w.WriteHeader(status) - _ = json.NewEncoder(w).Encode(connectError{Code: code, Message: err.Error()}) + if encErr := json.NewEncoder(w).Encode(connectError{Code: code, Message: err.Error()}); encErr != nil { + // The status line + headers are already flushed, so we can't change + // the status now — but we must not leave the connection looking like a + // clean close with no body. Log loudly so the dropped envelope is + // visible in runner logs instead of silently becoming a client EOF. + slog.Error("envd: failed to encode error envelope", + "code", code, "status", status, "orig_error", err.Error(), "encode_error", encErr.Error()) + } return } w.WriteHeader(http.StatusOK) - if v != nil { - _ = json.NewEncoder(w).Encode(v) + // Always write a body. A nil payload (e.g. Write, which returns no value) + // must still serialize to `{}`, not an empty body, or the client's + // json.Decode hits io.EOF and reports a phantom failure. + if v == nil { + v = struct{}{} + } + if encErr := json.NewEncoder(w).Encode(v); encErr != nil { + slog.Error("envd: failed to encode success envelope", "encode_error", encErr.Error()) + } +} + +// recoverMiddleware turns a panic in any handler into a clean Connect-RPC +// internal-error envelope instead of letting the goroutine die and the +// connection close with no body. A dropped connection is exactly what the +// safari client reports as a bare io.EOF, hiding the real cause. We can only +// write a body if nothing has been flushed yet; we use a small interceptor to +// know whether WriteHeader was already called. +func recoverMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + rw := &recordingWriter{ResponseWriter: w} + defer func() { + if rec := recover(); rec != nil { + // http.ErrAbortHandler is net/http's own sentinel for a + // deliberate connection abort (e.g. client disconnect on a + // streaming response). Re-panic so the server suppresses it the + // standard way — swallowing it would leak the handler goroutine, + // which would go on believing it can still write to a dead conn. + if rec == http.ErrAbortHandler { + panic(rec) + } + slog.Error("envd: handler panic recovered", + "path", r.URL.Path, "panic", fmt.Sprintf("%v", rec)) + // Write through rw, not the raw w, so wroteHeader tracking stays + // consistent with the rest of the response path. + if !rw.wroteHeader { + writeUnary(rw, nil, fmt.Errorf("internal error: handler panicked: %v", rec)) + } + // If a header was already written we can't change the status, but + // the log above ensures the panic is never silent. + } + }() + next.ServeHTTP(rw, r) + }) +} + +// recordingWriter tracks whether the wrapped handler has begun the response, so +// recoverMiddleware knows if it can still emit a clean error envelope. +type recordingWriter struct { + http.ResponseWriter + wroteHeader bool +} + +func (rw *recordingWriter) WriteHeader(status int) { + rw.wroteHeader = true + rw.ResponseWriter.WriteHeader(status) +} + +func (rw *recordingWriter) Write(b []byte) (int, error) { + rw.wroteHeader = true + return rw.ResponseWriter.Write(b) +} + +// Flush proxies http.Flusher so streaming handlers (writeStream) keep working +// through the recording wrapper. +func (rw *recordingWriter) Flush() { + if f, ok := rw.ResponseWriter.(http.Flusher); ok { + f.Flush() } } diff --git a/envd/connect_test.go b/envd/connect_test.go index a061fa9..bc91568 100644 --- a/envd/connect_test.go +++ b/envd/connect_test.go @@ -34,6 +34,27 @@ func TestWriteUnary_Error(t *testing.T) { } } +// TestWriteUnary_NilSuccessBodyIsNotEmpty pins the EOF-opacity fix: a successful +// unary response with no payload (e.g. Write) must serialize to a decodable JSON +// value, never an empty body. An empty body decodes to io.EOF on the safari +// client and surfaced as the opaque "remote write operation failed: EOF". +func TestWriteUnary_NilSuccessBodyIsNotEmpty(t *testing.T) { + w := httptest.NewRecorder() + writeUnary(w, nil, nil) + if w.Code != 200 { + t.Fatalf("code=%d", w.Code) + } + body := w.Body.Bytes() + if len(body) == 0 { + t.Fatal("nil-payload success wrote an empty body; client would see io.EOF") + } + // Must be valid JSON the client can Decode without hitting io.EOF. + var got json.RawMessage + if err := json.NewDecoder(w.Body).Decode(&got); err != nil { + t.Fatalf("success body is not decodable JSON: %v (body=%q)", err, string(body)) + } +} + func TestReadUnary_Decodes(t *testing.T) { body, _ := json.Marshal(map[string]int{"x": 7}) r := httptest.NewRequestWithContext(context.Background(), http.MethodPost, "/svc/m", bytes.NewReader(body)) diff --git a/envd/process.go b/envd/process.go index e88ba3a..6617398 100644 --- a/envd/process.go +++ b/envd/process.go @@ -29,11 +29,13 @@ func (pr *processRegistry) add(p *procEntry) { pr.all[p.cmd.Process.Pid] = p pr.mu.Unlock() } + func (pr *processRegistry) remove(pid int) { pr.mu.Lock() delete(pr.all, pid) pr.mu.Unlock() } + func (pr *processRegistry) get(pid int) *procEntry { pr.mu.Lock() defer pr.mu.Unlock() @@ -89,8 +91,19 @@ func (s *Server) handleProcessStart(w http.ResponseWriter, r *http.Request) { // that is the contract. Auth and sandbox isolation enforce the trust boundary; // gosec's caller-controlled-input rule does not apply here. cmd := exec.CommandContext(ctx, req.Process.Cmd, req.Process.Args...) //nolint:gosec // intentional - if req.Process.Cwd != "" { + // Default the working directory to the workspace root when the client omits + // cwd, so a bare `bash` lands in the SAME root the file tools (read/write/ + // glob/sync_files) operate under. Without this the child inherits envd's own + // process cwd — the image WORKDIR (e.g. /home/runner) — which diverges from + // WorkspaceRoot (e.g. /home/runner/.flashduty): files written by bash become + // invisible to the file tools and absolute paths get rejected as "path + // escapes environment root". The BYOC exec path already defaults to the + // workspace root via resolveWorkdir; this brings the cloud process path in line. + switch { + case req.Process.Cwd != "": cmd.Dir = req.Process.Cwd + case s.cfg.WorkspaceRoot != "": + cmd.Dir = s.cfg.WorkspaceRoot } if len(req.Process.Envs) > 0 { env := make([]string, 0, len(req.Process.Envs)) @@ -226,6 +239,7 @@ func (s *Server) handleProcessList(w http.ResponseWriter, r *http.Request) { func (s *Server) handleProcessSendInput(w http.ResponseWriter, _ *http.Request) { writeUnary(w, nil, fmt.Errorf("%w: send_input", errUnimplemented)) } + func (s *Server) handleProcessConnect(w http.ResponseWriter, _ *http.Request) { writeUnary(w, nil, fmt.Errorf("%w: connect", errUnimplemented)) } diff --git a/envd/process_test.go b/envd/process_test.go index 70d0f53..ced8e19 100644 --- a/envd/process_test.go +++ b/envd/process_test.go @@ -5,6 +5,8 @@ import ( "encoding/json" "io" "net/http" + "os" + "path/filepath" "runtime" "strconv" "strings" @@ -121,3 +123,53 @@ func TestProcessSendSignal_KillsRunning(t *testing.T) { t.Fatal("start stream did not close after kill") } } + +func TestProcessStart_DefaultsCwdToWorkspaceRoot(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("envd Process.Start uses POSIX /bin commands; serve mode targets the Linux sandbox container") + } + // A bare process (no cwd) must run in the workspace root, NOT envd's own + // process cwd — otherwise the agent's bash and the file tools (rooted at the + // workspace root) diverge, which is exactly the prod failure this fixes. + // Prove it: write a marker in the workspace root and read it back via a + // RELATIVE path, which only resolves if the child's cwd == WorkspaceRoot. + _, dir, hs := newTestServerWithWS(t) + const marker = "cwd-is-workspace-root" + if err := os.WriteFile(filepath.Join(dir, "marker.txt"), []byte(marker), 0o644); err != nil { + t.Fatal(err) + } + + body := strings.NewReader(`{"process":{"cmd":"/bin/cat","args":["marker.txt"]}}`) + req, _ := http.NewRequestWithContext(context.Background(), http.MethodPost, hs.URL+"/process.Process/Start", body) + req.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatal(err) + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode != 200 { + t.Fatalf("status=%d", resp.StatusCode) + } + + dec := json.NewDecoder(resp.Body) + var stdout strings.Builder + for { + var frame map[string]any + if decErr := dec.Decode(&frame); decErr != nil { + if decErr == io.EOF { + break + } + t.Fatalf("decode: %v", decErr) + } + if ev, _ := frame["event"].(map[string]any); ev != nil { + if d, ok := ev["data"].(map[string]any); ok { + if s, _ := d["stdout"].(string); s != "" { + stdout.WriteString(s) + } + } + } + } + if got := strings.TrimSpace(stdout.String()); got != marker { + t.Fatalf("relative cat did not resolve against workspace root: stdout=%q want %q (cwd likely defaulted to envd process cwd, not WorkspaceRoot)", got, marker) + } +} diff --git a/envd/server.go b/envd/server.go index b025f07..6740971 100644 --- a/envd/server.go +++ b/envd/server.go @@ -69,7 +69,7 @@ func (s *Server) registerRoutes() { func (s *Server) Run(ctx context.Context) error { srv := &http.Server{ Addr: s.cfg.Listen, - Handler: s.authMiddleware(s.mux), + Handler: recoverMiddleware(s.authMiddleware(s.mux)), ReadHeaderTimeout: 10 * time.Second, } errCh := make(chan error, 1) diff --git a/envd/test_helpers_test.go b/envd/test_helpers_test.go index 506f922..31d11b9 100644 --- a/envd/test_helpers_test.go +++ b/envd/test_helpers_test.go @@ -13,7 +13,7 @@ import ( func newTestServer(t *testing.T) (*Server, *httptest.Server) { t.Helper() s := NewServer(Config{Listen: ":0"}) - hs := httptest.NewServer(s.authMiddleware(s.mux)) + hs := httptest.NewServer(recoverMiddleware(s.authMiddleware(s.mux))) t.Cleanup(hs.Close) return s, hs } @@ -29,7 +29,7 @@ func newTestServerWithWS(t *testing.T) (*Server, string, *httptest.Server) { t.Fatal(err) } s := NewServer(Config{Listen: ":0", Workspace: ws, WorkspaceRoot: dir}) - hs := httptest.NewServer(s.authMiddleware(s.mux)) + hs := httptest.NewServer(recoverMiddleware(s.authMiddleware(s.mux))) t.Cleanup(hs.Close) return s, dir, hs } diff --git a/envd/write_eof_test.go b/envd/write_eof_test.go new file mode 100644 index 0000000..48ba99d --- /dev/null +++ b/envd/write_eof_test.go @@ -0,0 +1,142 @@ +package envd + +import ( + "bytes" + "context" + "encoding/base64" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" + + "github.com/flashcatcloud/flashduty-runner/protocol" +) + +// postJSON POSTs a JSON body to the given test-server URL+path and returns the +// status code and the full response body. It mirrors how the safari envd client +// talks to the runner. +func postJSON(t *testing.T, base, path string, body any) (int, []byte) { + t.Helper() + buf, err := json.Marshal(body) + if err != nil { + t.Fatalf("marshal: %v", err) + } + req, err := http.NewRequestWithContext(context.Background(), http.MethodPost, base+path, bytes.NewReader(buf)) + if err != nil { + t.Fatalf("new request: %v", err) + } + req.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(req) //nolint:gosec // URL is a test server + if err != nil { + t.Fatalf("do: %v", err) + } + defer func() { _ = resp.Body.Close() }() + out, err := io.ReadAll(resp.Body) + if err != nil { + t.Fatalf("read body: %v", err) + } + return resp.StatusCode, out +} + +// TestHandleWrite_SuccessReturnsDecodableBody proves the EOF-opacity fix at the +// transport boundary: a successful /safari.Tools/Write must return a 200 with a +// NON-EMPTY, decodable JSON body. Before the fix, handleWrite passed a nil +// payload, writeUnary emitted an empty body, and the safari client decoded that +// as io.EOF — surfacing as "remote write operation failed: EOF". +func TestHandleWrite_SuccessReturnsDecodableBody(t *testing.T) { + _, dir, hs := newTestServerWithWS(t) + + status, body := postJSON(t, hs.URL, "/safari.Tools/Write", protocol.WriteArgs{ + Path: "sub/file.txt", + Content: base64.StdEncoding.EncodeToString([]byte("hello")), + }) + if status != http.StatusOK { + t.Fatalf("status=%d body=%s", status, body) + } + if len(body) == 0 { + t.Fatal("successful write returned an empty body; safari client would see io.EOF") + } + // The safari client decodes the response into a json.RawMessage; that decode + // must not hit io.EOF. + var raw json.RawMessage + if err := json.NewDecoder(bytes.NewReader(body)).Decode(&raw); err != nil { + t.Fatalf("response not decodable (would be EOF on client): %v body=%q", err, string(body)) + } + + // And the file actually landed. + got, err := os.ReadFile(filepath.Join(dir, "sub", "file.txt")) + if err != nil { + t.Fatalf("read back: %v", err) + } + if string(got) != "hello" { + t.Fatalf("content=%q", string(got)) + } +} + +// TestHandleWrite_FailureReturnsDescriptiveError proves a failing write returns a +// descriptive, NON-EMPTY error envelope (never an empty body / EOF). We force a +// failure by writing a path whose parent is an existing regular file, so +// MkdirAll(parent) fails with ENOTDIR. +func TestHandleWrite_FailureReturnsDescriptiveError(t *testing.T) { + _, dir, hs := newTestServerWithWS(t) + + // Create a regular file that we then try to treat as a directory. + if err := os.WriteFile(filepath.Join(dir, "afile"), []byte("x"), 0o644); err != nil { + t.Fatalf("seed file: %v", err) + } + + status, body := postJSON(t, hs.URL, "/safari.Tools/Write", protocol.WriteArgs{ + Path: "afile/child.txt", // parent "afile" is a regular file → ENOTDIR + Content: base64.StdEncoding.EncodeToString([]byte("data")), + }) + if status == http.StatusOK { + t.Fatalf("expected non-200, got 200 body=%s", body) + } + if len(body) == 0 { + t.Fatal("failure returned an empty body; client would see io.EOF instead of the cause") + } + var ce connectError + if err := json.Unmarshal(body, &ce); err != nil { + t.Fatalf("error body not a connect envelope: %v body=%q", err, string(body)) + } + if ce.Message == "" { + t.Fatalf("error envelope has empty message: %+v", ce) + } + // The message must name the directory it failed on — not a bare "EOF". + if ce.Message == "EOF" || ce.Message == "io: read/write on closed pipe" { + t.Fatalf("opaque error leaked: %q", ce.Message) + } + if !bytes.Contains(body, []byte("afile")) { + t.Fatalf("error does not name the offending path: %q", ce.Message) + } +} + +// TestRecoverMiddleware_PanicBecomesErrorEnvelope proves a handler panic is +// turned into a clean internal-error envelope instead of a dropped connection +// (which the safari client would report as a bare io.EOF). +func TestRecoverMiddleware_PanicBecomesErrorEnvelope(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/boom", func(http.ResponseWriter, *http.Request) { + panic("kaboom") + }) + hs := httptest.NewServer(recoverMiddleware(mux)) + defer hs.Close() + + status, body := postJSON(t, hs.URL, "/boom", struct{}{}) + if status != http.StatusInternalServerError { + t.Fatalf("status=%d body=%s", status, body) + } + var ce connectError + if err := json.Unmarshal(body, &ce); err != nil { + t.Fatalf("panic response not a connect envelope: %v body=%q", err, string(body)) + } + if ce.Code != "internal" || ce.Message == "" { + t.Fatalf("unexpected envelope: %+v", ce) + } + if !bytes.Contains(body, []byte("kaboom")) { + t.Fatalf("panic value not surfaced: %q", ce.Message) + } +} diff --git a/environment/environment.go b/environment/environment.go index a852c8c..bf2c1f7 100644 --- a/environment/environment.go +++ b/environment/environment.go @@ -217,10 +217,23 @@ func (e *Environment) Write(ctx context.Context, args *protocol.WriteArgs) error // Ensure parent directory exists dir := filepath.Dir(realPath) if err := os.MkdirAll(dir, 0o755); err != nil { - return fmt.Errorf("failed to create directory: %w", err) + // Name the directory so a permission-denied / read-only-fs / parent-missing + // failure is self-describing. A bare wrapped os error here is what the + // agent ultimately sees, so it must name the dir it couldn't create. + return fmt.Errorf("create parent dir %s: %w", dir, err) + } + + // Return os errors verbatim WITH the path. os.WriteFile yields an + // *os.PathError that already embeds the path, but wrapping makes intent + // explicit and stable. Critically, never return a bare/empty error: on the + // cloud envd HTTP path the safari client decodes an empty success body as + // io.EOF and reports the opaque "remote write operation failed: EOF"; on the + // BYOC WS path a bare os-error string is equally opaque. Naming the cause + // (permission denied, read-only fs, ENOSPC) fixes both. + if err := os.WriteFile(realPath, content, 0o644); err != nil { + return fmt.Errorf("write %s: %w", realPath, err) } - - return os.WriteFile(realPath, content, 0o644) + return nil } // List lists entries in a directory. @@ -969,10 +982,13 @@ func (e *Environment) writeRaw(ctx context.Context, path string, content []byte) // Ensure parent directory exists dir := filepath.Dir(realPath) if err := os.MkdirAll(dir, 0o755); err != nil { - return fmt.Errorf("failed to create directory: %w", err) + return fmt.Errorf("create parent dir %s: %w", dir, err) } - return os.WriteFile(realPath, content, 0o644) + if err := os.WriteFile(realPath, content, 0o644); err != nil { + return fmt.Errorf("write %s: %w", realPath, err) + } + return nil } // MCPCall executes an MCP tool call. diff --git a/environment/environment_test.go b/environment/environment_test.go index de4a0cf..c882f2c 100644 --- a/environment/environment_test.go +++ b/environment/environment_test.go @@ -9,6 +9,7 @@ import ( "errors" "os" "path/filepath" + "runtime" "strings" "testing" @@ -114,6 +115,63 @@ func TestEnvironment_Write(t *testing.T) { assert.Equal(t, testContent, string(content)) } +// TestEnvironment_Write_FailureIsDescriptive pins the EOF-opacity fix at the +// environment layer: a failing Write must return a NON-EMPTY error that names +// the offending path, never a bare/empty error. (An empty error body is what +// the safari client turns into the opaque "remote write operation failed: EOF".) +// We trigger the failure by making the target's parent a regular file, so +// MkdirAll(parent) fails with ENOTDIR. +func TestEnvironment_Write_FailureIsDescriptive(t *testing.T) { + ws := newTestEnvironment(t) + ctx := context.Background() + + // Seed a regular file we will then try to use as a parent directory. + require.NoError(t, os.WriteFile(filepath.Join(ws.Root(), "notadir"), []byte("x"), 0o644)) + + err := ws.Write(ctx, &protocol.WriteArgs{ + Path: "notadir/child.txt", + Content: base64.StdEncoding.EncodeToString([]byte("data")), + }) + require.Error(t, err) + msg := err.Error() + assert.NotEmpty(t, msg, "write failure must not return an empty error") + assert.NotEqual(t, "EOF", msg, "write failure must not surface as a bare EOF") + assert.Contains(t, msg, "notadir", "error must name the offending path") +} + +// TestEnvironment_Write_PermissionDeniedIsVerbatim proves an os permission error +// is surfaced verbatim with the path, not swallowed into an opaque error. Skipped +// when the test runs as root (root bypasses the 0o500 dir permission), e.g. in +// the cloud sandbox where the runner IS root, and on Windows, where chmod's Unix +// permission bits don't restrict directory writes. The runner only ever runs on +// Linux in prod; Windows is a CI build target. +func TestEnvironment_Write_PermissionDeniedIsVerbatim(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("windows ignores unix dir permission bits, so the write is not denied") + } + if os.Geteuid() == 0 { + t.Skip("running as root: directory permissions do not block writes") + } + ws := newTestEnvironment(t) + ctx := context.Background() + + // Create a read-only directory the write target lives under. + roDir := filepath.Join(ws.Root(), "ro") + require.NoError(t, os.MkdirAll(roDir, 0o755)) + require.NoError(t, os.Chmod(roDir, 0o500)) //nolint:gosec // r-x: intentional read-only dir to provoke a permission-denied write + t.Cleanup(func() { _ = os.Chmod(roDir, 0o755) }) //nolint:gosec // restore perms so t.TempDir cleanup can remove it + + err := ws.Write(ctx, &protocol.WriteArgs{ + Path: "ro/denied.txt", + Content: base64.StdEncoding.EncodeToString([]byte("data")), + }) + require.Error(t, err) + msg := err.Error() + assert.NotEqual(t, "EOF", msg) + assert.Contains(t, msg, "denied.txt", "error must name the target file") + assert.Contains(t, msg, "permission denied", "os permission error must be verbatim") +} + func TestEnvironment_List(t *testing.T) { ws := newTestEnvironment(t) ctx := context.Background()