From 7215eba54446a85f1674aa71c2d2f79f83a5cad2 Mon Sep 17 00:00:00 2001 From: Kunal Kushwaha Date: Mon, 22 Jun 2026 00:30:19 +0900 Subject: [PATCH] feat(eval): add `--serve` to auto-launch the EvalServer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes the two-terminal dance from `agk eval`. With `--serve`, AGK builds and launches the project in EvalServer mode (AGK_EVAL_MODE=true), waits for it to become healthy, runs the tests, and tears it down — all in one command. - startEvalServer runs `go run .` (or a custom --serve-cmd) in its own process group so the compiled child is reliably killed on teardown (SIGTERM→SIGKILL). - waitForHealthy polls the test file's target.url /health until ready or timeout. - Server stdout/stderr is captured and printed if startup fails (and streamed with a [server] prefix under --verbose). - Lifecycle is signal-safe and torn down before the os.Exit on test failure. - Flags: --serve, --serve-dir, --serve-cmd, --serve-timeout. - Docs: EVAL.md "Run Tests" now leads with the one-command flow. Tests: parseServeCmd + waitForHealthy (httptest, healthy and timeout paths). Verified end-to-end against a stub EvalServer (launch → run → clean teardown, no lingering process). Co-Authored-By: Claude Opus 4.8 --- cmd/eval.go | 36 ++++++++- cmd/eval_serve.go | 167 +++++++++++++++++++++++++++++++++++++++++ cmd/eval_serve_test.go | 63 ++++++++++++++++ docs/EVAL.md | 28 ++++++- 4 files changed, 290 insertions(+), 4 deletions(-) create mode 100644 cmd/eval_serve.go create mode 100644 cmd/eval_serve_test.go diff --git a/cmd/eval.go b/cmd/eval.go index eb1af42..d2c7303 100644 --- a/cmd/eval.go +++ b/cmd/eval.go @@ -3,7 +3,9 @@ package cmd import ( "fmt" "os" + "os/signal" "path/filepath" + "syscall" "time" "github.com/spf13/cobra" @@ -39,6 +41,10 @@ var ( evalOutputFormat string evalFailFast bool evalReportFile string + evalServe bool + evalServeDir string + evalServeCmd string + evalServeWait int ) func init() { @@ -50,6 +56,10 @@ func init() { evalCmd.Flags().StringVarP(&evalOutputFormat, "format", "f", "console", "Output format (console, json, junit, markdown)") evalCmd.Flags().BoolVar(&evalFailFast, "fail-fast", false, "Stop on first test failure") evalCmd.Flags().StringVarP(&evalReportFile, "report", "r", "", "Save detailed report to file (auto-generated if not specified)") + evalCmd.Flags().BoolVar(&evalServe, "serve", false, "Build & launch the project in EvalServer mode, run tests, then stop it") + evalCmd.Flags().StringVar(&evalServeDir, "serve-dir", ".", "Project directory to launch when --serve is set") + evalCmd.Flags().StringVar(&evalServeCmd, "serve-cmd", "", "Custom command to launch the server (default: go run .)") + evalCmd.Flags().IntVar(&evalServeWait, "serve-timeout", 90, "Seconds to wait for the server to become healthy") } func runEval(cmd *cobra.Command, args []string) error { @@ -86,6 +96,26 @@ func runEval(cmd *cobra.Command, args []string) error { return nil } + // Optionally launch the project's EvalServer for the duration of the run. + var srv *evalServer + if evalServe { + s, err := launchAndWait(evalServeDir, evalServeCmd, suite.Target.URL, evalServeWait, evalVerbose) + if err != nil { + return err + } + srv = s + defer srv.Stop() + + // Ensure the server is stopped if the user interrupts the run. + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM) + go func() { + <-sigCh + srv.Stop() + os.Exit(130) + }() + } + // Create test runner runner := eval.NewRunner(&eval.RunnerConfig{ Timeout: time.Duration(evalTimeout) * time.Second, @@ -139,8 +169,12 @@ func runEval(cmd *cobra.Command, args []string) error { } } - // Exit with error code if tests failed + // Exit with error code if tests failed. os.Exit skips deferred calls, so stop + // the server explicitly first. if !results.AllPassed() { + if srv != nil { + srv.Stop() + } os.Exit(1) } diff --git a/cmd/eval_serve.go b/cmd/eval_serve.go new file mode 100644 index 0000000..04a8e8f --- /dev/null +++ b/cmd/eval_serve.go @@ -0,0 +1,167 @@ +package cmd + +import ( + "bytes" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "strings" + "sync" + "syscall" + "time" +) + +// evalServer manages a user EvalServer process launched for the duration of a +// test run (the `agk eval --serve` workflow). +type evalServer struct { + cmd *exec.Cmd + output *syncBuffer + once sync.Once +} + +// syncBuffer is a goroutine-safe buffer for capturing child process output. +type syncBuffer struct { + mu sync.Mutex + buf bytes.Buffer +} + +func (b *syncBuffer) Write(p []byte) (int, error) { + b.mu.Lock() + defer b.mu.Unlock() + return b.buf.Write(p) +} + +func (b *syncBuffer) String() string { + b.mu.Lock() + defer b.mu.Unlock() + return b.buf.String() +} + +// startEvalServer launches the project in EvalServer mode (AGK_EVAL_MODE=true). +// The default command is `go run .` in dir; customCmd overrides it. The process is +// started in its own process group so `go run`'s compiled child can be reliably killed. +func startEvalServer(dir, customCmd string, streamOutput bool) (*evalServer, error) { + name, args := parseServeCmd(customCmd) + + c := exec.Command(name, args...) //nolint:gosec // command is user-provided by design + c.Dir = dir + c.Env = append(os.Environ(), "AGK_EVAL_MODE=true") + c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + out := &syncBuffer{} + var w io.Writer = out + if streamOutput { + w = io.MultiWriter(out, &prefixWriter{prefix: "[server] ", w: os.Stderr}) + } + c.Stdout = w + c.Stderr = w + + if err := c.Start(); err != nil { + return nil, fmt.Errorf("failed to start eval server (%s): %w", name, err) + } + return &evalServer{cmd: c, output: out}, nil +} + +// parseServeCmd returns the command name and args, defaulting to `go run .`. +func parseServeCmd(customCmd string) (string, []string) { + if fields := strings.Fields(customCmd); len(fields) > 0 { + return fields[0], fields[1:] + } + return "go", []string{"run", "."} +} + +// Stop terminates the server process group (idempotent), escalating SIGTERM→SIGKILL. +func (s *evalServer) Stop() { + s.once.Do(func() { + if s.cmd.Process == nil { + return + } + s.signalGroup(syscall.SIGTERM) + + done := make(chan struct{}) + go func() { _ = s.cmd.Wait(); close(done) }() + + select { + case <-done: + case <-time.After(3 * time.Second): + s.signalGroup(syscall.SIGKILL) + <-done + } + }) +} + +func (s *evalServer) signalGroup(sig syscall.Signal) { + if pgid, err := syscall.Getpgid(s.cmd.Process.Pid); err == nil { + _ = syscall.Kill(-pgid, sig) + } else { + _ = s.cmd.Process.Signal(sig) + } +} + +// Output returns everything the server printed to stdout/stderr so far. +func (s *evalServer) Output() string { return s.output.String() } + +// waitForHealthy polls url + "/health" until it returns 200 or the timeout elapses. +func waitForHealthy(url string, timeout time.Duration) error { + client := &http.Client{Timeout: 3 * time.Second} + deadline := time.Now().Add(timeout) + healthURL := strings.TrimRight(url, "/") + "/health" + + var lastErr error + for time.Now().Before(deadline) { + resp, err := client.Get(healthURL) + if err == nil { + _ = resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return nil + } + lastErr = fmt.Errorf("health returned HTTP %d", resp.StatusCode) + } else { + lastErr = err + } + time.Sleep(500 * time.Millisecond) + } + return fmt.Errorf("server not healthy within %s: %w", timeout, lastErr) +} + +// launchAndWait starts the eval server and blocks until it is healthy. On failure it +// stops the server and surfaces its captured output to aid debugging. +func launchAndWait(dir, customCmd, targetURL string, waitSecs int, verbose bool) (*evalServer, error) { + if targetURL == "" { + return nil, fmt.Errorf("--serve requires a target URL in the test file") + } + + fmt.Printf("🚀 Launching EvalServer from %s (AGK_EVAL_MODE=true)...\n", dir) + srv, err := startEvalServer(dir, customCmd, verbose) + if err != nil { + return nil, err + } + + fmt.Printf("⏳ Waiting up to %ds for %s to become healthy...\n", waitSecs, targetURL) + if err := waitForHealthy(targetURL, time.Duration(waitSecs)*time.Second); err != nil { + out := srv.Output() + srv.Stop() + if strings.TrimSpace(out) != "" { + fmt.Fprintf(os.Stderr, "\n--- server output ---\n%s\n---------------------\n", out) + } + return nil, fmt.Errorf("eval server did not start: %w", err) + } + + fmt.Println("✓ Server is healthy") + return srv, nil +} + +// prefixWriter prefixes each write with a label (used to tag streamed server output). +type prefixWriter struct { + prefix string + w io.Writer +} + +func (p *prefixWriter) Write(b []byte) (int, error) { + if _, err := io.WriteString(p.w, p.prefix); err != nil { + return 0, err + } + return p.w.Write(b) +} diff --git a/cmd/eval_serve_test.go b/cmd/eval_serve_test.go new file mode 100644 index 0000000..c83ae46 --- /dev/null +++ b/cmd/eval_serve_test.go @@ -0,0 +1,63 @@ +package cmd + +import ( + "net/http" + "net/http/httptest" + "reflect" + "sync/atomic" + "testing" + "time" +) + +func TestParseServeCmd(t *testing.T) { + cases := []struct { + in string + wantName string + wantArgs []string + }{ + {"", "go", []string{"run", "."}}, + {" ", "go", []string{"run", "."}}, + {"./server", "./server", []string{}}, + {"go run ./cmd/server", "go", []string{"run", "./cmd/server"}}, + {"mybin --eval --port 8787", "mybin", []string{"--eval", "--port", "8787"}}, + } + for _, c := range cases { + name, args := parseServeCmd(c.in) + if name != c.wantName || !reflect.DeepEqual(args, c.wantArgs) { + t.Errorf("parseServeCmd(%q) = (%q, %v), want (%q, %v)", c.in, name, args, c.wantName, c.wantArgs) + } + } +} + +func TestWaitForHealthyBecomesHealthy(t *testing.T) { + var ready atomic.Bool + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/health" && ready.Load() { + w.WriteHeader(http.StatusOK) + return + } + http.Error(w, "starting", http.StatusServiceUnavailable) + })) + defer server.Close() + + // Flip to healthy shortly after we start polling. + go func() { + time.Sleep(300 * time.Millisecond) + ready.Store(true) + }() + + if err := waitForHealthy(server.URL, 5*time.Second); err != nil { + t.Fatalf("waitForHealthy returned error: %v", err) + } +} + +func TestWaitForHealthyTimesOut(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + http.Error(w, "never ready", http.StatusServiceUnavailable) + })) + defer server.Close() + + if err := waitForHealthy(server.URL, 1*time.Second); err == nil { + t.Fatal("expected timeout error, got nil") + } +} diff --git a/docs/EVAL.md b/docs/EVAL.md index a3eff62..fb1a3f4 100644 --- a/docs/EVAL.md +++ b/docs/EVAL.md @@ -135,15 +135,37 @@ tests: ### 3. Run Tests +The simplest path is `--serve`, which builds and launches your project in EvalServer +mode, waits for it to become healthy, runs the tests, and stops it automatically: + +```bash +# One command: launch the server, run tests, tear down +agk eval tests.yaml --serve + +# View report +cat .agk/reports/eval-report-*.md +``` + +`--serve` options: + +| Flag | Default | Description | +|------|---------|-------------| +| `--serve` | off | Launch the project in EvalServer mode for the run, then stop it | +| `--serve-dir` | `.` | Project directory to launch | +| `--serve-cmd` | `go run .` | Custom launch command (e.g. a prebuilt binary) | +| `--serve-timeout` | `90` | Seconds to wait for the server to become healthy | + +It sets `AGK_EVAL_MODE=true` in the launched process and derives the health URL from the +test file's `target.url`. Server output is captured and printed if startup fails. + +Prefer to manage the server yourself? Run it in a separate terminal and omit `--serve`: + ```bash # Terminal 1: Start your workflow in EvalServer mode AGK_EVAL_MODE=true ./myworkflow # Terminal 2: Run tests agk eval tests.yaml --timeout 200 - -# View report -cat .agk/reports/eval-report-*.md ``` ---