AgenticGoKit · kunalkushwaha · Jun 21, 2026
diff --git a/cmd/eval.go b/cmd/eval.go
@@ -3,7 +3,9 @@ package cmd
 import (
 	"fmt"
 	"os"
+	"os/signal"
 	"path/filepath"
+	"syscall"
 	"time"
 
 	"github.com/spf13/cobra"
@@ -39,6 +41,10 @@ var (
 	evalOutputFormat string
 	evalFailFast     bool
 	evalReportFile   string
+	evalServe        bool
+	evalServeDir     string
+	evalServeCmd     string
+	evalServeWait    int
 )
 
 func init() {
@@ -50,6 +56,10 @@ func init() {
 	evalCmd.Flags().StringVarP(&evalOutputFormat, "format", "f", "console", "Output format (console, json, junit, markdown)")
 	evalCmd.Flags().BoolVar(&evalFailFast, "fail-fast", false, "Stop on first test failure")
 	evalCmd.Flags().StringVarP(&evalReportFile, "report", "r", "", "Save detailed report to file (auto-generated if not specified)")
+	evalCmd.Flags().BoolVar(&evalServe, "serve", false, "Build & launch the project in EvalServer mode, run tests, then stop it")
+	evalCmd.Flags().StringVar(&evalServeDir, "serve-dir", ".", "Project directory to launch when --serve is set")
+	evalCmd.Flags().StringVar(&evalServeCmd, "serve-cmd", "", "Custom command to launch the server (default: go run .)")
+	evalCmd.Flags().IntVar(&evalServeWait, "serve-timeout", 90, "Seconds to wait for the server to become healthy")
 }
 
 func runEval(cmd *cobra.Command, args []string) error {
@@ -86,6 +96,26 @@ func runEval(cmd *cobra.Command, args []string) error {
 		return nil
 	}
 
+	// Optionally launch the project's EvalServer for the duration of the run.
+	var srv *evalServer
+	if evalServe {
+		s, err := launchAndWait(evalServeDir, evalServeCmd, suite.Target.URL, evalServeWait, evalVerbose)
+		if err != nil {
+			return err
+		}
+		srv = s
+		defer srv.Stop()
+
+		// Ensure the server is stopped if the user interrupts the run.
+		sigCh := make(chan os.Signal, 1)
+		signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
+		go func() {
+			<-sigCh
+			srv.Stop()
+			os.Exit(130)
+		}()
+	}
+
 	// Create test runner
 	runner := eval.NewRunner(&eval.RunnerConfig{
 		Timeout:      time.Duration(evalTimeout) * time.Second,
@@ -139,8 +169,12 @@ func runEval(cmd *cobra.Command, args []string) error {
 		}
 	}
 
-	// Exit with error code if tests failed
+	// Exit with error code if tests failed. os.Exit skips deferred calls, so stop
+	// the server explicitly first.
 	if !results.AllPassed() {
+		if srv != nil {
+			srv.Stop()
+		}
 		os.Exit(1)
 	}
 

diff --git a/cmd/eval_serve.go b/cmd/eval_serve.go
@@ -0,0 +1,167 @@
+package cmd
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"os/exec"
+	"strings"
+	"sync"
+	"syscall"
+	"time"
+)
+
+// evalServer manages a user EvalServer process launched for the duration of a
+// test run (the `agk eval --serve` workflow).
+type evalServer struct {
+	cmd    *exec.Cmd   // the launched server command
+	output *syncBuffer // output captured from the command's stdout and stderr
+	once   sync.Once   // ensures Stop runs its shutdown sequence only once
+}
+
+// syncBuffer wraps a bytes.Buffer with a mutex so that captured child process
+// output can be written and read from different goroutines.
+type syncBuffer struct {
+	mu  sync.Mutex
+	buf bytes.Buffer
+}
+
+// Write appends data to the buffer while holding the lock. It implements io.Writer.
+func (sb *syncBuffer) Write(data []byte) (int, error) {
+	sb.mu.Lock()
+	defer sb.mu.Unlock()
+
+	written, err := sb.buf.Write(data)
+	return written, err
+}
+
+// String returns a copy of everything written so far, taken under the lock.
+func (sb *syncBuffer) String() string {
+	sb.mu.Lock()
+	defer sb.mu.Unlock()
+
+	contents := sb.buf.String()
+	return contents
+}
+
+// startEvalServer starts the project's server process with AGK_EVAL_MODE=true
+// appended to the inherited environment.
+//
+// The command comes from parseServeCmd(customCmd) (`go run .` when customCmd is
+// blank) and runs with dir as its working directory. Setpgid places the process in
+// its own process group so that the group — including the binary `go run` spawns —
+// can be signalled as a unit. Stdout and stderr are both captured into the returned
+// server's output buffer; when streamOutput is true they are additionally echoed to
+// os.Stderr behind a "[server] " prefix.
+//
+// It returns an error if the process cannot be started.
+func startEvalServer(dir, customCmd string, streamOutput bool) (*evalServer, error) {
+	program, argv := parseServeCmd(customCmd)
+
+	// Build the output sink first: always capture, optionally tee to the terminal.
+	captured := &syncBuffer{}
+	var sink io.Writer = captured
+	if streamOutput {
+		tagged := &prefixWriter{prefix: "[server] ", w: os.Stderr}
+		sink = io.MultiWriter(captured, tagged)
+	}
+
+	proc := exec.Command(program, argv...) //nolint:gosec // command is user-provided by design
+	proc.Dir = dir
+	proc.Env = append(os.Environ(), "AGK_EVAL_MODE=true")
+	proc.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	proc.Stdout, proc.Stderr = sink, sink
+
+	if err := proc.Start(); err != nil {
+		return nil, fmt.Errorf("failed to start eval server (%s): %w", program, err)
+	}
+	return &evalServer{cmd: proc, output: captured}, nil
+}
+
+// parseServeCmd splits customCmd on whitespace into a program name and its
+// arguments. Shell-style quoting is not interpreted. An empty or all-whitespace
+// customCmd yields the default command, `go run .`.
+func parseServeCmd(customCmd string) (string, []string) {
+	parts := strings.Fields(customCmd)
+	if len(parts) == 0 {
+		return "go", []string{"run", "."}
+	}
+	return parts[0], parts[1:]
+}
+
+// Stop terminates the server process group (idempotent), escalating SIGTERM→SIGKILL.
+//
+// The shutdown sequence runs inside sync.Once, so it executes a single time no
+// matter how often Stop is called. The sequence sends SIGTERM, waits up to three
+// seconds for cmd.Wait to return, and otherwise sends SIGKILL and keeps waiting.
+// Stop therefore does not return until cmd.Wait has returned.
+func (s *evalServer) Stop() {
+	s.once.Do(func() {
+		// A nil Process means the command was never started; nothing to stop.
+		if s.cmd.Process == nil {
+			return
+		}
+		s.signalGroup(syscall.SIGTERM)
+
+		// Wait in a goroutine so the exit can be raced against the grace period.
+		done := make(chan struct{})
+		go func() { _ = s.cmd.Wait(); close(done) }()
+
+		select {
+		case <-done:
+		case <-time.After(3 * time.Second):
+			// Grace period elapsed: force-kill, then wait without a further timeout.
+			s.signalGroup(syscall.SIGKILL)
+			<-done
+		}
+	})
+}
+
+// signalGroup delivers sig to every process in the server's process group.
+//
+// The server is started with Setpgid and no explicit Pgid, so its process group ID
+// equals the PID of the launched command. The group is addressed through that PID
+// directly instead of being looked up with Getpgid: once the group leader has
+// exited and been reaped (for example `go run` terminating before the binary it
+// spawned), Getpgid fails with ESRCH even though other members of the group are
+// still running, and those members would never receive the signal.
+//
+// If signalling the group fails, it falls back to signalling only the launched
+// process. Errors are otherwise ignored: delivery is best effort.
+func (s *evalServer) signalGroup(sig syscall.Signal) {
+	// A negative PID targets the whole process group with that ID (see kill(2)).
+	if err := syscall.Kill(-s.cmd.Process.Pid, sig); err != nil {
+		_ = s.cmd.Process.Signal(sig)
+	}
+}
+
+// Output returns a snapshot of everything the server has written to stdout and
+// stderr up to the moment of the call.
+func (s *evalServer) Output() string {
+	captured := s.output.String()
+	return captured
+}
+
+// waitForHealthy polls url + "/health" until it returns HTTP 200 or the timeout
+// elapses.
+//
+// Trailing slashes on url are trimmed before "/health" is appended. Each probe uses
+// a 3-second client timeout, and probes are spaced 500ms apart, with the final
+// pause shortened so the loop does not sleep past the deadline.
+//
+// It returns nil as soon as a probe answers 200. On timeout it returns an error
+// wrapping the last probe failure (a transport error or a non-200 status). When
+// the timeout is zero or negative no probe is made and the error says so.
+func waitForHealthy(url string, timeout time.Duration) error {
+	const pollInterval = 500 * time.Millisecond
+
+	client := &http.Client{Timeout: 3 * time.Second}
+	deadline := time.Now().Add(timeout)
+	healthURL := strings.TrimRight(url, "/") + "/health"
+
+	var lastErr error
+	for time.Now().Before(deadline) {
+		resp, err := client.Get(healthURL)
+		if err != nil {
+			lastErr = err
+		} else {
+			_ = resp.Body.Close() // only the status code matters
+			if resp.StatusCode == http.StatusOK {
+				return nil
+			}
+			lastErr = fmt.Errorf("health returned HTTP %d", resp.StatusCode)
+		}
+
+		// Pause between probes, but never beyond the deadline. A non-positive
+		// duration makes time.Sleep return immediately.
+		pause := time.Until(deadline)
+		if pause > pollInterval {
+			pause = pollInterval
+		}
+		time.Sleep(pause)
+	}
+
+	// With a non-positive timeout the loop never runs and lastErr is nil; wrapping
+	// nil with %w would render as "%!w(<nil>)".
+	if lastErr == nil {
+		return fmt.Errorf("server not healthy within %s: no health check was attempted", timeout)
+	}
+	return fmt.Errorf("server not healthy within %s: %w", timeout, lastErr)
+}
+
+// launchAndWait starts the eval server and blocks until its health endpoint
+// responds or waitSecs seconds have passed.
+//
+// targetURL must be non-empty; it is the base URL handed to waitForHealthy. dir and
+// customCmd select what to launch, and verbose turns on live streaming of the
+// server's output. Progress messages go to stdout.
+//
+// On success the running server is returned and the caller is responsible for
+// calling Stop. If the server cannot be started, or does not become healthy in
+// time, an error is returned; in the latter case the server is stopped and any
+// output it produced is printed to stderr to aid debugging.
+func launchAndWait(dir, customCmd, targetURL string, waitSecs int, verbose bool) (*evalServer, error) {
+	if targetURL == "" {
+		return nil, fmt.Errorf("--serve requires a target URL in the test file")
+	}
+
+	fmt.Printf("🚀 Launching EvalServer from %s (AGK_EVAL_MODE=true)...\n", dir)
+	srv, err := startEvalServer(dir, customCmd, verbose)
+	if err != nil {
+		return nil, err
+	}
+
+	fmt.Printf("⏳ Waiting up to %ds for %s to become healthy...\n", waitSecs, targetURL)
+	budget := time.Duration(waitSecs) * time.Second
+	healthErr := waitForHealthy(targetURL, budget)
+	if healthErr == nil {
+		fmt.Println("✓ Server is healthy")
+		return srv, nil
+	}
+
+	// Startup failed: snapshot the output, stop the server, then report.
+	captured := srv.Output()
+	srv.Stop()
+	if strings.TrimSpace(captured) != "" {
+		fmt.Fprintf(os.Stderr, "\n--- server output ---\n%s\n---------------------\n", captured)
+	}
+	return nil, fmt.Errorf("eval server did not start: %w", healthErr)
+}
+
+// prefixWriter is an io.Writer that emits a fixed label ahead of every Write call
+// it forwards to w (used to tag streamed server output). The label is added once
+// per call, not once per line.
+type prefixWriter struct {
+	prefix string
+	w      io.Writer
+}
+
+// Write sends the prefix and then chunk to the underlying writer. The returned
+// count covers chunk only. If writing the prefix fails, chunk is not written and
+// (0, err) is returned.
+func (pw *prefixWriter) Write(chunk []byte) (int, error) {
+	_, err := io.WriteString(pw.w, pw.prefix)
+	if err != nil {
+		return 0, err
+	}
+
+	n, err := pw.w.Write(chunk)
+	return n, err
+}
diff --git a/cmd/eval_serve_test.go b/cmd/eval_serve_test.go
@@ -0,0 +1,63 @@
+package cmd
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"reflect"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+func TestParseServeCmd(t *testing.T) {
+	cases := []struct {
+		in       string
+		wantName string
+		wantArgs []string
+	}{
+		{"", "go", []string{"run", "."}},
+		{"   ", "go", []string{"run", "."}},
+		{"./server", "./server", []string{}},
+		{"go run ./cmd/server", "go", []string{"run", "./cmd/server"}},
+		{"mybin --eval --port 8787", "mybin", []string{"--eval", "--port", "8787"}},
+	}
+	for _, c := range cases {
+		name, args := parseServeCmd(c.in)
+		if name != c.wantName || !reflect.DeepEqual(args, c.wantArgs) {
+			t.Errorf("parseServeCmd(%q) = (%q, %v), want (%q, %v)", c.in, name, args, c.wantName, c.wantArgs)
+		}
+	}
+}
+
+// TestWaitForHealthyBecomesHealthy verifies that waitForHealthy keeps polling
+// through 503 responses and returns nil once /health starts answering 200.
+func TestWaitForHealthyBecomesHealthy(t *testing.T) {
+	var ready atomic.Bool
+	handler := func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/health" || !ready.Load() {
+			http.Error(w, "starting", http.StatusServiceUnavailable)
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+	}
+	server := httptest.NewServer(http.HandlerFunc(handler))
+	defer server.Close()
+
+	// Flip to healthy shortly after polling begins.
+	time.AfterFunc(300*time.Millisecond, func() { ready.Store(true) })
+
+	err := waitForHealthy(server.URL, 5*time.Second)
+	if err != nil {
+		t.Fatalf("waitForHealthy returned error: %v", err)
+	}
+}
+
+// TestWaitForHealthyTimesOut verifies that waitForHealthy reports an error when
+// the endpoint never returns 200 within the timeout.
+func TestWaitForHealthyTimesOut(t *testing.T) {
+	alwaysUnavailable := http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		http.Error(w, "never ready", http.StatusServiceUnavailable)
+	})
+	server := httptest.NewServer(alwaysUnavailable)
+	defer server.Close()
+
+	err := waitForHealthy(server.URL, 1*time.Second)
+	if err == nil {
+		t.Fatal("expected timeout error, got nil")
+	}
+}
diff --git a/docs/EVAL.md b/docs/EVAL.md
@@ -135,15 +135,37 @@ tests:
 
 ### 3. Run Tests
 
+The simplest path is `--serve`, which builds and launches your project in EvalServer
+mode, waits for it to become healthy, runs the tests, and stops it automatically:
+
+```bash
+# One command: launch the server, run tests, tear down
+agk eval tests.yaml --serve
+
+# View report
+cat .agk/reports/eval-report-*.md
+```
+
+`--serve` options:
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--serve` | off | Launch the project in EvalServer mode for the run, then stop it |
+| `--serve-dir` | `.` | Project directory to launch |
+| `--serve-cmd` | `go run .` | Custom launch command (e.g. a prebuilt binary); split on whitespace, shell quoting is not interpreted |
+| `--serve-timeout` | `90` | Seconds to wait for the server to become healthy |
+
+It sets `AGK_EVAL_MODE=true` in the launched process and polls `<target.url>/health`
+(using the test file's `target.url`) until it returns HTTP 200. Server output is
+captured and printed if startup fails.
+
+Prefer to manage the server yourself? Run it in a separate terminal and omit `--serve`:
+
 ```bash
 # Terminal 1: Start your workflow in EvalServer mode
 AGK_EVAL_MODE=true ./myworkflow
 
 # Terminal 2: Run tests
 agk eval tests.yaml --timeout 200
-
-# View report
-cat .agk/reports/eval-report-*.md
 ```
 
 ---