Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions docs/EVAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,47 @@ tests:

---

## Trace (Behavioral) Assertions

Content matching checks *what* an agent answered. Trace assertions check *how* it got
there — which tools it called, how many LLM calls it made, and the path it took. They run
**after** the content match and are configured under `expect.trace` (the canonical schema
lives in `internal/eval/types.go`).

After a successful invoke, AGK fetches the run's trace from the EvalServer
(`GET /traces/{id}`) and evaluates the assertions against it. The `tool_calls` check also
draws on the `tools_called` field of the invoke response, so it still runs when the trace
cannot be fetched.

```yaml
tests:
  - name: "Answers about Paris using search, efficiently"
    input: "What's the weather in Paris?"
    expect:
      type: contains
      values: ["Paris"]
      trace:
        tool_calls: ["search"]                  # each listed tool must have been called
        llm_calls: 2                            # exact LLM-call count
        execution_path: ["research", "format"]  # must appear, in order, as a subsequence
        min_steps: 2                            # observed steps >= 2
        max_steps: 8                            # observed steps <= 8
```

### Trace Fields (`expect.trace`)

| Field | Type | Check |
|-------|------|-------|
| `tool_calls` | string[] | Every listed tool must appear among the called tools (subset). |
| `llm_calls` | int | When > 0, the observed LLM-call count must match **exactly**. |
| `execution_path` | string[] | The listed span names must appear **in order** (gaps allowed). |
| `min_steps` | int | Observed step count (total spans) must be **≥** this. |
| `max_steps` | int | Observed step count (total spans) must be **≤** this. |

A test fails if any trace assertion fails; the failure message lists every failed
assertion. Omit a field (or leave a numeric field at `0`) to skip that check.

---

## Semantic Matching Strategies

### 1. Embedding Strategy
Expand Down
25 changes: 25 additions & 0 deletions internal/eval/http_target.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,31 @@ func (ht *HTTPTarget) Invoke(input string, timeout int) (*InvokeResponse, error)
return &invokeResp, nil
}

// FetchTrace retrieves a trace by ID from the EvalServer's GET /traces/{id} endpoint.
// It is used to validate trace (behavioral) expectations after an invoke.
//
// An error is returned when traceID is empty, when the request itself fails, when
// the server answers with anything other than 200 OK, or when the response body
// cannot be decoded into an evalTrace. The returned trace is nil on every error.
//
// NOTE: traceID is appended to the URL path as-is (no escaping), so callers must
// pass an ID that is already safe to use as a single path segment.
func (ht *HTTPTarget) FetchTrace(traceID string) (*evalTrace, error) {
	// Upper bound on how much of a non-200 response body is echoed into the
	// returned error, so a misbehaving server cannot make the message (and the
	// memory used to build it) arbitrarily large.
	const maxErrorBody = 4 << 10

	if traceID == "" {
		return nil, fmt.Errorf("empty trace id")
	}

	resp, err := ht.client.Get(ht.baseURL + "/traces/" + traceID)
	if err != nil {
		return nil, fmt.Errorf("trace request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only diagnostic context, so a read error is
		// deliberately ignored and whatever was read is reported with the status.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return nil, fmt.Errorf("trace fetch returned HTTP %d: %s", resp.StatusCode, string(body))
	}

	var trace evalTrace
	if err := json.NewDecoder(resp.Body).Decode(&trace); err != nil {
		return nil, fmt.Errorf("failed to parse trace: %w", err)
	}
	return &trace, nil
}

// Health checks if the target is healthy
func (ht *HTTPTarget) Health() error {
resp, err := ht.client.Get(ht.baseURL + "/health")
Expand Down
68 changes: 68 additions & 0 deletions internal/eval/http_target_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package eval

import (
"net/http"
"net/http/httptest"
"testing"
"time"
)

// TestFetchTrace exercises the happy path of FetchTrace against a stub server and
// then feeds the decoded trace through buildObservedTrace to confirm the wire
// format yields the expected LLM-call count and tool list.
func TestFetchTrace(t *testing.T) {
	const traceID = "trace-abc"

	// Serve a fixed three-span trace at /traces/<traceID>; any other path is a 404.
	serveTrace := func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/traces/"+traceID {
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`{
"id": "trace-abc",
"spans": [
{"name": "agk.agent.run"},
{"name": "agk.llm.generate"},
{"name": "agk.tool.call", "attributes": {"agk.tool.name": "search"}}
]
}`))
			return
		}
		http.Error(w, "not found", http.StatusNotFound)
	}
	srv := httptest.NewServer(http.HandlerFunc(serveTrace))
	defer srv.Close()

	client := NewHTTPTarget(srv.URL, 5*time.Second)

	got, err := client.FetchTrace(traceID)
	if err != nil {
		t.Fatalf("FetchTrace error: %v", err)
	}
	if got.ID != traceID {
		t.Errorf("trace ID = %q, want %q", got.ID, traceID)
	}
	if n := len(got.Spans); n != 3 {
		t.Fatalf("got %d spans, want 3", n)
	}

	// Round-trip through the normalizer to confirm the wire format is consumable.
	observed := buildObservedTrace(got, nil)
	if observed.LLMCalls != 1 {
		t.Errorf("LLMCalls = %d, want 1", observed.LLMCalls)
	}
	if tools := observed.ToolCalls; len(tools) != 1 || tools[0] != "search" {
		t.Errorf("ToolCalls = %v, want [search]", tools)
	}
}

// TestFetchTraceErrors covers the failure paths of FetchTrace: an empty trace ID,
// a non-200 response, and a 200 response whose body is not decodable JSON. Each
// scenario is its own subtest so a failure names the path that regressed.
func TestFetchTraceErrors(t *testing.T) {
	t.Run("empty trace id", func(t *testing.T) {
		// The empty ID is rejected before any request is issued, so the address
		// used here is never dialed.
		target := NewHTTPTarget("http://127.0.0.1:0", time.Second)
		if _, err := target.FetchTrace(""); err == nil {
			t.Error("expected error for empty trace id")
		}
	})

	t.Run("non-200 status", func(t *testing.T) {
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			http.Error(w, "nope", http.StatusNotFound)
		}))
		defer server.Close()

		target := NewHTTPTarget(server.URL, 5*time.Second)
		if _, err := target.FetchTrace("missing"); err == nil {
			t.Error("expected error for 404 trace fetch")
		}
	})

	t.Run("malformed body", func(t *testing.T) {
		// 200 OK with truncated JSON must surface the decode error instead of
		// handing back a half-populated trace.
		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.Header().Set("Content-Type", "application/json")
			_, _ = w.Write([]byte(`{"id": `))
		}))
		defer server.Close()

		target := NewHTTPTarget(server.URL, 5*time.Second)
		trace, err := target.FetchTrace("broken")
		if err == nil {
			t.Error("expected error for malformed trace body")
		}
		if trace != nil {
			t.Errorf("trace = %+v, want nil on error", trace)
		}
	})
}
27 changes: 26 additions & 1 deletion internal/eval/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package eval
import (
"context"
"fmt"
"strings"
"time"
)

Expand Down Expand Up @@ -190,8 +191,32 @@ func (r *Runner) runTest(test Test, target *HTTPTarget) TestResult {
return result
}

// TODO: Validate trace expectations if specified (test.Expect.Trace)
// Validate trace (behavioral) expectations if specified
if test.Expect.Trace != nil {
if failures := r.validateTraceExpectation(test, target, resp); len(failures) > 0 {
result.Passed = false
result.ErrorMessage = "trace assertion failed: " + strings.Join(failures, "; ")
return result
}
}

result.Passed = true
return result
}

// validateTraceExpectation fetches the run's trace (when available) and checks it
// against the test's trace expectation. Tool calls come from the invoke response;
// LLM-call count, execution path, and step counts come from the fetched trace.
//
// It returns the list of failed assertions (empty means the expectation holds).
//
// When no trace is available (the invoke response carried no trace ID, or the
// fetch failed), only tool_calls can be judged. Rather than comparing the other
// assertions against zero values (which lets max_steps pass vacuously and makes
// the rest fail with a misleading "observed 0"), a single failure is reported
// that names the assertions that could not be verified and why.
func (r *Runner) validateTraceExpectation(test Test, target *HTTPTarget, resp *InvokeResponse) []string {
	exp := test.Expect.Trace
	if exp == nil {
		return nil
	}

	var observed *evalTrace
	reason := "invoke response carried no trace id"
	if resp.TraceID != "" {
		t, err := target.FetchTrace(resp.TraceID)
		if err != nil {
			reason = fmt.Sprintf("could not fetch trace %s: %v", resp.TraceID, err)
			if r.config.Verbose {
				fmt.Printf(" [trace] could not fetch trace %s: %v\n", resp.TraceID, err)
			}
		} else {
			observed = t
		}
	}

	obs := buildObservedTrace(observed, resp.ToolsCalled)
	if observed != nil {
		return ValidateTrace(exp, obs)
	}

	// No trace: tool_calls is still checkable from the invoke response alone.
	failures := ValidateTrace(&TraceExpectation{ToolCalls: exp.ToolCalls}, obs)

	// Collect the requested assertions that need span data to be evaluated.
	var unverified []string
	if exp.LLMCalls > 0 {
		unverified = append(unverified, "llm_calls")
	}
	if len(exp.ExecutionPath) > 0 {
		unverified = append(unverified, "execution_path")
	}
	if exp.MinSteps > 0 {
		unverified = append(unverified, "min_steps")
	}
	if exp.MaxSteps > 0 {
		unverified = append(unverified, "max_steps")
	}
	if len(unverified) > 0 {
		failures = append(failures, fmt.Sprintf(
			"trace unavailable (%s); cannot verify %s", reason, strings.Join(unverified, ", ")))
	}
	return failures
}
160 changes: 160 additions & 0 deletions internal/eval/trace_validator.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
package eval

import (
"fmt"
"strings"
)

// evalTrace mirrors the subset of the EvalServer's GET /traces/{id} response that
// trace assertions need. The server type lives in the framework (v1beta.EvalTrace);
// this is a decode-only copy so the CLI doesn't depend on the framework package.
// Only the fields declared here are decoded from the response JSON.
type evalTrace struct {
	ID    string      `json:"id"`    // trace identifier from the "id" JSON field
	Spans []*evalSpan `json:"spans"` // spans from the "spans" JSON array; entries may be nil
}

// evalSpan is the decode-only view of a single span inside an evalTrace. Name is
// what trace assertions classify and match on; Attributes carries the span's
// free-form key/value data, from which a tool span's tool name is read.
type evalSpan struct {
	Name       string                 `json:"name"`       // span name, e.g. "agk.llm.generate"
	Attributes map[string]interface{} `json:"attributes"` // optional; nil when the span has none
}

// ObservedTrace is the normalized view of a run's behavior used for assertions.
// It is produced by buildObservedTrace and consumed by ValidateTrace; the zero
// value represents a run for which nothing was observed.
type ObservedTrace struct {
	ToolCalls []string // distinct tool names invoked, in first-seen order
	LLMCalls  int      // number of LLM spans
	Path      []string // ordered span names (the execution path)
	Steps     int      // total spans (a proxy for execution steps)
}

// buildObservedTrace normalizes a fetched trace (and the tools_called list from the
// invoke response) into an ObservedTrace. Either source may be empty; tools_called is
// treated as the authoritative tool list and augmented with any tool spans found.
//
// Parameters:
//   - t: the fetched trace, or nil when none is available; with a nil trace only
//     ToolCalls is populated.
//   - toolsCalled: tool names reported by the invoke response.
//
// Tool names are whitespace-trimmed and de-duplicated, keeping first-seen order.
// A span counts as an LLM call when its lower-cased name contains "llm", and as a
// tool call when isToolSpan accepts its lower-cased name. Nil span entries are
// skipped entirely, so Steps always equals len(Path).
func buildObservedTrace(t *evalTrace, toolsCalled []string) ObservedTrace {
	var obs ObservedTrace
	seen := make(map[string]bool, len(toolsCalled))

	// addTool records a tool name once, ignoring blanks and repeats.
	addTool := func(name string) {
		name = strings.TrimSpace(name)
		if name == "" || seen[name] {
			return
		}
		seen[name] = true
		obs.ToolCalls = append(obs.ToolCalls, name)
	}

	// The invoke response goes first so its ordering wins over span ordering.
	for _, name := range toolsCalled {
		addTool(name)
	}

	if t == nil {
		return obs
	}

	for _, sp := range t.Spans {
		if sp == nil {
			continue
		}
		obs.Path = append(obs.Path, sp.Name)
		// Count only spans that were actually processed; a nil entry contributes
		// nothing to Path or the call counts and must not inflate the step count.
		obs.Steps++

		lname := strings.ToLower(sp.Name)
		if strings.Contains(lname, "llm") {
			obs.LLMCalls++
		}
		if isToolSpan(lname) {
			// A tool span without a usable name attribute yields "", which
			// addTool drops.
			addTool(toolNameFromSpan(sp))
		}
	}

	return obs
}

// isToolSpan reports whether a span name denotes a tool invocation, i.e. whether
// it contains "tool.call" or "tool_call". The comparison is case-sensitive, so
// the caller is expected to pass an already lower-cased name.
func isToolSpan(lowerName string) bool {
	for _, marker := range [...]string{"tool.call", "tool_call"} {
		if strings.Contains(lowerName, marker) {
			return true
		}
	}
	return false
}

// toolNameFromSpan returns the tool name recorded in a tool span's attributes.
// The AgenticGoKit key "agk.tool.name" is consulted first, then the fallbacks
// "tool.name" and "tool"; the first key holding a non-empty string wins. It
// returns "" when no such attribute exists (including when Attributes is nil).
func toolNameFromSpan(sp *evalSpan) string {
	candidates := [...]string{"agk.tool.name", "tool.name", "tool"}
	for _, key := range candidates {
		// A missing key or a non-string value makes the assertion yield "",
		// which falls through to the next candidate.
		if name, _ := sp.Attributes[key].(string); name != "" {
			return name
		}
	}
	return ""
}

// ValidateTrace compares an ObservedTrace with a TraceExpectation and returns one
// human-readable message per failed assertion. A nil or empty result means every
// requested assertion held; a nil expectation always yields nil.
//
// Assertions, evaluated and reported in this order:
//   - tool_calls: each listed tool must be among the observed tools (subset check)
//   - llm_calls: checked only when > 0; the observed count must be equal
//   - min_steps: checked only when > 0; observed steps must be >= min
//   - max_steps: checked only when > 0; observed steps must be <= max
//   - execution_path: the listed names must occur in order as a subsequence of
//     the observed path (gaps allowed)
func ValidateTrace(exp *TraceExpectation, obs ObservedTrace) []string {
	if exp == nil {
		return nil
	}

	var failures []string
	report := func(format string, args ...interface{}) {
		failures = append(failures, fmt.Sprintf(format, args...))
	}

	// tool_calls: gather every expected tool that was never observed.
	if len(exp.ToolCalls) > 0 {
		called := make(map[string]bool, len(obs.ToolCalls))
		for _, name := range obs.ToolCalls {
			called[name] = true
		}

		var missing []string
		for _, name := range exp.ToolCalls {
			if called[name] {
				continue
			}
			missing = append(missing, name)
		}
		if len(missing) > 0 {
			report("expected tool call(s) not found: %v (called: %v)", missing, orNone(obs.ToolCalls))
		}
	}

	// llm_calls: zero means "not specified", so only positive values are enforced.
	if exp.LLMCalls > 0 && exp.LLMCalls != obs.LLMCalls {
		report("expected %d LLM call(s), observed %d", exp.LLMCalls, obs.LLMCalls)
	}

	// min_steps / max_steps: each bound is enforced only when set to a positive value.
	if exp.MinSteps > 0 && obs.Steps < exp.MinSteps {
		report("expected at least %d step(s), observed %d", exp.MinSteps, obs.Steps)
	}
	if exp.MaxSteps > 0 && obs.Steps > exp.MaxSteps {
		report("expected at most %d step(s), observed %d", exp.MaxSteps, obs.Steps)
	}

	// execution_path: ordered-subsequence match against the observed span names.
	if len(exp.ExecutionPath) > 0 {
		if !isOrderedSubsequence(exp.ExecutionPath, obs.Path) {
			report("expected execution path %v not found (in order) within observed %v",
				exp.ExecutionPath, orNone(obs.Path))
		}
	}

	return failures
}

// isOrderedSubsequence reports whether every element of want occurs in have in
// the same relative order, with arbitrary other elements allowed in between. An
// empty want is a subsequence of anything, including an empty have.
func isOrderedSubsequence(want, have []string) bool {
	pos := 0 // next index of have that is still unconsumed
	for _, w := range want {
		// Skip forward to the earliest remaining occurrence of w.
		for pos < len(have) && have[pos] != w {
			pos++
		}
		if pos == len(have) {
			return false
		}
		pos++ // consume the match so a repeated name needs a later occurrence
	}
	return true
}

// orNone returns s unchanged when it has at least one element; otherwise it
// returns a one-element slice holding the placeholder "<none>", so that empty
// lists stay readable when formatted into failure messages.
func orNone(s []string) []string {
	if len(s) > 0 {
		return s
	}
	return []string{"<none>"}
}
Loading
Loading