From 35fc605289b50d47396a9dfa5c791f4513e2c88f Mon Sep 17 00:00:00 2001
From: Kunal Kushwaha <kunal.kushwaha@gmail.com>
Date: Mon, 22 Jun 2026 00:17:19 +0900
Subject: [PATCH] feat(trace): add `agk trace diff` to compare two runs

Answers "did my change help?" by diffing two trace runs across duration,
spans, LLM calls, tokens, and estimated cost, with colored deltas
(improvements green, regressions red) and percentage change.

Run selection:
  agk trace diff                 # two most recent runs
  agk trace diff <a>             # <a> (baseline) vs latest
  agk trace diff <a> <b>         # explicit pair

Pairs naturally with `agk run`'s trace summary and the per-model cost work.
Includes unit tests for the delta direction, metric set, and formatting
(the cmd package's first tests).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 README.md              |   1 +
 cmd/trace_diff.go      | 202 +++++++++++++++++++++++++++++++++++++++++
 cmd/trace_diff_test.go |  73 +++++++++++++++
 3 files changed, 276 insertions(+)
 create mode 100644 cmd/trace_diff.go
 create mode 100644 cmd/trace_diff_test.go
diff --git a/README.md b/README.md
index 375d8a7..2630bce 100644
--- a/README.md
+++ b/README.md
@@ -203,6 +203,7 @@ agk trace mermaid > trace_flow.md
 | `trace list` | List all captured trace runs. |
 | `trace show` | Display summary of a specific run. |
 | `trace view` | Open the interactive TUI trace explorer. |
+| `trace diff` | Compare two trace runs (duration, spans, LLM calls, tokens, estimated cost). |
 | `trace mermaid` | Generate Mermaid flowchart of trace execution. |
 
 ---
diff --git a/cmd/trace_diff.go b/cmd/trace_diff.go
new file mode 100644
index 0000000..4fc72d2
--- /dev/null
+++ b/cmd/trace_diff.go
@@ -0,0 +1,202 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"text/tabwriter"
+	"time"
+
+	"github.com/fatih/color"
+	"github.com/spf13/cobra"
+)
+
+// diffMetric is one row of the diff table: the same measurement taken from both runs.
+type diffMetric struct {
+	Label       string               // row heading printed in the METRIC column
+	A, B        float64              // value in the baseline run (A) and in the new run (B)
+	LowerBetter bool                 // a lower B than A is an improvement
+	Colorize    bool                 // whether the delta should be colored good/bad
+	Format      func(float64) string // renders a value of this metric for display
+}
+
+// diffCmd is the "diff" subcommand: it accepts up to two run IDs and hands them to runTraceDiff.
+var diffCmd = &cobra.Command{
+	Use:   "diff [run-a] [run-b]",
+	Short: "Compare two trace runs (duration, tokens, cost, ...)",
+	Long: `Compare two trace runs and show the deltas for duration, spans, LLM calls,
+tokens, and estimated cost.
+
+Run selection:
+  agk trace diff                 # compare the two most recent runs
+  agk trace diff <run-a>         # compare <run-a> (baseline) against the latest run
+  agk trace diff <run-a> <run-b> # compare two explicit runs
+
+The first run is treated as the baseline (A); the second is the new run (B).
+For duration, tokens, and cost, a lower value in B is shown as an improvement.`,
+	Args: cobra.MaximumNArgs(2), // zero, one, or two run IDs; see "Run selection" in Long
+	RunE: func(cmd *cobra.Command, args []string) error {
+		return runTraceDiff(args)
+	},
+}
+
+func init() {
+	traceCmd.AddCommand(diffCmd) // registers diffCmd as a child of traceCmd
+}
+
+// runTraceDiff resolves the two run IDs, checks that both run paths exist, then loads their manifests and prints the diff.
+func runTraceDiff(args []string) error {
+	baseID, newID, err := resolveDiffRuns(args)
+	if err != nil {
+		return err
+	}
+	baseDir, newDir := filepath.Join(runsDirName, baseID), filepath.Join(runsDirName, newID)
+	if _, statErr := os.Stat(baseDir); statErr != nil {
+		return fmt.Errorf("trace not found: %s", baseID)
+	}
+	if _, statErr := os.Stat(newDir); statErr != nil {
+		return fmt.Errorf("trace not found: %s", newID)
+	}
+	baseRun, err := readManifest(baseDir)
+	if err != nil {
+		return fmt.Errorf("failed to read run %s: %w", baseID, err)
+	}
+	newRun, err := readManifest(newDir)
+	if err != nil {
+		return fmt.Errorf("failed to read run %s: %w", newID, err)
+	}
+	printDiff(baseRun, newRun)
+	return nil
+}
+
+// resolveDiffRuns picks the (baseline, new) run IDs from the CLI arguments.
+// Two arguments are returned as given; a single argument is paired with the
+// newest other run; any other count compares the two most recent runs.
+func resolveDiffRuns(args []string) (string, string, error) {
+	if len(args) == 2 {
+		return args[0], args[1], nil
+	}
+
+	if len(args) != 1 {
+		recent := recentRunIDs(runsDirName)
+		if len(recent) < 2 {
+			return "", "", fmt.Errorf("need at least two trace runs to diff (found %d)", len(recent))
+		}
+		// recent is newest-first, so the older of the top two is the baseline.
+		return recent[1], recent[0], nil
+	}
+
+	// One argument: pair it with the newest run that is not the baseline itself.
+	baseline := args[0]
+	for _, candidate := range recentRunIDs(runsDirName) {
+		if candidate != baseline {
+			return baseline, candidate, nil
+		}
+	}
+	return "", "", fmt.Errorf("need a second run to diff against %s", baseline)
+}
+
+// recentRunIDs returns the names of the directories directly under runsDir,
+// newest-first by modification time. Runs with equal timestamps keep the
+// filename order of os.ReadDir (stable sort), so the result is deterministic.
+// An unreadable runsDir yields nil; entries whose Info call fails are skipped.
+func recentRunIDs(runsDir string) []string {
+	entries, err := os.ReadDir(runsDir)
+	if err != nil {
+		return nil
+	}
+	type run struct {
+		name string
+		mod  time.Time
+	}
+	var runs []run
+	for _, e := range entries {
+		if !e.IsDir() {
+			continue
+		}
+		if info, err := e.Info(); err == nil {
+			runs = append(runs, run{e.Name(), info.ModTime()})
+		}
+	}
+	sort.SliceStable(runs, func(i, j int) bool { return runs[i].mod.After(runs[j].mod) })
+	ids := make([]string, len(runs))
+	for i, r := range runs {
+		ids[i] = r.name
+	}
+	return ids
+}
+
+func runDiffMetrics(a, b TraceRun) []diffMetric { // one diffMetric per table row; a supplies the A values, b the B values
+	return []diffMetric{ // only Duration, Tokens and Est. Cost set Colorize and LowerBetter; Spans and LLM Calls stay uncolored
+		{Label: "Duration", A: a.Duration, B: b.Duration, LowerBetter: true, Colorize: true, Format: fmtSeconds},
+		{Label: "Spans", A: float64(a.SpanCount), B: float64(b.SpanCount), Format: fmtCount},
+		{Label: "LLM Calls", A: float64(a.LLMCalls), B: float64(b.LLMCalls), Format: fmtCount},
+		{Label: "Tokens", A: float64(a.TotalTokens), B: float64(b.TotalTokens), LowerBetter: true, Colorize: true, Format: fmtCount},
+		{Label: "Est. Cost", A: a.EstimatedCost, B: b.EstimatedCost, LowerBetter: true, Colorize: true, Format: fmtUSD},
+	}
+}
+
+// deltaDirection classifies the change from m.A to m.B: +1 for an improvement,
+// -1 for a regression, and 0 when the values are equal or m.Colorize is false.
+func deltaDirection(m diffMetric) int {
+	switch change := m.B - m.A; {
+	case change == 0 || !m.Colorize:
+		return 0
+	case (change < 0) == m.LowerBetter:
+		// The value moved the preferred way: down if LowerBetter, up otherwise.
+		return 1
+	}
+	return -1
+}
+
+// printDiff writes the comparison of runs a (baseline) and b (new) to stdout as
+// a table, coloring a delta green for an improvement and red for a regression.
+func printDiff(a, b TraceRun) {
+	fmt.Println()
+	color.Cyan("📊 Trace Diff")
+	fmt.Printf("  A (baseline): %s\n", a.RunID)
+	fmt.Printf("  B (new):      %s\n", b.RunID)
+	fmt.Println(strings.Repeat("─", 64))
+	table := tabwriter.NewWriter(os.Stdout, 0, 0, 2, ' ', 0)
+	fmt.Fprintln(table, "METRIC\tA (baseline)\tB (new)\tΔ")
+	for _, metric := range runDiffMetrics(a, b) {
+		cell := formatDelta(metric)
+		if dir := deltaDirection(metric); dir == 1 {
+			cell = color.GreenString(cell)
+		} else if dir == -1 {
+			cell = color.RedString(cell)
+		}
+		fmt.Fprintf(table, "%s\t%s\t%s\t%s\n", metric.Label, metric.Format(metric.A), metric.Format(metric.B), cell)
+	}
+	_ = table.Flush() // a flush error is discarded
+	fmt.Println()
+}
+
+// formatDelta renders B-A as "<sign><magnitude> <arrow>" plus the signed percent change relative to A (omitted when A is 0); "—" when equal.
+func formatDelta(m diffMetric) string {
+	d := m.B - m.A
+	if d == 0 {
+		return "—"
+	}
+	sign, arrow := "+", "▲"
+	if d < 0 {
+		sign, arrow = "-", "▼"
+	}
+	if m.A == 0 {
+		return fmt.Sprintf("%s%s %s", sign, m.Format(absF(d)), arrow)
+	}
+	return fmt.Sprintf("%s%s %s %+.0f%%", sign, m.Format(absF(d)), arrow, d/m.A*100)
+}
+
+func fmtSeconds(v float64) string { return fmt.Sprintf("%.2fs", v) } // two decimals with an "s" suffix, e.g. "1.50s"
+func fmtCount(v float64) string   { return fmt.Sprintf("%.0f", v) }  // rounded to a whole number, e.g. "500"
+func fmtUSD(v float64) string     { return fmt.Sprintf("$%.4f", v) } // "$" prefix with four decimals, e.g. "$0.0050"
+
+func absF(v float64) float64 { // returns v with a negative sign flipped; other values pass through
+	if v < 0 {
+		v = -v
+	}
+	return v
+}
diff --git a/cmd/trace_diff_test.go b/cmd/trace_diff_test.go
new file mode 100644
index 0000000..117e09b
--- /dev/null
+++ b/cmd/trace_diff_test.go
@@ -0,0 +1,73 @@
+package cmd
+
+import "testing"
+
+func TestDeltaDirection(t *testing.T) {
+	tests := []struct {
+		name   string
+		metric diffMetric
+		want   int
+	}{
+		{name: "lower-better improvement", metric: diffMetric{A: 10, B: 5, LowerBetter: true, Colorize: true}, want: 1},
+		{name: "lower-better regression", metric: diffMetric{A: 5, B: 10, LowerBetter: true, Colorize: true}, want: -1},
+		{name: "equal is neutral", metric: diffMetric{A: 7, B: 7, LowerBetter: true, Colorize: true}, want: 0},
+		{name: "not colorized is neutral", metric: diffMetric{A: 10, B: 5, LowerBetter: true, Colorize: false}, want: 0},
+		{name: "higher-better improvement", metric: diffMetric{A: 5, B: 10, LowerBetter: false, Colorize: true}, want: 1},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := deltaDirection(tc.metric); got != tc.want {
+				t.Errorf("deltaDirection = %d, want %d", got, tc.want)
+			}
+		})
+	}
+}
+
+func TestRunDiffMetrics(t *testing.T) {
+	baseline := TraceRun{Duration: 2.0, SpanCount: 5, LLMCalls: 2, TotalTokens: 1000, EstimatedCost: 0.0100}
+	improved := TraceRun{Duration: 1.0, SpanCount: 4, LLMCalls: 1, TotalTokens: 500, EstimatedCost: 0.0050}
+
+	rows := runDiffMetrics(baseline, improved)
+	if got := len(rows); got != 5 {
+		t.Fatalf("expected 5 metrics, got %d", got)
+	}
+
+	directions := make(map[string]int, len(rows))
+	for _, row := range rows {
+		directions[row.Label] = deltaDirection(row)
+	}
+
+	// Half the tokens must count as an improvement.
+	if dir := directions["Tokens"]; dir != 1 {
+		t.Errorf("Tokens direction = %d, want 1 (improvement)", dir)
+	}
+	// Half the cost must count as an improvement.
+	if dir := directions["Est. Cost"]; dir != 1 {
+		t.Errorf("Cost direction = %d, want 1 (improvement)", dir)
+	}
+	// Spans changed but is not colorized, so it must stay neutral.
+	if dir := directions["Spans"]; dir != 0 {
+		t.Errorf("Spans direction = %d, want 0 (neutral)", dir)
+	}
+}
+
+func TestFormatDelta(t *testing.T) {
+	if got := formatDelta(diffMetric{A: 10, B: 10, Format: fmtCount}); got != "—" {
+		t.Errorf("equal delta = %q, want em dash", got)
+	}
+	// Halving 1000 tokens to 500: a drop of 500, i.e. -50%.
+	const want = "-500 ▼ -50%"
+	if got := formatDelta(diffMetric{A: 1000, B: 500, Format: fmtCount}); got != want {
+		t.Errorf("formatDelta = %q, want %q", got, want)
+	}
+}
+
+func TestResolveDiffRunsExplicit(t *testing.T) {
+	gotA, gotB, err := resolveDiffRuns([]string{"run-1", "run-2"})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if !(gotA == "run-1" && gotB == "run-2") {
+		t.Errorf("got (%s, %s), want (run-1, run-2)", gotA, gotB)
+	}
+}