diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 17e7a2a1e..0b0b2fd87 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -897,7 +897,7 @@ describe('writeArtifactsFromResults', () => { }, }, ]); - expect(envelope.schema_version).toBe('agentv.execution_trace.v1'); + expect(envelope.schema_version).toBe('agentv.trace.v1'); expect(envelope.artifact_id).toMatch(/^execution-trace-/); expect(envelope.eval.test_id).toBe('transcript-case'); expect(envelope.trace.spans.map((span) => span.attributes['gen_ai.operation.name'])).toEqual([ diff --git a/apps/cli/test/commands/results/remote-auto-export.test.ts b/apps/cli/test/commands/results/remote-auto-export.test.ts index a056d1c6a..f2283f09b 100644 --- a/apps/cli/test/commands/results/remote-auto-export.test.ts +++ b/apps/cli/test/commands/results/remote-auto-export.test.ts @@ -85,7 +85,7 @@ function payload(projectDir: string, runDir: string) { target: 'mock', timestamp: '2026-06-13T00:00:00.000Z', trace: { - schemaVersion: 'agentv.trace.v1', + schemaVersion: 'agentv.trajectory.v1', messages: [], events: [], eventCount: 0, diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index b69c03e24..f974df20a 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -206,7 +206,7 @@ describe('trace utils', () => { expect(results).toHaveLength(1); expect(results[0].test_id).toBe('test-2'); expect(results[0].trace).toMatchObject({ - schema_version: 'agentv.trace.v1', + schema_version: 'agentv.trajectory.v1', event_count: 0, messages: [], events: [], @@ -222,7 +222,7 @@ describe('trace utils', () => { expect(results).toHaveLength(1); expect(results[0].test_id).toBe('test-2'); expect(results[0].trace).toMatchObject({ - schema_version: 'agentv.trace.v1', + schema_version: 
'agentv.trajectory.v1', event_count: 0, messages: [], events: [], diff --git a/apps/cli/test/fixtures/mock-run-evaluation.ts b/apps/cli/test/fixtures/mock-run-evaluation.ts index 51f2b3a1e..98eda7a2c 100644 --- a/apps/cli/test/fixtures/mock-run-evaluation.ts +++ b/apps/cli/test/fixtures/mock-run-evaluation.ts @@ -72,7 +72,7 @@ function evalCaseIds(evalCases: ReadonlyArray | undefined): readonly st function buildTrace(targetName: string, testId: string, output: string): Record { const message = { role: 'assistant', content: output }; return { - schemaVersion: 'agentv.trace.v1', + schemaVersion: 'agentv.trajectory.v1', eventCount: 2, toolCalls: {}, errorCount: 0, diff --git a/docs/adr/2026-06-11-phoenix-observability-adapter.md b/docs/adr/2026-06-11-phoenix-observability-adapter.md index c866606d3..d39703229 100644 --- a/docs/adr/2026-06-11-phoenix-observability-adapter.md +++ b/docs/adr/2026-06-11-phoenix-observability-adapter.md @@ -6,7 +6,7 @@ Status: Proposed ## Context -AgentV exports evaluation traces through generic OpenTelemetry/OTLP plumbing and is adding a normalized trajectory contract for post-hoc trace evaluation. A focused follow-up proposed adding a Phoenix OTel backend preset for `--otel-backend phoenix`, but that raised a scope concern: Phoenix project routing, collector endpoint conventions, API keys, dataset concepts, and experiment behavior are backend-specific. +AgentV exports evaluation traces through generic OpenTelemetry/OTLP plumbing and is adding a derived trajectory contract for post-hoc trace evaluation. A focused follow-up proposed adding a Phoenix OTel backend preset for `--otel-backend phoenix`, but that raised a scope concern: Phoenix project routing, collector endpoint conventions, API keys, dataset concepts, and experiment behavior are backend-specific. AgentV's architecture principles prefer a lightweight core with extension points and adapters. Built-ins should be universal primitives that most users compose. 
Backend-specific observability integrations should not make AgentV core behave like a hosted trace or experiment platform. @@ -25,7 +25,7 @@ AgentV core should own: - generic OTLP/HTTP export configuration; - OTLP JSON file export; -- normalized trajectory types and wire conversion; +- derived trajectory types and wire conversion; - generic OTLP/OpenInference import/export mapping where it is backend-neutral; - small registry/discovery primitives for extension points. @@ -94,7 +94,7 @@ Negative: ## Tracker impact -- `av-vwa.6` remains valid: core should map normalized trajectories to and from generic OTLP/OpenInference shapes, while Phoenix-specific dataset, experiment, project, and span-kind behavior stays in adapter space. +- `av-vwa.6` remains valid: core should map derived trajectories to and from generic OTLP/OpenInference shapes, while Phoenix-specific dataset, experiment, project, and span-kind behavior stays in adapter space. - `av-vwa.6.1` should be revised from adding a Phoenix preset in core to adding the minimal observability backend extension seam plus a Phoenix resolver in the Phoenix adapter. If the extension seam is not approved, defer the bead and document generic OTLP environment-variable configuration for Phoenix instead. ## Open questions diff --git a/docs/plans/replay-target-workflow-handoff.md b/docs/plans/replay-target-workflow-handoff.md index b3598d2af..046108346 100644 --- a/docs/plans/replay-target-workflow-handoff.md +++ b/docs/plans/replay-target-workflow-handoff.md @@ -24,7 +24,7 @@ This work owns the replay target database loop: - preserve target output/messages/tool calls/transcript/usage/cost/duration where available, - prove replay makes zero live target calls with live-provider environment variables blanked. -The broader normalized trajectory contract remains a separate architecture unit. This replay loop should not invent a competing full trace schema. +The broader derived trajectory contract remains a separate architecture unit. 
This replay loop should not invent a competing full trace schema. ## Existing Useful Surface diff --git a/docs/plans/trace-envelope-implementation-spec.md b/docs/plans/trace-envelope-implementation-spec.md index df1103aab..ab8c985f2 100644 --- a/docs/plans/trace-envelope-implementation-spec.md +++ b/docs/plans/trace-envelope-implementation-spec.md @@ -10,7 +10,7 @@ date: 2026-06-15 ## Decision And Scope AgentV stores and interchanges full execution traces as an -`agentv.execution_trace.v1` artifact. The canonical trace body is an +`agentv.trace.v1` artifact. The canonical trace body is an OpenTelemetry span graph with GenAI semantic convention attributes and OpenInference attributes where they cover the concept. AgentV owns only the small artifact wrapper around that graph: eval and replay identity, source @@ -18,9 +18,9 @@ metadata, capture/redaction policy, conversion warnings, artifact pointers, and score provenance. This supersedes the older wording in `docs/plans/trace-evaluation-architecture.md` -that treats AgentV's normalized `Trace` or `NormalizedTrajectory` object as the -canonical artifact. Those objects can remain, but they must be documented and -implemented as derived read/projection views over the canonical span graph. +that treats AgentV's result-local `Trace` or trajectory object as the canonical +artifact. Those objects can remain, but they must be documented and implemented +as derived read/projection views over the canonical span graph. Source of truth: @@ -29,11 +29,11 @@ Source of truth: - Official OTLP JSON is a boundary format generated from, or imported into, that span body. Attribute names remain exact standard names such as `gen_ai.operation.name` and `openinference.span.kind`. -- `Message[]`, `outputs/transcript.jsonl`, `TraceSummary`, - `TraceArtifact`/`NormalizedTrajectory`, replay target output, and compact - grader inputs are derived compatibility/read views. 
+- `Message[]`, `outputs/transcript.jsonl`, `TraceSummary`, `TraceArtifact`, + replay target output, and compact grader inputs are derived compatibility/read + views. - Derived views must be named and treated as projections over - `agentv.execution_trace.v1`, not as separate canonical graphs: + `agentv.trace.v1`, not as separate canonical graphs: `traceEnvelopeToMessages()` for Provider `Message[]` and replay provider responses, `traceEnvelopeToTranscriptMessages()` for `outputs/transcript.jsonl`, `traceEnvelopeToTraceSummary()` for metrics @@ -61,7 +61,7 @@ their source keys exactly. Directional v1 shape: ```yaml -schema_version: agentv.execution_trace.v1 +schema_version: agentv.trace.v1 artifact_id: execution-trace-01j... created_at: "2026-06-15T12:00:00.000Z" @@ -210,7 +210,7 @@ Implementation pattern: ```ts interface TraceEnvelopeWire { - readonly schema_version: 'agentv.execution_trace.v1'; + readonly schema_version: 'agentv.trace.v1'; readonly artifact_id: string; readonly created_at: string; readonly eval: TraceEnvelopeEvalWire; @@ -219,7 +219,7 @@ interface TraceEnvelopeWire { } interface TraceEnvelope { - readonly schemaVersion: 'agentv.execution_trace.v1'; + readonly schemaVersion: 'agentv.trace.v1'; readonly artifactId: string; readonly createdAt: string; readonly eval: TraceEnvelopeEval; @@ -305,8 +305,8 @@ Minimal code slices: 4. Envelope -> derived views. Implement projections from envelope spans to `Message[]`, `TraceSummary`, - `TraceArtifact`/`NormalizedTrajectory` if still needed, and - `outputs/transcript.jsonl`. Existing artifacts should be produced by these + `TraceArtifact` if still needed, and `outputs/transcript.jsonl`. Existing + artifacts should be produced by these projections once tests prove parity. 5. Artifact sidecar wiring. @@ -381,9 +381,9 @@ bun run test Red/green UAT scenario: -1. 
Red on `origin/main` (`0ac6b294`): run the replay showcase and confirm the - run writes current result artifacts and `outputs/transcript.jsonl`, but no - canonical `agentv.execution_trace.v1` sidecar exists. +1. Red before this namespace change: run the replay showcase and confirm the run + writes current result artifacts and `outputs/transcript.jsonl`, but the + execution trace sidecar does not validate as canonical `agentv.trace.v1`. ```bash bun apps/cli/src/cli.ts eval \ @@ -394,7 +394,7 @@ Red/green UAT scenario: 2. Green on the implementation branch: run the identical command with a new output directory. Confirm each test artifact has the execution trace sidecar, the - sidecar validates against `agentv.execution_trace.v1`, spans export to OTLP + sidecar validates against `agentv.trace.v1`, spans export to OTLP JSON, and regenerated transcript rows match the existing transcript artifact except for any documented additive pointer fields. diff --git a/docs/plans/trace-evaluation-architecture.md b/docs/plans/trace-evaluation-architecture.md index a257b8ddb..deb6b72e7 100644 --- a/docs/plans/trace-evaluation-architecture.md +++ b/docs/plans/trace-evaluation-architecture.md @@ -11,6 +11,11 @@ date: 2026-06-04 Build AgentV's trace evaluation architecture around a versioned, provider-neutral trajectory contract. AgentV should ingest traces from AgentV runs, OTLP/OpenInference/Phoenix exports, Pi sessions, and transcript-style agent logs; normalize them into one trajectory model; and run existing and future graders against that model without becoming a hosted observability platform. +Update, 2026-06-17: `agentv.trace.v1` is the canonical AgentV trace artifact +namespace for the span-graph sidecar. The trajectory/read-model contract in this +older plan is a derived projection and uses `agentv.trajectory.v1` when it needs +a schema string. 
+ --- ## Problem Frame @@ -25,8 +30,8 @@ The best-practice direction is clear: larger players own trace stores, dashboard **Canonical Trajectory Contract** -- R1. AgentV must define a versioned normalized trajectory object that preserves ordered tool calls, model turns, tool inputs, tool outputs, IDs, timing, status, errors, source metadata, and optional raw evidence. -- R2. The normalized contract must be provider-neutral and must not require OpenTelemetry as the source format. +- R1. AgentV must define a versioned derived trajectory object that preserves ordered tool calls, model turns, tool inputs, tool outputs, IDs, timing, status, errors, source metadata, and optional raw evidence. +- R2. The derived trajectory contract must be provider-neutral and must not require OpenTelemetry as the source format. - R3. The contract must support branchable session sources by selecting an evaluation path before grading. - R4. The contract must retain enough metadata to explain grader evidence without requiring graders to parse provider-native payloads. - R5. All persisted trajectory wire formats must use `snake_case`; TypeScript internals must use `camelCase`. @@ -34,22 +39,22 @@ The best-practice direction is clear: larger players own trace stores, dashboard **Standards and Integrations** - R6. AgentV OTLP export must continue to emit standards-aligned GenAI spans, especially `invoke_agent`, `chat`, and `execute_tool` operations. -- R7. AgentV must map normalized trajectories to and from OTLP/OpenInference-style traces without making Phoenix-specific assumptions in core. +- R7. AgentV must map derived trajectories to and from OTLP/OpenInference-style traces without making Phoenix-specific assumptions in core. - R8. Phoenix integration must use AgentV as the eval-definition and grader layer while Phoenix remains the trace/dataset/experiment backend. - R9. Unsupported or lossy mappings must be explicit in conversion reports instead of silently approximated. 
**Post-Hoc Trace And Transcript Evaluation** -- R10. Existing deterministic trace graders, including `tool-trajectory` and `execution-metrics`, must run against normalized trajectories, not only live provider output messages or compact summaries. +- R10. Existing deterministic trace graders, including `tool-trajectory` and `execution-metrics`, must run against derived trajectories, not only live provider output messages or compact summaries. - R11. Post-hoc evaluation must accept AgentV run artifacts, AgentV OTLP files, Phoenix/Langfuse/OTLP exports, imported coding-agent transcripts, Pi session JSONL, and compact transcript JSONL through source-specific adapters. -- R12. Grader output must cite normalized trajectory evidence, such as matched tool call IDs, positions, timing, or source event IDs. -- R13. Trace evaluation must preserve current lightweight result output by deriving compact summaries from normalized trajectories. +- R12. Grader output must cite derived trajectory evidence, such as matched tool call IDs, positions, timing, or source event IDs. +- R13. Trace evaluation must preserve current lightweight result output by deriving compact summaries from derived trajectories. **Replay** -- R14. Normalized traces and transcripts must support replay as well as grading: AgentV should be able to replay recorded model/tool messages as provider output for eval suites without invoking a live agent. +- R14. Derived trajectory artifacts and transcripts must support replay as well as grading: AgentV should be able to replay recorded model/tool messages as provider output for eval suites without invoking a live agent. - R15. Replay must preserve source provider metadata, ordered messages, tool calls, tool outputs, token usage, duration, cost, and redaction state where present. -- R16. Replay and grading must share the same normalized source artifact so users do not maintain separate transcript and trace formats for the same session. +- R16. 
Replay and grading must share the same derived source artifact so users do not maintain separate transcript and trace formats for the same session. - R17. Replay must be configurable as a normal target substitute, such as a replay target alias replacing a live coding-agent target without changing eval YAML or grader configuration. - R18. Replay lookup must use strict test/run identity, including at least suite or eval identity, test ID, target identity, and attempt or variant when present; missing or ambiguous records must fail loudly. - R19. AgentV must support recording live target responses into replayable JSONL fixtures so a later replay run can execute the same graders without live LLM calls. @@ -70,17 +75,17 @@ The best-practice direction is clear: larger players own trace stores, dashboard ## Key Technical Decisions -- **Start from realistic characterization evals:** The first implementation phase should collect a small set of real trace fixtures and write evals that answer useful agent-quality questions. The normalized contract should be pressure-tested by those evals before broad schema or adapter work expands. +- **Start from realistic characterization evals:** The first implementation phase should collect a small set of real trace fixtures and write evals that answer useful agent-quality questions. The derived trajectory contract should be pressure-tested by those evals before broad schema or adapter work expands. - **Normalize first, grade second:** Graders should consume AgentV's trajectory contract. Importers translate raw sources into the contract; exporters translate the contract into OTLP/OpenInference/Phoenix shapes. This avoids coupling graders to Phoenix, Pi, VS Code, or provider-specific logs. - **OTel is an interchange layer, not the canonical model:** VS Code and industry tooling make OTLP/HTTP and GenAI span semantics important, but entireio-style logs and Pi sessions prove valuable traces are often transcript or lifecycle JSON. 
AgentV should support OTel strongly without making it mandatory. -- **Tool trajectory is turn-centric, not span-centric:** The canonical object should model sessions, turns, messages, tool calls, tool results, and selected branches. Spans are one source and export view of those facts. +- **Tool trajectory is turn-centric, not span-centric:** The derived trajectory object should model sessions, turns, messages, tool calls, tool results, and selected branches as a projection over the canonical trace artifact. - **Coding-agent transcripts are trace sources:** `agentv import claude`, `agentv import codex`, `agentv import copilot`, and `agentv eval --transcript` already establish transcript import as offline grading infrastructure. The architecture should extend that path into trajectory normalization instead of creating a separate trace-only mechanism. - **Replay fixtures are not cached grader results:** AgentV replay should return previously recorded target output and then run graders fresh. This is closer to curated transcript replay than result-cache reuse, and it preserves realistic partial or failed behavior for evaluator development. - **Replay is target substitution, not only eval mode:** `agentv eval --transcript` is useful, but the showcase should prove a replay target can replace a live target in the same eval configuration. That keeps replay aligned with AgentV's target composition model. - **Strict lookup over fuzzy cache matching:** A replay database should fail on missing or ambiguous records rather than silently falling back to similar prompts. Smooth demo fallback can come later if real use demands it. - **Fix cache DX before expanding cache concepts:** AgentV already has response caching through `--cache`, `--no-cache`, YAML `execution.cache`, YAML `execution.cache_path`, and TS config `cache.enabled` / `cache.path`. The plan should first make that surface coherent, including wiring TS `cache.path`, before adding any new cache-related behavior. 
- **Branch selection is explicit:** Pi sessions are tree-structured. Import must choose a leaf/path or deterministic default before grading so a grader does not accidentally evaluate omitted branches. -- **Raw provider evidence stays adapter-owned:** Provider-native payloads may be stored as optional evidence for debugging, but built-in graders should use normalized fields. +- **Raw provider evidence stays adapter-owned:** Provider-native payloads may be stored as optional evidence for debugging, but built-in graders should use derived AgentV fields. - **Lossiness is reportable:** Conversion should emit warnings when timing is inferred, tool outputs are unavailable, content is redacted, or source semantics cannot map cleanly. - **Privacy policy sits at the boundary:** Import/export should apply one content capture and redaction policy before data reaches persisted artifacts or remote backends. @@ -88,11 +93,11 @@ The best-practice direction is clear: larger players own trace stores, dashboard ## High-Level Technical Design -AgentV should introduce a normalized trace layer between raw sources and evaluation. +AgentV should introduce a derived trajectory layer between raw sources and evaluation. ```mermaid flowchart TB - A[AgentV run output] --> N[Normalized trajectory] + A[AgentV run output] --> N[Derived trajectory] B[OTLP / OpenInference export] --> N C[Phoenix trace export] --> N D[Pi session JSONL] --> N @@ -110,15 +115,16 @@ flowchart TB Q --> R ``` -The normalized trace model should keep one canonical source of truth plus derived read models: +The derived trajectory model should stay a projection over the canonical +`agentv.trace.v1` sidecar plus derived read models: -- The full trajectory is the canonical artifact for grading, replay, and explanation: ordered model turns, tool calls/results, branch metadata, source event IDs, content redaction state, and raw evidence handles. 
+- The full trajectory is the derived artifact for grading, replay, and explanation: ordered model turns, tool calls/results, branch metadata, source event IDs, content redaction state, and raw evidence handles. - The compact summary is a derived compatibility/read model for cheap result storage and dashboard aggregation: counts, durations, token usage, cost, error count, and tool-call counts. It must be recomputable from a full trajectory and should not be authored as separate trace state when the trajectory is available. Directional wire shape: ```yaml -schema_version: agentv.trace.v1 +schema_version: agentv.trajectory.v1 source: kind: pi_session | agentv_run | otlp | phoenix | langfuse | imported_transcript | compact_transcript path: traces/session.jsonl @@ -172,17 +178,17 @@ The exact schema belongs in implementation, but these concepts should be stable: - **Test Scenarios:** The initial evals should ask whether the agent called the right tools in order, avoided unnecessary tools, edited expected files when file-change evidence exists, recovered from a tool error, stayed inside cost/latency bounds when metrics exist, and produced grader evidence tied back to source events. - **Verification:** If the proposed trajectory contract cannot express these fixtures cleanly, revise the contract before expanding adapters. If a field is unused by these evals, keep it optional or defer it. -### U1. Normalized Trajectory Model +### U1. Derived Trajectory Model - **Goal:** Introduce the core TypeScript model, Zod validation, and snake_case wire conversion for full trajectories. - **Files:** `packages/core/src/evaluation/trace.ts`, `packages/core/src/evaluation/types.ts`, `packages/eval/src/schemas.ts`, new focused files under `packages/core/src/evaluation/trace/` if the existing file becomes too large. - **Patterns:** Follow the existing `TraceSummary`, `TokenUsage`, and project wire conversion conventions. Keep internal fields camelCase and wire fields snake_case. 
- **Test Scenarios:** Add tests that validate round-trip conversion, version rejection, missing optional content, inferred duration flags, branch metadata, and raw evidence handles. -- **Verification:** Unit tests should prove summaries can be derived from full trajectories without changing current summary behavior, and that normalized trajectory artifacts do not embed a separate summary payload. +- **Verification:** Unit tests should prove summaries can be derived from full trajectories without changing current summary behavior, and that derived trajectory artifacts do not embed a separate summary payload. ### U2. Trajectory Extraction From AgentV Runs -- **Goal:** Convert live provider `Message[]` output and existing AgentV result artifacts into normalized trajectories. +- **Goal:** Convert live provider `Message[]` output and existing AgentV result artifacts into derived trajectories. - **Files:** `packages/core/src/evaluation/orchestrator.ts`, `packages/core/src/evaluation/providers/types.ts`, `packages/core/src/evaluation/trace.ts`, `apps/cli/src/commands/inspect/utils.ts`. - **Patterns:** Preserve `Message.toolCalls` as the primary high-fidelity source. Keep `TraceSummary` as derived output, not the only trace representation. - **Test Scenarios:** Cover assistant tool calls with IDs, calls without IDs, tool outputs, token usage, model metadata, errors, missing timing, and no-tool-call runs. @@ -190,7 +196,7 @@ The exact schema belongs in implementation, but these concepts should be stable: ### U3. OTLP and OpenInference Import/Export Mapping -- **Goal:** Map normalized trajectories to and from OTLP JSON/HTTP-compatible spans using GenAI and OpenInference-compatible semantics where available. +- **Goal:** Map derived trajectories to and from OTLP JSON/HTTP-compatible spans using GenAI and OpenInference-compatible semantics where available. 
- **Files:** `packages/core/src/observability/otel-exporter.ts`, `packages/core/src/observability/otlp-json-file-exporter.ts`, `apps/cli/src/commands/inspect/utils.ts`, tests under `packages/core/test/observability/` and `apps/cli/test/`. - **Patterns:** Continue human-readable span names (`invoke_agent `, `chat `, `execute_tool `) plus machine-stable attributes. Use `gen_ai.operation.name`, `gen_ai.tool.name`, `gen_ai.tool.call.id`, token usage attributes, and `agentv.*` only where standards do not cover the concept. - **Test Scenarios:** Import an AgentV OTLP file into a trajectory, export it back, and verify tool call order, call IDs, token usage, durations, redaction state, and grader score events survive when representable. @@ -201,12 +207,12 @@ The exact schema belongs in implementation, but these concepts should be stable: - **Goal:** Extend the Phoenix integration so Phoenix can be a trace source and experiment backend while AgentV remains the grader/eval definition layer. - **Files:** `packages/phoenix-adapter/src/`, `packages/phoenix-adapter/docs/support-matrix.md`, `packages/phoenix-adapter/docs/e2e-verification.md`, related CLI entry points if promoted beyond package scripts. - **Patterns:** Keep unsupported mappings visible, as the current Phoenix adapter already does. Do not move Phoenix-specific dataset or experiment concepts into AgentV core. -- **Test Scenarios:** Convert a Phoenix/OTLP trace export into normalized trajectories, run deterministic trace graders, report unsupported evaluator families, and emit a Phoenix experiment/report with AgentV grader results where supported. +- **Test Scenarios:** Convert a Phoenix/OTLP trace export into derived trajectories, run deterministic trace graders, report unsupported evaluator families, and emit a Phoenix experiment/report with AgentV grader results where supported. - **Verification:** Dry-run conversion must work offline. Live Phoenix smoke can remain a separately documented manual check. ### U5. 
Pi Session Importer -- **Goal:** Import Pi session JSONL, including Hugging Face `pi-mono` style files, into normalized trajectories. +- **Goal:** Import Pi session JSONL, including Hugging Face `pi-mono` style files, into derived trajectories. - **Files:** `packages/core/src/evaluation/providers/pi-cli.ts`, `packages/core/src/evaluation/providers/pi-coding-agent.ts`, likely new importer files under `packages/core/src/import/` or `packages/core/src/evaluation/trace/importers/`. - **Patterns:** Reuse existing Pi parsing where possible, but add branch/path selection. Fold `toolResult` messages into matching assistant tool calls by ID. Treat `bashExecution` mapping as explicit policy. - **Test Scenarios:** Cover session header parsing, branch selection, assistant `toolCall` blocks, separate `toolResult` entries, `bashExecution`, inline images, thinking blocks, token usage, cost, and inferred timing. @@ -225,16 +231,16 @@ The exact schema belongs in implementation, but these concepts should be stable: - **Goal:** Upgrade the current transcript import and replay path so imported Claude, Codex, and Copilot sessions can be normalized and graded as trajectories. - **Files:** `apps/cli/src/commands/import/`, `packages/core/src/import/`, `packages/core/src/import/transcript-provider.ts`, `apps/cli/src/commands/eval/run-eval.ts`. - **Patterns:** Reuse `TranscriptEntry`, `TranscriptJsonLine`, and `TranscriptProvider` instead of inventing a parallel transcript format. Preserve `agentv eval --transcript` compatibility while adding trajectory derivation. -- **Test Scenarios:** Import one Claude, Codex, and Copilot fixture; replay each through `--transcript`; derive normalized trajectories; run `tool-trajectory`; verify tool IDs, order, tool outputs, source provider metadata, duration, cost, token usage, and redaction state where available. 
+- **Test Scenarios:** Import one Claude, Codex, and Copilot fixture; replay each through `--transcript`; project each into a derived trajectory; run `tool-trajectory`; verify tool IDs, order, tool outputs, source provider metadata, duration, cost, token usage, and redaction state where available. - **Verification:** Existing transcript-provider tests keep passing, and new trace-evaluation tests prove imported transcripts and OTLP/Phoenix exports feed the same grader path. -### U6c. Replay From Normalized Trajectory +### U6c. Replay From Derived Trajectory -- **Goal:** Let normalized trajectories act as replay sources, not only grader inputs. +- **Goal:** Let derived trajectories act as replay sources, not only grader inputs. - **Files:** `packages/core/src/import/transcript-provider.ts`, `packages/core/src/import/types.ts`, trajectory model files from U1, `apps/cli/src/commands/eval/run-eval.ts`. -- **Patterns:** Treat replay as a projection from normalized trajectory to provider `Message[]`. Do not duplicate storage by keeping one trace file for graders and a separate transcript file for replay when the normalized artifact can serve both. -- **Test Scenarios:** Replay a normalized trajectory generated from AgentV output, an imported transcript, and an OTLP/Phoenix-style trace. Verify each produces the expected provider output messages and identical compact summaries where data is representable. -- **Verification:** Existing `agentv eval --transcript` behavior remains compatible, with a migration path to replay normalized trajectory artifacts directly. +- **Patterns:** Treat replay as a projection from derived trajectory to provider `Message[]`. Do not duplicate storage by keeping one trace file for graders and a separate transcript file for replay when the derived artifact can serve both. +- **Test Scenarios:** Replay a derived trajectory generated from AgentV output, an imported transcript, and an OTLP/Phoenix-style trace. 
Verify each produces the expected provider output messages and identical compact summaries where data is representable. +- **Verification:** Existing `agentv eval --transcript` behavior remains compatible, with a migration path to replay derived trajectory artifacts directly. ### U6d. Replay Target Database Loop @@ -254,10 +260,10 @@ The exact schema belongs in implementation, but these concepts should be stable: ### U7. Grader Context Upgrade -- **Goal:** Let built-in and code graders receive normalized trajectories in addition to compact summaries and output messages. +- **Goal:** Let built-in and code graders receive derived trajectories in addition to compact summaries and output messages. - **Files:** `packages/core/src/evaluation/graders/types.ts`, `packages/core/src/evaluation/graders/tool-trajectory.ts`, `packages/core/src/evaluation/graders/execution-metrics.ts`, `packages/core/src/evaluation/graders/code-grader.ts`, `packages/eval/src/index.ts`, `packages/eval/src/schemas.ts`. - **Patterns:** Keep backward compatibility: existing graders that only read `trace` or `output` continue to work. New trajectory-aware graders use the richer object. -- **Test Scenarios:** Existing `tool-trajectory` modes should pass from live output and from normalized trajectory input. Argument matching, ordering, latency, status/error matching, and evidence text should be covered. +- **Test Scenarios:** Existing `tool-trajectory` modes should pass from live output and from derived trajectory input. Argument matching, ordering, latency, status/error matching, and evidence text should be covered. - **Verification:** `trace score` should run `tool-trajectory` against imported traces, not only metrics-only graders. ### U8. CLI and Artifact Workflow @@ -280,14 +286,14 @@ The exact schema belongs in implementation, but these concepts should be stable: ## Acceptance Examples -- AE1. 
**Covers R1, R10, R13.** Given an AgentV run with ordered tool calls, when the run is converted to a normalized trajectory, then `tool-trajectory` can grade the order and the result still has the same compact summary counts. +- AE1. **Covers R1, R10, R13.** Given an AgentV run with ordered tool calls, when the run is converted to a derived trajectory, then `tool-trajectory` can grade the order and the result still has the same compact summary counts. - AE2. **Covers R2, R7, R11.** Given an OTLP trace export with `execute_tool` spans, when `agentv trace score` runs, then AgentV imports the spans into a trajectory and grades tool usage without requiring a Phoenix server. - AE3. **Covers R3, R11.** Given a branchable Pi session, when a selected leaf is provided or inferred, then only the selected branch path is graded and omitted branch IDs are recorded. - AE4. **Covers R8, R9.** Given a Phoenix trace export containing unsupported evaluator semantics, when the Phoenix adapter runs, then the report lists unsupported mappings instead of treating them as successful conversions. - AE5. **Covers R23.** Given content capture is disabled, when a trace with tool arguments and results is exported, then the trajectory preserves IDs, names, timing, and redaction state but does not persist raw content. - AE6. **Covers R12.** Given a failed tool trajectory expectation, when AgentV reports the grader result, then the assertion cites the actual matched or missing tool call positions and source event IDs. - AE7. **Covers R10, R11.** Given a transcript imported with `agentv import codex`, `agentv import claude`, or `agentv import copilot`, when the transcript is evaluated post-hoc, then the same trajectory graders used for OTLP/Phoenix traces can score it. -- AE8. **Covers R14, R15, R16.** Given the same normalized artifact, when AgentV uses it for replay and for grading, then replayed provider output and grader evidence come from the same ordered messages and tool calls. +- AE8. 
**Covers R14, R15, R16.** Given the same derived artifact, when AgentV uses it for replay and for grading, then replayed provider output and grader evidence come from the same ordered messages and tool calls. - AE9. **Covers R17, R18, R19.** Given a live target run has been recorded into replay JSONL, when the same eval runs with a replay target alias and shuffled fixture records, then AgentV selects the exact matching record, makes no live LLM call, and runs graders fresh. - AE10. **Covers R18.** Given a replay database has no matching record or has multiple matching records for the current test and target, when replay runs, then AgentV fails with an actionable error instead of falling back to a near match. - AE11. **Covers R20, R21, R22.** Given cache is enabled through TS config with a custom path, when an eval runs, then AgentV stores provider responses in that configured path and reports cache usage without confusing it with replay fixtures. diff --git a/examples/showcase/trace-evaluation/README.md b/examples/showcase/trace-evaluation/README.md index b4d2dbda1..1fae2bccb 100644 --- a/examples/showcase/trace-evaluation/README.md +++ b/examples/showcase/trace-evaluation/README.md @@ -41,7 +41,7 @@ The replay target looks up records by `suite`, `eval_path` when present, `test_i `source_target`, `attempt`, and `variant` when configured. Missing or duplicate records fail before grading. -Replay can also read `agentv.execution_trace.v1` artifacts by using +Replay can also read `agentv.trace.v1` artifacts by using `execution_traces` instead of `fixtures` on the replay target. 
Configure exactly one source field: diff --git a/packages/core/src/evaluation/replay-trace-envelopes.ts b/packages/core/src/evaluation/replay-trace-envelopes.ts index 11637bde4..3d369c2d9 100644 --- a/packages/core/src/evaluation/replay-trace-envelopes.ts +++ b/packages/core/src/evaluation/replay-trace-envelopes.ts @@ -1,7 +1,7 @@ /** * Execution-trace replay source for target-output substitution. * - * This module lets the replay provider read `agentv.execution_trace.v1` + * This module lets the replay provider read `agentv.trace.v1` * artifacts as the target-output source. Lookup uses the same replay identity * dimensions as JSONL fixtures, then projects the matched artifact to the * existing ProviderResponse shape with traceEnvelopeToMessages(). Opaque diff --git a/packages/core/src/evaluation/trace-envelope.ts b/packages/core/src/evaluation/trace-envelope.ts index 4ec392a4c..38aa70097 100644 --- a/packages/core/src/evaluation/trace-envelope.ts +++ b/packages/core/src/evaluation/trace-envelope.ts @@ -1,7 +1,7 @@ /** - * AgentV execution trace v1: AgentV-owned metadata around an OTel/OpenInference span graph. + * AgentV trace v1: AgentV-owned metadata around an OTel/OpenInference span graph. * - * The `agentv.execution_trace.v1` artifact is the canonical full trace sidecar + * The `agentv.trace.v1` artifact is the canonical full trace sidecar * for eval artifacts. AgentV owns the outer structure, eval/replay identity, * capture policy, warnings, artifact pointers, and score provenance. 
The trace * body is a standards-shaped span graph, so attribute keys such as @@ -35,7 +35,7 @@ import { } from './trace.js'; import type { EvaluationResult, EvaluationVerdict, GraderKind } from './types.js'; -export const EXECUTION_TRACE_SCHEMA_VERSION = 'agentv.execution_trace.v1' as const; +export const EXECUTION_TRACE_SCHEMA_VERSION = 'agentv.trace.v1' as const; const TRACE_ENVELOPE_FORMAT = 'otlp_openinference_spans' as const; const TRANSCRIPT_MESSAGE_EVENT_NAME = 'agentv.transcript.message' as const; @@ -367,7 +367,7 @@ export interface TraceEnvelopeToolTrajectoryItem { } export interface TraceEnvelopeToolTrajectoryView { - readonly schemaVersion: typeof EXECUTION_TRACE_SCHEMA_VERSION; + readonly schemaVersion: typeof NORMALIZED_TRAJECTORY_SCHEMA_VERSION; readonly traceId: string; readonly rootSpanId: string; readonly tools: readonly TraceEnvelopeToolTrajectoryItem[]; @@ -1494,7 +1494,7 @@ export function traceEnvelopeToToolTrajectoryView( }); return { - schemaVersion: envelope.schemaVersion, + schemaVersion: NORMALIZED_TRAJECTORY_SCHEMA_VERSION, traceId: envelope.trace.traceId, rootSpanId: envelope.trace.rootSpanId, tools, diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 89be72986..86430af4a 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -1,9 +1,9 @@ /** - * Trace read models for evaluation-time agent behavior. + * Derived trace read models for evaluation-time agent behavior. * - * `Trace` is the result-local compatibility/read model attached to evaluation - * results. The canonical exported execution trace sidecar is - * `agentv.execution_trace.v1` in `trace-envelope.ts`; result JSONL keeps + * `Trace` is the result-local projection attached to evaluation results. 
The + * canonical exported trace sidecar is `agentv.trace.v1` in + * `trace-envelope.ts`; result JSONL keeps * `output` as the final answer/scored result only, while the full transcript, * tool calls/results, errors, timing, usage, provider/session provenance, and * replay/eval metrics live in this read model. @@ -19,7 +19,7 @@ import { z } from 'zod'; import type { Message } from './providers/types.js'; -export const NORMALIZED_TRAJECTORY_SCHEMA_VERSION = 'agentv.trace.v1' as const; +export const NORMALIZED_TRAJECTORY_SCHEMA_VERSION = 'agentv.trajectory.v1' as const; export const NORMALIZED_TRACE_SOURCE_KINDS = [ 'agentv_run', @@ -173,7 +173,7 @@ export interface TraceEvent { } /** - * Legacy imported trace artifact shape used by older import/replay helpers. + * Derived trajectory artifact shape used by import/replay helpers. * * New evaluation results use `Trace` below: final answer in `output`, full * transcript under `trace.messages`, structured spans under `trace.events`, and @@ -194,8 +194,8 @@ export interface TraceArtifact { } /** - * @deprecated Use `Trace` for evaluation results or `TraceArtifact` for legacy - * import/replay artifacts. + * @deprecated Use `Trace` for evaluation results or `TraceArtifact` for + * derived import/replay trajectory artifacts. */ export type NormalizedTrajectory = TraceArtifact; @@ -724,18 +724,18 @@ export interface TraceSummary { } /** - * Result-local trace attached to every evaluation result. + * Result-local derived trace attached to every evaluation result. * * The compact TraceSummary fields are mirrored for existing * metric graders; `messages` and `events` are the complete execution record for * result JSONL compatibility. Result `output` is only the * final answer; tools, intermediate assistant text, timing, usage, provider * provenance, and replay metadata live here. Full export/import work should use - * the execution trace artifact and derive this shape from it. 
+ * the canonical trace artifact and derive this shape from it. */ export interface Trace extends TraceSummary { readonly schemaVersion: typeof TRACE_SCHEMA_VERSION; - /** Complete normalized chat transcript used for transcript-aware graders. */ + /** Complete chat transcript used for transcript-aware graders. */ readonly messages: readonly Message[]; /** Structured event stream derived from the same messages and metrics. */ readonly events: readonly TraceEvent[]; diff --git a/packages/core/test/evaluation/fixtures/execution-trace/nested-subagent.json b/packages/core/test/evaluation/fixtures/execution-trace/nested-subagent.json index 4e752de7b..697641dc6 100644 --- a/packages/core/test/evaluation/fixtures/execution-trace/nested-subagent.json +++ b/packages/core/test/evaluation/fixtures/execution-trace/nested-subagent.json @@ -1,5 +1,5 @@ { - "schema_version": "agentv.execution_trace.v1", + "schema_version": "agentv.trace.v1", "artifact_id": "execution-trace-nested-subagent", "created_at": "2026-06-15T12:00:05.000Z", "eval": { diff --git a/packages/core/test/evaluation/trace-envelope.test.ts b/packages/core/test/evaluation/trace-envelope.test.ts index b25735c9f..205b46c0c 100644 --- a/packages/core/test/evaluation/trace-envelope.test.ts +++ b/packages/core/test/evaluation/trace-envelope.test.ts @@ -18,7 +18,11 @@ import { traceEnvelopeToTraceSummary, traceEnvelopeToTranscriptMessages, } from '../../src/evaluation/trace-envelope.js'; -import { buildTraceFromMessages, computeTraceSummary } from '../../src/evaluation/trace.js'; +import { + NORMALIZED_TRAJECTORY_SCHEMA_VERSION, + buildTraceFromMessages, + computeTraceSummary, +} from '../../src/evaluation/trace.js'; import type { EvaluationResult } from '../../src/evaluation/types.js'; function jsonComparable(value: unknown): unknown { @@ -348,6 +352,7 @@ describe('execution trace artifact v1', () => { 'tool_call', 'tool_call', ]); + expect(compact.schemaVersion).toBe(NORMALIZED_TRAJECTORY_SCHEMA_VERSION); 
expect(compact.tools.map((tool) => [tool.position, tool.tool, tool.toolCallId])).toEqual([ [0, 'Read', 'call-read'], [1, 'Edit', 'call-edit'], diff --git a/packages/core/test/evaluation/trace-trajectory.test.ts b/packages/core/test/evaluation/trace-trajectory.test.ts index 9ff5ccff5..d6294ca70 100644 --- a/packages/core/test/evaluation/trace-trajectory.test.ts +++ b/packages/core/test/evaluation/trace-trajectory.test.ts @@ -135,13 +135,13 @@ function buildTrajectory(): NormalizedTrajectory { }; } -describe('normalized trajectory contract', () => { +describe('derived trajectory contract', () => { it('round-trips between internal camelCase and snake_case wire format', () => { const trajectory = buildTrajectory(); const wire = toNormalizedTrajectoryWire(trajectory); - expect(wire.schema_version).toBe('agentv.trace.v1'); + expect(wire.schema_version).toBe('agentv.trajectory.v1'); expect(wire.source.kind).toBe('pi_session'); expect(wire.session.session_id).toBe('session-123'); expect(wire.branch?.selected_leaf_id).toBe('leaf-success'); diff --git a/packages/eval/src/schemas.ts b/packages/eval/src/schemas.ts index 3200cdc38..1c42f6666 100644 --- a/packages/eval/src/schemas.ts +++ b/packages/eval/src/schemas.ts @@ -46,7 +46,7 @@ export const TraceSummarySchema = z.object({ llmCallCount: z.number().optional(), }); -export const NORMALIZED_TRAJECTORY_SCHEMA_VERSION = 'agentv.trace.v1' as const; +export const NORMALIZED_TRAJECTORY_SCHEMA_VERSION = 'agentv.trajectory.v1' as const; export const NORMALIZED_TRACE_SOURCE_KINDS = [ 'agentv_run', @@ -188,7 +188,7 @@ export const NormalizedTraceEventSchema = z.object({ }); /** - * Canonical trajectory schema exposed to custom graders. + * Derived trajectory schema exposed to custom graders. * * AgentV-owned persisted trajectory artifacts use the snake_case wire schemas * and converters in @agentv/core. 
This SDK schema mirrors the internal @@ -291,7 +291,7 @@ export const MessageSchema = z.object({ }); /** - * Canonical evaluation trace exposed to custom graders. + * Derived evaluation trace read model exposed to custom graders. * * Top-level summary fields (`eventCount`, `toolCalls`, `errorCount`) remain * available for existing metric graders; full transcript/tool evidence is under diff --git a/packages/eval/test/define-code-grader.test.ts b/packages/eval/test/define-code-grader.test.ts index e17e93230..0a348e5a2 100644 --- a/packages/eval/test/define-code-grader.test.ts +++ b/packages/eval/test/define-code-grader.test.ts @@ -15,7 +15,7 @@ import { } from '../src/schemas.js'; const makeTrace = (overrides: Record = {}) => ({ - schemaVersion: 'agentv.trace.v1', + schemaVersion: 'agentv.trajectory.v1', eventCount: 3, toolCalls: { read: 2, write: 1 }, errorCount: 0, diff --git a/packages/eval/test/define-prompt-template.test.ts b/packages/eval/test/define-prompt-template.test.ts index 2ed471c26..0d6fe7818 100644 --- a/packages/eval/test/define-prompt-template.test.ts +++ b/packages/eval/test/define-prompt-template.test.ts @@ -3,7 +3,7 @@ import { describe, expect, it } from 'bun:test'; import { PromptTemplateInputSchema } from '../src/schemas.js'; const makeTrace = (overrides: Record = {}) => ({ - schemaVersion: 'agentv.trace.v1', + schemaVersion: 'agentv.trajectory.v1', eventCount: 3, toolCalls: { read: 2, write: 1 }, errorCount: 0,