diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index f322a2cd3..65037ce7c 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -8,11 +8,13 @@ import { type GraderResult, type Message, type TargetDefinition, + type TraceEnvelope, type TraceSummary, buildTraceEnvelopeFromEvaluationResult, buildTraceFromMessages, extractLastAssistantContent, toTraceEnvelopeWire, + traceEnvelopeToTranscriptMessages, traceToTranscriptJsonLines, } from '@agentv/core'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; @@ -736,31 +738,37 @@ function resolveEnvelopeEvalPath( return source?.evalFileRepoPath ?? source?.evalFilePath ?? fallbackEvalFile; } +function resultHasExecutionTraceTranscript(result: EvaluationResult): boolean { + return result.output.length > 0 || result.trace.messages.length > 0; +} + async function writeTraceEnvelopeSidecar(params: { readonly result: EvaluationResult; readonly outputDir: string; readonly outputsDir: string; readonly evalPath?: string; readonly experiment?: string; -}): Promise { - const hasTranscript = params.result.output.length > 0 || params.result.trace.messages.length > 0; +}): Promise { + const hasTranscript = resultHasExecutionTraceTranscript(params.result); const envelope = buildTraceEnvelopeFromEvaluationResult(params.result, { evalPath: params.evalPath, runId: path.basename(params.outputDir), experiment: params.experiment, source: { path: RESULT_INDEX_FILENAME }, + capture: { content: 'full', redactionLevel: 'none', redactedFields: [] }, artifacts: { - envelope_path: 'outputs/trace-envelope.json', + execution_trace_path: 'outputs/execution-trace.json', answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined, response_path: params.result.output.length > 0 ? 'outputs/response.md' : undefined, transcript_path: hasTranscript ? 'outputs/transcript.jsonl' : undefined, }, }); await writeFile( - path.join(params.outputsDir, 'trace-envelope.json'), + path.join(params.outputsDir, 'execution-trace.json'), `${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`, 'utf8', ); + return envelope; } export function buildIndexArtifactEntry( @@ -829,7 +837,7 @@ export function buildResultIndexArtifact( const artifactSubdir = buildArtifactSubdir(result); const input = extractInput(result); const hasAnswer = result.output.length > 0; - const hasTranscript = result.trace.messages.length > 0 || result.trace.events.length > 0; + const hasTranscript = resultHasExecutionTraceTranscript(result); return { timestamp: result.timestamp, @@ -885,8 +893,23 @@ async function writeJsonlFile(filePath: string, records: readonly unknown[]): Pr await writeFile(filePath, content, 'utf8'); } -async function writeTranscriptJsonl(filePath: string, result: EvaluationResult): Promise { - const lines = traceToTranscriptJsonLines(result.trace, { +function traceProjectionForTranscript(result: EvaluationResult, envelope: TraceEnvelope) { + return { + ...result.trace, + messages: traceEnvelopeToTranscriptMessages(envelope), + }; +} + +function hasTranscriptProjection(result: EvaluationResult, envelope: TraceEnvelope): boolean { + return result.output.length > 0 || traceEnvelopeToTranscriptMessages(envelope).length > 0; +} + +async function writeTranscriptJsonl( + filePath: string, + result: EvaluationResult, + envelope: TraceEnvelope, +): Promise { + const lines = traceToTranscriptJsonLines(traceProjectionForTranscript(result, envelope), { testId: result.testId, target: result.target, }); @@ -1256,16 +1279,16 @@ export async function writePerTestArtifacts( // for scored output or transcript.jsonl for the full execution record. await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8'); } - if (result.output.length > 0 || result.trace.messages.length > 0) { - await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result); - } - await writeTraceEnvelopeSidecar({ + const envelope = await writeTraceEnvelopeSidecar({ result, outputDir, outputsDir, evalPath: resolveEnvelopeEvalPath(result, testByTestId), experiment: options?.experiment, }); + if (hasTranscriptProjection(result, envelope)) { + await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result, envelope); + } const taskBundle = await materializeTaskBundleForResult({ result, @@ -1336,16 +1359,16 @@ export async function writeArtifactsFromResults( // for scored output or transcript.jsonl for the full execution record. await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8'); } - if (result.output.length > 0 || result.trace.messages.length > 0) { - await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result); - } - await writeTraceEnvelopeSidecar({ + const envelope = await writeTraceEnvelopeSidecar({ result, outputDir, outputsDir, evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile), experiment: options?.experiment, }); + if (hasTranscriptProjection(result, envelope)) { + await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result, envelope); + } const taskBundle = await materializeTaskBundleForResult({ result, diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 8926f2c6f..83dfde8b4 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -8,7 +8,10 @@ import { type GraderResult, TraceEnvelopeWireSchema, buildTraceFromMessages, + fromTraceEnvelopeWire, parseYamlValue, + traceEnvelopeToTranscriptMessages, + traceToTranscriptJsonLines, } from '@agentv/core'; import { @@ -803,11 +806,31 @@ describe('writeArtifactsFromResults', () => { await writeArtifactsFromResults(results, testDir); - const transcriptLines = (await readFile(path.join(testDir, 'transcript.jsonl'), 'utf8')) + const transcriptLines = ( + await readFile(path.join(testDir, 'transcript-case', 'outputs', 'transcript.jsonl'), 'utf8') + ) .trim() .split('\n') .map((line) => JSON.parse(line)); + const envelope = TraceEnvelopeWireSchema.parse( + JSON.parse( + await readFile( + path.join(testDir, 'transcript-case', 'outputs', 'execution-trace.json'), + 'utf8', + ), + ), + ); + const projectedEnvelope = fromTraceEnvelopeWire(envelope); + const projectedTranscript = traceToTranscriptJsonLines( + { + ...results[0].trace, + messages: traceEnvelopeToTranscriptMessages(projectedEnvelope), + }, + { testId: 'transcript-case', target: 'codex' }, + ); + + expect(transcriptLines).toEqual(JSON.parse(JSON.stringify(projectedTranscript))); expect(transcriptLines).toEqual([ { test_id: 'transcript-case', @@ -845,16 +868,8 @@ describe('writeArtifactsFromResults', () => { }, }, ]); - - const envelope = TraceEnvelopeWireSchema.parse( - JSON.parse( - await readFile( - path.join(testDir, 'transcript-case', 'outputs', 'trace-envelope.json'), - 'utf8', - ), - ), - ); - expect(envelope.schema_version).toBe('agentv.trace_envelope.v1'); + expect(envelope.schema_version).toBe('agentv.execution_trace.v1'); + expect(envelope.artifact_id).toMatch(/^execution-trace-/); expect(envelope.eval.test_id).toBe('transcript-case'); expect(envelope.trace.spans.map((span) => span.attributes['gen_ai.operation.name'])).toEqual([ 'invoke_agent', @@ -865,7 +880,37 @@ describe('writeArtifactsFromResults', () => { const indexLine = JSON.parse( (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), ); - expect(indexLine).not.toHaveProperty('trace_envelope_path'); + expect(indexLine).not.toHaveProperty('execution_trace_path'); + }); + + it('omits per-test transcript links when the execution trace has no transcript rows', async () => { + const results = [ + makeResult({ + testId: 'no-transcript-case', + output: '', + trace: buildTraceFromMessages(), + }), + ]; + + await writeArtifactsFromResults(results, testDir); + + const transcriptPath = path.join(testDir, 'no-transcript-case', 'outputs', 'transcript.jsonl'); + await expect(readFile(transcriptPath, 'utf8')).rejects.toThrow(); + + const indexLine = JSON.parse( + (await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(), + ); + expect(indexLine).not.toHaveProperty('transcript_path'); + + const envelope = TraceEnvelopeWireSchema.parse( + JSON.parse( + await readFile( + path.join(testDir, 'no-transcript-case', 'outputs', 'execution-trace.json'), + 'utf8', + ), + ), + ); + expect(envelope.artifacts).not.toHaveProperty('transcript_path'); }); it('sanitizes test IDs for directory names', async () => { diff --git a/apps/cli/test/commands/trace/trace.test.ts b/apps/cli/test/commands/trace/trace.test.ts index c0b5ec535..b69c03e24 100644 --- a/apps/cli/test/commands/trace/trace.test.ts +++ b/apps/cli/test/commands/trace/trace.test.ts @@ -130,7 +130,7 @@ const OTLP_TRACE = JSON.stringify({ startTimeUnixNano: '2500000000', endTimeUnixNano: '3000000000', attributes: [{ key: 'gen_ai.tool.name', value: { stringValue: 'read_file' } }], - status: { code: 1 }, + status: { code: 2, message: 'tool failed' }, }, ], }, @@ -257,6 +257,7 @@ describe('trace utils', () => { expect(results[0].trace?.event_count).toBe(1); expect(results[0].trace?.llm_call_count).toBe(1); expect(results[0].trace?.tool_calls).toEqual({ read_file: 1 }); + expect(results[0].trace?.error_count).toBe(1); }); }); diff --git a/docs/plans/trace-envelope-implementation-spec.md b/docs/plans/trace-envelope-implementation-spec.md index 5ce4cacc6..df1103aab 100644 --- a/docs/plans/trace-envelope-implementation-spec.md +++ b/docs/plans/trace-envelope-implementation-spec.md @@ -1,20 +1,21 @@ --- -title: Trace Envelope Implementation Spec +title: Execution Trace Implementation Spec type: spec status: active date: 2026-06-15 --- -# Trace Envelope Implementation Spec +# Execution Trace Implementation Spec ## Decision And Scope -AgentV should store and interchange full execution traces as an `agentv.trace_envelope.v1` -artifact. The canonical trace body is an OpenTelemetry span graph with GenAI -semantic convention attributes and OpenInference attributes where they cover the -concept. AgentV owns only the small envelope around that graph: eval and replay -identity, source metadata, capture/redaction policy, conversion warnings, artifact -pointers, and score provenance. +AgentV stores and interchanges full execution traces as an +`agentv.execution_trace.v1` artifact. The canonical trace body is an +OpenTelemetry span graph with GenAI semantic convention attributes and +OpenInference attributes where they cover the concept. AgentV owns only the +small artifact wrapper around that graph: eval and replay identity, source +metadata, capture/redaction policy, conversion warnings, artifact pointers, and +score provenance. This supersedes the older wording in `docs/plans/trace-evaluation-architecture.md` that treats AgentV's normalized `Trace` or `NormalizedTrajectory` object as the @@ -23,14 +24,21 @@ implemented as derived read/projection views over the canonical span graph. Source of truth: -- `trace.spans` in the envelope is the canonical ordered span body for AgentV +- `trace.spans` in the execution trace artifact is the canonical ordered span body for AgentV trace evaluation, replay projection, export, and import. - Official OTLP JSON is a boundary format generated from, or imported into, that span body. Attribute names remain exact standard names such as `gen_ai.operation.name` and `openinference.span.kind`. - `Message[]`, `outputs/transcript.jsonl`, `TraceSummary`, `TraceArtifact`/`NormalizedTrajectory`, replay target output, and compact - grader inputs are derived compatibility views. + grader inputs are derived compatibility/read views. +- Derived views must be named and treated as projections over + `agentv.execution_trace.v1`, not as separate canonical graphs: + `traceEnvelopeToMessages()` for Provider `Message[]` and replay provider + responses, `traceEnvelopeToTranscriptMessages()` for + `outputs/transcript.jsonl`, `traceEnvelopeToTraceSummary()` for metrics + aggregation, compact tool trajectory views for trajectory graders, and + `traceEnvelopeToOtlpJson()` for OTLP/OpenInference export bodies. Non-goals: @@ -53,8 +61,8 @@ their source keys exactly. Directional v1 shape: ```yaml -schema_version: agentv.trace_envelope.v1 -envelope_id: trace-env-01j... +schema_version: agentv.execution_trace.v1 +artifact_id: execution-trace-01j... created_at: "2026-06-15T12:00:00.000Z" eval: @@ -68,7 +76,7 @@ eval: variant: null run_id: "2026-06-15T12-00-00-000Z" category: showcase - experiment: trace-envelope-v1 + experiment: execution-trace-v1 replay: lookup_key: @@ -111,7 +119,13 @@ trace: agentv.eval_path: examples/showcase/trace-evaluation/evals/coding-agent-replay.eval.yaml agentv.test_id: inspect-and-fix-config agentv.target: replay_coding_agent - events: [] + events: + - name: agentv.transcript.message + attributes: + agentv.transcript.message.index: 0 + agentv.transcript.message: + role: user + content: Inspect and fix the config. source: kind: agentv_run @@ -148,7 +162,7 @@ conversion_warnings: message: Deterministic tool call id generated from source order. artifacts: - envelope_path: outputs/trace-envelope.json + execution_trace_path: outputs/execution-trace.json otlp_path: outputs/trace.otlp.json answer_path: outputs/answer.md transcript_path: outputs/transcript.jsonl @@ -196,7 +210,8 @@ Implementation pattern: ```ts interface TraceEnvelopeWire { - readonly schema_version: 'agentv.trace_envelope.v1'; + readonly schema_version: 'agentv.execution_trace.v1'; + readonly artifact_id: string; readonly created_at: string; readonly eval: TraceEnvelopeEvalWire; readonly trace: TraceEnvelopeBodyWire; @@ -204,7 +219,8 @@ interface TraceEnvelopeWire { } interface TraceEnvelope { - readonly schemaVersion: 'agentv.trace_envelope.v1'; + readonly schemaVersion: 'agentv.execution_trace.v1'; + readonly artifactId: string; readonly createdAt: string; readonly eval: TraceEnvelopeEval; readonly trace: TraceEnvelopeBody; @@ -221,6 +237,7 @@ explicit known-field conversion plus Zod validation. It should not look like | Concept | Span representation | Known candidate standard attributes | AgentV envelope/attributes | Notes and uncertainty | | --- | --- | --- | --- | --- | | AgentV run/eval root | Root span named `invoke_agent ` for new envelopes. Accept `agentv.eval` on import for compatibility. `kind: INTERNAL` unless the source is a client call into a remote agent. | `gen_ai.operation.name=invoke_agent`; `gen_ai.provider.name`; `gen_ai.agent.name`; `gen_ai.agent.version`; `gen_ai.conversation.id`; `openinference.span.kind=AGENT`; `session.id`. | Envelope `eval.*` is authoritative. Duplicate searchable values on root as `agentv.eval_path`, `agentv.suite`, `agentv.test_id`, `agentv.target`, `agentv.run_id`, `agentv.attempt`, `agentv.variant`. | Current exporter uses `agentv.eval` and `gen_ai.operation.name=evaluate`; keep reader compatibility but do not make `evaluate` a v1 canonical requirement unless the GenAI spec stabilizes it for root spans. | +| Transcript compatibility rows | Root span events named `agentv.transcript.message` preserve ordered transcript rows needed for `outputs/transcript.jsonl`, including user/system input turns that are not provider output. | No stable OTel/OpenInference message-event shape covers AgentV's transcript JSONL compatibility artifact. | Event attribute `agentv.transcript.message.index` stores source order. Event attribute `agentv.transcript.message` stores a snake_case message object (`role`, `content`, `tool_calls`, `start_time`, `end_time`, `duration_ms`, `metadata`, `token_usage`). | `traceEnvelopeToTranscriptMessages()` uses these events for transcript JSONL. `traceEnvelopeToMessages()` intentionally remains assistant/output-only for replay provider responses. Opaque content, metadata, and tool input/output payload keys are preserved exactly inside the message object. | | Model/chat span | Child span named `chat ` for each model turn. Parent is root agent span or the source parent span if importing external OTel. | `gen_ai.operation.name=chat`; `gen_ai.provider.name`; `gen_ai.request.model`; `gen_ai.response.model`; `gen_ai.response.id`; `gen_ai.response.finish_reasons`; `gen_ai.input.messages` opt-in; `gen_ai.output.messages` opt-in; `openinference.span.kind=LLM`; `llm.system`; `llm.provider`; `llm.model_name`; `input.value`; `output.value`; `input.mime_type`; `output.mime_type`. | `agentv.turn_index`, `agentv.message_index`, `agentv.source_event_id` only when standards do not carry the identity. | Content attributes are sensitive and must follow `capture.content`. Preserve message payload keys when storing structured content internally or in raw evidence. | | Tool execution span | Span named `execute_tool `. For AgentV-generated traces, parent it to the chat span that requested the tool when known; otherwise parent it to the root agent span. | `gen_ai.operation.name=execute_tool`; `gen_ai.tool.name`; `gen_ai.tool.call.id`; `gen_ai.tool.type`; `gen_ai.tool.description`; `gen_ai.tool.call.arguments` opt-in; `gen_ai.tool.call.result` opt-in; `openinference.span.kind=TOOL`; `tool.name`; `tool.id`; `tool.description`; `tool.json_schema`; `input.value`; `output.value`. | `agentv.tool.index`, `agentv.generated_tool_call_id=true`, `agentv.source_event_id`, and warning `missing_tool_call_id` when AgentV generated an ID. | OTel and OpenInference both have tool-call identifiers but use different names. Emit both when useful and unambiguous. | | Tool result/event | Prefer result data on the `execute_tool` span (`gen_ai.tool.call.result` and/or OpenInference `output.value`) when capture policy allows. | `gen_ai.tool.call.result`; `output.value`; `output.mime_type`. | If result content is large or redacted, put an artifact pointer in `artifacts.raw_evidence_dir` and set `agentv.tool.result_ref` on the span. | Do not create a separate canonical `tool_result` span unless the source emitted one. Derived `Message.toolCalls[].output` and transcript rows can still expose a paired result. | @@ -237,8 +254,8 @@ explicit known-field conversion plus Zod validation. It should not look like ## Fixture Plan Golden fixtures should live under a trace-specific fixture directory, for example -`packages/core/test/evaluation/fixtures/trace-envelope/` or -`examples/showcase/trace-evaluation/fixtures/envelopes/`, with small raw-source +`packages/core/test/evaluation/fixtures/execution-trace/` or +`examples/showcase/trace-evaluation/fixtures/execution-traces/`, with small raw-source fixtures beside expected envelope JSON. Tests should compare semantic fields rather than full timestamps when timestamps are generated. @@ -293,8 +310,8 @@ Minimal code slices: projections once tests prove parity. 5. Artifact sidecar wiring. - Write `outputs/trace-envelope.json` or an equivalent sidecar and add an - optional `trace_envelope_path` pointer to per-test index entries only if the + Write `outputs/execution-trace.json` or an equivalent sidecar and add an + optional `execution_trace_path` pointer to per-test index entries only if the team accepts an additive index change. If not, write the sidecar inside the per-test artifact directory and leave index JSONL unchanged for the first PR. @@ -366,18 +383,18 @@ Red/green UAT scenario: 1. Red on `origin/main` (`0ac6b294`): run the replay showcase and confirm the run writes current result artifacts and `outputs/transcript.jsonl`, but no - canonical `agentv.trace_envelope.v1` sidecar exists. + canonical `agentv.execution_trace.v1` sidecar exists. ```bash bun apps/cli/src/cli.ts eval \ examples/showcase/trace-evaluation/evals/coding-agent-replay.eval.yaml \ --target replay_coding_agent \ - --output /tmp/agentv-trace-envelope-red + --output /tmp/agentv-execution-trace-red ``` 2. Green on the implementation branch: run the identical command with a new - output directory. Confirm each test artifact has the envelope sidecar, the - sidecar validates against `agentv.trace_envelope.v1`, spans export to OTLP + output directory. Confirm each test artifact has the execution trace sidecar, the + sidecar validates against `agentv.execution_trace.v1`, spans export to OTLP JSON, and regenerated transcript rows match the existing transcript artifact except for any documented additive pointer fields. @@ -385,13 +402,13 @@ Red/green UAT scenario: bun apps/cli/src/cli.ts eval \ examples/showcase/trace-evaluation/evals/coding-agent-replay.eval.yaml \ --target replay_coding_agent \ - --output /tmp/agentv-trace-envelope-green + --output /tmp/agentv-execution-trace-green ``` Artifacts to inspect: -- `/tmp/agentv-trace-envelope-green/index.jsonl` -- per-test `outputs/trace-envelope.json` +- `/tmp/agentv-execution-trace-green/index.jsonl` +- per-test `outputs/execution-trace.json` - per-test `outputs/transcript.jsonl` - per-test `outputs/answer.md` - generated OTLP JSON, if the implementation writes an OTLP sidecar @@ -412,7 +429,7 @@ Stability proof: Recommended defaults are included so implementation is not blocked. -1. Should the first `.9` PR add `trace_envelope_path` to `index.jsonl`? +1. Should the first `.9` PR add `execution_trace_path` to `index.jsonl`? Recommended default: write the sidecar in each per-test artifact directory first and defer the index pointer unless the dashboard/CLI needs discovery in the same PR. diff --git a/examples/showcase/trace-evaluation/README.md b/examples/showcase/trace-evaluation/README.md index 5e1e1c54d..b4d2dbda1 100644 --- a/examples/showcase/trace-evaluation/README.md +++ b/examples/showcase/trace-evaluation/README.md @@ -41,20 +41,20 @@ The replay target looks up records by `suite`, `eval_path` when present, `test_i `source_target`, `attempt`, and `variant` when configured. Missing or duplicate records fail before grading. -Replay can also read `agentv.trace_envelope.v1` artifacts by using -`trace_envelopes` instead of `fixtures` on the replay target. Configure exactly +Replay can also read `agentv.execution_trace.v1` artifacts by using +`execution_traces` instead of `fixtures` on the replay target. Configure exactly one source field: ```yaml targets: - - name: replay_from_envelopes + - name: replay_from_execution_traces provider: replay - trace_envelopes: ../fixtures/trace-envelopes.jsonl + execution_traces: ../fixtures/execution-traces.jsonl suite: trace-evaluation-showcase source_target: live_coding_agent ``` -Envelope replay requires the matched envelope to contain full captured assistant +Execution trace replay requires the matched artifact to contain full captured assistant output. Metadata-only trace sidecars fail before grading rather than replaying an empty answer. diff --git a/packages/core/src/evaluation/graders/types.ts b/packages/core/src/evaluation/graders/types.ts index c1376e5e8..e519c667e 100644 --- a/packages/core/src/evaluation/graders/types.ts +++ b/packages/core/src/evaluation/graders/types.ts @@ -37,7 +37,7 @@ export interface EvaluationContext { readonly evaluator?: GraderConfig; /** Output messages from agent execution (primary source for tool trajectory) */ readonly output?: readonly Message[]; - /** Canonical execution trace with messages, events, metrics, and provenance. */ + /** Result-local trace read model with messages, events, metrics, and provenance. */ readonly trace?: Trace; /** Token usage from provider execution (promoted from TraceSummary) */ readonly tokenUsage?: TokenUsage; diff --git a/packages/core/src/evaluation/providers/replay.ts b/packages/core/src/evaluation/providers/replay.ts index c3858b43a..9e94cd471 100644 --- a/packages/core/src/evaluation/providers/replay.ts +++ b/packages/core/src/evaluation/providers/replay.ts @@ -3,7 +3,7 @@ * * Configure it in targets.yaml with `provider: replay`, the `source_target` * whose live outputs were recorded, and exactly one replay source: `fixtures` - * JSONL or `trace_envelopes`. The provider does not invoke the source target; + * JSONL or `execution_traces`. The provider does not invoke the source target; * it only performs strict replay lookup and returns the recorded * ProviderResponse so graders can run fresh. */ @@ -43,7 +43,7 @@ export class ReplayProvider implements Provider { const record = findReplayFixtureRecord(records, this.lookupForRequest(request)); return replayFixtureRecordToProviderResponse(record); } - case 'trace_envelopes': { + case 'execution_traces': { const records = await readTraceEnvelopeReplayRecords(source.path); const record = findTraceEnvelopeReplayRecord(records, this.lookupForRequest(request)); return traceEnvelopeReplayRecordToProviderResponse(record); @@ -62,7 +62,7 @@ export class ReplayProvider implements Provider { ), ); } - case 'trace_envelopes': { + case 'execution_traces': { const records = await readTraceEnvelopeReplayRecords(source.path); return requests.map((request) => traceEnvelopeReplayRecordToProviderResponse( @@ -100,6 +100,6 @@ function resolveReplaySource( return { kind: 'fixtures', path: config.fixturesPath }; } throw new Error( - 'Replay provider requires exactly one replay source: fixtures or trace_envelopes', + 'Replay provider requires exactly one replay source: fixtures or execution_traces', ); } diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index be94fe827..396c3ab7f 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -579,7 +579,7 @@ export interface ReplayResolvedConfig { export type ReplayResolvedSource = | { readonly kind: 'fixtures'; readonly path: string } - | { readonly kind: 'trace_envelopes'; readonly path: string }; + | { readonly kind: 'execution_traces'; readonly path: string }; export interface TargetDeprecationWarning { readonly location: string; @@ -2033,26 +2033,26 @@ function resolveReplayConfig( const fixtures = resolveOptionalString(target.fixtures, env, `${target.name} replay fixtures`, { allowLiteral: true, }); - const traceEnvelopes = resolveOptionalString( - target.trace_envelopes, + const executionTraces = resolveOptionalString( + target.execution_traces, env, - `${target.name} replay trace_envelopes`, + `${target.name} replay execution_traces`, { allowLiteral: true, }, ); - if ((fixtures ? 1 : 0) + (traceEnvelopes ? 1 : 0) !== 1) { + if ((fixtures ? 1 : 0) + (executionTraces ? 1 : 0) !== 1) { throw new Error( - `Target "${target.name}" (provider: replay) requires exactly one replay source: "fixtures" or "trace_envelopes"`, + `Target "${target.name}" (provider: replay) requires exactly one replay source: "fixtures" or "execution_traces"`, ); } const fixturesPath = fixtures ? resolveReplaySourcePath(fixtures, evalFilePath) : undefined; - const traceEnvelopesPath = traceEnvelopes - ? resolveReplaySourcePath(traceEnvelopes, evalFilePath) + const executionTracesPath = executionTraces + ? resolveReplaySourcePath(executionTraces, evalFilePath) : undefined; const source: ReplayResolvedSource = fixturesPath ? { kind: 'fixtures', path: fixturesPath } - : { kind: 'trace_envelopes', path: traceEnvelopesPath as string }; + : { kind: 'execution_traces', path: executionTracesPath as string }; const sourceTarget = resolveString( target.source_target, env, diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index a3270df52..49d928f11 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -413,7 +413,7 @@ export interface TargetDefinition { readonly response?: string | unknown | undefined; // Replay fixture fields readonly fixtures?: string | unknown | undefined; - readonly trace_envelopes?: string | unknown | undefined; + readonly execution_traces?: string | unknown | undefined; readonly source_target?: string | unknown | undefined; readonly eval_path?: string | unknown | undefined; // VSCode fields diff --git a/packages/core/src/evaluation/replay-trace-envelopes.ts b/packages/core/src/evaluation/replay-trace-envelopes.ts index cdaf2c3b3..11637bde4 100644 --- a/packages/core/src/evaluation/replay-trace-envelopes.ts +++ b/packages/core/src/evaluation/replay-trace-envelopes.ts @@ -1,11 +1,11 @@ /** - * Trace-envelope replay source for target-output substitution. + * Execution-trace replay source for target-output substitution. * - * This module lets the replay provider read `agentv.trace_envelope.v1` + * This module lets the replay provider read `agentv.execution_trace.v1` * artifacts as the target-output source. Lookup uses the same replay identity - * dimensions as JSONL fixtures, then projects the matched envelope to the + * dimensions as JSONL fixtures, then projects the matched artifact to the * existing ProviderResponse shape with traceEnvelopeToMessages(). Opaque - * message, tool, provider, and source payloads stay inside the envelope + * message, tool, provider, and source payloads stay inside the execution trace * projection without recursive key conversion. */ @@ -40,7 +40,7 @@ export async function readTraceEnvelopeReplayRecords( } catch (error) { const reason = error instanceof Error ? error.message : String(error); throw new Error( - `Trace envelope replay source not found or unreadable: ${sourcePath}: ${reason}`, + `Execution trace replay source not found or unreadable: ${sourcePath}: ${reason}`, ); } @@ -65,10 +65,10 @@ export function findTraceEnvelopeReplayRecord( const key = formatReplayLookupKey(lookup); if (matches.length === 0) { - throw new Error(`Trace envelope replay lookup found no record for ${key}`); + throw new Error(`Execution trace replay lookup found no record for ${key}`); } throw new Error( - `Trace envelope replay lookup found ${matches.length} duplicate records for ${key}`, + `Execution trace replay lookup found ${matches.length} duplicate records for ${key}`, ); } @@ -89,8 +89,8 @@ export function traceEnvelopeReplayRecordToProviderResponse( startTime: summary.startTime, endTime: summary.endTime, raw: { - replay_trace_envelope: dropUndefined({ - envelope_id: record.envelope.envelopeId, + replay_execution_trace: dropUndefined({ + artifact_id: record.envelope.artifactId, source_path: record.sourcePath, line_number: record.lineNumber, suite: identity.suite, @@ -138,7 +138,7 @@ function parseTraceEnvelopeDocuments( documents.push({ value: JSON.parse(line), lineNumber: i + 1 }); } catch (error) { const reason = error instanceof Error ? error.message : String(error); - throw new Error(`Invalid trace envelope JSONL at ${sourcePath}:${i + 1}: ${reason}`); + throw new Error(`Invalid execution trace JSONL at ${sourcePath}:${i + 1}: ${reason}`); } } return documents; @@ -155,7 +155,7 @@ function parseTraceEnvelopeDocument( } catch (error) { const location = lineNumber === undefined ? sourcePath : `${sourcePath}:${lineNumber}`; const reason = error instanceof Error ? error.message : String(error); - throw new Error(`Invalid trace envelope replay record at ${location}: ${reason}`); + throw new Error(`Invalid execution trace replay record at ${location}: ${reason}`); } } @@ -196,14 +196,14 @@ function assertReplayableMessages( ): void { if (output.length === 0) { throw new Error( - `Trace envelope replay source ${formatRecordLocation(record)} cannot project to provider Message[]: no chat spans found`, + `Execution trace replay source ${formatRecordLocation(record)} cannot project to provider Message[]: no chat spans found`, ); } const lastAssistant = [...output].reverse().find((message) => message.role === 'assistant'); if (!lastAssistant || lastAssistant.content === undefined) { throw new Error( - `Trace envelope replay source ${formatRecordLocation(record)} is missing assistant output content; replay requires a full-content trace envelope before grading`, + `Execution trace replay source ${formatRecordLocation(record)} is missing assistant output content; replay requires a full-content execution trace before grading`, ); } } diff --git a/packages/core/src/evaluation/trace-envelope.ts b/packages/core/src/evaluation/trace-envelope.ts index 18a58e9c8..4ec392a4c 100644 --- a/packages/core/src/evaluation/trace-envelope.ts +++ b/packages/core/src/evaluation/trace-envelope.ts @@ -1,11 +1,20 @@ /** - * Trace envelope v1: AgentV-owned metadata around an OTel/OpenInference span graph. + * AgentV execution trace v1: AgentV-owned metadata around an OTel/OpenInference span graph. * - * The envelope is the canonical full trace sidecar for eval artifacts. AgentV - * owns the outer structure, eval/replay identity, capture policy, warnings, - * artifact pointers, and score provenance. The trace body is a standards-shaped - * span graph, so attribute keys such as `gen_ai.operation.name` and - * `openinference.span.kind` are copied exactly and never case-converted. + * The `agentv.execution_trace.v1` artifact is the canonical full trace sidecar + * for eval artifacts. AgentV owns the outer structure, eval/replay identity, + * capture policy, warnings, artifact pointers, and score provenance. The trace + * body is a standards-shaped span graph, so attribute keys such as + * `gen_ai.operation.name` and `openinference.span.kind` are copied exactly and + * never case-converted. + * + * Derived views such as Provider `Message[]`, `outputs/transcript.jsonl`, + * `TraceSummary`, compact tool trajectories, replay provider responses, and + * OTLP JSON export bodies must project from this artifact. Transcript JSONL + * uses AgentV transcript events on the root span so compatibility rows can + * include input/system turns without changing replay's assistant-only view. + * Do not introduce a second canonical graph for those compatibility/read + * models. * * To extend the wire shape, add snake_case fields to the focused Zod schema, * convert them explicitly in the matching to/from helper, and keep opaque maps @@ -26,9 +35,10 @@ import { } from './trace.js'; import type { EvaluationResult, EvaluationVerdict, GraderKind } from './types.js'; -export const TRACE_ENVELOPE_SCHEMA_VERSION = 'agentv.trace_envelope.v1' as const; +export const EXECUTION_TRACE_SCHEMA_VERSION = 'agentv.execution_trace.v1' as const; const TRACE_ENVELOPE_FORMAT = 'otlp_openinference_spans' as const; +const TRANSCRIPT_MESSAGE_EVENT_NAME = 'agentv.transcript.message' as const; const CAPTURE_CONTENT_VALUES = ['none', 'metadata', 'full'] as const; const REDACTION_LEVEL_VALUES = ['none', 'partial', 'full'] as const; @@ -147,8 +157,8 @@ export interface TraceEnvelopeScore { } export interface TraceEnvelope { - readonly schemaVersion: typeof TRACE_ENVELOPE_SCHEMA_VERSION; - readonly envelopeId: string; + readonly schemaVersion: typeof EXECUTION_TRACE_SCHEMA_VERSION; + readonly artifactId: string; readonly createdAt: string; readonly eval: TraceEnvelopeEval; readonly replay?: TraceEnvelopeReplay; @@ -298,8 +308,8 @@ export const TraceEnvelopeScoreWireSchema = z export const TraceEnvelopeWireSchema = z .object({ - schema_version: z.literal(TRACE_ENVELOPE_SCHEMA_VERSION), - envelope_id: z.string(), + schema_version: z.literal(EXECUTION_TRACE_SCHEMA_VERSION), + artifact_id: z.string(), created_at: z.string(), eval: TraceEnvelopeEvalWireSchema, replay: TraceEnvelopeReplayWireSchema.optional(), @@ -339,10 +349,115 @@ export interface BuildTraceEnvelopeOptions { readonly now?: () => Date; } +export interface TraceEnvelopeToolTrajectoryItem { + readonly position: number; + readonly traceId: string; + readonly spanId: string; + readonly parentSpanId?: string; + readonly ancestorSpanIds: readonly string[]; + readonly tool: string; + readonly toolCallId: string; + readonly parentToolCallId?: string; + readonly input?: unknown; + readonly output?: unknown; + readonly status: 'ok' | 'error'; + readonly startTime?: string; + readonly endTime?: string; + readonly durationMs?: number; +} + +export interface TraceEnvelopeToolTrajectoryView { + readonly schemaVersion: typeof EXECUTION_TRACE_SCHEMA_VERSION; + readonly traceId: string; + readonly rootSpanId: string; + readonly tools: readonly TraceEnvelopeToolTrajectoryItem[]; +} + +export interface TraceEnvelopeOtlpJson { + readonly resourceSpans: readonly { + readonly resource: { + readonly attributes: readonly TraceEnvelopeOtlpAttribute[]; + }; + readonly scopeSpans: readonly { + readonly scope: { + readonly name?: string; + readonly version?: string; + }; + readonly spans: readonly TraceEnvelopeOtlpSpan[]; + }[]; + }[]; +} + +export interface TraceEnvelopeOtlpSpan { + readonly traceId: string; + readonly spanId: string; + readonly parentSpanId?: string; + readonly name: string; + readonly kind: number; + readonly startTimeUnixNano: string; + readonly endTimeUnixNano: string; + readonly attributes: readonly TraceEnvelopeOtlpAttribute[]; + readonly status: TraceEnvelopeOtlpSpanStatus; + readonly events?: readonly { + readonly name: string; + readonly timeUnixNano?: string; + readonly attributes: readonly TraceEnvelopeOtlpAttribute[]; + }[]; +} + +export interface TraceEnvelopeOtlpSpanStatus { + readonly code: number; + readonly message?: string; +} + +export interface TraceEnvelopeOtlpAttribute { + readonly key: string; + readonly value: TraceEnvelopeOtlpAnyValue; +} + +export type TraceEnvelopeOtlpAnyValue = + | { readonly stringValue: string } + | { readonly intValue: number } + | { readonly doubleValue: number } + | { readonly boolValue: boolean } + | { readonly arrayValue: { readonly values: readonly TraceEnvelopeOtlpAnyValue[] } }; + +interface TraceEnvelopeTranscriptToolCallWire { + readonly tool: string; + readonly input?: unknown; + readonly output?: unknown; + readonly id?: string; + readonly start_time?: string; + readonly end_time?: string; + readonly duration_ms?: number; +} + +interface TraceEnvelopeTranscriptMessageWire { + readonly role: string; + readonly name?: string; + readonly content?: Message['content']; + readonly tool_calls?: readonly TraceEnvelopeTranscriptToolCallWire[]; + readonly start_time?: string; + readonly end_time?: string; + readonly duration_ms?: number; + readonly metadata?: Readonly>; + readonly token_usage?: TokenUsage; +} + +interface TraceEnvelopeMessageEntry { + readonly index: number; + readonly timeUnixNano?: string; + readonly message: Message; +} + function dropUndefined>(value: T): T { return Object.fromEntries(Object.entries(value).filter(([, entry]) => entry !== undefined)) as T; } +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + function definedStringRecord( value: Readonly> | undefined, ): Record | undefined { @@ -374,6 +489,35 @@ function msToUnixNano(ms: number): string { return String(BigInt(Math.round(ms)) * 1_000_000n); } +function compareUnixNanoStrings(first: string, second: string): number { + try { + const left = BigInt(first); + const right = BigInt(second); + return left < right ? -1 : left > right ? 1 : 0; + } catch { + return first.localeCompare(second); + } +} + +function compareSpanTime(first: TraceEnvelopeSpan, second: TraceEnvelopeSpan): number { + const byStart = compareUnixNanoStrings(first.startTimeUnixNano, second.startTimeUnixNano); + if (byStart !== 0) { + return byStart; + } + if (first.spanId === second.parentSpanId) { + return -1; + } + if (second.spanId === first.parentSpanId) { + return 1; + } + const byEnd = compareUnixNanoStrings(first.endTimeUnixNano, second.endTimeUnixNano); + return byEnd !== 0 ? byEnd : first.spanId.localeCompare(second.spanId); +} + +function orderedSpans(spans: readonly TraceEnvelopeSpan[]): TraceEnvelopeSpan[] { + return [...spans].sort(compareSpanTime); +} + function unixNanoToIso(value: string | undefined): string | undefined { if (!value) { return undefined; @@ -495,6 +639,54 @@ function maybeToolContentAttributes( }); } +function toTranscriptToolCallWire( + toolCall: ToolCall, + capture: TraceEnvelopeCapture, +): TraceEnvelopeTranscriptToolCallWire { + return dropUndefined({ + tool: toolCall.tool, + input: capture.content === 'full' ? toolCall.input : undefined, + output: capture.content === 'full' ? toolCall.output : undefined, + id: toolCall.id, + start_time: toolCall.startTime, + end_time: toolCall.endTime, + duration_ms: toolCall.durationMs, + }); +} + +function toTranscriptMessageWire( + message: Message, + capture: TraceEnvelopeCapture, +): TraceEnvelopeTranscriptMessageWire { + return dropUndefined({ + role: message.role, + name: message.name, + content: capture.content === 'full' ? message.content : undefined, + tool_calls: message.toolCalls?.map((toolCall) => toTranscriptToolCallWire(toolCall, capture)), + start_time: message.startTime, + end_time: message.endTime, + duration_ms: message.durationMs, + metadata: message.metadata, + token_usage: message.tokenUsage, + }); +} + +function transcriptMessageEvent( + message: Message, + index: number, + capture: TraceEnvelopeCapture, +): TraceEnvelopeSpanEvent { + const startMs = parseTimeMs(message.startTime); + return { + name: TRANSCRIPT_MESSAGE_EVENT_NAME, + timeUnixNano: startMs !== undefined ? msToUnixNano(startMs) : undefined, + attributes: dropUndefined({ + 'agentv.transcript.message.index': index, + 'agentv.transcript.message': toTranscriptMessageWire(message, capture), + }), + }; +} + function spanStatusFromResult(result: EvaluationResult): TraceEnvelopeSpanStatus { if (result.executionStatus === 'execution_error' || result.error) { return { code: 'ERROR', message: result.error }; @@ -550,7 +742,14 @@ export function buildTraceEnvelopeFromEvaluationResult( const capture = capturePolicy(options); const source = sourceFromResult(result, options); const traceId = hashHex( - ['trace-envelope', result.timestamp, result.suite, result.testId, result.target, options.runId], + [ + 'execution-trace', + result.timestamp, + result.suite, + result.testId, + result.target, + options.runId, + ], 32, ); const rootSpanId = hashHex([traceId, 'root'], 16); @@ -562,6 +761,9 @@ export function buildTraceEnvelopeFromEvaluationResult( const rootStatus = spanStatusFromResult(result); const conversionWarnings: TraceEnvelopeConversionWarning[] = []; const spans: TraceEnvelopeSpan[] = []; + const rootEvents: TraceEnvelopeSpanEvent[] = result.trace.messages.map((message, index) => + transcriptMessageEvent(message, index, capture), + ); const rootAttributes = dropUndefined({ 'gen_ai.operation.name': 'invoke_agent', @@ -596,13 +798,14 @@ export function buildTraceEnvelopeFromEvaluationResult( attributes: rootAttributes, events: result.error ? [ + ...rootEvents, { name: 'exception', timeUnixNano: msToUnixNano(Math.max(rootStartMs, rootEndMs)), attributes: { 'exception.message': result.error }, }, ] - : [], + : rootEvents, }); const assistantEntries = assistantMessages(result.trace.messages); @@ -709,7 +912,7 @@ export function buildTraceEnvelopeFromEvaluationResult( } } - const envelopeId = `trace-env-${hashHex([traceId, result.timestamp, result.score], 20)}`; + const artifactId = `execution-trace-${hashHex([traceId, result.timestamp, result.score], 20)}`; const evalIdentity: TraceEnvelopeEval = { evalId: options.evalId, evalPath: options.evalPath, @@ -725,8 +928,8 @@ export function buildTraceEnvelopeFromEvaluationResult( }; return { - schemaVersion: TRACE_ENVELOPE_SCHEMA_VERSION, - envelopeId, + schemaVersion: EXECUTION_TRACE_SCHEMA_VERSION, + artifactId, createdAt: now.toISOString(), eval: evalIdentity, replay: options.replay, @@ -750,7 +953,7 @@ export function toTraceEnvelopeWire(envelope: TraceEnvelope): TraceEnvelopeWire return TraceEnvelopeWireSchema.parse( dropUndefined({ schema_version: envelope.schemaVersion, - envelope_id: envelope.envelopeId, + artifact_id: envelope.artifactId, created_at: envelope.createdAt, eval: toTraceEnvelopeEvalWire(envelope.eval), replay: envelope.replay ? toTraceEnvelopeReplayWire(envelope.replay) : undefined, @@ -768,7 +971,7 @@ export function fromTraceEnvelopeWire(input: unknown): TraceEnvelope { const wire = TraceEnvelopeWireSchema.parse(input); return { schemaVersion: wire.schema_version, - envelopeId: wire.envelope_id, + artifactId: wire.artifact_id, createdAt: wire.created_at, eval: fromTraceEnvelopeEvalWire(wire.eval), replay: wire.replay ? fromTraceEnvelopeReplayWire(wire.replay) : undefined, @@ -1106,10 +1309,89 @@ function toolCallFromSpan(span: TraceEnvelopeSpan): ToolCall { }; } -export function traceEnvelopeToMessages(envelope: TraceEnvelope): readonly Message[] { - const spans = [...envelope.trace.spans].sort((first, second) => - first.startTimeUnixNano.localeCompare(second.startTimeUnixNano), - ); +function buildSpanMap(spans: readonly TraceEnvelopeSpan[]): ReadonlyMap { + return new Map(spans.map((span) => [span.spanId, span])); +} + +function ancestorSpanIds( + span: TraceEnvelopeSpan, + spansById: ReadonlyMap, +): readonly string[] { + const ancestors: string[] = []; + const seen = new Set(); + let parentSpanId = span.parentSpanId ?? undefined; + + while (parentSpanId && !seen.has(parentSpanId)) { + seen.add(parentSpanId); + ancestors.push(parentSpanId); + parentSpanId = spansById.get(parentSpanId)?.parentSpanId ?? undefined; + } + + return ancestors; +} + +function nearestAncestorToolCallId( + ancestorIds: readonly string[], + spansById: ReadonlyMap, +): string | undefined { + for (const ancestorId of ancestorIds) { + const ancestor = spansById.get(ancestorId); + if (ancestor && isToolSpan(ancestor)) { + return toolCallFromSpan(ancestor).id; + } + } + return undefined; +} + +function fromTranscriptToolCallWire(wire: unknown): ToolCall | undefined { + if (!isRecord(wire) || typeof wire.tool !== 'string') { + return undefined; + } + return { + tool: wire.tool, + input: wire.input, + output: wire.output, + id: typeof wire.id === 'string' ? wire.id : undefined, + startTime: typeof wire.start_time === 'string' ? wire.start_time : undefined, + endTime: typeof wire.end_time === 'string' ? wire.end_time : undefined, + durationMs: numberAttribute(wire, 'duration_ms'), + }; +} + +function fromTranscriptMessageWire(wire: unknown): Message | undefined { + if (!isRecord(wire) || typeof wire.role !== 'string') { + return undefined; + } + const toolCalls = Array.isArray(wire.tool_calls) + ? wire.tool_calls + .map(fromTranscriptToolCallWire) + .filter((toolCall): toolCall is ToolCall => toolCall !== undefined) + : undefined; + return dropUndefined({ + role: wire.role, + name: typeof wire.name === 'string' ? wire.name : undefined, + content: wire.content as Message['content'], + toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : undefined, + startTime: typeof wire.start_time === 'string' ? wire.start_time : undefined, + endTime: typeof wire.end_time === 'string' ? wire.end_time : undefined, + durationMs: numberAttribute(wire, 'duration_ms'), + metadata: isRecord(wire.metadata) ? wire.metadata : undefined, + tokenUsage: isRecord(wire.token_usage) + ? tokenUsageFromAttributes({ + 'gen_ai.usage.input_tokens': wire.token_usage.input, + 'gen_ai.usage.output_tokens': wire.token_usage.output, + 'gen_ai.usage.cache_read.input_tokens': wire.token_usage.cached, + 'gen_ai.usage.reasoning.output_tokens': wire.token_usage.reasoning, + }) + : undefined, + }); +} + +function traceEnvelopeToMessageEntries( + envelope: TraceEnvelope, +): readonly TraceEnvelopeMessageEntry[] { + const spans = orderedSpans(envelope.trace.spans); + const spansById = buildSpanMap(spans); const toolSpansByParent = new Map(); for (const span of spans.filter(isToolSpan)) { const parentSpanId = span.parentSpanId ?? envelope.trace.rootSpanId; @@ -1118,21 +1400,107 @@ export function traceEnvelopeToMessages(envelope: TraceEnvelope): readonly Messa toolSpansByParent.set(parentSpanId, existing); } - return spans.filter(isChatSpan).map((span) => ({ - role: 'assistant', - content: span.attributes['gen_ai.output.messages'] as Message['content'], - toolCalls: toolSpansByParent.get(span.spanId)?.map(toolCallFromSpan), - startTime: unixNanoToIso(span.startTimeUnixNano), - endTime: unixNanoToIso(span.endTimeUnixNano), - durationMs: durationMsFromSpan(span), - tokenUsage: tokenUsageFromAttributes(span.attributes), - metadata: { - span_id: span.spanId, - trace_id: span.traceId, + return spans.filter(isChatSpan).map((span, fallbackIndex) => ({ + index: numberAttribute(span.attributes, 'agentv.message.index') ?? fallbackIndex, + timeUnixNano: span.startTimeUnixNano, + message: { + role: 'assistant', + content: span.attributes['gen_ai.output.messages'] as Message['content'], + toolCalls: toolSpansByParent.get(span.spanId)?.map(toolCallFromSpan), + startTime: unixNanoToIso(span.startTimeUnixNano), + endTime: unixNanoToIso(span.endTimeUnixNano), + durationMs: durationMsFromSpan(span), + tokenUsage: tokenUsageFromAttributes(span.attributes), + metadata: dropUndefined({ + span_id: span.spanId, + trace_id: span.traceId, + parent_span_id: span.parentSpanId ?? undefined, + parent_tool_call_id: nearestAncestorToolCallId(ancestorSpanIds(span, spansById), spansById), + }), }, })); } +export function traceEnvelopeToMessages(envelope: TraceEnvelope): readonly Message[] { + return traceEnvelopeToMessageEntries(envelope).map((entry) => entry.message); +} + +function transcriptMessageEntries(envelope: TraceEnvelope): readonly TraceEnvelopeMessageEntry[] { + const entries: TraceEnvelopeMessageEntry[] = []; + for (const span of orderedSpans(envelope.trace.spans)) { + for (const event of span.events ?? []) { + if (event.name !== TRANSCRIPT_MESSAGE_EVENT_NAME) { + continue; + } + const attributes = event.attributes ?? {}; + const message = fromTranscriptMessageWire(attributes['agentv.transcript.message']); + if (!message) { + continue; + } + entries.push({ + index: numberAttribute(attributes, 'agentv.transcript.message.index') ?? entries.length, + timeUnixNano: event.timeUnixNano, + message, + }); + } + } + return entries; +} + +export function traceEnvelopeToTranscriptMessages(envelope: TraceEnvelope): readonly Message[] { + const entries = transcriptMessageEntries(envelope); + if (entries.length === 0) { + return traceEnvelopeToMessages(envelope); + } + return [...entries] + .sort((first, second) => { + const byIndex = first.index - second.index; + if (byIndex !== 0) { + return byIndex; + } + if (first.timeUnixNano && second.timeUnixNano) { + return compareUnixNanoStrings(first.timeUnixNano, second.timeUnixNano); + } + return 0; + }) + .map((entry) => entry.message); +} + +export function traceEnvelopeToToolTrajectoryView( + envelope: TraceEnvelope, +): TraceEnvelopeToolTrajectoryView { + const spans = orderedSpans(envelope.trace.spans); + const spansById = buildSpanMap(spans); + const tools = spans.filter(isToolSpan).map((span, position) => { + const toolCall = toolCallFromSpan(span); + const toolCallId = toolCall.id ?? span.spanId; + const ancestorIds = ancestorSpanIds(span, spansById); + return { + position, + traceId: span.traceId, + spanId: span.spanId, + parentSpanId: span.parentSpanId ?? undefined, + ancestorSpanIds: ancestorIds, + tool: toolCall.tool, + toolCallId, + parentToolCallId: nearestAncestorToolCallId(ancestorIds, spansById), + input: toolCall.input, + output: toolCall.output, + status: span.status.code === 'ERROR' ? 'error' : 'ok', + startTime: toolCall.startTime, + endTime: toolCall.endTime, + durationMs: toolCall.durationMs, + } satisfies TraceEnvelopeToolTrajectoryItem; + }); + + return { + schemaVersion: envelope.schemaVersion, + traceId: envelope.trace.traceId, + rootSpanId: envelope.trace.rootSpanId, + tools, + }; +} + export function traceEnvelopeToTraceSummary(envelope: TraceEnvelope): TraceComputeResult { const toolCallCounts: Record = {}; const toolDurations: Record = {}; @@ -1196,7 +1564,7 @@ export function traceEnvelopeToTraceSummary(envelope: TraceEnvelope): TraceCompu export function traceEnvelopeToTraceArtifact(envelope: TraceEnvelope): TraceArtifact { const events: TraceEvent[] = []; let ordinal = 0; - for (const span of envelope.trace.spans) { + for (const span of orderedSpans(envelope.trace.spans)) { if (isChatSpan(span)) { events.push({ eventId: `span-${span.spanId}`, @@ -1269,3 +1637,100 @@ export function traceEnvelopeToTraceArtifact(envelope: TraceEnvelope): TraceArti export function getTraceEnvelopeSummary(envelope: TraceEnvelope): TraceSummary { return traceEnvelopeToTraceSummary(envelope).trace; } + +export function traceEnvelopeToOtlpJson(envelope: TraceEnvelope): TraceEnvelopeOtlpJson { + return { + resourceSpans: [ + { + resource: { + attributes: attributesToOtlp(envelope.trace.resource?.attributes), + }, + scopeSpans: [ + { + scope: dropUndefined({ + name: envelope.trace.scope?.name, + version: envelope.trace.scope?.version, + }), + spans: orderedSpans(envelope.trace.spans).map((span) => + dropUndefined({ + traceId: span.traceId, + spanId: span.spanId, + parentSpanId: span.parentSpanId ?? undefined, + name: span.name, + kind: spanKindToOtlp(span.kind), + startTimeUnixNano: span.startTimeUnixNano, + endTimeUnixNano: span.endTimeUnixNano, + attributes: attributesToOtlp(span.attributes), + status: spanStatusToOtlp(span.status), + events: span.events?.map((event) => + dropUndefined({ + name: event.name, + timeUnixNano: event.timeUnixNano, + attributes: attributesToOtlp(event.attributes), + }), + ), + }), + ), + }, + ], + }, + ], + }; +} + +function spanKindToOtlp(kind: string): number { + if (kind === 'SERVER') { + return 1; + } + if (kind === 'CLIENT') { + return 2; + } + if (kind === 'PRODUCER') { + return 3; + } + if (kind === 'CONSUMER') { + return 4; + } + return 0; +} + +function spanStatusToOtlp(status: TraceEnvelopeSpanStatus): TraceEnvelopeOtlpSpanStatus { + const code = status.code === 'OK' ? 1 : status.code === 'ERROR' ? 2 : 0; + return dropUndefined({ code, message: status.message }); +} + +function attributesToOtlp( + attributes: Readonly> | undefined, +): readonly TraceEnvelopeOtlpAttribute[] { + return Object.entries(attributes ?? {}).map(([key, value]) => ({ + key, + value: toOtlpAnyValue(value), + })); +} + +function toOtlpAnyValue(value: unknown): TraceEnvelopeOtlpAnyValue { + if (typeof value === 'string') { + return { stringValue: value }; + } + if (typeof value === 'number') { + return Number.isInteger(value) ? { intValue: value } : { doubleValue: value }; + } + if (typeof value === 'boolean') { + return { boolValue: value }; + } + if (Array.isArray(value)) { + return { arrayValue: { values: value.map(toOtlpAnyValue) } }; + } + return { stringValue: stringifyOtlpAttribute(value) }; +} + +function stringifyOtlpAttribute(value: unknown): string { + if (value === undefined) { + return ''; + } + try { + return JSON.stringify(value); + } catch { + return String(value); + } +} diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 1541961db..89be72986 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -1,10 +1,12 @@ /** - * Trace models for evaluation-time agent behavior. + * Trace read models for evaluation-time agent behavior. * - * `Trace` is AgentV's canonical normalized execution model. Evaluation results - * keep `output` as the final answer/scored result only; the full transcript, + * `Trace` is the result-local compatibility/read model attached to evaluation + * results. The canonical exported execution trace sidecar is + * `agentv.execution_trace.v1` in `trace-envelope.ts`; result JSONL keeps + * `output` as the final answer/scored result only, while the full transcript, * tool calls/results, errors, timing, usage, provider/session provenance, and - * replay/eval metrics live in `trace`. + * replay/eval metrics live in this read model. * * `TraceSummary` is a derived compact read model for metric-style graders and * aggregation. Derive it from `Trace.messages`/`Trace.events`; do not treat it @@ -722,13 +724,14 @@ export interface TraceSummary { } /** - * Canonical trace attached to every evaluation result. + * Result-local trace attached to every evaluation result. * * The compact TraceSummary fields are mirrored for existing - * metric graders; `messages` and `events` are the complete canonical - * execution record. Result `output` is only the final answer; tools, - * intermediate assistant text, timing, usage, provider provenance, and replay - * metadata live here. + * metric graders; `messages` and `events` are the complete execution record for + * result JSONL compatibility. Result `output` is only the + * final answer; tools, intermediate assistant text, timing, usage, provider + * provenance, and replay metadata live here. Full export/import work should use + * the execution trace artifact and derive this shape from it. */ export interface Trace extends TraceSummary { readonly schemaVersion: typeof TRACE_SCHEMA_VERSION; @@ -816,8 +819,8 @@ function toTraceError(error: TraceError | string): TraceError { } /** - * Build the canonical trace for an evaluation case from provider messages and - * execution metrics. This is the single projection used by result JSONL, + * Build the result-local trace read model for an evaluation case from provider + * messages and execution metrics. This is the projection used by result JSONL, * code-grader stdin, `outputs/answer.md`, and `outputs/transcript.jsonl`. */ export function buildTraceFromMessages(options: BuildTraceOptions = {}): Trace { diff --git a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index 596a538e8..2b73d9d80 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -165,7 +165,7 @@ const MOCK_SETTINGS = new Set([ const REPLAY_SETTINGS = new Set([ ...COMMON_SETTINGS, 'fixtures', - 'trace_envelopes', + 'execution_traces', 'source_target', 'suite', 'eval_path', @@ -596,14 +596,14 @@ export async function validateTargetsFile(filePath: string): Promise { + it('replays execution trace target output without invoking the live target and still runs graders', async () => { const tempDir = mkdtempSync(path.join(tmpdir(), 'agentv-envelope-replay-run-')); - const envelopePath = path.join(tempDir, 'trace-envelope.json'); + const envelopePath = path.join(tempDir, 'execution-trace.json'); const output: readonly Message[] = [ { role: 'assistant', content: 'Envelope replay target answer' }, ]; @@ -498,7 +498,7 @@ console.log('spreadsheet: revenue,total\\nQ1,42');`, kind: 'replay', name: 'replay_coding_agent', config: { - source: { kind: 'trace_envelopes', path: envelopePath }, + source: { kind: 'execution_traces', path: envelopePath }, sourceTarget: 'live_coding_agent', }, }; diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index a58bdde72..2b3fb6e01 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -1170,13 +1170,13 @@ describe('resolveTargetDefinition', () => { ).toThrow(/model/i); }); - it('resolves replay targets from trace envelope sources', () => { + it('resolves replay targets from execution trace sources', () => { const evalFilePath = '/workspace/evals/sample.eval.yaml'; const resolved = resolveTargetDefinition( { - name: 'replay-from-envelope', + name: 'replay-from-execution-trace', provider: 'replay', - trace_envelopes: '../fixtures/trace-envelopes.jsonl', + execution_traces: '../fixtures/execution-traces.jsonl', source_target: 'live-agent', suite: 'suite-a', eval_path: 'evals/sample.eval.yaml', @@ -1191,8 +1191,8 @@ describe('resolveTargetDefinition', () => { throw new Error('expected replay target'); } expect(resolved.config.source).toEqual({ - kind: 'trace_envelopes', - path: '/workspace/fixtures/trace-envelopes.jsonl', + kind: 'execution_traces', + path: '/workspace/fixtures/execution-traces.jsonl', }); expect(resolved.config.fixturesPath).toBeUndefined(); expect(resolved.config.sourceTarget).toBe('live-agent'); @@ -1208,7 +1208,7 @@ describe('resolveTargetDefinition', () => { name: 'ambiguous-replay', provider: 'replay', fixtures: './target-output.jsonl', - trace_envelopes: './trace-envelopes.jsonl', + execution_traces: './execution-traces.jsonl', source_target: 'live-agent', }, {}, diff --git a/packages/core/test/evaluation/replay-fixtures.test.ts b/packages/core/test/evaluation/replay-fixtures.test.ts index 6262e98c0..753e42ac8 100644 --- a/packages/core/test/evaluation/replay-fixtures.test.ts +++ b/packages/core/test/evaluation/replay-fixtures.test.ts @@ -382,10 +382,10 @@ describe('replay fixtures', () => { } }); - it('replays target output from trace envelope sources', async () => { + it('replays target output from execution trace sources', async () => { const dir = await mkdtemp(path.join(tmpdir(), 'agentv-replay-envelopes-')); try { - const envelopePath = path.join(dir, 'trace-envelope.json'); + const envelopePath = path.join(dir, 'execution-trace.json'); const wire = fullContentEnvelopeWire( envelopeResult([{ role: 'assistant', content: 'Envelope replay answer' }]), ); @@ -405,7 +405,7 @@ describe('replay fixtures', () => { expect(response.costUsd).toBe(0.0042); expect(response.durationMs).toBe(123); expect(response.raw).toMatchObject({ - replay_trace_envelope: { + replay_execution_trace: { suite: 'suite-a', eval_path: 'evals/sample.eval.yaml', test_id: 'case-a', @@ -417,10 +417,10 @@ describe('replay fixtures', () => { } }); - it('fails clearly when a trace envelope lacks replayable assistant output', async () => { + it('fails clearly when an execution trace lacks replayable assistant output', async () => { const dir = await mkdtemp(path.join(tmpdir(), 'agentv-replay-envelopes-')); try { - const envelopePath = path.join(dir, 'trace-envelope.json'); + const envelopePath = path.join(dir, 'execution-trace.json'); const wire = toTraceEnvelopeWire( buildTraceEnvelopeFromEvaluationResult( envelopeResult([{ role: 'assistant', content: 'Redacted answer' }]), @@ -446,7 +446,7 @@ describe('replay fixtures', () => { } }); - it('fails loudly for duplicate trace envelope replay identities', async () => { + it('fails loudly for duplicate execution trace replay identities', async () => { const dir = await mkdtemp(path.join(tmpdir(), 'agentv-replay-envelopes-')); try { const envelopePath = path.join(dir, 'trace-envelopes.jsonl'); @@ -470,7 +470,7 @@ describe('replay fixtures', () => { } }); - it('preserves opaque payload keys through trace envelope replay projection', async () => { + it('preserves opaque payload keys through execution trace replay projection', async () => { const dir = await mkdtemp(path.join(tmpdir(), 'agentv-replay-envelopes-')); try { const envelopePath = path.join(dir, 'payloads.json'); diff --git a/packages/core/test/evaluation/trace-envelope.test.ts b/packages/core/test/evaluation/trace-envelope.test.ts index b7b3faa39..b25735c9f 100644 --- a/packages/core/test/evaluation/trace-envelope.test.ts +++ b/packages/core/test/evaluation/trace-envelope.test.ts @@ -1,14 +1,22 @@ import { describe, expect, it } from 'bun:test'; +import { readFileSync } from 'node:fs'; +import path from 'node:path'; +import { ToolTrajectoryGrader } from '../../src/evaluation/graders/tool-trajectory.js'; import type { Message } from '../../src/evaluation/providers/types.js'; import { - TRACE_ENVELOPE_SCHEMA_VERSION, + EXECUTION_TRACE_SCHEMA_VERSION, + type TraceEnvelopeSpan, TraceEnvelopeWireSchema, buildTraceEnvelopeFromEvaluationResult, fromTraceEnvelopeWire, toTraceEnvelopeWire, traceEnvelopeToMessages, + traceEnvelopeToOtlpJson, + traceEnvelopeToToolTrajectoryView, + traceEnvelopeToTraceArtifact, traceEnvelopeToTraceSummary, + traceEnvelopeToTranscriptMessages, } from '../../src/evaluation/trace-envelope.js'; import { buildTraceFromMessages, computeTraceSummary } from '../../src/evaluation/trace.js'; import type { EvaluationResult } from '../../src/evaluation/types.js'; @@ -17,6 +25,15 @@ function jsonComparable(value: unknown): unknown { return JSON.parse(JSON.stringify(value)); } +function requireSpan( + spans: readonly TraceEnvelopeSpan[], + predicate: (span: TraceEnvelopeSpan) => boolean, +): TraceEnvelopeSpan { + const span = spans.find(predicate); + expect(span).toBeDefined(); + return span as TraceEnvelopeSpan; +} + function makeResult(overrides: Partial = {}): EvaluationResult { const input: readonly Message[] = [{ role: 'user', content: 'Inspect the repository' }]; const output: readonly Message[] = [{ role: 'assistant', content: 'Done' }]; @@ -58,12 +75,12 @@ function makeResult(overrides: Partial = {}): EvaluationResult }; } -describe('trace envelope v1', () => { +describe('execution trace artifact v1', () => { it('validates and round-trips the explicit snake_case wire shape', () => { const envelope = buildTraceEnvelopeFromEvaluationResult(makeResult(), { evalPath: 'examples/showcase/trace-evaluation/evals/coding-agent-replay.eval.yaml', runId: 'run-123', - experiment: 'trace-envelope-v1', + experiment: 'execution-trace-v1', now: () => new Date('2026-06-15T12:00:05.000Z'), source: { metadata: { @@ -72,15 +89,16 @@ describe('trace envelope v1', () => { }, }, artifacts: { - envelope_path: 'outputs/trace-envelope.json', + execution_trace_path: 'outputs/execution-trace.json', transcript_path: 'outputs/transcript.jsonl', }, }); const wire = toTraceEnvelopeWire(envelope); - expect(wire.schema_version).toBe(TRACE_ENVELOPE_SCHEMA_VERSION); - expect(wire.envelope_id).toMatch(/^trace-env-/); + expect(wire.schema_version).toBe(EXECUTION_TRACE_SCHEMA_VERSION); + expect(wire.artifact_id).toMatch(/^execution-trace-/); + expect(wire).not.toHaveProperty('envelope_id'); expect(wire.created_at).toBe('2026-06-15T12:00:05.000Z'); expect(wire.eval.eval_path).toContain('coding-agent-replay.eval.yaml'); expect(wire.trace.format).toBe('otlp_openinference_spans'); @@ -264,8 +282,14 @@ describe('trace envelope v1', () => { const envelope = buildTraceEnvelopeFromEvaluationResult(result); const root = envelope.trace.spans.find((span) => span.spanId === envelope.trace.rootSpanId); const summary = traceEnvelopeToTraceSummary(envelope); + const otlp = traceEnvelopeToOtlpJson(envelope); + const otlpRoot = otlp.resourceSpans[0]?.scopeSpans[0]?.spans.find( + (span) => span.spanId === envelope.trace.rootSpanId, + ); expect(root?.status).toEqual({ code: 'ERROR', message: 'Provider timed out' }); + expect(otlpRoot?.kind).toBe(0); + expect(otlpRoot?.status).toEqual({ code: 2, message: 'Provider timed out' }); expect(root?.attributes['gen_ai.usage.input_tokens']).toBe(100); expect(root?.attributes['agentv.trace.cost_usd']).toBe(0.012); expect(summary.tokenUsage).toEqual({ input: 100, output: 20, cached: 5, reasoning: 3 }); @@ -284,7 +308,7 @@ describe('trace envelope v1', () => { }); }); - it('projects TraceSummary and Message tool calls from envelope spans', () => { + it('projects Message[], TraceSummary, trajectory, tool grader input, and OTLP JSON from spans', () => { const output: readonly Message[] = [ { role: 'assistant', @@ -303,10 +327,247 @@ describe('trace envelope v1', () => { }), }); const envelope = buildTraceEnvelopeFromEvaluationResult(result); + const messages = traceEnvelopeToMessages(envelope); + const summary = traceEnvelopeToTraceSummary(envelope); + const trajectory = traceEnvelopeToTraceArtifact(envelope); + const compact = traceEnvelopeToToolTrajectoryView(envelope); + const otlp = traceEnvelopeToOtlpJson(envelope); + const grader = new ToolTrajectoryGrader({ + config: { + name: 'expected-tool-sequence', + type: 'tool-trajectory', + mode: 'exact', + expected: [{ tool: 'Read' }, { tool: 'Edit' }], + }, + }); + + expect(summary.trace).toEqual(computeTraceSummary(output).trace); + expect(messages[0]?.toolCalls?.map((toolCall) => toolCall.tool)).toEqual(['Read', 'Edit']); + expect(trajectory.events.map((event) => event.type)).toEqual([ + 'model_turn', + 'tool_call', + 'tool_call', + ]); + expect(compact.tools.map((tool) => [tool.position, tool.tool, tool.toolCallId])).toEqual([ + [0, 'Read', 'call-read'], + [1, 'Edit', 'call-edit'], + ]); + expect(grader.evaluate({ output: messages, trace: summary.trace } as never)).toMatchObject({ + score: 1, + verdict: 'pass', + }); + expect(otlp.resourceSpans[0]?.scopeSpans[0]?.spans.map((span) => span.name)).toEqual([ + 'invoke_agent replay_coding_agent', + 'chat replay_coding_agent', + 'execute_tool Read', + 'execute_tool Edit', + ]); + const otlpRead = otlp.resourceSpans[0]?.scopeSpans[0]?.spans.find( + (span) => span.name === 'execute_tool Read', + ); + expect(otlpRead?.kind).toBe(0); + expect(otlpRead?.status).toEqual({ code: 1 }); + expect(otlpRead?.parentSpanId).toBe( + envelope.trace.spans.find((span) => span.name === 'chat replay_coding_agent')?.spanId, + ); + }); + + it('projects transcript rows from canonical transcript events without changing replay messages', () => { + const input: readonly Message[] = [ + { role: 'system', content: 'Stay terse.' }, + { role: 'user', content: 'Read the config.' }, + ]; + const output: readonly Message[] = [ + { + role: 'assistant', + content: 'Read config.', + toolCalls: [ + { + tool: 'Read', + input: { file_path: 'src/config.ts', providerCamelKey: 'kept' }, + output: { line_count: 12, providerCamelKey: 'kept' }, + }, + ], + }, + ]; + const envelope = buildTraceEnvelopeFromEvaluationResult( + makeResult({ + output: 'Read config.', + trace: buildTraceFromMessages({ + input, + output, + finalOutput: 'Read config.', + target: 'codex', + testId: 'transcript-projection-case', + }), + }), + { capture: { content: 'full', redactionLevel: 'none', redactedFields: [] } }, + ); + + expect(traceEnvelopeToMessages(envelope).map((message) => message.role)).toEqual(['assistant']); + expect(traceEnvelopeToTranscriptMessages(envelope).map((message) => message.role)).toEqual([ + 'system', + 'user', + 'assistant', + ]); + expect(traceEnvelopeToTranscriptMessages(envelope)[2]?.toolCalls?.[0]).toMatchObject({ + tool: 'Read', + input: { file_path: 'src/config.ts', providerCamelKey: 'kept' }, + output: { line_count: 12, providerCamelKey: 'kept' }, + }); + + const root = envelope.trace.spans.find((span) => span.spanId === envelope.trace.rootSpanId); + const transcriptEvent = root?.events?.find( + (event) => event.name === 'agentv.transcript.message', + ); + expect(transcriptEvent?.attributes?.['agentv.transcript.message']).toMatchObject({ + role: 'system', + content: 'Stay terse.', + }); + }); + + it('orders span projections by numeric nanosecond timestamps', () => { + const output: readonly Message[] = [ + { + role: 'assistant', + content: 'early', + toolCalls: [{ tool: 'EarlyTool', id: 'call-early' }], + }, + { + role: 'assistant', + content: 'late', + toolCalls: [{ tool: 'LateTool', id: 'call-late' }], + }, + ]; + const envelope = buildTraceEnvelopeFromEvaluationResult( + makeResult({ + trace: buildTraceFromMessages({ + input: [{ role: 'user', content: 'Sort non-padded nanoseconds' }], + output, + finalOutput: 'late', + target: 'codex', + testId: 'timestamp-ordering-case', + }), + }), + { capture: { content: 'full', redactionLevel: 'none', redactedFields: [] } }, + ); + const root = requireSpan( + envelope.trace.spans, + (span) => span.spanId === envelope.trace.rootSpanId, + ); + const earlyChat = requireSpan( + envelope.trace.spans, + (span) => span.attributes['gen_ai.output.messages'] === 'early', + ); + const earlyTool = requireSpan( + envelope.trace.spans, + (span) => span.name === 'execute_tool EarlyTool', + ); + const lateChat = requireSpan( + envelope.trace.spans, + (span) => span.attributes['gen_ai.output.messages'] === 'late', + ); + const lateTool = requireSpan( + envelope.trace.spans, + (span) => span.name === 'execute_tool LateTool', + ); + const retime = ( + span: TraceEnvelopeSpan, + startTimeUnixNano: string, + endTimeUnixNano: string, + ) => ({ + ...span, + startTimeUnixNano, + endTimeUnixNano, + }); + const shuffled = { + ...envelope, + trace: { + ...envelope.trace, + spans: [ + retime(root, '0', '2000000000'), + retime(lateTool, '1010000000', '1020000000'), + retime(lateChat, '1000000000', '1100000000'), + retime(earlyTool, '910000000', '920000000'), + retime(earlyChat, '900000000', '990000000'), + ], + }, + }; - expect(traceEnvelopeToTraceSummary(envelope).trace).toEqual(computeTraceSummary(output).trace); + expect(traceEnvelopeToMessages(shuffled).map((message) => message.content)).toEqual([ + 'early', + 'late', + ]); + expect(traceEnvelopeToToolTrajectoryView(shuffled).tools.map((tool) => tool.tool)).toEqual([ + 'EarlyTool', + 'LateTool', + ]); + expect( + traceEnvelopeToTraceArtifact(shuffled).events.map((event) => event.sourceRef?.spanId), + ).toEqual([earlyChat.spanId, earlyTool.spanId, lateChat.spanId, lateTool.spanId]); expect( - traceEnvelopeToMessages(envelope)[0]?.toolCalls?.map((toolCall) => toolCall.tool), - ).toEqual(['Read', 'Edit']); + traceEnvelopeToOtlpJson(shuffled).resourceSpans[0]?.scopeSpans[0]?.spans.map( + (span) => span.spanId, + ), + ).toEqual([root.spanId, earlyChat.spanId, earlyTool.spanId, lateChat.spanId, lateTool.spanId]); + }); + + it('keeps nested subagent parent-child spans visible in golden projections', () => { + const fixturePath = path.join( + import.meta.dir, + 'fixtures', + 'execution-trace', + 'nested-subagent.json', + ); + const envelope = fromTraceEnvelopeWire(JSON.parse(readFileSync(fixturePath, 'utf8'))); + const wire = toTraceEnvelopeWire(envelope); + const messages = traceEnvelopeToMessages(envelope); + const compact = traceEnvelopeToToolTrajectoryView(envelope); + const otlp = traceEnvelopeToOtlpJson(envelope); + + expect(wire.schema_version).toBe(EXECUTION_TRACE_SCHEMA_VERSION); + expect(wire.source.metadata).toMatchObject({ + source_provider: 'codex', + providerCamelKey: 'kept', + }); + + const runSubagent = compact.tools.find((tool) => tool.tool === 'runSubagent'); + const nestedRead = compact.tools.find((tool) => tool.tool === 'Read'); + expect(runSubagent?.parentToolCallId).toBeUndefined(); + expect(nestedRead).toMatchObject({ + toolCallId: 'call-nested-read', + parentToolCallId: 'call-subagent', + input: { + file_path: 'src/config.ts', + providerCamelKey: 'input kept', + }, + output: { + line_count: 12, + providerCamelKey: 'output kept', + }, + }); + expect(nestedRead?.ancestorSpanIds).toContain(runSubagent?.spanId); + + expect(messages.map((message) => message.metadata)).toEqual([ + { + span_id: 'chat000000000001', + trace_id: '11111111111111111111111111111111', + parent_span_id: 'root000000000001', + }, + { + span_id: 'chat000000000002', + trace_id: '11111111111111111111111111111111', + parent_span_id: 'agent00000000001', + parent_tool_call_id: 'call-subagent', + }, + ]); + + const otlpSpans = otlp.resourceSpans[0]?.scopeSpans[0]?.spans ?? []; + const subagent = otlpSpans.find((span) => span.spanId === 'agent00000000001'); + const nestedChat = otlpSpans.find((span) => span.spanId === 'chat000000000002'); + const nestedTool = otlpSpans.find((span) => span.spanId === 'tool000000000002'); + expect(subagent?.parentSpanId).toBe('tool000000000001'); + expect(nestedChat?.parentSpanId).toBe('agent00000000001'); + expect(nestedTool?.parentSpanId).toBe('chat000000000002'); }); }); diff --git a/packages/core/test/evaluation/validation/targets-validator.test.ts b/packages/core/test/evaluation/validation/targets-validator.test.ts index 9cdf33ff8..6156018a9 100644 --- a/packages/core/test/evaluation/validation/targets-validator.test.ts +++ b/packages/core/test/evaluation/validation/targets-validator.test.ts @@ -224,14 +224,14 @@ describe('validateTargetsFile', () => { ).toBe(true); }); - it('accepts replay targets backed by trace envelopes', async () => { - const filePath = path.join(tempDir, 'replay-trace-envelopes.yaml'); + it('accepts replay targets backed by execution traces', async () => { + const filePath = path.join(tempDir, 'replay-execution-traces.yaml'); await writeFile( filePath, `targets: - - name: replay-envelope + - name: replay-execution-trace provider: replay - trace_envelopes: ./fixtures/trace-envelopes.jsonl + execution_traces: ./fixtures/execution-traces.jsonl source_target: live-agent `, ); @@ -240,7 +240,7 @@ describe('validateTargetsFile', () => { expect(result.valid).toBe(true); expect( - result.errors.some((error) => error.message.includes("Unknown setting 'trace_envelopes'")), + result.errors.some((error) => error.message.includes("Unknown setting 'execution_traces'")), ).toBe(false); }); @@ -252,7 +252,7 @@ describe('validateTargetsFile', () => { - name: replay-ambiguous provider: replay fixtures: ./fixtures/target-output.jsonl - trace_envelopes: ./fixtures/trace-envelopes.jsonl + execution_traces: ./fixtures/execution-traces.jsonl source_target: live-agent `, ); diff --git a/packages/core/test/evaluation/workspace/repo-manager.test.ts b/packages/core/test/evaluation/workspace/repo-manager.test.ts index a89d647d9..2948557b7 100644 --- a/packages/core/test/evaluation/workspace/repo-manager.test.ts +++ b/packages/core/test/evaluation/workspace/repo-manager.test.ts @@ -654,7 +654,7 @@ describe('RepoManager', () => { process.env.PATH = `${binDir}:${process.env.PATH ?? ''}`; try { - const timeoutManager = new RepoManager(false, { progress: false, timeoutMs: 50 }); + const timeoutManager = new RepoManager(false, { progress: false, timeoutMs: 500 }); const runGitStreaming = ( timeoutManager as unknown as { runGitStreaming( @@ -671,7 +671,7 @@ describe('RepoManager', () => { await expect( runGitStreaming(['stalled'], { description: 'git clone stalled' }), ).rejects.toThrow( - /git clone stalled made no progress for 0s.*Register a matching local checkout.*git_cache\.mirrors.*network connectivity/s, + /git clone stalled made no progress for \d+s.*Register a matching local checkout.*git_cache\.mirrors.*network connectivity/s, ); } finally { process.env.PATH = savedPath;