Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 38 additions & 15 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ import {
type GraderResult,
type Message,
type TargetDefinition,
type TraceEnvelope,
type TraceSummary,
buildTraceEnvelopeFromEvaluationResult,
buildTraceFromMessages,
extractLastAssistantContent,
toTraceEnvelopeWire,
traceEnvelopeToTranscriptMessages,
traceToTranscriptJsonLines,
} from '@agentv/core';
import { RESULT_INDEX_FILENAME } from './result-layout.js';
Expand Down Expand Up @@ -736,31 +738,37 @@ function resolveEnvelopeEvalPath(
return source?.evalFileRepoPath ?? source?.evalFilePath ?? fallbackEvalFile;
}

/**
 * Reports whether an evaluation result carries anything that can be rendered
 * as a transcript: either non-empty output or at least one trace message.
 */
function resultHasExecutionTraceTranscript(result: EvaluationResult): boolean {
  // Non-empty output alone is sufficient; the trace is only consulted otherwise.
  if (result.output.length > 0) {
    return true;
  }
  return result.trace.messages.length > 0;
}

/**
 * Builds the trace envelope for a single evaluation result and writes its
 * wire form as pretty-printed JSON into `params.outputsDir`.
 *
 * NOTE(review): this span comes from a diff view, so pre-change and
 * post-change lines appear side by side (return type, `hasTranscript`
 * computation, artifact path key, and output filename each occur twice).
 * Presumably the second line of each pair is the post-change version — confirm
 * against the merged file.
 *
 * @param params.result - Evaluation result the envelope is built from.
 * @param params.outputDir - Run directory; its basename is used as the run id.
 * @param params.outputsDir - Directory the JSON sidecar is written into.
 * @param params.evalPath - Optional eval file path recorded in the envelope.
 * @param params.experiment - Optional experiment name recorded in the envelope.
 * @returns The envelope that was built (per the `Promise<TraceEnvelope>` variant).
 */
async function writeTraceEnvelopeSidecar(params: {
readonly result: EvaluationResult;
readonly outputDir: string;
readonly outputsDir: string;
readonly evalPath?: string;
readonly experiment?: string;
}): Promise<void> {
const hasTranscript = params.result.output.length > 0 || params.result.trace.messages.length > 0;
}): Promise<TraceEnvelope> {
// Same output-or-messages check, routed through the shared helper.
const hasTranscript = resultHasExecutionTraceTranscript(params.result);
const envelope = buildTraceEnvelopeFromEvaluationResult(params.result, {
evalPath: params.evalPath,
runId: path.basename(params.outputDir),
experiment: params.experiment,
source: { path: RESULT_INDEX_FILENAME },
capture: { content: 'full', redactionLevel: 'none', redactedFields: [] },
// Artifact paths are only advertised when the corresponding content exists;
// `undefined` leaves the entry unset.
artifacts: {
envelope_path: 'outputs/trace-envelope.json',
execution_trace_path: 'outputs/execution-trace.json',
answer_path: params.result.output.length > 0 ? 'outputs/answer.md' : undefined,
response_path: params.result.output.length > 0 ? 'outputs/response.md' : undefined,
transcript_path: hasTranscript ? 'outputs/transcript.jsonl' : undefined,
},
});
// Serialized with 2-space indentation and a trailing newline.
await writeFile(
path.join(params.outputsDir, 'trace-envelope.json'),
path.join(params.outputsDir, 'execution-trace.json'),
`${JSON.stringify(toTraceEnvelopeWire(envelope), null, 2)}\n`,
'utf8',
);
// Returned so callers can derive further artifacts from the same envelope.
return envelope;
}

export function buildIndexArtifactEntry(
Expand Down Expand Up @@ -829,7 +837,7 @@ export function buildResultIndexArtifact(
const artifactSubdir = buildArtifactSubdir(result);
const input = extractInput(result);
const hasAnswer = result.output.length > 0;
const hasTranscript = result.trace.messages.length > 0 || result.trace.events.length > 0;
const hasTranscript = resultHasExecutionTraceTranscript(result);

return {
timestamp: result.timestamp,
Expand Down Expand Up @@ -885,8 +893,23 @@ async function writeJsonlFile(filePath: string, records: readonly unknown[]): Pr
await writeFile(filePath, content, 'utf8');
}

async function writeTranscriptJsonl(filePath: string, result: EvaluationResult): Promise<void> {
const lines = traceToTranscriptJsonLines(result.trace, {
/**
 * Produces a copy of the result's trace whose `messages` are replaced by the
 * transcript messages derived from the given envelope.
 */
function traceProjectionForTranscript(result: EvaluationResult, envelope: TraceEnvelope) {
  const baseTrace = result.trace;
  const messages = traceEnvelopeToTranscriptMessages(envelope);
  // `messages` is listed last so it overrides the value spread from the trace.
  return { ...baseTrace, messages };
}

/**
 * Reports whether a transcript should be produced for this result: true when
 * the result has non-empty output, or when the envelope yields at least one
 * transcript message.
 */
function hasTranscriptProjection(result: EvaluationResult, envelope: TraceEnvelope): boolean {
  // Output alone is enough; the envelope is only projected when output is empty.
  if (result.output.length > 0) {
    return true;
  }
  const projectedMessages = traceEnvelopeToTranscriptMessages(envelope);
  return projectedMessages.length > 0;
}

async function writeTranscriptJsonl(
filePath: string,
result: EvaluationResult,
envelope: TraceEnvelope,
): Promise<void> {
const lines = traceToTranscriptJsonLines(traceProjectionForTranscript(result, envelope), {
testId: result.testId,
target: result.target,
});
Expand Down Expand Up @@ -1256,16 +1279,16 @@ export async function writePerTestArtifacts(
// for scored output or transcript.jsonl for the full execution record.
await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
}
if (result.output.length > 0 || result.trace.messages.length > 0) {
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
}
await writeTraceEnvelopeSidecar({
const envelope = await writeTraceEnvelopeSidecar({
result,
outputDir,
outputsDir,
evalPath: resolveEnvelopeEvalPath(result, testByTestId),
experiment: options?.experiment,
});
if (hasTranscriptProjection(result, envelope)) {
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result, envelope);
}

const taskBundle = await materializeTaskBundleForResult({
result,
Expand Down Expand Up @@ -1336,16 +1359,16 @@ export async function writeArtifactsFromResults(
// for scored output or transcript.jsonl for the full execution record.
await writeFile(path.join(outputsDir, 'response.md'), result.output, 'utf8');
}
if (result.output.length > 0 || result.trace.messages.length > 0) {
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result);
}
await writeTraceEnvelopeSidecar({
const envelope = await writeTraceEnvelopeSidecar({
result,
outputDir,
outputsDir,
evalPath: resolveEnvelopeEvalPath(result, testByTestId, options?.evalFile),
experiment: options?.experiment,
});
if (hasTranscriptProjection(result, envelope)) {
await writeTranscriptJsonl(path.join(outputsDir, 'transcript.jsonl'), result, envelope);
}

const taskBundle = await materializeTaskBundleForResult({
result,
Expand Down
69 changes: 57 additions & 12 deletions apps/cli/test/commands/eval/artifact-writer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ import {
type GraderResult,
TraceEnvelopeWireSchema,
buildTraceFromMessages,
fromTraceEnvelopeWire,
parseYamlValue,
traceEnvelopeToTranscriptMessages,
traceToTranscriptJsonLines,
} from '@agentv/core';

import {
Expand Down Expand Up @@ -803,11 +806,31 @@ describe('writeArtifactsFromResults', () => {

await writeArtifactsFromResults(results, testDir);

const transcriptLines = (await readFile(path.join(testDir, 'transcript.jsonl'), 'utf8'))
const transcriptLines = (
await readFile(path.join(testDir, 'transcript-case', 'outputs', 'transcript.jsonl'), 'utf8')
)
.trim()
.split('\n')
.map((line) => JSON.parse(line));

const envelope = TraceEnvelopeWireSchema.parse(
JSON.parse(
await readFile(
path.join(testDir, 'transcript-case', 'outputs', 'execution-trace.json'),
'utf8',
),
),
);
const projectedEnvelope = fromTraceEnvelopeWire(envelope);
const projectedTranscript = traceToTranscriptJsonLines(
{
...results[0].trace,
messages: traceEnvelopeToTranscriptMessages(projectedEnvelope),
},
{ testId: 'transcript-case', target: 'codex' },
);

expect(transcriptLines).toEqual(JSON.parse(JSON.stringify(projectedTranscript)));
expect(transcriptLines).toEqual([
{
test_id: 'transcript-case',
Expand Down Expand Up @@ -845,16 +868,8 @@ describe('writeArtifactsFromResults', () => {
},
},
]);

const envelope = TraceEnvelopeWireSchema.parse(
JSON.parse(
await readFile(
path.join(testDir, 'transcript-case', 'outputs', 'trace-envelope.json'),
'utf8',
),
),
);
expect(envelope.schema_version).toBe('agentv.trace_envelope.v1');
expect(envelope.schema_version).toBe('agentv.execution_trace.v1');
expect(envelope.artifact_id).toMatch(/^execution-trace-/);
expect(envelope.eval.test_id).toBe('transcript-case');
expect(envelope.trace.spans.map((span) => span.attributes['gen_ai.operation.name'])).toEqual([
'invoke_agent',
Expand All @@ -865,7 +880,37 @@ describe('writeArtifactsFromResults', () => {
const indexLine = JSON.parse(
(await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(),
);
expect(indexLine).not.toHaveProperty('trace_envelope_path');
expect(indexLine).not.toHaveProperty('execution_trace_path');
});

// Verifies that a result with empty output and a trace built from no messages
// produces no transcript file and no transcript links in the written artifacts.
it('omits per-test transcript links when the execution trace has no transcript rows', async () => {
const results = [
makeResult({
testId: 'no-transcript-case',
output: '',
trace: buildTraceFromMessages(),
}),
];

await writeArtifactsFromResults(results, testDir);

// Reading the transcript path is expected to reject.
const transcriptPath = path.join(testDir, 'no-transcript-case', 'outputs', 'transcript.jsonl');
await expect(readFile(transcriptPath, 'utf8')).rejects.toThrow();

// The index file content is parsed as a single JSON line after trimming.
const indexLine = JSON.parse(
(await readFile(path.join(testDir, 'index.jsonl'), 'utf8')).trim(),
);
expect(indexLine).not.toHaveProperty('transcript_path');

// The execution-trace sidecar must still parse against the wire schema, and
// its artifacts must not advertise a transcript.
const envelope = TraceEnvelopeWireSchema.parse(
JSON.parse(
await readFile(
path.join(testDir, 'no-transcript-case', 'outputs', 'execution-trace.json'),
'utf8',
),
),
);
expect(envelope.artifacts).not.toHaveProperty('transcript_path');
});

it('sanitizes test IDs for directory names', async () => {
Expand Down
3 changes: 2 additions & 1 deletion apps/cli/test/commands/trace/trace.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ const OTLP_TRACE = JSON.stringify({
startTimeUnixNano: '2500000000',
endTimeUnixNano: '3000000000',
attributes: [{ key: 'gen_ai.tool.name', value: { stringValue: 'read_file' } }],
status: { code: 1 },
status: { code: 2, message: 'tool failed' },
},
],
},
Expand Down Expand Up @@ -257,6 +257,7 @@ describe('trace utils', () => {
expect(results[0].trace?.event_count).toBe(1);
expect(results[0].trace?.llm_call_count).toBe(1);
expect(results[0].trace?.tool_calls).toEqual({ read_file: 1 });
expect(results[0].trace?.error_count).toBe(1);
});
});

Expand Down
Loading
Loading