From 959e93583c746f8644bc27c4e8bb171ed6dd0921 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 17 Jun 2026 11:29:31 +0200 Subject: [PATCH 1/3] fix(results): handle replay row schema aliases --- apps/cli/src/commands/eval/artifact-writer.ts | 23 ++- apps/cli/src/commands/inspect/filter.ts | 5 +- apps/cli/src/commands/inspect/search.ts | 8 +- apps/cli/src/commands/inspect/utils.ts | 10 +- apps/cli/src/commands/results/manifest.ts | 23 ++- .../src/commands/results/result-row-schema.ts | 163 ++++++++++++++++++ .../commands/eval/artifact-writer.test.ts | 29 ++++ apps/cli/test/commands/inspect/filter.test.ts | 26 +++ apps/cli/test/commands/results/shared.test.ts | 27 +++ .../fixtures/results/camel-replay/index.jsonl | 1 + 10 files changed, 296 insertions(+), 19 deletions(-) create mode 100644 apps/cli/src/commands/results/result-row-schema.ts create mode 100644 apps/cli/test/fixtures/results/camel-replay/index.jsonl diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index f322a2cd3..e3d2137d4 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -15,6 +15,7 @@ import { toTraceEnvelopeWire, traceToTranscriptJsonLines, } from '@agentv/core'; +import { normalizeResultRow } from '../results/result-row-schema.js'; import { RESULT_INDEX_FILENAME } from './result-layout.js'; import { type MaterializedTaskBundlePaths, @@ -1091,21 +1092,27 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin export function parseJsonlResults(content: string): EvaluationResult[] { const results: EvaluationResult[] = []; const lines = content.split('\n'); - for (const line of lines) { + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; const trimmed = line.trim(); if (trimmed.length === 0) { continue; } + let parsed: unknown; try { - const parsed = JSON.parse(trimmed); - // JSONL files from AgentV use snake_case; convert back to camelCase - const camelCased = toCamelCaseDeep(parsed); - const normalized = normalizeParsedResult(camelCased); - if (normalized) { - results.push(normalized); - } + parsed = JSON.parse(trimmed); } catch { // Skip malformed lines + continue; + } + + // JSONL files from AgentV use snake_case; convert supported historical aliases + // to canonical snake_case before mapping into TypeScript internals. + const canonicalRow = normalizeResultRow(parsed, { lineNumber: i + 1 }); + const camelCased = toCamelCaseDeep(canonicalRow); + const normalized = normalizeParsedResult(camelCased); + if (normalized) { + results.push(normalized); } } return results; diff --git a/apps/cli/src/commands/inspect/filter.ts b/apps/cli/src/commands/inspect/filter.ts index 89f81c77d..011723266 100644 --- a/apps/cli/src/commands/inspect/filter.ts +++ b/apps/cli/src/commands/inspect/filter.ts @@ -14,6 +14,7 @@ import { existsSync, readFileSync, readdirSync, statSync } from 'node:fs'; import path from 'node:path'; import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts'; +import { normalizeResultRow } from '../results/result-row-schema.js'; import { c, formatScore, padLeft, padRight } from './utils.js'; /** A lightweight result record with fields needed for filtering. */ @@ -105,13 +106,15 @@ export function parseFilterableRecords(filePath: string): FilterableRecord[] { const lines = content.split('\n').filter((line) => line.trim()); const records: FilterableRecord[] = []; - for (const line of lines) { + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; let raw: Record; try { raw = JSON.parse(line) as Record; } catch { continue; } + raw = normalizeResultRow(raw, { lineNumber: i + 1, sourceLabel: filePath }); // Determine experiment from record or from directory path let experiment = typeof raw.experiment === 'string' ? raw.experiment : undefined; diff --git a/apps/cli/src/commands/inspect/search.ts b/apps/cli/src/commands/inspect/search.ts index 7c551742c..480bc6025 100644 --- a/apps/cli/src/commands/inspect/search.ts +++ b/apps/cli/src/commands/inspect/search.ts @@ -110,9 +110,11 @@ export function searchJsonlFile( const testId = typeof record.test_id === 'string' ? record.test_id - : typeof record.source === 'object' && record.source !== null - ? ((record.source as Record).session_id as string | undefined) - : undefined; + : typeof record.testId === 'string' + ? record.testId + : typeof record.source === 'object' && record.source !== null + ? ((record.source as Record).session_id as string | undefined) + : undefined; // Apply metadata filters before regex search if (targetFilter && target !== targetFilter) continue; diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index 25399ed20..ae2e34ec9 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -9,6 +9,7 @@ import { resolveWorkspaceOrFilePath, } from '../eval/result-layout.js'; import { loadManifestResults } from '../results/manifest.js'; +import { normalizeResultRow } from '../results/result-row-schema.js'; // ANSI color codes (no dependency needed) const colors = { @@ -133,11 +134,10 @@ function loadJsonlRecords(filePath: string): RawResult[] { .filter((line) => line.trim()); return lines.map((line, i) => { - const record = JSON.parse(line) as RawResult; - if (typeof record.score !== 'number') { - throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`); - } - return record; + return normalizeResultRow(JSON.parse(line), { + lineNumber: i + 1, + sourceLabel: filePath, + }) as unknown as RawResult; }); } diff --git a/apps/cli/src/commands/results/manifest.ts b/apps/cli/src/commands/results/manifest.ts index 99dd71993..65044552e 100644 --- a/apps/cli/src/commands/results/manifest.ts +++ b/apps/cli/src/commands/results/manifest.ts @@ -3,8 +3,10 @@ import path from 'node:path'; import { type EvaluationResult, + type TraceSummary, type TranscriptJsonLine, buildTraceFromMessages, + toCamelCaseDeep, traceFromTranscriptJsonLines, } from '@agentv/core'; @@ -14,6 +16,7 @@ import { isDirectoryPath, resolveRunManifestPath, } from '../eval/result-layout.js'; +import { normalizeResultRow } from './result-row-schema.js'; export interface ResultManifestRecord { readonly timestamp?: string; @@ -33,6 +36,7 @@ export interface ResultManifestRecord { readonly output?: number; readonly reasoning?: number; }; + readonly trace?: Record; readonly grading_path?: string; readonly timing_path?: string; readonly input_path?: string; @@ -57,6 +61,20 @@ function parseJsonlLines(content: string): T[] { .map((line) => JSON.parse(line) as T); } +function parseResultRows(content: string, sourceLabel?: string): ResultManifestRecord[] { + return content + .split(/\r?\n/) + .map((line, index) => ({ line: line.trim(), lineNumber: index + 1 })) + .filter(({ line }) => line.length > 0) + .map( + ({ line, lineNumber }) => + normalizeResultRow(JSON.parse(line), { + lineNumber, + sourceLabel, + }) as unknown as ResultManifestRecord, + ); +} + function parseMarkdownMessages(content: string): { role: string; content: string }[] { const trimmed = content.trim(); if (!trimmed.startsWith('@[')) { @@ -138,6 +156,7 @@ function hydrateTrace(baseDir: string, record: ResultManifestRecord): Evaluation return buildTraceFromMessages({ input: hydrateInput(baseDir, record), output: output ? [{ role: 'assistant', content: output }] : [], + summary: record.trace ? (toCamelCaseDeep(record.trace) as TraceSummary) : undefined, finalOutput: output, target: record.target, testId: record.test_id, @@ -205,7 +224,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E } export function parseResultManifest(content: string): ResultManifestRecord[] { - return parseJsonlLines(content); + return parseResultRows(content); } export function resolveResultSourcePath(source: string, cwd?: string): string { @@ -219,7 +238,7 @@ export function resolveResultSourcePath(source: string, cwd?: string): string { export function loadManifestResults(sourceFile: string): EvaluationResult[] { const resolvedSourceFile = resolveRunManifestPath(sourceFile); const content = readFileSync(resolvedSourceFile, 'utf8'); - const records = parseResultManifest(content); + const records = parseResultRows(content, resolvedSourceFile); const baseDir = path.dirname(resolvedSourceFile); return records.map((record) => hydrateManifestRecord(baseDir, record)); } diff --git a/apps/cli/src/commands/results/result-row-schema.ts b/apps/cli/src/commands/results/result-row-schema.ts new file mode 100644 index 000000000..7ff85cdac --- /dev/null +++ b/apps/cli/src/commands/results/result-row-schema.ts @@ -0,0 +1,163 @@ +/** + * Result JSONL row schema used by CLI result readers. + * + * Canonical AgentV run manifests are `index.jsonl` files with snake_case keys + * and a numeric `score`. Historical rows produced from TypeScript + * `EvaluationResult` objects may contain a small set of camelCase aliases. + * Normalize those aliases only at this file/CLI boundary; callers should work + * with the canonical snake_case row shape or convert once into TypeScript + * internals. + * + * Eval case rows are not result rows: they usually contain `id`/`prompt` but no + * `score`. Reject them with migration guidance instead of treating them as + * failed results with an unknown test ID. + */ + +export class ResultRowSchemaError extends Error { + constructor(message: string) { + super(message); + this.name = 'ResultRowSchemaError'; + } +} + +const MIGRATION_GUIDANCE = + 'Expected an AgentV result row with a numeric score. Eval-case JSONL is input data, not a results artifact. Run `agentv eval --output ` and pass the run workspace or its index.jsonl manifest.'; + +const RESULT_ROW_ALIASES = { + answerPath: 'answer_path', + artifactDir: 'artifact_dir', + conversationId: 'conversation_id', + costUsd: 'cost_usd', + durationMs: 'duration_ms', + endTime: 'end_time', + evalPath: 'eval_path', + executionStatus: 'execution_status', + failureReasonCode: 'failure_reason_code', + failureStage: 'failure_stage', + filesPath: 'files_path', + gradersPath: 'graders_path', + gradingPath: 'grading_path', + inputPath: 'input_path', + outputPath: 'output_path', + responsePath: 'response_path', + startTime: 'start_time', + targetsPath: 'targets_path', + taskDir: 'task_dir', + testId: 'test_id', + timingPath: 'timing_path', + tokenUsage: 'token_usage', + transcriptPath: 'transcript_path', + workspacePath: 'workspace_path', +} as const; + +const TRACE_SUMMARY_ALIASES = { + costUsd: 'cost_usd', + durationMs: 'duration_ms', + errorCount: 'error_count', + eventCount: 'event_count', + llmCallCount: 'llm_call_count', + tokenUsage: 'token_usage', + toolCalls: 'tool_calls', + toolDurations: 'tool_durations', +} as const; + +const MESSAGE_ALIASES = { + durationMs: 'duration_ms', + endTime: 'end_time', + startTime: 'start_time', + tokenUsage: 'token_usage', + toolCalls: 'tool_calls', +} as const; + +const TOOL_CALL_ALIASES = { + durationMs: 'duration_ms', + endTime: 'end_time', + startTime: 'start_time', +} as const; + +type AliasMap = Readonly>; + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function normalizeKnownAliases(value: Record, aliases: AliasMap) { + const normalized = { ...value }; + for (const [camelKey, snakeKey] of Object.entries(aliases)) { + if (normalized[snakeKey] === undefined && normalized[camelKey] !== undefined) { + normalized[snakeKey] = normalized[camelKey]; + } + if (camelKey !== snakeKey) { + delete normalized[camelKey]; + } + } + return normalized; +} + +function normalizeToolCall(value: unknown): unknown { + if (!isRecord(value)) { + return value; + } + return normalizeKnownAliases(value, TOOL_CALL_ALIASES); +} + +function normalizeMessage(value: unknown): unknown { + if (!isRecord(value)) { + return value; + } + + const normalized = normalizeKnownAliases(value, MESSAGE_ALIASES); + if (Array.isArray(normalized.tool_calls)) { + normalized.tool_calls = normalized.tool_calls.map(normalizeToolCall); + } + return normalized; +} + +function normalizeTraceSummary(value: unknown): unknown { + if (!isRecord(value)) { + return value; + } + + const normalized = normalizeKnownAliases(value, TRACE_SUMMARY_ALIASES); + if (Array.isArray(normalized.messages)) { + normalized.messages = normalized.messages.map(normalizeMessage); + } + return normalized; +} + +function normalizeOutput(value: unknown): unknown { + if (!Array.isArray(value)) { + return value; + } + return value.map(normalizeMessage); +} + +function buildSchemaError(context: { + lineNumber?: number; + sourceLabel?: string; +}): ResultRowSchemaError { + const location = [ + context.sourceLabel ? ` in ${context.sourceLabel}` : '', + context.lineNumber !== undefined ? ` at line ${context.lineNumber}` : '', + ].join(''); + return new ResultRowSchemaError(`Unsupported result row${location}. ${MIGRATION_GUIDANCE}`); +} + +export function normalizeResultRow( + value: unknown, + context: { lineNumber?: number; sourceLabel?: string } = {}, +): Record { + if (!isRecord(value)) { + throw buildSchemaError(context); + } + + const normalized = normalizeKnownAliases(value, RESULT_ROW_ALIASES); + normalized.trace = normalizeTraceSummary(normalized.trace); + normalized.output = normalizeOutput(normalized.output); + + if (typeof normalized.score !== 'number' || !Number.isFinite(normalized.score)) { + throw buildSchemaError(context); + } + + return normalized; +} diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 8926f2c6f..c86f29ed8 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -562,6 +562,29 @@ describe('parseJsonlResults', () => { expect(results[1].testId).toBe('b'); }); + it('normalizes historical camelCase result row aliases', () => { + const content = `${JSON.stringify({ + testId: 'wtg-replay-fail', + target: 'codex', + score: 0.4, + executionStatus: 'quality_failure', + durationMs: 1234, + tokenUsage: { input: 10, output: 5 }, + costUsd: 0.012, + trace: { eventCount: 1, toolCalls: { rg: 1 }, errorCount: 0 }, + })}\n`; + + const results = parseJsonlResults(content); + + expect(results).toHaveLength(1); + expect(results[0].testId).toBe('wtg-replay-fail'); + expect(results[0].executionStatus).toBe('quality_failure'); + expect(results[0].durationMs).toBe(1234); + expect(results[0].tokenUsage).toEqual({ input: 10, output: 5 }); + expect(results[0].costUsd).toBe(0.012); + expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); + }); + it('handles empty content', () => { expect(parseJsonlResults('')).toHaveLength(0); }); @@ -577,6 +600,12 @@ describe('parseJsonlResults', () => { const content = `${good}\nnot json\n`; expect(parseJsonlResults(content)).toHaveLength(1); }); + + it('rejects eval-case-only rows with migration guidance', () => { + const content = `${JSON.stringify({ id: 'case-a', prompt: 'What is 2 + 2?' })}\n`; + + expect(() => parseJsonlResults(content)).toThrow(/Eval-case JSONL is input data/); + }); }); // --------------------------------------------------------------------------- diff --git a/apps/cli/test/commands/inspect/filter.test.ts b/apps/cli/test/commands/inspect/filter.test.ts index 7bbb56c03..3a06e2d42 100644 --- a/apps/cli/test/commands/inspect/filter.test.ts +++ b/apps/cli/test/commands/inspect/filter.test.ts @@ -109,6 +109,25 @@ describe('inspect filter', () => { expect(records[0].tool_names).toContain('read_file'); }); + it('normalizes historical camelCase trace tool summaries', () => { + const record = JSON.stringify({ + testId: 'wtg-replay-fail', + target: 'codex', + score: 0.4, + executionStatus: 'quality_failure', + trace: { toolCalls: { rg: 1 } }, + }); + const filePath = path.join(tempDir, 'index.jsonl'); + writeFileSync(filePath, `${record}\n`); + + const records = parseFilterableRecords(filePath); + + expect(records).toHaveLength(1); + expect(records[0].test_id).toBe('wtg-replay-fail'); + expect(records[0].execution_status).toBe('quality_failure'); + expect(records[0].tool_names).toContain('rg'); + }); + it('returns empty array for unreadable files', () => { const records = parseFilterableRecords(path.join(tempDir, 'nonexistent.jsonl')); @@ -154,6 +173,13 @@ describe('inspect filter', () => { expect(records).toHaveLength(1); expect(records[0].test_id).toBe('unknown'); }); + + it('rejects eval-case-only rows with migration guidance', () => { + const filePath = path.join(tempDir, 'index.jsonl'); + writeFileSync(filePath, '{"id":"case-a","prompt":"What is 2 + 2?"}\n'); + + expect(() => parseFilterableRecords(filePath)).toThrow(/Eval-case JSONL is input data/); + }); }); describe('buildFilterPredicate', () => { diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index 2cd110fc3..ee32aa5eb 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -4,6 +4,7 @@ import { tmpdir } from 'node:os'; import path from 'node:path'; import { resolveRunManifestPath } from '../../../src/commands/eval/result-layout.js'; +import { loadManifestResults } from '../../../src/commands/results/manifest.js'; import { resolveSourceFile } from '../../../src/commands/results/shared.js'; describe('results shared source resolution', () => { @@ -60,4 +61,30 @@ describe('results shared source resolution', () => { 'Expected a run workspace directory or index.jsonl manifest', ); }); + + it('normalizes historical camelCase replay rows when loading manifests', () => { + const fixturePath = path.join( + process.cwd(), + 'apps/cli/test/fixtures/results/camel-replay/index.jsonl', + ); + + const results = loadManifestResults(fixturePath); + + expect(results).toHaveLength(1); + expect(results[0].testId).toBe('wtg-replay-fail'); + expect(results[0].executionStatus).toBe('quality_failure'); + expect(results[0].durationMs).toBe(1234); + expect(results[0].tokenUsage).toEqual({ input: 10, output: 5 }); + expect(results[0].costUsd).toBe(0.012); + expect(results[0].trace.toolCalls).toEqual({ rg: 1 }); + }); + + it('rejects eval-case-only rows with migration guidance', () => { + const runDir = path.join(tempDir, '.agentv', 'results', 'runs', '2026-03-25T10-00-00-000Z'); + mkdirSync(runDir, { recursive: true }); + const indexPath = path.join(runDir, 'index.jsonl'); + writeFileSync(indexPath, '{"id":"case-a","prompt":"What is 2 + 2?"}\n'); + + expect(() => loadManifestResults(indexPath)).toThrow(/Eval-case JSONL is input data/); + }); }); diff --git a/apps/cli/test/fixtures/results/camel-replay/index.jsonl b/apps/cli/test/fixtures/results/camel-replay/index.jsonl new file mode 100644 index 000000000..c4f57237e --- /dev/null +++ b/apps/cli/test/fixtures/results/camel-replay/index.jsonl @@ -0,0 +1 @@ +{"testId":"wtg-replay-fail","target":"codex","score":0.4,"executionStatus":"quality_failure","durationMs":1234,"tokenUsage":{"input":10,"output":5},"costUsd":0.012,"trace":{"eventCount":1,"toolCalls":{"rg":1},"errorCount":0}} From c234fd045a549f4f85f4e9d07a5b70a3e13b1e6d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 17 Jun 2026 12:47:07 +0200 Subject: [PATCH 2/3] fix(inspect): preserve trace invalid score errors --- apps/cli/src/commands/inspect/utils.ts | 38 ++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/apps/cli/src/commands/inspect/utils.ts b/apps/cli/src/commands/inspect/utils.ts index ae2e34ec9..6ea549678 100644 --- a/apps/cli/src/commands/inspect/utils.ts +++ b/apps/cli/src/commands/inspect/utils.ts @@ -9,7 +9,7 @@ import { resolveWorkspaceOrFilePath, } from '../eval/result-layout.js'; import { loadManifestResults } from '../results/manifest.js'; -import { normalizeResultRow } from '../results/result-row-schema.js'; +import { ResultRowSchemaError, normalizeResultRow } from '../results/result-row-schema.js'; // ANSI color codes (no dependency needed) const colors = { @@ -126,6 +126,24 @@ function resolveTraceResultPath(filePath: string): string { return resolveWorkspaceOrFilePath(filePath); } +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function shouldUseTraceScoreError(value: unknown): boolean { + if (!isRecord(value)) { + return false; + } + + return ( + typeof value.test_id === 'string' || + typeof value.testId === 'string' || + Object.hasOwn(value, 'score') || + Object.hasOwn(value, 'trace') || + Object.hasOwn(value, 'spans') + ); +} + function loadJsonlRecords(filePath: string): RawResult[] { const content = readFileSync(filePath, 'utf8'); const lines = content @@ -134,10 +152,20 @@ function loadJsonlRecords(filePath: string): RawResult[] { .filter((line) => line.trim()); return lines.map((line, i) => { - return normalizeResultRow(JSON.parse(line), { - lineNumber: i + 1, - sourceLabel: filePath, - }) as unknown as RawResult; + const parsed = JSON.parse(line) as unknown; + try { + return normalizeResultRow(parsed, { + lineNumber: i + 1, + sourceLabel: filePath, + }) as unknown as RawResult; + } catch (error) { + if (error instanceof ResultRowSchemaError && shouldUseTraceScoreError(parsed)) { + throw new Error( + `Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`, + ); + } + throw error; + } }); } From 239ab033bb2d8ad1c9594f048a695247e2147ffc Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Wed, 17 Jun 2026 12:55:39 +0200 Subject: [PATCH 3/3] fix(results): preserve result-score validation errors --- .../src/commands/results/result-row-schema.ts | 34 +++++++++++++++++-- apps/cli/test/commands/results/shared.test.ts | 8 ++--- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/commands/results/result-row-schema.ts b/apps/cli/src/commands/results/result-row-schema.ts index 7ff85cdac..cb3d40356 100644 --- a/apps/cli/src/commands/results/result-row-schema.ts +++ b/apps/cli/src/commands/results/result-row-schema.ts @@ -143,6 +143,29 @@ function buildSchemaError(context: { return new ResultRowSchemaError(`Unsupported result row${location}. ${MIGRATION_GUIDANCE}`); } +function buildInvalidScoreError(context: { + lineNumber?: number; + sourceLabel?: string; +}): ResultRowSchemaError { + const location = [ + context.sourceLabel ? ` in ${context.sourceLabel}` : '', + context.lineNumber !== undefined ? ` at line ${context.lineNumber}` : '', + ].join(''); + return new ResultRowSchemaError(`Missing or invalid score in result row${location}.`); +} + +function looksLikeResultRow(value: Record): boolean { + return ( + typeof value.test_id === 'string' || + Object.hasOwn(value, 'score') || + Object.hasOwn(value, 'trace') || + Object.hasOwn(value, 'spans') || + Object.hasOwn(value, 'target') || + Object.hasOwn(value, 'grading_path') || + Object.hasOwn(value, 'timing_path') + ); +} + export function normalizeResultRow( value: unknown, context: { lineNumber?: number; sourceLabel?: string } = {}, @@ -152,10 +175,17 @@ export function normalizeResultRow( } const normalized = normalizeKnownAliases(value, RESULT_ROW_ALIASES); - normalized.trace = normalizeTraceSummary(normalized.trace); - normalized.output = normalizeOutput(normalized.output); + if (normalized.trace !== undefined) { + normalized.trace = normalizeTraceSummary(normalized.trace); + } + if (normalized.output !== undefined) { + normalized.output = normalizeOutput(normalized.output); + } if (typeof normalized.score !== 'number' || !Number.isFinite(normalized.score)) { + if (looksLikeResultRow(normalized)) { + throw buildInvalidScoreError(context); + } throw buildSchemaError(context); } diff --git a/apps/cli/test/commands/results/shared.test.ts b/apps/cli/test/commands/results/shared.test.ts index ee32aa5eb..12f64f61e 100644 --- a/apps/cli/test/commands/results/shared.test.ts +++ b/apps/cli/test/commands/results/shared.test.ts @@ -2,11 +2,14 @@ import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; +import { fileURLToPath } from 'node:url'; import { resolveRunManifestPath } from '../../../src/commands/eval/result-layout.js'; import { loadManifestResults } from '../../../src/commands/results/manifest.js'; import { resolveSourceFile } from '../../../src/commands/results/shared.js'; +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + describe('results shared source resolution', () => { let tempDir: string; @@ -63,10 +66,7 @@ describe('results shared source resolution', () => { }); it('normalizes historical camelCase replay rows when loading manifests', () => { - const fixturePath = path.join( - process.cwd(), - 'apps/cli/test/fixtures/results/camel-replay/index.jsonl', - ); + const fixturePath = path.join(__dirname, '../../fixtures/results/camel-replay/index.jsonl'); const results = loadManifestResults(fixturePath);