Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions apps/cli/src/commands/eval/artifact-writer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
toTraceEnvelopeWire,
traceToTranscriptJsonLines,
} from '@agentv/core';
import { normalizeResultRow } from '../results/result-row-schema.js';
import { RESULT_INDEX_FILENAME } from './result-layout.js';
import {
type MaterializedTaskBundlePaths,
Expand Down Expand Up @@ -1091,21 +1092,27 @@ function normalizeParsedResult(value: unknown): ParsedEvaluationResult | undefin
export function parseJsonlResults(content: string): EvaluationResult[] {
const results: EvaluationResult[] = [];
const lines = content.split('\n');
for (const line of lines) {
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const trimmed = line.trim();
if (trimmed.length === 0) {
continue;
}
let parsed: unknown;
try {
const parsed = JSON.parse(trimmed);
// JSONL files from AgentV use snake_case; convert back to camelCase
const camelCased = toCamelCaseDeep(parsed);
const normalized = normalizeParsedResult(camelCased);
if (normalized) {
results.push(normalized);
}
parsed = JSON.parse(trimmed);
} catch {
// Skip malformed lines
continue;
}

// JSONL files from AgentV use snake_case; convert supported historical aliases
// to canonical snake_case before mapping into TypeScript internals.
const canonicalRow = normalizeResultRow(parsed, { lineNumber: i + 1 });
const camelCased = toCamelCaseDeep(canonicalRow);
const normalized = normalizeParsedResult(camelCased);
if (normalized) {
results.push(normalized);
}
}
return results;
Expand Down
5 changes: 4 additions & 1 deletion apps/cli/src/commands/inspect/filter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import { existsSync, readFileSync, readdirSync, statSync } from 'node:fs';
import path from 'node:path';
import { command, number, oneOf, option, optional, positional, string } from 'cmd-ts';
import { normalizeResultRow } from '../results/result-row-schema.js';
import { c, formatScore, padLeft, padRight } from './utils.js';

/** A lightweight result record with fields needed for filtering. */
Expand Down Expand Up @@ -105,13 +106,15 @@ export function parseFilterableRecords(filePath: string): FilterableRecord[] {
const lines = content.split('\n').filter((line) => line.trim());
const records: FilterableRecord[] = [];

for (const line of lines) {
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
let raw: Record<string, unknown>;
try {
raw = JSON.parse(line) as Record<string, unknown>;
} catch {
continue;
}
raw = normalizeResultRow(raw, { lineNumber: i + 1, sourceLabel: filePath });

// Determine experiment from record or from directory path
let experiment = typeof raw.experiment === 'string' ? raw.experiment : undefined;
Expand Down
8 changes: 5 additions & 3 deletions apps/cli/src/commands/inspect/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,11 @@ export function searchJsonlFile(
const testId =
typeof record.test_id === 'string'
? record.test_id
: typeof record.source === 'object' && record.source !== null
? ((record.source as Record<string, unknown>).session_id as string | undefined)
: undefined;
: typeof record.testId === 'string'
? record.testId
: typeof record.source === 'object' && record.source !== null
? ((record.source as Record<string, unknown>).session_id as string | undefined)
: undefined;

// Apply metadata filters before regex search
if (targetFilter && target !== targetFilter) continue;
Expand Down
36 changes: 32 additions & 4 deletions apps/cli/src/commands/inspect/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
resolveWorkspaceOrFilePath,
} from '../eval/result-layout.js';
import { loadManifestResults } from '../results/manifest.js';
import { ResultRowSchemaError, normalizeResultRow } from '../results/result-row-schema.js';

// ANSI color codes (no dependency needed)
const colors = {
Expand Down Expand Up @@ -125,6 +126,24 @@ function resolveTraceResultPath(filePath: string): string {
return resolveWorkspaceOrFilePath(filePath);
}

/**
 * Type guard for object-like values that can be indexed by string keys.
 *
 * @param value - Any value.
 * @returns `true` when `value` is a non-null object that is not an array;
 *   `false` for primitives, `null`, `undefined`, functions and arrays.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  // Rule out the two "object-typed" cases that are not key/value records.
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
}

/**
 * Checks whether a parsed value carries any of the fields this module treats
 * as marking a scored/trace result row.
 *
 * @param value - A parsed row of unknown shape.
 * @returns `true` when `value` is a record (non-null, non-array object) and
 *   either has a string `test_id` or `testId`, or has an own property named
 *   `score`, `trace` or `spans`; `false` otherwise.
 */
function shouldUseTraceScoreError(value: unknown): boolean {
  if (isRecord(value)) {
    // Both the snake_case and camelCase spellings of the test id are accepted.
    const hasTestId = typeof value.test_id === 'string' || typeof value.testId === 'string';
    if (hasTestId) {
      return true;
    }

    // Presence alone is enough for these keys; their values are not inspected.
    const markerKeys = ['score', 'trace', 'spans'];
    return markerKeys.some((key) => Object.hasOwn(value, key));
  }

  return false;
}

function loadJsonlRecords(filePath: string): RawResult[] {
const content = readFileSync(filePath, 'utf8');
const lines = content
Expand All @@ -133,11 +152,20 @@ function loadJsonlRecords(filePath: string): RawResult[] {
.filter((line) => line.trim());

return lines.map((line, i) => {
const record = JSON.parse(line) as RawResult;
if (typeof record.score !== 'number') {
throw new Error(`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`);
const parsed = JSON.parse(line) as unknown;
try {
return normalizeResultRow(parsed, {
lineNumber: i + 1,
sourceLabel: filePath,
}) as unknown as RawResult;
} catch (error) {
if (error instanceof ResultRowSchemaError && shouldUseTraceScoreError(parsed)) {
throw new Error(
`Missing or invalid score in result at line ${i + 1}: ${line.slice(0, 100)}`,
);
}
throw error;
}
return record;
});
}

Expand Down
23 changes: 21 additions & 2 deletions apps/cli/src/commands/results/manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ import path from 'node:path';

import {
type EvaluationResult,
type TraceSummary,
type TranscriptJsonLine,
buildTraceFromMessages,
toCamelCaseDeep,
traceFromTranscriptJsonLines,
} from '@agentv/core';

Expand All @@ -14,6 +16,7 @@ import {
isDirectoryPath,
resolveRunManifestPath,
} from '../eval/result-layout.js';
import { normalizeResultRow } from './result-row-schema.js';

export interface ResultManifestRecord {
readonly timestamp?: string;
Expand All @@ -33,6 +36,7 @@ export interface ResultManifestRecord {
readonly output?: number;
readonly reasoning?: number;
};
readonly trace?: Record<string, unknown>;
readonly grading_path?: string;
readonly timing_path?: string;
readonly input_path?: string;
Expand All @@ -57,6 +61,20 @@ function parseJsonlLines<T>(content: string): T[] {
.map((line) => JSON.parse(line) as T);
}

/**
 * Parses JSONL content into manifest records, passing every parsed row
 * through `normalizeResultRow`.
 *
 * Lines are split on LF or CRLF and trimmed; lines that are empty after
 * trimming are skipped. The `lineNumber` handed to `normalizeResultRow` is the
 * 1-based position of the line in the original content, so skipped blank
 * lines still count towards it.
 *
 * @param content - Raw JSONL text.
 * @param sourceLabel - Optional label forwarded unchanged to
 *   `normalizeResultRow` (presumably used in its diagnostics; confirm there).
 * @returns One record per non-blank line, in file order.
 * @throws Whatever `JSON.parse` or `normalizeResultRow` throws for a row;
 *   nothing is caught here, so the first failing line aborts parsing.
 */
function parseResultRows(content: string, sourceLabel?: string): ResultManifestRecord[] {
  const records: ResultManifestRecord[] = [];

  for (const [index, rawLine] of content.split(/\r?\n/).entries()) {
    const text = rawLine.trim();
    if (text.length === 0) {
      continue;
    }

    const normalizedRow = normalizeResultRow(JSON.parse(text), {
      lineNumber: index + 1,
      sourceLabel,
    });
    records.push(normalizedRow as unknown as ResultManifestRecord);
  }

  return records;
}

function parseMarkdownMessages(content: string): { role: string; content: string }[] {
const trimmed = content.trim();
if (!trimmed.startsWith('@[')) {
Expand Down Expand Up @@ -138,6 +156,7 @@ function hydrateTrace(baseDir: string, record: ResultManifestRecord): Evaluation
return buildTraceFromMessages({
input: hydrateInput(baseDir, record),
output: output ? [{ role: 'assistant', content: output }] : [],
summary: record.trace ? (toCamelCaseDeep(record.trace) as TraceSummary) : undefined,
finalOutput: output,
target: record.target,
testId: record.test_id,
Expand Down Expand Up @@ -205,7 +224,7 @@ function hydrateManifestRecord(baseDir: string, record: ResultManifestRecord): E
}

/**
 * Parses result-manifest JSONL text into manifest records.
 *
 * Delegates to `parseResultRows` without a source label, so each non-blank
 * line is JSON-parsed and passed through `normalizeResultRow`.
 *
 * @param content - Raw JSONL text of a result manifest.
 * @returns One record per non-blank line, in file order.
 * @throws Whatever `parseResultRows` throws for a malformed or rejected row.
 */
export function parseResultManifest(content: string): ResultManifestRecord[] {
  return parseResultRows(content);
}

export function resolveResultSourcePath(source: string, cwd?: string): string {
Expand All @@ -219,7 +238,7 @@ export function resolveResultSourcePath(source: string, cwd?: string): string {
/**
 * Loads a run manifest from disk and hydrates every row into an
 * `EvaluationResult`.
 *
 * @param sourceFile - Path handed to `resolveRunManifestPath` to locate the
 *   manifest file.
 * @returns One hydrated result per manifest row, in file order.
 * @throws If the file cannot be read, or if `parseResultRows` rejects a row.
 */
export function loadManifestResults(sourceFile: string): EvaluationResult[] {
  const resolvedSourceFile = resolveRunManifestPath(sourceFile);
  const content = readFileSync(resolvedSourceFile, 'utf8');
  // The resolved path is forwarded as the source label for row parsing.
  const records = parseResultRows(content, resolvedSourceFile);
  // Hydration is given the manifest's own directory as its base directory.
  const baseDir = path.dirname(resolvedSourceFile);
  return records.map((record) => hydrateManifestRecord(baseDir, record));
}
Expand Down
Loading
Loading