Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -291,13 +291,14 @@ uv run python post_codegen.py

## 🔌 Eval Converters

We have prepared converters to make adapting to our schema as easy as possible. At the moment, we support converting local evaluation harness logs from `Inspect AI`, `HELM` and `lm-evaluation-harness` into our unified schema. Each converter produces aggregate JSON and optionally instance-level JSONL output.
We have prepared converters to make adapting to our schema as easy as possible. At the moment, we support converting local evaluation harness logs from `Inspect AI`, `HELM`, `lm-evaluation-harness`, and `lighteval` into our unified schema. Each converter produces aggregate JSON and optionally instance-level JSONL output.

| Framework | Command | Instance-Level JSONL |
|---|---|---|
| [Inspect AI](every_eval_ever/converters/inspect/) | `every_eval_ever convert inspect --log_path <path>` | Yes, if samples in log |
| [HELM](every_eval_ever/converters/helm/) | `every_eval_ever convert helm --log_path <path>` | Always |
| [lm-evaluation-harness](every_eval_ever/converters/lm_eval/) | `every_eval_ever convert lm_eval --log_path <path> --include_samples` | With `--include_samples` |
| [lighteval](every_eval_ever/converters/lighteval/) | `every_eval_ever convert lighteval --log_path <path>` | No |

For full CLI usage and required input files, see the [Eval Converters README](every_eval_ever/converters/README.md).

Expand Down
52 changes: 51 additions & 1 deletion every_eval_ever/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,33 @@ def _cmd_convert_lm_eval(args: argparse.Namespace) -> int:
return 0


def _cmd_convert_lighteval(args: argparse.Namespace) -> int:
    """Handle ``convert lighteval``: convert lighteval results and write them out.

    ``args.log_path`` may point at a single results file or at a directory;
    the matching adapter method is used for each case. Every converted log is
    written under ``args.output_dir`` with a freshly generated UUID, and the
    value returned by ``_write_log`` is printed for each one.

    Returns:
        ``0`` once all logs have been written.

    Raises:
        FileNotFoundError: if ``args.log_path`` is neither a file nor a
            directory.
    """
    # Imported lazily so the adapter is only loaded when this subcommand runs.
    from every_eval_ever.converters.lighteval.adapter import LightEvalAdapter

    adapter = LightEvalAdapter()
    metadata = _common_metadata(args)

    # Engine overrides are optional; only non-empty values are forwarded.
    for option in ('inference_engine', 'inference_engine_version'):
        supplied = getattr(args, option)
        if supplied:
            metadata[option] = supplied

    source = Path(args.log_path)
    if source.is_file():
        transform = adapter.transform_from_file
    elif source.is_dir():
        transform = adapter.transform_from_directory
    else:
        raise FileNotFoundError(f'Path is not a file or directory: {source}')
    converted = transform(source, metadata)

    destination = Path(args.output_dir)
    for entry in converted:
        written = _write_log(entry, destination, eval_uuid=str(uuid.uuid4()))
        print(written)

    print(f'Converted {len(converted)} evaluation log(s).')
    return 0


def _cmd_convert_inspect(args: argparse.Namespace) -> int:
from every_eval_ever.converters.inspect.adapter import (
InspectAIAdapter,
Expand Down Expand Up @@ -241,6 +268,7 @@ def build_parser() -> argparse.ArgumentParser:
epilog=(
'Examples:\n'
' every_eval_ever convert lm_eval --log_path results.json --output_dir data\n'
' every_eval_ever convert lighteval --log_path results_run_dir --output_dir data\n'
' every_eval_ever convert inspect --log_path inspect_log.json --output_dir data\n'
' every_eval_ever convert helm --log_path helm_run_dir --output_dir data'
),
Expand Down Expand Up @@ -298,7 +326,13 @@ def build_parser() -> argparse.ArgumentParser:
dest='source', required=True
)

for source in ['lm_eval', 'inspect', 'helm', 'alpaca_eval']:
for source in [
'lm_eval',
'lighteval',
'inspect',
'helm',
'alpaca_eval',
]:
source_parser = convert_subparsers.add_parser(
source,
help=f'Convert {source} logs',
Expand Down Expand Up @@ -385,6 +419,20 @@ def build_parser() -> argparse.ArgumentParser:
help='Inference engine version to record in model_info.inference_engine.version.',
)

if source == 'lighteval':
source_parser.add_argument(
'--inference_engine',
'--inference-engine',
default=None,
help='Override inferred inference engine (e.g. vllm, transformers).',
)
source_parser.add_argument(
'--inference_engine_version',
'--inference-engine-version',
default=None,
help='Inference engine version to record in model_info.inference_engine.version.',
)

return parser


Expand Down Expand Up @@ -415,6 +463,8 @@ def main(argv: list[str] | None = None) -> int:
if args.command == 'convert':
if args.source == 'lm_eval':
return _cmd_convert_lm_eval(args)
if args.source == 'lighteval':
return _cmd_convert_lighteval(args)
if args.source == 'inspect':
return _cmd_convert_inspect(args)
if args.source == 'helm':
Expand Down
72 changes: 70 additions & 2 deletions every_eval_ever/converters/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
## Automatic Evaluation Log Converters
A collection of scripts to convert evaluation logs from local evaluation frameworks (e.g., `Inspect AI` and `lm-eval-harness`) and public leaderboards (e.g., AlpacaEval) into the unified Every Eval Ever schema.
A collection of scripts to convert evaluation logs from local evaluation frameworks (e.g., `Inspect AI`, `lm-eval-harness`, and `lighteval`) and public leaderboards (e.g., AlpacaEval) into the unified Every Eval Ever schema.

### Installation

Expand Down Expand Up @@ -171,7 +171,6 @@ Using the `--log_path` argument, you can run a command like this:
uv run every_eval_ever convert lm_eval --log_path tests/data/lm_eval/results_2026-01-21T03-44-18.458309.json
```


The full manual for converting your own lm-eval evaluation log into the unified schema is available below:

```bash
Expand Down Expand Up @@ -217,6 +216,75 @@ options:
Version of the evaluation library
```

## lighteval

The script that converts `lighteval` evaluation logs to the unified schema is `every_eval_ever/converters/lighteval/__main__.py`.

Using the `--log_path` argument, you can provide either:
- A lighteval results JSON file (e.g., `results_2026-05-11T07-33-49.106520.json`)
- A directory containing multiple lighteval results files

The command for converting a lighteval evaluation log:

```bash
uv run every_eval_ever convert lighteval --log_path tests/data/lighteval/results_2026-05-11T07-33-49.106520.json
```

To convert all lighteval results in a directory:

```bash
uv run every_eval_ever convert lighteval --log_path path/to/lighteval/results/directory
```

The lighteval adapter automatically:
- Parses model provider and pretrained model name from the `model_name` field
- Extracts generation parameters from `model_config.generation_parameters`
- Handles tasks with seed suffixes (e.g., `aime25|0`) by extracting the base task name
- Infers inference engine from the model provider (can be overridden with `--inference_engine`)
- Resolves metric configurations including bounds and directionality (`higher_is_better`)

The full manual for converting your own lighteval evaluation log into the unified schema is available below:

```bash
usage: __main__.py [-h] --log_path LOG_PATH [--output_dir OUTPUT_DIR]
[--source_organization_name SOURCE_ORGANIZATION_NAME]
[--evaluator_relationship {first_party,third_party,collaborative,other}]
[--source_organization_url SOURCE_ORGANIZATION_URL]
[--source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL]
[--inference_engine INFERENCE_ENGINE]
[--inference_engine_version INFERENCE_ENGINE_VERSION]
[--eval_library_name EVAL_LIBRARY_NAME]
[--eval_library_version EVAL_LIBRARY_VERSION]

Convert lighteval output to every_eval_ever format

options:
-h, --help show this help message and exit
--log_path LOG_PATH Path to lighteval results JSON file or directory
containing results files
--output_dir OUTPUT_DIR
Output directory for converted files
--source_organization_name SOURCE_ORGANIZATION_NAME
Name of the organization that ran the evaluation
--evaluator_relationship {first_party,third_party,collaborative,other}
Relationship of the evaluator to the model
--source_organization_url SOURCE_ORGANIZATION_URL
URL of the source organization
--source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL
Logo of the source organization
--inference_engine INFERENCE_ENGINE
Override inferred inference engine (e.g. 'vllm',
'transformers'). Auto-detected from model provider when
possible.
--inference_engine_version INFERENCE_ENGINE_VERSION
Inference engine version (e.g. '0.6.0'). Not available
from lighteval logs, so must be provided manually.
--eval_library_name EVAL_LIBRARY_NAME
Name of the evaluation library (default: lighteval)
--eval_library_version EVAL_LIBRARY_VERSION
Version of the evaluation library
```

## AlpacaEval

The AlpacaEval converter fetches the public leaderboard CSV directly from GitHub
Expand Down
1 change: 1 addition & 0 deletions every_eval_ever/converters/common/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class SupportedLibrary(Enum):
LM_EVAL = 'lm-evaluation-harness'
INSPECT_AI = 'inspect-ai'
HELM = 'helm'
LIGHTEVAL = 'lighteval'
CUSTOM = 'custom'


Expand Down
1 change: 1 addition & 0 deletions every_eval_ever/converters/lighteval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""lighteval adapter for every_eval_ever."""
142 changes: 142 additions & 0 deletions every_eval_ever/converters/lighteval/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""CLI for converting lighteval output to every_eval_ever format."""

import argparse
import json
import sys
import uuid
from pathlib import Path

from .adapter import LightEvalAdapter


def main():
    """Convert lighteval results into every_eval_ever JSON files.

    Parses the command-line options, runs ``LightEvalAdapter`` on
    ``--log_path`` (a single results file or a directory of them), and writes
    each converted log to
    ``<output_dir>/<evaluation_name>/<developer>/<model_name>/<uuid>.json``.

    Exits with status 1 (after printing to stderr) when ``--log_path`` is
    neither a file nor a directory.
    """
    parser = argparse.ArgumentParser(
        description='Convert lighteval output to every_eval_ever format'
    )
    parser.add_argument(
        '--log_path',
        type=str,
        required=True,
        help='Path to results JSON file or directory containing results files',
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='data',
        help='Output directory for converted files',
    )
    parser.add_argument(
        '--source_organization_name',
        type=str,
        default='',
        help='Name of the organization that ran the evaluation',
    )
    parser.add_argument(
        '--evaluator_relationship',
        type=str,
        default='first_party',
        choices=['first_party', 'third_party', 'collaborative', 'other'],
        help='Relationship of the evaluator to the model',
    )
    parser.add_argument(
        '--source_organization_url',
        type=str,
        default=None,
        help='URL of the source organization',
    )
    parser.add_argument(
        '--source_organization_logo_url',
        type=str,
        default=None,
        help='Logo of the source organization',
    )
    parser.add_argument(
        '--inference_engine',
        type=str,
        default=None,
        help="Override inference engine name (e.g. 'vllm', 'transformers'). "
        'Auto-detected from provider when possible.',
    )
    parser.add_argument(
        '--inference_engine_version',
        type=str,
        default=None,
        help="Inference engine version (e.g. '0.6.0'). "
        'Not available from lighteval logs, so must be provided manually.',
    )
    parser.add_argument(
        '--eval_library_name',
        type=str,
        default='lighteval',
        help='Name of the evaluation library (e.g. inspect_ai, lm_eval, helm, lighteval)',
    )
    parser.add_argument(
        '--eval_library_version',
        type=str,
        default='unknown',
        help='Version of the evaluation library. It should be extracted in the adapter if available in the evaluation log.',
    )

    args = parser.parse_args()

    adapter = LightEvalAdapter()
    output_dir = Path(args.output_dir)

    metadata_args = {
        'source_organization_name': args.source_organization_name,
        'evaluator_relationship': args.evaluator_relationship,
        'source_organization_url': args.source_organization_url,
        'eval_library_name': args.eval_library_name,
        'eval_library_version': args.eval_library_version,
    }
    # Optional values are forwarded only when the user supplied them, so the
    # metadata passed to the adapter is unchanged when the flags are omitted.
    # The logo URL was previously parsed but never forwarded.
    # NOTE(review): assumes the adapter reads 'source_organization_logo_url'
    # from this mapping -- confirm against LightEvalAdapter.
    for optional_key in (
        'source_organization_logo_url',
        'inference_engine',
        'inference_engine_version',
    ):
        optional_value = getattr(args, optional_key)
        if optional_value:
            metadata_args[optional_key] = optional_value

    log_path = Path(args.log_path)

    if log_path.is_file():
        logs = adapter.transform_from_file(log_path, metadata_args)
    elif log_path.is_dir():
        logs = adapter.transform_from_directory(log_path, metadata_args)
    else:
        print(f'Error: {log_path} is not a file or directory', file=sys.stderr)
        sys.exit(1)

    for log in logs:
        # Organize as: output_dir/{evaluation_name}/{developer}/{model_name}/{uuid}.json
        # Use the first evaluation result's name (before any /filter suffix) as the task name
        if log.evaluation_results:
            eval_name = log.evaluation_results[0].evaluation_name.split('/')[0]
        else:
            eval_name = 'unknown'

        # The first path component of the model id is treated as the developer;
        # any remaining components stay in the model name, which therefore
        # becomes nested directories when it contains further slashes.
        model_parts = log.model_info.id.split('/')
        if len(model_parts) >= 2:
            developer = model_parts[0]
            model_name = '/'.join(model_parts[1:])
        else:
            developer = 'unknown'
            model_name = log.model_info.id

        out_path = output_dir / eval_name / developer / model_name
        out_path.mkdir(parents=True, exist_ok=True)

        eval_uuid = str(uuid.uuid4())
        out_file = out_path / f'{eval_uuid}.json'

        # Explicit encoding so the write does not rely on the locale default.
        with open(out_file, 'w', encoding='utf-8') as f:
            json.dump(
                log.model_dump(mode='json', exclude_none=True), f, indent=2
            )

        print(f' {out_file}')

    print(f'\nConverted {len(logs)} evaluation log(s).')


if __name__ == '__main__':
    main()
Loading