Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions datasets/AlienKevin_SWE-ZERO-12M-trajectories/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# SWE-ZERO 12M Trajectories Dataset

## Description

SWE-ZERO 12M Trajectories is a large-scale execution-free agentic coding trace dataset. It contains mini-swe-agent v1 style shell trajectories sampled from real GitHub PR snapshots, intended for mid-training coding agents on repository navigation, editing, and bash-based tool use.

## Dataset Information

- **Source URL**: https://huggingface.co/datasets/AlienKevin/SWE-ZERO-12M-trajectories
- **License**: Apache-2.0
- **Split used**: `train`
- **Approximate size**: 12,290,800 rollouts, 122,908 unique PRs, 3,222 repositories, 16 programming languages, and 112B tokens according to the dataset card.
- **Source task dataset**: nebius/SWE-rebench-V2-PRs
- **Trajectory format**: mini-swe-agent v1
- **Bootstrapping model**: ricdomolm/mini-coder-1.7b

## Schema Mapping

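Each raw record holds a `messages` list of chat-style messages with `role` and `content` fields, alongside record-level fields such as `instance_id`, `repo`, `trajectory_format`, `exit_status`, and `duration_sec`.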
`raw_to_atif.py` keeps the mini-swe-agent-specific extraction local to this dataset:

- `system` messages are skipped because they only define the mini-swe-agent response format and execution-free shell constraints.
- Initial `user` task messages become ATIF `user` steps.
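- Later `user` messages beginning with `Observation:` become tool observation results on the preceding agent step, with the `Observation:` prefix and any whitespace after it removed. If no agent step immediately precedes such a message, it is kept unchanged as a plain `user` step.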
- `assistant` messages containing fenced `bash` blocks become ATIF agent steps with `bash` tool calls; the text before the final bash block is preserved as the step message after removing a leading `THOUGHT:` label.
- `assistant` messages without a bash block become plain ATIF agent message steps so malformed or terminal natural-language turns are preserved.

`atif_to_std.py` uses the shared standardization pass, which maps `bash` tool calls to the standard `terminal` tool. The trajectory metadata preserves the raw `instance_id`, repository, `trajectory_format`, `exit_status`, and `duration_sec`. Trajectory IDs are derived deterministically from the instance ID plus a content hash because the source dataset contains many independent rollouts per PR with the same `instance_id`.

If an assistant message contains multiple bash blocks despite the source system prompt requiring exactly one, the converter emits a warning and uses the final block. The final block is treated as the executable action because it is the last stated command after any preceding reasoning or malformed draft command.

## Known Limitations

The dataset card describes this corpus as a mid-training dataset rather than a verified SFT dataset. The trajectories are execution-free, not validated against tests, and many rollouts terminate with `incomplete` or other non-submitted statuses. This converter preserves those trajectories instead of filtering to submitted-only samples.
7 changes: 7 additions & 0 deletions datasets/AlienKevin_SWE-ZERO-12M-trajectories/atif_to_std.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# ruff: noqa: E402, I001

from scripts.atif_to_std_common import main


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # Hand this file's path to the shared entry point imported above.
    main(__file__)
21 changes: 21 additions & 0 deletions datasets/AlienKevin_SWE-ZERO-12M-trajectories/extract_raw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import json
import signal

from datasets import load_dataset

DATASET_NAME = "AlienKevin/SWE-ZERO-12M-trajectories"

# Restore the default SIGPIPE action so that a closed downstream pipe
# (e.g. `python extract_raw.py | head`) ends the process quietly.
# SIGPIPE is only defined on Unix; without the guard, importing this module
# raises AttributeError on platforms such as Windows.
# NOTE(review): the Python `signal` docs advise against SIG_DFL for programs
# that also write to sockets, since an interrupted connection then kills the
# process too -- confirm this is acceptable for the streaming download.
if hasattr(signal, "SIGPIPE"):
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)


def main():
    """Stream the dataset's ``train`` split to stdout as JSON Lines.

    Each record is serialized with ``json.dumps`` and printed on its own
    line. The function returns quietly when the reader of stdout goes away
    (for example ``python extract_raw.py | head``).
    """
    # Imported locally so this block does not depend on the module's
    # top-of-file import list.
    import os
    import sys

    def discard_remaining_output():
        # The interpreter flushes stdout once more at exit; pointing the
        # descriptor at devnull keeps that flush from raising a second
        # BrokenPipeError after we have already decided to stop.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())

    dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
    for item in dataset:
        line = json.dumps(item)
        # Keep the try body to the write itself so that a BrokenPipeError
        # raised while fetching records is not mistaken for a closed stdout.
        try:
            print(line)
        except BrokenPipeError:
            discard_remaining_output()
            return

    # Output may still be buffered; flush here so a late broken pipe is
    # handled instead of surfacing during interpreter shutdown.
    try:
        sys.stdout.flush()
    except BrokenPipeError:
        discard_remaining_output()


# Start the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
12 changes: 12 additions & 0 deletions datasets/AlienKevin_SWE-ZERO-12M-trajectories/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"custom_tools": [],
"code_enabled": [
"bash"
],
"browser_enabled": false,
"sample_expectations": {
"min_std_steps": 35,
"min_std_tool_calls": 30,
"min_sdk_messages": 70
}
}
145 changes: 145 additions & 0 deletions datasets/AlienKevin_SWE-ZERO-12M-trajectories/raw_to_atif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from __future__ import annotations

import hashlib
import json
import re
import sys

from schema_raw import SchemaRaw

from schema.atif import (
Agent,
ATIFObservation,
ATIFTrajectory,
ObservationResult,
Step,
ToolCall,
)

# Literal marker at the start of a user message that carries command output.
OBSERVATION_PREFIX = "Observation:"
# Matches a fenced ```bash block; group 1 captures the text between the fences.
# DOTALL lets the captured body span several lines, and IGNORECASE also accepts
# fence labels such as ```BASH.
_BASH_BLOCK_RE = re.compile(r"```bash\s*\n(.*?)\n?```", re.DOTALL | re.IGNORECASE)
# Matches a "THOUGHT:" label (any letter case) at the very start of the text,
# together with the whitespace that follows it.
_THOUGHT_PREFIX_RE = re.compile(r"^THOUGHT:\s*", re.IGNORECASE)


def strip_thought_prefix(content: str) -> str:
    """Return *content* trimmed and without a leading ``THOUGHT:`` label.

    Surrounding whitespace is removed first, then the label (matched by
    ``_THOUGHT_PREFIX_RE``) is dropped, and the remainder is trimmed again.
    """
    trimmed = content.strip()
    without_label = _THOUGHT_PREFIX_RE.sub("", trimmed)
    return without_label.strip()


def normalize_observation(content: str) -> str:
    """Return the observation text without its ``OBSERVATION_PREFIX`` marker.

    When *content* starts with the marker, the marker and any whitespace
    directly after it are dropped; any other text is returned untouched.
    """
    if not content.startswith(OBSERVATION_PREFIX):
        return content
    marker_length = len(OBSERVATION_PREFIX)
    body = content[marker_length:]
    return body.lstrip()


def trajectory_id(data: SchemaRaw) -> str:
    """Build a deterministic identifier for one raw record.

    The identifier is ``<instance_id>-<digest>``, where the digest is the
    first 12 hex characters of the SHA-1 of the record's messages serialized
    as JSON with sorted keys and non-ASCII characters left unescaped.
    """
    dumped_messages = []
    for message in data.messages:
        dumped_messages.append(message.model_dump())

    payload = json.dumps(dumped_messages, sort_keys=True, ensure_ascii=False)
    full_digest = hashlib.sha1(payload.encode("utf-8")).hexdigest()
    short_digest = full_digest[:12]
    return f"{data.instance_id}-{short_digest}"


def assistant_step(content: str, step_id: int, tool_call_id: str) -> Step:
    """Convert one assistant message into an agent ``Step``.

    A message without a fenced bash block becomes a plain agent message step
    holding the unmodified content. Otherwise the last bash block supplies the
    command of a single ``bash`` tool call, and the text before that block
    (minus a leading ``THOUGHT:`` label) becomes the step message. A warning
    is written to stderr when more than one bash block is present.
    """
    blocks = [*_BASH_BLOCK_RE.finditer(content)]
    block_count = len(blocks)

    if block_count == 0:
        return Step(step_id=step_id, source="agent", message=content)

    if block_count > 1:
        warning = f"Step {step_id} has {block_count} bash blocks; using the final block"
        print(warning, file=sys.stderr)

    # Only the last block is treated as the action to execute.
    final_block = blocks[-1]
    preamble = content[: final_block.start()]
    call = ToolCall(
        tool_call_id=tool_call_id,
        function_name="bash",
        arguments={"command": final_block.group(1).strip()},
    )
    return Step(
        step_id=step_id,
        source="agent",
        message=strip_thought_prefix(preamble),
        tool_calls=[call],
    )


def attach_observation(step: Step, content: str) -> None:
    """Set *step*'s observation to a single result built from *content*.

    The result is linked to the step's last tool call when it has one
    (``source_call_id`` is ``None`` otherwise) and carries *content* as
    returned by ``normalize_observation``. The step is modified in place and
    any observation already on it is replaced.
    """
    calls = step.tool_calls
    call_id = calls[-1].tool_call_id if calls else None
    result = ObservationResult(
        source_call_id=call_id,
        content=normalize_observation(content),
    )
    step.observation = ATIFObservation(results=[result])


def process_data(data: SchemaRaw) -> ATIFTrajectory | None:
    """Convert one raw record into an ``ATIFTrajectory``.

    System messages are skipped. A user message that starts with
    ``OBSERVATION_PREFIX`` and directly follows an agent step is attached to
    that step as its observation; every other user message becomes a user
    step. Assistant messages become agent steps via ``assistant_step``.
    Messages with any other role are reported on stderr and skipped.

    Returns ``None`` when the record yields no steps.
    """
    steps: list[Step] = []
    next_call_number = 1

    for message in data.messages:
        text = message.content
        role = message.role

        if role == "system":
            continue

        if role == "user":
            is_observation = (
                text.startswith(OBSERVATION_PREFIX) and steps and steps[-1].source == "agent"
            )
            if is_observation:
                attach_observation(steps[-1], text)
                continue
            steps.append(Step(step_id=len(steps) + 1, source="user", message=text))
            continue

        if role != "assistant":
            print(f"Unknown role: {role}", file=sys.stderr)
            continue

        agent_step = assistant_step(
            text,
            step_id=len(steps) + 1,
            tool_call_id=f"call_{next_call_number:06d}",
        )
        # Advance the counter only by the calls this step actually produced.
        next_call_number += len(agent_step.tool_calls or ())
        steps.append(agent_step)

    if not steps:
        return None

    identifier = trajectory_id(data)
    agent = Agent(name="mini-swe-agent", version=str(data.trajectory_format))
    # Record-level fields are carried over unchanged as trajectory metadata.
    metadata = {
        "instance_id": data.instance_id,
        "repo": data.repo,
        "trajectory_format": data.trajectory_format,
        "exit_status": data.exit_status,
        "duration_sec": data.duration_sec,
    }
    return ATIFTrajectory(
        trajectory_id=identifier,
        session_id=data.instance_id,
        agent=agent,
        steps=steps,
        extra=metadata,
    )


def main() -> None:
    """Read JSON Lines records from stdin and print converted trajectories.

    Blank lines are ignored. Each remaining line is parsed as JSON, validated
    through ``SchemaRaw`` and passed to ``process_data``; a truthy result is
    printed as one JSON line with ``None`` fields omitted.
    """
    for raw_line in sys.stdin:
        payload = raw_line.strip()
        if payload:
            record = SchemaRaw(**json.loads(payload))
            trajectory = process_data(record)
            if trajectory:
                print(trajectory.model_dump_json(exclude_none=True))


# Run the stdin-to-stdout conversion only when executed as a script.
if __name__ == "__main__":
    main()
Loading
Loading