neulab · neubig · Jun 3, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/README.md b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/README.md
@@ -0,0 +1,34 @@
+# SWE-ZERO 12M Trajectories Dataset
+
+## Description
+
+SWE-ZERO 12M Trajectories is a large-scale execution-free agentic coding trace dataset. It contains mini-swe-agent v1 style shell trajectories sampled from real GitHub PR snapshots, intended for mid-training coding agents on repository navigation, editing, and bash-based tool use.
+
+## Dataset Information
+
+- **Source URL**: https://huggingface.co/datasets/AlienKevin/SWE-ZERO-12M-trajectories
+- **License**: Apache-2.0
+- **Split used**: `train`
+- **Approximate size**: 12,290,800 rollouts, 122,908 unique PRs, 3,222 repositories, 16 programming languages, and 112B tokens according to the dataset card.
+- **Source task dataset**: nebius/SWE-rebench-V2-PRs
+- **Trajectory format**: mini-swe-agent v1
+- **Bootstrapping model**: ricdomolm/mini-coder-1.7b
+
+## Schema Mapping
+
+The raw dataset is a list of chat-style messages with `role` and `content` fields.
+`raw_to_atif.py` keeps the mini-swe-agent-specific extraction local to this dataset:
+
+- `system` messages are skipped because they only define the mini-swe-agent response format and execution-free shell constraints.
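+- `user` messages that are not attached as observations (the initial task message, and any later `user` turn that lacks the `Observation:` prefix or does not directly follow an agent step) become ATIF `user` steps.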
+- Later `user` messages beginning with `Observation:` become tool observation results on the preceding agent step with the prefix removed.
+- `assistant` messages containing fenced `bash` blocks become ATIF agent steps with a single `bash` tool call; the text before the final bash block is preserved as the step message after removing a leading `THOUGHT:` label, and any text after that block is discarded.
+- `assistant` messages without a bash block become plain ATIF agent message steps so malformed or terminal natural-language turns are preserved.
+
+`atif_to_std.py` uses the shared standardization pass, which maps `bash` tool calls to the standard `terminal` tool. The trajectory metadata preserves the raw `instance_id`, repository, `trajectory_format`, `exit_status`, and `duration_sec`. Trajectory IDs are derived deterministically from the instance ID plus a content hash (the first 12 hex characters of the SHA-1 of the serialized messages) because the source dataset contains many independent rollouts per PR with the same `instance_id`.
+
+If an assistant message contains multiple bash blocks despite the source system prompt requiring exactly one, the converter emits a warning and uses the final block. The final block is treated as the executable action because it is the last stated command after any preceding reasoning or malformed draft command.
+
+## Known Limitations
+
+The dataset card describes this corpus as a mid-training dataset rather than a verified SFT dataset. The trajectories are execution-free, not validated against tests, and many rollouts terminate with `incomplete` or other non-submitted statuses. This converter preserves those trajectories instead of filtering to submitted-only samples.
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/atif_to_std.py b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/atif_to_std.py
@@ -0,0 +1,7 @@
+# ruff: noqa: E402, I001
+
+from scripts.atif_to_std_common import main
+
+
+if __name__ == "__main__":
+    main(__file__)  # entry point from scripts.atif_to_std_common, given this script's path
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/extract_raw.py b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/extract_raw.py
@@ -0,0 +1,21 @@
+import json
+import signal
+
+from datasets import load_dataset
+
+DATASET_NAME = "AlienKevin/SWE-ZERO-12M-trajectories"  # Hugging Face dataset repo ID
+
+signal.signal(signal.SIGPIPE, signal.SIG_DFL)  # terminate when the stdout pipe is closed
+
+
+def main():
+    dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
+    for item in dataset:
+        try:
+            print(json.dumps(item))
+        except BrokenPipeError:
+            return
+
+
+if __name__ == "__main__":
+    main()
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/metadata.json b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/metadata.json
@@ -0,0 +1,12 @@
+{
+  "custom_tools": [],
+  "code_enabled": [
+    "bash"
+  ],
+  "browser_enabled": false,
+  "sample_expectations": {
+    "min_std_steps": 35,
+    "min_std_tool_calls": 30,
+    "min_sdk_messages": 70
+  }
+}
diff --git a/datasets/AlienKevin_SWE-ZERO-12M-trajectories/raw_to_atif.py b/datasets/AlienKevin_SWE-ZERO-12M-trajectories/raw_to_atif.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+import sys
+
+from schema_raw import SchemaRaw
+
+from schema.atif import (
+    Agent,
+    ATIFObservation,
+    ATIFTrajectory,
+    ObservationResult,
+    Step,
+    ToolCall,
+)
+
+OBSERVATION_PREFIX = "Observation:"  # prefix of user turns that carry tool observations
+_BASH_BLOCK_RE = re.compile(r"```bash\s*\n(.*?)\n?```", re.DOTALL | re.IGNORECASE)
+_THOUGHT_PREFIX_RE = re.compile(r"^THOUGHT:\s*", re.IGNORECASE)  # leading label only
+
+
+def strip_thought_prefix(content: str) -> str:
+    return _THOUGHT_PREFIX_RE.sub("", content.strip()).strip()
+
+
+def normalize_observation(content: str) -> str:
+    if content.startswith(OBSERVATION_PREFIX):
+        return content[len(OBSERVATION_PREFIX) :].lstrip()
+    return content
+
+
+def trajectory_id(data: SchemaRaw) -> str:
+    serialized_messages = json.dumps(
+        [message.model_dump() for message in data.messages],
+        sort_keys=True,
+        ensure_ascii=False,
+    )
+    digest = hashlib.sha1(serialized_messages.encode("utf-8")).hexdigest()[:12]
+    return f"{data.instance_id}-{digest}"
+
+
+def assistant_step(content: str, step_id: int, tool_call_id: str) -> Step:
+    bash_matches = list(_BASH_BLOCK_RE.finditer(content))
+    if not bash_matches:
+        return Step(step_id=step_id, source="agent", message=content)
+
+    if len(bash_matches) > 1:
+        print(
+            f"Step {step_id} has {len(bash_matches)} bash blocks; using the final block",
+            file=sys.stderr,
+        )
+
+    match = bash_matches[-1]
+    message = strip_thought_prefix(content[: match.start()])
+    command = match.group(1).strip()
+    return Step(
+        step_id=step_id,
+        source="agent",
+        message=message,
+        tool_calls=[
+            ToolCall(
+                tool_call_id=tool_call_id,
+                function_name="bash",
+                arguments={"command": command},
+            )
+        ],
+    )
+
+
+def attach_observation(step: Step, content: str) -> None:
+    source_call_id = None
+    if step.tool_calls:
+        source_call_id = step.tool_calls[-1].tool_call_id
+    step.observation = ATIFObservation(
+        results=[
+            ObservationResult(
+                source_call_id=source_call_id,
+                content=normalize_observation(content),
+            )
+        ]
+    )
+
+
+def process_data(data: SchemaRaw) -> ATIFTrajectory | None:
+    steps: list[Step] = []
+    tool_call_index = 1
+
+    for message in data.messages:
+        content = message.content
+        if message.role == "system":
+            continue
+        if message.role == "user":
+            if content.startswith(OBSERVATION_PREFIX) and steps and steps[-1].source == "agent":
+                attach_observation(steps[-1], content)
+            else:
+                steps.append(Step(step_id=len(steps) + 1, source="user", message=content))
+        elif message.role == "assistant":
+            step = assistant_step(
+                content,
+                step_id=len(steps) + 1,
+                tool_call_id=f"call_{tool_call_index:06d}",
+            )
+            if step.tool_calls:
+                tool_call_index += len(step.tool_calls)
+            steps.append(step)
+        else:
+            print(f"Unknown role: {message.role}", file=sys.stderr)
+
+    if not steps:
+        return None
+
+    return ATIFTrajectory(
+        trajectory_id=trajectory_id(data),
+        session_id=data.instance_id,
+        agent=Agent(
+            name="mini-swe-agent",
+            version=str(data.trajectory_format),
+        ),
+        steps=steps,
+        extra={
+            "instance_id": data.instance_id,
+            "repo": data.repo,
+            "trajectory_format": data.trajectory_format,
+            "exit_status": data.exit_status,
+            "duration_sec": data.duration_sec,
+        },
+    )
+
+
+def main() -> None:
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        raw_data = json.loads(line)
+        data = SchemaRaw(**raw_data)
+        trajectory = process_data(data)
+        if trajectory:
+            print(trajectory.model_dump_json(exclude_none=True))
+
+
+if __name__ == "__main__":
+    main()