Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions plugins/codex/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,43 @@ Or call the engine directly:

```bash
python -m skillopt_sleep dry-run --project "$(pwd)" --source codex --backend mock
python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex
python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex \
--max-sessions 5 --max-tasks 3 --progress
python -m skillopt_sleep run --project "$(pwd)" --source codex --backend codex \
--target-skill-path .agents/skills/example/SKILL.md \
--max-sessions 5 --max-tasks 3 --progress
```

`--source codex` reads Codex Desktop archived sessions from
`~/.codex/archived_sessions`. Use `--codex-home /path/to/.codex` to point at a
different Codex home, or `--source auto` to try Codex archives first and fall
back to Claude Code transcripts. Default backend is `mock` (no API spend).
`--backend codex` uses your Codex budget for real improvement. Bound live runs
with `--max-sessions` and `--max-tasks`; add `--progress` because Codex-backed
mining, replay, and reflection can be slow and otherwise quiet. Use
`--target-skill-path` to stage/adopt into a repo-scoped Codex skill such as
`.agents/skills/<name>/SKILL.md`; target runs over-sample mined tasks and
prefer tasks that match the target skill's path, headings, and content. All the
controllable knobs (`--gate on|off`, `--rollouts-k`, `--budget-tokens`,
`--preferences`, optimizer/target split) work identically — see
[the SkillOpt-Sleep guide section](https://microsoft.github.io/SkillOpt/docs/guideline.html#sleep).

For privacy-sensitive projects, split the run into reviewable steps:

```bash
python -m skillopt_sleep harvest --project "$(pwd)" --source codex \
--target-skill-path .agents/skills/example/SKILL.md \
--max-sessions 5 --max-tasks 3 \
--output reviewed-tasks.json

python -m skillopt_sleep dry-run --project "$(pwd)" --backend codex \
--tasks-file reviewed-tasks.json --progress --json
```

Inspect/redact the JSON and set `"reviewed": true` before using a real backend.
`--tasks-file` skips archive harvest/mining and replays only the reviewed JSON
tasks; real backends refuse task files still marked `"reviewed": false`.

## Notes / status

- Codex's `exec` runs shell, so the real-tool-loop replay (e.g. the
Expand Down
139 changes: 121 additions & 18 deletions skillopt_sleep/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
Common flags:
--project PATH project to evolve (default: cwd)
--scope all|invoked harvest scope (default: invoked)
--max-sessions N cap transcript sessions per run
--max-tasks N cap mined tasks per run
--target-skill-path PATH explicit live SKILL.md to stage/adopt
--tasks-file PATH reviewed TaskRecord JSON file to replay instead of harvesting
--backend mock|claude|codex
--source claude|codex|auto
--model NAME
Expand All @@ -31,6 +35,35 @@
from skillopt_sleep.staging import adopt as adopt_staging
from skillopt_sleep.staging import latest_staging
from skillopt_sleep.state import SleepState
from skillopt_sleep.tasks_file import load_tasks_file, make_tasks_payload, write_tasks_file


def _read_text(path: str) -> str:
    """Return the UTF-8 text stored at *path*, or ``""`` when it is unreadable.

    This is a deliberate best-effort helper: a missing file, a directory, a
    permission problem, an undecodable byte sequence, or a malformed path
    (e.g. an embedded NUL) all yield an empty string instead of raising.

    Args:
        path: Filesystem path of the file to read.

    Returns:
        The file contents, or an empty string if the file could not be read.
    """
    try:
        with open(path, encoding="utf-8") as handle:
            return handle.read()
    except (OSError, ValueError):
        # OSError covers filesystem failures; ValueError covers
        # UnicodeDecodeError and invalid paths. Anything else is a
        # programming error and should surface rather than be masked.
        return ""


def _report_payload(rep, outcome) -> Dict[str, Any]:
    """Build the JSON-serialisable summary of one sleep-cycle run.

    Args:
        rep: The cycle report; read for ``night``, ``accepted``,
            ``gate_action``, ``baseline_score``, ``candidate_score``,
            ``n_tasks``, ``n_sessions``, ``edits``, ``rejected_edits`` and
            ``notes``. ``no_edits_reason`` is optional and defaults to ``""``.
        outcome: The cycle outcome; read for ``staging_dir`` and ``adopted``.

    Returns:
        A dict whose key order is the order the fields are emitted in JSON.
    """
    # Snapshot each edit's attributes so later mutation of the payload cannot
    # reach back into the live edit objects.
    accepted_edits = [dict(vars(edit)) for edit in rep.edits]
    rejected_edits = [dict(vars(edit)) for edit in rep.rejected_edits]
    return {
        "night": rep.night,
        "accepted": rep.accepted,
        "gate_action": rep.gate_action,
        "no_edits_reason": getattr(rep, "no_edits_reason", ""),
        "baseline": rep.baseline_score,
        "candidate": rep.candidate_score,
        "n_tasks": rep.n_tasks,
        "n_sessions": rep.n_sessions,
        "n_accepted_edits": len(accepted_edits),
        "n_rejected_edits": len(rejected_edits),
        "edits": accepted_edits,
        "rejected_edits": rejected_edits,
        "notes": rep.notes,
        "staging_dir": outcome.staging_dir,
        "adopted": outcome.adopted,
    }


def _add_common(p: argparse.ArgumentParser) -> None:
Expand All @@ -45,11 +78,21 @@ def _add_common(p: argparse.ArgumentParser) -> None:
help="session transcript source")
p.add_argument("--lookback-hours", type=int, default=0)
p.add_argument("--edit-budget", type=int, default=0)
p.add_argument("--max-sessions", type=int, default=0,
help="cap harvested sessions before mining; default derives from max tasks")
p.add_argument("--max-tasks", type=int, default=0,
help="cap mined tasks for this run")
p.add_argument("--target-skill-path", default="",
help="explicit live SKILL.md path to evolve/stage/adopt")
p.add_argument("--tasks-file", default="",
help="reviewed TaskRecord JSON file to replay instead of harvesting")
p.add_argument("--progress", action="store_true",
help="print phase progress to stderr")
p.add_argument("--auto-adopt", action="store_true")
p.add_argument("--json", action="store_true")


def _cfg_from_args(args) -> Any:
def _cfg_from_args(args, task_meta: Dict[str, Any] | None = None) -> Any:
overrides: Dict[str, Any] = {}
if args.project:
overrides["invoked_project"] = os.path.abspath(args.project)
Expand All @@ -72,30 +115,63 @@ def _cfg_from_args(args) -> Any:
overrides["lookback_hours"] = args.lookback_hours
if getattr(args, "edit_budget", 0):
overrides["edit_budget"] = args.edit_budget
if getattr(args, "max_sessions", 0):
overrides["max_sessions_per_night"] = args.max_sessions
if getattr(args, "max_tasks", 0):
overrides["max_tasks_per_night"] = args.max_tasks
target_skill_path = getattr(args, "target_skill_path", "")
if not target_skill_path and task_meta:
target_skill_path = str(task_meta.get("target_skill_path") or "")
if target_skill_path:
path = os.path.expanduser(target_skill_path)
if args.project and not os.path.isabs(path):
path = os.path.join(os.path.abspath(args.project), path)
overrides["target_skill_path"] = os.path.abspath(path)
if getattr(args, "progress", False):
overrides["progress"] = True
if getattr(args, "auto_adopt", False):
overrides["auto_adopt"] = True
return load_config(**overrides)


def cmd_run(args, dry: bool = False) -> int:
cfg = _cfg_from_args(args)
outcome = run_sleep_cycle(cfg, dry_run=dry)
task_meta: Dict[str, Any] = {}
tasks = None
if getattr(args, "tasks_file", ""):
# Load once before config so target_skill_path can default from metadata.
tasks, task_meta = load_tasks_file(args.tasks_file)
cfg = _cfg_from_args(args, task_meta=task_meta)
if getattr(args, "tasks_file", ""):
tasks, task_meta = load_tasks_file(
args.tasks_file,
holdout_fraction=cfg.get("holdout_fraction", 0.34),
seed=cfg.get("seed", 42),
)
if cfg.get("backend", "mock") != "mock" and task_meta.get("reviewed") is not True:
print(
"[sleep] refusing real-backend replay from an unreviewed tasks file; "
"inspect/redact it and set \"reviewed\": true first",
file=sys.stderr,
)
return 2
outcome = run_sleep_cycle(cfg, seed_tasks=tasks, dry_run=dry)
rep = outcome.report
if args.json:
print(json.dumps({
"night": rep.night, "accepted": rep.accepted,
"gate_action": rep.gate_action,
"baseline": rep.baseline_score, "candidate": rep.candidate_score,
"n_tasks": rep.n_tasks, "n_sessions": rep.n_sessions,
"edits": [e.__dict__ for e in rep.edits],
"staging_dir": outcome.staging_dir, "adopted": outcome.adopted,
}, ensure_ascii=False, indent=2))
payload = _report_payload(rep, outcome)
if task_meta:
payload["tasks_file"] = task_meta.get("tasks_file", "")
payload["tasks_reviewed"] = task_meta.get("reviewed", False)
print(json.dumps(payload, ensure_ascii=False, indent=2))
else:
print(f"[sleep] night {rep.night}: {rep.n_sessions} sessions -> {rep.n_tasks} tasks")
print(f"[sleep] held-out {rep.baseline_score:.3f} -> {rep.candidate_score:.3f} "
f"=> {rep.gate_action} (accepted={rep.accepted})")
for e in rep.edits:
print(f" + [{e.target}/{e.op}] {e.content}")
if rep.rejected_edits:
print("[sleep] rejected by gate:")
for e in rep.rejected_edits:
print(f" - [{e.target}/{e.op}] {e.content}")
if outcome.staging_dir:
print(f"[sleep] staged: {outcome.staging_dir}")
if not outcome.adopted:
Expand Down Expand Up @@ -152,16 +228,42 @@ def cmd_adopt(args) -> int:

def cmd_harvest(args) -> int:
    """Harvest sessions, mine tasks, and print (optionally write) them.

    The session cap is ``max_sessions_per_night`` when set, otherwise three
    times the task cap. When a target skill is configured and its text can be
    read, the candidate pool passed to ``mine`` is widened to three times the
    task cap (unless ``target_task_filter`` is disabled in the config).

    Args:
        args: Parsed CLI namespace. Reads ``json`` and, when present,
            ``output`` (path to write the mined-task payload to).

    Returns:
        Always ``0``.
    """
    cfg = _cfg_from_args(args)
    max_tasks = cfg.get("max_tasks_per_night", 40)
    session_limit = cfg.get("max_sessions_per_night", 0) or max_tasks * 3

    # Only resolve and read the target skill when one was configured.
    target_skill_path = cfg.managed_skill_path() if cfg.get("target_skill_path", "") else ""
    target_skill_text = _read_text(target_skill_path) if target_skill_path else ""

    candidate_limit = max_tasks
    if cfg.get("target_task_filter", True) and target_skill_text:
        # Over-sample candidates for a targeted run; never go below the cap.
        candidate_limit = max(max_tasks, max_tasks * 3)

    digests = harvest_for_config(cfg, limit=session_limit)
    tasks = mine(
        digests,
        max_tasks=max_tasks,
        candidate_limit=candidate_limit,
        holdout_fraction=cfg.get("holdout_fraction", 0.34),
        seed=cfg.get("seed", 42),
        target_skill_text=target_skill_text,
        target_skill_path=target_skill_path,
    )
    payload = make_tasks_payload(
        tasks,
        project=cfg.get("invoked_project") or os.getcwd(),
        transcript_source=cfg.get("transcript_source", ""),
        n_sessions=len(digests),
        target_skill_path=target_skill_path,
    )

    # Write the file first so the printed report can include its path.
    output_path = ""
    if getattr(args, "output", ""):
        output_path = write_tasks_file(args.output, payload)

    if args.json:
        json_payload = dict(payload)
        if output_path:
            json_payload["output"] = output_path
        print(json.dumps(json_payload, ensure_ascii=False, indent=2))
    else:
        print(f"[sleep] {len(digests)} sessions -> {len(tasks)} tasks")
        if output_path:
            print(f"[sleep] wrote reviewed-task draft: {output_path}")
        for t in tasks:
            print(f"  [{t.split}/{t.outcome}] {t.intent[:90]}")
    return 0
Expand Down Expand Up @@ -207,6 +309,7 @@ def main(argv=None) -> int:
p_adopt.add_argument("--staging", default="", help="specific staging dir")
p_harvest = sub.add_parser("harvest", help="debug: show mined tasks")
_add_common(p_harvest)
p_harvest.add_argument("--output", default="", help="write mined tasks JSON for review")
p_sched = sub.add_parser("schedule", help="install a nightly cron entry for this project")
_add_common(p_sched)
p_sched.add_argument("--hour", type=int, default=3)
Expand Down
71 changes: 57 additions & 14 deletions skillopt_sleep/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,8 @@ def __init__(self, model: str = "", timeout: int = 180) -> None:
self.timeout = timeout
self._tokens = 0
self._cache: Dict[str, str] = {}
self.last_call_error = ""
self.last_reflect_raw = ""

# subclasses override --------------------------------------------------
def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
Expand Down Expand Up @@ -691,15 +693,25 @@ class CodexCliBackend(CliBackend):

name = "codex"

def __init__(
    self,
    model: str = "",
    codex_path: str = "",
    timeout: int = 240,
    sandbox: str = "read-only",
    project_dir: str = "",
) -> None:
    """Configure a backend that shells out to the Codex CLI.

    Args:
        model: Model name; when empty, falls back to the
            ``SKILLOPT_SLEEP_CODEX_MODEL`` environment variable.
        codex_path: Codex executable location, passed through
            ``resolve_codex_path``.
        timeout: Per-call timeout in seconds, forwarded to the base class.
        sandbox: Value stored for the CLI's ``--sandbox`` option.
        project_dir: Optional project directory. ``~`` is expanded and the
            path made absolute; an empty value is stored as ``""``.
    """
    super().__init__(
        model=model or os.environ.get("SKILLOPT_SLEEP_CODEX_MODEL", ""),
        timeout=timeout,
    )
    self.codex_path = resolve_codex_path(codex_path)
    self.sandbox = sandbox
    # Normalise once here so every later call sees the same absolute path.
    self.project_dir = (
        os.path.abspath(os.path.expanduser(project_dir)) if project_dir else ""
    )

def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
import tempfile
self.last_call_error = ""
out_path = tempfile.NamedTemporaryFile(
prefix="codex_last_", suffix=".txt", delete=False
).name
Expand All @@ -708,18 +720,39 @@ def _call(self, prompt: str, *, max_tokens: int = 1024) -> str:
"--color", "never", "--sandbox", self.sandbox,
"-o", out_path,
]
if self.project_dir:
cmd[3:3] = ["-C", self.project_dir]
if self.model:
cmd += ["-m", self.model]
cmd += ["--", prompt]
proc = None
try:
subprocess.run(cmd, capture_output=True, text=True, timeout=self.timeout)
except Exception:
return ""
try:
with open(out_path, encoding="utf-8") as f:
return f.read().strip()
except Exception:
return ""
try:
proc = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=self.timeout,
cwd=self.project_dir or None,
)
except subprocess.TimeoutExpired:
self.last_call_error = f"codex exec timed out after {self.timeout}s"
return ""
except Exception as exc:
self.last_call_error = f"codex exec failed: {exc}"
return ""
try:
with open(out_path, encoding="utf-8") as f:
out = f.read().strip()
if out:
return out
except Exception as exc:
self.last_call_error = f"could not read codex output file: {exc}"
stdout = (proc.stdout or "").strip() if proc is not None else ""
stderr = (proc.stderr or "").strip() if proc is not None else ""
if proc is not None and proc.returncode != 0 and not self.last_call_error:
self.last_call_error = f"codex exec exited {proc.returncode}: {stderr[:500]}"
return stdout or stderr
finally:
try:
os.unlink(out_path)
Expand Down Expand Up @@ -1025,12 +1058,13 @@ def get_backend(
claude_path: str = "claude",
codex_path: str = "",
azure_endpoint: str = "",
project_dir: str = "",
) -> Backend:
n = (name or "mock").strip().lower()
if n in {"claude", "anthropic", "claude_cli", "claude_code"}:
return ClaudeCliBackend(model=model, claude_path=claude_path)
if n in {"codex", "codex_cli", "openai_codex"}:
return CodexCliBackend(model=model, codex_path=codex_path)
return CodexCliBackend(model=model, codex_path=codex_path, project_dir=project_dir)
if n in {"azure", "azure_openai", "aoai"}:
return AzureOpenAIBackend(deployment=model, endpoint=azure_endpoint)
if n in {"azure-responses", "azure_responses", "aoai-responses", "responses"}:
Expand All @@ -1050,6 +1084,7 @@ def build_backend(
codex_path: str = "",
azure_endpoint: str = "",
preferences: str = "",
project_dir: str = "",
) -> Backend:
"""Build a single or dual backend.

Expand All @@ -1060,13 +1095,21 @@ def build_backend(
"""
has_split = any([optimizer_backend, optimizer_model, target_backend, target_model])
if not has_split:
be = get_backend(backend, model=model, codex_path=codex_path, azure_endpoint=azure_endpoint)
be = get_backend(
backend,
model=model,
codex_path=codex_path,
azure_endpoint=azure_endpoint,
project_dir=project_dir,
)
be.preferences = preferences
return be
tgt = get_backend(target_backend or backend, model=target_model or model,
codex_path=codex_path, azure_endpoint=azure_endpoint)
codex_path=codex_path, azure_endpoint=azure_endpoint,
project_dir=project_dir)
opt = get_backend(optimizer_backend or backend, model=optimizer_model or model,
codex_path=codex_path, azure_endpoint=azure_endpoint)
codex_path=codex_path, azure_endpoint=azure_endpoint,
project_dir=project_dir)
opt.preferences = preferences # reflect runs on the optimizer
dual = DualBackend(target=tgt, optimizer=opt)
dual.preferences = preferences
Expand Down
Loading