diff --git a/crates/codra-runtime/src/types.rs b/crates/codra-runtime/src/types.rs index 692ed74..4341d47 100644 --- a/crates/codra-runtime/src/types.rs +++ b/crates/codra-runtime/src/types.rs @@ -431,6 +431,120 @@ pub struct ApprovalPolicy { pub max_auto_approve_per_session: usize, } +// ── Remote Worker Types ───────────────────────────────────────── + +/// Unique identifier for a worker instance; serialized transparently as its inner string. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct WorkerId(pub String); + +/// A worker's public identity metadata. +/// +/// The full cryptographic identity (X25519 keypair) is stored +/// separately on disk. This type carries the public-facing +/// fingerprint and label used for pairing and display. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct WorkerIdentity { + pub worker_id: WorkerId, + pub label: String, + /// SHA-256 of the X25519 static public key, hex-encoded. + pub pin_sha256: String, + pub worker_version: String, +} + +/// Availability state of a paired worker: offline, idle, busy, or degraded (lowercase on the wire). +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum WorkerStatus { + #[serde(rename = "offline")] + Offline, + #[serde(rename = "idle")] + Idle, + #[serde(rename = "busy")] + Busy, + #[serde(rename = "degraded")] + Degraded, +} + +/// The level of trust a controller grants to a worker (or vice versa). Variants are declared from least to most trusted, so the derived `Ord` ranks `Untrusted < Limited < Standard < Elevated < Full`; keep that order when adding variants. +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub enum TrustLevel { + #[serde(rename = "untrusted")] + Untrusted, + #[serde(rename = "limited")] + Limited, + #[serde(rename = "standard")] + Standard, + #[serde(rename = "elevated")] + Elevated, + #[serde(rename = "full")] + Full, +} + +/// The current state of a pairing request between controller and worker. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum PairingStatus { + #[serde(rename = "pending")] + Pending, + #[serde(rename = "approved")] + Approved, + #[serde(rename = "rejected")] + Rejected, + #[serde(rename = "expired")] + Expired, + #[serde(rename = "revoked")] + Revoked, +} + +/// Worker-side record of a paired controller: its id and label, pinned key fingerprint, granted trust level, and the `trusted_at` timestamp string. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct StoredPeer { + pub controller_id: String, + pub controller_label: String, + pub pin_sha256: String, + pub trust_level: TrustLevel, + pub trusted_at: String, +} + +/// Controller-side record of a paired worker: its identity, pinned key fingerprint, network address, trust level, and the `paired_at` / `last_seen` timestamp strings. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct StoredPairing { + pub worker_id: WorkerId, + pub worker_label: String, + pub pin_sha256: String, + pub worker_host: String, + pub worker_port: u16, + pub trust_level: TrustLevel, + pub paired_at: String, + pub last_seen: String, +} + +/// Declared capabilities of a worker, advertised during health probes. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct WorkerCapabilities { + pub task_execution: bool, + pub event_streaming: bool, + pub approval_forwarding: bool, + pub remote_pairing: bool, + pub mdns_discovery: bool, +} + +/// Health response returned by a worker's GET /api/workers/health endpoint. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct WorkerHealth { + pub status: String, + pub worker_id: WorkerId, + pub version: String, + pub hostname: String, + pub os: String, + pub arch: String, + pub uptime_seconds: u64, + pub supported_runtime_kinds: Vec<RuntimeKind>, + pub available_runtimes: Vec<String>, // NOTE(review): element type was lost in extraction; `String` is assumed — confirm against upstream + pub workspace_mode: String, + pub remote_worker_protocol_version: String, + pub capabilities: WorkerCapabilities, +} + impl Default for SafetyConfig { fn default() -> Self { Self { @@ -459,3 +573,137 @@ impl Default for SafetyConfig { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn worker_identity_round_trip() { + let identity = WorkerIdentity { + worker_id: WorkerId("wkr-001".to_string()), + label: "Build Server Alpha".to_string(), + pin_sha256: "a3f1c8e2b7d4...".to_string(), + worker_version: "0.1.0".to_string(), + }; + let json = serde_json::to_string(&identity).unwrap(); + let deserialized: WorkerIdentity = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.worker_id, identity.worker_id); + assert_eq!(deserialized.pin_sha256, "a3f1c8e2b7d4..."); + } + + #[test] + fn trust_level_wire_values() { + assert_eq!( + serde_json::to_value(TrustLevel::Untrusted).unwrap(), + serde_json::json!("untrusted") + ); + assert_eq!( + serde_json::to_value(TrustLevel::Limited).unwrap(), + serde_json::json!("limited") + ); + assert_eq!( + serde_json::to_value(TrustLevel::Standard).unwrap(), + serde_json::json!("standard") + ); + assert_eq!( + serde_json::to_value(TrustLevel::Elevated).unwrap(), + serde_json::json!("elevated") + ); + assert_eq!( + serde_json::to_value(TrustLevel::Full).unwrap(), + serde_json::json!("full") + ); + } + + #[test] + fn worker_status_wire_values() { + assert_eq!( + serde_json::to_value(WorkerStatus::Offline).unwrap(), + serde_json::json!("offline") + ); + assert_eq!( + serde_json::to_value(WorkerStatus::Idle).unwrap(), + serde_json::json!("idle") + ); + assert_eq!( + serde_json::to_value(WorkerStatus::Busy).unwrap(), 
serde_json::json!("busy") + ); + assert_eq!( + serde_json::to_value(WorkerStatus::Degraded).unwrap(), + serde_json::json!("degraded") + ); + } + + #[test] + fn pairing_status_wire_values() { + assert_eq!( + serde_json::to_value(PairingStatus::Pending).unwrap(), + serde_json::json!("pending") + ); + assert_eq!( + serde_json::to_value(PairingStatus::Approved).unwrap(), + serde_json::json!("approved") + ); + assert_eq!( + serde_json::to_value(PairingStatus::Rejected).unwrap(), + serde_json::json!("rejected") + ); + assert_eq!( + serde_json::to_value(PairingStatus::Expired).unwrap(), + serde_json::json!("expired") + ); + assert_eq!( + serde_json::to_value(PairingStatus::Revoked).unwrap(), + serde_json::json!("revoked") + ); + } + + #[test] + fn stored_peer_serializes() { + let peer = StoredPeer { + controller_id: "ctrl-abc".to_string(), + controller_label: "My Desktop".to_string(), + pin_sha256: "deadbeef".to_string(), + trust_level: TrustLevel::Standard, + trusted_at: "2026-01-15T10:30:00Z".to_string(), + }; + let json = serde_json::to_value(&peer).unwrap(); + assert_eq!(json["controller_id"], "ctrl-abc"); + assert_eq!(json["trust_level"], "standard"); + assert_eq!(json["trusted_at"], "2026-01-15T10:30:00Z"); + } + + #[test] + fn worker_health_serializes_with_capabilities() { + let health = WorkerHealth { + status: "ok".to_string(), + worker_id: WorkerId("wkr-001".to_string()), + version: "0.1.0".to_string(), + hostname: "build-server".to_string(), + os: "linux".to_string(), + arch: "aarch64".to_string(), + uptime_seconds: 86400, + supported_runtime_kinds: vec![RuntimeKind::LocalAgent], + available_runtimes: vec![], + workspace_mode: "local_only".to_string(), + remote_worker_protocol_version: "0.1".to_string(), + capabilities: WorkerCapabilities { + task_execution: true, + event_streaming: true, + approval_forwarding: false, + remote_pairing: false, + mdns_discovery: false, + }, + }; + let json = serde_json::to_value(&health).unwrap(); + assert_eq!(json["status"], 
"ok"); + assert_eq!(json["worker_id"], "wkr-001"); + assert!(json["capabilities"]["task_execution"].as_bool().unwrap()); + assert!(!json["capabilities"]["approval_forwarding"] + .as_bool() + .unwrap()); + assert!(!json["capabilities"]["mdns_discovery"].as_bool().unwrap()); + } +} diff --git a/docs/REMOTE_WORKER_ARCHITECTURE.md b/docs/REMOTE_WORKER_ARCHITECTURE.md index baded6f..bd920d0 100644 --- a/docs/REMOTE_WORKER_ARCHITECTURE.md +++ b/docs/REMOTE_WORKER_ARCHITECTURE.md @@ -674,7 +674,272 @@ and fans out events to all connected control surfaces. --- -## Worker Mode Daemon Configuration +--- + +## Future: Computer-Use & Sandbox Runtimes on Workers + +### Design Principle + +Remote workers are the natural host for computer-use and sandbox runtimes. A Controller delegates a GUI-intensive or high-risk task to a Worker that runs a `ComputerUseAgent` or `SandboxAgent` runtime, and the Controller never touches the execution environment directly. + +### Sandbox Workers + +A sandbox Worker runs each task inside an isolated environment: + +``` + ┌──────────────────────────────────┐ + │ CONTROL SURFACES │ + │ │ + ┌─────────┼──────┬───────────┬────────────┐ │ + │ │ │ │ │ │ + ┌────▼──┐ ┌────▼──┐ ┌─▼──────┐ ┌─▼──────┐ │ │ + │Desktop│ │CLI/TUI│ │Telegram│ │Android │ │ │ + └───┬───┘ └───┬───┘ └───┬────┘ └───┬────┘ │ │ + └─────────┼─────────┼───────────┘ │ │ + │ │ │ │ + ┌────▼─────────▼──────────────┐ │ │ + │ codra-daemon │ │ │ + │ (gateway + controller) │ │ │ + │ │ │ │ + │ Submits task to sandbox │ │ │ + │ Streams screenshots back │ │ │ + │ Forwards approvals │ │ │ + └────────────┬─────────────────┘ │ │ + │ │ │ + ┌──────────▼──────────┐ │ │ + │ Worker (remote) │ │ │ + │ │ │ │ + │ ┌────────────────┐ │ │ │ + │ │ SandboxManager │ │ │ │ + │ │ │ │ │ │ + │ │ 1. 
Request SS │ │ │ │ + │ │ → Docker │ │ │ │ + │ │ Container │ │ │ │ + │ │ → Firecracker│ │ │ │ + │ │ MicroVM │ │ │ │ + │ └───────┬────────┘ │ │ │ + │ │ │ │ │ + │ ┌───────▼────────┐ │ │ │ + │ │ Inside Sandbox │ │ │ │ + │ │ │ │ │ │ + │ │ Xvfb + WM │ │ │ │ + │ │ ComputerUseRT │ │ │ │ + │ │ LLM + agent │ │ │ │ + │ └────────────────┘ │ │ │ + └──────────────────────┘ │ │ + └───┘ +``` + +#### Sandbox Lifecycle on Workers + +``` +1. Controller submits task with runtime_category: "ComputerUseAgent" + │ +2. Worker checks capabilities → supports_sandbox: true + │ +3. Worker provisions sandbox: + ┌────────────────────────────────────────────┐ + │ - Rust's bollard crate → Docker API call │ + │ - Image: cua-sandbox:latest │ + │ - Mounts: workspace copy (read-only) │ + │ - Resources: 4 CPU, 8GB RAM, 20GB disk │ + │ - Network: isolated bridge, no egress │ + │ - GPU: optional passthrough │ + └────────────────────────────────────────────┘ + │ +4. Worker starts agent loop inside sandbox: + ┌────────────────────────────────────────────┐ + │ - Xvfb :99 -screen 0 1920x1080x24 │ + │ - fluxbox (lightweight window manager) │ + │ - Agent process watching :99 via CDP + VNC│ + └────────────────────────────────────────────┘ + │ +5. Worker streams screenshots + events over peer-link: + ┌────────────────────────────────────────────┐ + │ - Every agent step: screenshot + action │ + │ - Screenshot encoded as base64 PNG │ + │ - Approval requests for risky GUI ops │ + │ - All events ride same Noise XX WS │ + └────────────────────────────────────────────┘ + │ +6. Controller renders screenshots inline in UI + │ +7. Task completes → Worker tears down sandbox + │ +8. 
Artifacts returned to Controller: + ┌────────────────────────────────────────────┐ + │ - TaskTrace with full trajectory │ + │ - Screenshots at each step │ + │ - File diffs (if workspace modified) │ + │ - Sandbox logs │ + │ - Replay-ready trajectory JSON │ + └────────────────────────────────────────────┘ +``` + +#### Sandbox Runtimes on Workers + +| Runtime | Isolation | Use Case | Worker Config | +|---------|-----------|----------|---------------| +| `ComputerUseAgent` | Docker container + Xvfb | UI testing, browser automation, visual debugging | GPU passthrough, display env, CDP ports | +| `BrowserAgent` | Docker container + Chromium | Web testing, screenshot verification | CDP port mapping, site allowlist | +| `MobileEmulator` | Android emulator in Docker | Mobile app testing, ADB control | KVM passthrough, ADB port mapping | +| `GenericSandbox` | Docker / Firecracker / QEMU | Risky commands, unknown code execution | Resource caps, network policy, snapshot volume | + +### Screenshot Streaming Over Peer-Link + +When a Worker hosts a `ComputerUseAgent`, screenshots are streamed inline: + +```rust +pub struct ScreenshotFrame { + pub step_index: u32, + pub action: ComputerUseAction, + pub screenshot: String, // base64 PNG + pub dom_snapshot: Option, + pub cursor_position: Option<(u32, u32)>, + pub timestamp: String, +} +``` + +These frames ride the same Noise XX encrypted WebSocket as other RuntimeEvents. The Controller: +- Receives `ScreenshotFrame` events +- Renders screenshots inline (desktop: image in session pane; CLI: ascii art or file output) +- Stores screenshots in the task trace for replay + +### Task Trajectory Replay Over Peer-Link + +Replay is a special stream mode where the Worker resends a recorded task's events: + +``` +Controller sends: { "type": "replay_request", "task_id": "..." } +Worker responds: { "type": "event", "kind": "ReplayStarting", "total_steps": 42 } +Worker streams: { "type": "event", "kind": "ReplayStep", "step": 5, ... 
} + // Each step includes: action, before_screenshot, after_screenshot, + // dom_snapshot, model_thought, action_result +Worker ends: { "type": "event", "kind": "ReplayCompleted" } +``` + +The replay stream uses the same RuntimeEvent types — no new transport needed. The Worker reads from its stored `TaskTrace` and replays events as if they were happening live. + +### Worker Configuration for Sandbox Runtimes + +```toml +# ~/.codra/worker/config.toml + +[worker] +enabled = true +bind_host = "0.0.0.0" +bind_port = 9091 +name = "Sandbox Worker" +max_concurrent_tasks = 1 # sandbox tasks are resource-heavy + +[worker.capabilities] +supports_gui_control = true +supports_screenshot = true +supports_replay = true +supports_sandbox = true +supports_browser = true +supports_mobile_device = false # not available on this hardware + +[sandbox] +provider = "docker" # docker, firecracker, qemu +default_image = "cua-sandbox:latest" +ephemeral = true # destroy sandbox after each task +workspace_mount_mode = "copy" # copy, bind, mount +cpu_limit = 4 +memory_limit = "8GB" +disk_limit = "20GB" +network_isolation = true +gpu_passthrough = false +max_run_duration_seconds = 3600 + +[sandbox.volumes] +# Additional volumes to mount inside sandbox +node_modules = "/home/user/.codra/sandbox-cache/node_modules" + +[sandbox.env] +# Environment variables injected into sandbox +DISPLAY = ":99" +RESOLUTION = "1920x1080x24" +``` + +### Integration With Existing Surfaces + +#### Codra Desktop +- **Runtime picker** shows sandbox workers with capability badges ("GUI", "Sandbox", "Browser") +- **Session pane** renders screenshots inline with before/after toggle +- **Replay viewer** lets users step through trajectories with arrow keys +- **Sandbox tab** shows resource usage, network policy, remaining time + +#### Codra CLI/TUI +- `codra sandbox provision` — request a sandbox worker +- `codra sandbox attach <sandbox-id>` — stream events with screenshots (or screencast) +- `codra replay <task-id>` — step through recorded
trajectory +- `codra replay export --format cua_v1` — export for Cua-compatible tools + +#### Codra Daemon +- New endpoints: `POST /api/sandboxes`, `DELETE /api/sandboxes/:id`, `GET /api/sandboxes/:id/status` +- Screenshot metadata in SSE events +- Acts as broker: desktop → daemon → sandbox worker → stream back + +#### Android/Telegram Control Layer +- Receive screenshot thumbnails in approval notifications +- Approve/reject GUI actions ("click OK button", "type password") from phone +- View replay as slideshow of keyframes +- Track sandbox resource usage remotely + +#### Codex SDK Runtime (Code runtimes, no GUI) +- Codex SDK tasks route to regular workers, not sandbox workers +- Sandbox not needed — Codex SDK runs locally or on a standard worker +- The `RuntimeCapabilities.supports_gui_control` flag tells Codra not to attempt remote desktop on Codex tasks + +#### Claude Code / OpenCode / Pi / Hermes Runtimes (Code runtimes, no GUI) +- These CLI tools are code-only; they don't need sandbox or computer-use capabilities +- They can still be run inside a sandbox (GenericSandbox) for isolation, without GUI +- The sandbox simply runs the CLI tool in the container — no Xvfb needed + +#### Future Cua-like Sandbox Runtime +- A new `cua-sandbox` crate implementing `CodraRuntime` with `ComputerUseAgent` category +- Uses Docker/Firecracker for isolation, Xvfb for display, CDP for browser control +- Streams screenshots as `ScreenshotFrame` events over peer-link +- Returns `TaskTrace` with full trajectory for replay +- Can be hosted on any Worker with Docker and GPU support + +### Priority Ladder for Remote Workers + +``` +Worker type Capabilities Deployed on +─────────────────────────────────────────────────────────────────────────────────── +Code Worker supports_sandbox: false LAN machine +(NativeCodraRuntime) supports_gui_control: false or cloud VM + +Sandbox Code Worker supports_sandbox: true Cloud VM with Docker +(GenericSandbox) supports_gui_control: false or bare metal + 
+Sandbox Computer-Use supports_sandbox: true GPU-enabled cloud VM +Worker (Cua-like RT) supports_gui_control: true or powerful workstation + supports_screenshot: true + supports_replay: true + supports_browser: true + +Sandbox Mobile Worker supports_sandbox: true KVM-enabled cloud VM +(MobileDeviceAgent) supports_mobile_device: true with Android emulator + supports_screenshot: true + supports_replay: true +``` + +### Next Steps for Sandbox/Computer-Use on Workers + +1. **Define types** — `ComputerUseAction`, `ScreenshotFrame`, `TaskTrace`, `ReplayRequest` in `codra-protocol` or `codra-runtime` +2. **Add capability flags** — `supports_gui_control`, `supports_sandbox`, etc. to `WorkerCapabilities` +3. **Create sandbox manager trait** — `ProvisionSandbox`, `DestroySandbox`, `GetSandboxStatus` in `codra-runtime` +4. **Implement Docker sandbox provider** — using `bollard` crate, container lifecycle management +5. **Add screenshot streaming** — extend peer-link protocol with binary screenshot frames +6. **Add replay endpoint** — `replay_task` on Worker that replays stored `TaskTrace` over peer-link +7. **Integrate with daemon** — `POST /api/sandboxes` → provision, stream events back +8. **UI: screenshot viewer** — desktop image rendering, CLI ascii fallback +9. **UI: replay viewer** — step through trajectories with keyboard controls + +## Worker Mode Daemon Configuration ```toml # ~/.codra/worker/config.toml diff --git a/docs/RUNTIME_ADAPTER_ARCHITECTURE.md b/docs/RUNTIME_ADAPTER_ARCHITECTURE.md index 53b4104..8ac253c 100644 --- a/docs/RUNTIME_ADAPTER_ARCHITECTURE.md +++ b/docs/RUNTIME_ADAPTER_ARCHITECTURE.md @@ -237,6 +237,16 @@ pub enum RuntimeCategory { /// Local LLM (Ollama, LM Studio, llama.cpp) with Codra's own agent loop. LocalModel, + + /// Computer-use agent runtime (Cua-like sandbox, browser automation, GUI control). + /// Can see screen state, click, type, run shell commands, and verify UI outcomes. + /// Runs in a sandboxed environment, not on the user's host directly. 
+ ComputerUseAgent, + + /// Sandboxed agent runtime (container/VM-isolated execution for risky tasks). + /// Provides full filesystem/network isolation from the user's main machine. + /// Often hosts a ComputerUseAgent inside the sandbox. + SandboxAgent, } ``` @@ -489,6 +499,21 @@ pub struct RuntimeCapabilities { pub supports_planning: bool, pub supports_verification: bool, pub supports_repair: bool, + + // ── Computer-Use & Sandbox Capabilities ───────────────── + /// Can observe and interact with the OS GUI (click, type, drag, scroll). + pub supports_gui_control: bool, + /// Can capture screen state as images (full-screen, window, region). + pub supports_screenshot: bool, + /// Can record and replay task trajectories (events + screenshots at each step). + pub supports_replay: bool, + /// Runs in an isolated sandbox (container/VM) with no host filesystem access. + pub supports_sandbox: bool, + /// Can launch and control a headless or headed web browser. + pub supports_browser: bool, + /// Can control mobile device emulators or physical devices (ADB, simulators). + pub supports_mobile_device: bool, + pub max_concurrent_sessions: usize, pub available_tools: Vec, pub model_info: Option, @@ -942,3 +967,244 @@ pub struct RuntimeFactoryInfo { | **API key propagation** | Multiple runtimes may need different API keys, stored in different places. | Unified `RuntimeConfig.api_key` + runtime-specific `extra` map. Providers handle key wrapping. | | **Backward compatibility** | Existing codra-core types (Task, TaskStatus, TaskEvent) duplicate runtime types. | Runtime types live in `codra-runtime`. Legacy codra-core types become one implementation (LocalModel runtime). Migration path: adapters map to legacy types where needed. | | **Feature disparity** | Not all runtimes support all capabilities (fork, clone, streaming). | `RuntimeCapabilities` struct lets consumers check before calling. Graceful fallback. 
| + +--- + +## Future: Computer-Use & Sandbox Runtimes + +### Design Principle: Code First, Computer-Use Later + +Codra's architecture treats **code runtimes** (Planner→Executor→Verifier, file edits, shell commands, git) as the primary execution path and **computer-use runtimes** (GUI automation, browser observation, screen interaction) as a future extension. This priority ladder guides all implementation choices: + +``` +Priority 1: Code runtimes (LocalAgent, CliAgent, CloudAgent, DirectModel, LocalModel) +Priority 2: Computer-use runtimes (ComputerUseAgent — see Cua architecture) +Priority 3: Sandbox runtimes (SandboxAgent — isolated execution environments) +Priority 4: Mobile device runtimes (MobileDeviceAgent — emulator/ADB control) +``` + +### What a Computer-Use Runtime Does + +A `ComputerUseAgent` runtime can: +- **Observe screen state** — capture full-screen, window, or region screenshots; read pixel data +- **Interact with GUI** — click, drag, scroll, type, press keys at OS level +- **Run shell commands** — within the sandbox environment (not the host) +- **Control browsers** — navigate, click elements, extract text via CDP or Playwright-style APIs +- **Verify UI outcomes** — compare screenshots against expected state, detect visual regressions +- **Record trajectories** — every action + resulting screenshot + model thought for replay + +Key difference from code runtimes: a computer-use runtime operates on **visual state**, not just file/process state. The agent must interpret pixels, not just text. 
+ +### What a Sandbox Runtime Does + +A `SandboxAgent` runtime wraps another runtime in an isolated environment: +- **Container isolation** (Docker/Podman) for Linux, **VM isolation** for cross-platform safety +- **No host filesystem access** — workspace is copied into the sandbox; results are extracted +- **Network policy** — can be restricted (no internet), bridged (limited ports), or open +- **Resource limits** — CPU, memory, disk, network bandwidth caps +- **Ephemeral by default** — containers destroyed after task completion unless snapshot is saved +- **Snapshot/resume** — save sandbox state mid-task for debugging or replay + +Sandboxes are the natural host for `ComputerUseAgent` runtimes, since GUI automation requires elevated OS access that shouldn't run directly on the user's machine. + +### Capability Flags + +The `RuntimeCapabilities` struct (defined above) includes six computer-use/sandbox flags: + +| Flag | Meaning | Example Runtimes | +|------|---------|-----------------| +| `supports_gui_control` | Can click, type, drag at OS level | Cua sandbox, WinAppDriver, Xvfb + xdotool | +| `supports_screenshot` | Can capture screen as image | Chromium CDP, Xvfb + ImageMagick, Windows GDI | +| `supports_replay` | Can record + replay task trajectories | Cua sandbox (step replay), Browserstack sessions | +| `supports_sandbox` | Runs in isolated container/VM | Docker executor, Firecracker microVM, QEMU | +| `supports_browser` | Can launch and control browser | Chromium CDP, Playwright, Selenium | +| `supports_mobile_device` | Controls emulator or physical device | Android ADB, iOS simulator, Browserstack device cloud | + +### Runtime Categories for Computer-Use & Sandbox + +```rust +pub enum RuntimeCategory { + // Existing: + LocalAgent, + CliAgent, + CloudAgent, + DirectModel, + LocalModel, + + // New: + /// Computer-use agent — observes and interacts with a GUI environment. + /// Can be hosted inside a sandbox for isolation. 
+ ComputerUseAgent, + + /// Sandboxed agent — wraps any runtime in an isolated container/VM. + /// The sandbox provides filesystem, network, and resource isolation. + /// Often paired with a ComputerUseAgent inside the sandbox. + SandboxAgent, +} +``` + +### Task Traces: Commands, Diffs, Approvals, Screenshots, Trajectories + +Codra task traces should evolve to include computer-use and sandbox artifacts: + +```rust +pub struct ComputerUseStep { + pub step_index: u32, + pub action: ComputerUseAction, // what the agent did + pub screenshot_before: Option, // base64 PNG before action + pub screenshot_after: Option, // base64 PNG after action + pub dom_snapshot: Option, // accessibility tree / DOM + pub model_thought: Option, // what the model was thinking + pub action_result: String, // success/failure/error + pub timestamp: String, +} + +pub struct TaskTrace { + pub task_id: String, + pub session_id: String, + pub runtime_category: RuntimeCategory, + + // Code runtime artifacts: + pub commands_run: Vec, + pub file_changes: Vec, + pub approvals: Vec, + + // Computer-use artifacts (when applicable): + pub computer_use_steps: Option>, + + // Sandbox artifacts (when applicable): + pub sandbox_id: Option, + pub sandbox_snapshot_path: Option, + + // Replay metadata: + pub total_steps: u32, + pub supports_replay: bool, + pub replay_format: Option, // "cua_trajectory_v1", "codra_trace_v1" +} +``` + +### Architecture: How Computer-Use & Sandbox Fit Into Each Surface + +#### Codra Desktop + +- **Runtime picker** shows computer-use and sandbox runtimes alongside code runtimes +- **Session pane** displays screenshots inline (before/after each action) +- **Replay viewer** lets users step through task trajectories frame-by-frame +- **Sandbox indicator** shows resource usage, isolation status, network policy +- Desktop itself never hosts a computer-use runtime — it connects to remote sandboxes + +#### Codra CLI/TUI + +- `codra sandbox create` — provision a sandbox (local Docker or 
remote worker) +- `codra sandbox attach ` — stream events from sandbox runtime +- `codra sandbox exec "command"` — run commands inside sandbox +- `codra replay ` — step through a recorded trajectory +- `codra screenshot ` — view screenshot at a specific step + +#### Codra Daemon + +- REST endpoints for sandbox lifecycle: `POST /api/sandboxes`, `DELETE /api/sandboxes/:id` +- REST endpoints for replay: `GET /api/tasks/:id/trajectory`, `GET /api/tasks/:id/replay` +- SSE streams include screenshot metadata alongside events +- Acts as gateway: desktops/CLIs talk to daemon, daemon talks to sandbox workers + +#### Android/Telegram Control Layer + +- Receive screenshot thumbnails in approval notifications +- Approve/reject GUI actions (click here, type this) from phone +- View task replay as a slideshow of screenshots +- Remotely start/stop sandbox runtimes + +#### Codex SDK Runtime + +- Codex SDK's `computer_use` tools map to `ComputerUseActionKind` (ClickTarget, TypeText, PressKey) +- Codex SDK's screenshot capabilities map to `ComputerUseStep.screenshot_*` +- The same `CodraRuntime` trait works — just with additional capability flags set + +#### Claude Code / OpenCode / Pi / Hermes Runtimes + +- These CLI tools don't support computer-use natively +- When used through Codra, the CLI adapter treats them as code-only (`supports_gui_control: false`) +- Users can still route their tasks into a sandbox that happens to use a different runtime + +#### Future Cua-like Sandbox Runtime + +A `ComputerUseAgent` runtime modeled after Cua's architecture would: + +``` +┌───────────────────────────────────────────────┐ +│ CuaSandboxRuntime │ +│ ┌─────────────────────────────────────────┐ │ +│ │ 1. Provision container │ │ +│ │ - Docker / Firecracker / QEMU │ │ +│ │ - Mount workspace copy │ │ +│ │ - Configure network policy │ │ +│ │ - Set resource limits │ │ +│ └─────────────────────────────────────────┘ │ +│ ┌─────────────────────────────────────────┐ │ +│ │ 2. 
Start Xvfb + window manager │ │ +│ │ - Virtual framebuffer (Xvfb) │ │ +│ │ - Lightweight WM (fluxbox, jwm) │ │ +│ │ - VNC or pipe screenshot stream │ │ +│ └─────────────────────────────────────────┘ │ +│ ┌─────────────────────────────────────────┐ │ +│ │ 3. Agent loop inside container │ │ +│ │ - LLM → thought → action → observe │ │ +│ │ - Actions: click, type, shell, wait │ │ +│ │ - Observation: screenshot + a11y tree│ │ +│ └─────────────────────────────────────────┘ │ +│ ┌─────────────────────────────────────────┐ │ +│ │ 4. Stream events + screenshots back │ │ +│ │ - Every step: screenshot + action │ │ +│ │ - Approval requests for risky ops │ │ +│ │ - On container exit: collect results │ │ +│ └─────────────────────────────────────────┘ │ +│ ┌─────────────────────────────────────────┐ │ +│ │ 5. Return artifacts │ │ +│ │ - TaskTrace with trajectory │ │ +│ │ - Screenshots at each step │ │ +│ │ - File diffs (if workspace modified) │ │ +│ │ - Replay-ready trajectory JSON │ │ +│ └─────────────────────────────────────────┘ │ +└───────────────────────────────────────────────┘ +``` + +The `ComputerUseAgent` and `SandboxAgent` runtime categories are **additive** to the existing architecture: +- They don't change the `CodraRuntime` trait — the same `submit_task`, `approve`, `stream_events` interface works +- They add new capability flags so consumers can decide what UI to show +- They introduce new data types (`ComputerUseStep`, `TaskTrace`) for task artifacts +- They enable one runtime to host another (`SandboxAgent` contains a `ComputerUseAgent`) + +### Planning vs. 
Computer-Use Priority + +``` +Current (MVP): + Code runtimes only + ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ + │LLA │ │CLI │ │Cloud│ │Model│ + └─────┘ └─────┘ └─────┘ └─────┘ + +Phase 2: + + Computer-use runtime (remote sandbox only) + ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌──────────┐ + │Code │ │CLI │ │Cloud│ │Model│ │CuaSandbox│ + └─────┘ └─────┘ └─────┘ └─────┘ └──────────┘ + │ + ┌────▼────┐ + │ Worker │ + │ (remote) │ + └─────────┘ + +Phase 3: + + Local sandbox (for offline/low-risk computer-use) + ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌──────────┐ ┌────────────┐ + │Code │ │CLI │ │Cloud│ │Model│ │CuaSandbox│ │LocalSandbox│ + └─────┘ └─────┘ └─────┘ └─────┘ └──────────┘ └────────────┘ + +Phase 4: + + Mobile device runtime + + Cross-runtime task handoff (code → sandbox → mobile) + ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌──────────┐ ┌────────────┐ ┌──────────┐ + │Code │ │CLI │ │Cloud│ │Model│ │CuaSandbox│ │LocalSandbox│ │MobileDev │ + └─────┘ └─────┘ └─────┘ └─────┘ └──────────┘ └────────────┘ └──────────┘ +```