# Source: OpenWinBot/agent.py (Alidmo/OpenWinBot, main branch)
#!/usr/bin/env python3
"""
OpenWinBot LLM agent — closes the perception-action loop using Claude.

The agent subscribes to win-observer (PUB), formats the semantic UI state into
a structured prompt, calls Claude with tool definitions, and relays each tool
call to win-actuator (REP). It loops until Claude emits a final answer or
the maximum iteration count is reached.

Requires:
    pip install anthropic pyzmq

Environment:
    ANTHROPIC_API_KEY must be set.

Standalone usage:
    python agent.py --window "Calculator" --task "Compute 9 + 3"
    python agent.py --window "Notepad"    --task "Type 'Hello World'"
    python agent.py --window "Paint"      --task "Describe the toolbar buttons"

Imported usage (from test_suite.py):
    from agent import run_agent
    result = run_agent(window="Calculator", task="Compute 9 + 3")
    print(result["summary"])
"""

import argparse
import json
import sys
import time
from typing import Any

import zmq
import anthropic

# ── Defaults ──────────────────────────────────────────────────────────────────

# Default endpoint for the observer SUB socket (run_agent's obs_addr default).
DEFAULT_OBS  = "tcp://localhost:5555"
# Default endpoint for the actuator REQ socket (run_agent's act_addr default).
DEFAULT_ACT  = "tcp://localhost:5556"
# Model name passed to client.messages.create unless overridden.
DEFAULT_MODEL = "claude-sonnet-4-6"
# Receive timeout in milliseconds, applied as zmq.RCVTIMEO to both sockets.
RECV_TIMEOUT_MS = 8_000

# ── Tool definitions (sent to Claude) ────────────────────────────────────────

# Tool schemas passed verbatim as `tools=` to client.messages.create.
# Each "name" must match a branch in execute_tool, which translates the call
# into an actuator request (or, for read_state, a read from the observer).
TOOLS = [
    # Click by absolute coordinates; execute_tool forwards x/y as action "click".
    {
        "name": "click",
        "description": (
            "Click at an absolute screen position. "
            "Use this when you have the exact pixel coordinates. "
            "For named elements prefer click_id — it is more reliable."
        ),
        "input_schema": {
            "type": "object",
            "properties": {
                "x": {"type": "integer", "description": "Screen X pixel"},
                "y": {"type": "integer", "description": "Screen Y pixel"},
            },
            "required": ["x", "y"],
        },
    },
    # Click by element id; execute_tool forwards the id as action "click_id".
    {
        "name": "click_id",
        "description": (
            "Click a UI element by its stable ID from the state message. "
            "The actuator resolves the centre automatically from its local state cache. "
            "This is the preferred way to click named elements — copy the id field exactly."
        ),
        "input_schema": {
            "type": "object",
            "properties": {
                "id": {"type": "string", "description": "Element id (16-char hex from the state message)"},
            },
            "required": ["id"],
        },
    },
    # Text entry; note execute_tool sends this to the actuator as action "type".
    {
        "name": "type_text",
        "description": "Type a string of text into the focused window using Unicode keystrokes.",
        "input_schema": {
            "type": "object",
            "properties": {
                "text": {"type": "string", "description": "Text to type"},
            },
            "required": ["text"],
        },
    },
    # Takes no inputs; execute_tool answers it from the observer socket, not the actuator.
    {
        "name": "read_state",
        "description": (
            "Fetch the latest UI state from win-observer. "
            "Call this after every action to observe its effect before deciding what to do next."
        ),
        "input_schema": {"type": "object", "properties": {}, "required": []},
    },
]

# ── State formatting ──────────────────────────────────────────────────────────

def format_state(state: dict) -> str:
    """
    Render a StateMessage dict as a compact, human-readable block suitable
    for inclusion in an LLM prompt.

    Elements are grouped by their ``role`` field: "informational" elements are
    listed under DISPLAY / STATUS, "actionable" elements are tabulated with
    their id, type, name and click centre, and elements with any other role
    are only counted.

    The complete element id is printed (not a prefix) so the model can copy it
    verbatim into ``click_id``. Missing or null ``type``/``name``/``id``/``rect``
    fields are rendered as placeholders instead of raising.

    Args:
        state: Decoded state message; the keys read are "elements", "window"
            and "timestamp", all optional.

    Returns:
        The formatted multi-line text block.
    """
    elements = state.get("elements") or []
    window = state.get("window", "?")
    ts = state.get("timestamp", 0)

    info = [e for e in elements if e.get("role") == "informational"]
    action = [e for e in elements if e.get("role") == "actionable"]
    omitted = len(elements) - len(info) - len(action)

    lines = [f"Window: {window!r}  (timestamp={ts})"]

    if info:
        lines.append("\n── DISPLAY / STATUS ──────────────────────────────")
        for e in info:
            flag = "" if e.get("is_enabled", True) else " [DISABLED]"
            kind = str(e.get("type") or "?")
            name = e.get("name") or ""
            lines.append(f"  {kind:<14} {name!r}{flag}")

    if action:
        lines.append("\n── ACTIONABLE ELEMENTS ───────────────────────────")
        lines.append(f"  {'ID':<16}  {'TYPE':<14}  {'NAME':<28}  CENTER")
        for e in action:
            # Full id on purpose: click_id needs the exact value, so a
            # truncated prefix would be useless to the model.
            elem_id = str(e.get("id") or "?")
            kind = str(e.get("type") or "?")
            # str(...) so a null name cannot break the width format spec.
            name = str(e.get("name") or "")
            rect = e.get("rect") or {}
            try:
                cx = rect["x"] + rect["w"] // 2
                cy = rect["y"] + rect["h"] // 2
                center = f"({cx},{cy})"
            except (KeyError, TypeError):
                # Geometry missing or malformed: still list the element.
                center = "(?,?)"
            dis = "" if e.get("is_enabled", True) else " [DISABLED]"
            lines.append(
                f"  {elem_id:<16}  {kind:<14}  {name:<28}  {center}{dis}"
            )

    if omitted:
        lines.append(f"\n  (+{omitted} structural elements omitted)")

    return "\n".join(lines)


# ── ZMQ helpers ───────────────────────────────────────────────────────────────

def recv_full_state(sub: zmq.Socket, max_attempts: int = 10) -> dict:
    """
    Return the newest available 'full' state message from the observer SUB socket.

    Frames that are already queued on the socket are drained first and the
    last full frame among them is returned. Only when no full frame is queued
    does the call block (up to the socket's RCVTIMEO per attempt) for new
    frames. Delta messages (if --delta is active on the observer) are skipped
    because the agent always wants a complete snapshot.

    Args:
        sub: Connected SUB socket with a receive timeout configured.
        max_attempts: Maximum number of blocking receives before giving up
            when only non-full frames arrive.

    Returns:
        The decoded full state message.

    Raises:
        TimeoutError: No frame arrived within the socket's receive timeout.
        RuntimeError: Only non-full frames arrived within ``max_attempts``.
    """
    # A SUB socket hands out queued frames oldest-first. While the agent was
    # busy (model call, actions) frames pile up, so a single blocking recv()
    # would return a snapshot taken *before* the action we want to observe.
    # Drain the backlog without blocking and keep only the newest full frame.
    # NOTE: the newest queued frame is still only as fresh as the observer's
    # last publish; it is not guaranteed to postdate the most recent action.
    newest = None
    while True:
        try:
            raw = sub.recv(zmq.NOBLOCK)
        except zmq.Again:
            break  # backlog exhausted
        data = json.loads(raw)
        if data.get("type", "full") == "full":
            newest = data
    if newest is not None:
        return newest

    # Nothing usable was queued: wait for the next frame(s).
    for _ in range(max_attempts):
        try:
            raw = sub.recv()
        except zmq.Again:
            raise TimeoutError(
                f"Timed out waiting for a state frame from the observer.\n"
                f"Is win-observer running and publishing to {DEFAULT_OBS}?"
            ) from None
        data = json.loads(raw)
        if data.get("type", "full") == "full":
            return data
    raise RuntimeError("Received only delta messages — restart observer without --delta "
                       "or wait for the next periodic full-frame broadcast.")


def send_action(req: zmq.Socket, action: dict) -> dict:
    """
    Send one action request to the actuator and return its decoded reply.

    Args:
        req: Connected REQ socket with a receive timeout configured.
        action: JSON-serialisable action payload.

    Returns:
        The actuator's reply, decoded from JSON.

    Raises:
        TimeoutError: No reply arrived within the socket's receive timeout.
            Raised instead of zmq.Again so that it matches the timeout
            signalling used by recv_full_state and is handled by the same
            ``except TimeoutError`` in the agent loop.
    """
    req.send_string(json.dumps(action))
    try:
        raw = req.recv()
    except zmq.Again:
        # After a missed reply a REQ socket cannot send again (strict
        # send/recv alternation), so the session has to be abandoned.
        raise TimeoutError(
            "Timed out waiting for a reply from the actuator.\n"
            "Is win-actuator running and reachable?"
        ) from None
    return json.loads(raw)


# ── Tool execution ────────────────────────────────────────────────────────────

def execute_tool(name: str, inputs: dict,
                 req: zmq.Socket, sub: zmq.Socket) -> Any:
    """
    Carry out a single tool call requested by the model.

    ``read_state`` is answered from the observer socket; the three action
    tools are translated into an actuator request, followed by a short pause
    so the UI can settle. Any other name yields an error dict.

    Args:
        name: Tool name as emitted by the model.
        inputs: Tool arguments as emitted by the model.
        req: REQ socket connected to the actuator.
        sub: SUB socket connected to the observer.

    Returns:
        The actuator's reply, ``{"state": <formatted text>}`` for read_state,
        or ``{"error": ...}`` for an unknown tool.
    """
    if name == "read_state":
        snapshot = recv_full_state(sub)
        return {"state": format_state(snapshot)}

    # Build the actuator payload and pick the settle delay for this tool.
    if name == "click":
        payload = {"action": "click", "x": inputs["x"], "y": inputs["y"]}
        settle = 0.3
    elif name == "click_id":
        payload = {"action": "click_id", "id": inputs["id"]}
        settle = 0.3
    elif name == "type_text":
        payload = {"action": "type", "text": inputs["text"]}
        settle = 0.2
    else:
        return {"error": f"unknown tool: {name}"}

    reply = send_action(req, payload)
    time.sleep(settle)   # let the UI settle
    return reply


# ── System prompt ─────────────────────────────────────────────────────────────

# System prompt sent as `system=` on every client.messages.create call.
# NOTE(review): rule 1 refers to the raw fields `role` and `is_enabled`, but
# the text produced by format_state conveys these as section headings and a
# "[DISABLED]" marker rather than as literal fields — confirm the wording is
# what the model should be told.
SYSTEM_PROMPT = """\
You are a Windows desktop automation agent powered by the OpenWinBot framework.

You receive a structured view of a window's UI elements and can interact with
them using the provided tools.

Rules:
1. Only interact with elements where role is "actionable" and is_enabled is true.
2. Prefer click_id over click — paste the element's id field exactly as shown.
3. After every action call read_state to observe the result before deciding next step.
4. When the task is complete, describe exactly what you did and what the final
   state of the relevant UI element shows (e.g., what number is on the display).
5. If an element you need is not visible, say so clearly rather than guessing.
"""


# ── Main agent loop ───────────────────────────────────────────────────────────

def run_agent(
    task:      str,
    window:    str,
    obs_addr:  str  = DEFAULT_OBS,
    act_addr:  str  = DEFAULT_ACT,
    model:     str  = DEFAULT_MODEL,
    max_iters: int  = 12,
    verbose:   bool = True,
) -> dict:
    """
    Run one agent session for the given task on the given window.

    Connects a SUB socket to the observer and a REQ socket to the actuator,
    sends the initial UI state plus the task to the model, then alternates
    between model calls and tool execution until the model ends its turn,
    stops for another reason, a timeout occurs, or ``max_iters`` is reached.

    Args:
        task: Natural-language task for the model.
        window: Target window title. Only shown in the banner here; the state
            is not filtered by it in this function.
        obs_addr: Observer PUB endpoint to subscribe to.
        act_addr: Actuator REP endpoint to send actions to.
        model: Model name passed to the Messages API.
        max_iters: Maximum number of model calls.
        verbose: Print progress to stdout.

    Returns:
        {"success": bool, "summary": str, "iterations": int}
        ``iterations`` is the number of model calls started before the
        session ended (``max_iters`` when the budget was exhausted).
    """
    client = anthropic.Anthropic()

    ctx = zmq.Context()

    sub = ctx.socket(zmq.SUB)
    sub.setsockopt(zmq.RCVTIMEO, RECV_TIMEOUT_MS)
    sub.setsockopt_string(zmq.SUBSCRIBE, "")
    sub.connect(obs_addr)

    req = ctx.socket(zmq.REQ)
    req.setsockopt(zmq.RCVTIMEO, RECV_TIMEOUT_MS)
    req.connect(act_addr)

    if verbose:
        print(f"\n{'='*60}")
        print(f"  Task   : {task}")
        print(f"  Window : {window}")
        print(f"  Model  : {model}")
        print(f"{'='*60}\n")

    # Tracked outside the try so the timeout handler can report progress.
    iteration = 0

    try:
        # ── Initial perception ────────────────────────────────────────────────
        if verbose:
            print("[agent] Fetching initial state...")
        state = recv_full_state(sub)
        state_text = format_state(state)

        if verbose:
            print("[agent] State received:\n" + state_text + "\n")

        messages = [
            {
                "role": "user",
                "content": (
                    f"Task: {task}\n\n"
                    f"Current UI state:\n{state_text}\n\n"
                    "Use the tools to complete the task. "
                    "Call read_state after each action to verify progress."
                ),
            }
        ]

        # ── Agentic loop ──────────────────────────────────────────────────────
        for iteration in range(1, max_iters + 1):
            if verbose:
                print(f"[agent] Iteration {iteration}/{max_iters}...")

            response = client.messages.create(
                model=model,
                max_tokens=2048,
                system=SYSTEM_PROMPT,
                tools=TOOLS,
                messages=messages,
            )

            messages.append({"role": "assistant", "content": response.content})

            # ── Task complete ─────────────────────────────────────────────────
            if response.stop_reason == "end_turn":
                summary = next(
                    (b.text for b in response.content if hasattr(b, "text")), ""
                )
                if verbose:
                    print(f"\n[agent] ✓ Done after {iteration} iteration(s).")
                    print(f"[agent] Summary: {summary}\n")
                return {"success": True, "summary": summary, "iterations": iteration}

            # ── Unexpected stop reason ────────────────────────────────────────
            # Report what actually happened (e.g. the token limit was hit)
            # instead of falling through to the "max iterations" message.
            if response.stop_reason != "tool_use":
                summary = (
                    f"Model stopped unexpectedly "
                    f"(stop_reason={response.stop_reason!r}) "
                    f"after {iteration} iteration(s)."
                )
                if verbose:
                    print(f"[agent] ✗ {summary}")
                return {"success": False, "summary": summary, "iterations": iteration}

            # ── Execute tool calls ────────────────────────────────────────────
            tool_results = []

            for block in response.content:
                if getattr(block, "type", None) != "tool_use":
                    continue

                if verbose:
                    print(f"[agent] Tool call: {block.name}({json.dumps(block.input)})")

                result = execute_tool(block.name, block.input, req, sub)
                result_json = json.dumps(result)

                if verbose:
                    print(f"[agent] Tool result: {result_json}")

                tool_results.append({
                    "type":        "tool_result",
                    "tool_use_id": block.id,
                    "content":     result_json,
                })

            messages.append({"role": "user", "content": tool_results})

        summary = f"Reached max iterations ({max_iters}) without completing task."
        if verbose:
            print(f"[agent] ✗ {summary}")
        return {"success": False, "summary": summary, "iterations": max_iters}

    except (TimeoutError, zmq.Again) as e:
        # The observer path reports timeouts as TimeoutError; a raw zmq.Again
        # reaching this point comes from waiting on the actuator's reply.
        if isinstance(e, zmq.Again):
            msg = f"Timed out waiting for a reply from the actuator at {act_addr}."
        else:
            msg = str(e)
        if verbose:
            print(f"[agent] ✗ Timeout: {msg}")
        return {"success": False, "summary": msg, "iterations": iteration}

    finally:
        # linger=0 discards any unsent request; with the default linger an
        # undelivered message (actuator not running) makes ctx.term() block.
        sub.close(linger=0)
        req.close(linger=0)
        ctx.term()


# ── CLI entry point ───────────────────────────────────────────────────────────

def main():
    """
    Command-line entry point: parse the arguments, run a single agent
    session, and exit with status 0 on success or 1 on failure.
    """
    cli = argparse.ArgumentParser(description="OpenWinBot LLM agent")
    cli.add_argument("--window", "-w", required=True, help="Target window title")
    cli.add_argument("--task", "-t", required=True, help="Task for the agent to complete")
    cli.add_argument("--obs", default=DEFAULT_OBS, help="Observer PUB address")
    cli.add_argument("--act", default=DEFAULT_ACT, help="Actuator REP address")
    cli.add_argument("--model", default=DEFAULT_MODEL)
    cli.add_argument("--max-iters", type=int, default=12)
    opts = cli.parse_args()

    # Map the parsed options onto run_agent's keyword parameters.
    session_kwargs = {
        "task": opts.task,
        "window": opts.window,
        "obs_addr": opts.obs,
        "act_addr": opts.act,
        "model": opts.model,
        "max_iters": opts.max_iters,
    }
    outcome = run_agent(**session_kwargs)

    if outcome["success"]:
        sys.exit(0)
    sys.exit(1)


# Run the CLI only when executed as a script, so importing run_agent from
# another module does not start a session.
if __name__ == "__main__":
    main()