From 8f496affa10613c2fbd3df8b26c7e6b133e764dd Mon Sep 17 00:00:00 2001
From: philipph-askui <philipp.hallgarten@askui.com>
Date: Tue, 9 Jun 2026 16:37:03 +0200
Subject: [PATCH] feat: add coordinate space abstraction for open weights LLM
 support

---
 src/askui/android_agent.py                    |   5 +-
 src/askui/computer_agent.py                   |   3 +-
 .../model_providers/ollama_vlm_provider.py    |  36 +++++
 .../model_providers/openai_vlm_provider.py    |  26 ++++
 src/askui/model_providers/vlm_provider.py     |  31 +++++
 src/askui/models/shared/__init__.py           |  10 ++
 src/askui/models/shared/coordinate_space.py   | 104 +++++++++++++++
 src/askui/tools/android/agent_os_facade.py    |  31 +++--
 src/askui/tools/computer_agent_os_facade.py   |  30 ++++-
 src/askui/tools/playwright/agent_os_facade.py |  28 +++-
 src/askui/web_agent.py                        |   5 +-
 .../test_ollama_vlm_provider.py               |  64 +++++++++
 .../test_openai_vlm_provider.py               | 125 ++++++++++++++++++
 13 files changed, 474 insertions(+), 24 deletions(-)
 create mode 100644 src/askui/models/shared/coordinate_space.py

diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py
index 98b79143..7b7818f8 100644
--- a/src/askui/android_agent.py
+++ b/src/askui/android_agent.py
@@ -87,7 +87,6 @@ def __init__(
     ) -> None:
         reporter = CompositeReporter(reporters=reporters)
         self.os = PpadbAgentOs(device_identifier=device, reporter=reporter)
-        self.act_agent_os_facade = AndroidAgentOsFacade(self.os)
         super().__init__(
             reporter=reporter,
             retry=retry,
@@ -97,6 +96,10 @@ def __init__(
             callbacks=callbacks,
             truncation_strategy=truncation_strategy,
         )
+        self.act_agent_os_facade = AndroidAgentOsFacade(
+            self.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
+        )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with Android-specific settings
         self.act_settings = ActSettings(
diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py
index ad0a6627..6e53df87 100644
--- a/src/askui/computer_agent.py
+++ b/src/askui/computer_agent.py
@@ -130,7 +130,8 @@ def __init__(
             truncation_strategy=truncation_strategy,
         )
         self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade(
-            self.tools.os
+            self.tools.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
         )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         # Override default act settings with computer-specific settings
diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py
index e06fa408..1cca3905 100644
--- a/src/askui/model_providers/ollama_vlm_provider.py
+++ b/src/askui/model_providers/ollama_vlm_provider.py
@@ -1,12 +1,23 @@
 """OllamaVlmProvider — VLM access via a local Ollama instance."""
 
 from openai import OpenAI
+from typing_extensions import override
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
 
 _DEFAULT_BASE_URL = "http://localhost:11434/v1"
 _DEFAULT_MODEL_ID = "qwen3.5"
 
+_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000)
+_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace()
+
 
 class OllamaVlmProvider(OpenAIVlmProvider):
     """VLM provider that routes requests to a local Ollama instance.
@@ -14,6 +25,11 @@ class OllamaVlmProvider(OpenAIVlmProvider):
     Thin convenience wrapper around `OpenAIVlmProvider` with Ollama
     defaults (``base_url``, ``api_key``, ``model_id``).
 
+    Qwen and Holo models are automatically detected and their coordinate
+    space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``.
+    Kimi models use ``NormalizedCoordinateSpace()``.
+    Pass ``coordinate_space`` explicitly to override auto-detection.
+
     Args:
         model_id (str, optional): Ollama model to use. Defaults to
             ``"qwen3.5"``.
@@ -21,6 +37,9 @@ class OllamaVlmProvider(OpenAIVlmProvider):
             API. Defaults to ``"http://localhost:11434/v1"``.
         client (`OpenAI` | None, optional): Pre-configured OpenAI client.
             If provided, ``base_url`` is ignored.
+        coordinate_space (VlmCoordinateSpace | None, optional): The coordinate
+            grid the model emits coordinates in.  ``None`` (the default)
+            enables auto-detection based on ``model_id``.
 
     Example:
         ```python
@@ -40,10 +59,27 @@ def __init__(
         model_id: str = _DEFAULT_MODEL_ID,
         base_url: str = _DEFAULT_BASE_URL,
         client: OpenAI | None = None,
+        coordinate_space: VlmCoordinateSpace | None = None,
     ) -> None:
+        self._coordinate_space_override = coordinate_space
         super().__init__(
             model_id=model_id,
             api_key="ollama",  # Ollama requires no auth; OpenAI SDK needs a value
             base_url=base_url,
             client=client,
+            coordinate_space=coordinate_space or PixelCoordinateSpace(),
         )
+
+    @property
+    @override
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        """Explicit override, else family found in the model id, else default."""
+        if self._coordinate_space_override is not None:
+            return self._coordinate_space_override
+        table = (
+            ("qwen", _QWEN_COORDINATE_SPACE),
+            ("holo", _HOLO_COORDINATE_SPACE),
+            ("kimi", _KIMI_COORDINATE_SPACE),
+        )
+        name = self._model_id_value.lower()
+        return next((cs for tag, cs in table if tag in name), self._coordinate_space)
diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py
index 47475cc7..8ac5f6a6 100644
--- a/src/askui/model_providers/openai_vlm_provider.py
+++ b/src/askui/model_providers/openai_vlm_provider.py
@@ -14,11 +14,17 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
 from askui.utils.model_pricing import ModelPricing
 
 _DEFAULT_MODEL_ID = "gpt-5.4"
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
 
 
 class OpenAIVlmProvider(VlmProvider):
@@ -36,6 +42,9 @@ class OpenAIVlmProvider(VlmProvider):
             to the OpenAI API (``https://api.openai.com/v1``).
         client (`OpenAI` | None, optional): Pre-configured OpenAI client.
             If provided, ``api_key`` and ``base_url`` are ignored.
+        coordinate_space (VlmCoordinateSpace, optional): The coordinate grid
+            the model emits coordinates in.  Defaults to
+            ``PixelCoordinateSpace()`` (native screenshot pixel coordinates).
 
     Example:
         ```python
@@ -57,6 +66,7 @@ def __init__(
         api_key: str | None = None,
         base_url: str | None = None,
         client: OpenAI | None = None,
+        coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE,
         input_cost_per_million_tokens: float | None = None,
         output_cost_per_million_tokens: float | None = None,
         cache_write_cost_per_million_tokens: float | None = None,
@@ -65,6 +75,7 @@ def __init__(
         self._model_id_value = (
             model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID
         )
+        self._coordinate_space = coordinate_space
         if client is not None:
             self._client = client
         else:
@@ -86,6 +97,11 @@ def __init__(
     def model_id(self) -> str:
         return self._model_id_value
 
+    @property
+    @override
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        return self._coordinate_space
+
     @property
     @override
     def pricing(self) -> ModelPricing | None:
@@ -96,6 +112,14 @@ def _messages_api(self) -> OpenAIMessagesApi:
         """Lazily initialise the `OpenAIMessagesApi` on first use."""
         return OpenAIMessagesApi(client=self._client)
 
+    @override
+    def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
+        """Return a new prompt with the coordinate-space section appended."""
+        coordinate_section = self.coordinate_space.build_prompt_section(
+            screenshot_resolution=SCREENSHOT_RESOLUTION,
+        )
+        return SystemPrompt(prompt=f"{system!s}\n\n{coordinate_section}")
+
     @override
     def create_message(
         self,
@@ -108,6 +132,8 @@ def create_message(
         temperature: float | None = None,
         provider_options: dict[str, Any] | None = None,
     ) -> MessageParam:
+        if system is not None:
+            system = self.augment_system_prompt(system)
         return self._messages_api.create_message(
             messages=messages,
             model_id=self._model_id_value,
diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py
index 1e98b972..6d4d9738 100644
--- a/src/askui/model_providers/vlm_provider.py
+++ b/src/askui/model_providers/vlm_provider.py
@@ -8,10 +8,16 @@
     ThinkingConfigParam,
     ToolChoiceParam,
 )
+from askui.models.shared.coordinate_space import (
+    PixelCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.prompts import SystemPrompt
 from askui.models.shared.tools import ToolCollection
 from askui.utils.model_pricing import ModelPricing
 
+_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace()
+
 
 class VlmProvider(ABC):
     """Interface for Vision Language Model providers.
@@ -44,6 +50,17 @@ class VlmProvider(ABC):
     def model_id(self) -> str:
         """The model identifier used by this provider."""
 
+    @property
+    def coordinate_space(self) -> VlmCoordinateSpace:
+        """The coordinate space this model emits coordinates in.
+
+        Returns a `VlmCoordinateSpace` describing the grid the model uses.
+        The default is `PixelCoordinateSpace` (native pixel coordinates).
+        Override in subclasses when the model uses a different grid
+        (e.g. ``ScaledCoordinateSpace(width=1000, height=1000)`` for Qwen).
+        """
+        return _DEFAULT_COORDINATE_SPACE
+
     @property
     def pricing(self) -> ModelPricing | None:
         """Pricing information for this provider's model.
@@ -53,6 +70,20 @@ def pricing(self) -> ModelPricing | None:
         """
         return None
 
+    def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt:
+        """Hook for providers to augment the system prompt before sending.
+
+        Not invoked automatically: a ``create_message()`` implementation
+        must call it itself.  The base implementation returns ``system``
+        unchanged.  Override it to inject provider-specific information
+        (e.g. coordinate bounds for non-Anthropic models).
+
+        Overrides should not mutate ``system``; they should build and
+        return a new ``SystemPrompt`` wrapping the augmented text.  The
+        base implementation returns the very same object it was given.
+        """
+        return system
+
     @abstractmethod
     def create_message(
         self,
diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py
index 4df27a7b..635fc053 100644
--- a/src/askui/models/shared/__init__.py
+++ b/src/askui/models/shared/__init__.py
@@ -1,5 +1,11 @@
 from .android_base_tool import AndroidBaseTool
 from .computer_base_tool import ComputerBaseTool
+from .coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+    VlmCoordinateSpace,
+)
 from .tool_tags import ToolTags
 
 try:
@@ -13,6 +19,10 @@
 __all__ = [
     "AndroidBaseTool",
     "ComputerBaseTool",
+    "NormalizedCoordinateSpace",
+    "PixelCoordinateSpace",
+    "ScaledCoordinateSpace",
+    "VlmCoordinateSpace",
     "ToolTags",
 ]
 
diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py
new file mode 100644
index 00000000..69696cdd
--- /dev/null
+++ b/src/askui/models/shared/coordinate_space.py
@@ -0,0 +1,104 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+
+from pydantic import BaseModel, Field
+
+# The resolution screenshots are scaled to before being sent to the model.
+# Used by all agent OS facades (computer, Android, Playwright).
+SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768)
+
+
+def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]:
+    """Return the prompt bullets shared by every coordinate space."""
+    width, height = screenshot_resolution
+    padding = (
+        "* Screenshots may contain black padding bars to preserve the original "
+        "aspect ratio. UI elements are NOT located in the padding area."
+    )
+    origin = "* Coordinate origin is the top-left corner (0, 0)"
+    return [f"* Screenshot resolution: {width}x{height} pixels", padding, origin]
+
+
+class VlmCoordinateSpace(BaseModel, ABC):
+    """Abstract pydantic model for a VLM coordinate convention.
+
+    A concrete subclass represents one grid a VLM may emit coordinates in.
+    It converts those coordinates to pixels (`map_to_target`) and renders
+    the prompt text announcing the grid (`build_prompt_section`).
+    """
+
+    @abstractmethod
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        """Convert model coordinates to integer pixels of *target_resolution*."""
+
+    @abstractmethod
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """Return prompt text telling the model which coordinate bounds to use."""
+
+
+class PixelCoordinateSpace(VlmCoordinateSpace):
+    """Coordinate space whose values are already screenshot pixels.
+
+    Used for models (e.g. Anthropic/Claude) that emit pixel coordinates.
+    """
+
+    def map_to_target(
+        self,
+        x: float,
+        y: float,
+        target_resolution: tuple[int, int],  # noqa: ARG002
+    ) -> tuple[int, int]:
+        """Truncate *x* and *y* to ints; *target_resolution* is ignored."""
+        return int(x), int(y)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """List the shared bullets followed by the pixel bounds."""
+        width, height = screenshot_resolution
+        bounds = f"* Coordinate bounds: 0 <= x < {width}, 0 <= y < {height}"
+        return "\n".join([*_common_prompt_lines(screenshot_resolution), bounds])
+
+
+class ScaledCoordinateSpace(VlmCoordinateSpace):
+    """Integer grid, e.g. 1000x1000 for Qwen, mapped by linear scaling."""
+
+    width: int = Field(gt=0, description="Width of the coordinate grid")
+    height: int = Field(gt=0, description="Height of the coordinate grid")
+
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        """Rescale linearly from the grid to *target_resolution*, truncating."""
+        target_w, target_h = target_resolution
+        return int(x * target_w / self.width), int(y * target_h / self.height)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        """Describe the grid, or plain pixel bounds if it equals the screenshot."""
+        if (self.width, self.height) == screenshot_resolution:
+            sw, sh = screenshot_resolution
+            grid_line = f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}"
+        else:
+            grid_line = (
+                f"* Emit coordinates in a {self.width}x{self.height} normalised "
+                f"grid: 0 <= x < {self.width}, 0 <= y < {self.height}"
+            )
+        return "\n".join([*_common_prompt_lines(screenshot_resolution), grid_line])
+
+
+class NormalizedCoordinateSpace(VlmCoordinateSpace):
+    """Float grid from 0.0 to 1.0 on both axes (Kimi). Declares no fields."""
+
+    def map_to_target(
+        self, x: float, y: float, target_resolution: tuple[int, int]
+    ) -> tuple[int, int]:
+        """Multiply by the target size and truncate; 1.0 maps to the full size."""
+        width, height = target_resolution
+        return int(x * width), int(y * height)
+
+    def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str:
+        floats_line = (
+            "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0"
+        )
+        return "\n".join([*_common_prompt_lines(screenshot_resolution), floats_line])
diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py
index f27d0eee..f0374036 100644
--- a/src/askui/tools/android/agent_os_facade.py
+++ b/src/askui/tools/android/agent_os_facade.py
@@ -2,6 +2,10 @@
 
 from PIL import Image
 
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
 from askui.tools.android.uiautomator_hierarchy import UIElementCollection
@@ -15,9 +19,14 @@ class AndroidAgentOsFacade(AndroidAgentOs):
     and back to the real screen resolution.
     """
 
-    def __init__(self, agent_os: AndroidAgentOs) -> None:
+    def __init__(
+        self,
+        agent_os: AndroidAgentOs,
+        coordinate_space: VlmCoordinateSpace,
+    ) -> None:
         self._agent_os: AndroidAgentOs = agent_os
-        self._target_resolution: Tuple[int, int] = (1024, 768)
+        self._target_resolution: Tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: Optional[Tuple[int, int]] = None
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
 
@@ -39,33 +48,39 @@ def screenshot(self) -> Image.Image:
 
     def _scale_coordinates(
         self,
-        x: int,
-        y: int,
+        x: float,
+        y: float,
         from_agent: bool = True,
     ) -> Tuple[int, int]:
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.screenshot().size
 
+        mapped_x, mapped_y = (
+            self._coordinate_space.map_to_target(x, y, self._target_resolution)
+            if from_agent
+            else (int(x), int(y))
+        )
+
         return scale_coordinates(
-            (x, y),
+            (mapped_x, mapped_y),
             self._real_screen_resolution,
             self._target_resolution,
             inverse=from_agent,
         )
 
-    def tap(self, x: int, y: int) -> None:
+    def tap(self, x: float, y: float) -> None:
         x, y = self._scale_coordinates(x, y)
         self._agent_os.tap(x, y)
 
     def swipe(
-        self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
+        self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000
     ) -> None:
         x1, y1 = self._scale_coordinates(x1, y1)
         x2, y2 = self._scale_coordinates(x2, y2)
         self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms)
 
     def drag_and_drop(
-        self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000
+        self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000
     ) -> None:
         x1, y1 = self._scale_coordinates(x1, y1)
         x2, y2 = self._scale_coordinates(x2, y2)
diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py
index 28a1a8c5..c91a2c84 100644
--- a/src/askui/tools/computer_agent_os_facade.py
+++ b/src/askui/tools/computer_agent_os_facade.py
@@ -2,6 +2,10 @@
 
 from PIL import Image
 
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.agent_os import (
     AgentOs,
@@ -36,9 +40,14 @@ class ComputerAgentOsFacade(AgentOs):
     and back to the real screen resolution.
     """
 
-    def __init__(self, agent_os: AgentOs) -> None:
+    def __init__(
+        self,
+        agent_os: AgentOs,
+        coordinate_space: VlmCoordinateSpace,
+    ) -> None:
         self._agent_os = agent_os
-        self._target_resolution: tuple[int, int] = (1024, 768)
+        self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: DisplaySize | None = None
         self.tags.append(ToolTags.SCALED_AGENT_OS.value)
 
@@ -57,7 +66,7 @@ def screenshot(self, report: bool = True) -> Image.Image:
         )
         return scale_image_to_fit(screenshot, self._target_resolution)
 
-    def mouse_move(self, x: int, y: int, duration: int = 500) -> None:
+    def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
         scaled_x, scaled_y = self._scale_coordinates_back(x, y)
         self._agent_os.mouse_move(scaled_x, scaled_y, duration)
 
@@ -68,7 +77,7 @@ def get_mouse_position(self) -> Coordinate:
         )
         return Coordinate(x=scaled_x, y=scaled_y)
 
-    def set_mouse_position(self, x: int, y: int) -> None:
+    def set_mouse_position(self, x: float, y: float) -> None:
         scaled_x, scaled_y = self._scale_coordinates_back(x, y)
         self._agent_os.set_mouse_position(scaled_x, scaled_y)
 
@@ -302,15 +311,22 @@ def remove_virtual_displays(self) -> None:
 
     def _scale_coordinates_back(
         self,
-        x: int,
-        y: int,
+        x: float,
+        y: float,
         from_agent: bool = True,
         check_coordinates_in_bounds: bool = True,
     ) -> tuple[int, int]:
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.retrieve_active_display().size
+
+        mapped_x, mapped_y = (
+            self._coordinate_space.map_to_target(x, y, self._target_resolution)
+            if from_agent
+            else (int(x), int(y))
+        )
+
         return scale_coordinates(
-            (x, y),
+            (mapped_x, mapped_y),
             (self._real_screen_resolution.width, self._real_screen_resolution.height),
             self._target_resolution,
             inverse=from_agent,
diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py
index 091ff804..5d6f7c42 100644
--- a/src/askui/tools/playwright/agent_os_facade.py
+++ b/src/askui/tools/playwright/agent_os_facade.py
@@ -2,6 +2,10 @@
 
 from PIL import Image
 
+from askui.models.shared.coordinate_space import (
+    SCREENSHOT_RESOLUTION,
+    VlmCoordinateSpace,
+)
 from askui.models.shared.tool_tags import ToolTags
 from askui.tools.agent_os import Display, ModifierKey, PcKey
 from askui.tools.playwright.agent_os import PlaywrightAgentOs
@@ -20,9 +24,14 @@ class PlaywrightAgentOsFacade(PlaywrightAgentOs):
         agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap.
     """
 
-    def __init__(self, agent_os: PlaywrightAgentOs) -> None:
+    def __init__(
+        self,
+        agent_os: PlaywrightAgentOs,
+        coordinate_space: VlmCoordinateSpace,
+    ) -> None:
         self._agent_os = agent_os
-        self._target_resolution: tuple[int, int] = (1024, 768)
+        self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION
+        self._coordinate_space: VlmCoordinateSpace = coordinate_space
         self._real_screen_resolution: tuple[int, int] | None = None
         self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value]
 
@@ -43,22 +52,29 @@ def screenshot(self, report: bool = True) -> Image.Image:
 
     def _scale_coordinates(
         self,
-        x: int,
-        y: int,
+        x: float,
+        y: float,
         from_agent: bool = True,
     ) -> tuple[int, int]:
         if self._real_screen_resolution is None:
             self._real_screen_resolution = self._agent_os.screenshot(
                 report=False,
             ).size
+
+        mapped_x, mapped_y = (
+            self._coordinate_space.map_to_target(x, y, self._target_resolution)
+            if from_agent
+            else (int(x), int(y))
+        )
+
         return scale_coordinates(
-            (x, y),
+            (mapped_x, mapped_y),
             self._real_screen_resolution,
             self._target_resolution,
             inverse=from_agent,
         )
 
-    def mouse_move(self, x: int, y: int, duration: int = 500) -> None:
+    def mouse_move(self, x: float, y: float, duration: int = 500) -> None:
         scaled_x, scaled_y = self._scale_coordinates(x, y)
         # scaled_x, scaled_y = x, y
         self._agent_os.mouse_move(scaled_x, scaled_y, duration)
diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py
index fe47c5f9..bc211ec9 100644
--- a/src/askui/web_agent.py
+++ b/src/askui/web_agent.py
@@ -60,7 +60,6 @@ def __init__(
     ) -> None:
         reporter = CompositeReporter(reporters=reporters)
         self.os = PlaywrightAgentOs(reporter)
-        self.act_agent_os_facade = PlaywrightAgentOsFacade(self.os)
         super().__init__(
             reporter=reporter,
             retry=retry,
@@ -70,6 +69,10 @@ def __init__(
             callbacks=callbacks,
             truncation_strategy=truncation_strategy,
         )
+        self.act_agent_os_facade = PlaywrightAgentOsFacade(
+            self.os,
+            coordinate_space=self._vlm_provider.coordinate_space,
+        )
         self.act_tool_collection.add_agent_os(self.act_agent_os_facade)
         self.act_settings = ActSettings(
             messages=MessageSettings(
diff --git a/tests/unit/model_providers/test_ollama_vlm_provider.py b/tests/unit/model_providers/test_ollama_vlm_provider.py
index 143e7c35..e3f78ef5 100644
--- a/tests/unit/model_providers/test_ollama_vlm_provider.py
+++ b/tests/unit/model_providers/test_ollama_vlm_provider.py
@@ -6,6 +6,11 @@
 
 from askui.model_providers.ollama_vlm_provider import OllamaVlmProvider
 from askui.models.shared.agent_message_param import MessageParam
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+)
 
 
 class TestOllamaVlmProvider:
@@ -48,3 +53,62 @@ def test_create_message_delegates_to_messages_api(self) -> None:
 
         mock_client.chat.completions.create.assert_called_once()
         assert result.role == "assistant"
+
+    def test_coordinate_space_auto_detects_qwen(self) -> None:
+        provider = OllamaVlmProvider(model_id="qwen3.5")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_coordinate_space_auto_detects_qwen_case_insensitive(self) -> None:
+        provider = OllamaVlmProvider(model_id="Qwen2-VL")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_coordinate_space_auto_detects_kimi(self) -> None:
+        provider = OllamaVlmProvider(model_id="kimi-vl")
+        assert provider.coordinate_space == NormalizedCoordinateSpace()
+
+    def test_coordinate_space_auto_detects_kimi_case_insensitive(self) -> None:
+        provider = OllamaVlmProvider(model_id="Kimi-VL-A3B")
+        assert provider.coordinate_space == NormalizedCoordinateSpace()
+
+    def test_coordinate_space_default_for_non_qwen(self) -> None:
+        provider = OllamaVlmProvider(model_id="llava")
+        assert provider.coordinate_space == PixelCoordinateSpace()
+
+    def test_coordinate_space_explicit_override(self) -> None:
+        provider = OllamaVlmProvider(
+            model_id="llava",
+            coordinate_space=ScaledCoordinateSpace(width=500, height=500),
+        )
+        assert provider.coordinate_space == ScaledCoordinateSpace(width=500, height=500)
+
+    def test_coordinate_space_explicit_override_takes_precedence(self) -> None:
+        provider = OllamaVlmProvider(
+            model_id="qwen3.5",
+            coordinate_space=ScaledCoordinateSpace(width=2000, height=2000),
+        )
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=2000, height=2000
+        )
+
+    def test_coordinate_space_explicit_pixel_overrides_qwen_auto_detect(self) -> None:
+        provider = OllamaVlmProvider(
+            model_id="qwen3.5",
+            coordinate_space=PixelCoordinateSpace(),
+        )
+        assert provider.coordinate_space == PixelCoordinateSpace()
+
+    def test_coordinate_space_auto_detects_holo(self) -> None:
+        provider = OllamaVlmProvider(model_id="holo3.1-35b-a3b")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_coordinate_space_auto_detects_holo_case_insensitive(self) -> None:
+        provider = OllamaVlmProvider(model_id="Holo-3.1-4B")
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py
index d51ff74b..7a5a2a87 100644
--- a/tests/unit/model_providers/test_openai_vlm_provider.py
+++ b/tests/unit/model_providers/test_openai_vlm_provider.py
@@ -6,6 +6,12 @@
 
 from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider
 from askui.models.shared.agent_message_param import MessageParam
+from askui.models.shared.coordinate_space import (
+    NormalizedCoordinateSpace,
+    PixelCoordinateSpace,
+    ScaledCoordinateSpace,
+)
+from askui.models.shared.prompts import SystemPrompt
 
 
 class TestOpenAIVlmProvider:
@@ -41,3 +47,122 @@ def test_create_message_delegates_to_messages_api(self) -> None:
 
         mock_client.chat.completions.create.assert_called_once()
         assert result.role == "assistant"
+
+    def test_coordinate_space_defaults_to_pixel(self) -> None:
+        provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test")
+        assert provider.coordinate_space == PixelCoordinateSpace()
+
+    def test_coordinate_space_passthrough(self) -> None:
+        provider = OpenAIVlmProvider(
+            model_id="gpt-4o",
+            api_key="sk-test",
+            coordinate_space=ScaledCoordinateSpace(width=1000, height=1000),
+        )
+        assert provider.coordinate_space == ScaledCoordinateSpace(
+            width=1000, height=1000
+        )
+
+    def test_augment_system_prompt_scaled_coordinate_space(self) -> None:
+        provider = OpenAIVlmProvider(
+            model_id="gpt-4o",
+            api_key="sk-test",
+            coordinate_space=ScaledCoordinateSpace(width=1000, height=1000),
+        )
+        system = SystemPrompt(prompt="You are a helpful assistant.")
+        augmented = provider.augment_system_prompt(system)
+
+        rendered = str(augmented)
+        assert "You are a helpful assistant." in rendered
+        assert "1000x1000 normalised grid" in rendered
+        assert "1024x768" in rendered
+
+    def test_augment_system_prompt_pixel_bounds_when_matching(self) -> None:
+        provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test")
+        system = SystemPrompt(prompt="Base prompt.")
+        augmented = provider.augment_system_prompt(system)
+
+        rendered = str(augmented)
+        assert "normalised grid" not in rendered
+        assert "0 <= x < 1024" in rendered
+
+
+class TestPixelCoordinateSpacePrompt:
+    def test_shows_pixel_bounds(self) -> None:
+        cs = PixelCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "0 <= x < 1024" in result
+        assert "0 <= y < 768" in result
+        assert "normalised grid" not in result
+
+    def test_includes_padding_and_origin_info(self) -> None:
+        cs = PixelCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "black padding" in result
+        assert "top-left" in result
+
+
+class TestScaledCoordinateSpacePrompt:
+    def test_shows_normalised_grid(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        result = cs.build_prompt_section((1024, 768))
+        assert "1024x768" in result
+        assert "1000x1000 normalised grid" in result
+        assert "0 <= x < 1000" in result
+        assert "0 <= y < 1000" in result
+
+    def test_matching_resolution_shows_pixel_bounds(self) -> None:
+        cs = ScaledCoordinateSpace(width=1024, height=768)
+        result = cs.build_prompt_section((1024, 768))
+        assert "0 <= x < 1024" in result
+        assert "normalised grid" not in result
+
+    def test_includes_padding_and_origin_info(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        result = cs.build_prompt_section((1024, 768))
+        assert "black padding" in result
+        assert "top-left" in result
+
+
+class TestNormalizedCoordinateSpacePrompt:
+    def test_shows_normalised_floats(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "0.0 <= x <= 1.0" in result
+        assert "0.0 <= y <= 1.0" in result
+        assert "normalised floats" in result
+
+    def test_includes_padding_and_origin_info(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        result = cs.build_prompt_section((1024, 768))
+        assert "black padding" in result
+        assert "top-left" in result
+
+
+class TestMapToTarget:
+    def test_pixel_identity(self) -> None:
+        cs = PixelCoordinateSpace()
+        assert cs.map_to_target(512, 384, (1024, 768)) == (512, 384)
+
+    def test_pixel_truncates_floats(self) -> None:
+        cs = PixelCoordinateSpace()
+        assert cs.map_to_target(512.7, 384.3, (1024, 768)) == (512, 384)
+
+    def test_scaled_maps_correctly(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        assert cs.map_to_target(500, 500, (1024, 768)) == (512, 384)
+
+    def test_scaled_zero(self) -> None:
+        cs = ScaledCoordinateSpace(width=1000, height=1000)
+        assert cs.map_to_target(0, 0, (1024, 768)) == (0, 0)
+
+    def test_normalized_maps_correctly(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        assert cs.map_to_target(0.5, 0.5, (1024, 768)) == (512, 384)
+
+    def test_normalized_zero(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        assert cs.map_to_target(0.0, 0.0, (1024, 768)) == (0, 0)
+
+    def test_normalized_one(self) -> None:
+        cs = NormalizedCoordinateSpace()
+        assert cs.map_to_target(1.0, 1.0, (1024, 768)) == (1024, 768)