From 8f496affa10613c2fbd3df8b26c7e6b133e764dd Mon Sep 17 00:00:00 2001 From: philipph-askui Date: Tue, 9 Jun 2026 16:37:03 +0200 Subject: [PATCH] feat: add coordinate space abstraction for open weights LLM support --- src/askui/android_agent.py | 5 +- src/askui/computer_agent.py | 3 +- .../model_providers/ollama_vlm_provider.py | 36 +++++ .../model_providers/openai_vlm_provider.py | 26 ++++ src/askui/model_providers/vlm_provider.py | 31 +++++ src/askui/models/shared/__init__.py | 10 ++ src/askui/models/shared/coordinate_space.py | 104 +++++++++++++++ src/askui/tools/android/agent_os_facade.py | 31 +++-- src/askui/tools/computer_agent_os_facade.py | 30 ++++- src/askui/tools/playwright/agent_os_facade.py | 28 +++- src/askui/web_agent.py | 5 +- .../test_ollama_vlm_provider.py | 64 +++++++++ .../test_openai_vlm_provider.py | 125 ++++++++++++++++++ 13 files changed, 474 insertions(+), 24 deletions(-) create mode 100644 src/askui/models/shared/coordinate_space.py diff --git a/src/askui/android_agent.py b/src/askui/android_agent.py index 98b79143..7b7818f8 100644 --- a/src/askui/android_agent.py +++ b/src/askui/android_agent.py @@ -87,7 +87,6 @@ def __init__( ) -> None: reporter = CompositeReporter(reporters=reporters) self.os = PpadbAgentOs(device_identifier=device, reporter=reporter) - self.act_agent_os_facade = AndroidAgentOsFacade(self.os) super().__init__( reporter=reporter, retry=retry, @@ -97,6 +96,10 @@ def __init__( callbacks=callbacks, truncation_strategy=truncation_strategy, ) + self.act_agent_os_facade = AndroidAgentOsFacade( + self.os, + coordinate_space=self._vlm_provider.coordinate_space, + ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with Android-specific settings self.act_settings = ActSettings( diff --git a/src/askui/computer_agent.py b/src/askui/computer_agent.py index ad0a6627..6e53df87 100644 --- a/src/askui/computer_agent.py +++ b/src/askui/computer_agent.py @@ -130,7 +130,8 @@ def __init__( truncation_strategy=truncation_strategy, ) self.act_agent_os_facade: ComputerAgentOsFacade = ComputerAgentOsFacade( - self.tools.os + self.tools.os, + coordinate_space=self._vlm_provider.coordinate_space, ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) # Override default act settings with computer-specific settings diff --git a/src/askui/model_providers/ollama_vlm_provider.py b/src/askui/model_providers/ollama_vlm_provider.py index e06fa408..1cca3905 100644 --- a/src/askui/model_providers/ollama_vlm_provider.py +++ b/src/askui/model_providers/ollama_vlm_provider.py @@ -1,12 +1,23 @@ """OllamaVlmProvider — VLM access via a local Ollama instance.""" from openai import OpenAI +from typing_extensions import override from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) _DEFAULT_BASE_URL = "http://localhost:11434/v1" _DEFAULT_MODEL_ID = "qwen3.5" +_QWEN_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) +_HOLO_COORDINATE_SPACE = ScaledCoordinateSpace(width=1000, height=1000) +_KIMI_COORDINATE_SPACE = NormalizedCoordinateSpace() + class OllamaVlmProvider(OpenAIVlmProvider): """VLM provider that routes requests to a local Ollama instance. @@ -14,6 +25,11 @@ class OllamaVlmProvider(OpenAIVlmProvider): Thin convenience wrapper around `OpenAIVlmProvider` with Ollama defaults (``base_url``, ``api_key``, ``model_id``). + Qwen and Holo models are automatically detected and their coordinate + space is set to ``ScaledCoordinateSpace(width=1000, height=1000)``. + Kimi models use ``NormalizedCoordinateSpace()``. + Pass ``coordinate_space`` explicitly to override auto-detection. + Args: model_id (str, optional): Ollama model to use. Defaults to ``"qwen3.5"``. @@ -21,6 +37,9 @@ class OllamaVlmProvider(OpenAIVlmProvider): API. Defaults to ``"http://localhost:11434/v1"``. client (`OpenAI` | None, optional): Pre-configured OpenAI client. If provided, ``base_url`` is ignored. + coordinate_space (VlmCoordinateSpace | None, optional): The coordinate + grid the model emits coordinates in. ``None`` (the default) + enables auto-detection based on ``model_id``. Example: ```python @@ -40,10 +59,27 @@ def __init__( model_id: str = _DEFAULT_MODEL_ID, base_url: str = _DEFAULT_BASE_URL, client: OpenAI | None = None, + coordinate_space: VlmCoordinateSpace | None = None, ) -> None: + self._coordinate_space_override = coordinate_space super().__init__( model_id=model_id, api_key="ollama", # Ollama requires no auth; OpenAI SDK needs a value base_url=base_url, client=client, + coordinate_space=coordinate_space or PixelCoordinateSpace(), ) + + @property + @override + def coordinate_space(self) -> VlmCoordinateSpace: + if self._coordinate_space_override is not None: + return self._coordinate_space_override + model_lower = self._model_id_value.lower() + if "qwen" in model_lower: + return _QWEN_COORDINATE_SPACE + if "holo" in model_lower: + return _HOLO_COORDINATE_SPACE + if "kimi" in model_lower: + return _KIMI_COORDINATE_SPACE + return self._coordinate_space diff --git a/src/askui/model_providers/openai_vlm_provider.py b/src/askui/model_providers/openai_vlm_provider.py index 47475cc7..8ac5f6a6 100644 --- a/src/askui/model_providers/openai_vlm_provider.py +++ b/src/askui/model_providers/openai_vlm_provider.py @@ -14,11 +14,17 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + PixelCoordinateSpace, + VlmCoordinateSpace, +) from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection from askui.utils.model_pricing import ModelPricing _DEFAULT_MODEL_ID = "gpt-5.4" +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() class OpenAIVlmProvider(VlmProvider): @@ -36,6 +42,9 @@ class OpenAIVlmProvider(VlmProvider): to the OpenAI API (``https://api.openai.com/v1``). client (`OpenAI` | None, optional): Pre-configured OpenAI client. If provided, ``api_key`` and ``base_url`` are ignored. + coordinate_space (VlmCoordinateSpace, optional): The coordinate grid + the model emits coordinates in. Defaults to the screenshot + resolution (native pixel coordinates). Example: ```python @@ -57,6 +66,7 @@ def __init__( api_key: str | None = None, base_url: str | None = None, client: OpenAI | None = None, + coordinate_space: VlmCoordinateSpace = _DEFAULT_COORDINATE_SPACE, input_cost_per_million_tokens: float | None = None, output_cost_per_million_tokens: float | None = None, cache_write_cost_per_million_tokens: float | None = None, @@ -65,6 +75,7 @@ def __init__( self._model_id_value = ( model_id or os.environ.get("VLM_PROVIDER_MODEL_ID") or _DEFAULT_MODEL_ID ) + self._coordinate_space = coordinate_space if client is not None: self._client = client else: @@ -86,6 +97,11 @@ def __init__( def model_id(self) -> str: return self._model_id_value + @property + @override + def coordinate_space(self) -> VlmCoordinateSpace: + return self._coordinate_space + @property @override def pricing(self) -> ModelPricing | None: @@ -96,6 +112,14 @@ def _messages_api(self) -> OpenAIMessagesApi: """Lazily initialise the `OpenAIMessagesApi` on first use.""" return OpenAIMessagesApi(client=self._client) + @override + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: + """Append coordinate and resolution info to the system prompt.""" + coord_info = self.coordinate_space.build_prompt_section( + screenshot_resolution=SCREENSHOT_RESOLUTION, + ) + return SystemPrompt(prompt=f"{str(system)}\n\n{coord_info}") + @override def create_message( self, @@ -108,6 +132,8 @@ def create_message( temperature: float | None = None, provider_options: dict[str, Any] | None = None, ) -> MessageParam: + if system is not None: + system = self.augment_system_prompt(system) return self._messages_api.create_message( messages=messages, model_id=self._model_id_value, diff --git a/src/askui/model_providers/vlm_provider.py b/src/askui/model_providers/vlm_provider.py index 1e98b972..6d4d9738 100644 --- a/src/askui/model_providers/vlm_provider.py +++ b/src/askui/model_providers/vlm_provider.py @@ -8,10 +8,16 @@ ThinkingConfigParam, ToolChoiceParam, ) +from askui.models.shared.coordinate_space import ( + PixelCoordinateSpace, + VlmCoordinateSpace, +) from askui.models.shared.prompts import SystemPrompt from askui.models.shared.tools import ToolCollection from askui.utils.model_pricing import ModelPricing +_DEFAULT_COORDINATE_SPACE = PixelCoordinateSpace() + class VlmProvider(ABC): """Interface for Vision Language Model providers. @@ -44,6 +50,17 @@ class VlmProvider(ABC): def model_id(self) -> str: """The model identifier used by this provider.""" + @property + def coordinate_space(self) -> VlmCoordinateSpace: + """The coordinate space this model emits coordinates in. + + Returns a `VlmCoordinateSpace` describing the grid the model uses. + The default is `PixelCoordinateSpace` (native pixel coordinates). + Override in subclasses when the model uses a different grid + (e.g. ``ScaledCoordinateSpace(1000, 1000)`` for Qwen). + """ + return _DEFAULT_COORDINATE_SPACE + @property def pricing(self) -> ModelPricing | None: """Pricing information for this provider's model. @@ -53,6 +70,20 @@ def pricing(self) -> ModelPricing | None: """ return None + def augment_system_prompt(self, system: SystemPrompt) -> SystemPrompt: + """Hook for providers to augment the system prompt before sending. + + Called by ``create_message()`` implementations. The base + implementation returns the prompt unchanged. Override in + subclasses that need to inject provider-specific information + (e.g. coordinate bounds for non-Anthropic models). + + The original ``SystemPrompt`` object is **not** mutated — + implementations should create a new ``SystemPrompt`` wrapping + the augmented text. + """ + return system + @abstractmethod def create_message( self, diff --git a/src/askui/models/shared/__init__.py b/src/askui/models/shared/__init__.py index 4df27a7b..635fc053 100644 --- a/src/askui/models/shared/__init__.py +++ b/src/askui/models/shared/__init__.py @@ -1,5 +1,11 @@ from .android_base_tool import AndroidBaseTool from .computer_base_tool import ComputerBaseTool +from .coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, + VlmCoordinateSpace, +) from .tool_tags import ToolTags try: @@ -13,6 +19,10 @@ __all__ = [ "AndroidBaseTool", "ComputerBaseTool", + "NormalizedCoordinateSpace", + "PixelCoordinateSpace", + "ScaledCoordinateSpace", + "VlmCoordinateSpace", "ToolTags", ] diff --git a/src/askui/models/shared/coordinate_space.py b/src/askui/models/shared/coordinate_space.py new file mode 100644 index 00000000..69696cdd --- /dev/null +++ b/src/askui/models/shared/coordinate_space.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from pydantic import BaseModel, Field + +# The resolution screenshots are scaled to before being sent to the model. +# Used by all agent OS facades (computer, Android, Playwright). +SCREENSHOT_RESOLUTION: tuple[int, int] = (1024, 768) + + +def _common_prompt_lines(screenshot_resolution: tuple[int, int]) -> list[str]: + sw, sh = screenshot_resolution + return [ + f"* Screenshot resolution: {sw}x{sh} pixels", + "* Screenshots may contain black padding bars to preserve the " + "original aspect ratio. UI elements are NOT located in the " + "padding area.", + "* Coordinate origin is the top-left corner (0, 0)", + ] + + +class VlmCoordinateSpace(BaseModel, ABC): + """Abstract base for VLM coordinate conventions. + + Each subclass describes one coordinate grid a VLM may emit and knows + how to map those coordinates back to pixel space and how to render + the matching prompt section. + """ + + @abstractmethod + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + """Map model coordinates to pixel coordinates in *target_resolution*.""" + + @abstractmethod + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + """Build prompt text describing coordinate bounds for the model.""" + + +class PixelCoordinateSpace(VlmCoordinateSpace): + """Identity mapping -- coordinates already in pixel space. + + Used by Anthropic/Claude which emit coordinates matching the + screenshot resolution. + """ + + def map_to_target( + self, + x: float, + y: float, + target_resolution: tuple[int, int], # noqa: ARG002 + ) -> tuple[int, int]: + return int(x), int(y) + + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + sw, sh = screenshot_resolution + lines = _common_prompt_lines(screenshot_resolution) + lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}") + return "\n".join(lines) + + +class ScaledCoordinateSpace(VlmCoordinateSpace): + """Integer grid (e.g. 1000x1000 for Qwen). Linear scaling.""" + + width: int = Field(gt=0, description="Width of the coordinate grid") + height: int = Field(gt=0, description="Height of the coordinate grid") + + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + tw, th = target_resolution + return int(x * tw / self.width), int(y * th / self.height) + + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + lines = _common_prompt_lines(screenshot_resolution) + if (self.width, self.height) != screenshot_resolution: + lines.append( + f"* Emit coordinates in a {self.width}x{self.height} " + f"normalised grid: 0 <= x < {self.width}, " + f"0 <= y < {self.height}" + ) + else: + sw, sh = screenshot_resolution + lines.append(f"* Coordinate bounds: 0 <= x < {sw}, 0 <= y < {sh}") + return "\n".join(lines) + + +class NormalizedCoordinateSpace(VlmCoordinateSpace): + """0.0-1.0 float grid (Kimi). No fields.""" + + def map_to_target( + self, x: float, y: float, target_resolution: tuple[int, int] + ) -> tuple[int, int]: + tw, th = target_resolution + return int(x * tw), int(y * th) + + def build_prompt_section(self, screenshot_resolution: tuple[int, int]) -> str: + lines = _common_prompt_lines(screenshot_resolution) + lines.append( + "* Emit coordinates as normalised floats: 0.0 <= x <= 1.0, 0.0 <= y <= 1.0" + ) + return "\n".join(lines) diff --git a/src/askui/tools/android/agent_os_facade.py b/src/askui/tools/android/agent_os_facade.py index f27d0eee..f0374036 100644 --- a/src/askui/tools/android/agent_os_facade.py +++ b/src/askui/tools/android/agent_os_facade.py @@ -2,6 +2,10 @@ from PIL import Image +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + VlmCoordinateSpace, +) from askui.models.shared.tool_tags import ToolTags from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay from askui.tools.android.uiautomator_hierarchy import UIElementCollection @@ -15,9 +19,14 @@ class AndroidAgentOsFacade(AndroidAgentOs): and back to the real screen resolution. """ - def __init__(self, agent_os: AndroidAgentOs) -> None: + def __init__( + self, + agent_os: AndroidAgentOs, + coordinate_space: VlmCoordinateSpace, + ) -> None: self._agent_os: AndroidAgentOs = agent_os - self._target_resolution: Tuple[int, int] = (1024, 768) + self._target_resolution: Tuple[int, int] = SCREENSHOT_RESOLUTION + self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: Optional[Tuple[int, int]] = None self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] @@ -39,33 +48,39 @@ def screenshot(self) -> Image.Image: def _scale_coordinates( self, - x: int, - y: int, + x: float, + y: float, from_agent: bool = True, ) -> Tuple[int, int]: if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.screenshot().size + mapped_x, mapped_y = ( + self._coordinate_space.map_to_target(x, y, self._target_resolution) + if from_agent + else (int(x), int(y)) + ) + return scale_coordinates( - (x, y), + (mapped_x, mapped_y), self._real_screen_resolution, self._target_resolution, inverse=from_agent, ) - def tap(self, x: int, y: int) -> None: + def tap(self, x: float, y: float) -> None: x, y = self._scale_coordinates(x, y) self._agent_os.tap(x, y) def swipe( - self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000 + self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: x1, y1 = self._scale_coordinates(x1, y1) x2, y2 = self._scale_coordinates(x2, y2) self._agent_os.swipe(x1, y1, x2, y2, duration_in_ms) def drag_and_drop( - self, x1: int, y1: int, x2: int, y2: int, duration_in_ms: int = 1000 + self, x1: float, y1: float, x2: float, y2: float, duration_in_ms: int = 1000 ) -> None: x1, y1 = self._scale_coordinates(x1, y1) x2, y2 = self._scale_coordinates(x2, y2) diff --git a/src/askui/tools/computer_agent_os_facade.py b/src/askui/tools/computer_agent_os_facade.py index 28a1a8c5..c91a2c84 100644 --- a/src/askui/tools/computer_agent_os_facade.py +++ b/src/askui/tools/computer_agent_os_facade.py @@ -2,6 +2,10 @@ from PIL import Image +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + VlmCoordinateSpace, +) from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import ( AgentOs, @@ -36,9 +40,14 @@ class ComputerAgentOsFacade(AgentOs): and back to the real screen resolution. """ - def __init__(self, agent_os: AgentOs) -> None: + def __init__( + self, + agent_os: AgentOs, + coordinate_space: VlmCoordinateSpace, + ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = (1024, 768) + self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION + self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: DisplaySize | None = None self.tags.append(ToolTags.SCALED_AGENT_OS.value) @@ -57,7 +66,7 @@ def screenshot(self, report: bool = True) -> Image.Image: ) return scale_image_to_fit(screenshot, self._target_resolution) - def mouse_move(self, x: int, y: int, duration: int = 500) -> None: + def mouse_move(self, x: float, y: float, duration: int = 500) -> None: scaled_x, scaled_y = self._scale_coordinates_back(x, y) self._agent_os.mouse_move(scaled_x, scaled_y, duration) @@ -68,7 +77,7 @@ def get_mouse_position(self) -> Coordinate: ) return Coordinate(x=scaled_x, y=scaled_y) - def set_mouse_position(self, x: int, y: int) -> None: + def set_mouse_position(self, x: float, y: float) -> None: scaled_x, scaled_y = self._scale_coordinates_back(x, y) self._agent_os.set_mouse_position(scaled_x, scaled_y) @@ -302,15 +311,22 @@ def remove_virtual_displays(self) -> None: def _scale_coordinates_back( self, - x: int, - y: int, + x: float, + y: float, from_agent: bool = True, check_coordinates_in_bounds: bool = True, ) -> tuple[int, int]: if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.retrieve_active_display().size + + mapped_x, mapped_y = ( + self._coordinate_space.map_to_target(x, y, self._target_resolution) + if from_agent + else (int(x), int(y)) + ) + return scale_coordinates( - (x, y), + (mapped_x, mapped_y), (self._real_screen_resolution.width, self._real_screen_resolution.height), self._target_resolution, inverse=from_agent, diff --git a/src/askui/tools/playwright/agent_os_facade.py b/src/askui/tools/playwright/agent_os_facade.py index 091ff804..5d6f7c42 100644 --- a/src/askui/tools/playwright/agent_os_facade.py +++ b/src/askui/tools/playwright/agent_os_facade.py @@ -2,6 +2,10 @@ from PIL import Image +from askui.models.shared.coordinate_space import ( + SCREENSHOT_RESOLUTION, + VlmCoordinateSpace, +) from askui.models.shared.tool_tags import ToolTags from askui.tools.agent_os import Display, ModifierKey, PcKey from askui.tools.playwright.agent_os import PlaywrightAgentOs @@ -20,9 +24,14 @@ class PlaywrightAgentOsFacade(PlaywrightAgentOs): agent_os (PlaywrightAgentOs): The real Playwright agent OS to wrap. """ - def __init__(self, agent_os: PlaywrightAgentOs) -> None: + def __init__( + self, + agent_os: PlaywrightAgentOs, + coordinate_space: VlmCoordinateSpace, + ) -> None: self._agent_os = agent_os - self._target_resolution: tuple[int, int] = (1024, 768) + self._target_resolution: tuple[int, int] = SCREENSHOT_RESOLUTION + self._coordinate_space: VlmCoordinateSpace = coordinate_space self._real_screen_resolution: tuple[int, int] | None = None self.tags = self._agent_os.tags + [ToolTags.SCALED_AGENT_OS.value] @@ -43,22 +52,29 @@ def screenshot(self, report: bool = True) -> Image.Image: def _scale_coordinates( self, - x: int, - y: int, + x: float, + y: float, from_agent: bool = True, ) -> tuple[int, int]: if self._real_screen_resolution is None: self._real_screen_resolution = self._agent_os.screenshot( report=False, ).size + + mapped_x, mapped_y = ( + self._coordinate_space.map_to_target(x, y, self._target_resolution) + if from_agent + else (int(x), int(y)) + ) + return scale_coordinates( - (x, y), + (mapped_x, mapped_y), self._real_screen_resolution, self._target_resolution, inverse=from_agent, ) - def mouse_move(self, x: int, y: int, duration: int = 500) -> None: + def mouse_move(self, x: float, y: float, duration: int = 500) -> None: scaled_x, scaled_y = self._scale_coordinates(x, y) # scaled_x, scaled_y = x, y self._agent_os.mouse_move(scaled_x, scaled_y, duration) diff --git a/src/askui/web_agent.py b/src/askui/web_agent.py index fe47c5f9..bc211ec9 100644 --- a/src/askui/web_agent.py +++ b/src/askui/web_agent.py @@ -60,7 +60,6 @@ def __init__( ) -> None: reporter = CompositeReporter(reporters=reporters) self.os = PlaywrightAgentOs(reporter) - self.act_agent_os_facade = PlaywrightAgentOsFacade(self.os) super().__init__( reporter=reporter, retry=retry, @@ -70,6 +69,10 @@ def __init__( callbacks=callbacks, truncation_strategy=truncation_strategy, ) + self.act_agent_os_facade = PlaywrightAgentOsFacade( + self.os, + coordinate_space=self._vlm_provider.coordinate_space, + ) self.act_tool_collection.add_agent_os(self.act_agent_os_facade) self.act_settings = ActSettings( messages=MessageSettings( diff --git a/tests/unit/model_providers/test_ollama_vlm_provider.py b/tests/unit/model_providers/test_ollama_vlm_provider.py index 143e7c35..e3f78ef5 100644 --- a/tests/unit/model_providers/test_ollama_vlm_provider.py +++ b/tests/unit/model_providers/test_ollama_vlm_provider.py @@ -6,6 +6,11 @@ from askui.model_providers.ollama_vlm_provider import OllamaVlmProvider from askui.models.shared.agent_message_param import MessageParam +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, +) class TestOllamaVlmProvider: @@ -48,3 +53,62 @@ def test_create_message_delegates_to_messages_api(self) -> None: mock_client.chat.completions.create.assert_called_once() assert result.role == "assistant" + + def test_coordinate_space_auto_detects_qwen(self) -> None: + provider = OllamaVlmProvider(model_id="qwen3.5") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_qwen_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Qwen2-VL") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_kimi(self) -> None: + provider = OllamaVlmProvider(model_id="kimi-vl") + assert provider.coordinate_space == NormalizedCoordinateSpace() + + def test_coordinate_space_auto_detects_kimi_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Kimi-VL-A3B") + assert provider.coordinate_space == NormalizedCoordinateSpace() + + def test_coordinate_space_default_for_non_qwen(self) -> None: + provider = OllamaVlmProvider(model_id="llava") + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_explicit_override(self) -> None: + provider = OllamaVlmProvider( + model_id="llava", + coordinate_space=ScaledCoordinateSpace(width=500, height=500), + ) + assert provider.coordinate_space == ScaledCoordinateSpace(width=500, height=500) + + def test_coordinate_space_explicit_override_takes_precedence(self) -> None: + provider = OllamaVlmProvider( + model_id="qwen3.5", + coordinate_space=ScaledCoordinateSpace(width=2000, height=2000), + ) + assert provider.coordinate_space == ScaledCoordinateSpace( + width=2000, height=2000 + ) + + def test_coordinate_space_explicit_pixel_overrides_qwen_auto_detect(self) -> None: + provider = OllamaVlmProvider( + model_id="qwen3.5", + coordinate_space=PixelCoordinateSpace(), + ) + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_auto_detects_holo(self) -> None: + provider = OllamaVlmProvider(model_id="holo3.1-35b-a3b") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_coordinate_space_auto_detects_holo_case_insensitive(self) -> None: + provider = OllamaVlmProvider(model_id="Holo-3.1-4B") + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) diff --git a/tests/unit/model_providers/test_openai_vlm_provider.py b/tests/unit/model_providers/test_openai_vlm_provider.py index d51ff74b..7a5a2a87 100644 --- a/tests/unit/model_providers/test_openai_vlm_provider.py +++ b/tests/unit/model_providers/test_openai_vlm_provider.py @@ -6,6 +6,12 @@ from askui.model_providers.openai_vlm_provider import OpenAIVlmProvider from askui.models.shared.agent_message_param import MessageParam +from askui.models.shared.coordinate_space import ( + NormalizedCoordinateSpace, + PixelCoordinateSpace, + ScaledCoordinateSpace, +) +from askui.models.shared.prompts import SystemPrompt class TestOpenAIVlmProvider: @@ -41,3 +47,122 @@ def test_create_message_delegates_to_messages_api(self) -> None: mock_client.chat.completions.create.assert_called_once() assert result.role == "assistant" + + def test_coordinate_space_defaults_to_pixel(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + assert provider.coordinate_space == PixelCoordinateSpace() + + def test_coordinate_space_passthrough(self) -> None: + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + coordinate_space=ScaledCoordinateSpace(width=1000, height=1000), + ) + assert provider.coordinate_space == ScaledCoordinateSpace( + width=1000, height=1000 + ) + + def test_augment_system_prompt_scaled_coordinate_space(self) -> None: + provider = OpenAIVlmProvider( + model_id="gpt-4o", + api_key="sk-test", + coordinate_space=ScaledCoordinateSpace(width=1000, height=1000), + ) + system = SystemPrompt(prompt="You are a helpful assistant.") + augmented = provider.augment_system_prompt(system) + + rendered = str(augmented) + assert "You are a helpful assistant." in rendered + assert "1000x1000 normalised grid" in rendered + assert "1024x768" in rendered + + def test_augment_system_prompt_pixel_bounds_when_matching(self) -> None: + provider = OpenAIVlmProvider(model_id="gpt-4o", api_key="sk-test") + system = SystemPrompt(prompt="Base prompt.") + augmented = provider.augment_system_prompt(system) + + rendered = str(augmented) + assert "normalised grid" not in rendered + assert "0 <= x < 1024" in rendered + + +class TestPixelCoordinateSpacePrompt: + def test_shows_pixel_bounds(self) -> None: + cs = PixelCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "0 <= x < 1024" in result + assert "0 <= y < 768" in result + assert "normalised grid" not in result + + def test_includes_padding_and_origin_info(self) -> None: + cs = PixelCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "black padding" in result + assert "top-left" in result + + +class TestScaledCoordinateSpacePrompt: + def test_shows_normalised_grid(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + result = cs.build_prompt_section((1024, 768)) + assert "1024x768" in result + assert "1000x1000 normalised grid" in result + assert "0 <= x < 1000" in result + assert "0 <= y < 1000" in result + + def test_matching_resolution_shows_pixel_bounds(self) -> None: + cs = ScaledCoordinateSpace(width=1024, height=768) + result = cs.build_prompt_section((1024, 768)) + assert "0 <= x < 1024" in result + assert "normalised grid" not in result + + def test_includes_padding_and_origin_info(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + result = cs.build_prompt_section((1024, 768)) + assert "black padding" in result + assert "top-left" in result + + +class TestNormalizedCoordinateSpacePrompt: + def test_shows_normalised_floats(self) -> None: + cs = NormalizedCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "0.0 <= x <= 1.0" in result + assert "0.0 <= y <= 1.0" in result + assert "normalised floats" in result + + def test_includes_padding_and_origin_info(self) -> None: + cs = NormalizedCoordinateSpace() + result = cs.build_prompt_section((1024, 768)) + assert "black padding" in result + assert "top-left" in result + + +class TestMapToTarget: + def test_pixel_identity(self) -> None: + cs = PixelCoordinateSpace() + assert cs.map_to_target(512, 384, (1024, 768)) == (512, 384) + + def test_pixel_truncates_floats(self) -> None: + cs = PixelCoordinateSpace() + assert cs.map_to_target(512.7, 384.3, (1024, 768)) == (512, 384) + + def test_scaled_maps_correctly(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + assert cs.map_to_target(500, 500, (1024, 768)) == (512, 384) + + def test_scaled_zero(self) -> None: + cs = ScaledCoordinateSpace(width=1000, height=1000) + assert cs.map_to_target(0, 0, (1024, 768)) == (0, 0) + + def test_normalized_maps_correctly(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(0.5, 0.5, (1024, 768)) == (512, 384) + + def test_normalized_zero(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(0.0, 0.0, (1024, 768)) == (0, 0) + + def test_normalized_one(self) -> None: + cs = NormalizedCoordinateSpace() + assert cs.map_to_target(1.0, 1.0, (1024, 768)) == (1024, 768)