Skip to content

Commit 062e842

Browse files
committed
refactor(tools/computer): consistent, more readable coordinate scaling
1 parent 69f8afd commit 062e842

File tree

9 files changed

+473
-210
lines changed

9 files changed

+473
-210
lines changed

src/askui/locators/serializers.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,7 @@
1313
Prompt,
1414
Text,
1515
)
16-
from .locators import (
17-
AiElement as AiElementLocator,
18-
)
16+
from .locators import AiElement as AiElementLocator
1917
from .relatable import (
2018
BoundingRelation,
2119
LogicalRelation,

src/askui/models/anthropic/messages_api.py

Lines changed: 9 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -45,8 +45,8 @@
4545
from askui.utils.image_utils import (
4646
ImageSource,
4747
image_to_base64,
48-
scale_coordinates_back,
49-
scale_image_with_padding,
48+
scale_coordinates,
49+
scale_image_to_fit,
5050
)
5151

5252
from .utils import extract_click_coordinates
@@ -156,10 +156,9 @@ def _inference(
156156
system: str,
157157
model_choice: str,
158158
) -> str:
159-
scaled_image = scale_image_with_padding(
159+
scaled_image = scale_image_to_fit(
160160
image.root,
161-
self._settings.resolution[0],
162-
self._settings.resolution[1],
161+
self._settings.resolution,
163162
)
164163
message = self.create_message(
165164
messages=[
@@ -222,16 +221,12 @@ def locate(
222221
),
223222
model_choice=model_choice,
224223
)
225-
scaled_x, scaled_y = extract_click_coordinates(content)
226-
x, y = scale_coordinates_back(
227-
scaled_x,
228-
scaled_y,
229-
image.root.width,
230-
image.root.height,
231-
screen_width,
232-
screen_height,
224+
return scale_coordinates(
225+
extract_click_coordinates(content),
226+
image.root.size,
227+
self._settings.resolution,
228+
inverse=True,
233229
)
234-
return int(x), int(y)
235230
except (
236231
_UnexpectedResponseError,
237232
ValueError,

src/askui/models/shared/tools.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
from pydantic import BaseModel, Field
88
from typing_extensions import Self
99

10+
from askui.logger import logger
1011
from askui.models.shared.agent_message_param import (
1112
Base64ImageSourceParam,
1213
ContentBlockParam,
@@ -155,6 +156,7 @@ def _run_tool(
155156
except AgentException:
156157
raise
157158
except Exception as e: # noqa: BLE001
159+
logger.error(f"Tool {tool_use_block_param.name} failed: {e}", exc_info=True)
158160
return ToolResultBlockParam(
159161
content=f"Tool {tool_use_block_param.name} failed: {e}",
160162
is_error=True,

src/askui/tools/android/agent_os_facade.py

Lines changed: 8 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44

55
from askui.reporting import Reporter
66
from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
7-
from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding
7+
from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
88

99

1010
class AndroidAgentOsFacade(AndroidAgentOs):
@@ -32,10 +32,9 @@ def disconnect(self) -> None:
3232
def screenshot(self) -> Image.Image:
3333
screenshot = self._agent_os.screenshot()
3434
self._real_screen_resolution = screenshot.size
35-
scaled_image = scale_image_with_padding(
35+
scaled_image = scale_image_to_fit(
3636
screenshot,
37-
self._target_resolution[0],
38-
self._target_resolution[1],
37+
self._target_resolution,
3938
)
4039

4140
self._reporter.add_message("AndroidAgentOS", "Screenshot taken", screenshot)
@@ -45,15 +44,12 @@ def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]:
4544
if self._real_screen_resolution is None:
4645
self._real_screen_resolution = self._agent_os.screenshot().size
4746

48-
scaled_x, scaled_y = scale_coordinates_back(
49-
x,
50-
y,
51-
self._real_screen_resolution[0],
52-
self._real_screen_resolution[1],
53-
self._target_resolution[0],
54-
self._target_resolution[1],
47+
return scale_coordinates(
48+
(x, y),
49+
self._real_screen_resolution,
50+
self._target_resolution,
51+
inverse=True,
5552
)
56-
return int(scaled_x), int(scaled_y)
5753

5854
def tap(self, x: int, y: int) -> None:
5955
scaled_x, scaled_y = self._scale_coordinates_back(x, y)

src/askui/tools/askui/askui_controller.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -29,11 +29,11 @@
2929
from askui.tools.askui.askui_ui_controller_grpc.generated import (
3030
Controller_V1_pb2_grpc as controller_v1,
3131
)
32-
from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import (
33-
RenderObjectStyle, # noqa: E501
32+
from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501
33+
RenderObjectStyle,
3434
)
35-
from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Response_2501 import (
36-
AskuiAgentosSendResponseSchema, # noqa: E501
35+
from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Response_2501 import ( # noqa: E501
36+
AskuiAgentosSendResponseSchema,
3737
)
3838
from askui.tools.askui.command_helpers import (
3939
create_clear_render_objects_command,

src/askui/tools/computer.py

Lines changed: 50 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -13,11 +13,7 @@
1313
from typing_extensions import Self, override
1414

1515
from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey
16-
from askui.utils.image_utils import (
17-
scale_coordinates_back,
18-
scale_coordinates_with_padding,
19-
scale_image_with_padding,
20-
)
16+
from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
2117

2218
from ..models.shared.tools import InputSchema, Tool
2319

@@ -192,20 +188,55 @@ class BetaToolComputerUseParamBase(TypedDict):
192188
display_height_px: int
193189

194190

191+
@dataclass
192+
class Resolution:
193+
width: int
194+
height: int
195+
196+
197+
# https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/README.md
198+
RESOLUTIONS_RECOMMENDED_BY_ANTHROPIC: dict[str, Resolution] = {
199+
"XGA": Resolution(width=1024, height=768), # 4:3
200+
"WXGA": Resolution(width=1280, height=800), # 16:10
201+
}
202+
203+
204+
def _get_closest_recommended_resolution(resolution: Resolution) -> Resolution:
205+
return min(
206+
RESOLUTIONS_RECOMMENDED_BY_ANTHROPIC.values(),
207+
key=lambda r: abs(r.width - resolution.width)
208+
+ abs(r.height - resolution.height),
209+
)
210+
211+
195212
class ComputerToolBase(Tool, ABC):
196213
def __init__(
197214
self,
198215
agent_os: AgentOs,
199216
input_schema: InputSchema,
217+
resolution: Resolution | None = None,
200218
) -> None:
201219
super().__init__(
202220
name="computer",
203221
description="A tool for interacting with the computer",
204222
input_schema=input_schema,
205223
)
206224
self._agent_os = agent_os
207-
self._width = 1280
208-
self._height = 800
225+
real_resolution = self._get_real_screen_resolution()
226+
self._resolution = resolution or _get_closest_recommended_resolution(
227+
Resolution(
228+
width=real_resolution[0],
229+
height=real_resolution[1],
230+
)
231+
)
232+
233+
@property
234+
def _width(self) -> int:
235+
return self._resolution.width
236+
237+
@property
238+
def _height(self) -> int:
239+
return self._resolution.height
209240

210241
@property
211242
def params_base(
@@ -228,7 +259,7 @@ def __call__( # noqa: C901
228259
) -> Image.Image | None | str:
229260
match action:
230261
case "cursor_position":
231-
return self._get_mouse_position_scaled()
262+
return self._retrieve_cursor_position()
232263
case "double_click":
233264
return self._agent_os.click("left", 2)
234265
case "key":
@@ -284,17 +315,12 @@ def _scale_coordinates_back(
284315
self,
285316
coordinate: tuple[Annotated[int, Field(ge=0)], Annotated[int, Field(ge=0)]],
286317
) -> tuple[int, int]:
287-
real_screen_width, real_screen_height = self._get_real_screen_resolution()
288-
x, y = scale_coordinates_back(
289-
coordinate[0],
290-
coordinate[1],
291-
real_screen_width,
292-
real_screen_height,
293-
self._width,
294-
self._height,
318+
return scale_coordinates(
319+
coordinate,
320+
self._get_real_screen_resolution(),
321+
(self._width, self._height),
322+
inverse=True,
295323
)
296-
x, y = int(x), int(y)
297-
return x, y
298324

299325
@validate_call
300326
def _mouse_move(
@@ -320,18 +346,15 @@ def _screenshot(self) -> Image.Image:
320346
Take a screenshot of the current screen, scale it and return it
321347
"""
322348
screenshot = self._agent_os.screenshot()
323-
return scale_image_with_padding(screenshot, self._width, self._height)
349+
return scale_image_to_fit(screenshot, (self._width, self._height))
324350

325-
def _get_mouse_position_scaled(self) -> str:
351+
def _retrieve_cursor_position(self) -> str:
326352
mouse_position: Coordinate = self._agent_os.get_mouse_position()
327353
real_screen_width, real_screen_height = self._get_real_screen_resolution()
328-
x, y = scale_coordinates_with_padding(
329-
mouse_position.x,
330-
mouse_position.y,
331-
real_screen_width,
332-
real_screen_height,
333-
self._width,
334-
self._height,
354+
x, y = scale_coordinates(
355+
(mouse_position.x, mouse_position.y),
356+
(real_screen_width, real_screen_height),
357+
(self._width, self._height),
335358
)
336359

337360
return f"X={x},Y={y}"

0 commit comments

Comments
 (0)