Skip to content

Commit a222370

Browse files
refactor: enhance ComputerToolBase functionality and add coordinate scaling
- Added the `_get_mouse_position_scaled` method to `ComputerToolBase`, which retrieves the mouse position and scales it to the tool's coordinate space.
- Updated `__call__` so that the `cursor_position` action returns the scaled mouse position instead of raising `ActionNotImplementedError`.
- Introduced the `scale_coordinates_with_padding` function in `image_utils.py` for mapping coordinates into a scaled and padded image.
- Reorganized import statements.
1 parent e54f95f commit a222370

File tree

2 files changed

+72
-9
lines changed

2 files changed

+72
-9
lines changed

src/askui/tools/computer.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,16 @@
44
from dataclasses import dataclass
55
from typing import Annotated, Literal, TypedDict, cast, get_args
66

7-
from anthropic.types.beta import (
8-
BetaToolComputerUse20241022Param,
9-
BetaToolComputerUse20250124Param,
10-
)
7+
from anthropic.types.beta import (BetaToolComputerUse20241022Param,
8+
BetaToolComputerUse20250124Param)
119
from PIL import Image
1210
from pydantic import Field, validate_call
1311
from typing_extensions import Self, override
1412

15-
from askui.tools.agent_os import AgentOs, ModifierKey, PcKey
16-
from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding
13+
from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey
14+
from askui.utils.image_utils import (scale_coordinates_back,
15+
scale_coordinates_with_padding,
16+
scale_image_with_padding)
1717

1818
from ..models.shared.tools import InputSchema, Tool
1919

@@ -223,10 +223,10 @@ def __call__( # noqa: C901
223223
text: str | None = None,
224224
coordinate: tuple[Annotated[int, Field(ge=0)], Annotated[int, Field(ge=0)]]
225225
| None = None,
226-
) -> Image.Image | None:
226+
) -> Image.Image | None | Coordinate:
227227
match action:
228228
case "cursor_position":
229-
raise ActionNotImplementedError(action, self.name)
229+
return self._get_mouse_position_scaled()
230230
case "double_click":
231231
return self._agent_os.click("left", 2)
232232
case "key":
@@ -326,6 +326,13 @@ def _screenshot(self) -> Image.Image:
326326
return scale_image_with_padding(screenshot, self._width, self._height)
327327

328328

329+
def _get_mouse_position_scaled(self) -> Coordinate:
    """Return the mouse position mapped into the scaled, padded coordinate space.

    The position reported by the agent OS is converted from the real screen
    resolution to the `self._width` x `self._height` space via
    `scale_coordinates_with_padding`, then truncated to integers.

    Returns:
        Coordinate: The scaled mouse position with integer `x` and `y`.
    """
    position: Coordinate = self._agent_os.get_mouse_position()
    screen_width, screen_height = self._get_real_screen_resolution()
    scaled_x, scaled_y = scale_coordinates_with_padding(
        position.x,
        position.y,
        screen_width,
        screen_height,
        self._width,
        self._height,
    )
    # int() truncates toward zero; the scaled values are floats.
    return Coordinate(x=int(scaled_x), y=int(scaled_y))
334+
335+
329336
class Computer20241022Tool(ComputerToolBase):
330337
type: Literal["computer_20241022"] = "computer_20241022"
331338

src/askui/utils/image_utils.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77
from pathlib import Path
88
from typing import Any, Literal, Tuple, Union
99

10-
from PIL import Image, ImageDraw, ImageOps, UnidentifiedImageError
10+
from PIL import Image
1111
from PIL import Image as PILImage
12+
from PIL import ImageDraw, ImageOps, UnidentifiedImageError
1213
from pydantic import ConfigDict, RootModel, field_validator
1314

1415
# Regex to capture any kind of valid base64 data url (with optional media type and ;base64)
@@ -190,6 +191,61 @@ def scale_image_with_padding(
190191
)
191192

192193

194+
def scale_coordinates_with_padding(
    x: float,
    y: float,
    original_width: int,
    original_height: int,
    max_width: int,
    max_height: int,
) -> Tuple[float, float]:
    """Map a point from an original image into its scaled and padded version.

    The original image is assumed to be resized uniformly (aspect ratio
    preserved) so that it fits inside a `max_width` x `max_height` canvas and
    is then centered on that canvas. This function applies the same scale
    factor and centering offsets to a single point.

    Args:
        x (float): The x-coordinate in the original image.
        y (float): The y-coordinate in the original image.
        original_width (int): The width of the original image.
        original_height (int): The height of the original image.
        max_width (int): The width of the padded output canvas.
        max_height (int): The height of the padded output canvas.

    Returns:
        Tuple[float, float]: The `(scaled_x, scaled_y)` position of the point
            on the padded canvas.

    Raises:
        ValueError: If the mapped point is below 0 or beyond `max_width` /
            `max_height` on either axis (both upper bounds are inclusive).
    """
    source_ratio = original_width / original_height
    target_ratio = max_width / max_height

    if target_ratio > source_ratio:
        # Canvas is relatively wider than the source: height is the limiting side.
        factor = max_height / original_height
        content_width, content_height = int(original_width * factor), max_height
    else:
        # Canvas is relatively taller (or equal): width is the limiting side.
        factor = max_width / original_width
        content_width, content_height = max_width, int(original_height * factor)

    # The scaled content is centered, so the padding is split evenly
    # (floor division) between the two opposite borders.
    offset_x = (max_width - content_width) // 2
    offset_y = (max_height - content_height) // 2

    mapped_x = x * factor + offset_x
    mapped_y = y * factor + offset_y

    out_of_bounds = (
        mapped_x < 0 or mapped_y < 0 or mapped_x > max_width or mapped_y > max_height
    )
    if out_of_bounds:
        raise ValueError("Coordinates are outside the padded image area")

    return mapped_x, mapped_y
247+
248+
193249
def scale_coordinates_back(
194250
x: float,
195251
y: float,

0 commit comments

Comments
 (0)