Skip to content

Commit 640244d

Browse files
Merge pull request #103 from askui/feat/multiple-display-support
feat: add display management tools and enhance VisionAgent
2 parents 50f6717 + 7a19a8f commit 640244d

File tree

17 files changed

+681
-182
lines changed

17 files changed

+681
-182
lines changed

src/askui/agent.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
from askui.models.shared.tools import Tool
2020
from askui.tools.computer import Computer20241022Tool, Computer20250124Tool
2121
from askui.tools.exception_tool import ExceptionTool
22+
from askui.tools.list_displays_tool import ListDisplaysTool
23+
from askui.tools.retrieve_active_display_tool import RetrieveActiveDisplayTool
24+
from askui.tools.set_active_display_tool import SetActiveDisplayTool
2225

2326
from .logger import logger
2427
from .models import ModelComposition
@@ -30,9 +33,10 @@
3033

3134
_SYSTEM_PROMPT = f"""<SYSTEM_CAPABILITY>
3235
* You are utilising a {sys.platform} machine using {platform.machine()} architecture with internet access.
36+
* When you cannot find something (application window, ui element etc.) on the currently selected/active display/screen, check the other available displays by listing them and checking which one is currently active and then going through the other displays one by one until you find it or you have checked all of them.
3337
* When asked to perform web tasks try to open the browser (firefox, chrome, safari, ...) if not already open. Often you can find the browser icons in the toolbars of the operating systems.
34-
* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
35-
* When using your function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
38+
* When viewing a page it can be helpful to zoom out/in so that you can see everything on the page. Either that, or make sure you scroll down/up to see everything before deciding something isn't available.
39+
* When using your function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
3640
* The current date and time is {datetime.now(timezone.utc).strftime("%A, %B %d, %Y %H:%M:%S %z")}.
3741
</SYSTEM_CAPABILITY>
3842
@@ -115,6 +119,9 @@ def __init__(
115119
models=models,
116120
tools=[
117121
ExceptionTool(),
122+
SetActiveDisplayTool(agent_os=self.tools.os),
123+
RetrieveActiveDisplayTool(agent_os=self.tools.os),
124+
ListDisplaysTool(agent_os=self.tools.os),
118125
]
119126
+ (act_tools or []),
120127
agent_os=self.tools.os,

src/askui/locators/serializers.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@
1313
Prompt,
1414
Text,
1515
)
16-
from .locators import (
17-
AiElement as AiElementLocator,
18-
)
16+
from .locators import AiElement as AiElementLocator
1917
from .relatable import (
2018
BoundingRelation,
2119
LogicalRelation,

src/askui/models/anthropic/messages_api.py

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@
4545
from askui.utils.image_utils import (
4646
ImageSource,
4747
image_to_base64,
48-
scale_coordinates_back,
49-
scale_image_with_padding,
48+
scale_coordinates,
49+
scale_image_to_fit,
5050
)
5151

5252
from .utils import extract_click_coordinates
@@ -156,10 +156,9 @@ def _inference(
156156
system: str,
157157
model_choice: str,
158158
) -> str:
159-
scaled_image = scale_image_with_padding(
159+
scaled_image = scale_image_to_fit(
160160
image.root,
161-
self._settings.resolution[0],
162-
self._settings.resolution[1],
161+
self._settings.resolution,
163162
)
164163
message = self.create_message(
165164
messages=[
@@ -222,16 +221,12 @@ def locate(
222221
),
223222
model_choice=model_choice,
224223
)
225-
scaled_x, scaled_y = extract_click_coordinates(content)
226-
x, y = scale_coordinates_back(
227-
scaled_x,
228-
scaled_y,
229-
image.root.width,
230-
image.root.height,
231-
screen_width,
232-
screen_height,
224+
return scale_coordinates(
225+
extract_click_coordinates(content),
226+
image.root.size,
227+
self._settings.resolution,
228+
inverse=True,
233229
)
234-
return int(x), int(y)
235230
except (
236231
_UnexpectedResponseError,
237232
ValueError,

src/askui/models/shared/tools.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pydantic import BaseModel, Field
88
from typing_extensions import Self
99

10+
from askui.logger import logger
1011
from askui.models.shared.agent_message_param import (
1112
Base64ImageSourceParam,
1213
ContentBlockParam,
@@ -155,6 +156,7 @@ def _run_tool(
155156
except AgentException:
156157
raise
157158
except Exception as e: # noqa: BLE001
159+
logger.error(f"Tool {tool_use_block_param.name} failed: {e}", exc_info=True)
158160
return ToolResultBlockParam(
159161
content=f"Tool {tool_use_block_param.name} failed: {e}",
160162
is_error=True,

src/askui/tools/agent_os.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from typing import TYPE_CHECKING, Literal
33

44
from PIL import Image
5-
from pydantic import BaseModel
5+
from pydantic import BaseModel, ConfigDict, Field
66

77
if TYPE_CHECKING:
88
from askui.tools.askui.askui_ui_controller_grpc.generated.AgentOS_Send_Request_2501 import ( # noqa: E501
@@ -159,6 +159,26 @@ class Coordinate(BaseModel):
159159
y: int
160160

161161

162+
class DisplaySize(BaseModel):
163+
"""Represents the size of a display in pixels."""
164+
165+
width: int
166+
height: int
167+
168+
169+
class Display(BaseModel):
170+
model_config = ConfigDict(
171+
validate_by_name=True,
172+
)
173+
174+
id: int = Field(validation_alias="displayID")
175+
size: DisplaySize = Field(validation_alias="sizeInPixels")
176+
177+
178+
class DisplaysListResponse(BaseModel):
179+
data: list[Display] = Field(validation_alias="displays")
180+
181+
162182
InputEvent = ClickEvent
163183

164184

@@ -323,6 +343,22 @@ def keyboard_tap(
323343
"""
324344
raise NotImplementedError
325345

346+
def list_displays(self) -> DisplaysListResponse:
347+
"""
348+
List all the available displays.
349+
"""
350+
raise NotImplementedError
351+
352+
@abstractmethod
353+
def retrieve_active_display(self) -> Display:
354+
"""
355+
Retrieve the currently active display/screen.
356+
357+
Returns:
358+
Display: The currently active display/screen.
359+
"""
360+
raise NotImplementedError
361+
326362
def set_display(self, display: int = 1) -> None:
327363
"""
328364
Sets the active display for screen interactions.

src/askui/tools/android/agent_os_facade.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from askui.reporting import Reporter
66
from askui.tools.android.agent_os import ANDROID_KEY, AndroidAgentOs, AndroidDisplay
7-
from askui.utils.image_utils import scale_coordinates_back, scale_image_with_padding
7+
from askui.utils.image_utils import scale_coordinates, scale_image_to_fit
88

99

1010
class AndroidAgentOsFacade(AndroidAgentOs):
@@ -32,10 +32,9 @@ def disconnect(self) -> None:
3232
def screenshot(self) -> Image.Image:
3333
screenshot = self._agent_os.screenshot()
3434
self._real_screen_resolution = screenshot.size
35-
scaled_image = scale_image_with_padding(
35+
scaled_image = scale_image_to_fit(
3636
screenshot,
37-
self._target_resolution[0],
38-
self._target_resolution[1],
37+
self._target_resolution,
3938
)
4039

4140
self._reporter.add_message("AndroidAgentOS", "Screenshot taken", screenshot)
@@ -45,15 +44,12 @@ def _scale_coordinates_back(self, x: int, y: int) -> Tuple[int, int]:
4544
if self._real_screen_resolution is None:
4645
self._real_screen_resolution = self._agent_os.screenshot().size
4746

48-
scaled_x, scaled_y = scale_coordinates_back(
49-
x,
50-
y,
51-
self._real_screen_resolution[0],
52-
self._real_screen_resolution[1],
53-
self._target_resolution[0],
54-
self._target_resolution[1],
47+
return scale_coordinates(
48+
(x, y),
49+
self._real_screen_resolution,
50+
self._target_resolution,
51+
inverse=True,
5552
)
56-
return int(scaled_x), int(scaled_y)
5753

5854
def tap(self, x: int, y: int) -> None:
5955
scaled_x, scaled_y = self._scale_coordinates_back(x, y)

src/askui/tools/askui/askui_controller.py

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,21 @@
77
from typing import Literal, Type
88

99
import grpc
10+
from google.protobuf.json_format import MessageToDict
1011
from PIL import Image
1112
from typing_extensions import Self, override
1213

1314
from askui.container import telemetry
1415
from askui.logger import logger
1516
from askui.reporting import Reporter
16-
from askui.tools.agent_os import AgentOs, Coordinate, ModifierKey, PcKey
17+
from askui.tools.agent_os import (
18+
AgentOs,
19+
Coordinate,
20+
Display,
21+
DisplaysListResponse,
22+
ModifierKey,
23+
PcKey,
24+
)
1725
from askui.tools.askui.askui_controller_settings import AskUiControllerSettings
1826
from askui.tools.askui.askui_ui_controller_grpc.generated import (
1927
Controller_V1_pb2 as controller_v1_pbs,
@@ -626,28 +634,49 @@ def run_command(self, command: str, timeout_ms: int = 30000) -> None:
626634
)
627635

628636
@telemetry.record_call()
629-
def get_display_information(
637+
@override
638+
def retrieve_active_display(self) -> Display:
639+
"""
640+
Retrieve the currently active display/screen.
641+
642+
Returns:
643+
Display: The currently active display/screen.
644+
"""
645+
self._reporter.add_message("AgentOS", "retrieve_active_display()")
646+
displays_list_response = self.list_displays()
647+
for display in displays_list_response.data:
648+
if display.id == self._display:
649+
return display
650+
error_msg = f"Display {self._display} not found"
651+
raise ValueError(error_msg)
652+
653+
@telemetry.record_call()
654+
@override
655+
def list_displays(
630656
self,
631-
) -> controller_v1_pbs.Response_GetDisplayInformation:
657+
) -> DisplaysListResponse:
632658
"""
633-
Get information about all available displays and virtual screen.
659+
List all available displays including virtual screens.
634660
635661
Returns:
636-
controller_v1_pbs.Response_GetDisplayInformation:
637-
- displays: List of DisplayInformation objects
638-
- virtualScreenRectangle: Overall virtual screen bounds
662+
DisplaysListResponse
639663
"""
640664
assert isinstance(self._stub, controller_v1.ControllerAPIStub), (
641665
"Stub is not initialized"
642666
)
643667

644-
self._reporter.add_message("AgentOS", "get_display_information()")
668+
self._reporter.add_message("AgentOS", "list_displays()")
645669

646670
response: controller_v1_pbs.Response_GetDisplayInformation = (
647671
self._stub.GetDisplayInformation(controller_v1_pbs.Request_Void())
648672
)
649673

650-
return response
674+
response_dict = MessageToDict(
675+
response,
676+
preserving_proto_field_name=True,
677+
)
678+
679+
return DisplaysListResponse.model_validate(response_dict)
651680

652681
@telemetry.record_call()
653682
def get_process_list(

0 commit comments

Comments
 (0)