Skip to content

Commit 1254b98

Browse files
Merge pull request #75 from askui/refactor/openrouter-improvements
refactor: OpenRouter model integration and settings
2 parents b3613cb + 11c26a6 commit 1254b98

File tree

14 files changed

+471
-94
lines changed

14 files changed

+471
-94
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -446,15 +446,15 @@ You can use Vision Agent with [OpenRouter](https://openrouter.ai/) to access a w
446446
```python
447447
from askui import VisionAgent
448448
from askui.models import (
449-
OpenRouterGetModel,
449+
OpenRouterModel,
450450
OpenRouterSettings,
451451
ModelRegistry,
452452
)
453453

454454

455455
# Register OpenRouter model in the registry
456456
custom_models: ModelRegistry = {
457-
"my-custom-model": OpenRouterGetModel(
457+
"my-custom-model": OpenRouterModel(
458458
OpenRouterSettings(
459459
model="anthropic/claude-opus-4",
460460
)

src/askui/models/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
OnMessageCb,
1212
Point,
1313
)
14-
from .openrouter.handler import OpenRouterGetModel
14+
from .openrouter.model import OpenRouterModel
1515
from .openrouter.settings import OpenRouterSettings
1616
from .shared.computer_agent_message_param import (
1717
Base64ImageSourceParam,
@@ -28,6 +28,7 @@
2828
ToolUseBlockParam,
2929
UrlImageSourceParam,
3030
)
31+
from .shared.settings import ChatCompletionsCreateSettings
3132

3233
__all__ = [
3334
"ActModel",
@@ -54,6 +55,7 @@
5455
"ToolResultBlockParam",
5556
"ToolUseBlockParam",
5657
"UrlImageSourceParam",
57-
"OpenRouterGetModel",
58+
"OpenRouterModel",
5859
"OpenRouterSettings",
60+
"ChatCompletionsCreateSettings",
5961
]
Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
ModelName,
2222
Point,
2323
)
24+
from askui.models.shared.prompts import SYSTEM_PROMPT_GET, build_system_prompt_locate
2425
from askui.models.types.response_schemas import ResponseSchema
2526
from askui.utils.image_utils import (
2627
ImageSource,
@@ -47,8 +48,8 @@ def _inference(
4748
) -> list[anthropic.types.ContentBlock]:
4849
message = self._client.messages.create(
4950
model=model,
50-
max_tokens=self._settings.max_tokens,
51-
temperature=self._settings.temperature,
51+
max_tokens=self._settings.chat_completions_create_settings.max_tokens,
52+
temperature=self._settings.chat_completions_create_settings.temperature,
5253
system=system_prompt,
5354
messages=[
5455
{
@@ -87,12 +88,11 @@ def locate(
8788
prompt = f"Click on {locator_serialized}"
8889
screen_width = self._settings.resolution[0]
8990
screen_height = self._settings.resolution[1]
90-
system_prompt = f"Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try taking another screenshot.\n* The screen's resolution is {screen_width}x{screen_height}.\n* The display number is 0\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\n" # noqa: E501
9191
scaled_image = scale_image_with_padding(image.root, screen_width, screen_height)
9292
response = self._inference(
9393
image_to_base64(scaled_image),
9494
prompt,
95-
system_prompt,
95+
build_system_prompt_locate(str(screen_width), str(screen_height)),
9696
model=ANTHROPIC_MODEL_NAME_MAPPING[ModelName(model_choice)],
9797
)
9898
assert len(response) > 0
@@ -129,11 +129,10 @@ def get(
129129
max_width=self._settings.resolution[0],
130130
max_height=self._settings.resolution[1],
131131
)
132-
system_prompt = "You are an agent to process screenshots and answer questions about things on the screen or extract information from it. Answer only with the response to the question and keep it short and precise." # noqa: E501
133132
response = self._inference(
134133
base64_image=image_to_base64(scaled_image),
135134
prompt=query,
136-
system_prompt=system_prompt,
135+
system_prompt=SYSTEM_PROMPT_GET,
137136
model=ANTHROPIC_MODEL_NAME_MAPPING[ModelName(model_choice)],
138137
)
139138
if len(response) == 0:

src/askui/models/anthropic/settings.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from pydantic_settings import BaseSettings
33

44
from askui.models.shared.computer_agent import ComputerAgentSettingsBase
5+
from askui.models.shared.settings import ChatCompletionsCreateSettings
56

67
COMPUTER_USE_BETA_FLAG = "computer-use-2024-10-22"
78

@@ -20,8 +21,10 @@ class ClaudeSettingsBase(BaseModel):
2021

2122
class ClaudeSettings(ClaudeSettingsBase):
2223
resolution: tuple[int, int] = Field(default_factory=lambda: (1280, 800))
23-
max_tokens: int = 1000
24-
temperature: float = 0.0
24+
chat_completions_create_settings: ChatCompletionsCreateSettings = Field(
25+
default_factory=ChatCompletionsCreateSettings,
26+
description="Settings for ChatCompletions",
27+
)
2528

2629

2730
class ClaudeComputerAgentSettings(ComputerAgentSettingsBase, ClaudeSettingsBase):

src/askui/models/model_router.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141

4242
from ..logger import logger
4343
from .anthropic.computer_agent import ClaudeComputerAgent
44-
from .anthropic.handler import ClaudeHandler
44+
from .anthropic.model import ClaudeHandler
4545
from .askui.inference_api import AskUiInferenceApi, AskUiSettings
4646

4747

src/askui/models/openrouter/handler.py

Lines changed: 0 additions & 77 deletions
This file was deleted.
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
import json
2+
from typing import TYPE_CHECKING, Any, Optional, Type
3+
4+
import openai
5+
from openai import OpenAI
6+
from typing_extensions import override
7+
8+
from askui.logger import logger
9+
from askui.models.exceptions import QueryNoResponseError
10+
from askui.models.models import GetModel
11+
from askui.models.shared.prompts import SYSTEM_PROMPT_GET
12+
from askui.models.types.response_schemas import ResponseSchema, to_response_schema
13+
from askui.utils.image_utils import ImageSource
14+
15+
from .settings import OpenRouterSettings
16+
17+
if TYPE_CHECKING:
18+
from openai.types.chat.completion_create_params import ResponseFormat
19+
20+
21+
def _clean_schema_refs(schema: dict[str, Any] | list[Any]) -> None:
22+
"""Remove title fields that are at the same level as $ref fields as they are not supported by OpenAI.""" # noqa: E501
23+
if isinstance(schema, dict):
24+
if "$ref" in schema and "title" in schema:
25+
del schema["title"]
26+
for value in schema.values():
27+
if isinstance(value, (dict, list)):
28+
_clean_schema_refs(value)
29+
elif isinstance(schema, list):
30+
for item in schema:
31+
if isinstance(item, (dict, list)):
32+
_clean_schema_refs(item)
33+
34+
35+
class OpenRouterModel(GetModel):
    """
    This class implements the GetModel interface for the OpenRouter API.

    Args:
        settings (OpenRouterSettings): The settings for the OpenRouter model.

    Example:
        ```python
        from askui import VisionAgent
        from askui.models import (
            OpenRouterModel,
            OpenRouterSettings,
            ModelRegistry,
        )

        # Register OpenRouter model in the registry
        custom_models: ModelRegistry = {
            "my-custom-model": OpenRouterModel(
                OpenRouterSettings(
                    model="anthropic/claude-opus-4",
                )
            ),
        }

        with VisionAgent(models=custom_models, model={"get":"my-custom-model"}) as agent:
            result = agent.get("What is the main heading on the screen?")
            print(result)
        ```
    """  # noqa: E501

    def __init__(
        self,
        settings: OpenRouterSettings | None = None,
        client: Optional[OpenAI] = None,
    ):
        """Create the model, building a default OpenAI-compatible client if none is given.

        Args:
            settings: OpenRouter configuration; defaults to ``OpenRouterSettings()``.
            client: Pre-configured OpenAI client (useful for testing); when omitted,
                one is created from the API key and base URL in the settings.
        """
        self._settings = settings or OpenRouterSettings()

        # OpenRouter exposes an OpenAI-compatible API, so the stock OpenAI
        # client is pointed at the OpenRouter base URL.
        self._client = (
            client
            if client is not None
            else OpenAI(
                api_key=self._settings.open_router_api_key.get_secret_value(),
                base_url=str(self._settings.base_url),
            )
        )

    def _predict(
        self,
        image_url: str,
        instruction: str,
        prompt: str,
        response_schema: type[ResponseSchema] | None,
    ) -> str | None | ResponseSchema:
        """Send one image + text chat completion and optionally validate the reply.

        Args:
            image_url: Data URL (or remote URL) of the screenshot to analyze.
            instruction: The user's query, appended to ``prompt``.
            prompt: System-style preamble prepended to the instruction.
            response_schema: Optional schema; when set, the model is forced to
                return strict JSON matching it.

        Returns:
            The validated schema value when a schema was given, the raw string
            reply otherwise, or ``None`` when the model returned no content.

        Raises:
            ValueError: If a schema was requested but the reply is not valid JSON.
        """
        extra_body: dict[str, object] = {}

        # Optional fallback model list (OpenRouter routing feature).
        if len(self._settings.models) > 0:
            extra_body["models"] = self._settings.models

        _response_schema = (
            to_response_schema(response_schema) if response_schema else None
        )

        response_format: openai.NotGiven | ResponseFormat = openai.NOT_GIVEN
        if _response_schema is not None:
            # Only route to providers that support response_format parameters.
            extra_body["provider"] = {"require_parameters": True}
            schema = _response_schema.model_json_schema()
            _clean_schema_refs(schema)

            # Wrap the user schema under a single "response" property; $defs
            # must live at the top level of the submitted schema.
            defs = schema.pop("$defs", None)
            schema_response_wrapper = {
                "type": "object",
                "properties": {"response": schema},
                "additionalProperties": False,
                "required": ["response"],
            }
            if defs:
                schema_response_wrapper["$defs"] = defs
            response_format = {
                "type": "json_schema",
                "json_schema": {
                    "name": "user_json_schema",
                    "schema": schema_response_wrapper,
                    "strict": True,
                },
            }

        chat_completion = self._client.chat.completions.create(
            model=self._settings.model,
            extra_body=extra_body,
            response_format=response_format,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_url,
                            },
                        },
                        {"type": "text", "text": prompt + instruction},
                    ],
                }
            ],
            stream=False,
            top_p=self._settings.chat_completions_create_settings.top_p,
            temperature=self._settings.chat_completions_create_settings.temperature,
            max_tokens=self._settings.chat_completions_create_settings.max_tokens,
            seed=self._settings.chat_completions_create_settings.seed,
            stop=self._settings.chat_completions_create_settings.stop,
            frequency_penalty=self._settings.chat_completions_create_settings.frequency_penalty,
            presence_penalty=self._settings.chat_completions_create_settings.presence_penalty,
        )

        model_response = chat_completion.choices[0].message.content

        if _response_schema is not None and model_response is not None:
            try:
                response_json = json.loads(model_response)
            except json.JSONDecodeError:
                error_msg = f"Expected JSON, but model {self._settings.model} returned: {model_response}"  # noqa: E501
                logger.error(error_msg)
                raise ValueError(error_msg) from None

            # NOTE(review): assumes strict-mode providers always include the
            # required "response" key; a missing key raises KeyError here.
            validated_response = _response_schema.model_validate(
                response_json["response"]
            )
            return validated_response.root

        return model_response

    @override
    def get(
        self,
        query: str,
        image: ImageSource,
        response_schema: Type[ResponseSchema] | None,
        model_choice: str,
    ) -> ResponseSchema | str:
        """Answer a question about a screenshot via OpenRouter.

        Args:
            query: The question to answer about the image.
            image: Screenshot to analyze; sent as a data URL.
            response_schema: Optional schema to validate/structure the answer.
            model_choice: Name of the model choice (used in error reporting).

        Returns:
            The model's answer, validated against ``response_schema`` when given.

        Raises:
            QueryNoResponseError: If the model produced no response content.
        """
        response = self._predict(
            image_url=image.to_data_url(),
            instruction=query,
            prompt=SYSTEM_PROMPT_GET,
            response_schema=response_schema,
        )
        if response is None:
            error_msg = f'No response from model "{model_choice}" to query: "{query}"'
            raise QueryNoResponseError(error_msg, query)
        return response

src/askui/models/openrouter/prompts.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)