From 84c5fd810b467d7ea04a866b082d2de5131c6d7f Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 4 Apr 2025 14:11:29 +0700 Subject: [PATCH 1/6] modify send_vm to use our local device --- gradio_demo.py | 4 +- omnitool/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/agent/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/app.py | 14 ++--- omnitool/gradio/app_new.py | 2 +- omnitool/gradio/tools/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/tools/computer.py | 55 ++++++++++++------ omnitool/gradio/tools/screen_capture.py | 16 ++--- omnitool/omniparserserver/omniparserserver.py | 3 +- util/omniparser.py | 4 +- util/utils.py | 13 ++++- 12 files changed, 70 insertions(+), 41 deletions(-) create mode 100644 omnitool/.DS_Store create mode 100644 omnitool/gradio/.DS_Store create mode 100644 omnitool/gradio/agent/.DS_Store create mode 100644 omnitool/gradio/tools/.DS_Store diff --git a/gradio_demo.py b/gradio_demo.py index 15664d31..e2f9266d 100644 --- a/gradio_demo.py +++ b/gradio_demo.py @@ -8,7 +8,7 @@ import base64, os -from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img, detect_device import torch from PIL import Image @@ -27,7 +27,7 @@ OmniParser is a screen parsing tool to convert general GUI screen to structured elements. """ -DEVICE = torch.device('cuda') +DEVICE = torch.device(detect_device()) # @spaces.GPU # @torch.inference_mode() diff --git a/omnitool/.DS_Store b/omnitool/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9ce7173f5c2df5af2b3b400c7138f99dc737a3c2 GIT binary patch literal 6148 zcmeHK%}T>S5Z-NTn^J@v6nb3nTCkQX6fdFH7cim+m70)JgE3o@)Er77XMG``#OHBl zcXJ2^youNu*!^bbXE*af_J=XXr|aN^F`F@FK||!I)CihuT@4eA$kiMnlYXAA{7CvW z6a7UKetVlOS;9OPuiJ?p`JlDV6C**cwi$sLZarHuSi z_xzi1kWG0~VBibiflLhm literal 0 HcmV?d00001 diff --git a/omnitool/gradio/.DS_Store b/omnitool/gradio/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a37f1c2feb29af5e89e4812b02fb542e3863eed3 GIT binary patch literal 6148 zcmeHKF-yZh6npAnK6CL2$6(4~V7yf`V(cX=#VV1ht@pkX0NV{U_p39Grx1 zF8&HPN56M>+B*`3E+XOMB`?(aXVp~>$ zN{hZwub$Ko9+-_Q4PJEJ1xs-c& z`70q3U@`2|1xiFCI;WIQJr=b6R_5b!dAYy)zze$H%((K z$z+>6f*aRM(;k=BxW0GzY+jv>bIBBYJqjCR42oC7f}|AFnU|9ucKu#AZ~VTsZ}U3w zZtijqFTdghVzVf~9|){gHz>6zAPR^AUkdR05TG!I7ITAo=|H8A0Kh7mwc(o25*!m) z3@zpc(F0Q^6lg+~Jz^*mj`qO%g%)#zCY+ScjB#vcWltzdXGeRW-ARQ8r4|K50bhYR zbGyv@KmMG*JOBGZawiIi0{=<@Rc*CfO+1p_TMG}zdu@Pngu=#txj`L*%5KN{g16!| b6m94W_yQPO%nhOiCO-mN2B}1WUsd1(5AU<9 literal 0 HcmV?d00001 diff --git a/omnitool/gradio/agent/.DS_Store b/omnitool/gradio/agent/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..1ef795a8a46bbef1f2c10e8db6cd2f696ce6c99f GIT binary patch literal 6148 zcmeHKu};H44E2Q$sk(GzV8}1@3kX&Cf-)innj(snD3PE8+wF{uEDZbwJ1cw=&u7y% zN?M5pRmhHf@8YvB&O0fNiO5Y?iwV(~h+-&XG{n#&yw2K@NG-hR!X5=(Q$b6*p}ga5 zhC^h4&u)a%%Bi78oZaU9^73{vE9$C|wBIjxdscOp&C7ZYkGOn&IeGg0INRsF_=R_~ z+|J^QlNnNpt3u8zdZ0VZ@7uYazV&-;=Xnu5|K!=TUsZ2%-0#`W&vSkz#(*(k3>-QG zsM##>NYF-Oz!)$F)(r6XA%QZcie4~$IxvJ5062g-2J7d5Y z_*V?LK{m^#cqOf^otNWUn?SFiEbQk6*CCjMQVd@%#mCSfuqQkLrixw=7Kr@_1R88G I2L6', - container=False, - elem_classes="no-padding" - ) + # with gr.Column(scale=3): + # iframe = gr.HTML( + # f'', + # container=False, + # elem_classes="no-padding" + # ) def update_model(model_selection, state): state["model"] = model_selection diff --git 
a/omnitool/gradio/app_new.py b/omnitool/gradio/app_new.py index d67ae185..c907dc3c 100644 --- a/omnitool/gradio/app_new.py +++ b/omnitool/gradio/app_new.py @@ -223,7 +223,7 @@ def valid_params(user_input, state): """Validate all requirements and return a list of error messages.""" errors = [] - for server_name, url in [('Windows Host', 'localhost:5000'), ('OmniParser Server', args.omniparser_server_url)]: + for server_name, url in [('OmniParser Server', args.omniparser_server_url)]: try: url = f'http://{url}/probe' response = requests.get(url, timeout=3) diff --git a/omnitool/gradio/tools/.DS_Store b/omnitool/gradio/tools/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c47b05954283b270d648d9cb24b3d36491d42556 GIT binary patch literal 6148 zcmeHKu}%Xq47F*6PU_MP#FUY>e~?zkj?@o0=oQhS)!o9-Eo;?I0MeW&l$j-EmDjWT{{ELfHQDlK+cDNCKwHqVm>;catQ#GXLJ_W zQcFloFpP#t5i=0hP@smgl^Cqy7!T$b4U?jV6I=1Yw)1=O!g+VBAF?}fRCMhOI0Ib< z2HG9V{eOjDrnkuNhIr2za0dPv13WC} str: + if torch.cuda.is_available(): + print("[+] Using CUDA") + return "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + print("[+] Using MPS for Apple Silicon") + return "mps" + else: + return "cpu" def get_caption_model_processor(model_name, model_name_or_path="Salesforce/blip2-opt-2.7b", device=None): if not device: - device = "cuda" if torch.cuda.is_available() else "cpu" + device = detect_device() if model_name == "blip2": from transformers import Blip2Processor, Blip2ForConditionalGeneration processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") @@ -107,7 +116,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_ start = time.time() batch = croped_pil_image[i:i+batch_size] t1 = time.time() - if model.device.type == 'cuda': + if model.device.type == 'cuda' or model.device.type == 'mps': inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16) else: inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device) From c659ca9577017c998224e1fffe8c9f7ea6e49d78 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Sat, 5 Apr 2025 00:08:42 +0700 Subject: [PATCH 2/6] added gemini support --- omnitool/gradio/agent/anthropic_agent.py | 2 +- .../gradio/agent/llm_utils/geminiclient.py | 89 ++++++++++++++++++ omnitool/gradio/agent/vlm_agent.py | 19 +++- omnitool/gradio/app.py | 8 +- omnitool/gradio/loop.py | 6 +- omnitool/gradio/tools/cursor.png | Bin 0 -> 3207 bytes omnitool/gradio/tools/screen_capture.py | 8 ++ requirements.txt | 4 +- 8 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 omnitool/gradio/agent/llm_utils/geminiclient.py create mode 100644 omnitool/gradio/tools/cursor.png diff --git a/omnitool/gradio/agent/anthropic_agent.py b/omnitool/gradio/agent/anthropic_agent.py index b1c744e2..6b9423fc 100644 --- a/omnitool/gradio/agent/anthropic_agent.py +++ b/omnitool/gradio/agent/anthropic_agent.py @@ -39,7 +39,7 @@ class APIProvider(StrEnum): VERTEX = "vertex" SYSTEM_PROMPT = f""" -* You are utilizing a Windows system with internet access. +* You are utilizing a {platform.system()} system with internet access. * The current date is {datetime.today().strftime('%A, %B %d, %Y')}. 
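The detect_device() helper added to util/utils.py above is the core of this patch: it replaces the hard-coded torch.device('cuda') so the demo also runs on Apple Silicon (MPS) and plain CPU. A minimal standalone sketch of the same fallback order, with the hasattr guard for older torch builds that lack the mps backend:

import torch

def detect_device() -> str:
    # Prefer CUDA, then Apple Silicon MPS, then fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

device = torch.device(detect_device())
x = torch.ones(2, 2, device=device)  # any tensor op now targets the detected device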
""" diff --git a/omnitool/gradio/agent/llm_utils/geminiclient.py b/omnitool/gradio/agent/llm_utils/geminiclient.py new file mode 100644 index 00000000..c610303d --- /dev/null +++ b/omnitool/gradio/agent/llm_utils/geminiclient.py @@ -0,0 +1,89 @@ +import os +from google import genai +from google.genai import types +import tiktoken + +from .utils import is_image_path, encode_image + +def estimate_token_count(text): + """Estimates the token count of a text string using tiktoken. + Adapt this for Gemini's specific vocabulary if necessary.""" + + # IMPORTANT: tiktoken is primarily for OpenAI models. + # You need to be aware of potential inaccuracies if Gemini + # uses a significantly different tokenization scheme. + + try: + encoding = tiktoken.get_encoding("cl100k_base") # This is a good starting point, but research Gemini tokenizer + tokens = encoding.encode(text) + return len(tokens) + except Exception as e: + print(f"Error estimating token count: {e}") + return None # or a reasonable default + +def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, temperature=0): + """ + Run a chat completion through Gemini's API, ignoring any images in the messages. + """ + api_key = api_key or os.environ.get("GEMINI_API_KEY") + if not api_key: + raise ValueError("GEMINI_API_KEY is not set") + + client = genai.Client( + api_key=api_key, + ) + + generate_content_config = types.GenerateContentConfig( + temperature=temperature, + response_mime_type="application/json", + system_instruction=[ + types.Part.from_text(text=system), + ], + ) + + contents = [] + + if type(messages) == list: + for item in messages: + parts = [] + if isinstance(item, dict): + for cnt in item["content"]: + if isinstance(cnt, str): + parts.append(types.Part.from_text(text=cnt)) + else: + # in this case it is a text block from anthropic + parts.append(types.Part.from_text(text=str(cnt))) + + else: # str + parts.append(types.Part.from_text(text=str(item))) + + content = (types.Content( + role="user", + parts=parts + )) + + contents.append(content) + + + elif isinstance(messages, str): + contents = [ + types.Content( + role="user", + parts=[types.Part.from_text(text=messages)] + ) + ] + + try: + response = client.models.generate_content( + model=model_name, + contents=contents, + config=generate_content_config + ) + final_answer = response.text + token_usage = estimate_token_count(final_answer) + + return final_answer, token_usage + except Exception as e: + print(f"Error in interleaved Gemini: {e}") + + return str(e), 0 diff --git a/omnitool/gradio/agent/vlm_agent.py b/omnitool/gradio/agent/vlm_agent.py index 9f631a70..afa748f1 100644 --- a/omnitool/gradio/agent/vlm_agent.py +++ b/omnitool/gradio/agent/vlm_agent.py @@ -5,6 +5,7 @@ from PIL import Image, ImageDraw import base64 from io import BytesIO +import platform from anthropic import APIResponse from anthropic.types import ToolResultBlockParam @@ -12,6 +13,7 @@ from agent.llm_utils.oaiclient import run_oai_interleaved from agent.llm_utils.groqclient import run_groq_interleaved +from agent.llm_utils.geminiclient import run_gemini_interleaved from agent.llm_utils.utils import is_image_path import time import re @@ -49,6 +51,8 @@ def __init__( self.model = "o1" elif model == "omniparser + o3-mini": self.model = "o3-mini" + elif model == "omniparser + gemini-2.0-flash": + self.model = "gemini-2.0-flash" else: raise ValueError(f"Model {model} not supported") @@ -133,6 +137,17 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]): 
print(f"qwen token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini-2.0-flash" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=planner_messages, + system=system, + model_name=self.model, + api_key=self.api_key, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 0.99 / 1000000) else: raise ValueError(f"Model {self.model} not supported") latency_vlm = time.time() - start @@ -209,9 +224,9 @@ def _api_response_callback(self, response: APIResponse): def _get_system_prompt(self, screen_info: str = ""): main_section = f""" -You are using a Windows device. +You are using a {platform.system()} device. You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot. -You can only interact with the desktop GUI (no terminal or application menu access). +You can only interact with the desktop GUI (no terminal or application menu access) and ignore the gradio interface (which opened in localhost:7888) including the orange send button there. You may be given some history plan and actions, this is the response from the previous loop. You should carefully consider your plan base on the task, screenshot, and history actions. diff --git a/omnitool/gradio/app.py b/omnitool/gradio/app.py index 260e31e2..43e692a1 100644 --- a/omnitool/gradio/app.py +++ b/omnitool/gradio/app.py @@ -27,7 +27,7 @@ API_KEY_FILE = CONFIG_DIR / "api_key" INTRO_TEXT = ''' -OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use (Sonnet).** +OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL), Gemini(2.0-flash) or Anthropic Computer Use (Sonnet).** Type a message and press submit to start OmniTool. Press stop to pause, and press the trash icon in the chat to clear the message history. 
''' @@ -241,7 +241,7 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - max_tokens=16384, + max_tokens=8192, omniparser_url=args.omniparser_server_url ): if loop_msg is None or state.get("stop"): @@ -302,7 +302,7 @@ def get_header_image_base64(): with gr.Column(): model = gr.Dropdown( label="Model", - choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"], + choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"], value="omniparser + gpt-4o", interactive=True, ) @@ -362,6 +362,8 @@ def update_model(model_selection, state): provider_choices = ["groq"] elif model_selection == "omniparser + qwen2.5vl": provider_choices = ["dashscope"] + elif model_selection == "omniparser + gemini-2.0-flash": + provider_choices = ["gemini"] else: provider_choices = [option.value for option in APIProvider] default_provider_value = provider_choices[0] diff --git a/omnitool/gradio/loop.py b/omnitool/gradio/loop.py index 9ce63169..bdd856e7 100644 --- a/omnitool/gradio/loop.py +++ b/omnitool/gradio/loop.py @@ -28,6 +28,7 @@ class APIProvider(StrEnum): BEDROCK = "bedrock" VERTEX = "vertex" OPENAI = "openai" + GEMINI = "gemini" PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { @@ -35,6 +36,7 @@ class APIProvider(StrEnum): APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", APIProvider.OPENAI: "gpt-4o", + APIProvider.GEMINI: "gemini-2.0-flash" } def sampling_loop_sync( @@ -66,7 +68,7 @@ def sampling_loop_sync( max_tokens=max_tokens, only_n_most_recent_images=only_n_most_recent_images ) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash"]): actor = VLMAgent( model=model, provider=provider, @@ -115,7 +117,7 @@ def sampling_loop_sync( messages.append({"content": tool_result_content, "role": "user"}) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): while True: parsed_screen = omniparser_client() tools_use_needed, vlm_response_json = actor(messages=messages, 
parsed_screen=parsed_screen) diff --git a/omnitool/gradio/tools/cursor.png b/omnitool/gradio/tools/cursor.png new file mode 100644 index 0000000000000000000000000000000000000000..d3a3c5bbe1d070b65dc3bb31ab18998823ff54f8 GIT binary patch literal 3207 zcmV;240!X2P)4Tx0C=43nP*T`Sr&lrd#`h#p}T1^G&xC>(17HeBu7z!?k3Yhlfj4#j=%^a zB1sWM6mY=NA;>5yq8Jbb1P9kR|KfS^ksY$5-z~0MD=PwVq%$yaD z{KlCp9Q}=pR%`ry?U(r|060?gDicnvaO5geyH=PT!%qZ2x^KB&g`LD-e!lcdFU4;& ztcK{yewfi*9+w}H%H=Ts!>#@M_`6={l;xNIUvKa~?z3`szR_FD)iVJxD*noCDFcA@ z8UUo^ubj$Z04Tcw=;-+?7kM0j&JqA5pRv>U>ECjw)!6|7PyiiB0ePSdG=L5;1g5|O z*Z@c12E2ej2mzY_8^nM_zy}#%E7%3{K_Mss2SF961r4AHw1P9B6PyQ^!8LFT+yx`x zF%W=B@CwX<_h1o%AS^_NXpj`70I5RSkRil^tRP3o1M-DJpl~P};z4Q9R%ka=2$e!r zP#x3+{REwZE<-n0l(yTktQMmQQyhO^;3cpqE= z*Tbja4)`*B3m$<5@GE#80SFOcAc}|%!b0p3FC-L+LiosbqyQ;L>XBBY8|g>xAp&F; z`G~@xXecI12W5eBK?R`LsASZ3R1vBQbrRKq>O<1hXr+_oWIpIQaJX|iW9M_EN!42c4aEo|Ryei%t?}g{! zv+%|E27EXE4t^59NDw2a6D$e-ggC-3LIt6faFy_c@Rmpry? zYLO0+VUgDqJVljaOW8!prj%38QtnV@Me(9)qV}Q@qT5BQMbC+jiO!4B#0aGrtPLR(5})Z=@_~?-IX3kFQorSAEeKT zQ^k$M1I07NtHm#f3m6DPjp52jU=%aXFh&`lB;+J)CAboMC0ZqhB^D%QByA+Qk_D1& zk|UCzq?l5UQt?vzr8=daN~5H;qo5nm7bNM%UH;;WeQ}@$UKsTWwm5|WpiYY z$qvZQ%gM_*%O%NG%3YD0m1oG?$j8Z-$@j?5D9{wF6k-+16fP>fWQsFwnF-7aW*>7- zQC87KF-@^faZqtlNnI&GDOagY>8Ub7nWY@1T&CQsJg1_d;;FJ#rA6hDDqfYP8l!qh zwO@5XOy9?8ZK55keOP;F4SJ2)8s3`vHDfv?9b27rofe&OU52iQZmw>p?rS|2y->aVdi{D| z^^Nrt^c(aa8&D103~~*+4gNILGGrT88{S__UhA}W$J(y7Z;Z5!xJE~e9vIV%J&pGm z_ZojOF*V_vw3^JAs+fkG)|ig5=qztm5$n1c+RV;uhuL|vkLD)kspefFZN?@&M9bsK>{oF>`hGWxcGi9r38)tjU_O+e9 z9pA3qZo%H%ew+Oz2gJeIp}=9#k?I)WSmpT4Ny#bNsl{o|+1NS9`H~CT#ocAU%b2U2 zE8Dfnb&1b{W z;X&ce;h!VCBI+aNBV8kFBInrl>_hBV92-sHcvj7 zJjZwEH>N--8&kSc>B}GKfi#V@@6yK8t-aX?ZFSqB?UCDicPQ`3+cCM*dFRPpgkABwZs+Rfmgc_O9kjbUPcCm~ z-o$s#-?ik6&>p|nGS3Xh7e zN|nlz%Fk6XRU_4Q)onGhHG69o4s#9bTCex;-X4&TA7Puv?W#&}ask=Wq{Mgm1(^}WY zXe&GoPN$uI`BV5$qi5XD^qys%ZT(sG=bCm}dqD@(k<~HR8Ph4~3h5d;=XS2Q+oHSu z7u{b@oL4+weL?&}aSx#~8nax}lz7>*34y z9PVAa?|T2%$oi3?(SXqhV_{>02b>4f4-+1~d6fQW@$t?l=qCkFMW2>GlYLhItLCq5 z0u#Z7afk7N=YG#0Pp~Im{g(3E=gGVmBT|nppLKhIafY1em zE+BM)|6>SMANiqfXNV0L}X@WMv9AzUjb(g0YyZD!Jw3t zmAwYchJYd>i;Ig=Sy|}^3Jd{7M3$D8q`JEL9dOpuWRhJYd>>+9>% z*w`2bN-}vWn?+qN7rL&yolfUl;9|}oKvPo_)6>&eTUuJ)164VL01XWd1OkD}t*xy= zpeAP!ptiOaUDuo2+S=v;d(I$$)oLXW2sqqs_dMXp83ZUVFURlq+dUr7JkXpo2vAyD z%IxfHO;1lx5V!=8mu>O#@-lmSdnd+CCKH{VomDFO53pHk(O zQhC5(RwqS7BpQuMMMcHSObe&{h8r6jL?V%-F`LasS6A0fQv51OalFIr?QL;593MR% zPb`s0B;__YH>Iek=xNsA{{FsnbaZ?LYy|}cqibtxC$`?#*Y_Q$$Oy*cap~>t-36}v zNm={*`*%}aXZiX0>7U)BS8sTD_$P4v-&{#a$&0P6tt2p!NQl?#jgszu92psriHQjT z?xaXNM@L709Pe^rVL_BqcT<9PyInL*dz@|ub#--bc6N4>z<4|^?d|QKfpe*9o^$#; zd~JMu{P1{}x~_{->Q<_POy}9;a=9W02M2$#a5ya8-QC}mQnxeIri~kulamq(g~aRi teo;yd0~h`m1F&eCwyc!84-}s^@iz_-B{HTjtFHh6002ovPDHLkV1i9}Eqwq0 literal 0 HcmV?d00001 diff --git a/omnitool/gradio/tools/screen_capture.py b/omnitool/gradio/tools/screen_capture.py index 407ecab0..249e6358 100644 --- a/omnitool/gradio/tools/screen_capture.py +++ b/omnitool/gradio/tools/screen_capture.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from uuid import uuid4 import requests @@ -23,6 +24,13 @@ def get_screenshot(resize: bool = False, target_width: int = 1920, target_height screenshot = screenshot.resize((target_width, target_height)) + cursor_path = os.path.join(os.path.dirname(__file__), "cursor.png") + cursor_x, cursor_y = pyautogui.position() + cursor = Image.open(cursor_path) + # make the cursor smaller + cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5))) + screenshot.paste(cursor, (cursor_x, cursor_y), cursor) + 
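# Note on the hunk above: the three-argument paste uses the RGBA cursor image
# itself as the alpha mask, so transparent pixels leave the screenshot
# untouched. A standalone sketch of the same overlay, assuming cursor.png has
# an alpha channel:
import pyautogui
from PIL import Image

shot = pyautogui.screenshot()
cursor = Image.open("cursor.png").convert("RGBA")
cursor = cursor.resize((cursor.width * 2 // 3, cursor.height * 2 // 3))
x, y = pyautogui.position()
shot.paste(cursor, (x, y), cursor)  # mask argument keeps the cursor's transparency
shot.save("screenshot_with_cursor.png")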
screenshot.save(path) return screenshot, path except Exception as e: diff --git a/requirements.txt b/requirements.txt index 901a27fa..ccc94b1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,6 @@ google-auth<3,>=2 screeninfo uiautomation dashscope -groq \ No newline at end of file +groq +google-genai +tiktoken \ No newline at end of file From 7b37788c6c78d157831f7482aa4bd23cd7b95f84 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 20:11:22 +0700 Subject: [PATCH 3/6] optimize gemini client and add gemini-2.5-flash --- .gitignore | 3 +- logs/gradio_gradio_app_20250418.log | 7 + logs/gradio_gradio_tools_20250418.log | 0 logs/gradio_sampling_loop_20250418.log | 2 + logs/gradio_screen_capture_20250418.log | 0 logs/image_utils_20250418.log | 0 logs/model_utils_20250418.log | 18 +++ logs/ocr_utils_20250418.log | 21 +++ logs/omniparser_20250418.log | 62 +++++++++ logs/utils_20250418.log | 70 ++++++++++ .../gradio/agent/llm_utils/geminiclient.py | 57 +++----- omnitool/gradio/agent/llm_utils/oaiclient.py | 3 - omnitool/gradio/agent/vlm_agent.py | 20 ++- .../agent/vlm_agent_with_orchestrator.py | 128 +++++++++++++++++- omnitool/gradio/app.py | 8 +- omnitool/gradio/loop.py | 6 +- requirements.txt | 3 +- 17 files changed, 353 insertions(+), 55 deletions(-) create mode 100644 logs/gradio_gradio_app_20250418.log create mode 100644 logs/gradio_gradio_tools_20250418.log create mode 100644 logs/gradio_sampling_loop_20250418.log create mode 100644 logs/gradio_screen_capture_20250418.log create mode 100644 logs/image_utils_20250418.log create mode 100644 logs/model_utils_20250418.log create mode 100644 logs/ocr_utils_20250418.log create mode 100644 logs/omniparser_20250418.log create mode 100644 logs/utils_20250418.log diff --git a/.gitignore b/.gitignore index 8b8235e6..1761ebd1 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ util/__pycache__/ index.html?linkid=2289031 wget-log weights/icon_caption_florence_v2/ -omnitool/gradio/uploads/ \ No newline at end of file +omnitool/gradio/uploads/ +.DS_Store \ No newline at end of file diff --git a/logs/gradio_gradio_app_20250418.log b/logs/gradio_gradio_app_20250418.log new file mode 100644 index 00000000..b60e93a4 --- /dev/null +++ b/logs/gradio_gradio_app_20250418.log @@ -0,0 +1,7 @@ +2025-04-18 19:49:59 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 +2025-04-18 19:50:20 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.0-flash +2025-04-18 19:51:16 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 +2025-04-18 19:51:22 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.5-flash-preview-04-17 +2025-04-18 19:52:05 - gradio_app - INFO - [+] Processing user input: 'on my spotify play "garam dan madu"...' 
(truncated) +2025-04-18 19:52:05 - gradio_app - ERROR - [-] Error in sampling loop: name 'OmniParserClient' is not defined +2025-04-18 19:52:05 - gradio_app - INFO - [+] Input processing completed in 0.14s diff --git a/logs/gradio_gradio_tools_20250418.log b/logs/gradio_gradio_tools_20250418.log new file mode 100644 index 00000000..e69de29b diff --git a/logs/gradio_sampling_loop_20250418.log b/logs/gradio_sampling_loop_20250418.log new file mode 100644 index 00000000..867a09da --- /dev/null +++ b/logs/gradio_sampling_loop_20250418.log @@ -0,0 +1,2 @@ +2025-04-18 19:52:05 - sampling_loop - INFO - [+] Initializing sampling loop with model: omniparser + gemini-2.5-flash-preview-04-17 +2025-04-18 19:52:05 - sampling_loop - ERROR - [-] Failed to initialize OmniParser client: name 'OmniParserClient' is not defined diff --git a/logs/gradio_screen_capture_20250418.log b/logs/gradio_screen_capture_20250418.log new file mode 100644 index 00000000..e69de29b diff --git a/logs/image_utils_20250418.log b/logs/image_utils_20250418.log new file mode 100644 index 00000000..e69de29b diff --git a/logs/model_utils_20250418.log b/logs/model_utils_20250418.log new file mode 100644 index 00000000..48f1d970 --- /dev/null +++ b/logs/model_utils_20250418.log @@ -0,0 +1,18 @@ +2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:58:52 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt +2025-04-18 18:58:52 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt +2025-04-18 18:58:52 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence +2025-04-18 18:59:03 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps +2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:07 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt +2025-04-18 18:59:07 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt +2025-04-18 18:59:07 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence +2025-04-18 18:59:20 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps +2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:20 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt +2025-04-18 18:59:20 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt +2025-04-18 18:59:20 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence +2025-04-18 18:59:29 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps diff --git a/logs/ocr_utils_20250418.log b/logs/ocr_utils_20250418.log new file mode 100644 index 00000000..86e03b17 --- /dev/null +++ b/logs/ocr_utils_20250418.log @@ -0,0 +1,21 @@ +2025-04-18 19:00:23 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:00:23 - ocr_utils - INFO - Initializing EasyOCR reader +2025-04-18 19:00:27 - ocr_utils - INFO - EasyOCR found 36 text elements +2025-04-18 19:00:43 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:00:45 - ocr_utils - INFO - EasyOCR found 35 text elements +2025-04-18 
19:01:06 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:01:08 - ocr_utils - INFO - EasyOCR found 101 text elements +2025-04-18 19:01:31 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:01:32 - ocr_utils - INFO - EasyOCR found 86 text elements +2025-04-18 19:01:52 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:01:54 - ocr_utils - INFO - EasyOCR found 110 text elements +2025-04-18 19:02:12 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:02:14 - ocr_utils - INFO - EasyOCR found 86 text elements +2025-04-18 19:02:55 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:02:57 - ocr_utils - INFO - EasyOCR found 99 text elements +2025-04-18 19:03:11 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:03:14 - ocr_utils - INFO - EasyOCR found 84 text elements +2025-04-18 19:03:24 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:03:27 - ocr_utils - INFO - EasyOCR found 21 text elements +2025-04-18 19:03:41 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:03:44 - ocr_utils - INFO - EasyOCR found 125 text elements diff --git a/logs/omniparser_20250418.log b/logs/omniparser_20250418.log new file mode 100644 index 00000000..e381e029 --- /dev/null +++ b/logs/omniparser_20250418.log @@ -0,0 +1,62 @@ +2025-04-18 18:58:51 - omniparser - INFO - Initializing OmniParser +2025-04-18 18:58:52 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt +2025-04-18 18:59:03 - omniparser - INFO - Caption model loaded: florence2 +2025-04-18 18:59:03 - omniparser - INFO - OmniParser initialization complete! +2025-04-18 18:59:07 - omniparser - INFO - Initializing OmniParser +2025-04-18 18:59:07 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt +2025-04-18 18:59:20 - omniparser - INFO - Caption model loaded: florence2 +2025-04-18 18:59:20 - omniparser - INFO - OmniParser initialization complete! +2025-04-18 18:59:20 - omniparser - INFO - Initializing OmniParser +2025-04-18 18:59:20 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt +2025-04-18 18:59:29 - omniparser - INFO - Caption model loaded: florence2 +2025-04-18 18:59:29 - omniparser - INFO - OmniParser initialization complete! +2025-04-18 19:00:23 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:00:23 - omniparser - INFO - Running OCR on image +2025-04-18 19:00:27 - omniparser - INFO - OCR found 36 text elements +2025-04-18 19:00:27 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:00:29 - omniparser - INFO - Parsing complete. Found 62 UI elements +2025-04-18 19:00:43 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:00:43 - omniparser - INFO - Running OCR on image +2025-04-18 19:00:45 - omniparser - INFO - OCR found 35 text elements +2025-04-18 19:00:45 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:00:47 - omniparser - INFO - Parsing complete. Found 58 UI elements +2025-04-18 19:01:06 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:01:06 - omniparser - INFO - Running OCR on image +2025-04-18 19:01:08 - omniparser - INFO - OCR found 101 text elements +2025-04-18 19:01:08 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:01:10 - omniparser - INFO - Parsing complete. 
Found 130 UI elements +2025-04-18 19:01:31 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:01:31 - omniparser - INFO - Running OCR on image +2025-04-18 19:01:32 - omniparser - INFO - OCR found 86 text elements +2025-04-18 19:01:32 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:01:34 - omniparser - INFO - Parsing complete. Found 127 UI elements +2025-04-18 19:01:52 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:01:52 - omniparser - INFO - Running OCR on image +2025-04-18 19:01:54 - omniparser - INFO - OCR found 110 text elements +2025-04-18 19:01:54 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:01:56 - omniparser - INFO - Parsing complete. Found 141 UI elements +2025-04-18 19:02:12 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:02:12 - omniparser - INFO - Running OCR on image +2025-04-18 19:02:14 - omniparser - INFO - OCR found 86 text elements +2025-04-18 19:02:14 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:02:15 - omniparser - INFO - Parsing complete. Found 127 UI elements +2025-04-18 19:02:55 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:02:55 - omniparser - INFO - Running OCR on image +2025-04-18 19:02:57 - omniparser - INFO - OCR found 99 text elements +2025-04-18 19:02:57 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:00 - omniparser - INFO - Parsing complete. Found 126 UI elements +2025-04-18 19:03:11 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:03:11 - omniparser - INFO - Running OCR on image +2025-04-18 19:03:14 - omniparser - INFO - OCR found 84 text elements +2025-04-18 19:03:14 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:16 - omniparser - INFO - Parsing complete. Found 115 UI elements +2025-04-18 19:03:24 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:03:24 - omniparser - INFO - Running OCR on image +2025-04-18 19:03:27 - omniparser - INFO - OCR found 21 text elements +2025-04-18 19:03:27 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:30 - omniparser - INFO - Parsing complete. Found 73 UI elements +2025-04-18 19:03:41 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:03:41 - omniparser - INFO - Running OCR on image +2025-04-18 19:03:44 - omniparser - INFO - OCR found 125 text elements +2025-04-18 19:03:44 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:47 - omniparser - INFO - Parsing complete. 
Found 151 UI elements diff --git a/logs/utils_20250418.log b/logs/utils_20250418.log new file mode 100644 index 00000000..0e702fe6 --- /dev/null +++ b/logs/utils_20250418.log @@ -0,0 +1,70 @@ +2025-04-18 19:00:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:00:28 - utils - INFO - Found 62 filtered boxes (starting_idx=33) +2025-04-18 19:00:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:00:28 - utils - INFO - Processing 29 regions in 1 batches +2025-04-18 19:00:29 - utils - INFO - All captions generated in 1.02s +2025-04-18 19:00:29 - utils - INFO - Caption processing completed in 1.02s +2025-04-18 19:00:29 - utils - INFO - SOM labeling completed in 2.17s +2025-04-18 19:00:45 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:00:45 - utils - INFO - Found 58 filtered boxes (starting_idx=31) +2025-04-18 19:00:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:00:45 - utils - INFO - Processing 27 regions in 1 batches +2025-04-18 19:00:47 - utils - INFO - All captions generated in 1.91s +2025-04-18 19:00:47 - utils - INFO - Caption processing completed in 1.91s +2025-04-18 19:00:47 - utils - INFO - SOM labeling completed in 2.84s +2025-04-18 19:01:08 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:01:09 - utils - INFO - Found 130 filtered boxes (starting_idx=87) +2025-04-18 19:01:09 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:01:09 - utils - INFO - Processing 43 regions in 1 batches +2025-04-18 19:01:10 - utils - INFO - All captions generated in 1.24s +2025-04-18 19:01:10 - utils - INFO - Caption processing completed in 1.24s +2025-04-18 19:01:10 - utils - INFO - SOM labeling completed in 2.18s +2025-04-18 19:01:32 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:01:33 - utils - INFO - Found 127 filtered boxes (starting_idx=86) +2025-04-18 19:01:33 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:01:33 - utils - INFO - Processing 41 regions in 1 batches +2025-04-18 19:01:34 - utils - INFO - All captions generated in 0.93s +2025-04-18 19:01:34 - utils - INFO - Caption processing completed in 0.93s +2025-04-18 19:01:34 - utils - INFO - SOM labeling completed in 1.73s +2025-04-18 19:01:54 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:01:55 - utils - INFO - Found 141 filtered boxes (starting_idx=97) +2025-04-18 19:01:55 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:01:55 - utils - INFO - Processing 44 regions in 1 batches +2025-04-18 19:01:56 - utils - INFO - All captions generated in 1.08s +2025-04-18 19:01:56 - utils - INFO - Caption processing completed in 1.08s +2025-04-18 19:01:56 - utils - INFO - SOM labeling completed in 1.83s +2025-04-18 19:02:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:02:14 - utils - INFO - Found 127 filtered boxes (starting_idx=86) +2025-04-18 19:02:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:02:14 - utils - INFO - Processing 41 regions in 1 batches +2025-04-18 19:02:15 - utils - INFO - All captions generated in 0.82s +2025-04-18 19:02:15 - utils - INFO - Caption processing completed in 0.82s +2025-04-18 
19:02:15 - utils - INFO - SOM labeling completed in 1.67s +2025-04-18 19:02:57 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:02:58 - utils - INFO - Found 126 filtered boxes (starting_idx=85) +2025-04-18 19:02:58 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:02:58 - utils - INFO - Processing 41 regions in 1 batches +2025-04-18 19:02:59 - utils - INFO - All captions generated in 1.14s +2025-04-18 19:02:59 - utils - INFO - Caption processing completed in 1.14s +2025-04-18 19:03:00 - utils - INFO - SOM labeling completed in 2.10s +2025-04-18 19:03:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:03:14 - utils - INFO - Found 115 filtered boxes (starting_idx=78) +2025-04-18 19:03:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:03:14 - utils - INFO - Processing 37 regions in 1 batches +2025-04-18 19:03:16 - utils - INFO - All captions generated in 1.47s +2025-04-18 19:03:16 - utils - INFO - Caption processing completed in 1.47s +2025-04-18 19:03:16 - utils - INFO - SOM labeling completed in 2.14s +2025-04-18 19:03:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:03:28 - utils - INFO - Found 73 filtered boxes (starting_idx=21) +2025-04-18 19:03:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:03:28 - utils - INFO - Processing 52 regions in 1 batches +2025-04-18 19:03:30 - utils - INFO - All captions generated in 2.59s +2025-04-18 19:03:30 - utils - INFO - Caption processing completed in 2.60s +2025-04-18 19:03:30 - utils - INFO - SOM labeling completed in 3.79s +2025-04-18 19:03:44 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:03:45 - utils - INFO - Found 151 filtered boxes (starting_idx=96) +2025-04-18 19:03:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:03:45 - utils - INFO - Processing 55 regions in 1 batches +2025-04-18 19:03:47 - utils - INFO - All captions generated in 1.70s +2025-04-18 19:03:47 - utils - INFO - Caption processing completed in 1.70s +2025-04-18 19:03:47 - utils - INFO - SOM labeling completed in 2.81s diff --git a/omnitool/gradio/agent/llm_utils/geminiclient.py b/omnitool/gradio/agent/llm_utils/geminiclient.py index c610303d..ec17341d 100644 --- a/omnitool/gradio/agent/llm_utils/geminiclient.py +++ b/omnitool/gradio/agent/llm_utils/geminiclient.py @@ -1,27 +1,20 @@ import os from google import genai from google.genai import types -import tiktoken +from pydantic import BaseModel, Field +from typing import Optional +from PIL import Image +from pprint import pprint from .utils import is_image_path, encode_image -def estimate_token_count(text): - """Estimates the token count of a text string using tiktoken. - Adapt this for Gemini's specific vocabulary if necessary.""" +class Action(BaseModel): + reasoning: str = Field(..., alias="Reasoning") + next_action: str = Field(..., alias="Next Action") + box_id: str | None = Field(None, alias="Box ID") + value: str | None = None - # IMPORTANT: tiktoken is primarily for OpenAI models. - # You need to be aware of potential inaccuracies if Gemini - # uses a significantly different tokenization scheme. 
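One caution on the contents-building rewrite in the hunk below: the trailing else branch appends str(cnt) where cnt is no longer in scope (it should be item), and contents.push(messages) is not a Python list method, so the plain-string fallback path would raise at runtime. A corrected sketch of the intended interleaving, with is_image_path inlined for self-containment:

from PIL import Image

def is_image_path(s: str) -> bool:
    # Stand-in for agent.llm_utils.utils.is_image_path.
    return s.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif"))

def build_contents(messages) -> list:
    # Flatten chat messages into google-genai `contents`, loading image paths
    # as PIL images and passing everything else through as text.
    if not isinstance(messages, list):
        return [str(messages)]
    contents = []
    for item in messages:
        if isinstance(item, dict):
            for cnt in item["content"]:
                if isinstance(cnt, str) and is_image_path(cnt):
                    contents.append(Image.open(cnt))
                else:
                    contents.append(str(cnt))
        else:
            contents.append(str(item))
    return contents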
- - try: - encoding = tiktoken.get_encoding("cl100k_base") # This is a good starting point, but research Gemini tokenizer - tokens = encoding.encode(text) - return len(tokens) - except Exception as e: - print(f"Error estimating token count: {e}") - return None # or a reasonable default - -def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, temperature=0): +def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens: int, temperature=0): """ Run a chat completion through Gemini's API, ignoring any images in the messages. """ @@ -35,7 +28,9 @@ def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key generate_content_config = types.GenerateContentConfig( temperature=temperature, + max_output_tokens=max_tokens, response_mime_type="application/json", + response_schema=Action, system_instruction=[ types.Part.from_text(text=system), ], @@ -45,33 +40,21 @@ def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key if type(messages) == list: for item in messages: - parts = [] if isinstance(item, dict): for cnt in item["content"]: if isinstance(cnt, str): - parts.append(types.Part.from_text(text=cnt)) + if is_image_path(cnt): + contents.append(Image.open(cnt)) + else: + contents.append(cnt) else: - # in this case it is a text block from anthropic - parts.append(types.Part.from_text(text=str(cnt))) + contents.append(str(cnt)) else: # str - parts.append(types.Part.from_text(text=str(item))) - - content = (types.Content( - role="user", - parts=parts - )) - - contents.append(content) + contents.append(str(cnt)) - elif isinstance(messages, str): - contents = [ - types.Content( - role="user", - parts=[types.Part.from_text(text=messages)] - ) - ] + contents.push(messages) try: response = client.models.generate_content( @@ -80,7 +63,7 @@ def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key config=generate_content_config ) final_answer = response.text - token_usage = estimate_token_count(final_answer) + token_usage = response.usage_metadata.total_token_count return final_answer, token_usage except Exception as e: diff --git a/omnitool/gradio/agent/llm_utils/oaiclient.py b/omnitool/gradio/agent/llm_utils/oaiclient.py index ad421100..768a86e8 100644 --- a/omnitool/gradio/agent/llm_utils/oaiclient.py +++ b/omnitool/gradio/agent/llm_utils/oaiclient.py @@ -1,6 +1,3 @@ -import os -import logging -import base64 import requests from .utils import is_image_path, encode_image diff --git a/omnitool/gradio/agent/vlm_agent.py b/omnitool/gradio/agent/vlm_agent.py index afa748f1..ee6f6d0e 100644 --- a/omnitool/gradio/agent/vlm_agent.py +++ b/omnitool/gradio/agent/vlm_agent.py @@ -53,6 +53,8 @@ def __init__( self.model = "o3-mini" elif model == "omniparser + gemini-2.0-flash": self.model = "gemini-2.0-flash" + elif model == "omniparser + gemini-2.5-flash-preview-04-17": + self.model = "gemini-2.5-flash-preview-04-17" else: raise ValueError(f"Model {model} not supported") @@ -137,17 +139,18 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]): print(f"qwen token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a - elif "gemini-2.0-flash" in self.model: + elif "gemini" in self.model: vlm_response, token_usage = run_gemini_interleaved( messages=planner_messages, system=system, 
model_name=self.model, api_key=self.api_key, + max_tokens=self.max_tokens, temperature=0, ) print(f"gemini token usage: {token_usage}") self.total_token_usage += token_usage - self.total_cost += (token_usage * 0.99 / 1000000) + self.total_cost += 0 # assume using free tier else: raise ValueError(f"Model {self.model} not supported") latency_vlm = time.time() - start @@ -226,7 +229,9 @@ def _get_system_prompt(self, screen_info: str = ""): main_section = f""" You are using a {platform.system()} device. You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot. -You can only interact with the desktop GUI (no terminal or application menu access) and ignore the gradio interface (which opened in localhost:7888) including the orange send button there. +You can only interact with the desktop GUI (no terminal or application menu access) + +!!!DO NOT interact with the chatbot webpage interface that opens in 0.0.0.0:7888. You don't need to click the orange send button because the user already clicked it!!! You may be given some history plan and actions, this is the response from the previous loop. You should carefully consider your plan base on the task, screenshot, and history actions. @@ -245,6 +250,15 @@ def _get_system_prompt(self, screen_info: str = ""): Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task. +Use this JSON schema: + +Action = {{ + "Reasoning": str, + "Next Action": str, + "Box ID": str | None, + "value": str | None +}} + Output format: ```json {{ diff --git a/omnitool/gradio/agent/vlm_agent_with_orchestrator.py b/omnitool/gradio/agent/vlm_agent_with_orchestrator.py index 74d554a8..4b5d0275 100644 --- a/omnitool/gradio/agent/vlm_agent_with_orchestrator.py +++ b/omnitool/gradio/agent/vlm_agent_with_orchestrator.py @@ -14,6 +14,7 @@ from agent.llm_utils.oaiclient import run_oai_interleaved from agent.llm_utils.groqclient import run_groq_interleaved +from agent.llm_utils.geminiclient import run_gemini_interleaved from agent.llm_utils.utils import is_image_path import time import re @@ -85,6 +86,10 @@ def __init__( self.model = "o1" elif model == "omniparser + o3-mini" or model == "omniparser + o3-mini-orchestrated": self.model = "o3-mini" + elif model == "omniparser + gemini-2.0-flash" or model == "omniparser + gemini-2.0-flash-orchestrated": + self.model = "gemini-2.0-flash" + elif model == "omniparser + gemini-2.5-flash-preview-04-17" or model == "omniparser + gemini-2.5-flash-preview-04-17-orchestrated": + self.model = "gemini-2.5-flash-preview-04-17" else: raise ValueError(f"Model {model} not supported") @@ -194,6 +199,18 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]): print(f"qwen token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=planner_messages, + system=system, + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + 
self.total_cost += 0 # assume using free tier else: raise ValueError(f"Model {self.model} not supported") latency_vlm = time.time() - start @@ -312,6 +329,15 @@ def _get_system_prompt(self, screen_info: str = ""): Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task. +Use this JSON schema: + +Action = {{ + "Reasoning": str, + "Next Action": str, + "Box ID": str | None, + "value": str | None +}} + Output format: ```json {{ @@ -381,7 +407,9 @@ def _initialize_task(self, messages: list): plan_prompt = self._get_plan_prompt(self._task) input_message = copy.deepcopy(messages) input_message.append({"role": "user", "content": plan_prompt}) - vlm_response, token_usage = run_oai_interleaved( + + if "gpt" in self.model or "o1" in self.model or "o3-mini" in self.model: + vlm_response, token_usage = run_oai_interleaved( messages=input_message, system="", model_name=self.model, @@ -390,6 +418,53 @@ def _initialize_task(self, messages: list): provider_base_url="https://api.openai.com/v1", temperature=0, ) + print(f"oai token usage: {token_usage}") + self.total_token_usage += token_usage + if 'gpt' in self.model: + self.total_cost += (token_usage * 2.5 / 1000000) # https://openai.com/api/pricing/ + elif 'o1' in self.model: + self.total_cost += (token_usage * 15 / 1000000) # https://openai.com/api/pricing/ + elif 'o3-mini' in self.model: + self.total_cost += (token_usage * 1.1 / 1000000) # https://openai.com/api/pricing/ + elif "r1" in self.model: + vlm_response, token_usage = run_groq_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + ) + print(f"groq token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 0.99 / 1000000) + elif "qwen" in self.model: + vlm_response, token_usage = run_oai_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=min(2048, self.max_tokens), + provider_base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + temperature=0, + ) + print(f"qwen token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += 0 # assume using free tier + else: + raise ValueError(f"Model {self.model} not supported") + plan = extract_data(vlm_response, "json") # Create a filename with timestamp @@ -413,7 +488,9 @@ def _update_ledger(self, messages): update_ledger_prompt = ORCHESTRATOR_LEDGER_PROMPT.format(task=self._task) input_message = copy.deepcopy(messages) input_message.append({"role": "user", "content": update_ledger_prompt}) - vlm_response, token_usage = run_oai_interleaved( + + if "gpt" in self.model or "o1" in self.model or "o3-mini" in self.model: + vlm_response, token_usage = run_oai_interleaved( messages=input_message, 
system="", model_name=self.model, @@ -422,6 +499,53 @@ def _update_ledger(self, messages): provider_base_url="https://api.openai.com/v1", temperature=0, ) + print(f"oai token usage: {token_usage}") + self.total_token_usage += token_usage + if 'gpt' in self.model: + self.total_cost += (token_usage * 2.5 / 1000000) # https://openai.com/api/pricing/ + elif 'o1' in self.model: + self.total_cost += (token_usage * 15 / 1000000) # https://openai.com/api/pricing/ + elif 'o3-mini' in self.model: + self.total_cost += (token_usage * 1.1 / 1000000) # https://openai.com/api/pricing/ + elif "r1" in self.model: + vlm_response, token_usage = run_groq_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + ) + print(f"groq token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 0.99 / 1000000) + elif "qwen" in self.model: + vlm_response, token_usage = run_oai_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=min(2048, self.max_tokens), + provider_base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + temperature=0, + ) + print(f"qwen token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += 0 # assume using free tier + else: + raise ValueError(f"Model {self.model} not supported") + updated_ledger = extract_data(vlm_response, "json") return updated_ledger diff --git a/omnitool/gradio/app.py b/omnitool/gradio/app.py index 43e692a1..59e6c855 100644 --- a/omnitool/gradio/app.py +++ b/omnitool/gradio/app.py @@ -27,7 +27,7 @@ API_KEY_FILE = CONFIG_DIR / "api_key" INTRO_TEXT = ''' -OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL), Gemini(2.0-flash) or Anthropic Computer Use (Sonnet).** +OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL), Gemini (2.0/2.5) or Anthropic Computer Use (Sonnet).** Type a message and press submit to start OmniTool. Press stop to pause, and press the trash icon in the chat to clear the message history. 
''' @@ -241,7 +241,7 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - max_tokens=8192, + max_tokens=16384, omniparser_url=args.omniparser_server_url ): if loop_msg is None or state.get("stop"): @@ -302,7 +302,7 @@ def get_header_image_base64(): with gr.Column(): model = gr.Dropdown( label="Model", - choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"], + choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.5-flash-preview-04-17-orchestrated"], value="omniparser + gpt-4o", interactive=True, ) @@ -362,7 +362,7 @@ def update_model(model_selection, state): provider_choices = ["groq"] elif model_selection == "omniparser + qwen2.5vl": provider_choices = ["dashscope"] - elif model_selection == "omniparser + gemini-2.0-flash": + elif model_selection in set(["omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.5-flash-preview-04-17-orchestrated"]): provider_choices = ["gemini"] else: provider_choices = [option.value for option in APIProvider] diff --git a/omnitool/gradio/loop.py b/omnitool/gradio/loop.py index bdd856e7..323cb4b2 100644 --- a/omnitool/gradio/loop.py +++ b/omnitool/gradio/loop.py @@ -68,7 +68,7 @@ def sampling_loop_sync( max_tokens=max_tokens, only_n_most_recent_images=only_n_most_recent_images ) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17"]): actor = VLMAgent( model=model, provider=provider, @@ -78,7 +78,7 @@ def sampling_loop_sync( max_tokens=max_tokens, only_n_most_recent_images=only_n_most_recent_images ) - elif model in set(["omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): + elif model in set(["omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.5-flash-preview-04-17-orchestrated"]): actor = VLMOrchestratedAgent( model=model, provider=provider, @@ -117,7 +117,7 @@ def sampling_loop_sync( messages.append({"content": tool_result_content, "role": "user"}) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", 
"omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.0-flash-thinking-exp-orchestrated"]): while True: parsed_screen = omniparser_client() tools_use_needed, vlm_response_json = actor(messages=messages, parsed_screen=parsed_screen) diff --git a/requirements.txt b/requirements.txt index ccc94b1e..b58b5423 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,5 +30,4 @@ screeninfo uiautomation dashscope groq -google-genai -tiktoken \ No newline at end of file +google-genai \ No newline at end of file From 73b7e70fc02413d497e93136902474e10c7afaeb Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 21:22:55 +0700 Subject: [PATCH 4/6] Refactor OmniParser integration and enhance device support - Updated .gitignore to ignore all .DS_Store files. - Modified AnthropicAgent to accept host_device argument for better flexibility. - Enhanced OmniParserClient to utilize host_device for screenshot functionality. - Updated app.py and app_new.py to include host_device argument in command line options. - Refactored ComputerTool to support actions based on host_device, improving compatibility with local and omnibox_windows environments. - Adjusted get_screenshot function to handle different host_device scenarios for capturing screenshots. 
--- .gitignore | 2 +- logs/gradio_gradio_app_20250418.log | 7 -- logs/gradio_gradio_tools_20250418.log | 0 logs/gradio_sampling_loop_20250418.log | 2 - logs/gradio_screen_capture_20250418.log | 0 logs/image_utils_20250418.log | 0 logs/model_utils_20250418.log | 18 ----- logs/ocr_utils_20250418.log | 21 ------ logs/omniparser_20250418.log | 62 ---------------- logs/utils_20250418.log | 70 ------------------- omnitool/gradio/agent/anthropic_agent.py | 3 +- .../agent/llm_utils/omniparserclient.py | 4 +- omnitool/gradio/app.py | 30 +++++--- omnitool/gradio/app_new.py | 11 ++- .../gradio/executor/anthropic_executor.py | 3 +- omnitool/gradio/loop.py | 6 +- omnitool/gradio/tools/computer.py | 55 +++++++++------ omnitool/gradio/tools/screen_capture.py | 34 +++++---- 18 files changed, 93 insertions(+), 235 deletions(-) delete mode 100644 logs/gradio_gradio_app_20250418.log delete mode 100644 logs/gradio_gradio_tools_20250418.log delete mode 100644 logs/gradio_sampling_loop_20250418.log delete mode 100644 logs/gradio_screen_capture_20250418.log delete mode 100644 logs/image_utils_20250418.log delete mode 100644 logs/model_utils_20250418.log delete mode 100644 logs/ocr_utils_20250418.log delete mode 100644 logs/omniparser_20250418.log delete mode 100644 logs/utils_20250418.log diff --git a/.gitignore b/.gitignore index 1761ebd1..20db02ca 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ index.html?linkid=2289031 wget-log weights/icon_caption_florence_v2/ omnitool/gradio/uploads/ -.DS_Store \ No newline at end of file +**/.DS_Store \ No newline at end of file diff --git a/logs/gradio_gradio_app_20250418.log b/logs/gradio_gradio_app_20250418.log deleted file mode 100644 index b60e93a4..00000000 --- a/logs/gradio_gradio_app_20250418.log +++ /dev/null @@ -1,7 +0,0 @@ -2025-04-18 19:49:59 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 -2025-04-18 19:50:20 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.0-flash -2025-04-18 19:51:16 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 -2025-04-18 19:51:22 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.5-flash-preview-04-17 -2025-04-18 19:52:05 - gradio_app - INFO - [+] Processing user input: 'on my spotify play "garam dan madu"...' 
(truncated) -2025-04-18 19:52:05 - gradio_app - ERROR - [-] Error in sampling loop: name 'OmniParserClient' is not defined -2025-04-18 19:52:05 - gradio_app - INFO - [+] Input processing completed in 0.14s diff --git a/logs/gradio_gradio_tools_20250418.log b/logs/gradio_gradio_tools_20250418.log deleted file mode 100644 index e69de29b..00000000 diff --git a/logs/gradio_sampling_loop_20250418.log b/logs/gradio_sampling_loop_20250418.log deleted file mode 100644 index 867a09da..00000000 --- a/logs/gradio_sampling_loop_20250418.log +++ /dev/null @@ -1,2 +0,0 @@ -2025-04-18 19:52:05 - sampling_loop - INFO - [+] Initializing sampling loop with model: omniparser + gemini-2.5-flash-preview-04-17 -2025-04-18 19:52:05 - sampling_loop - ERROR - [-] Failed to initialize OmniParser client: name 'OmniParserClient' is not defined diff --git a/logs/gradio_screen_capture_20250418.log b/logs/gradio_screen_capture_20250418.log deleted file mode 100644 index e69de29b..00000000 diff --git a/logs/image_utils_20250418.log b/logs/image_utils_20250418.log deleted file mode 100644 index e69de29b..00000000 diff --git a/logs/model_utils_20250418.log b/logs/model_utils_20250418.log deleted file mode 100644 index 48f1d970..00000000 --- a/logs/model_utils_20250418.log +++ /dev/null @@ -1,18 +0,0 @@ -2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:58:52 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt -2025-04-18 18:58:52 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt -2025-04-18 18:58:52 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence -2025-04-18 18:59:03 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps -2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:07 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt -2025-04-18 18:59:07 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt -2025-04-18 18:59:07 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence -2025-04-18 18:59:20 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps -2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:20 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt -2025-04-18 18:59:20 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt -2025-04-18 18:59:20 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence -2025-04-18 18:59:29 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps diff --git a/logs/ocr_utils_20250418.log b/logs/ocr_utils_20250418.log deleted file mode 100644 index 86e03b17..00000000 --- a/logs/ocr_utils_20250418.log +++ /dev/null @@ -1,21 +0,0 @@ -2025-04-18 19:00:23 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:00:23 - ocr_utils - INFO - Initializing EasyOCR reader -2025-04-18 19:00:27 - ocr_utils - INFO - EasyOCR found 36 text elements -2025-04-18 19:00:43 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:00:45 - ocr_utils - INFO - EasyOCR found 35 
text elements -2025-04-18 19:01:06 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:01:08 - ocr_utils - INFO - EasyOCR found 101 text elements -2025-04-18 19:01:31 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:01:32 - ocr_utils - INFO - EasyOCR found 86 text elements -2025-04-18 19:01:52 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:01:54 - ocr_utils - INFO - EasyOCR found 110 text elements -2025-04-18 19:02:12 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:02:14 - ocr_utils - INFO - EasyOCR found 86 text elements -2025-04-18 19:02:55 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:02:57 - ocr_utils - INFO - EasyOCR found 99 text elements -2025-04-18 19:03:11 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:03:14 - ocr_utils - INFO - EasyOCR found 84 text elements -2025-04-18 19:03:24 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:03:27 - ocr_utils - INFO - EasyOCR found 21 text elements -2025-04-18 19:03:41 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:03:44 - ocr_utils - INFO - EasyOCR found 125 text elements diff --git a/logs/omniparser_20250418.log b/logs/omniparser_20250418.log deleted file mode 100644 index e381e029..00000000 --- a/logs/omniparser_20250418.log +++ /dev/null @@ -1,62 +0,0 @@ -2025-04-18 18:58:51 - omniparser - INFO - Initializing OmniParser -2025-04-18 18:58:52 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt -2025-04-18 18:59:03 - omniparser - INFO - Caption model loaded: florence2 -2025-04-18 18:59:03 - omniparser - INFO - OmniParser initialization complete! -2025-04-18 18:59:07 - omniparser - INFO - Initializing OmniParser -2025-04-18 18:59:07 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt -2025-04-18 18:59:20 - omniparser - INFO - Caption model loaded: florence2 -2025-04-18 18:59:20 - omniparser - INFO - OmniParser initialization complete! -2025-04-18 18:59:20 - omniparser - INFO - Initializing OmniParser -2025-04-18 18:59:20 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt -2025-04-18 18:59:29 - omniparser - INFO - Caption model loaded: florence2 -2025-04-18 18:59:29 - omniparser - INFO - OmniParser initialization complete! -2025-04-18 19:00:23 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:00:23 - omniparser - INFO - Running OCR on image -2025-04-18 19:00:27 - omniparser - INFO - OCR found 36 text elements -2025-04-18 19:00:27 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:00:29 - omniparser - INFO - Parsing complete. Found 62 UI elements -2025-04-18 19:00:43 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:00:43 - omniparser - INFO - Running OCR on image -2025-04-18 19:00:45 - omniparser - INFO - OCR found 35 text elements -2025-04-18 19:00:45 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:00:47 - omniparser - INFO - Parsing complete. Found 58 UI elements -2025-04-18 19:01:06 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:01:06 - omniparser - INFO - Running OCR on image -2025-04-18 19:01:08 - omniparser - INFO - OCR found 101 text elements -2025-04-18 19:01:08 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:01:10 - omniparser - INFO - Parsing complete. 
Found 130 UI elements -2025-04-18 19:01:31 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:01:31 - omniparser - INFO - Running OCR on image -2025-04-18 19:01:32 - omniparser - INFO - OCR found 86 text elements -2025-04-18 19:01:32 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:01:34 - omniparser - INFO - Parsing complete. Found 127 UI elements -2025-04-18 19:01:52 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:01:52 - omniparser - INFO - Running OCR on image -2025-04-18 19:01:54 - omniparser - INFO - OCR found 110 text elements -2025-04-18 19:01:54 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:01:56 - omniparser - INFO - Parsing complete. Found 141 UI elements -2025-04-18 19:02:12 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:02:12 - omniparser - INFO - Running OCR on image -2025-04-18 19:02:14 - omniparser - INFO - OCR found 86 text elements -2025-04-18 19:02:14 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:02:15 - omniparser - INFO - Parsing complete. Found 127 UI elements -2025-04-18 19:02:55 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:02:55 - omniparser - INFO - Running OCR on image -2025-04-18 19:02:57 - omniparser - INFO - OCR found 99 text elements -2025-04-18 19:02:57 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:00 - omniparser - INFO - Parsing complete. Found 126 UI elements -2025-04-18 19:03:11 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:03:11 - omniparser - INFO - Running OCR on image -2025-04-18 19:03:14 - omniparser - INFO - OCR found 84 text elements -2025-04-18 19:03:14 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:16 - omniparser - INFO - Parsing complete. Found 115 UI elements -2025-04-18 19:03:24 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:03:24 - omniparser - INFO - Running OCR on image -2025-04-18 19:03:27 - omniparser - INFO - OCR found 21 text elements -2025-04-18 19:03:27 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:30 - omniparser - INFO - Parsing complete. Found 73 UI elements -2025-04-18 19:03:41 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:03:41 - omniparser - INFO - Running OCR on image -2025-04-18 19:03:44 - omniparser - INFO - OCR found 125 text elements -2025-04-18 19:03:44 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:47 - omniparser - INFO - Parsing complete. 
Found 151 UI elements diff --git a/logs/utils_20250418.log b/logs/utils_20250418.log deleted file mode 100644 index 0e702fe6..00000000 --- a/logs/utils_20250418.log +++ /dev/null @@ -1,70 +0,0 @@ -2025-04-18 19:00:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:00:28 - utils - INFO - Found 62 filtered boxes (starting_idx=33) -2025-04-18 19:00:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:00:28 - utils - INFO - Processing 29 regions in 1 batches -2025-04-18 19:00:29 - utils - INFO - All captions generated in 1.02s -2025-04-18 19:00:29 - utils - INFO - Caption processing completed in 1.02s -2025-04-18 19:00:29 - utils - INFO - SOM labeling completed in 2.17s -2025-04-18 19:00:45 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:00:45 - utils - INFO - Found 58 filtered boxes (starting_idx=31) -2025-04-18 19:00:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:00:45 - utils - INFO - Processing 27 regions in 1 batches -2025-04-18 19:00:47 - utils - INFO - All captions generated in 1.91s -2025-04-18 19:00:47 - utils - INFO - Caption processing completed in 1.91s -2025-04-18 19:00:47 - utils - INFO - SOM labeling completed in 2.84s -2025-04-18 19:01:08 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:01:09 - utils - INFO - Found 130 filtered boxes (starting_idx=87) -2025-04-18 19:01:09 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:01:09 - utils - INFO - Processing 43 regions in 1 batches -2025-04-18 19:01:10 - utils - INFO - All captions generated in 1.24s -2025-04-18 19:01:10 - utils - INFO - Caption processing completed in 1.24s -2025-04-18 19:01:10 - utils - INFO - SOM labeling completed in 2.18s -2025-04-18 19:01:32 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:01:33 - utils - INFO - Found 127 filtered boxes (starting_idx=86) -2025-04-18 19:01:33 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:01:33 - utils - INFO - Processing 41 regions in 1 batches -2025-04-18 19:01:34 - utils - INFO - All captions generated in 0.93s -2025-04-18 19:01:34 - utils - INFO - Caption processing completed in 0.93s -2025-04-18 19:01:34 - utils - INFO - SOM labeling completed in 1.73s -2025-04-18 19:01:54 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:01:55 - utils - INFO - Found 141 filtered boxes (starting_idx=97) -2025-04-18 19:01:55 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:01:55 - utils - INFO - Processing 44 regions in 1 batches -2025-04-18 19:01:56 - utils - INFO - All captions generated in 1.08s -2025-04-18 19:01:56 - utils - INFO - Caption processing completed in 1.08s -2025-04-18 19:01:56 - utils - INFO - SOM labeling completed in 1.83s -2025-04-18 19:02:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:02:14 - utils - INFO - Found 127 filtered boxes (starting_idx=86) -2025-04-18 19:02:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:02:14 - utils - INFO - Processing 41 regions in 1 batches -2025-04-18 19:02:15 - utils - INFO - All captions generated in 0.82s -2025-04-18 19:02:15 - utils - INFO - Caption processing completed in 0.82s 
-2025-04-18 19:02:15 - utils - INFO - SOM labeling completed in 1.67s -2025-04-18 19:02:57 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:02:58 - utils - INFO - Found 126 filtered boxes (starting_idx=85) -2025-04-18 19:02:58 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:02:58 - utils - INFO - Processing 41 regions in 1 batches -2025-04-18 19:02:59 - utils - INFO - All captions generated in 1.14s -2025-04-18 19:02:59 - utils - INFO - Caption processing completed in 1.14s -2025-04-18 19:03:00 - utils - INFO - SOM labeling completed in 2.10s -2025-04-18 19:03:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:03:14 - utils - INFO - Found 115 filtered boxes (starting_idx=78) -2025-04-18 19:03:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:03:14 - utils - INFO - Processing 37 regions in 1 batches -2025-04-18 19:03:16 - utils - INFO - All captions generated in 1.47s -2025-04-18 19:03:16 - utils - INFO - Caption processing completed in 1.47s -2025-04-18 19:03:16 - utils - INFO - SOM labeling completed in 2.14s -2025-04-18 19:03:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:03:28 - utils - INFO - Found 73 filtered boxes (starting_idx=21) -2025-04-18 19:03:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:03:28 - utils - INFO - Processing 52 regions in 1 batches -2025-04-18 19:03:30 - utils - INFO - All captions generated in 2.59s -2025-04-18 19:03:30 - utils - INFO - Caption processing completed in 2.60s -2025-04-18 19:03:30 - utils - INFO - SOM labeling completed in 3.79s -2025-04-18 19:03:44 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:03:45 - utils - INFO - Found 151 filtered boxes (starting_idx=96) -2025-04-18 19:03:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:03:45 - utils - INFO - Processing 55 regions in 1 batches -2025-04-18 19:03:47 - utils - INFO - All captions generated in 1.70s -2025-04-18 19:03:47 - utils - INFO - Caption processing completed in 1.70s -2025-04-18 19:03:47 - utils - INFO - SOM labeling completed in 2.81s diff --git a/omnitool/gradio/agent/anthropic_agent.py b/omnitool/gradio/agent/anthropic_agent.py index 6b9423fc..55d9b1fa 100644 --- a/omnitool/gradio/agent/anthropic_agent.py +++ b/omnitool/gradio/agent/anthropic_agent.py @@ -47,6 +47,7 @@ class APIProvider(StrEnum): class AnthropicActor: def __init__( self, + args, model: str, provider: APIProvider, api_key: str, @@ -62,7 +63,7 @@ def __init__( self.max_tokens = max_tokens self.only_n_most_recent_images = only_n_most_recent_images - self.tool_collection = ToolCollection(ComputerTool()) + self.tool_collection = ToolCollection(ComputerTool(args=args)) self.system = SYSTEM_PROMPT diff --git a/omnitool/gradio/agent/llm_utils/omniparserclient.py b/omnitool/gradio/agent/llm_utils/omniparserclient.py index e90ddef8..fc6921aa 100644 --- a/omnitool/gradio/agent/llm_utils/omniparserclient.py +++ b/omnitool/gradio/agent/llm_utils/omniparserclient.py @@ -8,11 +8,13 @@ class OmniParserClient: def __init__(self, + host_device: str, url: str) -> None: + self.host_device = host_device self.url = url def __call__(self,): - screenshot, screenshot_path = get_screenshot() + screenshot, screenshot_path = get_screenshot(host_device=self.host_device) 
screenshot_path = str(screenshot_path) image_base64 = encode_image(screenshot_path) response = requests.post(self.url, json={"base64_image": image_base64}) diff --git a/omnitool/gradio/app.py b/omnitool/gradio/app.py index 59e6c855..53dc1c0c 100644 --- a/omnitool/gradio/app.py +++ b/omnitool/gradio/app.py @@ -1,5 +1,6 @@ """ -python app.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 +python app.py --host_device omnibox_windows --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 +python app.py --host_device local --omniparser_server_url localhost:8000 """ import os @@ -35,7 +36,8 @@ def parse_arguments(): parser = argparse.ArgumentParser(description="Gradio App") - parser.add_argument("--windows_host_url", type=str, default='localhost:8006') + parser.add_argument("--host_device", type=str, choices=["omnibox_windows", "local"], default="omnibox_windows") + parser.add_argument("--windows_host_url", type=str, default="localhost:8006") parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") return parser.parse_args() args = parse_arguments() @@ -189,8 +191,13 @@ def _truncate_string(s, max_length=500): def valid_params(user_input, state): """Validate all requirements and return a list of error messages.""" errors = [] + + servers = [('OmniParser Server', args.omniparser_server_url)] + + if args.host_device == "omnibox_windows": + servers.append(("Windows Host", args.windows_host_url)) - for server_name, url in [('OmniParser Server', args.omniparser_server_url)]: + for server_name, url in servers: try: url = f'http://{url}/probe' response = requests.get(url, timeout=3) @@ -233,6 +240,7 @@ def process_input(user_input, state): # Run sampling_loop_sync with the chatbot_output_callback for loop_msg in sampling_loop_sync( + args=args, model=state["model"], provider=state["provider"], messages=state["messages"], @@ -241,8 +249,7 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - max_tokens=16384, - omniparser_url=args.omniparser_server_url + max_tokens=16384 ): if loop_msg is None or state.get("stop"): yield state['chatbot_messages'] @@ -343,12 +350,13 @@ def get_header_image_base64(): with gr.Row(): with gr.Column(scale=2): chatbot = gr.Chatbot(label="Chatbot History", autoscroll=True, height=580) - # with gr.Column(scale=3): - # iframe = gr.HTML( - # f'', - # container=False, - # elem_classes="no-padding" - # ) + if args.host_device == "omnibox_windows": + with gr.Column(scale=3): + iframe = gr.HTML( + f'', + container=False, + elem_classes="no-padding" + ) def update_model(model_selection, state): state["model"] = model_selection diff --git a/omnitool/gradio/app_new.py b/omnitool/gradio/app_new.py index c907dc3c..4bb80b16 100644 --- a/omnitool/gradio/app_new.py +++ b/omnitool/gradio/app_new.py @@ -3,6 +3,7 @@ - a new UI for the OmniParser AI Agent. 
- python app_new.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 +python app_new.py --host_device local --omniparser_server_url localhost:8000 """ import os @@ -43,6 +44,7 @@ def parse_arguments(): parser = argparse.ArgumentParser(description="Gradio App") + parser.add_argument("--host_device", type=str, choices=["omnibox_windows", "local"], default="omnibox_windows") parser.add_argument("--windows_host_url", type=str, default='localhost:8006') parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") parser.add_argument("--run_folder", type=str, default="./tmp/outputs") @@ -222,8 +224,13 @@ def _truncate_string(s, max_length=500): def valid_params(user_input, state): """Validate all requirements and return a list of error messages.""" errors = [] + + servers = [('OmniParser Server', args.omniparser_server_url)] + + if args.host_device == "omnibox_windows": + servers.append(("Windows Host", args.windows_host_url)) - for server_name, url in [('OmniParser Server', args.omniparser_server_url)]: + for server_name, url in servers: try: url = f'http://{url}/probe' response = requests.get(url, timeout=3) @@ -266,6 +273,7 @@ def process_input(user_input, state): # Run sampling_loop_sync with the chatbot_output_callback for loop_msg in sampling_loop_sync( + args=args, model=state["model"], provider=state["provider"], messages=state["messages"], @@ -275,7 +283,6 @@ def process_input(user_input, state): api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], max_tokens=16384, - omniparser_url=args.omniparser_server_url, save_folder=str(RUN_FOLDER) ): if loop_msg is None or state.get("stop"): diff --git a/omnitool/gradio/executor/anthropic_executor.py b/omnitool/gradio/executor/anthropic_executor.py index f5c1a77f..7014b9fb 100644 --- a/omnitool/gradio/executor/anthropic_executor.py +++ b/omnitool/gradio/executor/anthropic_executor.py @@ -18,11 +18,12 @@ class AnthropicExecutor: def __init__( self, + args, output_callback: Callable[[BetaContentBlockParam], None], tool_output_callback: Callable[[Any, str], None], ): self.tool_collection = ToolCollection( - ComputerTool() + ComputerTool(args=args) ) self.output_callback = output_callback self.tool_output_callback = tool_output_callback diff --git a/omnitool/gradio/loop.py b/omnitool/gradio/loop.py index 323cb4b2..6985c928 100644 --- a/omnitool/gradio/loop.py +++ b/omnitool/gradio/loop.py @@ -41,6 +41,7 @@ class APIProvider(StrEnum): def sampling_loop_sync( *, + args, model: str, provider: APIProvider | None, messages: list[BetaMessageParam], @@ -50,17 +51,17 @@ def sampling_loop_sync( api_key: str, only_n_most_recent_images: int | None = 2, max_tokens: int = 4096, - omniparser_url: str, save_folder: str = "./uploads" ): """ Synchronous agentic sampling loop for the assistant/tool interaction of computer use. 
""" print('in sampling_loop_sync, model:', model) - omniparser_client = OmniParserClient(url=f"http://{omniparser_url}/parse/") + omniparser_client = OmniParserClient(host_device=args.host_device, url=f"http://{args.omniparser_server_url}/parse/") if model == "claude-3-5-sonnet-20241022": # Register Actor and Executor actor = AnthropicActor( + args=args, model=model, provider=provider, api_key=api_key, @@ -92,6 +93,7 @@ def sampling_loop_sync( else: raise ValueError(f"Model {model} not supported") executor = AnthropicExecutor( + args=args, output_callback=output_callback, tool_output_callback=tool_output_callback, ) diff --git a/omnitool/gradio/tools/computer.py b/omnitool/gradio/tools/computer.py index e0812692..90dcc450 100644 --- a/omnitool/gradio/tools/computer.py +++ b/omnitool/gradio/tools/computer.py @@ -6,7 +6,6 @@ import shlex import os import subprocess -import pyautogui from PIL import Image @@ -95,9 +94,11 @@ def options(self) -> ComputerToolOptions: def to_params(self) -> BetaToolComputerUse20241022Param: return {"name": self.name, "type": self.api_type, **self.options} - def __init__(self, is_scaling: bool = False): + def __init__(self, args, is_scaling: bool = False): super().__init__() + self.args = args + # Get screen width and height using Windows command self.display_num = None self.offset_x = 0 @@ -148,11 +149,11 @@ async def __call__( print(f"mouse move to {x}, {y}") if action == "mouse_move": - self.send_to_vm(f"pyautogui.moveTo({x}, {y})") + self.send_to_host_device(f"pyautogui.moveTo({x}, {y})") return ToolResult(output=f"Moved mouse to ({x}, {y})") elif action == "left_click_drag": - current_x, current_y = self.send_to_vm("pyautogui.position()") - self.send_to_vm(f"pyautogui.dragTo({x}, {y}, duration=0.5)") + current_x, current_y = self.send_to_host_device("pyautogui.position()") + self.send_to_host_device(f"pyautogui.dragTo({x}, {y}, duration=0.5)") return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})") if action in ("key", "type"): @@ -169,18 +170,18 @@ async def __call__( for key in keys: key = self.key_conversion.get(key.strip(), key.strip()) key = key.lower() - self.send_to_vm(f"pyautogui.keyDown('{key}')") # Press down each key + self.send_to_host_device(f"pyautogui.keyDown('{key}')") # Press down each key for key in reversed(keys): key = self.key_conversion.get(key.strip(), key.strip()) key = key.lower() - self.send_to_vm(f"pyautogui.keyUp('{key}')") # Release each key in reverse order + self.send_to_host_device(f"pyautogui.keyUp('{key}')") # Release each key in reverse order return ToolResult(output=f"Pressed keys: {text}") elif action == "type": # default click before type TODO: check if this is needed - self.send_to_vm("pyautogui.click()") - self.send_to_vm(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})") - self.send_to_vm("pyautogui.press('enter')") + self.send_to_host_device("pyautogui.click()") + self.send_to_host_device(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})") + self.send_to_host_device("pyautogui.press('enter')") screenshot_base64 = (await self.screenshot()).base64_image return ToolResult(output=text, base64_image=screenshot_base64) @@ -201,28 +202,28 @@ async def __call__( if action == "screenshot": return await self.screenshot() elif action == "cursor_position": - x, y = self.send_to_vm("pyautogui.position()") + x, y = self.send_to_host_device("pyautogui.position()") x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y) return ToolResult(output=f"X={x},Y={y}") 
else: if action == "left_click": - self.send_to_vm("pyautogui.click()") + self.send_to_host_device("pyautogui.click()") elif action == "right_click": - self.send_to_vm("pyautogui.rightClick()") + self.send_to_host_device("pyautogui.rightClick()") elif action == "middle_click": - self.send_to_vm("pyautogui.middleClick()") + self.send_to_host_device("pyautogui.middleClick()") elif action == "double_click": - self.send_to_vm("pyautogui.doubleClick()") + self.send_to_host_device("pyautogui.doubleClick()") elif action == "left_press": - self.send_to_vm("pyautogui.mouseDown()") + self.send_to_host_device("pyautogui.mouseDown()") time.sleep(1) - self.send_to_vm("pyautogui.mouseUp()") + self.send_to_host_device("pyautogui.mouseUp()") return ToolResult(output=f"Performed {action}") if action in ("scroll_up", "scroll_down"): if action == "scroll_up": - self.send_to_vm("pyautogui.scroll(100)") + self.send_to_host_device("pyautogui.scroll(100)") elif action == "scroll_down": - self.send_to_vm("pyautogui.scroll(-100)") + self.send_to_host_device("pyautogui.scroll(-100)") return ToolResult(output=f"Performed {action}") if action == "hover": return ToolResult(output=f"Performed {action}") @@ -231,7 +232,7 @@ async def __call__( return ToolResult(output=f"Performed {action}") raise ToolError(f"Invalid action: {action}") - def send_to_vm(self, action: str): + def send_to_host_device(self, action: str): """ Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()" """ prefix = "import pyautogui; pyautogui.FAILSAFE = False;" command_list = ["python", "-c", f"{prefix} {action}"] @@ -243,7 +244,17 @@ def send_to_vm(self, action: str): try: print(f"sending to vm: {command_list}") - response = self.execute(command_list) + + if self.args.host_device == "omnibox_windows": + response = requests.post( + f"http://localhost:5000/execute", + headers={'Content-Type': 'application/json'}, + json={"command": command_list}, + timeout=90 + ) + elif self.args.host_device == "local": + response = self.execute(command_list) + time.sleep(0.7) # avoid async error as actions take time to complete print(f"action executed") @@ -287,7 +298,7 @@ async def screenshot(self): screenshot = self.padding_image(screenshot) self.target_dimension = MAX_SCALING_TARGETS["WXGA"] width, height = self.target_dimension["width"], self.target_dimension["height"] - screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height) + screenshot, path = get_screenshot(host_device=self.args.host_device, resize=True, target_width=width, target_height=height) time.sleep(0.7) # avoid async error as actions take time to complete return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode()) diff --git a/omnitool/gradio/tools/screen_capture.py b/omnitool/gradio/tools/screen_capture.py index 249e6358..b3b25daa 100644 --- a/omnitool/gradio/tools/screen_capture.py +++ b/omnitool/gradio/tools/screen_capture.py @@ -9,27 +9,33 @@ OUTPUT_DIR = "./tmp/outputs" -def get_screenshot(resize: bool = False, target_width: int = 1920, target_height: int = 1080): +def get_screenshot(host_device: str, resize: bool = False, target_width: int = 1920, target_height: int = 1080): """Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized""" output_dir = Path(OUTPUT_DIR) output_dir.mkdir(parents=True, exist_ok=True) path = output_dir / f"screenshot_{uuid4().hex}.png" try: - screenshot = pyautogui.screenshot() - size = pyautogui.size() + if host_device == "omnibox_windows": + response = requests.get('http://localhost:5000/screenshot') + if response.status_code != 200: + raise
ToolError(f"Failed to capture screenshot: HTTP {response.status_code}") + # (1280, 800) + screenshot = Image.open(BytesIO(response.content)) + if resize and screenshot.size != (target_width, target_height): + screenshot = screenshot.resize((target_width, target_height)) + elif host_device == "local": + screenshot = pyautogui.screenshot() + size = pyautogui.size() + + screenshot = screenshot.resize((size.width, size.height)) - target_width = size.width - target_height = size.height - - screenshot = screenshot.resize((target_width, target_height)) - - cursor_path = os.path.join(os.path.dirname(__file__), "cursor.png") - cursor_x, cursor_y = pyautogui.position() - cursor = Image.open(cursor_path) - # make the cursor smaller - cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5))) - screenshot.paste(cursor, (cursor_x, cursor_y), cursor) + cursor_path = os.path.join(os.path.dirname(__file__), "cursor.png") + cursor_x, cursor_y = pyautogui.position() + cursor = Image.open(cursor_path) + # make the cursor smaller + cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5))) + screenshot.paste(cursor, (cursor_x, cursor_y), cursor) screenshot.save(path) return screenshot, path From fc895b688b6f12608e5c31c80f53ef2197e19d11 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 22:49:04 +0700 Subject: [PATCH 5/6] update run_gemini_interleaved docs --- omnitool/gradio/agent/llm_utils/geminiclient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnitool/gradio/agent/llm_utils/geminiclient.py b/omnitool/gradio/agent/llm_utils/geminiclient.py index ec17341d..d6bd8b51 100644 --- a/omnitool/gradio/agent/llm_utils/geminiclient.py +++ b/omnitool/gradio/agent/llm_utils/geminiclient.py @@ -16,7 +16,7 @@ class Action(BaseModel): def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens: int, temperature=0): """ - Run a chat completion through Gemini's API, ignoring any images in the messages. + Run a chat completion through Google Gemini's API """ api_key = api_key or os.environ.get("GEMINI_API_KEY") if not api_key: From 9d7dc540238f186567983556ec110829d82c6e99 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 22:59:59 +0700 Subject: [PATCH 6/6] Improve command execution handling for host device and enhance error reporting --- omnitool/gradio/tools/computer.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/omnitool/gradio/tools/computer.py b/omnitool/gradio/tools/computer.py index 90dcc450..f54f0b11 100644 --- a/omnitool/gradio/tools/computer.py +++ b/omnitool/gradio/tools/computer.py @@ -234,7 +234,7 @@ async def __call__( def send_to_host_device(self, action: str): """ - Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()" + Executes a python command on the host device. Only return tuple of x,y when action is "pyautogui.position()" """ prefix = "import pyautogui; pyautogui.FAILSAFE = False;" command_list = ["python", "-c", f"{prefix} {action}"] @@ -252,14 +252,17 @@ def send_to_host_device(self, action: str): json={"command": command_list}, timeout=90 ) + if response.status_code != 200: + raise ToolError(f"Failed to execute command. 
Status code: {response.status_code}") + output = response.json()['output'].strip() elif self.args.host_device == "local": response = self.execute(command_list) + output = response['output'].strip() time.sleep(0.7) # avoid async error as actions take time to complete print(f"action executed") if parse: - output = response['output'].strip() match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output) if not match: raise ToolError(f"Could not parse coordinates from output: {output}") @@ -347,9 +350,21 @@ def scale_coordinates(self, source: ScalingSource, x: int, y: int): def get_screen_size(self): """Return width and height of the screen""" try: - response = self.execute(["python", "-c", "import pyautogui; print(pyautogui.size())"]) + if self.args.host_device == "omnibox_windows": + response = requests.post( + f"http://localhost:5000/execute", + headers={'Content-Type': 'application/json'}, + json={"command": ["python", "-c", "import pyautogui; print(pyautogui.size())"]}, + timeout=90 + ) + + if response.status_code != 200: + raise ToolError(f"Failed to get screen size. Status code: {response.status_code}") + output = response.json()['output'].strip() + elif self.args.host_device == "local": + response = self.execute(["python", "-c", "import pyautogui; print(pyautogui.size())"]) + output = response['output'].strip() - output = response['output'].strip() match = re.search(r'Size\(width=(\d+),\s*height=(\d+)\)', output) if not match: raise ToolError(f"Could not parse screen size from output: {output}")
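As a closing note on the parsing in get_screen_size: pyautogui.size() returns a named tuple whose repr is exactly what the regex above matches. A quick standalone sanity check of that parsing logic (the sample dimensions are illustrative, not from this repository):

    import re

    sample_output = "Size(width=1470, height=956)"  # illustrative repr of pyautogui.size()
    match = re.search(r'Size\(width=(\d+),\s*height=(\d+)\)', sample_output)
    if not match:
        raise ValueError(f"Could not parse screen size from output: {sample_output}")
    width, height = int(match.group(1)), int(match.group(2))
    print(width, height)  # prints: 1470 956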