From 84c5fd810b467d7ea04a866b082d2de5131c6d7f Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 4 Apr 2025 14:11:29 +0700 Subject: [PATCH 1/6] modify send_vm to use our local device --- gradio_demo.py | 4 +- omnitool/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/agent/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/app.py | 14 ++--- omnitool/gradio/app_new.py | 2 +- omnitool/gradio/tools/.DS_Store | Bin 0 -> 6148 bytes omnitool/gradio/tools/computer.py | 55 ++++++++++++------ omnitool/gradio/tools/screen_capture.py | 16 ++--- omnitool/omniparserserver/omniparserserver.py | 3 +- util/omniparser.py | 4 +- util/utils.py | 13 ++++- 12 files changed, 70 insertions(+), 41 deletions(-) create mode 100644 omnitool/.DS_Store create mode 100644 omnitool/gradio/.DS_Store create mode 100644 omnitool/gradio/agent/.DS_Store create mode 100644 omnitool/gradio/tools/.DS_Store diff --git a/gradio_demo.py b/gradio_demo.py index 15664d31..e2f9266d 100644 --- a/gradio_demo.py +++ b/gradio_demo.py @@ -8,7 +8,7 @@ import base64, os -from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img, detect_device import torch from PIL import Image @@ -27,7 +27,7 @@ OmniParser is a screen parsing tool to convert general GUI screen to structured elements. """ -DEVICE = torch.device('cuda') +DEVICE = torch.device(detect_device()) # @spaces.GPU # @torch.inference_mode() diff --git a/omnitool/.DS_Store b/omnitool/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9ce7173f5c2df5af2b3b400c7138f99dc737a3c2 GIT binary patch literal 6148 zcmeHK%}T>S5Z-NTn^J@v6nb3nTCkQX6fdFH7cim+m70)JgE3o@)Er77XMG``#OHBl zcXJ2^youNu*!^bbXE*af_J=XXr|aN^F`F@FK||!I)CihuT@4eA$kiMnlYXAA{7CvW z6a7UKetVlOS;9OPuiJ?p`JlDV6C**cwi$sLZarHuSi z_xzi1kWG0~VBibiflLhm literal 0 HcmV?d00001 diff --git a/omnitool/gradio/.DS_Store b/omnitool/gradio/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a37f1c2feb29af5e89e4812b02fb542e3863eed3 GIT binary patch literal 6148 zcmeHKF-yZh6npAnK6CL2$6(4~V7yf`V(cX=#VV1ht@pkX0NV{U_p39Grx1 zF8&HPN56M>+B*`3E+XOMB`?(aXVp~>$ zN{hZwub$Ko9+-_Q4PJEJ1xs-c& z`70q3U@`2|1xiFCI;WIQJr=b6R_5b!dAYy)zze$H%((K z$z+>6f*aRM(;k=BxW0GzY+jv>bIBBYJqjCR42oC7f}|AFnU|9ucKu#AZ~VTsZ}U3w zZtijqFTdghVzVf~9|){gHz>6zAPR^AUkdR05TG!I7ITAo=|H8A0Kh7mwc(o25*!m) z3@zpc(F0Q^6lg+~Jz^*mj`qO%g%)#zCY+ScjB#vcWltzdXGeRW-ARQ8r4|K50bhYR zbGyv@KmMG*JOBGZawiIi0{=<@Rc*CfO+1p_TMG}zdu@Pngu=#txj`L*%5KN{g16!| b6m94W_yQPO%nhOiCO-mN2B}1WUsd1(5AU<9 literal 0 HcmV?d00001 diff --git a/omnitool/gradio/agent/.DS_Store b/omnitool/gradio/agent/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..1ef795a8a46bbef1f2c10e8db6cd2f696ce6c99f GIT binary patch literal 6148 zcmeHKu};H44E2Q$sk(GzV8}1@3kX&Cf-)innj(snD3PE8+wF{uEDZbwJ1cw=&u7y% zN?M5pRmhHf@8YvB&O0fNiO5Y?iwV(~h+-&XG{n#&yw2K@NG-hR!X5=(Q$b6*p}ga5 zhC^h4&u)a%%Bi78oZaU9^73{vE9$C|wBIjxdscOp&C7ZYkGOn&IeGg0INRsF_=R_~ z+|J^QlNnNpt3u8zdZ0VZ@7uYazV&-;=Xnu5|K!=TUsZ2%-0#`W&vSkz#(*(k3>-QG zsM##>NYF-Oz!)$F)(r6XA%QZcie4~$IxvJ5062g-2J7d5Y z_*V?LK{m^#cqOf^otNWUn?SFiEbQk6*CCjMQVd@%#mCSfuqQkLrixw=7Kr@_1R88G I2L6', - container=False, - elem_classes="no-padding" - ) + # with gr.Column(scale=3): + # iframe = gr.HTML( + # f'', + # container=False, + # elem_classes="no-padding" + # ) def update_model(model_selection, state): state["model"] = model_selection diff --git 
a/omnitool/gradio/app_new.py b/omnitool/gradio/app_new.py index d67ae185..c907dc3c 100644 --- a/omnitool/gradio/app_new.py +++ b/omnitool/gradio/app_new.py @@ -223,7 +223,7 @@ def valid_params(user_input, state): """Validate all requirements and return a list of error messages.""" errors = [] - for server_name, url in [('Windows Host', 'localhost:5000'), ('OmniParser Server', args.omniparser_server_url)]: + for server_name, url in [('OmniParser Server', args.omniparser_server_url)]: try: url = f'http://{url}/probe' response = requests.get(url, timeout=3) diff --git a/omnitool/gradio/tools/.DS_Store b/omnitool/gradio/tools/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..c47b05954283b270d648d9cb24b3d36491d42556 GIT binary patch literal 6148 zcmeHKu}%Xq47F*6PU_MP#FUY>e~?zkj?@o0=oQhS)!o9-Eo;?I0MeW&l$j-EmDjWT{{ELfHQDlK+cDNCKwHqVm>;catQ#GXLJ_W zQcFloFpP#t5i=0hP@smgl^Cqy7!T$b4U?jV6I=1Yw)1=O!g+VBAF?}fRCMhOI0Ib< z2HG9V{eOjDrnkuNhIr2za0dPv13WC} str: + if torch.cuda.is_available(): + print("[+] Using CUDA") + return "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + print("[+] Using MPS for Apple Silicon") + return "mps" + else: + return "cpu" def get_caption_model_processor(model_name, model_name_or_path="Salesforce/blip2-opt-2.7b", device=None): if not device: - device = "cuda" if torch.cuda.is_available() else "cpu" + device = detect_device() if model_name == "blip2": from transformers import Blip2Processor, Blip2ForConditionalGeneration processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") @@ -107,7 +116,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_ start = time.time() batch = croped_pil_image[i:i+batch_size] t1 = time.time() - if model.device.type == 'cuda': + if model.device.type == 'cuda' or model.device.type == 'mps': inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt", do_resize=False).to(device=device, dtype=torch.float16) else: inputs = processor(images=batch, text=[prompt]*len(batch), return_tensors="pt").to(device=device) From c659ca9577017c998224e1fffe8c9f7ea6e49d78 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Sat, 5 Apr 2025 00:08:42 +0700 Subject: [PATCH 2/6] added gemini support --- omnitool/gradio/agent/anthropic_agent.py | 2 +- .../gradio/agent/llm_utils/geminiclient.py | 89 ++++++++++++++++++ omnitool/gradio/agent/vlm_agent.py | 19 +++- omnitool/gradio/app.py | 8 +- omnitool/gradio/loop.py | 6 +- omnitool/gradio/tools/cursor.png | Bin 0 -> 3207 bytes omnitool/gradio/tools/screen_capture.py | 8 ++ requirements.txt | 4 +- 8 files changed, 127 insertions(+), 9 deletions(-) create mode 100644 omnitool/gradio/agent/llm_utils/geminiclient.py create mode 100644 omnitool/gradio/tools/cursor.png diff --git a/omnitool/gradio/agent/anthropic_agent.py b/omnitool/gradio/agent/anthropic_agent.py index b1c744e2..6b9423fc 100644 --- a/omnitool/gradio/agent/anthropic_agent.py +++ b/omnitool/gradio/agent/anthropic_agent.py @@ -39,7 +39,7 @@ class APIProvider(StrEnum): VERTEX = "vertex" SYSTEM_PROMPT = f""" -* You are utilizing a Windows system with internet access. +* You are utilizing a {platform.system()} system with internet access. * The current date is {datetime.today().strftime('%A, %B %d, %Y')}. 
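The detect_device() helper added to util/utils.py above is the core of this patch: it replaces the hard-coded torch.device('cuda') so the demo also runs on Apple Silicon (MPS) and plain CPU. A minimal standalone sketch of the same fallback order, with the hasattr guard for older torch builds that lack the mps backend:

import torch

def detect_device() -> str:
    # Prefer CUDA, then Apple Silicon MPS, then fall back to CPU.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"

device = torch.device(detect_device())
x = torch.ones(2, 2, device=device)  # any tensor op now targets the detected device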
""" diff --git a/omnitool/gradio/agent/llm_utils/geminiclient.py b/omnitool/gradio/agent/llm_utils/geminiclient.py new file mode 100644 index 00000000..c610303d --- /dev/null +++ b/omnitool/gradio/agent/llm_utils/geminiclient.py @@ -0,0 +1,89 @@ +import os +from google import genai +from google.genai import types +import tiktoken + +from .utils import is_image_path, encode_image + +def estimate_token_count(text): + """Estimates the token count of a text string using tiktoken. + Adapt this for Gemini's specific vocabulary if necessary.""" + + # IMPORTANT: tiktoken is primarily for OpenAI models. + # You need to be aware of potential inaccuracies if Gemini + # uses a significantly different tokenization scheme. + + try: + encoding = tiktoken.get_encoding("cl100k_base") # This is a good starting point, but research Gemini tokenizer + tokens = encoding.encode(text) + return len(tokens) + except Exception as e: + print(f"Error estimating token count: {e}") + return None # or a reasonable default + +def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, temperature=0): + """ + Run a chat completion through Gemini's API, ignoring any images in the messages. + """ + api_key = api_key or os.environ.get("GEMINI_API_KEY") + if not api_key: + raise ValueError("GEMINI_API_KEY is not set") + + client = genai.Client( + api_key=api_key, + ) + + generate_content_config = types.GenerateContentConfig( + temperature=temperature, + response_mime_type="application/json", + system_instruction=[ + types.Part.from_text(text=system), + ], + ) + + contents = [] + + if type(messages) == list: + for item in messages: + parts = [] + if isinstance(item, dict): + for cnt in item["content"]: + if isinstance(cnt, str): + parts.append(types.Part.from_text(text=cnt)) + else: + # in this case it is a text block from anthropic + parts.append(types.Part.from_text(text=str(cnt))) + + else: # str + parts.append(types.Part.from_text(text=str(item))) + + content = (types.Content( + role="user", + parts=parts + )) + + contents.append(content) + + + elif isinstance(messages, str): + contents = [ + types.Content( + role="user", + parts=[types.Part.from_text(text=messages)] + ) + ] + + try: + response = client.models.generate_content( + model=model_name, + contents=contents, + config=generate_content_config + ) + final_answer = response.text + token_usage = estimate_token_count(final_answer) + + return final_answer, token_usage + except Exception as e: + print(f"Error in interleaved Gemini: {e}") + + return str(e), 0 diff --git a/omnitool/gradio/agent/vlm_agent.py b/omnitool/gradio/agent/vlm_agent.py index 9f631a70..afa748f1 100644 --- a/omnitool/gradio/agent/vlm_agent.py +++ b/omnitool/gradio/agent/vlm_agent.py @@ -5,6 +5,7 @@ from PIL import Image, ImageDraw import base64 from io import BytesIO +import platform from anthropic import APIResponse from anthropic.types import ToolResultBlockParam @@ -12,6 +13,7 @@ from agent.llm_utils.oaiclient import run_oai_interleaved from agent.llm_utils.groqclient import run_groq_interleaved +from agent.llm_utils.geminiclient import run_gemini_interleaved from agent.llm_utils.utils import is_image_path import time import re @@ -49,6 +51,8 @@ def __init__( self.model = "o1" elif model == "omniparser + o3-mini": self.model = "o3-mini" + elif model == "omniparser + gemini-2.0-flash": + self.model = "gemini-2.0-flash" else: raise ValueError(f"Model {model} not supported") @@ -133,6 +137,17 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]): 
print(f"qwen token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini-2.0-flash" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=planner_messages, + system=system, + model_name=self.model, + api_key=self.api_key, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 0.99 / 1000000) else: raise ValueError(f"Model {self.model} not supported") latency_vlm = time.time() - start @@ -209,9 +224,9 @@ def _api_response_callback(self, response: APIResponse): def _get_system_prompt(self, screen_info: str = ""): main_section = f""" -You are using a Windows device. +You are using a {platform.system()} device. You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot. -You can only interact with the desktop GUI (no terminal or application menu access). +You can only interact with the desktop GUI (no terminal or application menu access) and ignore the gradio interface (which opened in localhost:7888) including the orange send button there. You may be given some history plan and actions, this is the response from the previous loop. You should carefully consider your plan base on the task, screenshot, and history actions. diff --git a/omnitool/gradio/app.py b/omnitool/gradio/app.py index 260e31e2..43e692a1 100644 --- a/omnitool/gradio/app.py +++ b/omnitool/gradio/app.py @@ -27,7 +27,7 @@ API_KEY_FILE = CONFIG_DIR / "api_key" INTRO_TEXT = ''' -OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use (Sonnet).** +OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL), Gemini(2.0-flash) or Anthropic Computer Use (Sonnet).** Type a message and press submit to start OmniTool. Press stop to pause, and press the trash icon in the chat to clear the message history. 
''' @@ -241,7 +241,7 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - max_tokens=16384, + max_tokens=8192, omniparser_url=args.omniparser_server_url ): if loop_msg is None or state.get("stop"): @@ -302,7 +302,7 @@ def get_header_image_base64(): with gr.Column(): model = gr.Dropdown( label="Model", - choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"], + choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"], value="omniparser + gpt-4o", interactive=True, ) @@ -362,6 +362,8 @@ def update_model(model_selection, state): provider_choices = ["groq"] elif model_selection == "omniparser + qwen2.5vl": provider_choices = ["dashscope"] + elif model_selection == "omniparser + gemini-2.0-flash": + provider_choices = ["gemini"] else: provider_choices = [option.value for option in APIProvider] default_provider_value = provider_choices[0] diff --git a/omnitool/gradio/loop.py b/omnitool/gradio/loop.py index 9ce63169..bdd856e7 100644 --- a/omnitool/gradio/loop.py +++ b/omnitool/gradio/loop.py @@ -28,6 +28,7 @@ class APIProvider(StrEnum): BEDROCK = "bedrock" VERTEX = "vertex" OPENAI = "openai" + GEMINI = "gemini" PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { @@ -35,6 +36,7 @@ class APIProvider(StrEnum): APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", APIProvider.OPENAI: "gpt-4o", + APIProvider.GEMINI: "gemini-2.0-flash" } def sampling_loop_sync( @@ -66,7 +68,7 @@ def sampling_loop_sync( max_tokens=max_tokens, only_n_most_recent_images=only_n_most_recent_images ) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash"]): actor = VLMAgent( model=model, provider=provider, @@ -115,7 +117,7 @@ def sampling_loop_sync( messages.append({"content": tool_result_content, "role": "user"}) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): while True: parsed_screen = omniparser_client() tools_use_needed, vlm_response_json = actor(messages=messages, 
parsed_screen=parsed_screen) diff --git a/omnitool/gradio/tools/cursor.png b/omnitool/gradio/tools/cursor.png new file mode 100644 index 0000000000000000000000000000000000000000..d3a3c5bbe1d070b65dc3bb31ab18998823ff54f8 GIT binary patch literal 3207 zcmV;240!X2P)4Tx0C=43nP*T`Sr&lrd#`h#p}T1^G&xC>(17HeBu7z!?k3Yhlfj4#j=%^a zB1sWM6mY=NA;>5yq8Jbb1P9kR|KfS^ksY$5-z~0MD=PwVq%$yaD z{KlCp9Q}=pR%`ry?U(r|060?gDicnvaO5geyH=PT!%qZ2x^KB&g`LD-e!lcdFU4;& ztcK{yewfi*9+w}H%H=Ts!>#@M_`6={l;xNIUvKa~?z3`szR_FD)iVJxD*noCDFcA@ z8UUo^ubj$Z04Tcw=;-+?7kM0j&JqA5pRv>U>ECjw)!6|7PyiiB0ePSdG=L5;1g5|O z*Z@c12E2ej2mzY_8^nM_zy}#%E7%3{K_Mss2SF961r4AHw1P9B6PyQ^!8LFT+yx`x zF%W=B@CwX<_h1o%AS^_NXpj`70I5RSkRil^tRP3o1M-DJpl~P};z4Q9R%ka=2$e!r zP#x3+{REwZE<-n0l(yTktQMmQQyhO^;3cpqE= z*Tbja4)`*B3m$<5@GE#80SFOcAc}|%!b0p3FC-L+LiosbqyQ;L>XBBY8|g>xAp&F; z`G~@xXecI12W5eBK?R`LsASZ3R1vBQbrRKq>O<1hXr+_oWIpIQaJX|iW9M_EN!42c4aEo|Ryei%t?}g{! zv+%|E27EXE4t^59NDw2a6D$e-ggC-3LIt6faFy_c@Rmpry? zYLO0+VUgDqJVljaOW8!prj%38QtnV@Me(9)qV}Q@qT5BQMbC+jiO!4B#0aGrtPLR(5})Z=@_~?-IX3kFQorSAEeKT zQ^k$M1I07NtHm#f3m6DPjp52jU=%aXFh&`lB;+J)CAboMC0ZqhB^D%QByA+Qk_D1& zk|UCzq?l5UQt?vzr8=daN~5H;qo5nm7bNM%UH;;WeQ}@$UKsTWwm5|WpiYY z$qvZQ%gM_*%O%NG%3YD0m1oG?$j8Z-$@j?5D9{wF6k-+16fP>fWQsFwnF-7aW*>7- zQC87KF-@^faZqtlNnI&GDOagY>8Ub7nWY@1T&CQsJg1_d;;FJ#rA6hDDqfYP8l!qh zwO@5XOy9?8ZK55keOP;F4SJ2)8s3`vHDfv?9b27rofe&OU52iQZmw>p?rS|2y->aVdi{D| z^^Nrt^c(aa8&D103~~*+4gNILGGrT88{S__UhA}W$J(y7Z;Z5!xJE~e9vIV%J&pGm z_ZojOF*V_vw3^JAs+fkG)|ig5=qztm5$n1c+RV;uhuL|vkLD)kspefFZN?@&M9bsK>{oF>`hGWxcGi9r38)tjU_O+e9 z9pA3qZo%H%ew+Oz2gJeIp}=9#k?I)WSmpT4Ny#bNsl{o|+1NS9`H~CT#ocAU%b2U2 zE8Dfnb&1b{W z;X&ce;h!VCBI+aNBV8kFBInrl>_hBV92-sHcvj7 zJjZwEH>N--8&kSc>B}GKfi#V@@6yK8t-aX?ZFSqB?UCDicPQ`3+cCM*dFRPpgkABwZs+Rfmgc_O9kjbUPcCm~ z-o$s#-?ik6&>p|nGS3Xh7e zN|nlz%Fk6XRU_4Q)onGhHG69o4s#9bTCex;-X4&TA7Puv?W#&}ask=Wq{Mgm1(^}WY zXe&GoPN$uI`BV5$qi5XD^qys%ZT(sG=bCm}dqD@(k<~HR8Ph4~3h5d;=XS2Q+oHSu z7u{b@oL4+weL?&}aSx#~8nax}lz7>*34y z9PVAa?|T2%$oi3?(SXqhV_{>02b>4f4-+1~d6fQW@$t?l=qCkFMW2>GlYLhItLCq5 z0u#Z7afk7N=YG#0Pp~Im{g(3E=gGVmBT|nppLKhIafY1em zE+BM)|6>SMANiqfXNV0L}X@WMv9AzUjb(g0YyZD!Jw3t zmAwYchJYd>i;Ig=Sy|}^3Jd{7M3$D8q`JEL9dOpuWRhJYd>>+9>% z*w`2bN-}vWn?+qN7rL&yolfUl;9|}oKvPo_)6>&eTUuJ)164VL01XWd1OkD}t*xy= zpeAP!ptiOaUDuo2+S=v;d(I$$)oLXW2sqqs_dMXp83ZUVFURlq+dUr7JkXpo2vAyD z%IxfHO;1lx5V!=8mu>O#@-lmSdnd+CCKH{VomDFO53pHk(O zQhC5(RwqS7BpQuMMMcHSObe&{h8r6jL?V%-F`LasS6A0fQv51OalFIr?QL;593MR% zPb`s0B;__YH>Iek=xNsA{{FsnbaZ?LYy|}cqibtxC$`?#*Y_Q$$Oy*cap~>t-36}v zNm={*`*%}aXZiX0>7U)BS8sTD_$P4v-&{#a$&0P6tt2p!NQl?#jgszu92psriHQjT z?xaXNM@L709Pe^rVL_BqcT<9PyInL*dz@|ub#--bc6N4>z<4|^?d|QKfpe*9o^$#; zd~JMu{P1{}x~_{->Q<_POy}9;a=9W02M2$#a5ya8-QC}mQnxeIri~kulamq(g~aRi teo;yd0~h`m1F&eCwyc!84-}s^@iz_-B{HTjtFHh6002ovPDHLkV1i9}Eqwq0 literal 0 HcmV?d00001 diff --git a/omnitool/gradio/tools/screen_capture.py b/omnitool/gradio/tools/screen_capture.py index 407ecab0..249e6358 100644 --- a/omnitool/gradio/tools/screen_capture.py +++ b/omnitool/gradio/tools/screen_capture.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from uuid import uuid4 import requests @@ -23,6 +24,13 @@ def get_screenshot(resize: bool = False, target_width: int = 1920, target_height screenshot = screenshot.resize((target_width, target_height)) + cursor_path = os.path.join(os.path.dirname(__file__), "cursor.png") + cursor_x, cursor_y = pyautogui.position() + cursor = Image.open(cursor_path) + # make the cursor smaller + cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5))) + screenshot.paste(cursor, (cursor_x, cursor_y), cursor) + 
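# Note on the hunk above: the three-argument paste uses the RGBA cursor image
# itself as the alpha mask, so transparent pixels leave the screenshot
# untouched. A standalone sketch of the same overlay, assuming cursor.png has
# an alpha channel:
import pyautogui
from PIL import Image

shot = pyautogui.screenshot()
cursor = Image.open("cursor.png").convert("RGBA")
cursor = cursor.resize((cursor.width * 2 // 3, cursor.height * 2 // 3))
x, y = pyautogui.position()
shot.paste(cursor, (x, y), cursor)  # mask argument keeps the cursor's transparency
shot.save("screenshot_with_cursor.png")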
screenshot.save(path) return screenshot, path except Exception as e: diff --git a/requirements.txt b/requirements.txt index 901a27fa..ccc94b1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,4 +29,6 @@ google-auth<3,>=2 screeninfo uiautomation dashscope -groq \ No newline at end of file +groq +google-genai +tiktoken \ No newline at end of file From 7b37788c6c78d157831f7482aa4bd23cd7b95f84 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 20:11:22 +0700 Subject: [PATCH 3/6] optimize gemini client and add gemini-2.5-flash --- .gitignore | 3 +- logs/gradio_gradio_app_20250418.log | 7 + logs/gradio_gradio_tools_20250418.log | 0 logs/gradio_sampling_loop_20250418.log | 2 + logs/gradio_screen_capture_20250418.log | 0 logs/image_utils_20250418.log | 0 logs/model_utils_20250418.log | 18 +++ logs/ocr_utils_20250418.log | 21 +++ logs/omniparser_20250418.log | 62 +++++++++ logs/utils_20250418.log | 70 ++++++++++ .../gradio/agent/llm_utils/geminiclient.py | 57 +++----- omnitool/gradio/agent/llm_utils/oaiclient.py | 3 - omnitool/gradio/agent/vlm_agent.py | 20 ++- .../agent/vlm_agent_with_orchestrator.py | 128 +++++++++++++++++- omnitool/gradio/app.py | 8 +- omnitool/gradio/loop.py | 6 +- requirements.txt | 3 +- 17 files changed, 353 insertions(+), 55 deletions(-) create mode 100644 logs/gradio_gradio_app_20250418.log create mode 100644 logs/gradio_gradio_tools_20250418.log create mode 100644 logs/gradio_sampling_loop_20250418.log create mode 100644 logs/gradio_screen_capture_20250418.log create mode 100644 logs/image_utils_20250418.log create mode 100644 logs/model_utils_20250418.log create mode 100644 logs/ocr_utils_20250418.log create mode 100644 logs/omniparser_20250418.log create mode 100644 logs/utils_20250418.log diff --git a/.gitignore b/.gitignore index 8b8235e6..1761ebd1 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ util/__pycache__/ index.html?linkid=2289031 wget-log weights/icon_caption_florence_v2/ -omnitool/gradio/uploads/ \ No newline at end of file +omnitool/gradio/uploads/ +.DS_Store \ No newline at end of file diff --git a/logs/gradio_gradio_app_20250418.log b/logs/gradio_gradio_app_20250418.log new file mode 100644 index 00000000..b60e93a4 --- /dev/null +++ b/logs/gradio_gradio_app_20250418.log @@ -0,0 +1,7 @@ +2025-04-18 19:49:59 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 +2025-04-18 19:50:20 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.0-flash +2025-04-18 19:51:16 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 +2025-04-18 19:51:22 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.5-flash-preview-04-17 +2025-04-18 19:52:05 - gradio_app - INFO - [+] Processing user input: 'on my spotify play "garam dan madu"...' 
(truncated) +2025-04-18 19:52:05 - gradio_app - ERROR - [-] Error in sampling loop: name 'OmniParserClient' is not defined +2025-04-18 19:52:05 - gradio_app - INFO - [+] Input processing completed in 0.14s diff --git a/logs/gradio_gradio_tools_20250418.log b/logs/gradio_gradio_tools_20250418.log new file mode 100644 index 00000000..e69de29b diff --git a/logs/gradio_sampling_loop_20250418.log b/logs/gradio_sampling_loop_20250418.log new file mode 100644 index 00000000..867a09da --- /dev/null +++ b/logs/gradio_sampling_loop_20250418.log @@ -0,0 +1,2 @@ +2025-04-18 19:52:05 - sampling_loop - INFO - [+] Initializing sampling loop with model: omniparser + gemini-2.5-flash-preview-04-17 +2025-04-18 19:52:05 - sampling_loop - ERROR - [-] Failed to initialize OmniParser client: name 'OmniParserClient' is not defined diff --git a/logs/gradio_screen_capture_20250418.log b/logs/gradio_screen_capture_20250418.log new file mode 100644 index 00000000..e69de29b diff --git a/logs/image_utils_20250418.log b/logs/image_utils_20250418.log new file mode 100644 index 00000000..e69de29b diff --git a/logs/model_utils_20250418.log b/logs/model_utils_20250418.log new file mode 100644 index 00000000..48f1d970 --- /dev/null +++ b/logs/model_utils_20250418.log @@ -0,0 +1,18 @@ +2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:58:52 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt +2025-04-18 18:58:52 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt +2025-04-18 18:58:52 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence +2025-04-18 18:59:03 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps +2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:07 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt +2025-04-18 18:59:07 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt +2025-04-18 18:59:07 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence +2025-04-18 18:59:20 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps +2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon +2025-04-18 18:59:20 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt +2025-04-18 18:59:20 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt +2025-04-18 18:59:20 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence +2025-04-18 18:59:29 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps diff --git a/logs/ocr_utils_20250418.log b/logs/ocr_utils_20250418.log new file mode 100644 index 00000000..86e03b17 --- /dev/null +++ b/logs/ocr_utils_20250418.log @@ -0,0 +1,21 @@ +2025-04-18 19:00:23 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:00:23 - ocr_utils - INFO - Initializing EasyOCR reader +2025-04-18 19:00:27 - ocr_utils - INFO - EasyOCR found 36 text elements +2025-04-18 19:00:43 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:00:45 - ocr_utils - INFO - EasyOCR found 35 text elements +2025-04-18 
19:01:06 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:01:08 - ocr_utils - INFO - EasyOCR found 101 text elements +2025-04-18 19:01:31 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:01:32 - ocr_utils - INFO - EasyOCR found 86 text elements +2025-04-18 19:01:52 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:01:54 - ocr_utils - INFO - EasyOCR found 110 text elements +2025-04-18 19:02:12 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:02:14 - ocr_utils - INFO - EasyOCR found 86 text elements +2025-04-18 19:02:55 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:02:57 - ocr_utils - INFO - EasyOCR found 99 text elements +2025-04-18 19:03:11 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:03:14 - ocr_utils - INFO - EasyOCR found 84 text elements +2025-04-18 19:03:24 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:03:27 - ocr_utils - INFO - EasyOCR found 21 text elements +2025-04-18 19:03:41 - ocr_utils - INFO - Running OCR on image of size 1470x956 +2025-04-18 19:03:44 - ocr_utils - INFO - EasyOCR found 125 text elements diff --git a/logs/omniparser_20250418.log b/logs/omniparser_20250418.log new file mode 100644 index 00000000..e381e029 --- /dev/null +++ b/logs/omniparser_20250418.log @@ -0,0 +1,62 @@ +2025-04-18 18:58:51 - omniparser - INFO - Initializing OmniParser +2025-04-18 18:58:52 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt +2025-04-18 18:59:03 - omniparser - INFO - Caption model loaded: florence2 +2025-04-18 18:59:03 - omniparser - INFO - OmniParser initialization complete! +2025-04-18 18:59:07 - omniparser - INFO - Initializing OmniParser +2025-04-18 18:59:07 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt +2025-04-18 18:59:20 - omniparser - INFO - Caption model loaded: florence2 +2025-04-18 18:59:20 - omniparser - INFO - OmniParser initialization complete! +2025-04-18 18:59:20 - omniparser - INFO - Initializing OmniParser +2025-04-18 18:59:20 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt +2025-04-18 18:59:29 - omniparser - INFO - Caption model loaded: florence2 +2025-04-18 18:59:29 - omniparser - INFO - OmniParser initialization complete! +2025-04-18 19:00:23 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:00:23 - omniparser - INFO - Running OCR on image +2025-04-18 19:00:27 - omniparser - INFO - OCR found 36 text elements +2025-04-18 19:00:27 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:00:29 - omniparser - INFO - Parsing complete. Found 62 UI elements +2025-04-18 19:00:43 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:00:43 - omniparser - INFO - Running OCR on image +2025-04-18 19:00:45 - omniparser - INFO - OCR found 35 text elements +2025-04-18 19:00:45 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:00:47 - omniparser - INFO - Parsing complete. Found 58 UI elements +2025-04-18 19:01:06 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:01:06 - omniparser - INFO - Running OCR on image +2025-04-18 19:01:08 - omniparser - INFO - OCR found 101 text elements +2025-04-18 19:01:08 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:01:10 - omniparser - INFO - Parsing complete. 
Found 130 UI elements +2025-04-18 19:01:31 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:01:31 - omniparser - INFO - Running OCR on image +2025-04-18 19:01:32 - omniparser - INFO - OCR found 86 text elements +2025-04-18 19:01:32 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:01:34 - omniparser - INFO - Parsing complete. Found 127 UI elements +2025-04-18 19:01:52 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:01:52 - omniparser - INFO - Running OCR on image +2025-04-18 19:01:54 - omniparser - INFO - OCR found 110 text elements +2025-04-18 19:01:54 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:01:56 - omniparser - INFO - Parsing complete. Found 141 UI elements +2025-04-18 19:02:12 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:02:12 - omniparser - INFO - Running OCR on image +2025-04-18 19:02:14 - omniparser - INFO - OCR found 86 text elements +2025-04-18 19:02:14 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:02:15 - omniparser - INFO - Parsing complete. Found 127 UI elements +2025-04-18 19:02:55 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:02:55 - omniparser - INFO - Running OCR on image +2025-04-18 19:02:57 - omniparser - INFO - OCR found 99 text elements +2025-04-18 19:02:57 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:00 - omniparser - INFO - Parsing complete. Found 126 UI elements +2025-04-18 19:03:11 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:03:11 - omniparser - INFO - Running OCR on image +2025-04-18 19:03:14 - omniparser - INFO - OCR found 84 text elements +2025-04-18 19:03:14 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:16 - omniparser - INFO - Parsing complete. Found 115 UI elements +2025-04-18 19:03:24 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:03:24 - omniparser - INFO - Running OCR on image +2025-04-18 19:03:27 - omniparser - INFO - OCR found 21 text elements +2025-04-18 19:03:27 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:30 - omniparser - INFO - Parsing complete. Found 73 UI elements +2025-04-18 19:03:41 - omniparser - INFO - Processing image of size 1470x956 +2025-04-18 19:03:41 - omniparser - INFO - Running OCR on image +2025-04-18 19:03:44 - omniparser - INFO - OCR found 125 text elements +2025-04-18 19:03:44 - omniparser - INFO - Processing image with SOM labeling +2025-04-18 19:03:47 - omniparser - INFO - Parsing complete. 
Found 151 UI elements diff --git a/logs/utils_20250418.log b/logs/utils_20250418.log new file mode 100644 index 00000000..0e702fe6 --- /dev/null +++ b/logs/utils_20250418.log @@ -0,0 +1,70 @@ +2025-04-18 19:00:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:00:28 - utils - INFO - Found 62 filtered boxes (starting_idx=33) +2025-04-18 19:00:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:00:28 - utils - INFO - Processing 29 regions in 1 batches +2025-04-18 19:00:29 - utils - INFO - All captions generated in 1.02s +2025-04-18 19:00:29 - utils - INFO - Caption processing completed in 1.02s +2025-04-18 19:00:29 - utils - INFO - SOM labeling completed in 2.17s +2025-04-18 19:00:45 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:00:45 - utils - INFO - Found 58 filtered boxes (starting_idx=31) +2025-04-18 19:00:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:00:45 - utils - INFO - Processing 27 regions in 1 batches +2025-04-18 19:00:47 - utils - INFO - All captions generated in 1.91s +2025-04-18 19:00:47 - utils - INFO - Caption processing completed in 1.91s +2025-04-18 19:00:47 - utils - INFO - SOM labeling completed in 2.84s +2025-04-18 19:01:08 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:01:09 - utils - INFO - Found 130 filtered boxes (starting_idx=87) +2025-04-18 19:01:09 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:01:09 - utils - INFO - Processing 43 regions in 1 batches +2025-04-18 19:01:10 - utils - INFO - All captions generated in 1.24s +2025-04-18 19:01:10 - utils - INFO - Caption processing completed in 1.24s +2025-04-18 19:01:10 - utils - INFO - SOM labeling completed in 2.18s +2025-04-18 19:01:32 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:01:33 - utils - INFO - Found 127 filtered boxes (starting_idx=86) +2025-04-18 19:01:33 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:01:33 - utils - INFO - Processing 41 regions in 1 batches +2025-04-18 19:01:34 - utils - INFO - All captions generated in 0.93s +2025-04-18 19:01:34 - utils - INFO - Caption processing completed in 0.93s +2025-04-18 19:01:34 - utils - INFO - SOM labeling completed in 1.73s +2025-04-18 19:01:54 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:01:55 - utils - INFO - Found 141 filtered boxes (starting_idx=97) +2025-04-18 19:01:55 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:01:55 - utils - INFO - Processing 44 regions in 1 batches +2025-04-18 19:01:56 - utils - INFO - All captions generated in 1.08s +2025-04-18 19:01:56 - utils - INFO - Caption processing completed in 1.08s +2025-04-18 19:01:56 - utils - INFO - SOM labeling completed in 1.83s +2025-04-18 19:02:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:02:14 - utils - INFO - Found 127 filtered boxes (starting_idx=86) +2025-04-18 19:02:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:02:14 - utils - INFO - Processing 41 regions in 1 batches +2025-04-18 19:02:15 - utils - INFO - All captions generated in 0.82s +2025-04-18 19:02:15 - utils - INFO - Caption processing completed in 0.82s +2025-04-18 
19:02:15 - utils - INFO - SOM labeling completed in 1.67s +2025-04-18 19:02:57 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:02:58 - utils - INFO - Found 126 filtered boxes (starting_idx=85) +2025-04-18 19:02:58 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:02:58 - utils - INFO - Processing 41 regions in 1 batches +2025-04-18 19:02:59 - utils - INFO - All captions generated in 1.14s +2025-04-18 19:02:59 - utils - INFO - Caption processing completed in 1.14s +2025-04-18 19:03:00 - utils - INFO - SOM labeling completed in 2.10s +2025-04-18 19:03:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:03:14 - utils - INFO - Found 115 filtered boxes (starting_idx=78) +2025-04-18 19:03:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:03:14 - utils - INFO - Processing 37 regions in 1 batches +2025-04-18 19:03:16 - utils - INFO - All captions generated in 1.47s +2025-04-18 19:03:16 - utils - INFO - Caption processing completed in 1.47s +2025-04-18 19:03:16 - utils - INFO - SOM labeling completed in 2.14s +2025-04-18 19:03:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:03:28 - utils - INFO - Found 73 filtered boxes (starting_idx=21) +2025-04-18 19:03:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:03:28 - utils - INFO - Processing 52 regions in 1 batches +2025-04-18 19:03:30 - utils - INFO - All captions generated in 2.59s +2025-04-18 19:03:30 - utils - INFO - Caption processing completed in 2.60s +2025-04-18 19:03:30 - utils - INFO - SOM labeling completed in 3.79s +2025-04-18 19:03:44 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) +2025-04-18 19:03:45 - utils - INFO - Found 151 filtered boxes (starting_idx=96) +2025-04-18 19:03:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning +2025-04-18 19:03:45 - utils - INFO - Processing 55 regions in 1 batches +2025-04-18 19:03:47 - utils - INFO - All captions generated in 1.70s +2025-04-18 19:03:47 - utils - INFO - Caption processing completed in 1.70s +2025-04-18 19:03:47 - utils - INFO - SOM labeling completed in 2.81s diff --git a/omnitool/gradio/agent/llm_utils/geminiclient.py b/omnitool/gradio/agent/llm_utils/geminiclient.py index c610303d..ec17341d 100644 --- a/omnitool/gradio/agent/llm_utils/geminiclient.py +++ b/omnitool/gradio/agent/llm_utils/geminiclient.py @@ -1,27 +1,20 @@ import os from google import genai from google.genai import types -import tiktoken +from pydantic import BaseModel, Field +from typing import Optional +from PIL import Image +from pprint import pprint from .utils import is_image_path, encode_image -def estimate_token_count(text): - """Estimates the token count of a text string using tiktoken. - Adapt this for Gemini's specific vocabulary if necessary.""" +class Action(BaseModel): + reasoning: str = Field(..., alias="Reasoning") + next_action: str = Field(..., alias="Next Action") + box_id: str | None = Field(None, alias="Box ID") + value: str | None = None - # IMPORTANT: tiktoken is primarily for OpenAI models. - # You need to be aware of potential inaccuracies if Gemini - # uses a significantly different tokenization scheme. 
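One caution on the contents-building rewrite in the hunk below: the trailing else branch appends str(cnt) where cnt is no longer in scope (it should be item), and contents.push(messages) is not a Python list method, so the plain-string fallback path would raise at runtime. A corrected sketch of the intended interleaving, with is_image_path inlined for self-containment:

from PIL import Image

def is_image_path(s: str) -> bool:
    # Stand-in for agent.llm_utils.utils.is_image_path.
    return s.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif"))

def build_contents(messages) -> list:
    # Flatten chat messages into google-genai `contents`, loading image paths
    # as PIL images and passing everything else through as text.
    if not isinstance(messages, list):
        return [str(messages)]
    contents = []
    for item in messages:
        if isinstance(item, dict):
            for cnt in item["content"]:
                if isinstance(cnt, str) and is_image_path(cnt):
                    contents.append(Image.open(cnt))
                else:
                    contents.append(str(cnt))
        else:
            contents.append(str(item))
    return contents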
- - try: - encoding = tiktoken.get_encoding("cl100k_base") # This is a good starting point, but research Gemini tokenizer - tokens = encoding.encode(text) - return len(tokens) - except Exception as e: - print(f"Error estimating token count: {e}") - return None # or a reasonable default - -def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, temperature=0): +def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens: int, temperature=0): """ Run a chat completion through Gemini's API, ignoring any images in the messages. """ @@ -35,7 +28,9 @@ def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key generate_content_config = types.GenerateContentConfig( temperature=temperature, + max_output_tokens=max_tokens, response_mime_type="application/json", + response_schema=Action, system_instruction=[ types.Part.from_text(text=system), ], @@ -45,33 +40,21 @@ def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key if type(messages) == list: for item in messages: - parts = [] if isinstance(item, dict): for cnt in item["content"]: if isinstance(cnt, str): - parts.append(types.Part.from_text(text=cnt)) + if is_image_path(cnt): + contents.append(Image.open(cnt)) + else: + contents.append(cnt) else: - # in this case it is a text block from anthropic - parts.append(types.Part.from_text(text=str(cnt))) + contents.append(str(cnt)) else: # str - parts.append(types.Part.from_text(text=str(item))) - - content = (types.Content( - role="user", - parts=parts - )) - - contents.append(content) + contents.append(str(cnt)) - elif isinstance(messages, str): - contents = [ - types.Content( - role="user", - parts=[types.Part.from_text(text=messages)] - ) - ] + contents.push(messages) try: response = client.models.generate_content( @@ -80,7 +63,7 @@ def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key config=generate_content_config ) final_answer = response.text - token_usage = estimate_token_count(final_answer) + token_usage = response.usage_metadata.total_token_count return final_answer, token_usage except Exception as e: diff --git a/omnitool/gradio/agent/llm_utils/oaiclient.py b/omnitool/gradio/agent/llm_utils/oaiclient.py index ad421100..768a86e8 100644 --- a/omnitool/gradio/agent/llm_utils/oaiclient.py +++ b/omnitool/gradio/agent/llm_utils/oaiclient.py @@ -1,6 +1,3 @@ -import os -import logging -import base64 import requests from .utils import is_image_path, encode_image diff --git a/omnitool/gradio/agent/vlm_agent.py b/omnitool/gradio/agent/vlm_agent.py index afa748f1..ee6f6d0e 100644 --- a/omnitool/gradio/agent/vlm_agent.py +++ b/omnitool/gradio/agent/vlm_agent.py @@ -53,6 +53,8 @@ def __init__( self.model = "o3-mini" elif model == "omniparser + gemini-2.0-flash": self.model = "gemini-2.0-flash" + elif model == "omniparser + gemini-2.5-flash-preview-04-17": + self.model = "gemini-2.5-flash-preview-04-17" else: raise ValueError(f"Model {model} not supported") @@ -137,17 +139,18 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]): print(f"qwen token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a - elif "gemini-2.0-flash" in self.model: + elif "gemini" in self.model: vlm_response, token_usage = run_gemini_interleaved( messages=planner_messages, system=system, 
model_name=self.model, api_key=self.api_key, + max_tokens=self.max_tokens, temperature=0, ) print(f"gemini token usage: {token_usage}") self.total_token_usage += token_usage - self.total_cost += (token_usage * 0.99 / 1000000) + self.total_cost += 0 # assume using free tier else: raise ValueError(f"Model {self.model} not supported") latency_vlm = time.time() - start @@ -226,7 +229,9 @@ def _get_system_prompt(self, screen_info: str = ""): main_section = f""" You are using a {platform.system()} device. You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot. -You can only interact with the desktop GUI (no terminal or application menu access) and ignore the gradio interface (which opened in localhost:7888) including the orange send button there. +You can only interact with the desktop GUI (no terminal or application menu access) + +!!!DO NOT interact with the chatbot webpage interface that opens in 0.0.0.0:7888. You don't need to click the orange send button because the user already clicked it!!! You may be given some history plan and actions, this is the response from the previous loop. You should carefully consider your plan base on the task, screenshot, and history actions. @@ -245,6 +250,15 @@ def _get_system_prompt(self, screen_info: str = ""): Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task. +Use this JSON schema: + +Action = {{ + "Reasoning": str, + "Next Action": str, + "Box ID": str | None, + "value": str | None +}} + Output format: ```json {{ diff --git a/omnitool/gradio/agent/vlm_agent_with_orchestrator.py b/omnitool/gradio/agent/vlm_agent_with_orchestrator.py index 74d554a8..4b5d0275 100644 --- a/omnitool/gradio/agent/vlm_agent_with_orchestrator.py +++ b/omnitool/gradio/agent/vlm_agent_with_orchestrator.py @@ -14,6 +14,7 @@ from agent.llm_utils.oaiclient import run_oai_interleaved from agent.llm_utils.groqclient import run_groq_interleaved +from agent.llm_utils.geminiclient import run_gemini_interleaved from agent.llm_utils.utils import is_image_path import time import re @@ -85,6 +86,10 @@ def __init__( self.model = "o1" elif model == "omniparser + o3-mini" or model == "omniparser + o3-mini-orchestrated": self.model = "o3-mini" + elif model == "omniparser + gemini-2.0-flash" or model == "omniparser + gemini-2.0-flash-orchestrated": + self.model = "gemini-2.0-flash" + elif model == "omniparser + gemini-2.5-flash-preview-04-17" or model == "omniparser + gemini-2.5-flash-preview-04-17-orchestrated": + self.model = "gemini-2.5-flash-preview-04-17" else: raise ValueError(f"Model {model} not supported") @@ -194,6 +199,18 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]): print(f"qwen token usage: {token_usage}") self.total_token_usage += token_usage self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=planner_messages, + system=system, + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + 
self.total_cost += 0 # assume using free tier else: raise ValueError(f"Model {self.model} not supported") latency_vlm = time.time() - start @@ -312,6 +329,15 @@ def _get_system_prompt(self, screen_info: str = ""): Based on the visual information from the screenshot image and the detected bounding boxes, please determine the next action, the Box ID you should operate on (if action is one of 'type', 'hover', 'scroll_up', 'scroll_down', 'wait', there should be no Box ID field), and the value (if the action is 'type') in order to complete the task. +Use this JSON schema: + +Action = {{ + "Reasoning": str, + "Next Action": str, + "Box ID": str | None, + "value": str | None +}} + Output format: ```json {{ @@ -381,7 +407,9 @@ def _initialize_task(self, messages: list): plan_prompt = self._get_plan_prompt(self._task) input_message = copy.deepcopy(messages) input_message.append({"role": "user", "content": plan_prompt}) - vlm_response, token_usage = run_oai_interleaved( + + if "gpt" in self.model or "o1" in self.model or "o3-mini" in self.model: + vlm_response, token_usage = run_oai_interleaved( messages=input_message, system="", model_name=self.model, @@ -390,6 +418,53 @@ def _initialize_task(self, messages: list): provider_base_url="https://api.openai.com/v1", temperature=0, ) + print(f"oai token usage: {token_usage}") + self.total_token_usage += token_usage + if 'gpt' in self.model: + self.total_cost += (token_usage * 2.5 / 1000000) # https://openai.com/api/pricing/ + elif 'o1' in self.model: + self.total_cost += (token_usage * 15 / 1000000) # https://openai.com/api/pricing/ + elif 'o3-mini' in self.model: + self.total_cost += (token_usage * 1.1 / 1000000) # https://openai.com/api/pricing/ + elif "r1" in self.model: + vlm_response, token_usage = run_groq_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + ) + print(f"groq token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 0.99 / 1000000) + elif "qwen" in self.model: + vlm_response, token_usage = run_oai_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=min(2048, self.max_tokens), + provider_base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + temperature=0, + ) + print(f"qwen token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += 0 # assume using free tier + else: + raise ValueError(f"Model {self.model} not supported") + plan = extract_data(vlm_response, "json") # Create a filename with timestamp @@ -413,7 +488,9 @@ def _update_ledger(self, messages): update_ledger_prompt = ORCHESTRATOR_LEDGER_PROMPT.format(task=self._task) input_message = copy.deepcopy(messages) input_message.append({"role": "user", "content": update_ledger_prompt}) - vlm_response, token_usage = run_oai_interleaved( + + if "gpt" in self.model or "o1" in self.model or "o3-mini" in self.model: + vlm_response, token_usage = run_oai_interleaved( messages=input_message, 
system="", model_name=self.model, @@ -422,6 +499,53 @@ def _update_ledger(self, messages): provider_base_url="https://api.openai.com/v1", temperature=0, ) + print(f"oai token usage: {token_usage}") + self.total_token_usage += token_usage + if 'gpt' in self.model: + self.total_cost += (token_usage * 2.5 / 1000000) # https://openai.com/api/pricing/ + elif 'o1' in self.model: + self.total_cost += (token_usage * 15 / 1000000) # https://openai.com/api/pricing/ + elif 'o3-mini' in self.model: + self.total_cost += (token_usage * 1.1 / 1000000) # https://openai.com/api/pricing/ + elif "r1" in self.model: + vlm_response, token_usage = run_groq_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + ) + print(f"groq token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 0.99 / 1000000) + elif "qwen" in self.model: + vlm_response, token_usage = run_oai_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=min(2048, self.max_tokens), + provider_base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + temperature=0, + ) + print(f"qwen token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += (token_usage * 2.2 / 1000000) # https://help.aliyun.com/zh/model-studio/getting-started/models?spm=a2c4g.11186623.0.0.74b04823CGnPv7#fe96cfb1a422a + elif "gemini" in self.model: + vlm_response, token_usage = run_gemini_interleaved( + messages=input_message, + system="", + model_name=self.model, + api_key=self.api_key, + max_tokens=self.max_tokens, + temperature=0, + ) + print(f"gemini token usage: {token_usage}") + self.total_token_usage += token_usage + self.total_cost += 0 # assume using free tier + else: + raise ValueError(f"Model {self.model} not supported") + updated_ledger = extract_data(vlm_response, "json") return updated_ledger diff --git a/omnitool/gradio/app.py b/omnitool/gradio/app.py index 43e692a1..59e6c855 100644 --- a/omnitool/gradio/app.py +++ b/omnitool/gradio/app.py @@ -27,7 +27,7 @@ API_KEY_FILE = CONFIG_DIR / "api_key" INTRO_TEXT = ''' -OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL), Gemini(2.0-flash) or Anthropic Computer Use (Sonnet).** +OmniParser lets you turn any vision-langauge model into an AI agent. We currently support **OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL), Gemini (2.0/2.5) or Anthropic Computer Use (Sonnet).** Type a message and press submit to start OmniTool. Press stop to pause, and press the trash icon in the chat to clear the message history. 
''' @@ -241,7 +241,7 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - max_tokens=8192, + max_tokens=16384, omniparser_url=args.omniparser_server_url ): if loop_msg is None or state.get("stop"): @@ -302,7 +302,7 @@ def get_header_image_base64(): with gr.Column(): model = gr.Dropdown( label="Model", - choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"], + choices=["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "claude-3-5-sonnet-20241022", "omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.5-flash-preview-04-17-orchestrated"], value="omniparser + gpt-4o", interactive=True, ) @@ -362,7 +362,7 @@ def update_model(model_selection, state): provider_choices = ["groq"] elif model_selection == "omniparser + qwen2.5vl": provider_choices = ["dashscope"] - elif model_selection == "omniparser + gemini-2.0-flash": + elif model_selection in set(["omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.5-flash-preview-04-17-orchestrated"]): provider_choices = ["gemini"] else: provider_choices = [option.value for option in APIProvider] diff --git a/omnitool/gradio/loop.py b/omnitool/gradio/loop.py index bdd856e7..323cb4b2 100644 --- a/omnitool/gradio/loop.py +++ b/omnitool/gradio/loop.py @@ -68,7 +68,7 @@ def sampling_loop_sync( max_tokens=max_tokens, only_n_most_recent_images=only_n_most_recent_images ) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17"]): actor = VLMAgent( model=model, provider=provider, @@ -78,7 +78,7 @@ def sampling_loop_sync( max_tokens=max_tokens, only_n_most_recent_images=only_n_most_recent_images ) - elif model in set(["omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): + elif model in set(["omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.5-flash-preview-04-17-orchestrated"]): actor = VLMOrchestratedAgent( model=model, provider=provider, @@ -117,7 +117,7 @@ def sampling_loop_sync( messages.append({"content": tool_result_content, "role": "user"}) - elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", 
"omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated"]): + elif model in set(["omniparser + gpt-4o", "omniparser + o1", "omniparser + o3-mini", "omniparser + R1", "omniparser + qwen2.5vl", "omniparser + gemini-2.0-flash", "omniparser + gemini-2.5-flash-preview-04-17", "omniparser + gpt-4o-orchestrated", "omniparser + o1-orchestrated", "omniparser + o3-mini-orchestrated", "omniparser + R1-orchestrated", "omniparser + qwen2.5vl-orchestrated", "omniparser + gemini-2.0-flash-orchestrated", "omniparser + gemini-2.0-flash-thinking-exp-orchestrated"]): while True: parsed_screen = omniparser_client() tools_use_needed, vlm_response_json = actor(messages=messages, parsed_screen=parsed_screen) diff --git a/requirements.txt b/requirements.txt index ccc94b1e..b58b5423 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,5 +30,4 @@ screeninfo uiautomation dashscope groq -google-genai -tiktoken \ No newline at end of file +google-genai \ No newline at end of file From 73b7e70fc02413d497e93136902474e10c7afaeb Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 21:22:55 +0700 Subject: [PATCH 4/6] Refactor OmniParser integration and enhance device support - Updated .gitignore to ignore all .DS_Store files. - Modified AnthropicAgent to accept host_device argument for better flexibility. - Enhanced OmniParserClient to utilize host_device for screenshot functionality. - Updated app.py and app_new.py to include host_device argument in command line options. - Refactored ComputerTool to support actions based on host_device, improving compatibility with local and omnibox_windows environments. - Adjusted get_screenshot function to handle different host_device scenarios for capturing screenshots. 
--- .gitignore | 2 +- logs/gradio_gradio_app_20250418.log | 7 -- logs/gradio_gradio_tools_20250418.log | 0 logs/gradio_sampling_loop_20250418.log | 2 - logs/gradio_screen_capture_20250418.log | 0 logs/image_utils_20250418.log | 0 logs/model_utils_20250418.log | 18 ----- logs/ocr_utils_20250418.log | 21 ------ logs/omniparser_20250418.log | 62 ---------------- logs/utils_20250418.log | 70 ------------------- omnitool/gradio/agent/anthropic_agent.py | 3 +- .../agent/llm_utils/omniparserclient.py | 4 +- omnitool/gradio/app.py | 30 +++++--- omnitool/gradio/app_new.py | 11 ++- .../gradio/executor/anthropic_executor.py | 3 +- omnitool/gradio/loop.py | 6 +- omnitool/gradio/tools/computer.py | 55 +++++++++------ omnitool/gradio/tools/screen_capture.py | 34 +++++---- 18 files changed, 93 insertions(+), 235 deletions(-) delete mode 100644 logs/gradio_gradio_app_20250418.log delete mode 100644 logs/gradio_gradio_tools_20250418.log delete mode 100644 logs/gradio_sampling_loop_20250418.log delete mode 100644 logs/gradio_screen_capture_20250418.log delete mode 100644 logs/image_utils_20250418.log delete mode 100644 logs/model_utils_20250418.log delete mode 100644 logs/ocr_utils_20250418.log delete mode 100644 logs/omniparser_20250418.log delete mode 100644 logs/utils_20250418.log diff --git a/.gitignore b/.gitignore index 1761ebd1..20db02ca 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ index.html?linkid=2289031 wget-log weights/icon_caption_florence_v2/ omnitool/gradio/uploads/ -.DS_Store \ No newline at end of file +**/.DS_Store \ No newline at end of file diff --git a/logs/gradio_gradio_app_20250418.log b/logs/gradio_gradio_app_20250418.log deleted file mode 100644 index b60e93a4..00000000 --- a/logs/gradio_gradio_app_20250418.log +++ /dev/null @@ -1,7 +0,0 @@ -2025-04-18 19:49:59 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 -2025-04-18 19:50:20 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.0-flash -2025-04-18 19:51:16 - gradio_app - INFO - [+] Starting OmniTool Gradio server on port 7888 -2025-04-18 19:51:22 - gradio_app - INFO - [+] Model updated to: omniparser + gemini-2.5-flash-preview-04-17 -2025-04-18 19:52:05 - gradio_app - INFO - [+] Processing user input: 'on my spotify play "garam dan madu"...' 
(truncated) -2025-04-18 19:52:05 - gradio_app - ERROR - [-] Error in sampling loop: name 'OmniParserClient' is not defined -2025-04-18 19:52:05 - gradio_app - INFO - [+] Input processing completed in 0.14s diff --git a/logs/gradio_gradio_tools_20250418.log b/logs/gradio_gradio_tools_20250418.log deleted file mode 100644 index e69de29b..00000000 diff --git a/logs/gradio_sampling_loop_20250418.log b/logs/gradio_sampling_loop_20250418.log deleted file mode 100644 index 867a09da..00000000 --- a/logs/gradio_sampling_loop_20250418.log +++ /dev/null @@ -1,2 +0,0 @@ -2025-04-18 19:52:05 - sampling_loop - INFO - [+] Initializing sampling loop with model: omniparser + gemini-2.5-flash-preview-04-17 -2025-04-18 19:52:05 - sampling_loop - ERROR - [-] Failed to initialize OmniParser client: name 'OmniParserClient' is not defined diff --git a/logs/gradio_screen_capture_20250418.log b/logs/gradio_screen_capture_20250418.log deleted file mode 100644 index e69de29b..00000000 diff --git a/logs/image_utils_20250418.log b/logs/image_utils_20250418.log deleted file mode 100644 index e69de29b..00000000 diff --git a/logs/model_utils_20250418.log b/logs/model_utils_20250418.log deleted file mode 100644 index 48f1d970..00000000 --- a/logs/model_utils_20250418.log +++ /dev/null @@ -1,18 +0,0 @@ -2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:58:51 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:58:52 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt -2025-04-18 18:58:52 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt -2025-04-18 18:58:52 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence -2025-04-18 18:59:03 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps -2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:07 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:07 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt -2025-04-18 18:59:07 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt -2025-04-18 18:59:07 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence -2025-04-18 18:59:20 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps -2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:20 - model_utils - INFO - Using MPS for Apple Silicon -2025-04-18 18:59:20 - model_utils - INFO - Loading YOLO model from ../../weights/icon_detect/model.pt -2025-04-18 18:59:20 - model_utils - INFO - Successfully loaded YOLO model: ../../weights/icon_detect/model.pt -2025-04-18 18:59:20 - model_utils - INFO - Loading caption model florence2 from ../../weights/icon_caption_florence -2025-04-18 18:59:29 - model_utils - INFO - Loaded Florence2 model in float16 precision for mps diff --git a/logs/ocr_utils_20250418.log b/logs/ocr_utils_20250418.log deleted file mode 100644 index 86e03b17..00000000 --- a/logs/ocr_utils_20250418.log +++ /dev/null @@ -1,21 +0,0 @@ -2025-04-18 19:00:23 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:00:23 - ocr_utils - INFO - Initializing EasyOCR reader -2025-04-18 19:00:27 - ocr_utils - INFO - EasyOCR found 36 text elements -2025-04-18 19:00:43 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:00:45 - ocr_utils - INFO - EasyOCR found 35 
text elements -2025-04-18 19:01:06 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:01:08 - ocr_utils - INFO - EasyOCR found 101 text elements -2025-04-18 19:01:31 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:01:32 - ocr_utils - INFO - EasyOCR found 86 text elements -2025-04-18 19:01:52 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:01:54 - ocr_utils - INFO - EasyOCR found 110 text elements -2025-04-18 19:02:12 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:02:14 - ocr_utils - INFO - EasyOCR found 86 text elements -2025-04-18 19:02:55 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:02:57 - ocr_utils - INFO - EasyOCR found 99 text elements -2025-04-18 19:03:11 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:03:14 - ocr_utils - INFO - EasyOCR found 84 text elements -2025-04-18 19:03:24 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:03:27 - ocr_utils - INFO - EasyOCR found 21 text elements -2025-04-18 19:03:41 - ocr_utils - INFO - Running OCR on image of size 1470x956 -2025-04-18 19:03:44 - ocr_utils - INFO - EasyOCR found 125 text elements diff --git a/logs/omniparser_20250418.log b/logs/omniparser_20250418.log deleted file mode 100644 index e381e029..00000000 --- a/logs/omniparser_20250418.log +++ /dev/null @@ -1,62 +0,0 @@ -2025-04-18 18:58:51 - omniparser - INFO - Initializing OmniParser -2025-04-18 18:58:52 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt -2025-04-18 18:59:03 - omniparser - INFO - Caption model loaded: florence2 -2025-04-18 18:59:03 - omniparser - INFO - OmniParser initialization complete! -2025-04-18 18:59:07 - omniparser - INFO - Initializing OmniParser -2025-04-18 18:59:07 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt -2025-04-18 18:59:20 - omniparser - INFO - Caption model loaded: florence2 -2025-04-18 18:59:20 - omniparser - INFO - OmniParser initialization complete! -2025-04-18 18:59:20 - omniparser - INFO - Initializing OmniParser -2025-04-18 18:59:20 - omniparser - INFO - SOM model loaded from ../../weights/icon_detect/model.pt -2025-04-18 18:59:29 - omniparser - INFO - Caption model loaded: florence2 -2025-04-18 18:59:29 - omniparser - INFO - OmniParser initialization complete! -2025-04-18 19:00:23 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:00:23 - omniparser - INFO - Running OCR on image -2025-04-18 19:00:27 - omniparser - INFO - OCR found 36 text elements -2025-04-18 19:00:27 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:00:29 - omniparser - INFO - Parsing complete. Found 62 UI elements -2025-04-18 19:00:43 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:00:43 - omniparser - INFO - Running OCR on image -2025-04-18 19:00:45 - omniparser - INFO - OCR found 35 text elements -2025-04-18 19:00:45 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:00:47 - omniparser - INFO - Parsing complete. Found 58 UI elements -2025-04-18 19:01:06 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:01:06 - omniparser - INFO - Running OCR on image -2025-04-18 19:01:08 - omniparser - INFO - OCR found 101 text elements -2025-04-18 19:01:08 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:01:10 - omniparser - INFO - Parsing complete. 
Found 130 UI elements -2025-04-18 19:01:31 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:01:31 - omniparser - INFO - Running OCR on image -2025-04-18 19:01:32 - omniparser - INFO - OCR found 86 text elements -2025-04-18 19:01:32 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:01:34 - omniparser - INFO - Parsing complete. Found 127 UI elements -2025-04-18 19:01:52 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:01:52 - omniparser - INFO - Running OCR on image -2025-04-18 19:01:54 - omniparser - INFO - OCR found 110 text elements -2025-04-18 19:01:54 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:01:56 - omniparser - INFO - Parsing complete. Found 141 UI elements -2025-04-18 19:02:12 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:02:12 - omniparser - INFO - Running OCR on image -2025-04-18 19:02:14 - omniparser - INFO - OCR found 86 text elements -2025-04-18 19:02:14 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:02:15 - omniparser - INFO - Parsing complete. Found 127 UI elements -2025-04-18 19:02:55 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:02:55 - omniparser - INFO - Running OCR on image -2025-04-18 19:02:57 - omniparser - INFO - OCR found 99 text elements -2025-04-18 19:02:57 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:00 - omniparser - INFO - Parsing complete. Found 126 UI elements -2025-04-18 19:03:11 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:03:11 - omniparser - INFO - Running OCR on image -2025-04-18 19:03:14 - omniparser - INFO - OCR found 84 text elements -2025-04-18 19:03:14 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:16 - omniparser - INFO - Parsing complete. Found 115 UI elements -2025-04-18 19:03:24 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:03:24 - omniparser - INFO - Running OCR on image -2025-04-18 19:03:27 - omniparser - INFO - OCR found 21 text elements -2025-04-18 19:03:27 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:30 - omniparser - INFO - Parsing complete. Found 73 UI elements -2025-04-18 19:03:41 - omniparser - INFO - Processing image of size 1470x956 -2025-04-18 19:03:41 - omniparser - INFO - Running OCR on image -2025-04-18 19:03:44 - omniparser - INFO - OCR found 125 text elements -2025-04-18 19:03:44 - omniparser - INFO - Processing image with SOM labeling -2025-04-18 19:03:47 - omniparser - INFO - Parsing complete. 
Found 151 UI elements diff --git a/logs/utils_20250418.log b/logs/utils_20250418.log deleted file mode 100644 index 0e702fe6..00000000 --- a/logs/utils_20250418.log +++ /dev/null @@ -1,70 +0,0 @@ -2025-04-18 19:00:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:00:28 - utils - INFO - Found 62 filtered boxes (starting_idx=33) -2025-04-18 19:00:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:00:28 - utils - INFO - Processing 29 regions in 1 batches -2025-04-18 19:00:29 - utils - INFO - All captions generated in 1.02s -2025-04-18 19:00:29 - utils - INFO - Caption processing completed in 1.02s -2025-04-18 19:00:29 - utils - INFO - SOM labeling completed in 2.17s -2025-04-18 19:00:45 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:00:45 - utils - INFO - Found 58 filtered boxes (starting_idx=31) -2025-04-18 19:00:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:00:45 - utils - INFO - Processing 27 regions in 1 batches -2025-04-18 19:00:47 - utils - INFO - All captions generated in 1.91s -2025-04-18 19:00:47 - utils - INFO - Caption processing completed in 1.91s -2025-04-18 19:00:47 - utils - INFO - SOM labeling completed in 2.84s -2025-04-18 19:01:08 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:01:09 - utils - INFO - Found 130 filtered boxes (starting_idx=87) -2025-04-18 19:01:09 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:01:09 - utils - INFO - Processing 43 regions in 1 batches -2025-04-18 19:01:10 - utils - INFO - All captions generated in 1.24s -2025-04-18 19:01:10 - utils - INFO - Caption processing completed in 1.24s -2025-04-18 19:01:10 - utils - INFO - SOM labeling completed in 2.18s -2025-04-18 19:01:32 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:01:33 - utils - INFO - Found 127 filtered boxes (starting_idx=86) -2025-04-18 19:01:33 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:01:33 - utils - INFO - Processing 41 regions in 1 batches -2025-04-18 19:01:34 - utils - INFO - All captions generated in 0.93s -2025-04-18 19:01:34 - utils - INFO - Caption processing completed in 0.93s -2025-04-18 19:01:34 - utils - INFO - SOM labeling completed in 1.73s -2025-04-18 19:01:54 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:01:55 - utils - INFO - Found 141 filtered boxes (starting_idx=97) -2025-04-18 19:01:55 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:01:55 - utils - INFO - Processing 44 regions in 1 batches -2025-04-18 19:01:56 - utils - INFO - All captions generated in 1.08s -2025-04-18 19:01:56 - utils - INFO - Caption processing completed in 1.08s -2025-04-18 19:01:56 - utils - INFO - SOM labeling completed in 1.83s -2025-04-18 19:02:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:02:14 - utils - INFO - Found 127 filtered boxes (starting_idx=86) -2025-04-18 19:02:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:02:14 - utils - INFO - Processing 41 regions in 1 batches -2025-04-18 19:02:15 - utils - INFO - All captions generated in 0.82s -2025-04-18 19:02:15 - utils - INFO - Caption processing completed in 0.82s 
-2025-04-18 19:02:15 - utils - INFO - SOM labeling completed in 1.67s -2025-04-18 19:02:57 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:02:58 - utils - INFO - Found 126 filtered boxes (starting_idx=85) -2025-04-18 19:02:58 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:02:58 - utils - INFO - Processing 41 regions in 1 batches -2025-04-18 19:02:59 - utils - INFO - All captions generated in 1.14s -2025-04-18 19:02:59 - utils - INFO - Caption processing completed in 1.14s -2025-04-18 19:03:00 - utils - INFO - SOM labeling completed in 2.10s -2025-04-18 19:03:14 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:03:14 - utils - INFO - Found 115 filtered boxes (starting_idx=78) -2025-04-18 19:03:14 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:03:14 - utils - INFO - Processing 37 regions in 1 batches -2025-04-18 19:03:16 - utils - INFO - All captions generated in 1.47s -2025-04-18 19:03:16 - utils - INFO - Caption processing completed in 1.47s -2025-04-18 19:03:16 - utils - INFO - SOM labeling completed in 2.14s -2025-04-18 19:03:27 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:03:28 - utils - INFO - Found 73 filtered boxes (starting_idx=21) -2025-04-18 19:03:28 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:03:28 - utils - INFO - Processing 52 regions in 1 batches -2025-04-18 19:03:30 - utils - INFO - All captions generated in 2.59s -2025-04-18 19:03:30 - utils - INFO - Caption processing completed in 2.60s -2025-04-18 19:03:30 - utils - INFO - SOM labeling completed in 3.79s -2025-04-18 19:03:44 - utils - INFO - Processing image with SOM labeling (threshold=0.05, iou=0.7) -2025-04-18 19:03:45 - utils - INFO - Found 151 filtered boxes (starting_idx=96) -2025-04-18 19:03:45 - utils - INFO - Using ../../weights/icon_caption_florence model for captioning -2025-04-18 19:03:45 - utils - INFO - Processing 55 regions in 1 batches -2025-04-18 19:03:47 - utils - INFO - All captions generated in 1.70s -2025-04-18 19:03:47 - utils - INFO - Caption processing completed in 1.70s -2025-04-18 19:03:47 - utils - INFO - SOM labeling completed in 2.81s diff --git a/omnitool/gradio/agent/anthropic_agent.py b/omnitool/gradio/agent/anthropic_agent.py index 6b9423fc..55d9b1fa 100644 --- a/omnitool/gradio/agent/anthropic_agent.py +++ b/omnitool/gradio/agent/anthropic_agent.py @@ -47,6 +47,7 @@ class APIProvider(StrEnum): class AnthropicActor: def __init__( self, + args, model: str, provider: APIProvider, api_key: str, @@ -62,7 +63,7 @@ def __init__( self.max_tokens = max_tokens self.only_n_most_recent_images = only_n_most_recent_images - self.tool_collection = ToolCollection(ComputerTool()) + self.tool_collection = ToolCollection(ComputerTool(args=args)) self.system = SYSTEM_PROMPT diff --git a/omnitool/gradio/agent/llm_utils/omniparserclient.py b/omnitool/gradio/agent/llm_utils/omniparserclient.py index e90ddef8..fc6921aa 100644 --- a/omnitool/gradio/agent/llm_utils/omniparserclient.py +++ b/omnitool/gradio/agent/llm_utils/omniparserclient.py @@ -8,11 +8,13 @@ class OmniParserClient: def __init__(self, + host_device: str, url: str) -> None: + self.host_device = host_device self.url = url def __call__(self,): - screenshot, screenshot_path = get_screenshot() + screenshot, screenshot_path = get_screenshot(host_device=self.host_device) 
screenshot_path = str(screenshot_path) image_base64 = encode_image(screenshot_path) response = requests.post(self.url, json={"base64_image": image_base64}) diff --git a/omnitool/gradio/app.py b/omnitool/gradio/app.py index 59e6c855..53dc1c0c 100644 --- a/omnitool/gradio/app.py +++ b/omnitool/gradio/app.py @@ -1,5 +1,6 @@ """ -python app.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 +python app.py --host_device omnibox_windows --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 +python app.py --host_device local --omniparser_server_url localhost:8000 """ import os @@ -35,7 +36,8 @@ def parse_arguments(): parser = argparse.ArgumentParser(description="Gradio App") - parser.add_argument("--windows_host_url", type=str, default='localhost:8006') + parser.add_argument("--host_device", type=str, choices=["omnibox_windows", "local"], default="omnibox_windows") + parser.add_argument("--windows_host_url", type=str, default="localhost:8006") parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") return parser.parse_args() args = parse_arguments() @@ -189,8 +191,13 @@ def _truncate_string(s, max_length=500): def valid_params(user_input, state): """Validate all requirements and return a list of error messages.""" errors = [] + + servers = [('OmniParser Server', args.omniparser_server_url)] + + if args.host_device == "omnibox_windows": + servers.append(("Windows Host", args.windows_host_url)) - for server_name, url in [('OmniParser Server', args.omniparser_server_url)]: + for server_name, url in servers: try: url = f'http://{url}/probe' response = requests.get(url, timeout=3) @@ -233,6 +240,7 @@ def process_input(user_input, state): # Run sampling_loop_sync with the chatbot_output_callback for loop_msg in sampling_loop_sync( + args=args, model=state["model"], provider=state["provider"], messages=state["messages"], @@ -241,8 +249,7 @@ def process_input(user_input, state): api_response_callback=partial(_api_response_callback, response_state=state["responses"]), api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], - max_tokens=16384, - omniparser_url=args.omniparser_server_url + max_tokens=16384 ): if loop_msg is None or state.get("stop"): yield state['chatbot_messages'] @@ -343,12 +350,13 @@ def get_header_image_base64(): with gr.Row(): with gr.Column(scale=2): chatbot = gr.Chatbot(label="Chatbot History", autoscroll=True, height=580) - # with gr.Column(scale=3): - # iframe = gr.HTML( - # f'', - # container=False, - # elem_classes="no-padding" - # ) + if args.host_device == "omnibox_windows": + with gr.Column(scale=3): + iframe = gr.HTML( + f'', + container=False, + elem_classes="no-padding" + ) def update_model(model_selection, state): state["model"] = model_selection diff --git a/omnitool/gradio/app_new.py b/omnitool/gradio/app_new.py index c907dc3c..4bb80b16 100644 --- a/omnitool/gradio/app_new.py +++ b/omnitool/gradio/app_new.py @@ -3,6 +3,7 @@ - a new UI for the OmniParser AI Agent. 
- python app_new.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000 +python app_new.py --host_device local --omniparser_server_url localhost:8000 """ import os @@ -43,6 +44,7 @@ def parse_arguments(): parser = argparse.ArgumentParser(description="Gradio App") + parser.add_argument("--host_device", type=str, choices=["omnibox_windows", "local"], default="omnibox_windows") parser.add_argument("--windows_host_url", type=str, default='localhost:8006') parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000") parser.add_argument("--run_folder", type=str, default="./tmp/outputs") @@ -222,8 +224,13 @@ def _truncate_string(s, max_length=500): def valid_params(user_input, state): """Validate all requirements and return a list of error messages.""" errors = [] + + servers = [('OmniParser Server', args.omniparser_server_url)] + + if args.host_device == "omnibox_windows": + servers.append(("Windows Host", args.windows_host_url)) - for server_name, url in [('OmniParser Server', args.omniparser_server_url)]: + for server_name, url in servers: try: url = f'http://{url}/probe' response = requests.get(url, timeout=3) @@ -266,6 +273,7 @@ def process_input(user_input, state): # Run sampling_loop_sync with the chatbot_output_callback for loop_msg in sampling_loop_sync( + args=args, model=state["model"], provider=state["provider"], messages=state["messages"], @@ -275,7 +283,6 @@ def process_input(user_input, state): api_key=state["api_key"], only_n_most_recent_images=state["only_n_most_recent_images"], max_tokens=16384, - omniparser_url=args.omniparser_server_url, save_folder=str(RUN_FOLDER) ): if loop_msg is None or state.get("stop"): diff --git a/omnitool/gradio/executor/anthropic_executor.py b/omnitool/gradio/executor/anthropic_executor.py index f5c1a77f..7014b9fb 100644 --- a/omnitool/gradio/executor/anthropic_executor.py +++ b/omnitool/gradio/executor/anthropic_executor.py @@ -18,11 +18,12 @@ class AnthropicExecutor: def __init__( self, + args, output_callback: Callable[[BetaContentBlockParam], None], tool_output_callback: Callable[[Any, str], None], ): self.tool_collection = ToolCollection( - ComputerTool() + ComputerTool(args=args) ) self.output_callback = output_callback self.tool_output_callback = tool_output_callback diff --git a/omnitool/gradio/loop.py b/omnitool/gradio/loop.py index 323cb4b2..6985c928 100644 --- a/omnitool/gradio/loop.py +++ b/omnitool/gradio/loop.py @@ -41,6 +41,7 @@ class APIProvider(StrEnum): def sampling_loop_sync( *, + args, model: str, provider: APIProvider | None, messages: list[BetaMessageParam], @@ -50,17 +51,17 @@ def sampling_loop_sync( api_key: str, only_n_most_recent_images: int | None = 2, max_tokens: int = 4096, - omniparser_url: str, save_folder: str = "./uploads" ): """ Synchronous agentic sampling loop for the assistant/tool interaction of computer use. 
""" print('in sampling_loop_sync, model:', model) - omniparser_client = OmniParserClient(url=f"http://{omniparser_url}/parse/") + omniparser_client = OmniParserClient(host_device=args.host_device, url=f"http://{args.omniparser_server_url}/parse/") if model == "claude-3-5-sonnet-20241022": # Register Actor and Executor actor = AnthropicActor( + args=args, model=model, provider=provider, api_key=api_key, @@ -92,6 +93,7 @@ def sampling_loop_sync( else: raise ValueError(f"Model {model} not supported") executor = AnthropicExecutor( + args=args, output_callback=output_callback, tool_output_callback=tool_output_callback, ) diff --git a/omnitool/gradio/tools/computer.py b/omnitool/gradio/tools/computer.py index e0812692..90dcc450 100644 --- a/omnitool/gradio/tools/computer.py +++ b/omnitool/gradio/tools/computer.py @@ -6,7 +6,6 @@ import shlex import os import subprocess -import pyautogui from PIL import Image @@ -95,9 +94,11 @@ def options(self) -> ComputerToolOptions: def to_params(self) -> BetaToolComputerUse20241022Param: return {"name": self.name, "type": self.api_type, **self.options} - def __init__(self, is_scaling: bool = False): + def __init__(self, args, is_scaling: bool = False): super().__init__() + self.args = args + # Get screen width and height using Windows command self.display_num = None self.offset_x = 0 @@ -148,11 +149,11 @@ async def __call__( print(f"mouse move to {x}, {y}") if action == "mouse_move": - self.send_to_vm(f"pyautogui.moveTo({x}, {y})") + self.send_to_host_device(f"pyautogui.moveTo({x}, {y})") return ToolResult(output=f"Moved mouse to ({x}, {y})") elif action == "left_click_drag": - current_x, current_y = self.send_to_vm("pyautogui.position()") - self.send_to_vm(f"pyautogui.dragTo({x}, {y}, duration=0.5)") + current_x, current_y = self.send_to_host_device("pyautogui.position()") + self.send_to_host_device(f"pyautogui.dragTo({x}, {y}, duration=0.5)") return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})") if action in ("key", "type"): @@ -169,18 +170,18 @@ async def __call__( for key in keys: key = self.key_conversion.get(key.strip(), key.strip()) key = key.lower() - self.send_to_vm(f"pyautogui.keyDown('{key}')") # Press down each key + self.send_to_host_device(f"pyautogui.keyDown('{key}')") # Press down each key for key in reversed(keys): key = self.key_conversion.get(key.strip(), key.strip()) key = key.lower() - self.send_to_vm(f"pyautogui.keyUp('{key}')") # Release each key in reverse order + self.send_to_host_device(f"pyautogui.keyUp('{key}')") # Release each key in reverse order return ToolResult(output=f"Pressed keys: {text}") elif action == "type": # default click before type TODO: check if this is needed - self.send_to_vm("pyautogui.click()") - self.send_to_vm(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})") - self.send_to_vm("pyautogui.press('enter')") + self.send_to_host_device("pyautogui.click()") + self.send_to_host_device(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})") + self.send_to_host_device("pyautogui.press('enter')") screenshot_base64 = (await self.screenshot()).base64_image return ToolResult(output=text, base64_image=screenshot_base64) @@ -201,28 +202,28 @@ async def __call__( if action == "screenshot": return await self.screenshot() elif action == "cursor_position": - x, y = self.send_to_vm("pyautogui.position()") + x, y = self.send_to_host_device("pyautogui.position()") x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y) return ToolResult(output=f"X={x},Y={y}") 
else: if action == "left_click": - self.send_to_vm("pyautogui.click()") + self.send_to_host_device("pyautogui.click()") elif action == "right_click": - self.send_to_vm("pyautogui.rightClick()") + self.send_to_host_device("pyautogui.rightClick()") elif action == "middle_click": - self.send_to_vm("pyautogui.middleClick()") + self.send_to_host_device("pyautogui.middleClick()") elif action == "double_click": - self.send_to_vm("pyautogui.doubleClick()") + self.send_to_host_device("pyautogui.doubleClick()") elif action == "left_press": - self.send_to_vm("pyautogui.mouseDown()") + self.send_to_host_device("pyautogui.mouseDown()") time.sleep(1) - self.send_to_vm("pyautogui.mouseUp()") + self.send_to_host_device("pyautogui.mouseUp()") return ToolResult(output=f"Performed {action}") if action in ("scroll_up", "scroll_down"): if action == "scroll_up": - self.send_to_vm("pyautogui.scroll(100)") + self.send_to_host_device("pyautogui.scroll(100)") elif action == "scroll_down": - self.send_to_vm("pyautogui.scroll(-100)") + self.send_to_host_device("pyautogui.scroll(-100)") return ToolResult(output=f"Performed {action}") if action == "hover": return ToolResult(output=f"Performed {action}") @@ -231,7 +232,7 @@ async def __call__( return ToolResult(output=f"Performed {action}") raise ToolError(f"Invalid action: {action}") - def send_to_vm(self, action: str): + def send_to_host_device(self, action: str): """ Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()" """ prefix = "import pyautogui; pyautogui.FAILSAFE = False;" command_list = ["python", "-c", f"{prefix} {action}"] @@ -243,7 +244,17 @@ def send_to_vm(self, action: str): try: print(f"sending to vm: {command_list}") - response = self.execute(command_list) + + if self.args.host_device == "omnibox_windows": + response = requests.post( + f"http://localhost:5000/execute", + headers={'Content-Type': 'application/json'}, + json={"command": command_list}, + timeout=90 + ) + elif self.args.host_device == "local": + response = self.execute(command_list) + time.sleep(0.7) # avoid async error as actions take time to complete print(f"action executed") @@ -287,7 +298,7 @@ async def screenshot(self): screenshot = self.padding_image(screenshot) self.target_dimension = MAX_SCALING_TARGETS["WXGA"] width, height = self.target_dimension["width"], self.target_dimension["height"] - screenshot, path = get_screenshot(resize=True, target_width=width, target_height=height) + screenshot, path = get_screenshot(host_device=self.args.host_device, resize=True, target_width=width, target_height=height) time.sleep(0.7) # avoid async error as actions take time to complete return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode()) diff --git a/omnitool/gradio/tools/screen_capture.py b/omnitool/gradio/tools/screen_capture.py index 249e6358..b3b25daa 100644 --- a/omnitool/gradio/tools/screen_capture.py +++ b/omnitool/gradio/tools/screen_capture.py @@ -9,27 +9,33 @@ OUTPUT_DIR = "./tmp/outputs" -def get_screenshot(resize: bool = False, target_width: int = 1920, target_height: int = 1080): +def get_screenshot(host_device: str, resize: bool = False, target_width: int = 1920, target_height: int = 1080): """Capture screenshot by requesting from HTTP endpoint - returns native resolution unless resized""" output_dir = Path(OUTPUT_DIR) output_dir.mkdir(parents=True, exist_ok=True) path = output_dir / f"screenshot_{uuid4().hex}.png" try: - screenshot = pyautogui.screenshot() - size = pyautogui.size() + if host_device == "omnibox_windows": + response = requests.get('http://localhost:5000/screenshot') + if response.status_code != 200: + raise
ToolError(f"Failed to capture screenshot: HTTP {response.status_code}") + # (1280, 800) + screenshot = Image.open(BytesIO(response.content)) + if resize and screenshot.size != (target_width, target_height): + screenshot = screenshot.resize((target_width, target_height)) + elif host_device == "local": + screenshot = pyautogui.screenshot() + size = pyautogui.size() + + screenshot = screenshot.resize((size.width, size.height)) - target_width = size.width - target_height = size.height - - screenshot = screenshot.resize((target_width, target_height)) - - cursor_path = os.path.join(os.path.dirname(__file__), "cursor.png") - cursor_x, cursor_y = pyautogui.position() - cursor = Image.open(cursor_path) - # make the cursor smaller - cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5))) - screenshot.paste(cursor, (cursor_x, cursor_y), cursor) + cursor_path = os.path.join(os.path.dirname(__file__), "cursor.png") + cursor_x, cursor_y = pyautogui.position() + cursor = Image.open(cursor_path) + # make the cursor smaller + cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5))) + screenshot.paste(cursor, (cursor_x, cursor_y), cursor) screenshot.save(path) return screenshot, path From fc895b688b6f12608e5c31c80f53ef2197e19d11 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 22:49:04 +0700 Subject: [PATCH 5/6] update run_gemini_interleaved docs --- omnitool/gradio/agent/llm_utils/geminiclient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnitool/gradio/agent/llm_utils/geminiclient.py b/omnitool/gradio/agent/llm_utils/geminiclient.py index ec17341d..d6bd8b51 100644 --- a/omnitool/gradio/agent/llm_utils/geminiclient.py +++ b/omnitool/gradio/agent/llm_utils/geminiclient.py @@ -16,7 +16,7 @@ class Action(BaseModel): def run_gemini_interleaved(messages: list, system: str, model_name: str, api_key: str, max_tokens: int, temperature=0): """ - Run a chat completion through Gemini's API, ignoring any images in the messages. + Run a chat completion through Google Gemini's API """ api_key = api_key or os.environ.get("GEMINI_API_KEY") if not api_key: From 9d7dc540238f186567983556ec110829d82c6e99 Mon Sep 17 00:00:00 2001 From: boedegoat Date: Fri, 18 Apr 2025 22:59:59 +0700 Subject: [PATCH 6/6] Improve command execution handling for host device and enhance error reporting --- omnitool/gradio/tools/computer.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/omnitool/gradio/tools/computer.py b/omnitool/gradio/tools/computer.py index 90dcc450..f54f0b11 100644 --- a/omnitool/gradio/tools/computer.py +++ b/omnitool/gradio/tools/computer.py @@ -234,7 +234,7 @@ async def __call__( def send_to_host_device(self, action: str): """ - Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()" + Executes a python command on the host device. Only return tuple of x,y when action is "pyautogui.position()" """ prefix = "import pyautogui; pyautogui.FAILSAFE = False;" command_list = ["python", "-c", f"{prefix} {action}"] @@ -252,14 +252,17 @@ def send_to_host_device(self, action: str): json={"command": command_list}, timeout=90 ) + if response.status_code != 200: + raise ToolError(f"Failed to execute command. 
Status code: {response.status_code}") + output = response.json()['output'].strip() elif self.args.host_device == "local": response = self.execute(command_list) + output = response['output'].strip() time.sleep(0.7) # avoid async error as actions take time to complete print(f"action executed") if parse: - output = response['output'].strip() match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output) if not match: raise ToolError(f"Could not parse coordinates from output: {output}") @@ -347,9 +350,21 @@ def scale_coordinates(self, source: ScalingSource, x: int, y: int): def get_screen_size(self): """Return width and height of the screen""" try: - response = self.execute(["python", "-c", "import pyautogui; print(pyautogui.size())"]) + if self.args.host_device == "omnibox_windows": + response = requests.post( + f"http://localhost:5000/execute", + headers={'Content-Type': 'application/json'}, + json={"command": ["python", "-c", "import pyautogui; print(pyautogui.size())"]}, + timeout=90 + ) + + if response.status_code != 200: + raise ToolError(f"Failed to get screen size. Status code: {response.status_code}") + output = response.json()['output'].strip() + elif self.args.host_device == "local": + response = self.execute(["python", "-c", "import pyautogui; print(pyautogui.size())"]) + output = response['output'].strip() - output = response['output'].strip() match = re.search(r'Size\(width=(\d+),\s*height=(\d+)\)', output) if not match: raise ToolError(f"Could not parse screen size from output: {output}")
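As a closing note on the parsing in get_screen_size: pyautogui.size() returns a named tuple whose repr is exactly what the regex above matches. A quick standalone sanity check of that parsing logic (the sample dimensions are illustrative, not from this repository):

    import re

    sample_output = "Size(width=1470, height=956)"  # illustrative repr of pyautogui.size()
    match = re.search(r'Size\(width=(\d+),\s*height=(\d+)\)', sample_output)
    if not match:
        raise ValueError(f"Could not parse screen size from output: {sample_output}")
    width, height = int(match.group(1)), int(match.group(2))
    print(width, height)  # prints: 1470 956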