From ec3bb17cabd583ec55380e1997c52ea0049a0858 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 13:37:32 +0530 Subject: [PATCH 1/9] issue of the utils --- requirements.txt | 30 +++++++++++++++--------------- util/utils.py | 17 +++++++++-------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/requirements.txt b/requirements.txt index 901a27fa..b8b09d69 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,31 @@ torch easyocr torchvision -supervision==0.18.0 -openai==1.3.5 +supervision +openai transformers -ultralytics==8.3.70 +ultralytics azure-identity -numpy==1.26.4 +numpy opencv-python opencv-python-headless gradio dill accelerate timm -einops==0.8.0 +einops paddlepaddle paddleocr -ruff==0.6.7 -pre-commit==3.8.0 -pytest==8.3.3 -pytest-asyncio==0.23.6 -pyautogui==0.9.54 -streamlit>=1.38.0 -anthropic[bedrock,vertex]>=0.37.1 -jsonschema==4.22.0 -boto3>=1.28.57 -google-auth<3,>=2 +ruff +pre-commit +pytest +pytest-asyncio +pyautogui +streamlit +anthropic[bedrock,vertex] +jsonschema +boto3 +google-auth screeninfo uiautomation dashscope diff --git a/util/utils.py b/util/utils.py index eb7c8b25..b291c4bc 100644 --- a/util/utils.py +++ b/util/utils.py @@ -21,14 +21,15 @@ from paddleocr import PaddleOCR reader = easyocr.Reader(['en']) paddle_ocr = PaddleOCR( - lang='en', # other lang also available + # lang='en', # other lang also available use_angle_cls=False, - use_gpu=False, # using cuda will conflict with pytorch in the same process - show_log=False, - max_batch_size=1024, - use_dilation=True, # improves accuracy - det_db_score_mode='slow', # improves accuracy - rec_batch_num=1024) + # use_gpu=False, # using cuda will conflict with pytorch in the same process + # show_log=False, + # max_batch_size=1024, + # use_dilation=True, # improves accuracy + # det_db_score_mode='slow', # improves accuracy + # rec_batch_num=1024) +) import time import base64 @@ -514,7 +515,7 @@ def check_ocr_box(image_source: Union[str, Image.Image], display_img = True, out text_threshold = 0.5 else: text_threshold = easyocr_args['text_threshold'] - result = paddle_ocr.ocr(image_np, cls=False)[0] + result = paddle_ocr.ocr(image_np)[0] coord = [item[0] for item in result if item[1][1] > text_threshold] text = [item[1][0] for item in result if item[1][1] > text_threshold] else: # EasyOCR From 40f5ca65750838ed4ad5272bbb459b0d005c2147 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 16:20:33 +0530 Subject: [PATCH 2/9] push --- main.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 00000000..b2c6b7aa --- /dev/null +++ b/main.py @@ -0,0 +1,74 @@ +from fastapi import FastAPI, UploadFile, File +from typing import Optional +import torch +from PIL import Image +import io +import base64 +import os +from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img + +# Initialize models +yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') +caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") +DEVICE = torch.device('cuda') + +app = FastAPI(title="OmniParser API") + +def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: + """ + Process the image and return the annotated image and parsed elements. 
+ """ + box_overlay_ratio = image.size[0] / 3200 + draw_bbox_config = { + 'text_scale': 0.8 * box_overlay_ratio, + 'text_thickness': max(int(2 * box_overlay_ratio), 1), + 'text_padding': max(int(3 * box_overlay_ratio), 1), + 'thickness': max(int(3 * box_overlay_ratio), 1), + } + + ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) + text, ocr_bbox = ocr_bbox_rslt + dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img( + image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox, + draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text, + iou_threshold=iou_threshold, imgsz=imgsz + ) + annotated_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) + parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) + return annotated_image, parsed_content_str + +@app.post("/process") +async def process_endpoint( + file: UploadFile = File(...), + box_threshold: float = 0.05, + iou_threshold: float = 0.1, + use_paddleocr: bool = True, + imgsz: int = 640 +): + """ + Endpoint to upload an image and get parsed elements. + """ + try: + # Read the uploaded image + image_bytes = await file.read() + image = Image.open(io.BytesIO(image_bytes)).convert("RGB") + + # Process the image + annotated_image, parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) + + # Convert annotated image to bytes for return + img_byte_arr = io.BytesIO() + annotated_image.save(img_byte_arr, format='PNG') + img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8') + + # Return JSON response with image and parsed content + return { + "annotated_image": img_base64, + "parsed_elements": parsed_content + } + except Exception as e: + return {"error": str(e)} + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file From 045f454fa947e9809886ee8e16240433ab43d981 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 17:23:20 +0530 Subject: [PATCH 3/9] check --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index b2c6b7aa..cde9ae8e 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,8 @@ import os from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" + # Initialize models yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") From 09c5216d9d46693487d52acc395e73cb8e54bdf5 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 18:40:49 +0530 Subject: [PATCH 4/9] changes --- .gitignore | 3 ++- main.py | 28 +++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 8b8235e6..a0c7ff8c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ util/__pycache__/ index.html?linkid=2289031 wget-log weights/icon_caption_florence_v2/ -omnitool/gradio/uploads/ \ No newline at end of file +omnitool/gradio/uploads/ +.env \ No newline at end of file diff --git a/main.py b/main.py index cde9ae8e..76310614 100644 --- a/main.py +++ b/main.py @@ -16,7 +16,8 @@ app = FastAPI(title="OmniParser API") 
-def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: +def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, + use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: """ Process the image and return the annotated image and parsed elements. """ @@ -28,15 +29,32 @@ def process_image(image: Image.Image, box_threshold: float, iou_threshold: float 'thickness': max(int(3 * box_overlay_ratio), 1), } - ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) + ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, + output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, + 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) text, ocr_bbox = ocr_bbox_rslt + dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img( - image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox, - draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text, + image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, + ocr_bbox=ocr_bbox, + draw_bbox_config=draw_bbox_config, + caption_model_processor=caption_model_processor, ocr_text=text, iou_threshold=iou_threshold, imgsz=imgsz ) + + # --- ADD THIS DEFENSIVE CODE BLOCK --- + # If the model returns an empty string for the image, it means nothing was detected. + if not dino_labled_img: + print("Warning: get_som_labeled_img returned an empty image string. No objects were detected.") + # Return the original image and an empty string for the content. + return image, "" + # --- END OF DEFENSIVE CODE BLOCK --- + annotated_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) + + parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate + (parsed_content_list)]) + return annotated_image, parsed_content_str @app.post("/process") From cb8f841c9a471b4ea631dfbd82407c742294c7a0 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 19:25:29 +0530 Subject: [PATCH 5/9] main --- main.py | 52 +++++++++++++++++----------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/main.py b/main.py index 76310614..f16156ba 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, UploadFile, File +from fastapi import FastAPI, UploadFile, File, Form from typing import Optional import torch from PIL import Image @@ -7,6 +7,7 @@ import os from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +# This line is still needed for Kaggle environments os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" # Initialize models @@ -17,9 +18,10 @@ app = FastAPI(title="OmniParser API") def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, - use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: + use_paddleocr: bool, imgsz: int) -> str: """ - Process the image and return the annotated image and parsed elements. + Process the image and return the parsed elements as a string. + This is the same core logic as your Gradio app's process function. 
""" box_overlay_ratio = image.size[0] / 3200 draw_bbox_config = { @@ -34,7 +36,8 @@ def process_image(image: Image.Image, box_threshold: float, iou_threshold: float 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) text, ocr_bbox = ocr_bbox_rslt - dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img( + # We only need the parsed content list, not the annotated image. + _, _, parsed_content_list = get_som_labeled_img( image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox, draw_bbox_config=draw_bbox_config, @@ -42,50 +45,29 @@ def process_image(image: Image.Image, box_threshold: float, iou_threshold: float iou_threshold=iou_threshold, imgsz=imgsz ) - # --- ADD THIS DEFENSIVE CODE BLOCK --- - # If the model returns an empty string for the image, it means nothing was detected. - if not dino_labled_img: - print("Warning: get_som_labeled_img returned an empty image string. No objects were detected.") - # Return the original image and an empty string for the content. - return image, "" - # --- END OF DEFENSIVE CODE BLOCK --- + parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) - annotated_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) - - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate - (parsed_content_list)]) - - return annotated_image, parsed_content_str + return parsed_content_str @app.post("/process") async def process_endpoint( file: UploadFile = File(...), - box_threshold: float = 0.05, - iou_threshold: float = 0.1, - use_paddleocr: bool = True, - imgsz: int = 640 + box_threshold: float = Form(...), # This forces FastAPI to read the value from the client + iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client + use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client + imgsz: int = Form(...) # This forces FastAPI to read the value from the client ): """ - Endpoint to upload an image and get parsed elements. + Endpoint to upload an image and get parsed elements as JSON. 
""" try: - # Read the uploaded image image_bytes = await file.read() image = Image.open(io.BytesIO(image_bytes)).convert("RGB") - # Process the image - annotated_image, parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) - - # Convert annotated image to bytes for return - img_byte_arr = io.BytesIO() - annotated_image.save(img_byte_arr, format='PNG') - img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8') + # This will now correctly use the low thresholds sent by your script + parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) - # Return JSON response with image and parsed content - return { - "annotated_image": img_base64, - "parsed_elements": parsed_content - } + return { "parsed_elements": parsed_content } except Exception as e: return {"error": str(e)} From 9b43625e2f243e1e64d0c37b774d2d7c5abe2489 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 19:56:46 +0530 Subject: [PATCH 6/9] changes --- main.py | 141 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 88 insertions(+), 53 deletions(-) diff --git a/main.py b/main.py index f16156ba..d9df190b 100644 --- a/main.py +++ b/main.py @@ -1,76 +1,111 @@ -from fastapi import FastAPI, UploadFile, File, Form -from typing import Optional -import torch -from PIL import Image -import io -import base64 import os -from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +import sys +import io +import traceback +from PIL import Image +from fastapi import FastAPI, File, UploadFile, Form +from fastapi.responses import JSONResponse + +# Add the parent directory of 'util' to the Python path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -# This line is still needed for Kaggle environments -os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" +from util.omniparser import get_som_labeled_img, initialize_models +from util.utils import check_ocr_box -# Initialize models -yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') -caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") -DEVICE = torch.device('cuda') +# --- Initialize Models --- +# It's better to initialize models once when the application starts +yolo_model, caption_model_processor = initialize_models() -app = FastAPI(title="OmniParser API") +app = FastAPI() -def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, - use_paddleocr: bool, imgsz: int) -> str: +def process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz): """ - Process the image and return the parsed elements as a string. - This is the same core logic as your Gradio app's process function. + This function contains the core image processing logic. + It's designed to be testable and to log every step. """ - box_overlay_ratio = image.size[0] / 3200 - draw_bbox_config = { - 'text_scale': 0.8 * box_overlay_ratio, - 'text_thickness': max(int(2 * box_overlay_ratio), 1), - 'text_padding': max(int(3 * box_overlay_ratio), 1), - 'thickness': max(int(3 * box_overlay_ratio), 1), - } - - ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, - output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, - 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) - text, ocr_bbox = ocr_bbox_rslt - - # We only need the parsed content list, not the annotated image. 
-    _, _, parsed_content_list = get_som_labeled_img(
-        image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True,
-        ocr_bbox=ocr_bbox,
-        draw_bbox_config=draw_bbox_config,
-        caption_model_processor=caption_model_processor, ocr_text=text,
-        iou_threshold=iou_threshold, imgsz=imgsz
-    )
-
-    parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)])
-
-    return parsed_content_str
+    log = ["--- Inside process_image_with_logging ---"]
+    try:
+        log.append(f"Step 1: Received parameters: box_threshold={box_threshold}, iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}")
+
+        # --- OCR Check Case ---
+        log.append("Step 2: Starting OCR check...")
+        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False,
+        output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False,
+        'text_threshold': 0.9}, use_paddleocr=use_paddleocr)
+        text, ocr_bbox = ocr_bbox_rslt
+        log.append(f"Step 3: OCR check complete. Found {len(text)} text elements.")
+
+        # --- Main Model Case ---
+        log.append("Step 4: Starting main model (get_som_labeled_img)...")
+        _, _, parsed_content_list = get_som_labeled_img(
+            image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True,
+            ocr_bbox=ocr_bbox, draw_bbox_config={}, # draw_bbox_config is not needed for data-only output
+            caption_model_processor=caption_model_processor, ocr_text=text,
+            iou_threshold=iou_threshold, imgsz=imgsz
+        )
+        log.append(f"Step 5: Main model finished. It returned a list with {len(parsed_content_list)} elements.")
+
+        # --- Final Formatting Case ---
+        if not parsed_content_list:
+            log.append("WARNING: The final list of elements is empty. The model did not detect any objects that met the criteria.")
+
+        parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)])
+        log.append("Step 6: Successfully formatted the final list into a string.")
+
+        return parsed_content_str, log
+
+    except Exception as e:
+        log.append(f"\n!!!!!! AN ERROR OCCURRED INSIDE process_image_with_logging !!!!!!")
+        log.append(f"Error Type: {type(e)}")
+        log.append(f"Error Message: {str(e)}")
+        log.append("--- Full Traceback ---")
+        log.append(traceback.format_exc())
+        return "", log # Return an empty string but the full log of the crash
 
 @app.post("/process")
 async def process_endpoint(
     file: UploadFile = File(...),
-    box_threshold: float = Form(...), # This forces FastAPI to read the value from the client
-    iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client
-    use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client
-    imgsz: int = Form(...) # This forces FastAPI to read the value from the client
+    box_threshold: float = Form(...),
+    iou_threshold: float = Form(...),
+    use_paddleocr: bool = Form(...),
+    imgsz: int = Form(...)
 ):
     """
-    Endpoint to upload an image and get parsed elements as JSON.
+    This endpoint now acts as a "black box recorder".
+    It logs everything and returns the log in the response.
""" + master_log = ["--- Server received request in process_endpoint ---"] try: + # --- Parameter Reception Case --- + master_log.append(f"Received box_threshold: {box_threshold} (type: {type(box_threshold)})") + master_log.append(f"Received iou_threshold: {iou_threshold} (type: {type(iou_threshold)})") + master_log.append(f"Received use_paddleocr: {use_paddleocr} (type: {type(use_paddleocr)})") + master_log.append(f"Received imgsz: {imgsz} (type: {type(imgsz)})") + + # --- Image Loading Case --- image_bytes = await file.read() image = Image.open(io.BytesIO(image_bytes)).convert("RGB") + master_log.append(f"Image successfully loaded. Size: {image.size}") + + # --- Call Core Logic Case --- + parsed_content, function_log = process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz) + master_log.extend(function_log) # Add the detailed log from the function - # This will now correctly use the low thresholds sent by your script - parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) + master_log.append("\n--- FINAL SUMMARY ---") + master_log.append(f"Final content string length: {len(parsed_content)}") + + # We will return the log itself for debugging + return { "debug_log": "\n".join(master_log), "parsed_elements": parsed_content } - return { "parsed_elements": parsed_content } except Exception as e: - return {"error": str(e)} + master_log.append(f"\n!!!!!! AN UNEXPECTED ERROR OCCURRED IN process_endpoint !!!!!!") + master_log.append(f"Error Type: {type(e)}") + master_log.append(f"Error Message: {str(e)}") + master_log.append("--- Full Traceback ---") + master_log.append(traceback.format_exc()) + + return { "debug_log": "\n".join(master_log), "parsed_elements": "" } if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + uvicorn.run(app, host="0.0.0.0", port=8000) From 07a17debfef53c7d133556b40f04afe0f09e576a Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 22:59:11 +0530 Subject: [PATCH 7/9] chnages --- main.py | 141 +++++++++++++++++++++----------------------------------- 1 file changed, 53 insertions(+), 88 deletions(-) diff --git a/main.py b/main.py index d9df190b..f16156ba 100644 --- a/main.py +++ b/main.py @@ -1,111 +1,76 @@ -import os -import sys -import io -import traceback +from fastapi import FastAPI, UploadFile, File, Form +from typing import Optional +import torch from PIL import Image -from fastapi import FastAPI, File, UploadFile, Form -from fastapi.responses import JSONResponse - -# Add the parent directory of 'util' to the Python path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import io +import base64 +import os +from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img -from util.omniparser import get_som_labeled_img, initialize_models -from util.utils import check_ocr_box +# This line is still needed for Kaggle environments +os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" -# --- Initialize Models --- -# It's better to initialize models once when the application starts -yolo_model, caption_model_processor = initialize_models() +# Initialize models +yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') +caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") +DEVICE = torch.device('cuda') -app = FastAPI() +app = FastAPI(title="OmniParser API") -def 
process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz): +def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, + use_paddleocr: bool, imgsz: int) -> str: """ - This function contains the core image processing logic. - It's designed to be testable and to log every step. + Process the image and return the parsed elements as a string. + This is the same core logic as your Gradio app's process function. """ - log = ["--- Inside process_image_with_logging ---"] - try: - log.append(f"Step 1: Received parameters: box_threshold={box_threshold}, iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}") - - # --- OCR Check Case --- - log.append("Step 2: Starting OCR check...") - ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, - output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, - 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) - text, ocr_bbox = ocr_bbox_rslt - log.append(f"Step 3: OCR check complete. Found {len(text)} text elements.") - - # --- Main Model Case --- - log.append("Step 4: Starting main model (get_som_labeled_img)...") - _, _, parsed_content_list = get_som_labeled_img( - image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, - ocr_bbox=ocr_bbox, draw_bbox_config={}, # draw_bbox_config is not needed for data-only output - caption_model_processor=caption_model_processor, ocr_text=text, - iou_threshold=iou_threshold, imgsz=imgz - ) - log.append(f"Step 5: Main model finished. It returned a list with {len(parsed_content_list)} elements.") - - # --- Final Formatting Case --- - if not parsed_content_list: - log.append("WARNING: The final list of elements is empty. The model did not detect any objects that met the criteria.") - - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) - log.append("Step 6: Successfully formatted the final list into a string.") - - return parsed_content_str, log - - except Exception as e: - log.append(f"\n!!!!!! AN ERROR OCCURRED INSIDE process_image_with_logging !!!!!!") - log.append(f"Error Type: {type(e)}") - log.append(f"Error Message: {str(e)}") - log.append("--- Full Traceback ---") - log.append(traceback.format_exc()) - return "", log # Return an empty string but the full log of the crash + box_overlay_ratio = image.size[0] / 3200 + draw_bbox_config = { + 'text_scale': 0.8 * box_overlay_ratio, + 'text_thickness': max(int(2 * box_overlay_ratio), 1), + 'text_padding': max(int(3 * box_overlay_ratio), 1), + 'thickness': max(int(3 * box_overlay_ratio), 1), + } + + ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, + output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, + 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) + text, ocr_bbox = ocr_bbox_rslt + + # We only need the parsed content list, not the annotated image. 
+    _, _, parsed_content_list = get_som_labeled_img(
+        image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True,
+        ocr_bbox=ocr_bbox,
+        draw_bbox_config=draw_bbox_config,
+        caption_model_processor=caption_model_processor, ocr_text=text,
+        iou_threshold=iou_threshold, imgsz=imgsz
+    )
+
+    parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)])
+
+    return parsed_content_str
 
 @app.post("/process")
 async def process_endpoint(
     file: UploadFile = File(...),
-    box_threshold: float = Form(...),
-    iou_threshold: float = Form(...),
-    use_paddleocr: bool = Form(...),
-    imgsz: int = Form(...)
+    box_threshold: float = Form(...), # This forces FastAPI to read the value from the client
+    iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client
+    use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client
+    imgsz: int = Form(...) # This forces FastAPI to read the value from the client
 ):
     """
-    This endpoint now acts as a "black box recorder".
-    It logs everything and returns the log in the response.
+    Endpoint to upload an image and get parsed elements as JSON.
     """
-    master_log = ["--- Server received request in process_endpoint ---"]
     try:
-        # --- Parameter Reception Case ---
-        master_log.append(f"Received box_threshold: {box_threshold} (type: {type(box_threshold)})")
-        master_log.append(f"Received iou_threshold: {iou_threshold} (type: {type(iou_threshold)})")
-        master_log.append(f"Received use_paddleocr: {use_paddleocr} (type: {type(use_paddleocr)})")
-        master_log.append(f"Received imgsz: {imgsz} (type: {type(imgsz)})")
-
-        # --- Image Loading Case ---
         image_bytes = await file.read()
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        master_log.append(f"Image successfully loaded. Size: {image.size}")
-
-        # --- Call Core Logic Case ---
-        parsed_content, function_log = process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz)
-        master_log.extend(function_log) # Add the detailed log from the function
+        # This will now correctly use the low thresholds sent by your script
+        parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz)
 
-        master_log.append("\n--- FINAL SUMMARY ---")
-        master_log.append(f"Final content string length: {len(parsed_content)}")
-
-        # We will return the log itself for debugging
-        return { "debug_log": "\n".join(master_log), "parsed_elements": parsed_content }
+        return { "parsed_elements": parsed_content }
     except Exception as e:
-        master_log.append(f"\n!!!!!! AN UNEXPECTED ERROR OCCURRED IN process_endpoint !!!!!!")
-        master_log.append(f"Error Type: {type(e)}")
-        master_log.append(f"Error Message: {str(e)}")
-        master_log.append("--- Full Traceback ---")
-        master_log.append(traceback.format_exc())
-
-        return { "debug_log": "\n".join(master_log), "parsed_elements": "" }
+        return {"error": str(e)}
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8000)
\ No newline at end of file

From 2abafa1d3418d05a3c817201482176f1c18bd3b6 Mon Sep 17 00:00:00 2001
From: Jai Aggarwal
Date: Fri, 29 Aug 2025 08:43:32 +0530
Subject: [PATCH 8/9] main

---
 main.py | 69 +++++++++++++++++++--------------------------------------
 1 file changed, 23 insertions(+), 46 deletions(-)

diff --git a/main.py b/main.py
index f16156ba..bdae8bcd 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,4 @@
-from fastapi import FastAPI, UploadFile, File, Form
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
 from typing import Optional
 import torch
 from PIL import Image
@@ -6,71 +6,48 @@
 import base64
 import os
 from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
+import logging
+
+# --- Basic Logging Setup ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
-# This line is still needed for Kaggle environments
 os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
 
 # Initialize models
+logger.info("Initializing models...")
 yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
 caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence")
-DEVICE = torch.device('cuda')
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+logger.info(f"Models initialized. Using device: {DEVICE}")
 
 app = FastAPI(title="OmniParser API")
 
 def process_image(image: Image.Image, box_threshold: float, iou_threshold: float,
     use_paddleocr: bool, imgsz: int) -> str:
-    """
-    Process the image and return the parsed elements as a string.
-    This is the same core logic as your Gradio app's process function.
-    """
-    box_overlay_ratio = image.size[0] / 3200
-    draw_bbox_config = {
-        'text_scale': 0.8 * box_overlay_ratio,
-        'text_thickness': max(int(2 * box_overlay_ratio), 1),
-        'text_padding': max(int(3 * box_overlay_ratio), 1),
-        'thickness': max(int(3 * box_overlay_ratio), 1),
-    }
-
-    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False,
-    output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False,
-    'text_threshold': 0.9}, use_paddleocr=use_paddleocr)
+    logger.info(f"Processing image with params: box_threshold={box_threshold}, iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}")
+    ocr_bbox_rslt, _ = check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9}, use_paddleocr=use_paddleocr)
     text, ocr_bbox = ocr_bbox_rslt
-
-    # We only need the parsed content list, not the annotated image.
_, _, parsed_content_list = get_som_labeled_img( image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, - ocr_bbox=ocr_bbox, - draw_bbox_config=draw_bbox_config, - caption_model_processor=caption_model_processor, ocr_text=text, - iou_threshold=iou_threshold, imgsz=imgsz + ocr_bbox=ocr_bbox, draw_bbox_config={}, caption_model_processor=caption_model_processor, + ocr_text=text, iou_threshold=iou_threshold, imgsz=imgsz ) - - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) - - return parsed_content_str + return '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) @app.post("/process") async def process_endpoint( file: UploadFile = File(...), - box_threshold: float = Form(...), # This forces FastAPI to read the value from the client - iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client - use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client - imgsz: int = Form(...) # This forces FastAPI to read the value from the client + box_threshold: float = Form(...), + iou_threshold: float = Form(...), + use_paddleocr: str = Form(...), + imgsz: int = Form(...) ): - """ - Endpoint to upload an image and get parsed elements as JSON. - """ - try: - image_bytes = await file.read() - image = Image.open(io.BytesIO(image_bytes)).convert("RGB") - - # This will now correctly use the low thresholds sent by your script - parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) - - return { "parsed_elements": parsed_content } - except Exception as e: - return {"error": str(e)} + use_paddleocr_bool = use_paddleocr.lower() in ('true', '1') + parsed_content = process_image(Image.open(file.file).convert("RGB"), box_threshold, iou_threshold, use_paddleocr_bool, imgsz) + return { "parsed_elements": parsed_content } if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + logger.info("Starting OmniParser API server for local testing...") + uvicorn.run(app, host="0.0.0.0", port=8000) From c1c7301b84c9e0eef2ba5ba43668e6c026ae7473 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Fri, 29 Aug 2025 16:52:13 +0530 Subject: [PATCH 9/9] commit --- omnitool/omniparserserver/omniparserserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnitool/omniparserserver/omniparserserver.py b/omnitool/omniparserserver/omniparserserver.py index 045fbace..781e4b67 100644 --- a/omnitool/omniparserserver/omniparserserver.py +++ b/omnitool/omniparserserver/omniparserserver.py @@ -19,7 +19,7 @@ def parse_arguments(): parser.add_argument('--caption_model_name', type=str, default='florence2', help='Name of the caption model') parser.add_argument('--caption_model_path', type=str, default='../../weights/icon_caption_florence', help='Path to the caption model') parser.add_argument('--device', type=str, default='cpu', help='Device to run the model') - parser.add_argument('--BOX_TRESHOLD', type=float, default=0.05, help='Threshold for box detection') + parser.add_argument('--BOX_TRESHOLD', type=float, default=0.01, help='Threshold for box detection') parser.add_argument('--host', type=str, default='127.0.0.1', help='Host for the API') parser.add_argument('--port', type=int, default=8000, help='Port for the API') args = parser.parse_args()