From ec3bb17cabd583ec55380e1997c52ea0049a0858 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 13:37:32 +0530 Subject: [PATCH 1/9] issue of the utils --- requirements.txt | 30 +++++++++++++++--------------- util/utils.py | 17 +++++++++-------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/requirements.txt b/requirements.txt index 901a27fa..b8b09d69 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,31 +1,31 @@ torch easyocr torchvision -supervision==0.18.0 -openai==1.3.5 +supervision +openai transformers -ultralytics==8.3.70 +ultralytics azure-identity -numpy==1.26.4 +numpy opencv-python opencv-python-headless gradio dill accelerate timm -einops==0.8.0 +einops paddlepaddle paddleocr -ruff==0.6.7 -pre-commit==3.8.0 -pytest==8.3.3 -pytest-asyncio==0.23.6 -pyautogui==0.9.54 -streamlit>=1.38.0 -anthropic[bedrock,vertex]>=0.37.1 -jsonschema==4.22.0 -boto3>=1.28.57 -google-auth<3,>=2 +ruff +pre-commit +pytest +pytest-asyncio +pyautogui +streamlit +anthropic[bedrock,vertex] +jsonschema +boto3 +google-auth screeninfo uiautomation dashscope diff --git a/util/utils.py b/util/utils.py index eb7c8b25..b291c4bc 100644 --- a/util/utils.py +++ b/util/utils.py @@ -21,14 +21,15 @@ from paddleocr import PaddleOCR reader = easyocr.Reader(['en']) paddle_ocr = PaddleOCR( - lang='en', # other lang also available + # lang='en', # other lang also available use_angle_cls=False, - use_gpu=False, # using cuda will conflict with pytorch in the same process - show_log=False, - max_batch_size=1024, - use_dilation=True, # improves accuracy - det_db_score_mode='slow', # improves accuracy - rec_batch_num=1024) + # use_gpu=False, # using cuda will conflict with pytorch in the same process + # show_log=False, + # max_batch_size=1024, + # use_dilation=True, # improves accuracy + # det_db_score_mode='slow', # improves accuracy + # rec_batch_num=1024) +) import time import base64 @@ -514,7 +515,7 @@ def check_ocr_box(image_source: Union[str, Image.Image], display_img = True, out text_threshold = 0.5 else: text_threshold = easyocr_args['text_threshold'] - result = paddle_ocr.ocr(image_np, cls=False)[0] + result = paddle_ocr.ocr(image_np)[0] coord = [item[0] for item in result if item[1][1] > text_threshold] text = [item[1][0] for item in result if item[1][1] > text_threshold] else: # EasyOCR From 40f5ca65750838ed4ad5272bbb459b0d005c2147 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 16:20:33 +0530 Subject: [PATCH 2/9] push --- main.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 main.py diff --git a/main.py b/main.py new file mode 100644 index 00000000..b2c6b7aa --- /dev/null +++ b/main.py @@ -0,0 +1,74 @@ +from fastapi import FastAPI, UploadFile, File +from typing import Optional +import torch +from PIL import Image +import io +import base64 +import os +from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img + +# Initialize models +yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') +caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") +DEVICE = torch.device('cuda') + +app = FastAPI(title="OmniParser API") + +def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: + """ + Process the image and return the annotated image and parsed elements. 
+ """ + box_overlay_ratio = image.size[0] / 3200 + draw_bbox_config = { + 'text_scale': 0.8 * box_overlay_ratio, + 'text_thickness': max(int(2 * box_overlay_ratio), 1), + 'text_padding': max(int(3 * box_overlay_ratio), 1), + 'thickness': max(int(3 * box_overlay_ratio), 1), + } + + ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) + text, ocr_bbox = ocr_bbox_rslt + dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img( + image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox, + draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text, + iou_threshold=iou_threshold, imgsz=imgsz + ) + annotated_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) + parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) + return annotated_image, parsed_content_str + +@app.post("/process") +async def process_endpoint( + file: UploadFile = File(...), + box_threshold: float = 0.05, + iou_threshold: float = 0.1, + use_paddleocr: bool = True, + imgsz: int = 640 +): + """ + Endpoint to upload an image and get parsed elements. + """ + try: + # Read the uploaded image + image_bytes = await file.read() + image = Image.open(io.BytesIO(image_bytes)).convert("RGB") + + # Process the image + annotated_image, parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) + + # Convert annotated image to bytes for return + img_byte_arr = io.BytesIO() + annotated_image.save(img_byte_arr, format='PNG') + img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8') + + # Return JSON response with image and parsed content + return { + "annotated_image": img_base64, + "parsed_elements": parsed_content + } + except Exception as e: + return {"error": str(e)} + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file From 045f454fa947e9809886ee8e16240433ab43d981 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 17:23:20 +0530 Subject: [PATCH 3/9] check --- main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.py b/main.py index b2c6b7aa..cde9ae8e 100644 --- a/main.py +++ b/main.py @@ -7,6 +7,8 @@ import os from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" + # Initialize models yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") From 09c5216d9d46693487d52acc395e73cb8e54bdf5 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 18:40:49 +0530 Subject: [PATCH 4/9] changes --- .gitignore | 3 ++- main.py | 28 +++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 8b8235e6..a0c7ff8c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ util/__pycache__/ index.html?linkid=2289031 wget-log weights/icon_caption_florence_v2/ -omnitool/gradio/uploads/ \ No newline at end of file +omnitool/gradio/uploads/ +.env \ No newline at end of file diff --git a/main.py b/main.py index cde9ae8e..76310614 100644 --- a/main.py +++ b/main.py @@ -16,7 +16,8 @@ app = FastAPI(title="OmniParser API") 
-def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: +def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, + use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: """ Process the image and return the annotated image and parsed elements. """ @@ -28,15 +29,32 @@ def process_image(image: Image.Image, box_threshold: float, iou_threshold: float 'thickness': max(int(3 * box_overlay_ratio), 1), } - ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) + ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, + output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, + 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) text, ocr_bbox = ocr_bbox_rslt + dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img( - image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox, - draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text, + image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, + ocr_bbox=ocr_bbox, + draw_bbox_config=draw_bbox_config, + caption_model_processor=caption_model_processor, ocr_text=text, iou_threshold=iou_threshold, imgsz=imgsz ) + + # --- ADD THIS DEFENSIVE CODE BLOCK --- + # If the model returns an empty string for the image, it means nothing was detected. + if not dino_labled_img: + print("Warning: get_som_labeled_img returned an empty image string. No objects were detected.") + # Return the original image and an empty string for the content. + return image, "" + # --- END OF DEFENSIVE CODE BLOCK --- + annotated_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) + + parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate + (parsed_content_list)]) + return annotated_image, parsed_content_str @app.post("/process") From cb8f841c9a471b4ea631dfbd82407c742294c7a0 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 19:25:29 +0530 Subject: [PATCH 5/9] main --- main.py | 52 +++++++++++++++++----------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/main.py b/main.py index 76310614..f16156ba 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, UploadFile, File +from fastapi import FastAPI, UploadFile, File, Form from typing import Optional import torch from PIL import Image @@ -7,6 +7,7 @@ import os from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +# This line is still needed for Kaggle environments os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" # Initialize models @@ -17,9 +18,10 @@ app = FastAPI(title="OmniParser API") def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, - use_paddleocr: bool, imgsz: int) -> tuple[Image.Image, str]: + use_paddleocr: bool, imgsz: int) -> str: """ - Process the image and return the annotated image and parsed elements. + Process the image and return the parsed elements as a string. + This is the same core logic as your Gradio app's process function. 
""" box_overlay_ratio = image.size[0] / 3200 draw_bbox_config = { @@ -34,7 +36,8 @@ def process_image(image: Image.Image, box_threshold: float, iou_threshold: float 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) text, ocr_bbox = ocr_bbox_rslt - dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img( + # We only need the parsed content list, not the annotated image. + _, _, parsed_content_list = get_som_labeled_img( image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, ocr_bbox=ocr_bbox, draw_bbox_config=draw_bbox_config, @@ -42,50 +45,29 @@ def process_image(image: Image.Image, box_threshold: float, iou_threshold: float iou_threshold=iou_threshold, imgsz=imgsz ) - # --- ADD THIS DEFENSIVE CODE BLOCK --- - # If the model returns an empty string for the image, it means nothing was detected. - if not dino_labled_img: - print("Warning: get_som_labeled_img returned an empty image string. No objects were detected.") - # Return the original image and an empty string for the content. - return image, "" - # --- END OF DEFENSIVE CODE BLOCK --- + parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) - annotated_image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img))) - - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate - (parsed_content_list)]) - - return annotated_image, parsed_content_str + return parsed_content_str @app.post("/process") async def process_endpoint( file: UploadFile = File(...), - box_threshold: float = 0.05, - iou_threshold: float = 0.1, - use_paddleocr: bool = True, - imgsz: int = 640 + box_threshold: float = Form(...), # This forces FastAPI to read the value from the client + iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client + use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client + imgsz: int = Form(...) # This forces FastAPI to read the value from the client ): """ - Endpoint to upload an image and get parsed elements. + Endpoint to upload an image and get parsed elements as JSON. 
""" try: - # Read the uploaded image image_bytes = await file.read() image = Image.open(io.BytesIO(image_bytes)).convert("RGB") - # Process the image - annotated_image, parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) - - # Convert annotated image to bytes for return - img_byte_arr = io.BytesIO() - annotated_image.save(img_byte_arr, format='PNG') - img_base64 = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8') + # This will now correctly use the low thresholds sent by your script + parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) - # Return JSON response with image and parsed content - return { - "annotated_image": img_base64, - "parsed_elements": parsed_content - } + return { "parsed_elements": parsed_content } except Exception as e: return {"error": str(e)} From 9b43625e2f243e1e64d0c37b774d2d7c5abe2489 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 19:56:46 +0530 Subject: [PATCH 6/9] changes --- main.py | 141 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 88 insertions(+), 53 deletions(-) diff --git a/main.py b/main.py index f16156ba..d9df190b 100644 --- a/main.py +++ b/main.py @@ -1,76 +1,111 @@ -from fastapi import FastAPI, UploadFile, File, Form -from typing import Optional -import torch -from PIL import Image -import io -import base64 import os -from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img +import sys +import io +import traceback +from PIL import Image +from fastapi import FastAPI, File, UploadFile, Form +from fastapi.responses import JSONResponse + +# Add the parent directory of 'util' to the Python path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -# This line is still needed for Kaggle environments -os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" +from util.omniparser import get_som_labeled_img, initialize_models +from util.utils import check_ocr_box -# Initialize models -yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') -caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") -DEVICE = torch.device('cuda') +# --- Initialize Models --- +# It's better to initialize models once when the application starts +yolo_model, caption_model_processor = initialize_models() -app = FastAPI(title="OmniParser API") +app = FastAPI() -def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, - use_paddleocr: bool, imgsz: int) -> str: +def process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz): """ - Process the image and return the parsed elements as a string. - This is the same core logic as your Gradio app's process function. + This function contains the core image processing logic. + It's designed to be testable and to log every step. """ - box_overlay_ratio = image.size[0] / 3200 - draw_bbox_config = { - 'text_scale': 0.8 * box_overlay_ratio, - 'text_thickness': max(int(2 * box_overlay_ratio), 1), - 'text_padding': max(int(3 * box_overlay_ratio), 1), - 'thickness': max(int(3 * box_overlay_ratio), 1), - } - - ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, - output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, - 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) - text, ocr_bbox = ocr_bbox_rslt - - # We only need the parsed content list, not the annotated image. 
-    _, _, parsed_content_list = get_som_labeled_img(
-        image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True,
-        ocr_bbox=ocr_bbox,
-        draw_bbox_config=draw_bbox_config,
-        caption_model_processor=caption_model_processor, ocr_text=text,
-        iou_threshold=iou_threshold, imgsz=imgsz
-    )
-
-    parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)])
-
-    return parsed_content_str
+    log = ["--- Inside process_image_with_logging ---"]
+    try:
+        log.append(f"Step 1: Received parameters: box_threshold={box_threshold}, iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}")
+
+        # --- OCR Check Case ---
+        log.append("Step 2: Starting OCR check...")
+        ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False,
+        output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False,
+        'text_threshold': 0.9}, use_paddleocr=use_paddleocr)
+        text, ocr_bbox = ocr_bbox_rslt
+        log.append(f"Step 3: OCR check complete. Found {len(text)} text elements.")
+
+        # --- Main Model Case ---
+        log.append("Step 4: Starting main model (get_som_labeled_img)...")
+        _, _, parsed_content_list = get_som_labeled_img(
+            image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True,
+            ocr_bbox=ocr_bbox, draw_bbox_config={}, # draw_bbox_config is not needed for data-only output
+            caption_model_processor=caption_model_processor, ocr_text=text,
+            iou_threshold=iou_threshold, imgsz=imgsz
+        )
+        log.append(f"Step 5: Main model finished. It returned a list with {len(parsed_content_list)} elements.")
+
+        # --- Final Formatting Case ---
+        if not parsed_content_list:
+            log.append("WARNING: The final list of elements is empty. The model did not detect any objects that met the criteria.")
+
+        parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)])
+        log.append("Step 6: Successfully formatted the final list into a string.")
+
+        return parsed_content_str, log
+
+    except Exception as e:
+        log.append(f"\n!!!!!! AN ERROR OCCURRED INSIDE process_image_with_logging !!!!!!")
+        log.append(f"Error Type: {type(e)}")
+        log.append(f"Error Message: {str(e)}")
+        log.append("--- Full Traceback ---")
+        log.append(traceback.format_exc())
+        return "", log # Return an empty string but the full log of the crash
 
 @app.post("/process")
 async def process_endpoint(
     file: UploadFile = File(...),
-    box_threshold: float = Form(...), # This forces FastAPI to read the value from the client
-    iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client
-    use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client
-    imgsz: int = Form(...) # This forces FastAPI to read the value from the client
+    box_threshold: float = Form(...),
+    iou_threshold: float = Form(...),
+    use_paddleocr: bool = Form(...),
+    imgsz: int = Form(...)
 ):
     """
-    Endpoint to upload an image and get parsed elements as JSON.
+    This endpoint now acts as a "black box recorder".
+    It logs everything and returns the log in the response.
""" + master_log = ["--- Server received request in process_endpoint ---"] try: + # --- Parameter Reception Case --- + master_log.append(f"Received box_threshold: {box_threshold} (type: {type(box_threshold)})") + master_log.append(f"Received iou_threshold: {iou_threshold} (type: {type(iou_threshold)})") + master_log.append(f"Received use_paddleocr: {use_paddleocr} (type: {type(use_paddleocr)})") + master_log.append(f"Received imgsz: {imgsz} (type: {type(imgsz)})") + + # --- Image Loading Case --- image_bytes = await file.read() image = Image.open(io.BytesIO(image_bytes)).convert("RGB") + master_log.append(f"Image successfully loaded. Size: {image.size}") + + # --- Call Core Logic Case --- + parsed_content, function_log = process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz) + master_log.extend(function_log) # Add the detailed log from the function - # This will now correctly use the low thresholds sent by your script - parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) + master_log.append("\n--- FINAL SUMMARY ---") + master_log.append(f"Final content string length: {len(parsed_content)}") + + # We will return the log itself for debugging + return { "debug_log": "\n".join(master_log), "parsed_elements": parsed_content } - return { "parsed_elements": parsed_content } except Exception as e: - return {"error": str(e)} + master_log.append(f"\n!!!!!! AN UNEXPECTED ERROR OCCURRED IN process_endpoint !!!!!!") + master_log.append(f"Error Type: {type(e)}") + master_log.append(f"Error Message: {str(e)}") + master_log.append("--- Full Traceback ---") + master_log.append(traceback.format_exc()) + + return { "debug_log": "\n".join(master_log), "parsed_elements": "" } if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + uvicorn.run(app, host="0.0.0.0", port=8000) From 07a17debfef53c7d133556b40f04afe0f09e576a Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Thu, 28 Aug 2025 22:59:11 +0530 Subject: [PATCH 7/9] chnages --- main.py | 141 +++++++++++++++++++++----------------------------------- 1 file changed, 53 insertions(+), 88 deletions(-) diff --git a/main.py b/main.py index d9df190b..f16156ba 100644 --- a/main.py +++ b/main.py @@ -1,111 +1,76 @@ -import os -import sys -import io -import traceback +from fastapi import FastAPI, UploadFile, File, Form +from typing import Optional +import torch from PIL import Image -from fastapi import FastAPI, File, UploadFile, Form -from fastapi.responses import JSONResponse - -# Add the parent directory of 'util' to the Python path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import io +import base64 +import os +from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img -from util.omniparser import get_som_labeled_img, initialize_models -from util.utils import check_ocr_box +# This line is still needed for Kaggle environments +os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" -# --- Initialize Models --- -# It's better to initialize models once when the application starts -yolo_model, caption_model_processor = initialize_models() +# Initialize models +yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt') +caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence") +DEVICE = torch.device('cuda') -app = FastAPI() +app = FastAPI(title="OmniParser API") -def 
process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz): +def process_image(image: Image.Image, box_threshold: float, iou_threshold: float, + use_paddleocr: bool, imgsz: int) -> str: """ - This function contains the core image processing logic. - It's designed to be testable and to log every step. + Process the image and return the parsed elements as a string. + This is the same core logic as your Gradio app's process function. """ - log = ["--- Inside process_image_with_logging ---"] - try: - log.append(f"Step 1: Received parameters: box_threshold={box_threshold}, iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}") - - # --- OCR Check Case --- - log.append("Step 2: Starting OCR check...") - ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, - output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, - 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) - text, ocr_bbox = ocr_bbox_rslt - log.append(f"Step 3: OCR check complete. Found {len(text)} text elements.") - - # --- Main Model Case --- - log.append("Step 4: Starting main model (get_som_labeled_img)...") - _, _, parsed_content_list = get_som_labeled_img( - image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, - ocr_bbox=ocr_bbox, draw_bbox_config={}, # draw_bbox_config is not needed for data-only output - caption_model_processor=caption_model_processor, ocr_text=text, - iou_threshold=iou_threshold, imgsz=imgz - ) - log.append(f"Step 5: Main model finished. It returned a list with {len(parsed_content_list)} elements.") - - # --- Final Formatting Case --- - if not parsed_content_list: - log.append("WARNING: The final list of elements is empty. The model did not detect any objects that met the criteria.") - - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) - log.append("Step 6: Successfully formatted the final list into a string.") - - return parsed_content_str, log - - except Exception as e: - log.append(f"\n!!!!!! AN ERROR OCCURRED INSIDE process_image_with_logging !!!!!!") - log.append(f"Error Type: {type(e)}") - log.append(f"Error Message: {str(e)}") - log.append("--- Full Traceback ---") - log.append(traceback.format_exc()) - return "", log # Return an empty string but the full log of the crash + box_overlay_ratio = image.size[0] / 3200 + draw_bbox_config = { + 'text_scale': 0.8 * box_overlay_ratio, + 'text_thickness': max(int(2 * box_overlay_ratio), 1), + 'text_padding': max(int(3 * box_overlay_ratio), 1), + 'thickness': max(int(3 * box_overlay_ratio), 1), + } + + ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False, + output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, + 'text_threshold': 0.9}, use_paddleocr=use_paddleocr) + text, ocr_bbox = ocr_bbox_rslt + + # We only need the parsed content list, not the annotated image. 
+    _, _, parsed_content_list = get_som_labeled_img(
+        image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True,
+        ocr_bbox=ocr_bbox,
+        draw_bbox_config=draw_bbox_config,
+        caption_model_processor=caption_model_processor, ocr_text=text,
+        iou_threshold=iou_threshold, imgsz=imgsz
+    )
+
+    parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)])
+
+    return parsed_content_str
 
 @app.post("/process")
 async def process_endpoint(
     file: UploadFile = File(...),
-    box_threshold: float = Form(...),
-    iou_threshold: float = Form(...),
-    use_paddleocr: bool = Form(...),
-    imgsz: int = Form(...)
+    box_threshold: float = Form(...), # This forces FastAPI to read the value from the client
+    iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client
+    use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client
+    imgsz: int = Form(...) # This forces FastAPI to read the value from the client
 ):
     """
-    This endpoint now acts as a "black box recorder".
-    It logs everything and returns the log in the response.
+    Endpoint to upload an image and get parsed elements as JSON.
     """
-    master_log = ["--- Server received request in process_endpoint ---"]
     try:
-        # --- Parameter Reception Case ---
-        master_log.append(f"Received box_threshold: {box_threshold} (type: {type(box_threshold)})")
-        master_log.append(f"Received iou_threshold: {iou_threshold} (type: {type(iou_threshold)})")
-        master_log.append(f"Received use_paddleocr: {use_paddleocr} (type: {type(use_paddleocr)})")
-        master_log.append(f"Received imgsz: {imgsz} (type: {type(imgsz)})")
-
-        # --- Image Loading Case ---
         image_bytes = await file.read()
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        master_log.append(f"Image successfully loaded. Size: {image.size}")
-
-        # --- Call Core Logic Case ---
-        parsed_content, function_log = process_image_with_logging(image, box_threshold, iou_threshold, use_paddleocr, imgsz)
-        master_log.extend(function_log) # Add the detailed log from the function
+        # This will now correctly use the low thresholds sent by your script
+        parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz)
 
-        master_log.append("\n--- FINAL SUMMARY ---")
-        master_log.append(f"Final content string length: {len(parsed_content)}")
-
-        # We will return the log itself for debugging
-        return { "debug_log": "\n".join(master_log), "parsed_elements": parsed_content }
+        return { "parsed_elements": parsed_content }
     except Exception as e:
-        master_log.append(f"\n!!!!!! AN UNEXPECTED ERROR OCCURRED IN process_endpoint !!!!!!")
-        master_log.append(f"Error Type: {type(e)}")
-        master_log.append(f"Error Message: {str(e)}")
-        master_log.append("--- Full Traceback ---")
-        master_log.append(traceback.format_exc())
-
-        return { "debug_log": "\n".join(master_log), "parsed_elements": "" }
+        return {"error": str(e)}
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=8000)
\ No newline at end of file

From 2abafa1d3418d05a3c817201482176f1c18bd3b6 Mon Sep 17 00:00:00 2001
From: Jai Aggarwal
Date: Fri, 29 Aug 2025 08:43:32 +0530
Subject: [PATCH 8/9] main

---
 main.py | 69 +++++++++++++++++++--------------------------------------
 1 file changed, 23 insertions(+), 46 deletions(-)

diff --git a/main.py b/main.py
index f16156ba..bdae8bcd 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,4 @@
-from fastapi import FastAPI, UploadFile, File, Form
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
 from typing import Optional
 import torch
 from PIL import Image
@@ -6,71 +6,48 @@
 import base64
 import os
 from util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
+import logging
+
+# --- Basic Logging Setup ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
-# This line is still needed for Kaggle environments
 os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
 
 # Initialize models
+logger.info("Initializing models...")
 yolo_model = get_yolo_model(model_path='weights/icon_detect/model.pt')
 caption_model_processor = get_caption_model_processor(model_name="florence2", model_name_or_path="weights/icon_caption_florence")
-DEVICE = torch.device('cuda')
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+logger.info(f"Models initialized. Using device: {DEVICE}")
 
 app = FastAPI(title="OmniParser API")
 
 def process_image(image: Image.Image, box_threshold: float, iou_threshold: float,
     use_paddleocr: bool, imgsz: int) -> str:
-    """
-    Process the image and return the parsed elements as a string.
-    This is the same core logic as your Gradio app's process function.
-    """
-    box_overlay_ratio = image.size[0] / 3200
-    draw_bbox_config = {
-        'text_scale': 0.8 * box_overlay_ratio,
-        'text_thickness': max(int(2 * box_overlay_ratio), 1),
-        'text_padding': max(int(3 * box_overlay_ratio), 1),
-        'thickness': max(int(3 * box_overlay_ratio), 1),
-    }
-
-    ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image, display_img=False,
-    output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False,
-    'text_threshold': 0.9}, use_paddleocr=use_paddleocr)
+    logger.info(f"Processing image with params: box_threshold={box_threshold}, iou_threshold={iou_threshold}, use_paddleocr={use_paddleocr}, imgsz={imgsz}")
+    ocr_bbox_rslt, _ = check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold': 0.9}, use_paddleocr=use_paddleocr)
     text, ocr_bbox = ocr_bbox_rslt
-
-    # We only need the parsed content list, not the annotated image.
_, _, parsed_content_list = get_som_labeled_img( image, yolo_model, BOX_TRESHOLD=box_threshold, output_coord_in_ratio=True, - ocr_bbox=ocr_bbox, - draw_bbox_config=draw_bbox_config, - caption_model_processor=caption_model_processor, ocr_text=text, - iou_threshold=iou_threshold, imgsz=imgsz + ocr_bbox=ocr_bbox, draw_bbox_config={}, caption_model_processor=caption_model_processor, + ocr_text=text, iou_threshold=iou_threshold, imgsz=imgsz ) - - parsed_content_str = '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) - - return parsed_content_str + return '\n'.join([f'icon {i}: ' + str(v) for i, v in enumerate(parsed_content_list)]) @app.post("/process") async def process_endpoint( file: UploadFile = File(...), - box_threshold: float = Form(...), # This forces FastAPI to read the value from the client - iou_threshold: float = Form(...), # This forces FastAPI to read the value from the client - use_paddleocr: bool = Form(...), # This forces FastAPI to read the value from the client - imgsz: int = Form(...) # This forces FastAPI to read the value from the client + box_threshold: float = Form(...), + iou_threshold: float = Form(...), + use_paddleocr: str = Form(...), + imgsz: int = Form(...) ): - """ - Endpoint to upload an image and get parsed elements as JSON. - """ - try: - image_bytes = await file.read() - image = Image.open(io.BytesIO(image_bytes)).convert("RGB") - - # This will now correctly use the low thresholds sent by your script - parsed_content = process_image(image, box_threshold, iou_threshold, use_paddleocr, imgsz) - - return { "parsed_elements": parsed_content } - except Exception as e: - return {"error": str(e)} + use_paddleocr_bool = use_paddleocr.lower() in ('true', '1') + parsed_content = process_image(Image.open(file.file).convert("RGB"), box_threshold, iou_threshold, use_paddleocr_bool, imgsz) + return { "parsed_elements": parsed_content } if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file + logger.info("Starting OmniParser API server for local testing...") + uvicorn.run(app, host="0.0.0.0", port=8000) From c1c7301b84c9e0eef2ba5ba43668e6c026ae7473 Mon Sep 17 00:00:00 2001 From: Jai Aggarwal Date: Fri, 29 Aug 2025 16:52:13 +0530 Subject: [PATCH 9/9] commit --- omnitool/omniparserserver/omniparserserver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omnitool/omniparserserver/omniparserserver.py b/omnitool/omniparserserver/omniparserserver.py index 045fbace..781e4b67 100644 --- a/omnitool/omniparserserver/omniparserserver.py +++ b/omnitool/omniparserserver/omniparserserver.py @@ -19,7 +19,7 @@ def parse_arguments(): parser.add_argument('--caption_model_name', type=str, default='florence2', help='Name of the caption model') parser.add_argument('--caption_model_path', type=str, default='../../weights/icon_caption_florence', help='Path to the caption model') parser.add_argument('--device', type=str, default='cpu', help='Device to run the model') - parser.add_argument('--BOX_TRESHOLD', type=float, default=0.05, help='Threshold for box detection') + parser.add_argument('--BOX_TRESHOLD', type=float, default=0.01, help='Threshold for box detection') parser.add_argument('--host', type=str, default='127.0.0.1', help='Host for the API') parser.add_argument('--port', type=int, default=8000, help='Port for the API') args = parser.parse_args()