diff --git a/main.py b/main.py index 38a6ea3..05ba2e6 100644 --- a/main.py +++ b/main.py @@ -13,10 +13,11 @@ from langchain_core.prompts import PromptTemplate from langchain_core.messages import HumanMessage - from tts_utils import speak from config import TTS_ENGINE, IP_WEBCAM_URL +from ocr_utils import extract_text_from_image + # Load Gemini API key from .env load_dotenv() api_key = os.getenv("API_KEY") @@ -145,10 +146,18 @@ def listen_for_scan(): scan_triggered = False status = "Analyzing surroundings..." speak("Analyzing surroundings") + + # ✅ Gemini AI description desc = process_frame(frame) speak(desc) + # ✅ OCR text detection + text = extract_text_from_image(frame) + if text.strip(): + speak(f"Detected text: {text}") + # Alert on specific signs + lower_desc = desc.lower() if "stop sign" in desc.lower() or "stop" in desc.lower(): speak("Stop! There's a stop sign.") elif "red light" in desc.lower(): diff --git a/ocr_utils.py b/ocr_utils.py new file mode 100644 index 0000000..88b4518 --- /dev/null +++ b/ocr_utils.py @@ -0,0 +1,21 @@ +import pytesseract +import cv2 +import numpy as np + +def extract_text_from_image(image: 'np.ndarray') -> str: + """ + Extract text from a given image using pytesseract OCR. + + Args: + image: Input image as a numpy array (OpenCV format) + + Returns: + Extracted text as a string + """ + # Convert image to grayscale (OCR works better on gray) + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + # Use pytesseract to extract text + text = pytesseract.image_to_string(gray) + + return text diff --git a/requirements.txt b/requirements.txt index 9f37db8..e8e23ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +# Existing dependencies opencv-python pyttsx3 gTTS @@ -10,4 +11,7 @@ playsound python-dotenv langchain-google-genai langchain-core -insightface \ No newline at end of file + +# Face recognition and OCR +insightface +pytesseract