Brandon-Shen · BrandonS09 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -165,4 +165,5 @@ dataset/train/labels
 app_logs.txt
 .aider*
 *.tex
-runs/
+runs/
+*.md
diff --git a/dataset/data.yaml b/dataset/data.yaml
@@ -2,4 +2,4 @@ path: C:\Users\Brandon Shen\Documents\SearchVision\dataset
 train: C:\Users\Brandon Shen\Documents\SearchVision\dataset\train\images
 val: C:\Users\Brandon Shen\Documents\SearchVision\dataset\train\images
 names:
-  0: soccer ball
+  0: coffee cup
diff --git a/dataset/train/labels.cache b/dataset/train/labels.cache
diff --git a/src/download_images.py b/src/download_images.py
@@ -8,32 +8,38 @@ def download_images(image_urls, download_path="dataset/train/images"):
     """
     Downloads images from a list of URLs and saves them to the specified directory.
 
+    Maintains index alignment by returning tuples of (original_index, file_path)
+    so that ranking algorithms can correctly map back to original URLs.
+
     :param image_urls: List of image URLs to download.
     :param download_path: Directory to save downloaded images.
-    :return: List of file paths for successfully downloaded images.
+    :return: List of tuples (original_index, file_path) for successfully downloaded images,
+             preserving which position in the input list each downloaded image came from.
     """
     print("Starting image download...")  # Debugging statement
 
     # Ensure the download directory exists
     if not os.path.exists(download_path):
         os.makedirs(download_path)
 
-    # List to hold paths of successfully downloaded images
+    # List to hold (original_index, file_path) tuples
+    # This preserves alignment between downloaded images and input URLs
     downloaded_paths = []
 
     # Iterate over the image URLs and download each image
     for idx, url in enumerate(image_urls):
-        print(f"Attempting to download: {url}")  # Debugging statement
+        print(f"Attempting to download ({idx}/{len(image_urls)}): {url}")
         try:
-            response = requests.get(url)
+            response = requests.get(url, timeout=10)
             if response.status_code == 200:
                 file_path = os.path.join(download_path, f"image_{idx}.jpg")
                 with open(file_path, "wb") as f:
                     f.write(response.content)
                 print(f"Downloaded: {file_path}")
-                downloaded_paths.append(file_path)  # Add path to list
+                # Store both the original index and the file path
+                downloaded_paths.append((idx, file_path))
             else:
-                print(f"Failed to download {url}")
+                print(f"Failed to download {url}: status {response.status_code}")
         except Exception as e:
             print(f"Error downloading {url}: {e}")
 
@@ -42,3 +48,4 @@ def download_images(image_urls, download_path="dataset/train/images"):
         print("No images were downloaded.")
 
     return downloaded_paths
+
diff --git a/src/main.py b/src/main.py
@@ -139,21 +139,34 @@ async def search(
         os.makedirs(temp_download_path, exist_ok=True)
 
         try:
-            image_paths = download_images(images_subset, temp_download_path)
+            # Results may be metadata dicts (with a 'url' key) or plain URL strings
+            if images_subset and isinstance(images_subset[0], dict):
+                urls_to_download = [r['url'] for r in images_subset]
+            else:
+                urls_to_download = images_subset
 
-            # Select balanced images (70% relevance, 30% dissimilarity)
+            image_paths = download_images(urls_to_download, temp_download_path)
+
+            # Select balanced images (60% popularity, 25% caption, 15% dissimilarity)
             selected_images = select_balanced_images(
                 images_subset,
                 image_paths,
+                query=query,
                 num_images=min(9, len(images_subset)),
-                relevance_weight=0.7
+                popularity_weight=0.6,
+                caption_weight=0.25,
+                dissimilarity_weight=0.15
             )
             logger.info(
                 f"Selected {len(selected_images)} balanced images for query: {query} (page {page})")
         except Exception as e:
             logger.warning(
                 f"Balanced selection failed, falling back to first 9 images: {e}")
-            selected_images = images_subset[:9]
+            # Extract URLs from metadata if needed
+            if images_subset and isinstance(images_subset[0], dict):
+                selected_images = [r['url'] for r in images_subset[:9]]
+            else:
+                selected_images = images_subset[:9]
         finally:
             # Clean up temporary downloads
             if os.path.exists(temp_download_path):

diff --git a/src/scrape_similar.py b/src/scrape_similar.py
@@ -15,6 +15,9 @@ def scrape_similar_images(
     Scrape similar images for training augmentation.
     Uses multiple query variations to find diverse training images.
     Falls back gracefully if search fails.
+
+    Returns:
+        List of image URL strings (title/snippet metadata is dropped for compatibility)
     """
     similar_images = []
 
@@ -41,16 +44,18 @@ def scrape_similar_images(
         try:
             logger.debug(f"Attempting search with query: {query}")
 
-            images = search_images(
+            results = search_images(
                 query,
                 api_key,
                 search_engine_id,
                 num_results=num_results_per_image
             )
 
-            if images:
-                logger.info(f"Got {len(images)} images from query: {query}")
-                similar_images.extend(images)
+            if results:
+                # Extract URLs from metadata dicts
+                urls = [r['url'] for r in results]
+                logger.info(f"Got {len(urls)} images from query: {query}")
+                similar_images.extend(urls)
             else:
                 logger.debug(f"No images from query: {query}")
 

diff --git a/src/search_images.py b/src/search_images.py
@@ -11,6 +11,13 @@ def search_images(query, api_key, search_engine_id, num_results=10):
     """
     Search for images using Google Custom Search API.
     Falls back to Bing Images if Google fails (no API key needed).
+
+    Returns:
+        List of dicts containing image metadata: {
+            'url': image_url,
+            'title': caption/title,
+            'snippet': description
+        }
     """
     images = []
     google_error = None
@@ -53,7 +60,10 @@ def _search_google_custom_search(
         api_key,
         search_engine_id,
         num_results=10):
-    """Search using Google Custom Search API"""
+    """
+    Search using Google Custom Search API.
+    Extracts image URLs, titles, and snippets for relevancy ranking.
+    """
     images = []
     results_per_page = 10
     start_index = 1
@@ -78,7 +88,11 @@ def _search_google_custom_search(
                 break
 
             for item in data['items']:
-                images.append(item['link'])
+                images.append({
+                    'url': item['link'],
+                    'title': item.get('title', ''),
+                    'snippet': item.get('snippet', '')
+                })
 
             start_index += results_per_page
 
@@ -93,8 +107,7 @@ def _search_google_custom_search(
 def _search_bing_images(query, num_results=10):
     """
     Search using Bing Images (free, no API key required)
-    Scrapes image URLs from Bing image search with retry logic.
-    Strips problematic filter syntax before searching.
+    Scrapes image URLs and captions from Bing image search.
     """
     images = []
     max_retries = 3
@@ -137,10 +150,9 @@ def _search_bing_images(query, num_results=10):
                 raise Exception(
                     f"Bing Images returned status {response.status_code}")
 
-            # Extract image URLs from the HTML response using regex
-            # Bing stores lazy-loaded images in data-src attributes
-            # These are Bing image proxy URLs (tse1.mm.bing.net, etc.)
-            image_pattern = r'<img[^>]+data-src="([^"]+)"'
+            # Extract (url, caption) pairs from <img> tags carrying a data-src attribute;
+            # only tags whose alt attribute follows data-src in the same tag are matched
+            image_pattern = r'<img[^>]+data-src="([^"]+)"[^>]+alt="([^"]*)"'
             matches = re.findall(image_pattern, response.text)
 
             if not matches:
@@ -152,13 +164,17 @@ def _search_bing_images(query, num_results=10):
                     continue
                 raise Exception("No images found on Bing Images after retries")
 
-            # Process URLs and decode HTML entities
-            for url in matches:
+            # Process URLs and captions
+            for url, caption in matches:
                 if url.startswith('http') and len(images) < num_results:
                     # Decode HTML entities (e.g., &amp; to &)
                     url = url.replace('&amp;', '&')
                     url = url.replace('\\/', '/')
-                    images.append(url)
+                    images.append({
+                        'url': url,
+                        'title': caption,
+                        'snippet': caption
+                    })
 
             if not images:
                 logger.debug(
-Original file line number
+Diff line change
@@ Expand Up / @@ -165,4 +165,5 @@ dataset/train/labels @@
     app_logs.txt
     .aider*
     *.tex
-    runs/
+    runs/
+    *.md