Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,5 @@ dataset/train/labels
app_logs.txt
.aider*
*.tex
runs/
runs/
*.md
2 changes: 1 addition & 1 deletion dataset/data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ path: C:\Users\Brandon Shen\Documents\SearchVision\dataset
train: C:\Users\Brandon Shen\Documents\SearchVision\dataset\train\images
val: C:\Users\Brandon Shen\Documents\SearchVision\dataset\train\images
names:
0: soccer ball
0: coffee cup
Binary file modified dataset/train/labels.cache
Binary file not shown.
19 changes: 13 additions & 6 deletions src/download_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,38 @@ def download_images(image_urls, download_path="dataset/train/images"):
"""
Downloads images from a list of URLs and saves them to the specified directory.

Maintains index alignment by returning tuples of (original_index, file_path)
so that ranking algorithms can correctly map back to original URLs.

:param image_urls: List of image URLs to download.
:param download_path: Directory to save downloaded images.
:return: List of file paths for successfully downloaded images.
:return: List of tuples (original_index, file_path) for successfully downloaded images,
preserving which position in the input list each downloaded image came from.
"""
print("Starting image download...") # Debugging statement

# Ensure the download directory exists
if not os.path.exists(download_path):
os.makedirs(download_path)

# List to hold paths of successfully downloaded images
# List to hold (original_index, file_path) tuples
# This preserves alignment between downloaded images and input URLs
downloaded_paths = []

# Iterate over the image URLs and download each image
for idx, url in enumerate(image_urls):
print(f"Attempting to download: {url}") # Debugging statement
print(f"Attempting to download ({idx}/{len(image_urls)}): {url}")
try:
response = requests.get(url)
response = requests.get(url, timeout=10)
if response.status_code == 200:
file_path = os.path.join(download_path, f"image_{idx}.jpg")
with open(file_path, "wb") as f:
f.write(response.content)
print(f"Downloaded: {file_path}")
downloaded_paths.append(file_path) # Add path to list
# Store both the original index and the file path
downloaded_paths.append((idx, file_path))
else:
print(f"Failed to download {url}")
print(f"Failed to download {url}: status {response.status_code}")
except Exception as e:
print(f"Error downloading {url}: {e}")

Expand All @@ -42,3 +48,4 @@ def download_images(image_urls, download_path="dataset/train/images"):
print("No images were downloaded.")

return downloaded_paths

21 changes: 17 additions & 4 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,21 +139,34 @@ async def search(
os.makedirs(temp_download_path, exist_ok=True)

try:
image_paths = download_images(images_subset, temp_download_path)
# Extract URLs from metadata if using new format
if images_subset and isinstance(images_subset[0], dict):
urls_to_download = [r['url'] for r in images_subset]
else:
urls_to_download = images_subset

# Select balanced images (70% relevance, 30% dissimilarity)
image_paths = download_images(urls_to_download, temp_download_path)

# Select balanced images (60% popularity, 25% caption, 15% dissimilarity)
selected_images = select_balanced_images(
images_subset,
image_paths,
query=query,
num_images=min(9, len(images_subset)),
relevance_weight=0.7
popularity_weight=0.6,
caption_weight=0.25,
dissimilarity_weight=0.15
)
logger.info(
f"Selected {len(selected_images)} balanced images for query: {query} (page {page})")
except Exception as e:
logger.warning(
f"Balanced selection failed, falling back to first 9 images: {e}")
selected_images = images_subset[:9]
# Extract URLs from metadata if needed
if images_subset and isinstance(images_subset[0], dict):
selected_images = [r['url'] for r in images_subset[:9]]
else:
selected_images = images_subset[:9]
finally:
# Clean up temporary downloads
if os.path.exists(temp_download_path):
Expand Down
13 changes: 9 additions & 4 deletions src/scrape_similar.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ def scrape_similar_images(
Scrape similar images for training augmentation.
Uses multiple query variations to find diverse training images.
Falls back gracefully if search fails.

Returns:
List of image URLs (strips metadata for compatibility)
"""
similar_images = []

Expand All @@ -41,16 +44,18 @@ def scrape_similar_images(
try:
logger.debug(f"Attempting search with query: {query}")

images = search_images(
results = search_images(
query,
api_key,
search_engine_id,
num_results=num_results_per_image
)

if images:
logger.info(f"Got {len(images)} images from query: {query}")
similar_images.extend(images)
if results:
# Extract URLs from metadata dicts
urls = [r['url'] for r in results]
logger.info(f"Got {len(urls)} images from query: {query}")
similar_images.extend(urls)
else:
logger.debug(f"No images from query: {query}")

Expand Down
38 changes: 27 additions & 11 deletions src/search_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ def search_images(query, api_key, search_engine_id, num_results=10):
"""
Search for images using Google Custom Search API.
Falls back to Bing Images if Google fails (no API key needed).

Returns:
List of dicts containing image metadata: {
'url': image_url,
'title': caption/title,
'snippet': description
}
"""
images = []
google_error = None
Expand Down Expand Up @@ -53,7 +60,10 @@ def _search_google_custom_search(
api_key,
search_engine_id,
num_results=10):
"""Search using Google Custom Search API"""
"""
Search using Google Custom Search API.
Extracts image URLs, titles, and snippets for relevancy ranking.
"""
images = []
results_per_page = 10
start_index = 1
Expand All @@ -78,7 +88,11 @@ def _search_google_custom_search(
break

for item in data['items']:
images.append(item['link'])
images.append({
'url': item['link'],
'title': item.get('title', ''),
'snippet': item.get('snippet', '')
})

start_index += results_per_page

Expand All @@ -93,8 +107,7 @@ def _search_google_custom_search(
def _search_bing_images(query, num_results=10):
"""
Search using Bing Images (free, no API key required)
Scrapes image URLs from Bing image search with retry logic.
Strips problematic filter syntax before searching.
Scrapes image URLs and captions from Bing image search.
"""
images = []
max_retries = 3
Expand Down Expand Up @@ -137,10 +150,9 @@ def _search_bing_images(query, num_results=10):
raise Exception(
f"Bing Images returned status {response.status_code}")

# Extract image URLs from the HTML response using regex
# Bing stores lazy-loaded images in data-src attributes
# These are Bing image proxy URLs (tse1.mm.bing.net, etc.)
image_pattern = r'<img[^>]+data-src="([^"]+)"'
# Extract image URLs and captions from HTML
# Bing stores images in img tags with data-src attributes
image_pattern = r'<img[^>]+data-src="([^"]+)"[^>]+alt="([^"]*)"'
matches = re.findall(image_pattern, response.text)

if not matches:
Expand All @@ -152,13 +164,17 @@ def _search_bing_images(query, num_results=10):
continue
raise Exception("No images found on Bing Images after retries")

# Process URLs and decode HTML entities
for url in matches:
# Process URLs and captions
for url, caption in matches:
if url.startswith('http') and len(images) < num_results:
# Decode HTML entities (e.g., &amp; to &)
url = url.replace('&amp;', '&')
url = url.replace('\\/', '/')
images.append(url)
images.append({
'url': url,
'title': caption,
'snippet': caption
})

if not images:
logger.debug(
Expand Down
Loading
Loading