-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathImageCaptioningSystem.py
More file actions
538 lines (451 loc) · 23.9 KB
/
ImageCaptioningSystem.py
File metadata and controls
538 lines (451 loc) · 23.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
import io
from gtts import gTTS
import base64
import os
import google.generativeai as genai
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import zipfile
import re
import textwrap
# Set page configuration
st.set_page_config(page_title="Image Captioning System", layout="wide")
# --- Configuration Section ---
# IMPORTANT: Replace these with your own API keys or use environment variables
# Configure Gemini API (Get your API key from: https://ai.google.dev/)
# genai.configure(api_key="YOUR_GEMINI_API_KEY") # Uncomment and add your key
# Alternatively, use environment variables:
# import os
# genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# --- Model Loading ---
@st.cache_resource
def load_blip_model():
"""Load BLIP model for initial image captioning"""
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-base")
return processor, model
# Convert a given caption text to speech and return audio as a byte stream
def generate_audio(text):
try:
# Remove special characters that might cause issues with TTS
clean_text = ''.join(c for c in text if c.isalnum() or c.isspace() or c in ',.!?-:;\'\"')
tts = gTTS(clean_text)
audio_bytes = io.BytesIO()
tts.write_to_fp(audio_bytes)
audio_bytes.seek(0)
return audio_bytes
except Exception as e:
st.error(f"Error generating audio: {e}")
# Return a minimal valid audio file in case of error
return None
# Optimized prompt formats for advanced image captioning with error-handling
caption_styles = {
"Generalised Caption": (
"Analyze the image and generate a concise, general-purpose caption that broadly describes the scene or subject. "
"Use simple, clear language suitable for all audiences. "
"If the image is unclear or ambiguous, describe what is most prominent without guessing."
),
"Creative Caption": (
"Observe the image and create a highly imaginative and artistic caption. Use metaphor, personification, or poetic language. "
"Make it emotionally engaging and original. If the content is vague or abstract, focus on evoking a feeling or theme rather than specifics."
),
"Professional Caption": (
"Generate a formal, polished caption suitable for professional or business use. Emphasize clarity, purpose, and tone appropriate for marketing, branding, or corporate presentations. "
"If the image lacks context, focus on neutral, safe interpretations that could fit a wide professional setting."
),
"Descriptive Caption": (
"Examine the image and write a descriptive caption of about 25 words. Focus on visually important elements like objects, setting, actions, and mood. "
"Avoid interpretation or assumption—stick to visible, factual details. If the image is blurry or unclear, describe what can be confidently identified."
),
"Quote Caption": (
"Interpret the essence or theme of the image and generate an inspiring or thoughtful quote that matches the visual. "
"Attribute it to a relevant or fictional person (e.g., a philosopher, artist, or visionary). "
"If the image content is not meaningful enough for a quote, provide a general life reflection loosely inspired by it."
)
}
# Use BLIP to generate a base caption
def generate_blip_caption(image):
processor, model = load_blip_model()
inputs = processor(image, return_tensors="pt")
output = model.generate(**inputs)
return processor.batch_decode(output, skip_special_tokens=True)[0]
# Use Gemini to generate multiple stylized captions based on base BLIP caption
def generate_gemini_captions(image, base_caption, style, count=5):
# Convert PIL image to bytes for Gemini
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format='JPEG')
img_byte_arr.seek(0)
# Create a multimodal prompt with both the image and text
model = genai.GenerativeModel('gemini-1.5-pro')
prompt = f"{caption_styles[style]}\nBase description: {base_caption}\nGenerate {count} unique, high-quality captions."
response = model.generate_content([
prompt,
{"mime_type": "image/jpeg", "data": img_byte_arr.getvalue()}
])
# Process the response to extract captions
captions = []
for line in response.text.strip().split('\n'):
# Clean up numbering, bullets, etc.
clean_line = line.strip()
if clean_line and not clean_line.isdigit():
# Remove numbering patterns like "1.", "1)", "[1]", etc.
import re
clean_line = re.sub(r'^(\d+[\.\)\]]\s*|\*\s*|\-\s*)', '', clean_line)
captions.append(clean_line.strip())
# Return only the requested number of captions
return captions[:count]
# Parse markdown-like formatting for captions
def parse_caption_formatting(caption):
# Store original caption for audio generation
original_caption = caption
# Track formatting for text segments
formatted_segments = []
# Parse bold text (**text**)
bold_pattern = r'\*\*(.*?)\*\*'
bold_matches = re.finditer(bold_pattern, caption)
for match in bold_matches:
start, end = match.span()
formatted_segments.append({
'text': match.group(1),
'start': start,
'end': end,
'bold': True,
'italic': False,
'underline': False
})
# Parse italic text (*text*)
italic_pattern = r'(?<!\*)\*([^\*]+)\*(?!\*)'
italic_matches = re.finditer(italic_pattern, caption)
for match in italic_matches:
start, end = match.span()
# Check if this segment overlaps with a bold segment
is_new_segment = True
for segment in formatted_segments:
if (start >= segment['start'] and start < segment['end']) or \
(end > segment['start'] and end <= segment['end']):
segment['italic'] = True
is_new_segment = False
break
if is_new_segment:
formatted_segments.append({
'text': match.group(1),
'start': start,
'end': end,
'bold': False,
'italic': True,
'underline': False
})
# Parse underlined text (_text_)
underline_pattern = r'_(.*?)_'
underline_matches = re.finditer(underline_pattern, caption)
for match in underline_matches:
start, end = match.span()
# Check if this segment overlaps with an existing segment
is_new_segment = True
for segment in formatted_segments:
if (start >= segment['start'] and start < segment['end']) or \
(end > segment['start'] and end <= segment['end']):
segment['underline'] = True
is_new_segment = False
break
if is_new_segment:
formatted_segments.append({
'text': match.group(1),
'start': start,
'end': end,
'bold': False,
'italic': False,
'underline': True
})
# Clean up the caption by removing markdown syntax
clean_caption = re.sub(r'\*\*(.*?)\*\*', r'\1', caption) # Remove bold markers
clean_caption = re.sub(r'(?<!\*)\*([^\*]+)\*(?!\*)', r'\1', clean_caption) # Remove italic markers
clean_caption = re.sub(r'_(.*?)_', r'\1', clean_caption) # Remove underline markers
return {
'original': original_caption,
'clean': clean_caption,
'segments': sorted(formatted_segments, key=lambda x: x['start'])
}
# Overlay selected caption text on top of the image with formatting support
def add_caption_to_image(image, caption):
width, height = image.size
# Parse caption formatting
caption_data = parse_caption_formatting(caption)
clean_caption = caption_data['clean']
# Dynamically adjust font size based on image dimensions - bigger font sizes
font_size = max(32, min(64, int(width/14))) # Even larger font sizes for better visibility
# Prepare font options - modern fonts with better visibility
font_families = [
# Primary choice - Calibri (modern, clean font)
("Calibri", "Calibri Bold", "Calibri Italic", "Calibri Bold Italic"),
# Second choice - Segoe UI (modern Microsoft font)
("Segoe UI", "Segoe UI Bold", "Segoe UI Italic", "Segoe UI Bold Italic"),
# Third choice - Arial (universally available)
("Arial", "Arial Bold", "Arial Italic", "Arial Bold Italic"),
# Fourth choice - specific filenames
("calibri.ttf", "calibrib.ttf", "calibrii.ttf", "calibriz.ttf"),
("segoeui.ttf", "segoeuib.ttf", "segoeuii.ttf", "segoeuiz.ttf"),
("arial.ttf", "arialbd.ttf", "ariali.ttf", "arialbi.ttf")
]
# Try each font family until one works
regular_font = None
bold_font = None
italic_font = None
bold_italic_font = None
for font_family in font_families:
try:
regular_font = ImageFont.truetype(font_family[0], font_size)
bold_font = ImageFont.truetype(font_family[1], font_size)
italic_font = ImageFont.truetype(font_family[2], font_size)
bold_italic_font = ImageFont.truetype(font_family[3], font_size)
break # If we successfully loaded all fonts, break the loop
except:
continue
# If no fonts were loaded successfully, use default font with artificial styles
if regular_font is None:
regular_font = ImageFont.load_default()
bold_font = regular_font
italic_font = regular_font
bold_italic_font = regular_font
# Use smaller margins to allow for larger text
max_width = width - 80 # Slightly larger margin for better appearance
# Better character width estimation for wrapping
wrapper = textwrap.TextWrapper(width=max(1, int(max_width / (font_size * 0.45)))) # Even more space for text
wrapped_lines = wrapper.wrap(clean_caption)
# Add extra space if formatting is detected
formatting_detected = len(caption_data['segments']) > 0
# Calculate required height for caption area
line_height = font_size * 1.8 # Increased line spacing
padding = 120 if formatting_detected else 100 # More padding for better visual appearance
bar_height = int(len(wrapped_lines) * line_height + padding)
# Create new image with appropriate white bar at top
result_img = Image.new("RGB", (width, height + bar_height), (255, 255, 255)) # Pure white background
result_img.paste(image, (0, bar_height))
# Draw on the image
draw = ImageDraw.Draw(result_img)
# Draw each line of text with proper formatting
y_position = 60 # Increased top margin for better spacing
# If no formatting is detected but markdown characters are present,
# treat the entire text as formatted based on simple rules
if not formatting_detected and ('**' in caption or '*' in caption or '_' in caption):
is_bold = '**' in caption
is_italic = '*' in caption and not '**' in caption
is_underline = '_' in caption
# Assign the right font based on detected formatting
if is_bold and is_italic:
font = bold_italic_font
elif is_bold:
font = bold_font
elif is_italic:
font = italic_font
else:
font = regular_font
# Draw each line with the detected formatting
for line in wrapped_lines:
# Center the text
line_width = font.getlength(line) if hasattr(font, 'getlength') else font.getsize(line)[0]
x_position = (width - line_width) // 2
# Draw shadow effect for better visibility
shadow_color = (220, 220, 220) # Light gray shadow
draw.text((x_position + 2, y_position + 2), line, fill=shadow_color, font=font)
# Draw the main text
if is_bold and font == regular_font: # If we're using fallback and need bold
for offset in range(-1, 2):
draw.text((x_position + offset, y_position), line, fill="black", font=font)
draw.text((x_position, y_position + offset), line, fill="black", font=font)
else:
draw.text((x_position, y_position), line, fill="black", font=font)
# Draw underline if needed
if is_underline:
line_width = font.getlength(line) if hasattr(font, 'getlength') else font.getsize(line)[0]
underline_y = y_position + font_size + 3
draw.line((x_position, underline_y, x_position + line_width, underline_y), fill="black", width=3)
y_position += line_height
else:
# Process line by line with individual segment formatting
for line in wrapped_lines:
# Calculate center position for this line
line_width = regular_font.getlength(line) if hasattr(regular_font, 'getlength') else regular_font.getsize(line)[0]
x_position = (width - line_width) // 2
# Draw shadow for the base text
draw.text((x_position + 2, y_position + 2), line, fill=(220, 220, 220), font=regular_font)
# Draw the base text
draw.text((x_position, y_position), line, fill="black", font=regular_font)
# Then overlay formatted segments if we have any
if caption_data['segments']:
current_pos = 0
for segment in caption_data['segments']:
if segment['text'] in line:
segment_start = line.find(segment['text'], current_pos)
if segment_start != -1:
# Calculate position for this segment
before_text = line[:segment_start]
before_width = regular_font.getlength(before_text) if hasattr(regular_font, 'getlength') else regular_font.getsize(before_text)[0]
segment_x = x_position + before_width
# Select appropriate font based on formatting
if segment['bold'] and segment['italic']:
font = bold_italic_font
elif segment['bold']:
font = bold_font
elif segment['italic']:
font = italic_font
else:
font = regular_font
# Draw the formatted text with shadow and background
text_height = font_size
text_width = font.getlength(segment['text']) if hasattr(font, 'getlength') else font.getsize(segment['text'])[0]
# Draw white background
draw.rectangle([segment_x-2, y_position-2, segment_x+text_width+2, y_position+text_height+2],
fill=(255, 255, 255))
# Draw shadow
draw.text((segment_x + 2, y_position + 2), segment['text'],
fill=(220, 220, 220), font=font)
# Draw the formatted text
if segment['bold'] and font == regular_font: # If we're using fallback and need bold
for offset in range(-1, 2):
draw.text((segment_x + offset, y_position), segment['text'],
fill="black", font=font)
draw.text((segment_x, y_position + offset), segment['text'],
fill="black", font=font)
else:
draw.text((segment_x, y_position), segment['text'],
fill="black", font=font)
# Draw underline if needed - thicker line for better visibility
if segment['underline']:
underline_y = y_position + font_size + 3
draw.line((segment_x, underline_y, segment_x + text_width, underline_y),
fill="black", width=3)
current_pos = segment_start + len(segment['text'])
y_position += line_height
return result_img
# --- Streamlit App UI ---
st.title("🖼️ Advanced Image Captioning System")
st.write("Upload an image and generate creative captions with audio!")
# Initialize session state for tracking the selected style
if 'current_style' not in st.session_state:
st.session_state.current_style = None
# Step 1: Upload an image
uploaded_file = st.file_uploader("Step 1: Upload an image", type=["jpg", "jpeg", "png"])
if uploaded_file:
# Display uploaded image
image = Image.open(uploaded_file).convert("RGB")
st.image(image, caption="Uploaded Image", use_column_width=True)
# Step 2: Select caption style from dropdown
st.write("Step 2: Select the type of caption to generate")
selected_style = st.selectbox("Caption Style:", list(caption_styles.keys()))
# Check if style has changed
if st.session_state.current_style != selected_style:
# Clear previous captions when style changes
if 'styled_captions' in st.session_state:
st.session_state.styled_captions = []
# Update the current style
st.session_state.current_style = selected_style
# Initialize session state for storing captions
if 'styled_captions' not in st.session_state:
st.session_state.styled_captions = []
# Generate captions button
if st.button("Generate Captions") or ('styled_captions' in st.session_state and len(st.session_state.styled_captions) > 0):
# Only run the generation if we don't already have captions
if len(st.session_state.styled_captions) == 0:
with st.spinner("Analyzing image and generating captions..."):
# Generate a base caption using BLIP model
base_caption = generate_blip_caption(image)
# Generate five stylized captions using Gemini
styled_captions = generate_gemini_captions(image, base_caption, selected_style)
st.session_state.styled_captions = styled_captions
# Display the captions with audio buttons
st.subheader("Top 5 Generated Captions:")
st.write("Each caption has an audio button and a generate image button.")
# Add some CSS for better styling
st.markdown("""
<style>
.caption-container {
background-color: grey;
border-radius: 10px;
padding: 1px;
margin-bottom: 15px;
border: 1px solid #dee2e6;
}
</style>
""", unsafe_allow_html=True)
for i, caption in enumerate(st.session_state.styled_captions):
# Create a container for each caption
#st.markdown(f"""<div class="caption-container"></div>""", unsafe_allow_html=True)
st.markdown("---")
# Create three columns: caption, audio button, and generate button
col1, col2, col3 = st.columns([6, 1, 3])
with col1:
# Display the caption text
st.markdown(f"**Caption {i+1}:**")
st.write(caption)
with col2:
# Generate and display audio button
st.markdown("**Audio:**")
audio = generate_audio(caption)
if audio is not None:
st.audio(audio, format="audio/mp3")
else:
st.warning("❌")
with col3:
# Add a dedicated button for generating the image with this caption
st.markdown("**Generate:**")
if st.button(f"Generate Image", key=f"generate_btn_{i}"):
# Store the selected caption and generate the image
st.session_state.selected_caption = caption
# Store the raw caption as well for audio generation
st.session_state.raw_caption = caption
st.session_state.final_image = add_caption_to_image(image, caption)
# Set a flag to indicate that we should display the image
st.session_state.show_final_image = True
# Store the index of the selected caption
st.session_state.selected_caption_index = i
# Display the final image if it has been generated
if 'show_final_image' in st.session_state and st.session_state.show_final_image and 'final_image' in st.session_state:
st.markdown("---")
st.subheader("Final Image with Caption:")
# Display which caption was selected
if 'selected_caption_index' in st.session_state:
st.info(f"Using Caption {st.session_state.selected_caption_index + 1}: {st.session_state.selected_caption}")
st.image(st.session_state.final_image, use_column_width=True)
# Generate audio for the selected caption if not already generated
if 'selected_caption_audio' not in st.session_state and 'raw_caption' in st.session_state:
# Use the raw caption without formatting for better audio
clean_caption = parse_caption_formatting(st.session_state.raw_caption)['clean']
st.session_state.selected_caption_audio = generate_audio(clean_caption)
# Create a ZIP file containing both the image and audio
if 'selected_caption_audio' in st.session_state:
# Convert final image to bytes
img_bytes = io.BytesIO()
st.session_state.final_image.save(img_bytes, format="JPEG")
img_bytes.seek(0)
# Create a zip file in memory
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
# Add the image to the zip
zip_file.writestr('captioned_image.jpg', img_bytes.getvalue())
# Add the audio to the zip
if st.session_state.selected_caption_audio is not None:
audio_bytes = st.session_state.selected_caption_audio.getvalue()
zip_file.writestr('caption_audio.mp3', audio_bytes)
# Reset buffer position
zip_buffer.seek(0)
# Add a download button for the zip file
st.download_button(
"📥 Download Image & Audio (ZIP)",
zip_buffer,
file_name="captioned_image_with_audio.zip",
mime="application/zip",
use_container_width=False
)
# Also offer individual downloads
st.download_button(
"📥 Download Image Only",
img_bytes,
file_name="captioned_image.jpg",
mime="image/jpeg",
use_container_width=False
)
else:
st.info("Please upload an image to get started.")