Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions backend/blueprints/pdf_extract_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""
PDF Embedded Images Extractor
Extracts embedded raster images from PDF files and re-encodes each one as a PNG file
"""

import fitz # PyMuPDF
import io
import zipfile
from flask import Blueprint, request, jsonify, send_file
from werkzeug.utils import secure_filename

pdf_extract_images_bp = Blueprint('pdf_extract_images', __name__)


def extract_images_from_pdf(pdf_bytes, original_filename="document"):
    """Extract all embedded images from a PDF file into an in-memory ZIP.

    Every image referenced by every page is decoded with PyMuPDF and stored
    as a PNG entry named ``<base>_page<P>_<I>.png`` (page and index are
    1-based). Images with four or more colour components (e.g. CMYK) are
    converted to RGB before being encoded as PNG. A plain-text
    ``extraction_report.txt`` listing the entries is added to the archive.

    Args:
        pdf_bytes: Raw content of the PDF document.
        original_filename: Name of the source file. Its sanitised form,
            minus a trailing ``.pdf`` extension (any letter case), is used
            as the prefix of each image entry.

    Returns:
        ``(zip_buffer, total_images, image_names)`` on success, where
        ``zip_buffer`` is an ``io.BytesIO`` rewound to position 0.
        ``(error_message, None, None)`` when the document has no pages,
        yields no extractable images, or cannot be processed at all.
    """
    import os

    doc = None
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        if len(doc) == 0:
            return "PDF file is empty or corrupted", None, None

        # Strip only a trailing ".pdf", case-insensitively; str.replace would
        # miss ".PDF" and would also remove ".pdf" from the middle of a name.
        safe_name = secure_filename(original_filename)
        stem, suffix = os.path.splitext(safe_name)
        base_name = stem if suffix.lower() == ".pdf" else safe_name
        # secure_filename may reduce a name to an empty string.
        base_name = base_name or "document"

        zip_buffer = io.BytesIO()
        image_names = []

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for page_num in range(len(doc)):
                image_list = doc[page_num].get_images(full=True)

                for img_index, img in enumerate(image_list, start=1):
                    xref = img[0]  # first item of each entry is the image xref

                    try:
                        pix = fitz.Pixmap(doc, xref)

                        # Four or more colour components (e.g. CMYK):
                        # convert to RGB before encoding as PNG.
                        if pix.n - pix.alpha >= 4:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")
                        pix = None  # release the pixmap promptly

                        img_filename = (
                            f"{base_name}_page{page_num + 1}_{img_index}.png"
                        )
                        zip_file.writestr(img_filename, img_data)
                    except Exception as e:
                        # Best effort: one bad image must not abort the rest.
                        print(f"Error: {str(e)}")
                        continue

                    image_names.append(img_filename)

            total_images = len(image_names)
            if total_images == 0:
                return "No embedded images found in this PDF", None, None

            report_lines = [
                "Extracted Images Report",
                f"PDF File: {original_filename}",
                f"Total Images Found: {total_images}",
                "Extracted Images:",
                *(f"- {name}" for name in image_names),
            ]
            zip_file.writestr(
                "extraction_report.txt", "\n".join(report_lines) + "\n"
            )

        # Rewind only after the archive has been finalised by the with-block.
        zip_buffer.seek(0)
        return zip_buffer, total_images, image_names

    except Exception as e:
        return f"Error processing PDF: {str(e)}", None, None
    finally:
        # Close the document on every path, including early returns.
        if doc is not None:
            doc.close()


@pdf_extract_images_bp.route('/extract-pdf-images', methods=['POST'])
def extract_images():
    """Handle a PDF upload and return its embedded images as a ZIP download.

    Expects a multipart upload under the ``file`` form field whose name ends
    in ``.pdf`` (any letter case).

    Returns:
        The ZIP archive as an attachment named
        ``<original name without extension>_extracted_images.zip`` on
        success. Otherwise a JSON ``{'error': ...}`` body with status 400
        (missing or invalid upload, or a failure reported by
        ``extract_images_from_pdf``) or 500 (unexpected exception).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']
    filename = file.filename or ''

    if filename == '':
        return jsonify({'error': 'No file selected'}), 400

    if not filename.lower().endswith('.pdf'):
        return jsonify({'error': 'File must be a PDF'}), 400

    # The extension was accepted case-insensitively, so cut at the last dot
    # instead of using a case-sensitive str.replace('.pdf', ''), which would
    # leave ".PDF" in place and strip ".pdf" from the middle of a name.
    download_stem = filename.rpartition('.')[0]

    try:
        pdf_bytes = file.read()
        result, count, _names = extract_images_from_pdf(pdf_bytes, filename)

        # A None count signals failure; result then holds the error message.
        if count is None:
            return jsonify({'error': result}), 400

        return send_file(
            result,
            mimetype='application/zip',
            as_attachment=True,
            download_name=f"{download_stem}_extracted_images.zip"
        )

    except Exception as e:
        return jsonify({'error': f'Server error: {str(e)}'}), 500


@pdf_extract_images_bp.route('/preview-pdf-images', methods=['POST'])
def preview_images():
    """Return base64-encoded PNG previews of a PDF's first embedded images.

    Looks at no more than the first 3 pages and the first 3 images of each,
    so a response carries at most 9 previews. Images that cannot be decoded
    are skipped, matching the behaviour of the full extraction endpoint.

    Returns:
        JSON ``{'success': True, 'total_previews': n, 'previews': [...]}``
        where each preview has ``page`` and ``index`` (both 1-based) and a
        ``data`` URI. Otherwise a JSON ``{'error': ...}`` body with status
        400 (missing or non-PDF upload) or 500 (unexpected exception).
    """
    import base64

    max_pages = 3
    max_images_per_page = 3

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']

    if not file.filename or not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Valid PDF required'}), 400

    doc = None
    try:
        pdf_bytes = file.read()
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        previews = []

        # The two per-page caps already bound the total at 3 * 3 = 9
        # previews, so no separate running counter is needed.
        for page_num in range(min(len(doc), max_pages)):
            image_list = doc[page_num].get_images(full=True)

            for img_index, img in enumerate(
                image_list[:max_images_per_page], start=1
            ):
                try:
                    pix = fitz.Pixmap(doc, img[0])

                    # Four or more colour components (e.g. CMYK):
                    # convert to RGB before encoding as PNG.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_data = pix.tobytes("png")
                    pix = None  # release the pixmap promptly
                except Exception as e:
                    # Best effort: skip an undecodable image, keep the rest.
                    print(f"Error: {str(e)}")
                    continue

                b64 = base64.b64encode(img_data).decode('utf-8')
                previews.append({
                    'page': page_num + 1,
                    'index': img_index,
                    'data': f'data:image/png;base64,{b64}'
                })

        return jsonify({
            'success': True,
            'total_previews': len(previews),
            'previews': previews
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
    finally:
        # Close the document on every path, including failures.
        if doc is not None:
            doc.close()
117 changes: 117 additions & 0 deletions backend/blueprints/progress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
SSE (Server-Sent Events) Blueprint for Real-Time Progress
"""

import time
import json
from flask import Blueprint, Response, request, stream_with_context
from utils.progress_manager import progress_manager

progress_bp = Blueprint('progress', __name__)


@progress_bp.route('/progress/<task_id>')
def progress_stream(task_id: str):
    """
    SSE endpoint for progress updates.

    The client connects to this endpoint and receives a ``data:`` event with
    the JSON-encoded progress whenever the task's ``percent`` changes, plus a
    final event once its ``status`` is ``complete`` or ``error``, after which
    the stream ends.

    While there is nothing new to report (including when ``task_id`` is not
    known to the progress manager) an SSE comment line is sent periodically
    as a heartbeat. Comment lines are ignored by EventSource clients; they
    exist so that a dropped client connection is noticed on the next write
    instead of leaving this loop polling indefinitely without ever yielding.

    Args:
        task_id: Identifier passed to ``progress_manager.get_progress``.

    Returns:
        A streaming ``text/event-stream`` response.
    """
    poll_interval = 0.5        # seconds between progress lookups
    heartbeat_interval = 15.0  # max seconds of silence before a heartbeat

    def generate():
        last_percent = -1
        last_write = time.monotonic()

        while True:
            progress = progress_manager.get_progress(task_id)

            if progress:
                current_percent = progress.get('percent', 0)
                finished = progress.get('status') in ('complete', 'error')

                # Only send an update if the percentage changed, but always
                # deliver the terminal state.
                if current_percent != last_percent or finished:
                    last_percent = current_percent
                    yield f"data: {json.dumps(progress)}\n\n"
                    last_write = time.monotonic()

                # Stop streaming when complete or error
                if finished:
                    break

            if time.monotonic() - last_write >= heartbeat_interval:
                # A line starting with ":" is an SSE comment.
                yield ": keep-alive\n\n"
                last_write = time.monotonic()

            # Wait before next check (don't flood)
            time.sleep(poll_interval)

    return Response(
        stream_with_context(generate()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'  # Disable nginx buffering
        }
    )


@progress_bp.route('/progress/<task_id>/status')
def get_progress_status(task_id: str):
    """Return a one-off JSON snapshot of a task's progress.

    Responds with the progress data reported by the progress manager, or
    with a 404 JSON error when the manager has nothing for ``task_id``.
    """
    from flask import jsonify

    snapshot = progress_manager.get_progress(task_id)

    # Guard clause: a falsy lookup result is treated as "no such task".
    if not snapshot:
        return jsonify({'error': 'Task not found'}), 404

    return jsonify(snapshot)


@progress_bp.route('/convert-pdf-progress', methods=['POST'])
def convert_pdf_with_progress():
    """
    Example endpoint: PDF to PNG conversion with progress tracking.

    Validates that a file was uploaded, registers a task with the progress
    manager and starts a background thread that simulates a 10-page
    conversion (one progress update per half second). The uploaded file's
    content is not read here. Responds immediately with the task id and the
    URLs where the client can follow its progress.
    """
    import threading
    from flask import jsonify

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    upload = request.files['file']
    if upload.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    # Register the task so progress can be tracked under its id.
    task_id = progress_manager.create_task()

    def run_simulation():
        # Placeholder page count; real code would read it from the document.
        page_count = 10

        try:
            progress_manager.update(
                task_id, 0, page_count, "Starting conversion..."
            )

            current = 1
            while current <= page_count:
                time.sleep(0.5)  # stand-in for the real per-page work
                progress_manager.update(
                    task_id,
                    current,
                    page_count,
                    f"Processing page {current} of {page_count}...",
                )
                current += 1

            progress_manager.complete(task_id, "/download/result.zip")
        except Exception as exc:
            progress_manager.error(task_id, str(exc))

    # Run the work off the request thread so the response returns at once.
    threading.Thread(target=run_simulation).start()

    stream_url = f'/progress/{task_id}'
    return jsonify({
        'task_id': task_id,
        'stream_url': stream_url,
        'status_url': f'{stream_url}/status'
    })
8 changes: 6 additions & 2 deletions backend/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
from app import create_app

from blueprints.pdf_extract_images import pdf_extract_images_bp
from blueprints.progress import progress_bp
import os

app = create_app()

# Blueprint registration was moved to AFTER the app is created.
# Register the blueprints here, immediately after app creation.
app.register_blueprint(pdf_extract_images_bp)

app.register_blueprint(progress_bp)

if __name__ == "__main__":
port = int(os.getenv("PORT", "5000"))
Expand Down
79 changes: 79 additions & 0 deletions backend/test_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import fitz # PyMuPDF
import io
import zipfile
import os

def test_extract(pdf_path, output_path="extracted_images.zip"):
    """Test PDF image extraction without Flask.

    Reads the PDF at ``pdf_path`` into memory, converts every embedded image
    to PNG (images with four or more colour components, e.g. CMYK, are
    converted to RGB first), collects the PNGs plus a short ``report.txt``
    in a ZIP archive and, when at least one image was extracted, writes that
    archive to ``output_path``. Progress is printed to stdout.

    Args:
        pdf_path: Path of the PDF file to inspect.
        output_path: Where to write the resulting ZIP archive.

    Returns:
        None. Returns early, after printing a message, if ``pdf_path`` does
        not exist.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ File not found: {pdf_path}")
        return

    # Read PDF
    with open(pdf_path, 'rb') as f:
        pdf_bytes = f.read()

    # Open PDF from memory, mirroring how an uploaded file would be handled
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")

    total_images = 0

    # Create ZIP in memory
    zip_buffer = io.BytesIO()

    try:
        print(f"✅ PDF opened: {pdf_path}")
        print(f"📄 Total pages: {len(doc)}")

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for page_num in range(len(doc)):
                image_list = doc[page_num].get_images(full=True)

                print(f" Page {page_num + 1}: {len(image_list)} images found")

                for img_index, img in enumerate(image_list, start=1):
                    xref = img[0]  # first item of each entry is the image xref

                    try:
                        pix = fitz.Pixmap(doc, xref)

                        # Four or more colour components (e.g. CMYK):
                        # convert to RGB before encoding as PNG.
                        if pix.n - pix.alpha >= 4:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")
                        pix = None  # release the pixmap promptly

                        img_filename = f"page{page_num + 1}_img{img_index}.png"
                        zip_file.writestr(img_filename, img_data)
                        total_images += 1

                        print(f" ✅ Extracted: {img_filename}")

                    except Exception as e:
                        # Best effort: report the failure and keep going.
                        print(f" ❌ Error: {e}")

            # Add report
            report = f"Extracted {total_images} images from {pdf_path}"
            zip_file.writestr("report.txt", report)
    finally:
        # Close the document even if extraction raised.
        doc.close()

    if total_images > 0:
        # Save ZIP file; the archive is complete only now that the ZipFile
        # context has exited and written its central directory.
        with open(output_path, "wb") as f:
            f.write(zip_buffer.getvalue())
        print(f"\n✅ ZIP created: {output_path} ({total_images} images)")
    else:
        print("\n⚠️ No images found in PDF")

if __name__ == "__main__":
    # Prompt for the PDF location, then drop surrounding whitespace and any
    # double quotes around the path (presumably left by drag-and-drop).
    raw_path = input("Enter PDF path (or drag-drop PDF here): ")
    cleaned_path = raw_path.strip().strip('"')
    test_extract(cleaned_path)
Loading