Durgeshwar-AI · BhakktiGautam · Jun 14, 2026 · Jun 14, 2026 · Jun 16, 2026
diff --git a/backend/blueprints/pdf_extract_images.py b/backend/blueprints/pdf_extract_images.py
@@ -0,0 +1,164 @@
+"""
+PDF Embedded Images Extractor
+Extracts embedded raster images from PDF files and re-encodes each one as PNG
+"""
+
+import fitz  # PyMuPDF
+import io
+import zipfile
+from flask import Blueprint, request, jsonify, send_file
+from werkzeug.utils import secure_filename
+
+pdf_extract_images_bp = Blueprint('pdf_extract_images', __name__)  # PDF image routes
+
+
+def extract_images_from_pdf(pdf_bytes, original_filename="document"):
+    """Extract all embedded images of a PDF into an in-memory ZIP archive.
+
+    Every image is decoded with PyMuPDF and re-encoded as PNG; images that
+    cannot be processed are skipped. A text report is added to the archive.
+
+    Args:
+        pdf_bytes: Raw bytes of the PDF document.
+        original_filename: Upload name; used for entry names and the report.
+
+    Returns:
+        ``(zip_buffer, total_images, image_names)`` on success, otherwise
+        ``(error_message, None, None)``.
+    """
+    doc = None
+    try:
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        if len(doc) == 0:
+            return "PDF file is empty or corrupted", None, None
+        # Strip only a trailing ".pdf" in any case: str.replace() would miss
+        # ".PDF" and would also remove ".pdf" from the middle of a name.
+        base_name = secure_filename(original_filename)
+        if base_name.lower().endswith('.pdf'):
+            base_name = base_name[:-4]
+        base_name = base_name or "document"
+
+        zip_buffer = io.BytesIO()
+        image_names = []
+        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            for page_num in range(len(doc)):
+                for img_index, img in enumerate(doc[page_num].get_images(full=True)):
+                    try:
+                        pix = fitz.Pixmap(doc, img[0])  # img[0] is the xref
+                        if pix.n - pix.alpha >= 4:
+                            # 4+ colour components (e.g. CMYK): go via RGB.
+                            pix = fitz.Pixmap(fitz.csRGB, pix)
+                        img_data = pix.tobytes("png")
+                        pix = None  # drop the pixmap before the next image
+                        img_filename = f"{base_name}_page{page_num+1}_{img_index+1}.png"
+                        zip_file.writestr(img_filename, img_data)
+                        image_names.append(img_filename)
+                    except Exception as e:
+                        # Best effort: one bad image must not abort the rest.
+                        print(f"Error: {str(e)}")
+
+            total_images = len(image_names)
+            if total_images == 0:
+                return "No embedded images found in this PDF", None, None
+
+            metadata = f"""Extracted Images Report
+PDF File: {original_filename}
+Total Images Found: {total_images}
+Extracted Images:
+{chr(10).join(f'- {name}' for name in image_names)}
+"""
+            zip_file.writestr("extraction_report.txt", metadata)
+        zip_buffer.seek(0)
+        return zip_buffer, total_images, image_names
+    except Exception as e:
+        return f"Error processing PDF: {str(e)}", None, None
+    finally:
+        if doc is not None:
+            doc.close()
+
+
+@pdf_extract_images_bp.route('/extract-pdf-images', methods=['POST'])
+def extract_images():
+    """Return a ZIP with the images embedded in the uploaded PDF.
+
+    Reads the multipart ``file`` field. Bad input and extraction failures
+    give a JSON ``error`` with status 400, unexpected exceptions 500.
+    """
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['file']
+    if not file.filename:  # covers both '' and a missing (None) filename
+        return jsonify({'error': 'No file selected'}), 400
+    if not file.filename.lower().endswith('.pdf'):
+        return jsonify({'error': 'File must be a PDF'}), 400
+
+    try:
+        result, count, _names = extract_images_from_pdf(file.read(), file.filename)
+        if count is None:
+            return jsonify({'error': result}), 400
+        # Sanitize the client-supplied name and strip ".pdf" in any case
+        # (the check above accepts ".PDF", which str.replace() would keep).
+        stem = secure_filename(file.filename)
+        stem = stem[:-4] if stem.lower().endswith('.pdf') else stem
+        archive_name = f"{stem or 'document'}_extracted_images.zip"
+        return send_file(result, mimetype='application/zip',
+                         as_attachment=True, download_name=archive_name)
+    except Exception as e:
+        return jsonify({'error': f'Server error: {str(e)}'}), 500
+
+
+@pdf_extract_images_bp.route('/preview-pdf-images', methods=['POST'])
+def preview_images():
+    """Return base64 PNG previews of a few images embedded in a PDF.
+
+    Only the first 3 pages and the first 3 images of each page are used
+    (at most 9 previews). The JSON response carries ``success``,
+    ``total_previews`` and ``previews``; each preview has ``page``,
+    ``index`` and a PNG ``data`` URI. Bad input gives 400, failures 500.
+    """
+    import base64
+
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['file']
+    # `not file.filename` covers both '' and a missing (None) filename.
+    if not file.filename or not file.filename.lower().endswith('.pdf'):
+        return jsonify({'error': 'Valid PDF required'}), 400
+
+    doc = None
+    try:
+        doc = fitz.open(stream=file.read(), filetype="pdf")
+        previews = []
+        for page_num in range(min(len(doc), 3)):
+            image_list = doc[page_num].get_images(full=True)
+            for img_index, img in enumerate(image_list[:3]):
+                try:
+                    pix = fitz.Pixmap(doc, img[0])  # img[0] is the xref
+                    if pix.n - pix.alpha >= 4:
+                        # 4+ colour components (e.g. CMYK): go via RGB.
+                        pix = fitz.Pixmap(fitz.csRGB, pix)
+                    img_data = pix.tobytes("png")
+                    pix = None  # drop the pixmap before the next image
+                except Exception as e:
+                    # Skip an undecodable image instead of failing them all.
+                    print(f"Error: {str(e)}")
+                    continue
+                b64 = base64.b64encode(img_data).decode('utf-8')
+                previews.append({
+                    'page': page_num + 1,
+                    'index': img_index + 1,
+                    'data': f'data:image/png;base64,{b64}'
+                })
+
+        return jsonify({
+            'success': True,
+            'total_previews': len(previews),
+            'previews': previews
+        })
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+    finally:
+        if doc is not None:
+            doc.close()  # also runs on errors, so the document never leaks
diff --git a/backend/blueprints/progress.py b/backend/blueprints/progress.py
@@ -0,0 +1,117 @@
+"""
+SSE (Server-Sent Events) Blueprint for Real-Time Progress
+"""
+
+import time
+import json
+from flask import Blueprint, Response, request, stream_with_context
+from utils.progress_manager import progress_manager
+
+progress_bp = Blueprint('progress', __name__)  # progress (SSE + JSON) routes
+
+
+@progress_bp.route('/progress/<task_id>')
+def progress_stream(task_id: str):
+    """Stream progress for ``task_id`` as Server-Sent Events.
+
+    Sends a ``data:`` event whenever ``percent`` changes and stops after a
+    ``complete``/``error`` status. If the task stays unknown for ~10 s an
+    error event is sent and the stream ends; idle periods emit SSE comments
+    so the server gets a chance to notice a dropped client.
+    """
+    def generate():
+        last_percent = -1
+        missing = idle = 0  # consecutive polls without a task / an event
+        while True:
+            progress = progress_manager.get_progress(task_id)
+            missing = 0 if progress else missing + 1
+            if progress:
+                current_percent = progress.get('percent', 0)
+                finished = progress.get('status') in ['complete', 'error']
+                if current_percent != last_percent or finished:
+                    last_percent, idle = current_percent, 0
+                    yield f"data: {json.dumps(progress)}\n\n"
+                    if finished:
+                        break
+            elif missing >= 20:  # unknown task: stop instead of spinning
+                payload = {'status': 'error', 'error': 'Task not found'}
+                yield f"data: {json.dumps(payload)}\n\n"
+                break
+            idle += 1
+            if idle % 30 == 0:  # ~15 s of silence: send an SSE comment
+                yield ": keep-alive\n\n"
+            time.sleep(0.5)  # wait before next check (don't flood)
+
+    return Response(
+        stream_with_context(generate()),
+        mimetype='text/event-stream',
+        headers={'Cache-Control': 'no-cache', 'X-Accel-Buffering': 'no'},  # no nginx buffering
+    )
+
+
+@progress_bp.route('/progress/<task_id>/status')
+def get_progress_status(task_id: str):
+    """Return the task's current progress as JSON, or a 404 error body."""
+    from flask import jsonify
+
+    snapshot = progress_manager.get_progress(task_id)
+    if not snapshot:
+        # Nothing is recorded for this task id: report it as not found.
+        return jsonify({'error': 'Task not found'}), 404
+    return jsonify(snapshot)
+
+
+@progress_bp.route('/convert-pdf-progress', methods=['POST'])
+def convert_pdf_with_progress():
+    """Example endpoint: start a simulated PDF-to-PNG job with progress.
+
+    Validates the multipart ``file`` field, creates a progress task and
+    runs the simulated 10-page conversion in a background thread. Returns
+    JSON with ``task_id``, ``stream_url`` and ``status_url``. The uploaded
+    file itself is not read yet.
+    """
+    import threading
+    from flask import jsonify, url_for
+
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['file']
+    if not file.filename:  # covers both '' and a missing (None) filename
+        return jsonify({'error': 'No file selected'}), 400
+
+    # Create task for progress tracking
+    task_id = progress_manager.create_task()
+
+    def process():
+        """Simulate the conversion, reporting progress after each page."""
+        try:
+            total_pages = 10  # In real code, get actual page count
+            progress_manager.update(task_id, 0, total_pages, "Starting conversion...")
+
+            for page in range(1, total_pages + 1):
+                # Simulate work
+                time.sleep(0.5)
+                progress_manager.update(
+                    task_id,
+                    page,
+                    total_pages,
+                    f"Processing page {page} of {total_pages}..."
+                )
+
+            progress_manager.complete(task_id, "/download/result.zip")
+        except Exception as e:
+            # Report the failure through the progress manager.
+            progress_manager.error(task_id, str(e))
+
+    # Run in the background so the request can return immediately.
+    thread = threading.Thread(target=process)
+    thread.start()
+
+    # Build the URLs from the routes so they stay correct when the
+    # blueprint is registered under a url_prefix.
+    return jsonify({
+        'task_id': task_id,
+        'stream_url': url_for('.progress_stream', task_id=task_id),
+        'status_url': url_for('.get_progress_status', task_id=task_id)
+    })
diff --git a/backend/main.py b/backend/main.py
@@ -1,10 +1,14 @@
 from app import create_app
-
+from blueprints.pdf_extract_images import pdf_extract_images_bp
+from blueprints.progress import progress_bp
 import os
 
 app = create_app()
 
- # ← moved AFTER app is created
+# ✅ Register the blueprints here (immediately after the app is created)
+app.register_blueprint(pdf_extract_images_bp)
+
+app.register_blueprint(progress_bp)
 
 if __name__ == "__main__":
     port = int(os.getenv("PORT", "5000"))

diff --git a/backend/test_extract.py b/backend/test_extract.py
@@ -0,0 +1,79 @@
+import fitz  # PyMuPDF
+import io
+import zipfile
+import os
+
+def test_extract(pdf_path):
+    """Manually check PDF image extraction without Flask.
+
+    Opens ``pdf_path``, re-encodes every embedded image as PNG and, if any
+    were found, writes them plus a short report to ``extracted_images.zip``
+    in the current working directory. Progress is printed to stdout.
+
+    Args:
+        pdf_path: Path of the PDF file to inspect.
+
+    NOTE(review): the ``test_`` prefix presumably makes pytest collect this
+    helper, where the ``pdf_path`` argument cannot be supplied — confirm.
+    """
+    if not os.path.exists(pdf_path):
+        print(f"❌ File not found: {pdf_path}")
+        return
+
+    # Read PDF
+    with open(pdf_path, 'rb') as f:
+        pdf_bytes = f.read()
+
+    # Open PDF from memory
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+    total_images = 0
+
+    # Create ZIP in memory
+    zip_buffer = io.BytesIO()
+
+    try:
+        print(f"✅ PDF opened: {pdf_path}")
+        print(f"📄 Total pages: {len(doc)}")
+
+        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            for page_num in range(len(doc)):
+                image_list = doc[page_num].get_images(full=True)
+                print(f"  Page {page_num + 1}: {len(image_list)} images found")
+
+                for img_index, img in enumerate(image_list):
+                    try:
+                        pix = fitz.Pixmap(doc, img[0])  # img[0] is the xref
+                        if pix.n - pix.alpha >= 4:
+                            # 4+ colour components (e.g. CMYK): go via RGB.
+                            pix = fitz.Pixmap(fitz.csRGB, pix)
+                        img_data = pix.tobytes("png")
+                        pix = None  # drop the pixmap before the next image
+
+                        img_filename = f"page{page_num+1}_img{img_index+1}.png"
+                        zip_file.writestr(img_filename, img_data)
+                        total_images += 1
+                        print(f"    ✅ Extracted: {img_filename}")
+                    except Exception as e:
+                        # Keep going: one bad image should not end the run.
+                        print(f"    ❌ Error: {e}")
+
+            # Add report
+            report = f"Extracted {total_images} images from {pdf_path}"
+            zip_file.writestr("report.txt", report)
+    finally:
+        # Close the document even if extraction fails part-way.
+        doc.close()
+
+    if total_images > 0:
+        # Save ZIP file
+        zip_buffer.seek(0)
+        with open("extracted_images.zip", "wb") as f:
+            f.write(zip_buffer.read())
+        print(f"\n✅ ZIP created: extracted_images.zip ({total_images} images)")
+    else:
+        print("\n⚠️ No images found in PDF")
+
+if __name__ == "__main__":
+    # Prompt for the PDF path, then trim whitespace and any double quotes
+    # wrapped around it (presumably added when a file is drag-dropped).
+    test_extract(input("Enter PDF path (or drag-drop PDF here): ").strip().strip('"'))