Durgeshwar-AI · BhakktiGautam · Jun 14, 2026
diff --git a/backend/blueprints/pdf_extract_images.py b/backend/blueprints/pdf_extract_images.py
@@ -0,0 +1,201 @@
+"""
+PDF Embedded Images Extractor
+
+Extracts the raster images embedded in a PDF file and bundles them in a ZIP
+archive. Each image is decoded and re-encoded as PNG, so the output is not
+byte-identical to the image stream stored inside the PDF.
+"""
+
+import base64
+import io
+import logging
+import os
+import zipfile
+
+import fitz  # PyMuPDF
+from flask import Blueprint, request, jsonify, send_file
+from werkzeug.utils import secure_filename
+
+logger = logging.getLogger(__name__)
+
+pdf_extract_images_bp = Blueprint('pdf_extract_images', __name__)
+
+# Preview endpoint limits: pages scanned and images taken from each page.
+PREVIEW_MAX_PAGES = 3
+PREVIEW_MAX_IMAGES_PER_PAGE = 3
+
+
+def _safe_base_name(filename, default="document"):
+    """Return a sanitised stem for *filename* with a trailing '.pdf' removed.
+
+    The extension is matched case-insensitively and only at the end of the
+    name. Falls back to *default* when sanitising leaves nothing usable.
+    """
+    safe_name = secure_filename(filename or "")
+    stem, ext = os.path.splitext(safe_name)
+    if ext.lower() != ".pdf":
+        stem = safe_name
+    return stem or default
+
+
+def _image_png_bytes(doc, xref):
+    """Return the image stored at *xref* in *doc* encoded as PNG bytes."""
+    pix = fitz.Pixmap(doc, xref)
+    # Four or more colour components (e.g. CMYK) cannot be written as PNG,
+    # so convert such pixmaps to RGB first.
+    if pix.n - pix.alpha >= 4:
+        pix = fitz.Pixmap(fitz.csRGB, pix)
+    return pix.tobytes("png")
+
+
+def extract_images_from_pdf(pdf_bytes, original_filename="document"):
+    """Extract all embedded images from a PDF into an in-memory ZIP archive.
+
+    Args:
+        pdf_bytes: Raw bytes of the PDF file.
+        original_filename: Name of the source file; its sanitised stem
+            prefixes every image name and the name is quoted in the report.
+
+    Returns:
+        ``(zip_buffer, total_images, image_names)`` on success, where
+        ``zip_buffer`` is a ``BytesIO`` positioned at its start, or
+        ``(error_message, None, None)`` on failure.
+    """
+    try:
+        base_name = _safe_base_name(original_filename)
+        zip_buffer = io.BytesIO()
+        image_names = []
+
+        # Both context managers run on every exit path, so the document is
+        # closed even on an early return or an unexpected exception.
+        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc, \
+                zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            if len(doc) == 0:
+                return "PDF file is empty or corrupted", None, None
+
+            for page_num in range(len(doc)):
+                image_list = doc[page_num].get_images(full=True)
+
+                for img_index, img in enumerate(image_list, start=1):
+                    try:
+                        img_data = _image_png_bytes(doc, img[0])
+                    except Exception:
+                        # Best effort: skip an image that cannot be decoded
+                        # instead of failing the whole extraction.
+                        logger.warning(
+                            "Skipping image %d on page %d",
+                            img_index, page_num + 1, exc_info=True,
+                        )
+                        continue
+
+                    img_filename = f"{base_name}_page{page_num + 1}_{img_index}.png"
+                    zip_file.writestr(img_filename, img_data)
+                    image_names.append(img_filename)
+
+            if not image_names:
+                return "No embedded images found in this PDF", None, None
+
+            report_lines = [
+                "Extracted Images Report",
+                f"PDF File: {original_filename}",
+                f"Total Images Found: {len(image_names)}",
+                "Extracted Images:",
+                *(f"- {name}" for name in image_names),
+            ]
+            zip_file.writestr("extraction_report.txt", "\n".join(report_lines) + "\n")
+
+        zip_buffer.seek(0)
+        return zip_buffer, len(image_names), image_names
+
+    except Exception as e:
+        logger.exception("Failed to process PDF %r", original_filename)
+        return f"Error processing PDF: {str(e)}", None, None
+
+
+@pdf_extract_images_bp.route('/extract-pdf-images', methods=['POST'])
+def extract_images():
+    """Return a ZIP of the images embedded in the uploaded PDF.
+
+    Reads the upload from the multipart field ``file``. Responds with a JSON
+    error and status 400 when the upload is missing, is not named ``*.pdf``
+    or yields no archive, and status 500 on an unexpected exception.
+    """
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['file']
+
+    if not file.filename:
+        return jsonify({'error': 'No file selected'}), 400
+
+    if not file.filename.lower().endswith('.pdf'):
+        return jsonify({'error': 'File must be a PDF'}), 400
+
+    try:
+        result, count, _names = extract_images_from_pdf(file.read(), file.filename)
+
+        if count is None:
+            return jsonify({'error': result}), 400
+
+        return send_file(
+            result,
+            mimetype='application/zip',
+            as_attachment=True,
+            download_name=f"{_safe_base_name(file.filename)}_extracted_images.zip",
+        )
+
+    except Exception as e:
+        logger.exception("Image extraction request failed")
+        return jsonify({'error': f'Server error: {str(e)}'}), 500
+
+
+@pdf_extract_images_bp.route('/preview-pdf-images', methods=['POST'])
+def preview_images():
+    """Return base64 PNG previews of the first images in the uploaded PDF.
+
+    Scans at most ``PREVIEW_MAX_PAGES`` pages and takes at most
+    ``PREVIEW_MAX_IMAGES_PER_PAGE`` images from each. Images that cannot be
+    decoded are skipped.
+    """
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['file']
+
+    if not file.filename or not file.filename.lower().endswith('.pdf'):
+        return jsonify({'error': 'Valid PDF required'}), 400
+
+    try:
+        previews = []
+
+        with fitz.open(stream=file.read(), filetype="pdf") as doc:
+            for page_num in range(min(len(doc), PREVIEW_MAX_PAGES)):
+                image_list = doc[page_num].get_images(full=True)
+                page_images = image_list[:PREVIEW_MAX_IMAGES_PER_PAGE]
+
+                for img_index, img in enumerate(page_images, start=1):
+                    try:
+                        img_data = _image_png_bytes(doc, img[0])
+                    except Exception:
+                        logger.warning(
+                            "Skipping preview of image %d on page %d",
+                            img_index, page_num + 1, exc_info=True,
+                        )
+                        continue
+
+                    b64 = base64.b64encode(img_data).decode('utf-8')
+                    previews.append({
+                        'page': page_num + 1,
+                        'index': img_index,
+                        'data': f'data:image/png;base64,{b64}'
+                    })
+
+        return jsonify({
+            'success': True,
+            'total_previews': len(previews),
+            'previews': previews
+        })
+
+    except Exception as e:
+        logger.exception("Image preview request failed")
+        return jsonify({'error': str(e)}), 500
diff --git a/backend/main.py b/backend/main.py
@@ -1,10 +1,11 @@
 from app import create_app
-
+from blueprints.pdf_extract_images import pdf_extract_images_bp
 import os
 
 app = create_app()
 
- # ← moved AFTER app is created
+# Register the blueprint here, immediately after the app is created.
+app.register_blueprint(pdf_extract_images_bp)
 
 if __name__ == "__main__":
     port = int(os.getenv("PORT", "5000"))

diff --git a/backend/test_extract.py b/backend/test_extract.py
@@ -0,0 +1,78 @@
+"""Manual smoke test for PDF image extraction (no Flask involved).
+
+Run it as a script: it asks for a PDF path and writes the extracted images
+to ``extracted_images.zip`` in the current working directory.
+"""
+
+import io
+import os
+import zipfile
+
+import fitz  # PyMuPDF
+
+# This is an interactive script, not an automated test. The file and function
+# names match pytest's default discovery patterns, so opt out of collection.
+__test__ = False
+
+OUTPUT_ZIP = "extracted_images.zip"
+
+
+def test_extract(pdf_path):
+    """Extract the images of *pdf_path* into ``extracted_images.zip``.
+
+    Progress is printed to stdout. The archive is written only when at least
+    one image was extracted.
+    """
+    if not os.path.exists(pdf_path):
+        print(f"❌ File not found: {pdf_path}")
+        return
+
+    with open(pdf_path, 'rb') as f:
+        pdf_bytes = f.read()
+
+    total_images = 0
+    zip_buffer = io.BytesIO()
+
+    # The context managers close the document and finalise the archive even
+    # when an unexpected error interrupts the extraction.
+    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc, \
+            zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+        print(f"✅ PDF opened: {pdf_path}")
+        print(f"📄 Total pages: {len(doc)}")
+
+        for page_num in range(len(doc)):
+            image_list = doc[page_num].get_images(full=True)
+            print(f"  Page {page_num + 1}: {len(image_list)} images found")
+
+            for img_index, img in enumerate(image_list, start=1):
+                try:
+                    pix = fitz.Pixmap(doc, img[0])
+                    # Four or more colour components (e.g. CMYK) cannot be
+                    # written as PNG, so convert to RGB first.
+                    if pix.n - pix.alpha >= 4:
+                        pix = fitz.Pixmap(fitz.csRGB, pix)
+                    img_data = pix.tobytes("png")
+                except Exception as e:
+                    print(f"    ❌ Error: {e}")
+                    continue
+
+                img_filename = f"page{page_num + 1}_img{img_index}.png"
+                zip_file.writestr(img_filename, img_data)
+                total_images += 1
+                print(f"    ✅ Extracted: {img_filename}")
+
+        report = f"Extracted {total_images} images from {pdf_path}"
+        zip_file.writestr("report.txt", report)
+
+    if total_images > 0:
+        with open(OUTPUT_ZIP, "wb") as f:
+            f.write(zip_buffer.getvalue())
+        print(f"\n✅ ZIP created: {OUTPUT_ZIP} ({total_images} images)")
+    else:
+        print("\n⚠️ No images found in PDF")
+
+
+if __name__ == "__main__":
+    # A drag-dropped path may arrive wrapped in double quotes; strip them.
+    pdf_input = input("Enter PDF path (or drag-drop PDF here): ").strip().strip('"')
+    test_extract(pdf_input)
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
@@ -5,6 +5,7 @@ import ScrollToTop from "./components/ScrollToTop";
 
 import Layout from "./components/Layout/Layout";
 import ErrorBoundary from "./ErrorBoundary";
+import PdfExtractImages from './pages/PdfExtractImages';
 
 const UrlToQr = lazy(() => import("./pages/UrlToQr"));
 
@@ -56,6 +57,8 @@ function App() {
           {/* The Landing Page has its own clean view */}
           <Route path="/" element={<LandingPage />} />
 
+          <Route path="/pdf/extract-images" element={<PdfExtractImages />} />
+
           {/* Informational pages (Navbar + Footer wrapper, no tool sidebar) */}
           <Route path="/about" element={<About />} />
           <Route path="/privacy" element={<Privacy />} />

diff --git a/frontend/src/data/toolsData.jsx b/frontend/src/data/toolsData.jsx
@@ -315,7 +315,18 @@ const tools = [
     path: "/url-to-qr",
     gradient: "from-emerald-500/10 to-green-500/10",
     iconGradient: "from-emerald-500 to-green-500",
-  }
+  },
+
+{
+  id: 'pdf-extract-images',
+  name: 'Extract PDF Images',
+  category: 'PDF Tools',
+  description: 'Extract all embedded images from PDF files',
+  icon: '🖼️',
+  path: '/pdf/extract-images',
+  component: lazy(() => import('../pages/PdfExtractImages')),
+  comingSoon: false,
+},
 
 ];