Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions backend/blueprints/pdf_extract_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""
PDF Embedded Images Extractor
Extracts raw raster images (JPEG/PNG) from PDF files without re-compression
"""

import fitz # PyMuPDF
import io
import zipfile
from flask import Blueprint, request, jsonify, send_file
from werkzeug.utils import secure_filename

pdf_extract_images_bp = Blueprint('pdf_extract_images', __name__)


def extract_images_from_pdf(pdf_bytes, original_filename="document"):
    """Extract every embedded image of a PDF into an in-memory ZIP archive.

    Each image is loaded as a PyMuPDF ``Pixmap`` and stored as a PNG member.
    Pixmaps with four or more colour components (``pix.n - pix.alpha >= 4``)
    are converted to RGB before encoding. An image that fails to decode or
    store is reported on stdout and skipped, so one bad object does not abort
    the whole extraction. A plain-text ``extraction_report.txt`` listing the
    members is appended to the archive.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        original_filename: Name of the source file. With a trailing ``.pdf``
            (any case) removed and sanitised by ``secure_filename``, it is the
            prefix of every archive member; it is also quoted verbatim in the
            report.

    Returns:
        A 3-tuple. On success ``(zip_buffer, total_images, image_names)``,
        where ``zip_buffer`` is an ``io.BytesIO`` rewound to position 0.
        On failure ``(error_message, None, None)``.
    """
    doc = None
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        if len(doc) == 0:
            return "PDF file is empty or corrupted", None, None

        # Compute the member-name prefix once. Strip only a *trailing*
        # extension, case-insensitively, so "Scan.PDF" and "a.pdf.b.pdf"
        # behave sensibly, and fall back to a fixed stem when sanitising
        # leaves nothing usable.
        stem = original_filename or ""
        if stem.lower().endswith(".pdf"):
            stem = stem[:-4]
        base_name = secure_filename(stem) or "document"

        zip_buffer = io.BytesIO()
        image_names = []

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for page_num, page in enumerate(doc, start=1):
                image_list = page.get_images(full=True)

                for img_index, img in enumerate(image_list, start=1):
                    xref = img[0]

                    try:
                        pix = fitz.Pixmap(doc, xref)

                        # Four or more colour components: convert to RGB
                        # before PNG encoding.
                        if pix.n - pix.alpha >= 4:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")
                        pix = None  # drop the pixmap reference before the next image

                        img_filename = f"{base_name}_page{page_num}_{img_index}.png"
                        zip_file.writestr(img_filename, img_data)
                        image_names.append(img_filename)

                    except Exception as e:
                        # Best effort: report the failure and move on.
                        print(f"Error: {str(e)}")
                        continue

            total_images = len(image_names)
            if total_images == 0:
                return "No embedded images found in this PDF", None, None

            listing = "\n".join(f"- {name}" for name in image_names)
            metadata = (
                "Extracted Images Report\n"
                f"PDF File: {original_filename}\n"
                f"Total Images Found: {total_images}\n"
                "Extracted Images:\n"
                f"{listing}\n"
            )
            zip_file.writestr("extraction_report.txt", metadata)

        # The archive is finalised when the ``with`` block exits; rewind so
        # the caller can stream it from the start.
        zip_buffer.seek(0)
        return zip_buffer, total_images, image_names

    except Exception as e:
        return f"Error processing PDF: {str(e)}", None, None

    finally:
        # Release the document on every path, including early returns and errors.
        if doc is not None:
            doc.close()


@pdf_extract_images_bp.route('/extract-pdf-images', methods=['POST'])
def extract_images():
    """Handle a PDF upload and respond with a ZIP of its embedded images.

    Expects a multipart form field named ``file`` whose filename ends in
    ``.pdf`` (case-insensitive).

    Returns:
        The ZIP archive as an attachment on success. Otherwise a JSON body
        ``{'error': <message>}`` with status 400 (missing/invalid upload, or a
        failure reported by ``extract_images_from_pdf``) or 500 (unexpected
        exception while handling the request).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']

    # Treat a missing (None) filename the same as an empty one so the
    # ``.lower()`` call below cannot raise.
    if not file.filename:
        return jsonify({'error': 'No file selected'}), 400

    if not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'File must be a PDF'}), 400

    try:
        pdf_bytes = file.read()
        result, count, _names = extract_images_from_pdf(pdf_bytes, file.filename)

        # A ``None`` count signals failure; ``result`` then holds the message.
        if count is None:
            return jsonify({'error': result}), 400

        # Strip only the trailing extension, regardless of case; the check
        # above guarantees the name ends with one.
        stem = file.filename[:-4]

        return send_file(
            result,
            mimetype='application/zip',
            as_attachment=True,
            download_name=f"{stem}_extracted_images.zip"
        )

    except Exception as e:
        return jsonify({'error': f'Server error: {str(e)}'}), 500


@pdf_extract_images_bp.route('/preview-pdf-images', methods=['POST'])
def preview_images():
    """Return base64 PNG previews of the first few embedded images of a PDF.

    Looks at no more than the first 3 pages and the first 3 images listed on
    each of them, so a response carries at most 9 previews. Images that fail
    to decode are reported on stdout and skipped.

    Expects a multipart form field named ``file`` whose filename ends in
    ``.pdf`` (case-insensitive).

    Returns:
        JSON ``{'success': True, 'total_previews': n, 'previews': [...]}``
        where each preview has ``page`` (1-based), ``index`` (1-based position
        in the page's image list) and ``data`` (a ``data:image/png;base64,``
        URI). On error, JSON ``{'error': <message>}`` with status 400 or 500.
    """
    import base64

    max_pages = 3
    max_images_per_page = 3

    if 'file' not in request.files:
        return jsonify({'error': 'No file provided'}), 400

    file = request.files['file']

    # ``not file.filename`` also covers a missing (None) filename.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Valid PDF required'}), 400

    try:
        pdf_bytes = file.read()
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        try:
            previews = []

            for page_num in range(min(len(doc), max_pages)):
                page = doc[page_num]
                image_list = page.get_images(full=True)

                for img_index, img in enumerate(image_list[:max_images_per_page]):
                    xref = img[0]

                    try:
                        pix = fitz.Pixmap(doc, xref)

                        # Four or more colour components: convert to RGB
                        # before PNG encoding.
                        if pix.n - pix.alpha >= 4:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")
                        pix = None  # drop the pixmap reference before the next image
                    except Exception as e:
                        # Match the extractor: skip an undecodable image
                        # instead of failing the whole preview.
                        print(f"Error: {str(e)}")
                        continue

                    b64 = base64.b64encode(img_data).decode('utf-8')
                    previews.append({
                        'page': page_num + 1,
                        'index': img_index + 1,
                        'data': f'data:image/png;base64,{b64}'
                    })
        finally:
            # Release the document even when reading a page raises.
            doc.close()

        return jsonify({
            'success': True,
            'total_previews': len(previews),
            'previews': previews
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500
5 changes: 3 additions & 2 deletions backend/main.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from app import create_app

from blueprints.pdf_extract_images import pdf_extract_images_bp
import os

app = create_app()

# Blueprint registration was moved to AFTER the app is created.
# Register the blueprint here (immediately after the app is created).
app.register_blueprint(pdf_extract_images_bp)

if __name__ == "__main__":
port = int(os.getenv("PORT", "5000"))
Expand Down
79 changes: 79 additions & 0 deletions backend/test_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import fitz # PyMuPDF
import io
import zipfile
import os

def test_extract(pdf_path, output_path="extracted_images.zip"):
    """Test PDF image extraction without Flask.

    Reads the PDF at ``pdf_path``, converts every embedded image to PNG,
    collects the results plus a one-line ``report.txt`` in an in-memory ZIP,
    and writes that ZIP to ``output_path`` when at least one image was
    extracted. Progress and per-image failures are printed to stdout; a
    failing image is skipped rather than aborting the run.

    Args:
        pdf_path: Filesystem path of the PDF to inspect.
        output_path: Where to write the resulting ZIP archive. Defaults to
            ``extracted_images.zip`` in the current working directory.

    Returns:
        None. Results are reported via stdout and the written file.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ File not found: {pdf_path}")
        return

    # Read PDF
    with open(pdf_path, 'rb') as f:
        pdf_bytes = f.read()

    # Open PDF from memory
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")

    total_images = 0

    # Create ZIP in memory
    zip_buffer = io.BytesIO()

    try:
        print(f"✅ PDF opened: {pdf_path}")
        print(f"📄 Total pages: {len(doc)}")

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for page_num, page in enumerate(doc, start=1):
                image_list = page.get_images(full=True)

                print(f" Page {page_num}: {len(image_list)} images found")

                for img_index, img in enumerate(image_list, start=1):
                    xref = img[0]

                    try:
                        pix = fitz.Pixmap(doc, xref)

                        # Four or more colour components: convert to RGB
                        # before PNG encoding.
                        if pix.n - pix.alpha >= 4:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")
                        pix = None  # drop the pixmap reference before the next image

                        img_filename = f"page{page_num}_img{img_index}.png"
                        zip_file.writestr(img_filename, img_data)
                        total_images += 1

                        print(f" ✅ Extracted: {img_filename}")

                    except Exception as e:
                        print(f" ❌ Error: {e}")

            # Add report
            report = f"Extracted {total_images} images from {pdf_path}"
            zip_file.writestr("report.txt", report)
    finally:
        # Release the document even if extraction raised part-way through.
        doc.close()

    if total_images > 0:
        # Save ZIP file
        with open(output_path, "wb") as f:
            f.write(zip_buffer.getvalue())
        print(f"\n✅ ZIP created: {output_path} ({total_images} images)")
    else:
        print("\n⚠️ No images found in PDF")

if __name__ == "__main__":
# Ask for PDF path
pdf_input = input("Enter PDF path (or drag-drop PDF here): ").strip().strip('"')
test_extract(pdf_input)
3 changes: 3 additions & 0 deletions frontend/src/App.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import ScrollToTop from "./components/ScrollToTop";

import Layout from "./components/Layout/Layout";
import ErrorBoundary from "./ErrorBoundary";
import PdfExtractImages from './pages/PdfExtractImages';

const UrlToQr = lazy(() => import("./pages/UrlToQr"));

Expand Down Expand Up @@ -56,6 +57,8 @@ function App() {
{/* The Landing Page has its own clean view */}
<Route path="/" element={<LandingPage />} />

<Route path="/pdf/extract-images" element={<PdfExtractImages />} />

{/* Informational pages (Navbar + Footer wrapper, no tool sidebar) */}
<Route path="/about" element={<About />} />
<Route path="/privacy" element={<Privacy />} />
Expand Down
13 changes: 12 additions & 1 deletion frontend/src/data/toolsData.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,18 @@ const tools = [
path: "/url-to-qr",
gradient: "from-emerald-500/10 to-green-500/10",
iconGradient: "from-emerald-500 to-green-500",
}
},

{
id: 'pdf-extract-images',
name: 'Extract PDF Images',
category: 'PDF Tools',
description: 'Extract all embedded images from PDF files',
icon: 'πŸ–ΌοΈ',
path: '/pdf/extract-images',
component: lazy(() => import('../pages/PdfExtractImages')),
comingSoon: false,
},

];

Expand Down
Loading