diff --git a/backend/blueprints/pdf_extract_images.py b/backend/blueprints/pdf_extract_images.py
new file mode 100644
index 0000000..e7b6633
--- /dev/null
+++ b/backend/blueprints/pdf_extract_images.py
@@ -0,0 +1,197 @@
+"""
+PDF Embedded Images Extractor
+
+Pulls the raster images embedded in a PDF and serves them either bundled in
+a ZIP download or as a small base64 preview. Every image is decoded through
+a PyMuPDF Pixmap and written out as PNG.
+"""
+
+import base64
+import io
+import logging
+import zipfile
+
+import fitz  # PyMuPDF
+from flask import Blueprint, request, jsonify, send_file
+from werkzeug.utils import secure_filename
+
+logger = logging.getLogger(__name__)
+
+pdf_extract_images_bp = Blueprint('pdf_extract_images', __name__)
+
+# Preview limits: how many leading pages are scanned, and how many images
+# are taken from each of them.
+PREVIEW_MAX_PAGES = 3
+PREVIEW_MAX_IMAGES_PER_PAGE = 3
+
+
+def _strip_pdf_suffix(filename):
+    """Return *filename* without a trailing ``.pdf`` (any letter case)."""
+    if filename.lower().endswith('.pdf'):
+        return filename[:-4]
+    return filename
+
+
+def _image_to_png(doc, xref):
+    """Return the image stored at *xref* in *doc* encoded as PNG bytes."""
+    pix = fitz.Pixmap(doc, xref)
+    # Pixmaps with four or more colour components besides alpha (e.g. CMYK)
+    # are converted to RGB before being written as PNG.
+    if pix.n - pix.alpha >= 4:
+        pix = fitz.Pixmap(fitz.csRGB, pix)
+    return pix.tobytes("png")
+
+
+def extract_images_from_pdf(pdf_bytes, original_filename="document"):
+    """Extract all embedded images from a PDF file.
+
+    Args:
+        pdf_bytes: Raw content of the PDF.
+        original_filename: Name of the source file; it prefixes the image
+            names inside the archive and is quoted in the report.
+
+    Returns:
+        ``(zip_buffer, total_images, image_names)`` on success, where
+        ``zip_buffer`` is an ``io.BytesIO`` rewound to its start, or
+        ``(error_message, None, None)`` when nothing could be extracted.
+    """
+    try:
+        # Loop-invariant, so computed once. The suffix test ignores case
+        # because the upload routes accept ``.PDF`` as well as ``.pdf``.
+        base_name = _strip_pdf_suffix(secure_filename(original_filename)) or "document"
+        zip_buffer = io.BytesIO()
+        image_names = []
+
+        # Both context managers release their resource on every exit path,
+        # including the early returns below.
+        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+            if len(doc) == 0:
+                return "PDF file is empty or corrupted", None, None
+
+            with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+                for page_num, page in enumerate(doc, start=1):
+                    images = page.get_images(full=True)
+                    for img_index, img in enumerate(images, start=1):
+                        try:
+                            img_data = _image_to_png(doc, img[0])
+                        except Exception:
+                            # Best effort: one unreadable image must not
+                            # abort the whole extraction.
+                            logger.exception(
+                                "Skipping image %d on page %d", img_index, page_num
+                            )
+                            continue
+
+                        img_filename = f"{base_name}_page{page_num}_{img_index}.png"
+                        zip_file.writestr(img_filename, img_data)
+                        image_names.append(img_filename)
+
+                if not image_names:
+                    return "No embedded images found in this PDF", None, None
+
+                report_lines = [
+                    "Extracted Images Report",
+                    f"PDF File: {original_filename}",
+                    f"Total Images Found: {len(image_names)}",
+                    "Extracted Images:",
+                    *(f"- {name}" for name in image_names),
+                ]
+                zip_file.writestr(
+                    "extraction_report.txt", "\n".join(report_lines) + "\n"
+                )
+
+        zip_buffer.seek(0)
+        return zip_buffer, len(image_names), image_names
+
+    except Exception as e:
+        return f"Error processing PDF: {str(e)}", None, None
+
+
+@pdf_extract_images_bp.route('/extract-pdf-images', methods=['POST'])
+def extract_images():
+    """Return a ZIP archive of the images embedded in the uploaded PDF.
+
+    Expects a multipart upload under the ``file`` field. Responds with the
+    archive as an attachment, with 400 and a JSON ``error`` for invalid
+    input or a PDF without extractable images, or with 500 on unexpected
+    failures.
+    """
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['file']
+
+    # Covers both an empty and a missing (None) filename.
+    if not file.filename:
+        return jsonify({'error': 'No file selected'}), 400
+
+    if not file.filename.lower().endswith('.pdf'):
+        return jsonify({'error': 'File must be a PDF'}), 400
+
+    try:
+        result, count, _names = extract_images_from_pdf(file.read(), file.filename)
+
+        if count is None:
+            return jsonify({'error': result}), 400
+
+        return send_file(
+            result,
+            mimetype='application/zip',
+            as_attachment=True,
+            download_name=f"{_strip_pdf_suffix(file.filename)}_extracted_images.zip"
+        )
+
+    except Exception as e:
+        return jsonify({'error': f'Server error: {str(e)}'}), 500
+
+
+@pdf_extract_images_bp.route('/preview-pdf-images', methods=['POST'])
+def preview_images():
+    """Return base64 PNG previews of the first few images in the uploaded PDF.
+
+    Only the first ``PREVIEW_MAX_PAGES`` pages are scanned and at most
+    ``PREVIEW_MAX_IMAGES_PER_PAGE`` images are taken from each page.
+    """
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file provided'}), 400
+
+    file = request.files['file']
+
+    if not file.filename or not file.filename.lower().endswith('.pdf'):
+        return jsonify({'error': 'Valid PDF required'}), 400
+
+    try:
+        previews = []
+
+        # The context manager closes the document even if reading fails.
+        with fitz.open(stream=file.read(), filetype="pdf") as doc:
+            for page_num in range(min(len(doc), PREVIEW_MAX_PAGES)):
+                images = doc[page_num].get_images(full=True)
+
+                for img_index, img in enumerate(images[:PREVIEW_MAX_IMAGES_PER_PAGE]):
+                    try:
+                        img_data = _image_to_png(doc, img[0])
+                    except Exception:
+                        # Best effort: a broken image is left out of the
+                        # preview instead of failing the whole request.
+                        logger.exception(
+                            "Skipping preview of image %d on page %d",
+                            img_index + 1, page_num + 1
+                        )
+                        continue
+
+                    b64 = base64.b64encode(img_data).decode('utf-8')
+                    previews.append({
+                        'page': page_num + 1,
+                        'index': img_index + 1,
+                        'data': f'data:image/png;base64,{b64}'
+                    })
+
+        return jsonify({
+            'success': True,
+            'total_previews': len(previews),
+            'previews': previews
+        })
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
\ No newline at end of file
diff --git a/backend/main.py b/backend/main.py
index 8d26f32..41710ad 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -1,10 +1,11 @@
 from app import create_app
-
+from blueprints.pdf_extract_images import pdf_extract_images_bp
 import os
 
 app = create_app()
 
- # ✅ moved AFTER app is created
+# Register the blueprint here (right after the app is created)
+app.register_blueprint(pdf_extract_images_bp)
 
 if __name__ == "__main__":
     port = int(os.getenv("PORT", "5000"))
diff --git a/backend/test_extract.py b/backend/test_extract.py
new file mode 100644
index 0000000..4108c3b
--- /dev/null
+++ b/backend/test_extract.py
@@ -0,0 +1,79 @@
+import fitz  # PyMuPDF
+import io
+import zipfile
+import os
+
+def test_extract(pdf_path):
+    """Test PDF image extraction without Flask"""
+
+    if not os.path.exists(pdf_path):
+        print(f"❌ File not found: {pdf_path}")
+        return
+
+    # Read PDF
+    with open(pdf_path, 'rb') as f:
+        pdf_bytes = f.read()
+
+    # Open PDF from memory
+    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+
+    print(f"✅ PDF opened: {pdf_path}")
+    print(f"📄 Total pages: {len(doc)}")
+
+    total_images = 0
+
+    # Create ZIP in memory
+    zip_buffer = io.BytesIO()
+
+    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            image_list = page.get_images(full=True)
+
+            print(f"  Page {page_num + 1}: {len(image_list)} images found")
+
+            for img_index, img in enumerate(image_list):
+                xref = img[0]
+
+                try:
+                    pix = fitz.Pixmap(doc, xref)
+
+                    if pix.n - pix.alpha < 4:
+                        img_data = pix.tobytes("png")
+                        ext = "png"
+                    else:
+                        pix = fitz.Pixmap(fitz.csRGB, pix)
+                        img_data = pix.tobytes("png")
+                        ext = "png"
+
+                    pix = None
+
+                    img_filename = f"page{page_num+1}_img{img_index+1}.{ext}"
+                    zip_file.writestr(img_filename, img_data)
+                    total_images += 1
+
+                    print(f"    ✅ Extracted: {img_filename}")
+
+                except Exception as e:
+                    print(f"    ❌ Error: {e}")
+
+        doc.close()
+
+        # Add report
+        report = f"Extracted {total_images} images from {pdf_path}"
+        zip_file.writestr("report.txt", report)
+
+    if total_images > 0:
+        # Save ZIP file
+        zip_buffer.seek(0)
+        with open("extracted_images.zip", "wb") as f:
+            f.write(zip_buffer.read())
+        print(f"\n✅ ZIP created: extracted_images.zip ({total_images} images)")
+    else:
+        print("\n⚠️ No images found in PDF")
+
+if __name__ == "__main__":
+    # Ask for PDF path
+    pdf_input = input("Enter PDF path (or drag-drop PDF here): ").strip().strip('"')
+    test_extract(pdf_input)
\ No newline at end of file
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
index abcd8d2..4c8ca99 100644
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@@ -5,6 +5,7 @@
 import ScrollToTop from "./components/ScrollToTop";
 import Layout from "./components/Layout/Layout";
 import ErrorBoundary from "./ErrorBoundary";
+import PdfExtractImages from './pages/PdfExtractImages';
 
 const UrlToQr = lazy(() => import("./pages/UrlToQr"));
 
@@ -56,6 +57,8 @@ function App() {
         {/* The Landing Page has its own clean view */}
         } />
+        } />
+
         {/* Informational pages (Navbar + Footer wrapper, no tool sidebar) */}
         } />
         } />
diff --git a/frontend/src/data/toolsData.jsx b/frontend/src/data/toolsData.jsx
index 3fdd97d..c556f25 100644
--- a/frontend/src/data/toolsData.jsx
+++ b/frontend/src/data/toolsData.jsx
@@ -315,7 +315,18 @@ const tools = [
     path: "/url-to-qr",
     gradient: "from-emerald-500/10 to-green-500/10",
     iconGradient: "from-emerald-500 to-green-500",
-  }
+  },
+
+{
+  id: 'pdf-extract-images',
+  name: 'Extract PDF Images',
+  category: 'PDF Tools',
+  description: 'Extract all embedded images from PDF files',
+  icon: '🖼️',
+  path: '/pdf/extract-images',
+  component: lazy(() => import('../pages/PdfExtractImages')),
+  comingSoon: false,
+},
 ];
 
 
diff --git a/frontend/src/pages/PdfExtractImages.jsx b/frontend/src/pages/PdfExtractImages.jsx
new file mode 100644
index 0000000..fb8a99c
--- /dev/null
+++ b/frontend/src/pages/PdfExtractImages.jsx
@@ -0,0 +1,146 @@
+import React, { useState } from 'react';
+import ToolPageTemplate from '../components/ToolPageTemplate';
+import FileUploadArea from '../components/FileUploadArea';
+import axios from 'axios';
+
+const PdfExtractImages = () => {
+  const [file, setFile] = useState(null);
+  const [loading, setLoading] = useState(false);
+  const [previews, setPreviews] = useState([]);
+  const [error, setError] = useState(null);
+
+  const API_URL = import.meta.env.VITE_API_URL || 'http://localhost:5000';
+
+  const handleFileSelect = async (selectedFile) => {
+    setFile(selectedFile);
+    setError(null);
+    await loadPreview(selectedFile);
+  };
+
+  const loadPreview = async (fileToPreview) => {
+    const formData = new FormData();
+    formData.append('file', fileToPreview);
+
+    try {
+      const response = await axios.post(`${API_URL}/preview-pdf-images`, formData, {
+        headers: { 'Content-Type': 'multipart/form-data' }
+      });
+
+      if (response.data.success) {
+        setPreviews(response.data.previews);
+      } else {
+        setError(response.data.error || 'No images to preview');
+      }
+    } catch (err) {
+      setError('Could not load preview. PDF may have no images.');
+    }
+  };
+
+  const handleExtract = async () => {
+    if (!file) {
+      setError('Please select a PDF file first');
+      return;
+    }
+
+    setLoading(true);
+    setError(null);
+
+    const formData = new FormData();
+    formData.append('file', file);
+
+    try {
+      const response = await axios.post(`${API_URL}/extract-pdf-images`, formData, {
+        responseType: 'blob',
+        headers: { 'Content-Type': 'multipart/form-data' }
+      });
+
+      const url = window.URL.createObjectURL(new Blob([response.data]));
+      const link = document.createElement('a');
+      link.href = url;
+      link.setAttribute('download', `${file.name.replace('.pdf', '')}_extracted_images.zip`);
+      document.body.appendChild(link);
+      link.click();
+      link.remove();
+      window.URL.revokeObjectURL(url);
+
+    } catch (err) {
+      if (err.response && err.response.data instanceof Blob) {
+        const text = await err.response.data.text();
+        try {
+          const errorJson = JSON.parse(text);
+          setError(errorJson.error || 'Extraction failed');
+        } catch {
+          setError('Extraction failed. Please check the PDF format.');
+        }
+      } else {
+        setError(err.response?.data?.error || 'Something went wrong');
+      }
+    } finally {
+      setLoading(false);
+    }
+  };
+
+  return (
+
+
+
+
+
+
+      {previews.length > 0 && (
+
+
+            📷 Preview Images Found
+
+
+            {previews.map((preview, idx) => (
+
+
+
+                  Page {preview.page} • Image {preview.index}
+
+
+            ))}
+
+
+      )}
+
+      {file && (
+
+
+            {loading ? 'Extracting Images...' : '📥 Extract All Images (ZIP)'}
+
+
+      )}
+
+      {error && (
+
+          {error}
+
+      )}
+
+
+  );
+};
+
+export default PdfExtractImages;
\ No newline at end of file
+ Page {preview.page} ⢠Image {preview.index} +
{error}