def preprocess_image(pil_image, mode="balanced"):
    """Prepare a rendered page image for OCR.

    Args:
        pil_image: PIL image of the page. It is converted to RGB first, so
            any source mode is accepted.
        mode: Preprocessing preset. ``"none"`` only converts to RGB,
            ``"light"`` denoises the grayscale image, ``"strong"`` and
            ``"balanced"`` denoise and then binarize with an adaptive
            Gaussian threshold. Any other value is treated as
            ``"balanced"``.

    Returns:
        A new RGB PIL image ready to hand to the OCR engine.
    """
    rgb_image = pil_image.convert("RGB")

    # Return before building a NumPy array or calling OpenCV: the "none"
    # preset must not pay for (or depend on) work whose result is discarded.
    if mode == "none":
        return rgb_image

    gray = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2GRAY)

    if mode == "light":
        processed = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    else:
        # (denoise strength, threshold block size, threshold offset)
        threshold_presets = {
            "strong": (30, 31, 11),
            "balanced": (20, 25, 15),
        }
        strength, block_size, offset = threshold_presets.get(
            mode, threshold_presets["balanced"]
        )
        denoised = cv2.fastNlMeansDenoising(gray, None, strength, 7, 21)
        processed = cv2.adaptiveThreshold(
            denoised,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            block_size,
            offset,
        )

    return Image.fromarray(processed).convert("RGB")
# Pages are rendered at 2x; PDF user space is 72 units per inch, so the
# rendered bitmap has an effective resolution of 144 dpi.
_RENDER_SCALE = 2
_RENDER_DPI = 72 * _RENDER_SCALE
_PREPROCESS_MODES = frozenset({"none", "light", "balanced", "strong"})


def _is_valid_ocr_language(language):
    """Return True if *language* is a plain Tesseract spec like ``eng+fra``."""
    return all(
        code.isascii() and code.replace("_", "").isalnum()
        for code in language.split("+")
    )


@searchable_pdf_ocr_bp.route("/searchable-pdf-ocr", methods=["POST"])
def searchable_pdf_ocr():
    """Turn an uploaded PDF into a searchable PDF by OCR-ing every page.

    Expects a multipart form POST with:
        file: The PDF upload (required; the filename must end in ``.pdf``).
        language: Tesseract language spec such as ``eng`` or ``eng+fra``
            (default ``eng``). Only ASCII letters, digits and ``_`` are
            accepted in each ``+``-separated code.
        preprocess: ``none``, ``light``, ``balanced`` or ``strong``
            (default ``balanced``).

    Returns:
        The OCR output as an ``application/pdf`` attachment named
        ``<original name>_searchable.pdf``. Invalid input yields a JSON
        ``{"error": ...}`` body with status 400; a failure while rendering
        or running OCR yields the same shape with status 500.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded."}), 400

    file = request.files["file"]
    filename = file.filename or ""

    if not filename.lower().endswith(".pdf"):
        return jsonify({"error": "Please upload a PDF file."}), 400

    language = request.form.get("language", "eng").strip() or "eng"
    preprocess_mode = (
        request.form.get("preprocess", "balanced").strip() or "balanced"
    )

    if preprocess_mode not in _PREPROCESS_MODES:
        return jsonify({"error": "Invalid preprocessing mode."}), 400

    # The value is forwarded to the tesseract command line, so reject
    # anything that is not a plain language code instead of failing with a
    # 500 deep inside the OCR call.
    if not _is_valid_ocr_language(language):
        return jsonify({"error": "Invalid OCR language."}), 400

    source_doc = None
    output_doc = fitz.open()

    try:
        pdf_bytes = file.read()
        if not pdf_bytes:
            return jsonify({"error": "The uploaded file is empty."}), 400

        source_doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        if source_doc.needs_pass:
            return jsonify({"error": "The uploaded PDF is password protected."}), 400

        if source_doc.page_count == 0:
            return jsonify({"error": "The uploaded PDF has no pages."}), 400

        render_matrix = fitz.Matrix(_RENDER_SCALE, _RENDER_SCALE)

        for page in source_doc:
            pix = page.get_pixmap(matrix=render_matrix, alpha=False)
            # alpha=False with the default RGB colorspace gives packed RGB
            # samples, so the image can be built without a PNG round trip.
            page_image = Image.frombytes(
                "RGB", (pix.width, pix.height), pix.samples
            )
            processed_image = preprocess_image(page_image, preprocess_mode)

            # The image carries no resolution metadata; tell Tesseract the
            # real render DPI so the generated page keeps the source size.
            ocr_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
                processed_image,
                extension="pdf",
                lang=language,
                config=f"--dpi {_RENDER_DPI}",
            )

            page_doc = fitz.open(stream=ocr_pdf_bytes, filetype="pdf")
            try:
                output_doc.insert_pdf(page_doc)
            finally:
                # Close even when insert_pdf raises.
                page_doc.close()

        output_buffer = io.BytesIO()
        output_doc.save(output_buffer, garbage=3, deflate=True)
        output_buffer.seek(0)

        base_name = filename.rsplit(".", 1)[0] or "document"

        return send_file(
            output_buffer,
            mimetype="application/pdf",
            as_attachment=True,
            download_name=f"{base_name}_searchable.pdf",
        )

    except Exception as exc:
        # Request boundary: report the failure as JSON rather than letting
        # the exception propagate.
        return jsonify({"error": f"Failed to create searchable PDF: {exc}"}), 500
    finally:
        # Compare against None explicitly: a Document's truth value follows
        # its page count, so an empty document would otherwise stay open.
        if source_doc is not None:
            source_doc.close()
        output_doc.close()
-etelemetry==0.3.1 -filelock==3.19.1 -Flask==3.1.2 -flask-cors==6.0.1 -flatbuffers==25.12.19 -gunicorn==23.0.0 -httplib2==0.30.0 -humanfriendly==10.0 -idna==3.10 -ImageIO==2.37.2 -itsdangerous==2.2.0 -Jinja2==3.1.6 -jsonschema==4.25.1 -jsonschema-specifications==2025.9.1 -lazy_loader==0.4 -llvmlite==0.46.0 -looseversion==1.3.0 -lxml==6.0.1 -MarkupSafe==3.0.2 -mpmath==1.3.0 -networkx>=3.3,<4.0 -nibabel==5.3.2 -nipype==1.10.0 -numba==0.63.1 -numpy==2.2.6 -onnxruntime==1.23.2 -packaging==25.0 -pandas==2.3.2 -pathlib==1.0.1 -pdf2image==1.17.0 -pillow==12.1.0 -piexif==1.1.3 -platformdirs==4.5.1 -pooch==1.8.2 -protobuf==6.33.2 -prov==2.1.1 -puremagic==1.30 -pydot==4.0.1 -PyMatting==1.1.14 -PyMuPDF==1.26.4 -pyparsing==3.2.3 -pyreadline3==3.5.4 -python-dateutil==2.9.0.post0 -pytz==2025.2 -pyxnat==1.6.3 -rdflib==7.1.4 -referencing==0.37.0 -rembg==2.0.69 -requests==2.32.5 -rpds-py==0.30.0 -scikit-image==0.25.2 -scipy==1.15.3 -simplejson==3.20.1 -six==1.17.0 -sympy==1.14.0 -tifffile==2025.5.10 -tqdm==4.67.1 -traits==7.0.2 -tzdata==2025.2 -urllib3==2.5.0 -Werkzeug==3.1.3 -markdown2>=1.0.0 +Flask>=2.2 +flask-cors>=3.0 +gunicorn>=20.1 +Pillow>=9.0.0 +PyMuPDF>=1.22.0 +python-docx>=0.8.11 +reportlab>=3.6.12 +etelemetry==0.3.1 +filelock==3.19.1 +Flask==3.1.2 +flask-cors==6.0.1 +flatbuffers==25.12.19 +gunicorn==23.0.0 +httplib2==0.30.0 +humanfriendly==10.0 +idna==3.10 +ImageIO==2.37.2 +itsdangerous==2.2.0 +Jinja2==3.1.6 +jsonschema==4.25.1 +jsonschema-specifications==2025.9.1 +lazy_loader==0.4 +llvmlite==0.46.0 +looseversion==1.3.0 +lxml==6.0.1 +MarkupSafe==3.0.2 +mpmath==1.3.0 +networkx>=3.3,<4.0 +nibabel==5.3.2 +nipype==1.10.0 +numba==0.63.1 +numpy==2.2.6 +onnxruntime==1.23.2 +packaging==25.0 +pandas==2.3.2 +pathlib==1.0.1 +pdf2image==1.17.0 +pillow==12.1.0 +piexif==1.1.3 +platformdirs==4.5.1 +pooch==1.8.2 +protobuf==6.33.2 +prov==2.1.1 +puremagic==1.30 +pydot==4.0.1 +PyMatting==1.1.14 +PyMuPDF==1.26.4 +pyparsing==3.2.3 +pyreadline3==3.5.4 +python-dateutil==2.9.0.post0 
+pytz==2025.2 +pyxnat==1.6.3 +rdflib==7.1.4 +referencing==0.37.0 +rembg==2.0.69 +requests==2.32.5 +rpds-py==0.30.0 +scikit-image==0.25.2 +scipy==1.15.3 +simplejson==3.20.1 +six==1.17.0 +sympy==1.14.0 +tifffile==2025.5.10 +tqdm==4.67.1 +traits==7.0.2 +tzdata==2025.2 +urllib3==2.5.0 +Werkzeug==3.1.3 +markdown2>=1.0.0 + +pytesseract>=0.3.10 +opencv-python-headless>=4.10.0 diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index b00849c..aefdd37 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -41,6 +41,7 @@ const PdfCompress = lazy(() => import("./pages/PdfCompress")); const PdfUnlock = lazy(() => import("./pages/PdfUnlock")); const PdfMetadata = lazy(() => import("./pages/PdfMetadata")); const PdfToText = lazy(() => import("./pages/PdfToText")); +const PdfSearchableOCR = lazy(() => import("./pages/PdfSearchableOCR")); const PdfInfo = lazy(() => import("./pages/PdfInfo")); const PdfPageNumber = lazy(() => import("./pages/PdfPageNumber")); @@ -84,6 +85,7 @@ function App() { } /> } /> } /> + } /> } /> } /> diff --git a/frontend/src/data/toolsData.jsx b/frontend/src/data/toolsData.jsx index acb4d51..9ef8212 100644 --- a/frontend/src/data/toolsData.jsx +++ b/frontend/src/data/toolsData.jsx @@ -1,6 +1,7 @@ import React from "react"; import { FileText, + FileSearch, Image, FileImage, Eraser, diff --git a/frontend/src/pages/PdfSearchableOCR.jsx b/frontend/src/pages/PdfSearchableOCR.jsx new file mode 100644 index 0000000..b191d57 --- /dev/null +++ b/frontend/src/pages/PdfSearchableOCR.jsx @@ -0,0 +1,98 @@ +import { useCallback, useState } from "react"; +import { FileSearch, Wand2 } from "lucide-react"; +import ToolPageTemplate from "../components/ToolPageTemplate"; + +function PdfSearchableOCR() { + const [language, setLanguage] = useState("eng"); + const [preprocess, setPreprocess] = useState("balanced"); + + const validateFile = useCallback((selectedFile) => { + if (selectedFile && selectedFile.type === "application/pdf") { + return { + isValid: true, + 
message: `File "${selectedFile.name}" selected (${( + selectedFile.size / 1024 + ).toFixed(1)} KB)`, + }; + } + + return { + isValid: false, + message: "Error: Please select a PDF file", + }; + }, []); + + const modifyFormData = (formData) => { + formData.append("language", language); + formData.append("preprocess", preprocess); + }; + + const extraFields = ({ file }) => { + if (!file) return null; + + return ( +
+
+ +

OCR Settings

+
+ + + + + + + +

+ Best for scanned PDFs, invoices, forms, notes, and image-only documents. +

+
+ ); + }; + + return ( + + fileName.replace(/\.pdf$/i, "_searchable.pdf") + } + submitButtonText="Create Searchable PDF" + loadingButtonText="Running OCR..." + onSuccess={() => "Success! Searchable PDF created."} + extraFields={extraFields} + maxWidthClass="max-w-[760px]" + defaultIcon={} + defaultText="Upload a scanned PDF" + supportText="Creates a searchable PDF using local OCR processing." + inputId="pdf-searchable-ocr-input" + /> + ); +} + +export default PdfSearchableOCR;