diff --git a/backend/app/__init__.py b/backend/app/__init__.py
index 2325631..a0cc1a6 100644
--- a/backend/app/__init__.py
+++ b/backend/app/__init__.py
@@ -65,6 +65,7 @@ def _health():
from blueprints.compress_pdf import compress_pdf_bp
from blueprints.protect_pdf import protect_pdf_bp
from blueprints.unlock_pdf import unlock_pdf_bp
+ from blueprints.searchable_pdf_ocr import searchable_pdf_ocr_bp
app.register_blueprint(pdf_bp)
app.register_blueprint(pdf_docx_bp)
@@ -82,5 +83,6 @@ def _health():
app.register_blueprint(compress_pdf_bp)
app.register_blueprint(protect_pdf_bp)
app.register_blueprint(unlock_pdf_bp)
+ app.register_blueprint(searchable_pdf_ocr_bp)
return app
diff --git a/backend/blueprints/searchable_pdf_ocr.py b/backend/blueprints/searchable_pdf_ocr.py
new file mode 100644
index 0000000..aa83e9e
--- /dev/null
+++ b/backend/blueprints/searchable_pdf_ocr.py
@@ -0,0 +1,97 @@
+from flask import Blueprint, request, send_file, jsonify
+import io
+import fitz
+import pytesseract
+from PIL import Image
+import cv2
+import numpy as np
+
+searchable_pdf_ocr_bp = Blueprint("searchable_pdf_ocr", __name__)
+
+
+def preprocess_image(pil_image, mode="balanced"):
+    """Clean up a rendered page image before it is handed to OCR.
+
+    Args:
+        pil_image: PIL image of the page.
+        mode: Preprocessing strength. ``"none"`` only converts to RGB,
+            ``"light"`` denoises, ``"strong"`` denoises harder and then
+            binarises. Any other value (including the default
+            ``"balanced"``) takes the balanced denoise-and-binarise path.
+
+    Returns:
+        A new RGB PIL image. For every mode except ``"none"`` the content
+        is single-channel (grayscale or black/white) expanded back to RGB.
+    """
+    if mode == "none":
+        # Checked first so the array copy and grayscale pass below are not
+        # computed and then thrown away.
+        return pil_image.convert("RGB")
+
+    image = np.array(pil_image.convert("RGB"))
+    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+
+    # fastNlMeansDenoising(src, dst, h, templateWindowSize, searchWindowSize):
+    # only the filter strength `h` varies between modes.
+    # adaptiveThreshold(..., blockSize, C): neighbourhood size (must be odd)
+    # and the constant subtracted from the local weighted mean.
+    if mode == "light":
+        processed = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+    elif mode == "strong":
+        denoised = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
+        processed = cv2.adaptiveThreshold(
+            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 31, 11
+        )
+    else:
+        denoised = cv2.fastNlMeansDenoising(gray, None, 20, 7, 21)
+        processed = cv2.adaptiveThreshold(
+            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 25, 15
+        )
+
+    return Image.fromarray(processed).convert("RGB")
+
+
+@searchable_pdf_ocr_bp.route("/searchable-pdf-ocr", methods=["POST"])
+def searchable_pdf_ocr():
+    """Turn an uploaded PDF into a searchable PDF by OCR-ing every page.
+
+    Form fields:
+        file: The PDF upload (required; its name must end in ``.pdf``).
+        language: Tesseract language string, default ``"eng"``.
+        preprocess: ``none``, ``light``, ``balanced`` or ``strong``;
+            default ``balanced``.
+
+    Returns:
+        The OCR-ed PDF as an attachment named ``<original>_searchable.pdf``,
+        or a JSON ``{"error": ...}`` body with status 400 for a bad request
+        and 500 when processing fails.
+    """
+    if "file" not in request.files:
+        return jsonify({"error": "No file uploaded."}), 400
+
+    file = request.files["file"]
+    filename = file.filename or ""
+
+    if not filename.lower().endswith(".pdf"):
+        return jsonify({"error": "Please upload a PDF file."}), 400
+
+    # NOTE(review): `language` is client-controlled and is forwarded to
+    # Tesseract unchanged; consider checking it against the installed
+    # language packs.
+    language = request.form.get("language", "eng").strip() or "eng"
+    preprocess_mode = request.form.get("preprocess", "balanced").strip() or "balanced"
+
+    if preprocess_mode not in {"none", "light", "balanced", "strong"}:
+        return jsonify({"error": "Invalid preprocessing mode."}), 400
+
+    # PDF user space is 72 units per inch, so rendering at `render_zoom`
+    # yields a 72 * render_zoom DPI image. Tesseract is told that resolution
+    # so each generated page keeps the size of the page it came from.
+    render_zoom = 2
+    render_dpi = 72 * render_zoom
+
+    source_doc = None
+    output_doc = fitz.open()
+
+    try:
+        pdf_bytes = file.read()
+        if not pdf_bytes:
+            return jsonify({"error": "The uploaded PDF is empty."}), 400
+
+        source_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+
+        if source_doc.needs_pass:
+            # Pages of a locked document cannot be rendered; report it as a
+            # client error instead of failing later with a 500.
+            return jsonify({"error": "The uploaded PDF is password-protected."}), 400
+
+        if source_doc.page_count == 0:
+            return jsonify({"error": "The uploaded PDF has no pages."}), 400
+
+        for page in source_doc:
+            pix = page.get_pixmap(
+                matrix=fitz.Matrix(render_zoom, render_zoom), alpha=False
+            )
+            pil_image = Image.open(io.BytesIO(pix.tobytes("png")))
+            processed_image = preprocess_image(pil_image, preprocess_mode)
+
+            ocr_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
+                processed_image,
+                extension="pdf",
+                lang=language,
+                config=f"--dpi {render_dpi}",
+            )
+
+            # The context manager closes the single-page document even when
+            # insert_pdf raises.
+            with fitz.open(stream=ocr_pdf_bytes, filetype="pdf") as page_doc:
+                output_doc.insert_pdf(page_doc)
+
+        output_buffer = io.BytesIO()
+        output_doc.save(output_buffer, garbage=3, deflate=True)
+        output_buffer.seek(0)
+
+        base_name = filename.rsplit(".", 1)[0] or "document"
+
+        return send_file(
+            output_buffer,
+            mimetype="application/pdf",
+            as_attachment=True,
+            download_name=f"{base_name}_searchable.pdf",
+        )
+
+    except Exception as exc:
+        return jsonify({"error": f"Failed to create searchable PDF: {str(exc)}"}), 500
+    finally:
+        # Compare against None explicitly: a document's truth value follows
+        # its page count, so a zero-page document would otherwise be skipped
+        # here and left open.
+        if source_doc is not None:
+            source_doc.close()
+        output_doc.close()
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 2a3546e..7b9b047 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -1,69 +1,72 @@
-Flask>=2.2
-flask-cors>=3.0
-gunicorn>=20.1
-Pillow>=9.0.0
-PyMuPDF>=1.22.0
-python-docx>=0.8.11
-reportlab>=3.6.12
-etelemetry==0.3.1
-filelock==3.19.1
-Flask==3.1.2
-flask-cors==6.0.1
-flatbuffers==25.12.19
-gunicorn==23.0.0
-httplib2==0.30.0
-humanfriendly==10.0
-idna==3.10
-ImageIO==2.37.2
-itsdangerous==2.2.0
-Jinja2==3.1.6
-jsonschema==4.25.1
-jsonschema-specifications==2025.9.1
-lazy_loader==0.4
-llvmlite==0.46.0
-looseversion==1.3.0
-lxml==6.0.1
-MarkupSafe==3.0.2
-mpmath==1.3.0
-networkx>=3.3,<4.0
-nibabel==5.3.2
-nipype==1.10.0
-numba==0.63.1
-numpy==2.2.6
-onnxruntime==1.23.2
-packaging==25.0
-pandas==2.3.2
-pathlib==1.0.1
-pdf2image==1.17.0
-pillow==12.1.0
-piexif==1.1.3
-platformdirs==4.5.1
-pooch==1.8.2
-protobuf==6.33.2
-prov==2.1.1
-puremagic==1.30
-pydot==4.0.1
-PyMatting==1.1.14
-PyMuPDF==1.26.4
-pyparsing==3.2.3
-pyreadline3==3.5.4
-python-dateutil==2.9.0.post0
-pytz==2025.2
-pyxnat==1.6.3
-rdflib==7.1.4
-referencing==0.37.0
-rembg==2.0.69
-requests==2.32.5
-rpds-py==0.30.0
-scikit-image==0.25.2
-scipy==1.15.3
-simplejson==3.20.1
-six==1.17.0
-sympy==1.14.0
-tifffile==2025.5.10
-tqdm==4.67.1
-traits==7.0.2
-tzdata==2025.2
-urllib3==2.5.0
-Werkzeug==3.1.3
-markdown2>=1.0.0
+Flask>=2.2
+flask-cors>=3.0
+gunicorn>=20.1
+Pillow>=9.0.0
+PyMuPDF>=1.22.0
+python-docx>=0.8.11
+reportlab>=3.6.12
+etelemetry==0.3.1
+filelock==3.19.1
+Flask==3.1.2
+flask-cors==6.0.1
+flatbuffers==25.12.19
+gunicorn==23.0.0
+httplib2==0.30.0
+humanfriendly==10.0
+idna==3.10
+ImageIO==2.37.2
+itsdangerous==2.2.0
+Jinja2==3.1.6
+jsonschema==4.25.1
+jsonschema-specifications==2025.9.1
+lazy_loader==0.4
+llvmlite==0.46.0
+looseversion==1.3.0
+lxml==6.0.1
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx>=3.3,<4.0
+nibabel==5.3.2
+nipype==1.10.0
+numba==0.63.1
+numpy==2.2.6
+onnxruntime==1.23.2
+packaging==25.0
+pandas==2.3.2
+pathlib==1.0.1
+pdf2image==1.17.0
+pillow==12.1.0
+piexif==1.1.3
+platformdirs==4.5.1
+pooch==1.8.2
+protobuf==6.33.2
+prov==2.1.1
+puremagic==1.30
+pydot==4.0.1
+PyMatting==1.1.14
+PyMuPDF==1.26.4
+pyparsing==3.2.3
+pyreadline3==3.5.4
+python-dateutil==2.9.0.post0
+pytz==2025.2
+pyxnat==1.6.3
+rdflib==7.1.4
+referencing==0.37.0
+rembg==2.0.69
+requests==2.32.5
+rpds-py==0.30.0
+scikit-image==0.25.2
+scipy==1.15.3
+simplejson==3.20.1
+six==1.17.0
+sympy==1.14.0
+tifffile==2025.5.10
+tqdm==4.67.1
+traits==7.0.2
+tzdata==2025.2
+urllib3==2.5.0
+Werkzeug==3.1.3
+markdown2>=1.0.0
+
+pytesseract>=0.3.10
+opencv-python-headless>=4.10.0
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
index b00849c..aefdd37 100644
--- a/frontend/src/App.jsx
+++ b/frontend/src/App.jsx
@@ -41,6 +41,7 @@ const PdfCompress = lazy(() => import("./pages/PdfCompress"));
const PdfUnlock = lazy(() => import("./pages/PdfUnlock"));
const PdfMetadata = lazy(() => import("./pages/PdfMetadata"));
const PdfToText = lazy(() => import("./pages/PdfToText"));
+const PdfSearchableOCR = lazy(() => import("./pages/PdfSearchableOCR"));
const PdfInfo = lazy(() => import("./pages/PdfInfo"));
const PdfPageNumber = lazy(() => import("./pages/PdfPageNumber"));
@@ -84,6 +85,7 @@ function App() {
+ Best for scanned PDFs, invoices, forms, notes, and image-only documents. +
+