Durgeshwar-AI · upasana-2006 · Jun 17, 2026
diff --git a/backend/app/__init__.py b/backend/app/__init__.py
@@ -65,6 +65,7 @@ def _health():
     from blueprints.compress_pdf import compress_pdf_bp
     from blueprints.protect_pdf import protect_pdf_bp
     from blueprints.unlock_pdf import unlock_pdf_bp
+    from blueprints.searchable_pdf_ocr import searchable_pdf_ocr_bp
 
     app.register_blueprint(pdf_bp)
     app.register_blueprint(pdf_docx_bp)
@@ -82,5 +83,6 @@ def _health():
     app.register_blueprint(compress_pdf_bp)
     app.register_blueprint(protect_pdf_bp)
     app.register_blueprint(unlock_pdf_bp)
+    app.register_blueprint(searchable_pdf_ocr_bp)
 
     return app
diff --git a/backend/blueprints/searchable_pdf_ocr.py b/backend/blueprints/searchable_pdf_ocr.py
@@ -0,0 +1,97 @@
+from flask import Blueprint, request, send_file, jsonify
+import io
+import fitz
+import pytesseract
+from PIL import Image
+import cv2
+import numpy as np
+
+# Blueprint holding the searchable-PDF OCR route; it is imported and
+# registered on the Flask app in backend/app/__init__.py.
+searchable_pdf_ocr_bp = Blueprint("searchable_pdf_ocr", __name__)
+
+
+def preprocess_image(pil_image, mode="balanced"):
+    image = np.array(pil_image.convert("RGB"))
+    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+
+    if mode == "none":
+        return pil_image.convert("RGB")
+
+    if mode == "light":
+        processed = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
+    elif mode == "strong":
+        denoised = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
+        processed = cv2.adaptiveThreshold(
+            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 31, 11
+        )
+    else:
+        denoised = cv2.fastNlMeansDenoising(gray, None, 20, 7, 21)
+        processed = cv2.adaptiveThreshold(
+            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+            cv2.THRESH_BINARY, 25, 15
+        )
+
+    return Image.fromarray(processed).convert("RGB")
+
+
+@searchable_pdf_ocr_bp.route("/searchable-pdf-ocr", methods=["POST"])
+def searchable_pdf_ocr():
+    if "file" not in request.files:
+        return jsonify({"error": "No file uploaded."}), 400
+
+    file = request.files["file"]
+    filename = file.filename or ""
+
+    if not filename.lower().endswith(".pdf"):
+        return jsonify({"error": "Please upload a PDF file."}), 400
+
+    language = request.form.get("language", "eng").strip() or "eng"
+    preprocess_mode = request.form.get("preprocess", "balanced").strip() or "balanced"
+
+    if preprocess_mode not in {"none", "light", "balanced", "strong"}:
+        return jsonify({"error": "Invalid preprocessing mode."}), 400
+
+    source_doc = None
+    output_doc = fitz.open()
+
+    try:
+        pdf_bytes = file.read()
+        source_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+
+        if source_doc.page_count == 0:
+            return jsonify({"error": "The uploaded PDF has no pages."}), 400
+
+        for page in source_doc:
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
+            pil_image = Image.open(io.BytesIO(pix.tobytes("png")))
+            processed_image = preprocess_image(pil_image, preprocess_mode)
+
+            ocr_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
+                processed_image,
+                extension="pdf",
+                lang=language,
+            )
+
+            page_doc = fitz.open(stream=ocr_pdf_bytes, filetype="pdf")
+            output_doc.insert_pdf(page_doc)
+            page_doc.close()
+
+        output_buffer = io.BytesIO()
+        output_doc.save(output_buffer, garbage=3, deflate=True)
+        output_buffer.seek(0)
+
+        base_name = filename.rsplit(".", 1)[0] or "document"
+
+        return send_file(
+            output_buffer,
+            mimetype="application/pdf",
+            as_attachment=True,
+            download_name=f"{base_name}_searchable.pdf",
+        )
+
+    except Exception as exc:
+        return jsonify({"error": f"Failed to create searchable PDF: {str(exc)}"}), 500
+    finally:
+        if source_doc:
+            source_doc.close()
+        output_doc.close()
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -1,69 +1,72 @@
-Flask>=2.2
-flask-cors>=3.0
-gunicorn>=20.1
-Pillow>=9.0.0
-PyMuPDF>=1.22.0
-python-docx>=0.8.11
-reportlab>=3.6.12
-etelemetry==0.3.1
-filelock==3.19.1
-Flask==3.1.2
-flask-cors==6.0.1
-flatbuffers==25.12.19
-gunicorn==23.0.0
-httplib2==0.30.0
-humanfriendly==10.0
-idna==3.10
-ImageIO==2.37.2
-itsdangerous==2.2.0
-Jinja2==3.1.6
-jsonschema==4.25.1
-jsonschema-specifications==2025.9.1
-lazy_loader==0.4
-llvmlite==0.46.0
-looseversion==1.3.0
-lxml==6.0.1
-MarkupSafe==3.0.2
-mpmath==1.3.0
-networkx>=3.3,<4.0
-nibabel==5.3.2
-nipype==1.10.0
-numba==0.63.1
-numpy==2.2.6
-onnxruntime==1.23.2
-packaging==25.0
-pandas==2.3.2
-pathlib==1.0.1
-pdf2image==1.17.0
-pillow==12.1.0
-piexif==1.1.3
-platformdirs==4.5.1
-pooch==1.8.2
-protobuf==6.33.2
-prov==2.1.1
-puremagic==1.30
-pydot==4.0.1
-PyMatting==1.1.14
-PyMuPDF==1.26.4
-pyparsing==3.2.3
-pyreadline3==3.5.4
-python-dateutil==2.9.0.post0
-pytz==2025.2
-pyxnat==1.6.3
-rdflib==7.1.4
-referencing==0.37.0
-rembg==2.0.69
-requests==2.32.5
-rpds-py==0.30.0
-scikit-image==0.25.2
-scipy==1.15.3
-simplejson==3.20.1
-six==1.17.0
-sympy==1.14.0
-tifffile==2025.5.10
-tqdm==4.67.1
-traits==7.0.2
-tzdata==2025.2
-urllib3==2.5.0
-Werkzeug==3.1.3
-markdown2>=1.0.0
+Flask>=2.2
+flask-cors>=3.0
+gunicorn>=20.1
+Pillow>=9.0.0
+PyMuPDF>=1.22.0
+python-docx>=0.8.11
+reportlab>=3.6.12
+etelemetry==0.3.1
+filelock==3.19.1
+Flask==3.1.2
+flask-cors==6.0.1
+flatbuffers==25.12.19
+gunicorn==23.0.0
+httplib2==0.30.0
+humanfriendly==10.0
+idna==3.10
+ImageIO==2.37.2
+itsdangerous==2.2.0
+Jinja2==3.1.6
+jsonschema==4.25.1
+jsonschema-specifications==2025.9.1
+lazy_loader==0.4
+llvmlite==0.46.0
+looseversion==1.3.0
+lxml==6.0.1
+MarkupSafe==3.0.2
+mpmath==1.3.0
+networkx>=3.3,<4.0
+nibabel==5.3.2
+nipype==1.10.0
+numba==0.63.1
+numpy==2.2.6
+onnxruntime==1.23.2
+packaging==25.0
+pandas==2.3.2
+pathlib==1.0.1
+pdf2image==1.17.0
+pillow==12.1.0
+piexif==1.1.3
+platformdirs==4.5.1
+pooch==1.8.2
+protobuf==6.33.2
+prov==2.1.1
+puremagic==1.30
+pydot==4.0.1
+PyMatting==1.1.14
+PyMuPDF==1.26.4
+pyparsing==3.2.3
+pyreadline3==3.5.4
+python-dateutil==2.9.0.post0
+pytz==2025.2
+pyxnat==1.6.3
+rdflib==7.1.4
+referencing==0.37.0
+rembg==2.0.69
+requests==2.32.5
+rpds-py==0.30.0
+scikit-image==0.25.2
+scipy==1.15.3
+simplejson==3.20.1
+six==1.17.0
+sympy==1.14.0
+tifffile==2025.5.10
+tqdm==4.67.1
+traits==7.0.2
+tzdata==2025.2
+urllib3==2.5.0
+Werkzeug==3.1.3
+markdown2>=1.0.0
+
+pytesseract>=0.3.10
+opencv-python-headless>=4.10.0
diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx
@@ -41,6 +41,7 @@ const PdfCompress = lazy(() => import("./pages/PdfCompress"));
 const PdfUnlock = lazy(() => import("./pages/PdfUnlock"));
 const PdfMetadata = lazy(() => import("./pages/PdfMetadata"));
 const PdfToText = lazy(() => import("./pages/PdfToText"));
+const PdfSearchableOCR = lazy(() => import("./pages/PdfSearchableOCR"));
 const PdfInfo = lazy(() => import("./pages/PdfInfo"));
 const PdfPageNumber = lazy(() => import("./pages/PdfPageNumber"));
 
@@ -84,6 +85,7 @@ function App() {
             <Route path="/pdf-unlock" element={<PdfUnlock />} />
             <Route path="/pdf-metadata" element={<PdfMetadata />} />
             <Route path="/pdf-to-text" element={<PdfToText />} />
+            <Route path="/pdf-searchable-ocr" element={<PdfSearchableOCR />} />
             <Route path="/pdf-info" element={<PdfInfo />} />
             <Route path="/pdf-page-number" element={<PdfPageNumber />} />
 

diff --git a/frontend/src/data/toolsData.jsx b/frontend/src/data/toolsData.jsx
@@ -1,6 +1,7 @@
 import React from "react";
 import {
   FileText,
+  FileSearch,
   Image,
   FileImage,
   Eraser,

diff --git a/frontend/src/pages/PdfSearchableOCR.jsx b/frontend/src/pages/PdfSearchableOCR.jsx
@@ -0,0 +1,98 @@
+import { useCallback, useState } from "react";
+import { FileSearch, Wand2 } from "lucide-react";
+import ToolPageTemplate from "../components/ToolPageTemplate";
+
+function PdfSearchableOCR() {
+  const [language, setLanguage] = useState("eng");
+  const [preprocess, setPreprocess] = useState("balanced");
+
+  const validateFile = useCallback((selectedFile) => {
+    if (selectedFile && selectedFile.type === "application/pdf") {
+      return {
+        isValid: true,
+        message: `File "${selectedFile.name}" selected (${(
+          selectedFile.size / 1024
+        ).toFixed(1)} KB)`,
+      };
+    }
+
+    return {
+      isValid: false,
+      message: "Error: Please select a PDF file",
+    };
+  }, []);
+
+  const modifyFormData = (formData) => {
+    formData.append("language", language);
+    formData.append("preprocess", preprocess);
+  };
+
+  const extraFields = ({ file }) => {
+    if (!file) return null;
+
+    return (
+      <div className="mb-6 rounded-3xl border border-slate-200 bg-white p-5 shadow-sm">
+        <div className="mb-4 flex items-center gap-2">
+          <Wand2 className="h-5 w-5 text-blue-600" />
+          <h3 className="text-sm font-bold text-slate-800">OCR Settings</h3>
+        </div>
+
+        <label className="mb-2 block text-sm font-semibold text-slate-700">
+          OCR Language
+        </label>
+        <select
+          value={language}
+          onChange={(event) => setLanguage(event.target.value)}
+          className="mb-4 w-full rounded-xl border border-slate-200 bg-white px-4 py-3 text-sm text-slate-700 outline-none transition focus:border-blue-400 focus:ring-2 focus:ring-blue-100"
+        >
+          <option value="eng">English</option>
+          <option value="hin">Hindi</option>
+          <option value="eng+hin">English + Hindi</option>
+        </select>
+
+        <label className="mb-2 block text-sm font-semibold text-slate-700">
+          Preprocessing
+        </label>
+        <select
+          value={preprocess}
+          onChange={(event) => setPreprocess(event.target.value)}
+          className="w-full rounded-xl border border-slate-200 bg-white px-4 py-3 text-sm text-slate-700 outline-none transition focus:border-blue-400 focus:ring-2 focus:ring-blue-100"
+        >
+          <option value="none">None</option>
+          <option value="light">Light denoise</option>
+          <option value="balanced">Balanced OCR cleanup</option>
+          <option value="strong">Strong thresholding</option>
+        </select>
+
+        <p className="mt-4 text-xs text-slate-500">
+          Best for scanned PDFs, invoices, forms, notes, and image-only documents.
+        </p>
+      </div>
+    );
+  };
+
+  return (
+    <ToolPageTemplate
+      title="Scanned PDF OCR"
+      description="Convert image-only scanned PDFs into searchable PDFs with selectable text."
+      endpoint="/searchable-pdf-ocr"
+      accept="application/pdf"
+      validateFile={validateFile}
+      modifyFormData={modifyFormData}
+      getOutputFileName={(fileName) =>
+        fileName.replace(/\.pdf$/i, "_searchable.pdf")
+      }
+      submitButtonText="Create Searchable PDF"
+      loadingButtonText="Running OCR..."
+      onSuccess={() => "Success! Searchable PDF created."}
+      extraFields={extraFields}
+      maxWidthClass="max-w-[760px]"
+      defaultIcon={<FileSearch />}
+      defaultText="Upload a scanned PDF"
+      supportText="Creates a searchable PDF using local OCR processing."
+      inputId="pdf-searchable-ocr-input"
+    />
+  );
+}
+
+export default PdfSearchableOCR;