Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def _health():
from blueprints.compress_pdf import compress_pdf_bp
from blueprints.protect_pdf import protect_pdf_bp
from blueprints.unlock_pdf import unlock_pdf_bp
from blueprints.searchable_pdf_ocr import searchable_pdf_ocr_bp

app.register_blueprint(pdf_bp)
app.register_blueprint(pdf_docx_bp)
Expand All @@ -82,5 +83,6 @@ def _health():
app.register_blueprint(compress_pdf_bp)
app.register_blueprint(protect_pdf_bp)
app.register_blueprint(unlock_pdf_bp)
app.register_blueprint(searchable_pdf_ocr_bp)

return app
97 changes: 97 additions & 0 deletions backend/blueprints/searchable_pdf_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from flask import Blueprint, request, send_file, jsonify
import io
import fitz
import pytesseract
from PIL import Image
import cv2
import numpy as np

searchable_pdf_ocr_bp = Blueprint("searchable_pdf_ocr", __name__)


def preprocess_image(pil_image, mode="balanced"):
    """Clean up a rendered page image before handing it to OCR.

    Args:
        pil_image: PIL image of the page.
        mode: Preprocessing strength.
            ``"none"``   - no cleanup, only an RGB conversion.
            ``"light"``  - non-local-means denoising (h=10) on the grayscale image.
            ``"strong"`` - heavier denoising (h=30) followed by Gaussian
                           adaptive thresholding (block size 31, C=11).
            any other value (including ``"balanced"``) - medium denoising
                           (h=20) followed by Gaussian adaptive thresholding
                           (block size 25, C=15).

    Returns:
        A PIL image in RGB mode.
    """
    # Bail out before building the NumPy array / grayscale copy: "none" never
    # uses them, and for a full-page render they are not cheap to create.
    if mode == "none":
        return pil_image.convert("RGB")

    image = np.array(pil_image.convert("RGB"))
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    if mode == "light":
        processed = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    elif mode == "strong":
        denoised = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
        processed = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 31, 11
        )
    else:
        denoised = cv2.fastNlMeansDenoising(gray, None, 20, 7, 21)
        processed = cv2.adaptiveThreshold(
            denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 25, 15
        )

    # The processed array is single-channel; convert back to RGB so every
    # mode returns the same image mode.
    return Image.fromarray(processed).convert("RGB")


# Zoom factor used when rasterising each page for OCR, and the resulting
# resolution (PDF user space is 72 units per inch).
_OCR_RENDER_ZOOM = 2
_OCR_RENDER_DPI = 72 * _OCR_RENDER_ZOOM

_PREPROCESS_MODES = frozenset({"none", "light", "balanced", "strong"})


def _is_valid_ocr_language(language):
    """Return True if *language* looks like ``eng`` or ``eng+hin``.

    Only ASCII letters, digits and underscores are accepted in each
    ``+``-separated part, so path-like or option-like values never reach the
    Tesseract command line.
    """
    return language.isascii() and all(
        part.replace("_", "").isalnum() for part in language.split("+")
    )


@searchable_pdf_ocr_bp.route("/searchable-pdf-ocr", methods=["POST"])
def searchable_pdf_ocr():
    """Turn an uploaded (scanned) PDF into a searchable PDF via OCR.

    Form fields:
        file: The PDF upload (required, must have a ``.pdf`` file name).
        language: Tesseract language spec such as ``eng`` or ``eng+hin``
            (default ``eng``).
        preprocess: One of ``none``, ``light``, ``balanced``, ``strong``
            (default ``balanced``).

    Returns:
        The OCR'd PDF as an attachment named ``<original>_searchable.pdf``,
        a JSON error with status 400 for invalid input, or a JSON error with
        status 500 if processing fails.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded."}), 400

    file = request.files["file"]
    filename = file.filename or ""

    if not filename.lower().endswith(".pdf"):
        return jsonify({"error": "Please upload a PDF file."}), 400

    language = request.form.get("language", "eng").strip() or "eng"
    preprocess_mode = request.form.get("preprocess", "balanced").strip() or "balanced"

    # The language value is client-controlled and ends up as an argument to
    # the tesseract executable, so reject anything that is not a plain
    # language spec.
    if not _is_valid_ocr_language(language):
        return jsonify({"error": "Invalid OCR language."}), 400

    if preprocess_mode not in _PREPROCESS_MODES:
        return jsonify({"error": "Invalid preprocessing mode."}), 400

    source_doc = None
    output_doc = fitz.open()

    try:
        pdf_bytes = file.read()
        source_doc = fitz.open(stream=pdf_bytes, filetype="pdf")

        if source_doc.page_count == 0:
            return jsonify({"error": "The uploaded PDF has no pages."}), 400

        render_matrix = fitz.Matrix(_OCR_RENDER_ZOOM, _OCR_RENDER_ZOOM)

        for page in source_doc:
            pix = page.get_pixmap(matrix=render_matrix, alpha=False)
            # Build the PIL image straight from the pixmap's raw RGB samples
            # instead of encoding to PNG and decoding it again.
            pil_image = Image.frombytes(
                "RGB", (pix.width, pix.height), pix.samples
            )
            processed_image = preprocess_image(pil_image, preprocess_mode)

            # Tell Tesseract the render resolution explicitly so the size of
            # the generated PDF page does not depend on (missing) image
            # metadata.
            ocr_pdf_bytes = pytesseract.image_to_pdf_or_hocr(
                processed_image,
                extension="pdf",
                lang=language,
                config=f"--dpi {_OCR_RENDER_DPI}",
            )

            # Context manager guarantees the per-page document is closed even
            # if insert_pdf raises.
            with fitz.open(stream=ocr_pdf_bytes, filetype="pdf") as page_doc:
                output_doc.insert_pdf(page_doc)

        output_buffer = io.BytesIO()
        output_doc.save(output_buffer, garbage=3, deflate=True)
        output_buffer.seek(0)

        base_name = filename.rsplit(".", 1)[0] or "document"

        return send_file(
            output_buffer,
            mimetype="application/pdf",
            as_attachment=True,
            download_name=f"{base_name}_searchable.pdf",
        )

    except Exception as exc:
        return jsonify({"error": f"Failed to create searchable PDF: {str(exc)}"}), 500
    finally:
        # Compare against None: a Document with zero pages is falsy, so a
        # plain truthiness check would skip close() on the "no pages" path.
        if source_doc is not None:
            source_doc.close()
        output_doc.close()
141 changes: 72 additions & 69 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,69 +1,72 @@
Flask>=2.2
flask-cors>=3.0
gunicorn>=20.1
Pillow>=9.0.0
PyMuPDF>=1.22.0
python-docx>=0.8.11
reportlab>=3.6.12
etelemetry==0.3.1
filelock==3.19.1
Flask==3.1.2
flask-cors==6.0.1
flatbuffers==25.12.19
gunicorn==23.0.0
httplib2==0.30.0
humanfriendly==10.0
idna==3.10
ImageIO==2.37.2
itsdangerous==2.2.0
Jinja2==3.1.6
jsonschema==4.25.1
jsonschema-specifications==2025.9.1
lazy_loader==0.4
llvmlite==0.46.0
looseversion==1.3.0
lxml==6.0.1
MarkupSafe==3.0.2
mpmath==1.3.0
networkx>=3.3,<4.0
nibabel==5.3.2
nipype==1.10.0
numba==0.63.1
numpy==2.2.6
onnxruntime==1.23.2
packaging==25.0
pandas==2.3.2
pathlib==1.0.1
pdf2image==1.17.0
pillow==12.1.0
piexif==1.1.3
platformdirs==4.5.1
pooch==1.8.2
protobuf==6.33.2
prov==2.1.1
puremagic==1.30
pydot==4.0.1
PyMatting==1.1.14
PyMuPDF==1.26.4
pyparsing==3.2.3
pyreadline3==3.5.4
python-dateutil==2.9.0.post0
pytz==2025.2
pyxnat==1.6.3
rdflib==7.1.4
referencing==0.37.0
rembg==2.0.69
requests==2.32.5
rpds-py==0.30.0
scikit-image==0.25.2
scipy==1.15.3
simplejson==3.20.1
six==1.17.0
sympy==1.14.0
tifffile==2025.5.10
tqdm==4.67.1
traits==7.0.2
tzdata==2025.2
urllib3==2.5.0
Werkzeug==3.1.3
markdown2>=1.0.0
Flask>=2.2
flask-cors>=3.0
gunicorn>=20.1
Pillow>=9.0.0
PyMuPDF>=1.22.0
python-docx>=0.8.11
reportlab>=3.6.12
etelemetry==0.3.1
filelock==3.19.1
Flask==3.1.2
flask-cors==6.0.1
flatbuffers==25.12.19
gunicorn==23.0.0
httplib2==0.30.0
humanfriendly==10.0
idna==3.10
ImageIO==2.37.2
itsdangerous==2.2.0
Jinja2==3.1.6
jsonschema==4.25.1
jsonschema-specifications==2025.9.1
lazy_loader==0.4
llvmlite==0.46.0
looseversion==1.3.0
lxml==6.0.1
MarkupSafe==3.0.2
mpmath==1.3.0
networkx>=3.3,<4.0
nibabel==5.3.2
nipype==1.10.0
numba==0.63.1
numpy==2.2.6
onnxruntime==1.23.2
packaging==25.0
pandas==2.3.2
pathlib==1.0.1
pdf2image==1.17.0
pillow==12.1.0
piexif==1.1.3
platformdirs==4.5.1
pooch==1.8.2
protobuf==6.33.2
prov==2.1.1
puremagic==1.30
pydot==4.0.1
PyMatting==1.1.14
PyMuPDF==1.26.4
pyparsing==3.2.3
pyreadline3==3.5.4
python-dateutil==2.9.0.post0
pytz==2025.2
pyxnat==1.6.3
rdflib==7.1.4
referencing==0.37.0
rembg==2.0.69
requests==2.32.5
rpds-py==0.30.0
scikit-image==0.25.2
scipy==1.15.3
simplejson==3.20.1
six==1.17.0
sympy==1.14.0
tifffile==2025.5.10
tqdm==4.67.1
traits==7.0.2
tzdata==2025.2
urllib3==2.5.0
Werkzeug==3.1.3
markdown2>=1.0.0

pytesseract>=0.3.10
opencv-python-headless>=4.10.0
2 changes: 2 additions & 0 deletions frontend/src/App.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const PdfCompress = lazy(() => import("./pages/PdfCompress"));
const PdfUnlock = lazy(() => import("./pages/PdfUnlock"));
const PdfMetadata = lazy(() => import("./pages/PdfMetadata"));
const PdfToText = lazy(() => import("./pages/PdfToText"));
const PdfSearchableOCR = lazy(() => import("./pages/PdfSearchableOCR"));
const PdfInfo = lazy(() => import("./pages/PdfInfo"));
const PdfPageNumber = lazy(() => import("./pages/PdfPageNumber"));

Expand Down Expand Up @@ -84,6 +85,7 @@ function App() {
<Route path="/pdf-unlock" element={<PdfUnlock />} />
<Route path="/pdf-metadata" element={<PdfMetadata />} />
<Route path="/pdf-to-text" element={<PdfToText />} />
<Route path="/pdf-searchable-ocr" element={<PdfSearchableOCR />} />
<Route path="/pdf-info" element={<PdfInfo />} />
<Route path="/pdf-page-number" element={<PdfPageNumber />} />

Expand Down
1 change: 1 addition & 0 deletions frontend/src/data/toolsData.jsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import React from "react";
import {
FileText,
FileSearch,
Image,
FileImage,
Eraser,
Expand Down
98 changes: 98 additions & 0 deletions frontend/src/pages/PdfSearchableOCR.jsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import { useCallback, useState } from "react";
import { FileSearch, Wand2 } from "lucide-react";
import ToolPageTemplate from "../components/ToolPageTemplate";

function PdfSearchableOCR() {
const [language, setLanguage] = useState("eng");
const [preprocess, setPreprocess] = useState("balanced");

const validateFile = useCallback((selectedFile) => {
if (selectedFile && selectedFile.type === "application/pdf") {
return {
isValid: true,
message: `File "${selectedFile.name}" selected (${(
selectedFile.size / 1024
).toFixed(1)} KB)`,
};
}

return {
isValid: false,
message: "Error: Please select a PDF file",
};
}, []);

const modifyFormData = (formData) => {
formData.append("language", language);
formData.append("preprocess", preprocess);
};

const extraFields = ({ file }) => {
if (!file) return null;

return (
<div className="mb-6 rounded-3xl border border-slate-200 bg-white p-5 shadow-sm">
<div className="mb-4 flex items-center gap-2">
<Wand2 className="h-5 w-5 text-blue-600" />
<h3 className="text-sm font-bold text-slate-800">OCR Settings</h3>
</div>

<label className="mb-2 block text-sm font-semibold text-slate-700">
OCR Language
</label>
<select
value={language}
onChange={(event) => setLanguage(event.target.value)}
className="mb-4 w-full rounded-xl border border-slate-200 bg-white px-4 py-3 text-sm text-slate-700 outline-none transition focus:border-blue-400 focus:ring-2 focus:ring-blue-100"
>
<option value="eng">English</option>
<option value="hin">Hindi</option>
<option value="eng+hin">English + Hindi</option>
</select>

<label className="mb-2 block text-sm font-semibold text-slate-700">
Preprocessing
</label>
<select
value={preprocess}
onChange={(event) => setPreprocess(event.target.value)}
className="w-full rounded-xl border border-slate-200 bg-white px-4 py-3 text-sm text-slate-700 outline-none transition focus:border-blue-400 focus:ring-2 focus:ring-blue-100"
>
<option value="none">None</option>
<option value="light">Light denoise</option>
<option value="balanced">Balanced OCR cleanup</option>
<option value="strong">Strong thresholding</option>
</select>

<p className="mt-4 text-xs text-slate-500">
Best for scanned PDFs, invoices, forms, notes, and image-only documents.
</p>
</div>
);
};

return (
<ToolPageTemplate
title="Scanned PDF OCR"
description="Convert image-only scanned PDFs into searchable PDFs with selectable text."
endpoint="/searchable-pdf-ocr"
accept="application/pdf"
validateFile={validateFile}
modifyFormData={modifyFormData}
getOutputFileName={(fileName) =>
fileName.replace(/\.pdf$/i, "_searchable.pdf")
}
submitButtonText="Create Searchable PDF"
loadingButtonText="Running OCR..."
onSuccess={() => "Success! Searchable PDF created."}
extraFields={extraFields}
maxWidthClass="max-w-[760px]"
defaultIcon={<FileSearch />}
defaultText="Upload a scanned PDF"
supportText="Creates a searchable PDF using local OCR processing."
inputId="pdf-searchable-ocr-input"
/>
);
}

export default PdfSearchableOCR;