-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathKNN.py
More file actions
201 lines (176 loc) · 6.74 KB
/
Copy pathKNN.py
File metadata and controls
201 lines (176 loc) · 6.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# Authors Alexey Titov and Shir Bentabou
# Version 5.0
# Date 04.2019
# import the necessary packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader
from imutils import paths
import numpy as np
import pytesseract
import argparse
import imutils
import cv2
import os
import tempfile
import sys
# this function converts the first page of each pdf file to a jpg file
def convert(dirpdf, save_dir='IMAGES'):
    """Render the first page of every PDF in ``dirpdf`` to a JPEG file.

    Conversion is best-effort: a PDF that cannot be rendered is reported on
    stdout and skipped, and the remaining files are still processed.

    Args:
        dirpdf: Directory that is scanned (non-recursively) for PDF files.
        save_dir: Directory that receives ``<pdf base name>.jpg``. It is
            created when it does not exist yet.

    Returns:
        True once every PDF has been attempted. Individual failures are
        printed, never raised, and do not change the return value.

    Raises:
        OSError: If ``dirpdf`` cannot be listed or ``save_dir`` cannot be
            created.
    """
    # keep only regular files with a PDF extension (any letter case)
    files = [
        f for f in os.listdir(dirpdf)
        if f.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(dirpdf, f))
    ]
    cnt_files = len(files)
    os.makedirs(save_dir, exist_ok=True)

    converted = 0
    for filepdf in files:
        filename = os.path.join(dirpdf, filepdf)
        base_filename = os.path.splitext(filepdf)[0] + '.jpg'
        try:
            with tempfile.TemporaryDirectory() as path:
                # pages are 1-indexed: render only the first page
                images_from_path = convert_from_path(
                    filename, output_folder=path, first_page=1, last_page=1)
                # save while the temporary folder still exists, because the
                # returned images may be backed by files stored inside it
                for page in images_from_path:
                    page.save(os.path.join(save_dir, base_filename), 'JPEG')
        except Exception as err:
            # best-effort boundary around the third-party renderer: report
            # the failing file and carry on with the next one
            print(filepdf)
            print("[!] Convert PDF to JPEG: {}".format(err))
            continue
        converted += 1
        # show an update every 50 images
        if converted % 50 == 0:
            print("[INFO] processed {}/{}".format(converted, cnt_files))
    return True
# this function extracts a color histogram from an image
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Return a flattened, normalized HSV color histogram of ``image``.

    Args:
        image: Image array that is converted with ``cv2.COLOR_BGR2HSV``.
        bins: Number of histogram bins for the H, S and V channels.

    Returns:
        The normalized 3D histogram flattened into a 1D feature vector.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    channels = [0, 1, 2]
    # value ranges passed to calcHist for H, S and V respectively
    ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], channels, None, bins, ranges)

    if not imutils.is_cv2():
        # newer OpenCV releases normalize into the supplied output array
        cv2.normalize(histogram, histogram)
    else:
        # OpenCV 2.4.X returns the normalized histogram instead
        histogram = cv2.normalize(histogram)

    # the flattened histogram is the feature vector
    return histogram.flatten()
# this function computes a blur score for an image
def detect_image_blur(imgPath):
    """Compute a blur score for the image stored at ``imgPath``.

    The score is the variance of the Laplacian of the grayscale image.

    Args:
        imgPath: Path of the image file to analyse.

    Returns:
        A one-element set holding the score, or ``{0}`` when the image cannot
        be read or processed. The set wrapper is kept for backward
        compatibility: the feature-building loop of this script appends
        ``list(blur)`` to the histogram features.
    """
    try:
        image = cv2.imread(imgPath)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # measure edges on the grayscale image (the original computed `gray`
        # but then ran the Laplacian on the color image)
        score = cv2.Laplacian(gray, cv2.CV_64F).var()
    except Exception as err:
        # imread yields None for unreadable files, which makes cvtColor fail;
        # report the file and fall back to a neutral score
        print(imgPath)
        print("[!] Blur detection failed: {}".format(err))
        return {0}
    return {score}
if __name__ == "__main__":
    # Script entry point: describe every image found in IMAGES with a color
    # histogram plus a blur score, then train and evaluate a k-NN classifier.

    # construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset", required=True,
                    help="path to input dataset")
    ap.add_argument("-k", "--neighbors", type=int, default=5,
                    help="# of nearest neighbors for classification")
    ap.add_argument(
        "-j",
        "--jobs",
        type=int,
        default=-1,
        help="# of jobs for k-NN distance (-1 uses all available cores)")
    args = vars(ap.parse_args())

    # define the name of the directory to be created
    path = "IMAGES"
    try:
        os.mkdir(path)
    except OSError:
        print(
            "[!] Creation of the directory %s failed, maybe the folder is exist" %
            path)
    else:
        print("[*] Successfully created the directory %s " % path)

    arg = os.path.join(os.getcwd(), args["dataset"])
    # NOTE(review): the PDF -> JPEG conversion call is disabled, so `arg` is
    # currently unused and IMAGES must already contain the images. Confirm
    # whether `convert(arg)` should be re-enabled.
    result = True  # convert(arg)
    if result:
        print("[*] Succces convert pdf files")
    else:
        print("[!] Whoops. something wrong dude. enable err var to track it")
        # non-zero status so callers can tell the run failed
        sys.exit(1)

    # grab the list of images that we'll be describing
    print("[INFO] describing images...")
    imagePaths = list(paths.list_images("IMAGES"))

    # initialize the features list and the labels list
    features = []
    labels = []

    # loop over the input images
    for (i, imagePath) in enumerate(imagePaths):
        # load the image and extract the class label (assuming that our
        # file name has the format: {class}.{image_num}.jpg)
        image = cv2.imread(imagePath)
        if image is None:
            # imread returns None for unreadable files; skip them instead of
            # crashing inside the histogram extraction
            print("[!] Skipping unreadable image {}".format(imagePath))
            continue
        label = os.path.basename(imagePath).split(".")[0]

        # histogram to characterize the color distribution of the pixels
        # in the image
        hist = extract_color_histogram(image)
        # blur score, appended as one extra feature
        blur = detect_image_blur(imagePath)
        hist = np.array(list(hist) + list(blur))

        # update the features and labels lists
        features.append(hist)
        labels.append(label)

        # show an update every 50 images
        if (i > 0 and i % 50 == 0):
            print("[INFO] processed {}/{}".format(i, len(imagePaths)))

    if not features:
        # nothing to split or train on
        print("[!] No readable images found in IMAGES")
        sys.exit(1)

    # show some information on the memory consumed by the features matrix
    features = np.array(features)
    labels = np.array(labels)
    print("[INFO] features matrix: {:.2f}MB".format(
        features.nbytes / (1024 * 1024.0)))

    # partition the data into training and testing splits, using 80%
    # of the data for training and the remaining 20% for testing
    (trainFeat, testFeat, trainLabels, testLabels) = train_test_split(
        features, labels, test_size=0.20, random_state=42)

    # train and evaluate a k-NN classifier on the histogram
    # representations
    print("[INFO] evaluating histogram accuracy...")
    # Algorithm used to compute the nearest neighbors:
    #   'ball_tree' will use BallTree
    #   'kd_tree' will use KDTree
    #   'brute' will use a brute-force search
    #   'auto' will attempt to decide the most appropriate algorithm based
    #   on the values passed to the fit method
    model = KNeighborsClassifier(
        algorithm='auto',
        n_neighbors=args["neighbors"],
        n_jobs=args["jobs"])
    model.fit(trainFeat, trainLabels)

    # only accuracy
    acc = model.score(testFeat, testLabels)
    print("[INFO] histogram accuracy: {:.2f}%".format(acc * 100))

    # precision, recall, f1-score and support
    predictions = model.predict(testFeat)
    # show a final classification report demonstrating the accuracy of the
    # classifier
    print("EVALUATION ON TESTING DATA")
    print(classification_report(testLabels, predictions))