-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmakeocr.py
More file actions
87 lines (74 loc) · 2.87 KB
/
makeocr.py
File metadata and controls
87 lines (74 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import json
import os
from multiprocessing import Pool

import PIL
import pytesseract
from PIL import Image, ImageFile
from tqdm import tqdm

import sharedutils
import idownloadedentirenhentaicdn
# Directory that receives one "<image filename>.json" OCR result per page.
OUTPUT_DIR = "ocr/"
# Directory holding the page images, as defined by the downloader module.
MEDIA_DATA_DIR = idownloadedentirenhentaicdn.DATA_DIR
# When True, images whose media id is missing from the index are deleted
# instead of only being reported.
REMOVE_IMAGE_THAT_NOT_INDEXED = False
# Ask Pillow to decode truncated image files instead of raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Only pages whose indexed language is listed here are OCR'd.
desired_languages = ["english"]
# Index language name -> Tesseract language code.
languages_to_tesseract = {
    "english": "eng",
    "japanese": "jpn",
    "chinese": "chi_sim",
}

# Distinct languages present in the index, in first-seen order.
# dict.fromkeys de-duplicates in one pass while preserving that order.
possible_languages = list(
    dict.fromkeys(sharedutils.filtered_id_to_language.values())
)
print(possible_languages)

# NOTE(review): presumably this keeps each Tesseract invocation
# single-threaded because parallelism comes from the process pool - confirm.
os.environ['OMP_THREAD_LIMIT'] = '1'

# Image filenames to process, as listed by the downloader module.
filesPictures = idownloadedentirenhentaicdn.alreadyHere

# Make sure the output directory exists so a first run does not crash in
# os.listdir, then snapshot the results that are already present. A set keeps
# the per-image "already done?" check O(1) instead of scanning a list.
os.makedirs(OUTPUT_DIR, exist_ok=True)
alreadyHere = set(os.listdir(OUTPUT_DIR))
def process_ocr(file):
    """Run Tesseract OCR on one page image and store the result as JSON.

    The image is skipped (nothing is written) when:

    * ``file`` does not end in ``.png`` or ``.jpg``;
    * its media id is not in ``sharedutils.filtered_id_to_language`` (the
      image is deleted instead when ``REMOVE_IMAGE_THAT_NOT_INDEXED`` is set);
    * its language has no Tesseract mapping or is not in
      ``desired_languages``;
    * ``<file>.json`` is already listed in ``alreadyHere``.

    An image Pillow cannot identify is deleted and fetched again once through
    ``idownloadedentirenhentaicdn.http_get``; if the new copy is unreadable
    too, the page is given up on.

    Args:
        file: Image filename relative to ``MEDIA_DATA_DIR``.

    Returns:
        None. The OCR data is written to ``OUTPUT_DIR + file + ".json"``.
        Errors during OCR or writing are printed, not raised, so that one bad
        page does not abort the whole pool run.
    """
    if not file.endswith((".png", ".jpg")):
        return

    media_id, page, ext = sharedutils.files_media_to_media_id_page_ext(file)
    if media_id not in sharedutils.filtered_id_to_language:
        if REMOVE_IMAGE_THAT_NOT_INDEXED:
            os.remove(MEDIA_DATA_DIR + file)
        else:
            print("Skipping " + file + " because it's not in index")
        return

    language = sharedutils.filtered_id_to_language[media_id]
    if language not in languages_to_tesseract:
        return
    if language not in desired_languages:
        return

    # The output name is derived from the original filename, before any
    # re-download below may rebind ``file``.
    filenOutputName = file + ".json"
    if filenOutputName in alreadyHere:
        return

    try:
        image = Image.open(MEDIA_DATA_DIR + file)
    except PIL.UnidentifiedImageError:
        # The file on disk is not a decodable image: drop it and fetch again.
        os.remove(MEDIA_DATA_DIR + file)
        # NOTE(review): assumes http_get returns the filename of the fresh
        # download relative to MEDIA_DATA_DIR - confirm against that module.
        file = idownloadedentirenhentaicdn.http_get(media_id, page)
        try:
            image = Image.open(MEDIA_DATA_DIR + file)
        except PIL.UnidentifiedImageError:
            print("Da hood trying to download corrupted image but the downloaded image is corrupted: " + str(file))
            return

    try:
        # Close the image as soon as OCR is done so long-lived pool workers do
        # not accumulate open file handles.
        with image:
            output = pytesseract.image_to_data(
                image,
                lang=languages_to_tesseract[language],
                output_type=pytesseract.Output.DICT,
            )
        output['nhentai'] = {
            "id": sharedutils.filtered_id_to_id[media_id],
            "media_id": media_id,
            "page": page,
            "ext": ext,
            "language": language,
            'filename': file,
        }
        # Write to a temporary name and rename afterwards: an interrupted run
        # must not leave a truncated "<file>.json" that later runs would
        # treat as already done.
        output_path = OUTPUT_DIR + filenOutputName
        tmp_path = output_path + ".tmp"
        with open(tmp_path, "w") as f:
            json.dump(output, f)
        os.replace(tmp_path, output_path)
    except Exception as e:
        # Worker boundary: report and move on rather than killing the pool.
        print("Error processing: " + str(file) + ": " + str(e))
# Only start the worker pool when this file is executed as a script. Under
# the "spawn" start method every worker re-imports this module, and without
# the guard each of them would try to create a pool of its own.
if __name__ == "__main__":
    with Pool(4) as p:
        # process_ocr returns nothing useful; the iterator is consumed only to
        # drive the work and advance the progress bar.
        for _ in tqdm(p.imap(process_ocr, filesPictures), total=len(filesPictures)):
            pass