-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdetectLang2.py
More file actions
53 lines (44 loc) · 1.55 KB
/
Copy pathdetectLang2.py
File metadata and controls
53 lines (44 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# USAGE
# pip install pyPdf
# python detectLang2.py
# libraries
from pyPdf import PdfFileReader
import os
# Translation table: PDF /Lang code (lower-case key) -> Tesseract language code
lan_lst = {
    # English variants
    "en-us": "eng",
    "en": "eng",
    "en-za": "eng",
    "en-gb": "eng",
    "en-in": "eng",
    # Spanish variants
    "es-co": "spa",
    "es": "spa",
    # German
    "de-de": "deu",
    # French variants
    "fr-fr": "fra",
    "fr-ca": "fra",
}
# Result table filled in by the loop below. The value codes for /Root/Lang are:
#   1 - an exception was raised
#   2 - the file has no /Root/Lang
#   3 - /Root/Lang is the empty string
#   4 - a language was found
ans_list = {}
# Gather the names of the regular files in trainPDF that end in .pdf or .PDF
files = []
for entry in os.listdir('trainPDF'):
    if not os.path.isfile(os.path.join('trainPDF', entry)):
        continue  # skip sub-directories and other non-file entries
    if entry.endswith(('.pdf', '.PDF')):
        files.append(entry)
# Inspect the /Root/Lang entry of every PDF named in `files`: write one log
# line per file to Langs.txt and record [value, language] in ans_list under
# the path of the matching image. Value codes:
#   1 - an exception was raised, or the language code has no entry in lan_lst
#   2 - the file has no /Root/Lang
#   3 - /Root/Lang is the empty string
#   4 - the language code was translated through lan_lst
with open("Langs.txt", "w") as f:  # closed even if the loop is interrupted
    for filepdf in files:
        # Swap only the extension: str.replace('pdf', 'jpg') also rewrote any
        # 'pdf' inside the stem and left upper-case '.PDF' names unchanged.
        name = 'IMAGES/' + os.path.splitext(filepdf)[0] + '.jpg'
        language = 'None'
        try:
            # All catalog access happens inside this `with`, so the PDF handle
            # can be released as soon as the /Lang entry has been fetched.
            with open(os.path.join('trainPDF', filepdf), 'rb') as pdf_stream:
                catalog = PdfFileReader(pdf_stream).trailer['/Root'].getObject()
                has_lang = "/Lang" in catalog
                lang = catalog['/Lang'].getObject() if has_lang else ''

            if not has_lang:
                value = 2
                f.write(filepdf + " value = " + str(value) + "\n")
            elif lang == '':
                value = 3
                f.write(filepdf + " " + lang + " value = " + str(value) + "\n")
            else:
                lang = lang.lower()
                mapped = lan_lst.get(lang)
                if mapped is None:
                    # No translation for this code. This case used to reach
                    # value 1 only through a TypeError on str + None; keep the
                    # same recorded result but log the offending code.
                    value = 1
                    f.write(filepdf + " " + lang + " => no mapping ; value = "
                            + str(value) + "\n")
                else:
                    value = 4
                    language = mapped
                    f.write(filepdf + " " + lang + " => " + language
                            + " value = " + str(value) + "\n")
        except Exception:
            # Best effort: an unreadable or malformed PDF is recorded, not
            # fatal. KeyboardInterrupt/SystemExit are no longer swallowed.
            value = 1
            language = 'None'
            f.write(filepdf + ' except ; value = ' + str(value) + "\n")
        ans_list[name] = [value, language]
print(ans_list)