-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubs.py
More file actions
207 lines (181 loc) · 7.75 KB
/
subs.py
File metadata and controls
207 lines (181 loc) · 7.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import os
import sys
import codecs
# Global word -> occurrence-count table. handle_text() adds to it and read()
# folds, sorts and prints it.
all_words = {}

# (old, new) substring pairs applied to English text before it is split into
# words: hyphens become spaces and common contractions are expanded.
english_replacements = [
    ('-', ' '),
    ("'re", ' are'),
    ("'ve", ' have'),
    ("n't", ' not'),
    ("'ll", ' will'),
    ("'m", ' am'),
]

# Word endings that main() hands to read() as `suffixes` for English text.
english_suffixes = [
    's',
    'ing',
    'd',
    'ed',
]

# No pre-split rewriting is defined for Spanish text.
spanish_replacements = []

# Spanish word endings. NOTE(review): the u'...', layout matches the candidate
# list that read() prints, so this table was presumably pasted from such a
# run -- confirm before pruning entries.
spanish_suffixes = [
    u's', u'n', u'is', u'a', u'do', u'ba', u'es', u'on', u'á', u'r',
    u'mente', u'mos', u'o', u'd', u'ban', u'da', u'emos', u'te', u'e', u'la',
    u'le', u'lo', u'an', u'itas', u'as', u'os', u'los', u'ción', u'res', u'án',
    u'el', u'me', u'se', u'l', u'i', u'las', u'das', u'dos', u'les', u'm',
    u'cia', u'ron', u'ta', u'en', u'ás', u'tas', u'ita', u'é', u'ra', u'b',
    u'na', u'dor', u'rá', u'teis', u'dad', u'sa', u'ble', u'que', u't', u'so',
    u'nte', u'monos', u'eo', u'am', u'nos', u'c', u'za', u'idad', u'ndo', u'ad',
    u'ma', u'ga', u'de', u'at', u'no', u'samente', u'eos', u'ías', u'or', u'ro',
    u'jo', u'ncia', u'aim', u'je', u'nes',
]
def handle_text(text, replacements):
    """Count the words of ``text`` in the module-level ``all_words`` table.

    Args:
        text: the text to tokenize; it is split on whitespace.
        replacements: iterable of ``(old, new)`` substring pairs applied to
            the whole text, in order, before it is split.

    Each token is lower-cased, stripped of the listed punctuation/markup and
    then of every remaining character that has no upper/lower-case
    distinction (digits, symbols, ...). Tokens that end up empty -- pure
    numbers or punctuation -- are ignored, so they no longer show up in the
    table as the empty word ``''``.
    """
    global all_words
    # Removed in this order before the generic filter below. Only the markup
    # tags really need listing: they contain a letter that the generic filter
    # would otherwise leave behind.
    symbols = ['.', "'", '"', ',', ';', '?', '!', '<i>', '</i>',
               '(', ')', '{', '}', ':', '-']

    for old, new in replacements:
        text = text.replace(old, new)

    for token in text.split():
        word = token.lower()
        for sym in symbols:
            word = word.replace(sym, '')
        # A character whose lower and upper forms are equal is not a letter.
        word = ''.join(ch for ch in word if ch.lower() != ch.upper())
        if not word:
            continue
        all_words[word] = all_words.get(word, 0) + 1
def sub_parser(line, replacements, statics={'lines': []}):
    """Feed one line of a subtitle file to the word counter.

    Lines are buffered until a timing line (one containing ``-->``) arrives;
    the buffered text is then passed to ``handle_text`` and the buffer is
    emptied.

    Args:
        line: one raw line of the subtitle file.
        replacements: ``(old, new)`` pairs forwarded to ``handle_text``.
        statics: holder of the line buffer under the key ``'lines'``. The
            mutable default is intentional: it is the state that survives
            between calls. Pass your own dict to keep streams separate.

    NOTE: text that follows the last timing line of a file stays in the
    buffer, because nothing after it triggers a flush.
    """
    lines = statics['lines']
    line = line.strip()
    if '-->' in line:
        # The last buffered line is dropped -- presumably the numeric cue
        # counter that precedes a timing line in SubRip files.
        handle_text('\n'.join(lines[:-1]), replacements)
        # Clear the shared list in place. Rebinding the local name would leave
        # the buffer untouched, and every later timing line would re-count all
        # the text seen so far.
        del lines[:]
    else:
        lines.append(line)
def bible_parser(line, replacements):
    """Count the words in the last tab-separated field of ``line``.

    Surrounding whitespace is stripped first; everything before the final
    tab is ignored. ``replacements`` is forwarded to ``handle_text``.
    """
    stripped = line.strip()
    last_field = stripped.rsplit('\t', 1)[-1]
    handle_text(last_field, replacements)
def dictionary_parser(path):
    """Load a tab-separated dictionary file into a nested lookup table.

    Every useful line holds exactly three tab-separated fields:
    ``words<TAB>translation<TAB>type``. ``words`` and ``translation`` may each
    list several alternatives separated by ``;``. Blank lines, lines starting
    with ``#`` and lines with any other number of fields are skipped.

    Args:
        path: location of the UTF-8 encoded dictionary file.

    Returns:
        ``{Word: {type: [Translation, ...]}}`` where every word variant and
        every translation has been stripped and capitalized. Each variant of
        a line receives all translations of that line.
    """
    dictionary = {}
    with codecs.open(path, 'r', 'utf-8') as source:
        for line in source:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            parts = line.split('\t')
            if len(parts) != 3:
                # split() never returns an empty list, so the field count is
                # what has to be checked; a malformed line is skipped rather
                # than aborting the whole load with ValueError on unpacking.
                continue
            words, translation, kind = parts
            # Trace of every accepted record.
            print(str([words, translation, kind]).encode('utf-8'))
            meanings = [item.strip().capitalize()
                        for item in translation.split(';')]
            for variant in words.split(';'):
                key = variant.strip().capitalize()
                dictionary.setdefault(key, {}).setdefault(kind, []).extend(meanings)
    return dictionary
def lingvo_parser(path):
    """Parse a tagged dictionary dump into ``{headword: [translations]}``.

    The file is read as UTF-8 text and cut into entries at every ``<k>`` tag.
    Inside an entry:

    * ``<k>`` gives the headword (stripped, lower-cased);
    * ``<dtrn>`` elements, at top level or nested in other elements, give
      translations;
    * the text of a top-level ``<ex>`` is stored as is (it may be ``None``);
    * ``<kref>`` cross-references are used only when the entry has no
      translation of its own, and are stored as ``'^:' + target``.

    The result is cached next to the source as ``path + '.pcl'``; when that
    file exists it is loaded instead of parsing ``path`` again.

    Args:
        path: location of the dictionary file.

    Returns:
        dict mapping each headword to its list of translations.

    Raises:
        xml.etree.ElementTree.ParseError: if an entry is not well-formed XML.
    """
    import xml.etree.ElementTree as ET
    import pickle

    cache_path = path + '.pcl'
    if os.path.exists(cache_path):
        # NOTE(security): unpickling runs whatever the file tells it to; only
        # keep cache files that this function wrote itself.
        with open(cache_path, 'rb') as cache:
            return pickle.load(cache)

    dictionary = {}

    def deep_search(root, name):
        """Yield the outermost descendants of ``root`` whose tag is ``name``."""
        for el in root:
            if el.tag == name:
                yield el
            else:
                for res in deep_search(el, name):
                    yield res

    def get_text(el):
        """Join the tails of ``el``'s children, then ``el``'s own text."""
        pieces = [child.tail for child in el if child.tail]
        if el.text:
            pieces.append(el.text)
        return ' '.join(pieces)

    def handle_block(block):
        """Store the entry held in ``block``; ignore text without a headword."""
        if '<k>' not in block:
            # Whatever precedes the first headword (blank lines, a preamble)
            # has no entry to attach to.
            return
        root = ET.fromstring(('<r>%s</r>' % block).encode('utf-8'))
        krefs = []
        # Blocks are cut so that they start with <k>, hence `name` and
        # `translation` are bound before any other element is visited.
        for el in root:
            tag = el.tag
            if tag == 'k':
                name = get_text(el).strip().lower()
                translation = dictionary.setdefault(name, [])
            elif tag == 'dtrn':
                translation.append(get_text(el).strip())
            elif tag == 'ex':
                translation.append(el.text)
            elif tag == 'kref':
                krefs.append(el)
            else:
                krefs.extend(deep_search(el, 'kref'))
                for dtrn in deep_search(el, 'dtrn'):
                    translation.append(get_text(dtrn).strip())
        if not translation:
            # No translation of its own: fall back to cross-references.
            for kref in krefs:
                ref = get_text(kref).strip().lower()
                if ref != name:
                    translation.append('^:' + ref)
        if not translation:
            print('no translation: %s' % name.encode('utf-8'))

    with codecs.open(path, 'r', 'utf-8') as source:
        block = []
        for line in source:
            line = line.strip()
            if '<k>' in line:
                # The text before <k> still belongs to the previous entry.
                head, tail = line.split('<k>', 1)
                block.append(head)
                handle_block('\n'.join(block))
                block = ['<k>' + tail]
            else:
                block.append(line)
        # The last entry has no following <k> to flush it.
        handle_block('\n'.join(block))

    with open(cache_path, 'wb') as cache:
        pickle.dump(dictionary, cache, protocol=pickle.HIGHEST_PROTOCOL)
    return dictionary
def main():
#en_ru = dictionary_parser(os.path.join(os.path.dirname(__file__), 'english-russian.txt'))
#read(english_replacements, english_suffixes, sub_parser, en_ru)
en_ru = lingvo_parser(r'c:\Users\araud\Downloads\LingvoUniversalEnRu.dict')
read(r'c:\Users\araud\Downloads\The Family Man (2000).srt', english_replacements, english_suffixes, sub_parser, en_ru)
"""
sp_ru = lingvo_parser(r'c:\Users\araud\Downloads\Spanish\UniversalEsRu.dict')
read(r'c:\temp\SpanishBible\b_spanish_mod_utf-8.txt', spanish_replacements, spanish_suffixes, bible_parser, sp_ru)
"""
def read(path, replacements, suffixes, parser, dictionary):
    """Count the words of a text file and print frequency/translation reports.

    Steps, in order:

    1. Every line of ``path`` (UTF-8) is passed to ``parser(line,
       replacements)``, which is expected to fill the module-level
       ``all_words`` table.
    2. A word that ends in one of ``suffixes`` and whose remainder (at least
       two characters long) is itself a counted word is folded into that
       shorter word.
    3. Printed reports, separated by dashed lines: candidate suffixes found
       between alphabetically adjacent words, the word frequency list, the
       number of distinct words, and finally up to 2000 lines
       ``Word<TAB>translation | translation ...`` for words seen more than
       once that have a translation in ``dictionary``.

    Args:
        path: text file to analyse.
        replacements: ``(old, new)`` pairs forwarded to ``parser``.
        suffixes: word endings used for folding in step 2.
        parser: callable ``parser(line, replacements)``.
        dictionary: mapping ``word -> list of translations``; an item of the
            form ``'^:other'`` is a reference to the entry of ``other``.

    Returns:
        None. ``all_words`` is modified in place; results go to stdout.
    """
    global all_words
    separator = '-------------------------------------------'

    with codecs.open(path, 'r', 'utf-8') as source:
        for line in source:
            parser(line, replacements)

    # Step 2: map each inflected form to the shorter word it extends.
    forms = {}
    for suffix in suffixes:
        suffix_len = len(suffix)
        for word in all_words:
            if not word.endswith(suffix):
                continue
            stem = word[:-suffix_len]
            # len(stem) > 1 also rejects an empty suffix, for which the slice
            # above yields '' and every word would collapse into it.
            if len(stem) > 1 and stem in all_words:
                forms[word] = stem
    for form, stem in forms.items():
        # Either side may already have been merged away by an earlier pair.
        if form in all_words and stem in all_words:
            all_words[stem] += all_words[form]
            del all_words[form]

    words = list(all_words.items())
    print(separator)

    # Candidate suffixes: what is left of a word after removing the word that
    # precedes it alphabetically, when that word is a true prefix. (A plain
    # substring test would also match in the middle of the word, and
    # str.replace would strip every occurrence.)
    candidates = {}
    prev = ''
    for word, _ in sorted(words, key=lambda pair: pair[0]):
        if prev and word.startswith(prev):
            tail = word[len(prev):]
            candidates[tail] = candidates.get(tail, 0) + 1
        prev = word
    ranked = sorted(candidates.items(), key=lambda pair: pair[1], reverse=True)
    print(separator)
    # One line, laid out so it can be pasted into a suffix list.
    print(' '.join("u'%s'," % suff.encode('utf-8')
                   for suff, count in ranked if count > 3))
    for suff, count in ranked:
        print('%s %s' % (suff.encode('utf-8'), count))
    print(separator)

    words.sort(key=lambda pair: pair[1], reverse=True)
    for word, count in words:
        print('%d\t%s' % (count, word.encode('utf-8')))
    print(separator)
    print('%s %d' % ('Total: ', len(words)))
    print(separator)

    def resolve(entries, visiting):
        """Yield the non-empty translations in ``entries``, following refs."""
        for entry in entries:
            if not entry:
                continue
            text = entry.strip()
            if text.startswith('^:'):
                target = text[2:]
                if target in visiting:
                    # Entries that point at each other would recurse forever.
                    continue
                text = '\t|\t'.join(
                    resolve(dictionary.get(target, []), visiting | set([target]))
                ).strip()
            if text:
                yield text

    remaining = 2000  # maximum number of translated words to print
    for word, count in words:
        if remaining <= 0:
            break
        if count <= 1 or not word.strip():
            continue
        translation = '\t|\t'.join(resolve(dictionary.get(word, []), set([word])))
        if not translation:
            continue
        print(('%s\t%s' % (word.capitalize(), translation)).encode('utf-8'))
        remaining -= 1
# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()