-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathanalysis.py
More file actions
126 lines (90 loc) · 3.4 KB
/
Copy pathanalysis.py
File metadata and controls
126 lines (90 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import argparse
import collections
import statistics
import json
import numpy as np
import random
from tqdm import tqdm
from smart_open import smart_open
def get_cf():
    """Load the character-frequency table from ``cht.small.freq.json``.

    Returns:
        The decoded JSON document. Callers in this file treat it as a
        mapping of character -> occurrence count.
    """
    # Read inside a context manager so the handle is closed once the JSON
    # has been parsed instead of being left to the garbage collector.
    with smart_open("cht.small.freq.json") as freq_file:
        return json.load(freq_file)
def _parse_table(lines):
    """Read ``code char ...`` lines into code<->character lookup tables.

    Returns:
        ``(co2ch, ch2co)`` where ``co2ch`` maps each raw code to the list of
        characters listed under it, and ``ch2co`` maps each character to its
        list of *effective* codes. An effective code is the raw code for the
        first character on that code, and the raw code plus a numeric suffix
        for later ones, so every effective code is unique.
    """
    co2ch = collections.defaultdict(list)
    ch2co = collections.defaultdict(list)
    taken = set()  # every effective code issued so far

    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            # Blank or single-field line: there is no (code, char) pair.
            continue
        code, char = fields[:2]

        # Append 1, 2, 3, ... until the effective code is unused. The check
        # must run against codes issued during this same pass; otherwise
        # every duplicate would receive the suffix "1".
        eff_code = code
        n = 0
        while eff_code in taken:
            n += 1
            eff_code = f"{code}{n}"
        taken.add(eff_code)

        co2ch[code].append(char)
        ch2co[char].append(eff_code)

    return co2ch, ch2co


def _select_codes(ch2co, proficiency):
    """Pick one effective code per character according to *proficiency*.

    Returns:
        ``(eff_ch2co, eff_co2ch)``: character -> chosen code, and the
        reverse mapping.
    """
    eff_ch2co = {}
    eff_co2ch = {}

    for char, codes in ch2co.items():
        shortest_code = min(codes, key=len)
        alternatives = [code for code in codes if code != shortest_code]

        # The shortest code is used with probability `proficiency`; otherwise
        # one of the other codes is drawn, so the shortest is never returned
        # by accident. No random draw happens when there is no alternative.
        if alternatives and random.random() >= proficiency:
            select_code = random.choice(alternatives)
        else:
            select_code = shortest_code

        eff_ch2co[char] = select_code
        eff_co2ch[select_code] = char

    return eff_ch2co, eff_co2ch


def analysis(input, proficiency):
    """Analyse an input-method code table and print two reports.

    The first report covers every character in the table, unweighted. The
    second is restricted to the characters present in the frequency table
    returned by ``get_cf()`` and adds a frequency-weighted code length.

    Args:
        input: Iterable of text lines. The first two whitespace-separated
            fields of each line are taken as the code and the character;
            lines with fewer than two fields are skipped.
        proficiency: Probability (0.0-1.0) that the shortest code is the one
            used for a character that has several codes.
    """
    co2ch, ch2co = _parse_table(tqdm(input))
    eff_ch2co, eff_co2ch = _select_codes(ch2co, proficiency)

    print("* 只考慮編碼表內的字,無加權")
    evaluate(ch2co, co2ch, eff_ch2co, eff_co2ch)

    cf = get_cf()

    # Restrict every table to the characters that have a known frequency.
    ch2co_f = {}
    eff_ch2co_f = {}
    eff_co2ch_f = {}
    for char in cf:
        if char in ch2co:
            ch2co_f[char] = ch2co[char]
            code = eff_ch2co[char]
            eff_ch2co_f[char] = code
            eff_co2ch_f[code] = char

    co2ch_f = {}
    for code, chars in co2ch.items():
        frequent = [char for char in chars if char in cf]
        if frequent:
            co2ch_f[code] = frequent

    print(f"\n* 考慮最常使用的 {len(cf)} 字 (語料庫共 {sum(cf.values())} 字)")
    evaluate(ch2co_f, co2ch_f, eff_ch2co_f, eff_co2ch_f, cf=cf)
def evaluate(ch2co, co2ch, eff_ch2co, eff_co2ch, cf=None):
    """Print summary statistics for a code table.

    Args:
        ch2co: Mapping of character -> list of codes for that character.
        co2ch: Mapping of code -> list of characters listed under that code.
        eff_ch2co: Mapping of character -> the single code selected for it.
        eff_co2ch: Mapping of selected code -> character. Kept in the
            signature for callers; it is not read here.
        cf: Optional mapping of character -> frequency. When truthy, a
            frequency-weighted mean code length is printed as well; every
            character in ``eff_ch2co`` must then be a key of ``cf``.

    Returns:
        None. All results are written to standard output.
    """
    total_chars = len(ch2co)
    print("總字數 ", total_chars)
    print("總編碼數", len(co2ch))

    # Number of characters that share their code with another character.
    code_mchar = sum(len(chars) for chars in co2ch.values() if len(chars) > 1)
    # Number of characters that can be entered with more than one code.
    char_mcode = sum(1 for codes in ch2co.values() if len(codes) > 1)

    def share(count):
        # An empty table has no meaningful ratio; report 0 instead of
        # dividing by zero.
        return count / total_chars if total_chars else 0.0

    print("一碼多字(重碼字) {:6d} {:6.2%}".format(code_mchar, share(code_mchar)))
    print("一字多碼(多種拆法) {:6d} {:6.2%}".format(char_mcode, share(char_mcode)))

    # Fix one character order so lengths and weights stay aligned below.
    chars = list(eff_ch2co)
    codelen = [len(eff_ch2co[char]) for char in chars]

    # statistics.mean needs at least one value and statistics.stdev at least
    # two; fall back to 0 for tables too small to compute them.
    codelen_avg = statistics.mean(codelen) if codelen else 0.0
    codelen_std = statistics.stdev(codelen) if len(codelen) > 1 else 0.0
    print(f"碼長: 平均 {codelen_avg:5.3f} 標準差 {codelen_std:5.3f}")

    if cf:
        print("依字頻加權後")
        char_freq = [cf[char] for char in chars]
        if sum(char_freq) > 0:
            codelen_wavg = np.average(codelen, weights=char_freq)
        else:
            # np.average cannot normalise weights that sum to zero.
            codelen_wavg = 0.0
        print(f"碼長: 平均 {float(codelen_wavg):5.3f}")
def main():
    """Parse the command line and run the analysis on the given file.

    Positional ``FILE`` is the code table to analyse; ``--proficiency``/``-p``
    is the probability of using the shortest code (default 1.0).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "input",
        metavar="FILE",
        help="要分析的 cin 檔",
        type=argparse.FileType('r', errors="ignore"),
    )
    ap.add_argument(
        "--proficiency", "-p",
        help="熟練度(使用最短碼的機率), 預設 1.0",
        type=float,
        default=1.0,
    )
    args = ap.parse_args()

    # argparse.FileType opens the file during parsing and never closes it;
    # close it explicitly once the analysis has consumed it.
    with args.input as table:
        analysis(table, args.proficiency)
# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()