# textAnalysis/postprocess.py at main · bowersd/textAnalysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import re
import needleman

def parser_out_string_dict(string):
    """Group parser output lines into a dict keyed by each line's first word.

    STRING is split on newlines and every line is split on whitespace.  The
    first token of a line is the key; the remaining tokens (as a list) are one
    entry under that key.  Entries keep first-seen order and an entry that is
    already present for a key is not added again.

    Blank and whitespace-only lines (including the lone "\\r" left behind by
    CRLF input) are skipped.

    Returns a dict mapping str -> list of list of str.
    """
    proc = {}
    for line in string.split('\n'):
        fields = line.split()
        # A line with no tokens has no key; indexing fields[0] would fail.
        if not fields:
            continue
        key, rest = fields[0], fields[1:]
        entries = proc.setdefault(key, [])
        if rest not in entries:
            entries.append(rest)
    return proc

def minimal_filter(*msds):
    """Keep the msds whose second element equals the first msd's second element."""
    if not msds:
        return []
    reference = msds[0][1]
    kept = []
    for candidate in msds:
        if candidate[1] == reference:
            kept.append(candidate)
    return kept

def score_edits(typed, *generated):
    """Score each generated candidate by how far its form is from TYPED.

    The first element of every candidate is aligned with TYPED through
    needleman.align; the score is the number of positions at which the two
    aligned sequences differ.

    Returns a dict keyed by candidate form (the candidate's first element), so
    candidates sharing a form share a single entry.
    """
    scores = {}
    for candidate in generated:
        # NOTE(review): -1 is presumably the gap penalty for needleman.align -- confirm
        aligned = needleman.align(typed, candidate[0], -1, needleman.make_id_matrix(typed, candidate[0]))
        top, bottom = aligned[0], aligned[1]
        mismatches = 0
        for pos in range(len(top)):
            if top[pos] != bottom[pos]:
                mismatches += 1
        scores[candidate[0]] = mismatches
    return scores

def min_morphs(*msds):
    """Return the smallest number of "+" characters found in any msd's first element."""
    plus_counts = []
    for msd in msds:
        plus_counts.append(msd[0].count("+"))
    return min(plus_counts)

def disambiguate2(scored, *msds):
    """Return the index of the first msd whose form has the lowest score in SCORED.

    SCORED maps forms to scores; each msd's first element is looked up in it.
    """
    best = min(scored.values())
    # Every msd is looked up (not just up to the first hit) so a form missing
    # from SCORED is always reported.
    winners = [idx for idx, msd in enumerate(msds) if scored[msd[0]] == best]
    return min(winners)

def disambiguate(target, f, *msds):
    """Return the index of the earliest msd m with f(m) == target, else 0."""
    # Scanning in order means callers can exploit a weighting scheme simply by
    # putting preferred analyses first.
    hits = (idx for idx, msd in enumerate(msds) if f(msd) == target)
    # Fall back to the first analysis when nothing matches.
    return next(hits, 0)

def extract_regex(string, regex):
    """Return the text of the first match of REGEX in STRING, or None if absent."""
    # Taking a regex rather than a literal allows searches such as +V+AI or +VAI.
    found = re.search(regex, string)
    if found is None:
        return None
    return found.group(0)

def extract_lemma(string, pos_regex):
    """Return the lemma contained in STRING.

    The lemma is taken to be the last "+"-separated item before the
    part-of-speech tag matched by POS_REGEX; it may be preceded by prefixes or
    be word-initial.

    When STRING contains "+Cmpd", it is cut at every "+Cmpd" and the lemma of
    each piece is extracted the same way; the results are joined with "+".
    No check that POS_REGEX matches is made on this path.

    Otherwise returns None when POS_REGEX does not match STRING.
    """
    if "+Cmpd" in string:
        pieces = re.split(r"\+Cmpd", string)
        return "+".join(
            re.split(pos_regex, piece)[0].split("+")[-1] for piece in pieces
        )
    if re.search(pos_regex, string) is None:
        return None
    # Everything before the POS tag; its final "+" item follows all other
    # morphemes and is the lemma.
    before_tag = re.split(pos_regex, string)[0]
    return before_tag.split("+")[-1]

def extract_msd(string, pos_regex):
    """Return the morphosyntactic description in STRING, minus lemma and POS tag.

    The POS tag is the first match of POS_REGEX and the lemma is the last
    "+"-separated item before it.  The text surrounding "lemma + POS tag" is
    returned:

    * material on both sides -> the two sides joined by "_", which marks
      where the lemma was;
    * material on one side only -> that side unchanged;
    * nothing on either side, or POS_REGEX does not match -> None.

    Both the lemma and the matched POS tag are passed through re.escape before
    being embedded in the search pattern, so any regex metacharacter in them
    ("+", "?", ".", "(", "[" ...) is matched literally.
    """
    tag_match = re.search(pos_regex, string)
    if tag_match is None:
        return None
    # Escape the tag text: a bare "+" would otherwise be read as Kleene plus.
    pos = re.escape(tag_match.group(0))
    # Lemma = last "+" item before the POS tag; escaped so that it can only
    # match itself when spliced into the pattern below.
    lemma = re.escape(re.split(pos_regex, string)[0].split("+")[-1])
    surround = re.search("(.*)" + lemma + pos + "(.*)", string)
    before, after = surround.group(1), surround.group(2)
    if not (before or after):
        return None
    if not (before and after):
        # Only one side is non-empty; return it as is.
        return before + after
    # Leave a "_" to mark where the lemma was.
    return "_".join([before, after])

def plus_to_dot(string):
    """Strip leading/trailing "+" from STRING and turn the remaining "+" into ".".

    Returns None when STRING is empty or None.
    """
    if not string:
        return None
    trimmed = string.strip("+")
    return trimmed.replace("+", ".")

def slash_to_e(string):
    """Return STRING with every "/" removed; None when STRING is empty or None."""
    if not string:
        return None
    return string.replace('/', "")

def brackets(string):
    """Backslash-escape STRING when it is exactly "(" or ")"; otherwise return it unchanged."""
    is_lone_paren = string in ("(", ")")
    return "\\" + string if is_lone_paren else string