-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathxml_utils.py
More file actions
138 lines (123 loc) · 5.39 KB
/
xml_utils.py
File metadata and controls
138 lines (123 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
from lxml import etree
from collections import defaultdict
def get_text_title(root):
    """Return the title of the text stored in the NAF header.

    :param root: root element of a parsed NAF document
    :return: value of the ``title`` attribute of ``nafHeader/fileDesc``,
        or None when that element or the attribute is absent
    """
    file_desc = root.find('nafHeader/fileDesc')
    if file_desc is None:
        # No file description in the header: report "no title" the same way
        # a missing ``title`` attribute does, instead of failing with an
        # AttributeError on None.
        return None
    return file_desc.get('title')
def srl_id_frames(root):
    """Map the target id of each SRL predicate to its frame label.

    For every child of the ``srl`` layer, the ``reference`` attribute of the
    first ``externalReferences/externalRef`` is cut down to the frame name,
    its first character is upper-cased, and the result is stored under the
    ``id`` of the first ``span/target``.

    Children that lack an external reference, a span target, or a non-empty
    frame name are skipped.

    :param root: root element of a parsed NAF document
    :return: dict of target id -> frame label; empty when there is no
        ``srl`` layer
    """
    # Number of leading characters dropped from the reference to obtain the
    # frame name. NOTE(review): presumably the length of a fixed URI prefix
    # shared by all references -- confirm against the data.
    prefix_length = 35

    framedict = {}
    srl_layer = root.find('srl')
    if srl_layer is None:
        return framedict

    for predicate in srl_layer:
        ext_ref_el = predicate.find('externalReferences/externalRef')
        target = predicate.find('span/target')
        if ext_ref_el is None or target is None:
            continue
        uri = ext_ref_el.get('reference')
        frame = uri[prefix_length:] if uri else ''
        if not frame:
            # Nothing left after the prefix: there is no frame name to label.
            continue
        framedict[target.get('id')] = frame[0].upper() + frame[1:]
    return framedict
def term_id_lemmas(root):
    """Map each term id to its lemma, part of speech and first word-form id.

    :param root: root element of a parsed NAF document
    :return: dict of term id -> ``{"lemma": ..., "POS": ..., "wf": ...}``,
        where ``"wf"`` is the ``id`` of the term's first ``span/target`` (or
        None when the term has no span target); empty when there is no
        ``terms`` layer
    """
    lemmadict = {}
    terms_layer = root.find('terms')
    if terms_layer is None:
        return lemmadict

    for term in terms_layer:
        target = term.find('span/target')
        lemmadict[term.get('id')] = {
            "lemma": term.get('lemma'),
            "POS": term.get('pos'),
            # A term without a span target keeps its lemma/POS entry; only
            # the word-form link is left empty.
            "wf": target.get('id') if target is not None else None,
        }
    return lemmadict
def sentence_info(root):
    """Group the word-form ids of the ``text`` layer by sentence.

    :param root: root element of a parsed NAF document
    :return: ``defaultdict(set)`` mapping the ``sent`` attribute of each
        child of the ``text`` layer to the set of ``id`` attributes seen for
        that sentence; empty when there is no ``text`` layer
    """
    sentencedict = defaultdict(set)
    text_layer = root.find('text')
    if text_layer is None:
        return sentencedict

    for word_form in text_layer:
        sentencedict[word_form.get('sent')].add(word_form.get('id'))
    return sentencedict
def compound_id_info(root):
    """Collect head/modifier roles from ``compound`` dependencies.

    Every child of the ``deps`` layer whose ``rfunc`` is ``"compound"``
    yields two entries: one for its ``from`` id (the head) and one for its
    ``to`` id (the modifier), each pointing at the other.

    :param root: root element of a parsed NAF document
    :return: dict of term id -> ``{"component": "head", "modifier id": ...}``
        or ``{"component": "modifier", "head id": ...}``; empty when there is
        no ``deps`` layer. An id that occurs in several compound
        dependencies keeps only the entry written last.
    """
    compounddict = {}
    deps_layer = root.find('deps')
    if deps_layer is None:
        return compounddict

    for dep in deps_layer:
        if dep.get('rfunc') != "compound":
            continue
        head_id = dep.get('from')
        modifier_id = dep.get('to')
        compounddict[head_id] = {"component": 'head', "modifier id": modifier_id}
        compounddict[modifier_id] = {"component": 'modifier', "head id": head_id}
    return compounddict
def determiner_id_info(root):
    """Collect the determiner attached to each term via ``det`` dependencies.

    :param root: root element of a parsed NAF document
    :return: dict mapping the ``from`` id of every ``deps`` child whose
        ``rfunc`` is ``"det"`` to ``{"det id": <its "to" id>}``; empty when
        there is no ``deps`` layer. If one ``from`` id has several ``det``
        dependencies, the last one wins.
    """
    detdict = {}
    deps_layer = root.find('deps')
    if deps_layer is None:
        return detdict

    for dep in deps_layer:
        if dep.get('rfunc') != "det":
            continue
        detdict[dep.get("from")] = {"det id": dep.get("to")}
    return detdict
def frame_info_dict(title,
                    framedict,
                    lemmadict,
                    sentencedict,
                    detdict,
                    compounddict):
    """Combine the per-layer dictionaries into one frame overview for a text.

    :param title: key under which the result is stored
    :param framedict: term id -> frame label
    :param lemmadict: term id -> ``{"lemma": ..., "POS": ..., "wf": ...}``
    :param sentencedict: sentence id -> collection of word-form ids
    :param detdict: term id -> ``{"det id": ...}``
    :param compounddict: term id -> ``{"component": "head" | "modifier", ...}``
    :return: ``{title: {"frame frequency": n, "frame info": {...}}}`` where
        ``n`` is the number of entries in ``framedict`` and ``"frame info"``
        holds, for every frame term that is also in ``lemmadict``, a dict
        with the keys ``frame``, ``lemma``, ``POS``, ``sentence`` (only when
        the term's word form occurs in a sentence), ``article`` and
        ``compound``.
    """
    # Invert sentence -> word forms once so every term is resolved with a
    # single lookup instead of a scan over all sentences. Later sentences
    # overwrite earlier ones, so a word form listed in several sentences
    # resolves to the last one.
    sentence_of_wf = {}
    for sentence, words in sentencedict.items():
        for wf in words:
            sentence_of_wf[wf] = sentence

    id_info_dict = {}
    for term_id, frame in framedict.items():
        if term_id not in lemmadict:
            continue
        term = lemmadict[term_id]
        info_dict = {"frame": frame, "lemma": term['lemma'], "POS": term['POS']}
        wf = term['wf']
        if wf in sentence_of_wf:
            info_dict['sentence'] = sentence_of_wf[wf]
        info_dict['article'] = _article_info(term_id, lemmadict, detdict)
        info_dict['compound'] = _compound_info(term_id, lemmadict, compounddict)
        id_info_dict[term_id] = info_dict

    # The frequency counts every frame term, including those that have no
    # entry in lemmadict and therefore no info entry.
    return {title: {'frame frequency': len(framedict), 'frame info': id_info_dict}}


def _article_info(term_id, lemmadict, detdict):
    """Return the article entry for a term: 'a'/'an' -> indefinite, 'the' -> definite."""
    det = detdict.get(term_id)
    if det is not None:
        det_id = det['det id']
        if det_id in lemmadict:
            article = lemmadict[det_id]['lemma']
            if article in ('a', 'an', 'the'):
                return {"definite": article == 'the', "lemma": article}
    # No determiner, unknown determiner term, or a determiner that is not an
    # article.
    return {"definite": None, "lemma": None}


def _compound_info(term_id, lemmadict, compounddict):
    """Return the compound entry for a term: its role and the "modifier head" lemma."""
    unresolved = {"function": None, "lemma": None}
    entry = compounddict.get(term_id)
    if entry is None:
        return unresolved

    component = entry['component']
    if component == 'head':
        head_id, modifier_id = term_id, entry['modifier id']
    elif component == 'modifier':
        head_id, modifier_id = entry['head id'], term_id
    else:
        return unresolved

    # Both parts must be known terms; otherwise the compound lemma cannot be
    # built. Reporting it as unresolved avoids reusing a lemma left over from
    # a previously processed term.
    if head_id not in lemmadict or modifier_id not in lemmadict:
        return unresolved

    compound = f"{lemmadict[modifier_id]['lemma']} {lemmadict[head_id]['lemma']}"
    return {"function": component, "lemma": compound}