typical_frames/typical_frames_main.py at master · cltl/typical_frames · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from .xml_utils import srl_id_frames, term_id_lemmas, determiner_id_info, compound_id_info, get_text_title, frame_info_dict, sentence_info
from .path_utils import get_naf_paths
from .corpus_utils import delete_smallest_texts, corpus_to_json, select_event_types, sample_corpus
from .fficf_utils import frames_collections, frame_stats, ff_icf, scores_to_format, scores_to_json

from lxml import etree
import json
import os
import pandas as pd

def frame_info(naf_root,
                verbose=0):
    """
    Parse one NAF document and gather the information about its frames.

    The document is parsed with lxml and its root element is handed to each of
    the extraction helpers; their results are combined by ``frame_info_dict``.
    :param naf_root: source of the NAF document, passed unchanged to ``etree.parse``
        (presumably a file path -- confirm against the callers)
    :param verbose: when 2 or higher, the combined result is printed
    :type naf_root: string
    :type verbose: integer
    :return: the value built by ``frame_info_dict`` from the extracted pieces
    """
    naf_tree_root = etree.parse(naf_root).getroot()

    # The dict literal is evaluated top to bottom, so the helpers run in the
    # order listed here.
    extracted = {
        "title": get_text_title(naf_tree_root),
        "framedict": srl_id_frames(naf_tree_root),
        "lemmadict": term_id_lemmas(naf_tree_root),
        "sentencedict": sentence_info(naf_tree_root),
        "detdict": determiner_id_info(naf_tree_root),
        "compounddict": compound_id_info(naf_tree_root),
    }
    document_frame_info = frame_info_dict(**extracted)

    if verbose >= 2:
        print(document_frame_info)
    return document_frame_info

def event_type_info(collections,
                verbose=0):
    """
    Returns a dictionary with event type as key and, as value, the list of
    ``frame_info`` results for the NAF documents of that event type.

    The documents of each event type are processed in the order in which they
    are listed, and the event types keep the order of ``collections``.
    :param collections: a collection of collections of NAF paths per event type
    :param verbose: verbosity level, forwarded to ``frame_info`` for every document
    :type collections: dictionary
    :type verbose: integer
    :return: dictionary mapping each event type to a list with one ``frame_info``
        result per document
    """
    # ``verbose`` is passed on so that the caller's verbosity setting actually
    # reaches the per-document extraction instead of being dropped here.
    return {
        event_type: [frame_info(naf_file, verbose=verbose) for naf_file in naf_files]
        for event_type, naf_files in collections.items()
    }

def load_corpus(project,
                language,
                output_folder=None,
                minimal_frames_per_doc=10,
                start_from_scratch=True,
                verbose=0):
    """
    Collect the NAF files of a project, extract their linguistic information per
    event type, filter the documents on their number of frames and store the
    result through ``corpus_to_json``.

    :param project: the name of the project, used by ``get_naf_paths`` to locate
        the NAF files (per the original documentation: the project under which
        the corpus is stored in DFNDataReleases)
    :param language: the language of the corpus
    :param output_folder: output folder, handed to ``corpus_to_json``
    :param minimal_frames_per_doc: passed to ``delete_smallest_texts`` as
        ``minimal_n_frames``, the minimal number of annotated frames per document
    :param start_from_scratch: start from scratch, handed to ``corpus_to_json``
    :param verbose: verbosity level; at 2 or higher the number of remaining
        texts per event type is printed
    :type project: string
    :type language: string
    :type output_folder: string
    :type minimal_frames_per_doc: integer
    :type start_from_scratch: boolean
    :type verbose: integer
    :return: None
    """
    naf_paths_per_event_type = get_naf_paths(project=project,
                                             language=language,
                                             verbose=verbose)
    # NOTE: the verbosity level is not handed to event_type_info here.
    info_per_event_type = event_type_info(collections=naf_paths_per_event_type)
    filtered_corpus = delete_smallest_texts(collections=info_per_event_type,
                                            minimal_n_frames=minimal_frames_per_doc,
                                            verbose=verbose)

    if verbose >= 2:
        for event_type, collection in filtered_corpus.items():
            print(f'{event_type}: {len(collection)} reference texts')

    corpus_to_json(corpus_dict=filtered_corpus,
                   output_folder=output_folder,
                   start_from_scratch=start_from_scratch,
                   verbose=verbose)

def contrastive_analysis(event_types=None,
                            output_folder=None,
                            start_from_scratch=False,
                            verbose=2):
    """
    Extract frames from corpus per event type, perform ff*icf and export the
    scores through ``scores_to_format`` and ``scores_to_json`` (excel and json
    according to the original documentation).

    The corpus is read from ``<output_folder>/corpus_info.json``.
    :param event_types: specified wikidata event type identifiers; despite the
        ``None`` default this argument is required and must hold at least two
        identifiers
    :param output_folder: folder that contains ``corpus_info.json``; also handed
        to the export functions
    :param start_from_scratch: start from scratch, handed to the export functions
    :param verbose: verbosity level, handed to every processing step
    :type event_types: list
    :type output_folder: string
    :type start_from_scratch: boolean
    :type verbose: integer
    :raises AssertionError: if ``event_types`` is not a list, holds fewer than
        two identifiers, or if the corpus file does not exist
    :return: None
    """
    # Validation is done with explicit raises rather than ``assert`` statements:
    # asserts are removed when Python runs with -O, which would let invalid
    # input through. AssertionError and the messages are kept so that existing
    # callers observe the same errors as before.
    if not isinstance(event_types, list):
        raise AssertionError("event type identifiers are not in list")
    if len(event_types) < 2:
        raise AssertionError("provide at least two identifiers in the event types list")

    corpus_path = f"{output_folder}/corpus_info.json"
    if not os.path.isfile(corpus_path):
        raise AssertionError("corpus not found")

    with open(corpus_path, "r") as infile:
        corpus_dict = json.load(infile)

    # event_types is guaranteed to be a list at this point, so the selection is
    # always applied (the former "use the whole corpus" branch was unreachable).
    event_type_info_dict = select_event_types(event_types=event_types,
                                              corpus_dict=corpus_dict,
                                              verbose=verbose)

    sampled_corpus = sample_corpus(collections=event_type_info_dict,
                                   verbose=verbose)
    event_type_frames_dict = frames_collections(event_type_frame_collections=sampled_corpus,
                                                verbose=verbose)
    frame_freq_dict = frame_stats(event_type_frames_dict=event_type_frames_dict,
                                  verbose=verbose)
    fficf_dict = ff_icf(collections=sampled_corpus,
                        event_type_frames_dict=event_type_frames_dict,
                        frame_freq_dict=frame_freq_dict,
                        verbose=verbose)

    scores_to_format(fficf_dict=fficf_dict,
                     frame_freq_dict=frame_freq_dict,
                     output_folder=output_folder,
                     start_from_scratch=start_from_scratch,
                     event_types=event_types,
                     verbose=verbose)
    scores_to_json(fficf_dict=fficf_dict,
                   output_folder=output_folder,
                   start_from_scratch=start_from_scratch,
                   verbose=verbose)
    return