-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnltk_functions.py
More file actions
102 lines (88 loc) · 3.55 KB
/
nltk_functions.py
File metadata and controls
102 lines (88 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# -------------------------------------------------------
# Assignment (2)
# Written by (Mohamed Hefny, 40033382)
# For COMP 472 Section (ABIX) – Summer 2020
# --------------------------------------------------------
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet
import string
import const
import re
# Fetch the NLTK data packages this module relies on (presumably the
# tokenizer models, the POS tagger model and the WordNet corpus used by the
# functions below). This runs every time the module is imported.
for _package in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_package)
# Accumulates everything dropped from the vocabulary (punctuation characters
# and rejected tokens); shared, module-level state.
removed_words = set()
# Punctuation characters that are reported as removed when they occur in a
# corpus. The backslash is written as an explicit "\\" escape: the previous
# literal relied on the invalid escape sequence "\]", which newer Python
# versions warn about. The resulting set of characters is unchanged.
remove_punc = set('"#$%&\'()*+,./:;<=>?@[\\]^-`{|}~”')
def get_wordnet_pos(word):
    """
    Map a single word to the WordNet part-of-speech constant used for
    lemmatization, e.g. ('mohamed' -> noun, 'play' -> verb).

    param word: word to be classified
    returns: wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV;
             wordnet.NOUN when the tag's first letter is not recognised
    """
    # pos_tag is given a one-element list, so the only (token, tag) pair is
    # at index 0 and the tag itself at index 1 of that pair.
    tagged = nltk.pos_tag([word])
    tag_initial = tagged[0][1][0].upper()

    # Only the first letter of the tag decides the WordNet category.
    pos_by_initial = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )
    fallback = wordnet.NOUN
    if tag_initial in pos_by_initial:
        return pos_by_initial[tag_initial]
    return fallback
def remove_vocab():
    """
    Write all the removed vocabulary to an external file.

    Every entry of the module-level removed_words set is written on its own
    line to the path given by const.REMOVEDWORDS. The file is created or
    truncated and encoded as UTF-8. Entries are written in the set's
    iteration order.
    """
    # The context manager guarantees the file is closed even if a write
    # raises, which the previous explicit open()/close() pair did not.
    with open(const.REMOVEDWORDS, 'w+', encoding='utf-8') as fl:
        fl.writelines('%s\n' % word for word in removed_words)
def corpus_lemmatization(corpus_string, stopwords=None, word_length_filtering=False, baseline=False):
    """
    Sanitize a corpus string and return its lemmatized vocabulary words.

    Punctuation is stripped from the string, the remainder is tokenized with
    nltk.word_tokenize, and every token that is purely alphabetic (or matches
    the underscore-word pattern) is lemmatized using the part of speech
    reported by get_wordnet_pos. Lemmas that fail valid_length or that appear
    in stopwords are left out of the result.

    param corpus_string: string to be tokenized
    param stopwords: collection of words to not include in the vocabulary
                     (None means no stopwords)
    param word_length_filtering: if true applies word length filtering
    param baseline: if true records the punctuation found in the corpus and
                    the rejected tokens in the module-level removed_words set
    returns: list of accepted lemmas in token order (duplicates are kept)
    """
    if stopwords is None:
        stopwords = set()

    # Record which of the tracked punctuation characters occur in the corpus.
    if baseline:
        removed_words.update(char for char in remove_punc if char in corpus_string)

    # Strip punctuation in a single pass. Note that '-' is part of the table,
    # so hyphens are removed as well. The backslash is escaped explicitly
    # instead of relying on the invalid escape sequence "\]".
    punctuation_table = str.maketrans('', '', "!\"#$%'()*+,./:;<=>?@[\\]^-`{|}~”…")
    no_punc = corpus_string.translate(punctuation_table)

    # Compiled once per call rather than looked up for every token; the raw
    # string keeps \w and \S from being read as (invalid) string escapes.
    underscore_word = re.compile(r"\w+(?=\S*[_])([a-zA-Z_]+)")
    lemmatizer = WordNetLemmatizer()

    words = []
    for token in nltk.word_tokenize(no_punc):
        if not (token.isalpha() or underscore_word.match(token)):
            # NOTE(review): the reviewed copy of this file had lost its
            # indentation; the rejection branch is assumed to pair with this
            # alphabetic check rather than with the length/stopword check
            # below - confirm against the original layout.
            if baseline:
                removed_words.add(token)
            continue

        lemma = lemmatizer.lemmatize(token, get_wordnet_pos(token))
        # Length filtering and the stopword test are applied to the lemma,
        # not to the raw token.
        if valid_length(lemma, word_length_filtering) and lemma not in stopwords:
            words.append(lemma)
    return words
def valid_length(word, word_length_filtering, min_length=3, max_length=8):
    """
    Check whether a word passes the word-length filter.

    param word: word to be checked
    param word_length_filtering: if true applies word length filtering to the
                                 word; if false every word is accepted
    param min_length: shortest accepted length (default 3, so words of
                      length <= 2 are rejected)
    param max_length: longest accepted length (default 8, so words of
                      length >= 9 are rejected)
    returns: False when filtering is on and len(word) lies outside
             [min_length, max_length]; True otherwise
    """
    # With filtering switched off the length is never inspected.
    if not word_length_filtering:
        return True
    return min_length <= len(word) <= max_length