-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
92 lines (72 loc) · 2.77 KB
/
Copy pathpreprocessor.py
File metadata and controls
92 lines (72 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# preprocessor.py
# Mark Van Moer
# mvanmoer@parkland.edu
# CSC 220, Fall 2016
import sys
import argparse
def _strip_apostrophes(word):
    '''Return word with apostrophe suffixes and quote marks removed.

    Handles, in order:
    * possessives: "count's" -> "count", "dogs'" -> "dog"
    * a trailing quote mark: "word'" -> "word"
    * contracted "had": "he'd" -> "he"
    * a leading apostrophe (old timey abbreviation or opening inner
      quotation): "'tis" -> "tis"

    The leading apostrophe is checked independently of the suffix rules so
    that a word needing both (e.g. "'pharaon'" or "'count's") is fully
    cleaned instead of keeping a stray apostrophe.
    '''
    if word.endswith(("'s", "s'")):
        word = word[:-2]
    elif word.endswith("'"):
        word = word[:-1]
    elif word.endswith("'d"):
        word = word[:-2]
    if word.startswith("'"):
        word = word[1:]
    return word


def _tokenize(line):
    '''Lower-case line, blank out everything except letters and
    apostrophes, and return the resulting list of cleaned words.'''
    cleaned = ''.join(c if c.isalpha() or c == "'" else ' '
                      for c in line.lower())
    return [_strip_apostrophes(word) for word in cleaned.split()]


def preprocess(args):
    '''Preprocess a text file and write the result.

    args is an object (e.g. an argparse namespace) with the attributes:
    * input     -- name of the text file to process.
    * stopwords -- name of a file listing one stopword per line.
    * output    -- name of the file to write, or a false value to write
                   to standard output.

    Processing steps, applied line by line:
    * Convert all letters to lower case.
    * Replace punctuation and non-alphabetic characters (other than
      apostrophes) with spaces, then clean up apostrophes.
    * Replace stopwords with spaces.
    * Replace all words with less than three characters with spaces.

    Removed words are replaced by a run of spaces of the same length, so
    each output line keeps one (possibly blank) slot per input word.
    A short summary of how many words were removed is printed to standard
    output once all lines are processed.
    '''
    # A set gives constant-time membership tests; the file is closed as
    # soon as it has been read.
    with open(args.stopwords, 'r') as stopfile:
        stopwords = {entry.strip() for entry in stopfile}

    stops = 0
    short = 0
    with open(args.input, 'r') as inputtext:
        # Open the output only after the input opened successfully, so a
        # missing input file never truncates an existing output file.
        outputtext = open(args.output, 'w') if args.output else sys.stdout
        try:
            for line in inputtext:
                words = _tokenize(line)
                for i, word in enumerate(words):
                    # Replace stopwords and short words with spaces.
                    if word in stopwords:
                        stops += 1
                        words[i] = ' ' * len(word)
                    elif len(word) < 3:
                        short += 1
                        words[i] = ' ' * len(word)
                outputtext.write(' '.join(words) + '\n')
        finally:
            # Never close standard output; only close a file we opened.
            if outputtext is not sys.stdout:
                outputtext.close()

    # When no output file was given these lines follow the processed text
    # on standard output.
    print('Preprocessing stats for {}'.format(args.input))
    print('Stopwords removed: {:>12}'.format(stops))
    print('Short words removed: {:>10}'.format(short))
if __name__ == '__main__':
    # Command-line entry point: two required positional file names and an
    # optional --output path, all handed to preprocess() as one namespace.
    cli = argparse.ArgumentParser(description='A text preprocessor.')
    for positional, description in (('input', 'Text to process.'),
                                    ('stopwords', 'Stopwords list.')):
        cli.add_argument(positional, help=description)
    cli.add_argument('--output', dest='output',
                     help='File for processed output.')
    preprocess(cli.parse_args())