-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreprocess.py
More file actions
17 lines (15 loc) · 1.25 KB
/
preprocess.py
File metadata and controls
17 lines (15 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#!/usr/bin/python
# vim: setfileencoding=utf-8
import re
def sep_punct(string, drop):
    """Separate or remove punctuation and normalise curly single quotes.

    Every occurrence of one of the characters
    ``{ } [ ] " “ ” ( ) — … : ; , * . ? ! /`` is either padded with a space
    on each side (``drop`` falsy) or replaced by a single space (``drop``
    truthy).  Afterwards the curly single quotes ‘ and ’ are replaced by the
    ASCII apostrophe.

    ASCII apostrophes and hyphens are left untouched, and the whitespace
    produced by the substitution is not collapsed, so the result may
    contain runs of spaces as well as leading or trailing spaces.

    Args:
        string: The text to process.
        drop: If truthy, punctuation is removed (replaced by a space);
            otherwise it is kept and set off by spaces.

    Returns:
        The processed text as a new string.
    """
    # Kept as an explicit alternation (rather than a character class) so the
    # pattern text stays exactly what the original code used.
    punct = r"(\{|\}|\[|\]|\"|“|\(|\)|”|—|…|:|;|,|\*|\.|\?|!|/)"

    # The two modes differ only in what a punctuation mark is replaced with.
    replacement = " " if drop else r" \g<1> "
    spaced = re.sub(punct, replacement, string)

    # Splitting on either curly single quote and re-joining with "'" maps
    # both of them to the ASCII apostrophe.
    return "'".join(re.split("‘|’", spaced))
    # NOTE(review): earlier, commented-out experiments were removed from this
    # function.  One padded trailing and leading punctuation differently; the
    # others also treated runs of ASCII apostrophes at word edges as
    # punctuation.  None of them was active; see version control if needed.
def sent_break(*args):
    """Split each argument at sentence-final punctuation followed by a space.

    Each string in ``args`` is split wherever ``.``, ``!`` or ``?`` is
    immediately followed by a single space.  The pieces from all arguments
    are collected, in order, into one flat list.

    Because the split pattern contains two capturing groups, every split
    point contributes two extra items to the list: the matched delimiter
    including its trailing space (e.g. ``". "``) followed by the bare
    punctuation mark (e.g. ``"."``).

    Args:
        *args: Any number of strings to split.

    Returns:
        A list of strings; empty when no arguments are given.
    """
    # NOTE(review): the inner group looks like it was only meant for
    # grouping the alternation; it is kept capturing so the layout of the
    # returned list does not change for existing callers -- confirm intent.
    boundary = r"((\.|!|\?) )"

    holder = []
    for text in args:
        holder.extend(re.split(boundary, text))
    return holder