-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfixUnk.py
More file actions
29 lines (22 loc) · 962 Bytes
/
fixUnk.py
File metadata and controls
29 lines (22 loc) · 962 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import argparse
import os
import sys

# Extend the import path before the project-local import below
# (presumably ./lexer/ is where folderManager lives; the path is relative to
# the current working directory, so run the script from the repository root).
sys.path.append("./lexer/")
from folderManager import Folder
parser = argparse.ArgumentParser(
    description="Kenlm doesn't handle existing <unk> so let's hide them as UNK"
)
parser.add_argument(
    "input_dir",
    # The first fragment ends with a space so the two pieces no longer fuse
    # into "onlywith" in the rendered help text.
    help="Directory to look at. Since the explicity <unk> is used only "
    "with the lstm sets, we we replace unks in train, valid, and test files only.",
    action="store",
    type=str,
)

# File names (relative to input_dir) that are rewritten in place.
lstm_files = ["train", "valid", "test"]


def replace_unk_in_file(path):
    """Rewrite the file at *path* in place, replacing "<unk>" with "UNK".

    The whole file is read into memory, every literal occurrence of
    ``<unk>`` is replaced by ``UNK``, and the result is written back to the
    same path. Both handles are closed deterministically via ``with``.

    Args:
        path: Path of the text file to rewrite.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
    """
    with open(path, "r") as src:
        contents = src.read()
    contents = contents.replace("<unk>", "UNK")
    with open(path, "w") as dst:
        dst.write(contents)


def main(argv=None):
    """Parse the command line and hide "<unk>" tokens in the lstm files.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``
            (argparse's default).
    """
    args = parser.parse_args(argv)
    # Kept from the original script, where the result was bound but only used
    # by a commented-out variant (baseDir.fullFileNames).
    # NOTE(review): the constructor may validate or create the directory --
    # confirm against folderManager before removing this call.
    Folder(args.input_dir)
    fileList = [os.path.join(args.input_dir, name) for name in lstm_files]
    print(fileList)
    for path in fileList:
        replace_unk_in_file(path)


if __name__ == "__main__":
    main()