-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
26 lines (19 loc) · 903 Bytes
/
utils.py
File metadata and controls
26 lines (19 loc) · 903 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from datasets import load_dataset
import os
def load_data(save_to_disk: bool = False):
    """Load the IWSLT 2017 English-German dataset.

    Args:
        save_to_disk: When true, additionally write the loaded dataset to
            the relative path ``iwslt2017_en_de``.

    Returns:
        The object returned by ``load_dataset`` for the
        ``iwslt2017-en-de`` configuration.
    """
    corpus = load_dataset(
        "iwslt2017",
        "iwslt2017-en-de",
        trust_remote_code=True,
    )
    # Nothing more to do unless a persisted copy was requested.
    if not save_to_disk:
        return corpus
    corpus.save_to_disk("iwslt2017_en_de")
    return corpus
# Loaded once at import time; prepare_corpus reads this module-level dataset.
data = load_data()
def prepare_corpus(file_name: str, limit: float | int):
    """Write a plain-text corpus built from the train split to a file.

    The English sentences of the first ``limit`` training examples are
    joined with single spaces, followed by the German sentences of the same
    examples, and the result is written next to this module.

    Args:
        file_name: Name of the output file, created in this module's
            directory (an existing file is overwritten).
        limit: Number of training examples to use. A ``float`` is treated
            as a fraction of the train split's length and truncated to an
            int; an ``int`` is used as an absolute count.
    """
    train = data['train']
    if isinstance(limit, float):
        limit = int(limit * len(train))

    # Slice rows first, then pick the column: this reads only the examples
    # that are needed (and only once) instead of materialising the entire
    # 'translation' column before slicing it.
    pairs = train[:limit]['translation']
    eng = ' '.join(pair['en'] for pair in pairs)
    ger = ' '.join(pair['de'] for pair in pairs)

    # Separate the two halves with a space so the last English word does not
    # fuse with the first German word; empty halves are skipped so an empty
    # selection still produces an empty file.
    text = ' '.join(part for part in (eng, ger) if part)

    # The text is fully built before the file is opened, so a failure while
    # reading the dataset cannot leave a truncated, empty output file.
    path = os.path.join(os.path.dirname(__file__), file_name)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Your text corpus is saved at {path}")
if __name__ == '__main__':
    # An int limit is an absolute number of training examples; a float is
    # read as a fraction of the train split instead (per the original note,
    # 25% of this dataset is around 50k examples per-language).
    prepare_corpus(file_name="corpus.txt", limit=500)