-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
89 lines (64 loc) · 2.68 KB
/
data.py
File metadata and controls
89 lines (64 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
""" Sets up the data for the neural network """
import pandas as pd # Provides data structures and data analysis
import tensorflow as tf # Provides the basis for neural networks
import sys
import csv
# Workaround for the data set being too large for pandas: raise the csv
# module's per-field size limit as high as this platform accepts.
# csv.field_size_limit() raises OverflowError when the requested limit is too
# large for the platform, so start at sys.maxsize and shrink the value by a
# factor of 10 until the call succeeds.
maxInt = sys.maxsize
decrement = True
while decrement:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Floor division keeps the value an exact int; int(maxInt / 10)
        # would round-trip through a float and lose precision.
        maxInt //= 10
    else:
        # The limit was accepted; stop shrinking.
        decrement = False
# Paths of the training and test CSV files read by load_data().
TRAIN_PATH = "data/collated_train"
TEST_PATH = "data/collated_test"

# Column layout of a CSV row: sixteen 'CharN' feature columns followed by the
# 'Language' label column.
CSV_COLUMN_NAMES = ['Char1', 'Char2', 'Char3', 'Char4', 'Char5', 'Char6', 'Char7',
                    'Char8', 'Char9', 'Char10', 'Char11', 'Char12', 'Char13', 'Char14',
                    'Char15', 'Char16', 'Language']

# Per-column defaults handed to tf.decode_csv as record_defaults: one entry
# per column in CSV_COLUMN_NAMES.  The list used to hold only 16 entries for
# 17 columns, so zipping the column names with the decoded fields dropped the
# 'Language' label.  A [''] default makes the column decode as a string.
# NOTE(review): the label column is decoded as a string too -- if the files
# store integer class indices, change the last entry to [0]; confirm against
# the data.
CSV_TYPES = [[''] for _ in CSV_COLUMN_NAMES]

# Language names; presumably the classes the label column refers to (not
# referenced elsewhere in the visible code -- confirm against callers).
LANGUAGE = ['German', 'English', 'Spanish', 'Italian']
# Note to self:
# y means label
# x means feature
def load_data(y_name='Language'):
    """Read the training and test CSV files and split off the label column.

    Args:
        y_name: Name of the column to use as the label.

    Returns:
        ``(train_x, train_y), (test_x, test_y)``, where each ``*_x`` is the
        DataFrame without the label column and each ``*_y`` is that column.
    """
    def read_split(csv_path):
        # header=0 combined with names= replaces the file's own first row
        # with CSV_COLUMN_NAMES.
        frame = pd.read_csv(csv_path, names=CSV_COLUMN_NAMES, header=0, engine='python')
        # pop() removes the label column from the frame in place, so the
        # frame that is returned holds only the features.
        label_column = frame.pop(y_name)
        return frame, label_column

    train_split = read_split(TRAIN_PATH)
    test_split = read_split(TEST_PATH)
    return train_split, test_split
def train_input_fn(features, labels, batch_size=100):
    """Build the shuffled, repeating, batched dataset used for training.

    Args:
        features: Anything ``dict()`` accepts, mapping column names to values.
        labels: Label values, sliced in step with the features.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` of ``(features, labels)`` batches.
    """
    feature_columns = dict(features)
    examples = tf.data.Dataset.from_tensor_slices((feature_columns, labels))
    # Shuffle with a 500000-element buffer, repeat without an end, then batch.
    shuffled = examples.shuffle(500000)
    endless = shuffled.repeat()
    return endless.batch(batch_size)
def test_input_fn(features, labels, batch_size=100):
    """Build the batched dataset used for evaluation.

    Args:
        features: Anything ``dict()`` accepts, mapping column names to values.
        labels: Label values, or ``None`` to build a features-only dataset.
        batch_size: Number of examples per batch; must not be ``None``.

    Returns:
        A batched ``tf.data.Dataset`` of features, or of
        ``(features, labels)`` pairs when labels are given.
    """
    feature_columns = dict(features)
    # Without labels the dataset carries only the features.
    inputs = feature_columns if labels is None else (feature_columns, labels)
    eval_dataset = tf.data.Dataset.from_tensor_slices(inputs)
    assert batch_size is not None, "batch_size must not be none"
    return eval_dataset.batch(batch_size)
def parse_line(line):
    """Decode one CSV line into a features dict and its label.

    NOTE: CSV_TYPES must hold one default per name in CSV_COLUMN_NAMES;
    with fewer entries, zip() drops the trailing 'Language' column and the
    pop below raises KeyError.

    Args:
        line: A CSV-formatted line to decode.

    Returns:
        ``(features, label)``: the decoded columns keyed by column name with
        'Language' removed, and the 'Language' column itself.
    """
    columns = tf.decode_csv(line, record_defaults=CSV_TYPES)
    # Pair each decoded column with its name.
    named = {name: column for name, column in zip(CSV_COLUMN_NAMES, columns)}
    # Separate the label from the feature columns.
    target = named.pop('Language')
    return named, target
def csv_input(csv_path, batch_size):
    """Build a shuffled, repeating, batched dataset straight from a CSV file.

    Args:
        csv_path: Path of the CSV file to read line by line.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` of parsed ``(features, label)`` batches.
    """
    lines = tf.data.TextLineDataset(csv_path)
    # Skip the first line of the file (presumably the header row).
    rows = lines.skip(1)
    examples = rows.map(parse_line)
    # Shuffle with a 1000-element buffer, repeat without an end, then batch.
    return examples.shuffle(1000).repeat().batch(batch_size)
def main(argv):
    """Load the training and test data and print it.

    Args:
        argv: Argument list supplied by ``tf.app.run``; unused.
    """
    loaded = load_data()
    print(loaded)


if __name__ == '__main__':
    # Run as a script: enable INFO-level TensorFlow logging, then let
    # tf.app.run invoke main().
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run(main)