-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathTimePreprocessor.py
More file actions
60 lines (49 loc) · 3.19 KB
/
TimePreprocessor.py
File metadata and controls
60 lines (49 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import bisect

import pandas as pd
def findKMostRecentTimestamps(userSortedTimestamp, timestamp, sequence_length):
    """Return the ``sequence_length`` interactions strictly before ``timestamp``.

    Args:
        userSortedTimestamp: One user's history. Must expose ``timestamp``
            and ``item_id`` attributes that support ``.tolist()`` (e.g. a
            pandas DataFrame with those two columns). The timestamps are
            assumed to be sorted in ascending order -- the search relies on
            it; verify against the caller.
        timestamp: The query time, in the same unit as the history.
        sequence_length: Size ``K`` of the window to return.

    Returns:
        A ``(timestamps, item_ids)`` tuple of two lists, each of length
        ``sequence_length``, ordered oldest to newest. When fewer than
        ``sequence_length`` earlier interactions exist, the lists are
        left-padded with ``0``.

    Entries whose timestamp equals ``timestamp`` are never part of the
    window. A query later than every entry returns the last
    ``sequence_length`` entries; an empty history, or a query earlier than
    every entry, returns only padding.
    """
    padding = [0] * sequence_length
    padded_timestamps = padding + userSortedTimestamp.timestamp.tolist()
    padded_item_ids = padding + userSortedTimestamp.item_id.tolist()

    # Index of the first real entry whose timestamp is >= the query.
    # Starting the search at ``sequence_length`` keeps it out of the padding,
    # which guarantees ``window_end - sequence_length`` is never negative
    # (a negative slice start would silently wrap around).
    window_end = bisect.bisect_left(padded_timestamps, timestamp, sequence_length)
    window_start = window_end - sequence_length

    return (padded_timestamps[window_start:window_end],
            padded_item_ids[window_start:window_end])
def _calendar_features(epoch_seconds, suffix='', with_year=False):
    """Derive zero-based calendar columns from a Series of epoch seconds.

    The returned frame keeps the index of ``epoch_seconds`` and holds, in
    this order: ``year`` (only when ``with_year``), ``month`` (0-11),
    ``date`` (day of month, 0-30), ``hour`` and ``day_of_week`` (pandas
    convention, Monday == 0). ``suffix`` is appended to every column name.
    """
    moments = pd.to_datetime(epoch_seconds, unit='s').dt
    values = {
        'month' + suffix: moments.month - 1,
        'date' + suffix: moments.day - 1,
        'hour' + suffix: moments.hour,
        'day_of_week' + suffix: moments.dayofweek,
    }
    ordered = ['month' + suffix, 'date' + suffix, 'hour' + suffix,
               'day_of_week' + suffix]
    if with_year:
        values['year' + suffix] = moments.year
        ordered.insert(0, 'year' + suffix)
    # ``columns=`` pins the column order regardless of dict ordering.
    return pd.DataFrame(values, columns=ordered)


def timestamp_processor(dataset, userSortedTimestamp, sequence_length):
    """Augment ``dataset`` with calendar and recent-history features.

    Args:
        dataset: DataFrame with at least ``user_id`` and ``timestamp``
            columns; ``timestamp`` is interpreted as seconds since the epoch.
        userSortedTimestamp: Lookup indexed by user id whose values are
            accepted by ``findKMostRecentTimestamps`` (presumably a dict of
            per-user DataFrames sorted by time -- confirm with the caller).
        sequence_length: Number ``K`` of preceding interactions to attach to
            every row.

    Returns:
        A new DataFrame containing the original columns followed by:

        * ``year``, ``month``, ``date``, ``hour``, ``day_of_week`` of the
          row's own timestamp (month and date are zero-based);
        * ``timestamp0`` .. ``timestamp{K-1}`` and ``item_id0`` ..
          ``item_id{K-1}``: the window returned by
          ``findKMostRecentTimestamps`` for that row;
        * for each ``i`` in ``0..K-1``: ``month{i}``, ``date{i}``,
          ``hour{i}``, ``day_of_week{i}`` of ``timestamp{i}``.

    Every derived frame is built on ``dataset.index`` so the index-based
    joins line up row for row even when ``dataset`` does not carry the
    default ``0..n-1`` index (e.g. after filtering or a train/test split).
    """
    dataset = dataset.join(_calendar_features(dataset.timestamp, with_year=True))

    recent_windows = [
        findKMostRecentTimestamps(userSortedTimestamp[user], moment, sequence_length)
        for user, moment in zip(dataset.user_id, dataset.timestamp)
    ]
    timestamp_columns = ['timestamp' + str(i) for i in range(sequence_length)]
    item_columns = ['item_id' + str(i) for i in range(sequence_length)]
    recent_timestamps = pd.DataFrame(
        [window[0] for window in recent_windows],
        columns=timestamp_columns,
        index=dataset.index,
    )
    recent_items = pd.DataFrame(
        [window[1] for window in recent_windows],
        columns=item_columns,
        index=dataset.index,
    )
    dataset = dataset.join(recent_timestamps)
    dataset = dataset.join(recent_items)

    # Calendar features for each historical slot; padding timestamps (0)
    # simply decode as the epoch itself.
    for i, column in enumerate(timestamp_columns):
        dataset = dataset.join(
            _calendar_features(recent_timestamps[column], suffix=str(i))
        )
    return dataset