forked from adwait-thattey/stackoverflow_api_recommender
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpickler.py
More file actions
107 lines (83 loc) · 4.22 KB
/
pickler.py
File metadata and controls
107 lines (83 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import pickle
import log
import shared
import os
import constants
import utils
def read_questions_segment(segment_id):
    """Load one pickled questions segment from disk and return its content.

    The file is ``str(segment_id) + constants.pickle_files_extension`` inside
    ``constants.pickled_questions_dir``.

    Args:
        segment_id: Identifier of the segment; converted with ``str`` to
            build the file name.

    Returns:
        The object stored in the segment file (presumably the dict that
        ``write_questions_segment`` pickled -- confirm against callers).

    Raises:
        FileNotFoundError: Propagated from ``open`` when the segment file
            does not exist.
        ValueError: If the file unpickles to ``None``.
    """
    log.log(f"Reading questions segment {segment_id}", module="pickler")

    segment_path = os.path.join(
        constants.pickled_questions_dir,
        str(segment_id) + constants.pickle_files_extension,
    )

    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # this is only safe for segment files the application wrote itself.
    with open(segment_path, 'rb') as segment_file:
        loaded_questions = pickle.load(segment_file)

    # Guard clause: a segment file that holds ``None`` is treated as unreadable.
    if loaded_questions is None:
        raise ValueError("[DEBUG] [PICKLER] Unable to read questions segment")
    return loaded_questions
def write_questions_segment(segment_id, questions_dict):
    """Pickle ``questions_dict`` into the file belonging to ``segment_id``.

    The target file is ``str(segment_id) + constants.pickle_files_extension``
    inside ``constants.pickled_questions_dir``. It is opened in ``'wb'`` mode,
    so an existing segment file is overwritten.

    Args:
        segment_id: Identifier of the segment; converted with ``str`` to
            build the file name.
        questions_dict: Object to serialize into the segment file.
    """
    log.log(f"Writing questions segment {segment_id}", module="pickler")

    segment_path = os.path.join(
        constants.pickled_questions_dir,
        str(segment_id) + constants.pickle_files_extension,
    )
    with open(segment_path, 'wb') as segment_file:
        pickle.dump(questions_dict, segment_file)
def _write_new_questions_segment(questions_dict, question_ids):
    """Pickle ``question_ids`` as one brand-new segment and map each id to it."""
    new_seg_questions = {qid: questions_dict[qid] for qid in question_ids}
    new_seg_id = utils.get_new_question_segment_id()
    write_questions_segment(new_seg_id, new_seg_questions)
    for qid in question_ids:
        shared.QUESTION_SEGMENT_MAP[qid] = new_seg_id


def write_questions_index(questions_dict=None):
    """Persist questions to pickled segment files, updating or creating segments.

    For every question id in ``questions_dict``:

    * If the id is already present in ``shared.QUESTION_SEGMENT_MAP``, the
      mapped segment file is read, every not-yet-written question of
      ``questions_dict`` that lives in that segment is refreshed, and the
      segment is written back.
    * If the mapped segment file is missing, all map entries pointing at that
      segment are dropped and the question is treated as new.
    * New questions are batched and written to fresh segments of
      ``constants.questions_per_segment`` questions each; a final, possibly
      smaller, segment receives the leftovers.

    ``shared.QUESTION_SEGMENT_MAP`` is updated in memory only; this function
    does not pickle the map itself.

    Args:
        questions_dict: Mapping of question id to question data. Defaults to
            ``shared.INDEXED_QUESTIONS`` when ``None``.

    Raises:
        ValueError: Propagated from ``read_questions_segment`` when a segment
            file unpickles to ``None``.
    """
    if questions_dict is None:
        questions_dict = shared.INDEXED_QUESTIONS

    new_segments = 0
    modified_segments = 0
    # Ids already written (or queued for a new segment) during this call.
    pickled = set()
    log.log("Writing questions index to pickles ", module="pickler")
    new_segment_questions_ids = set()

    for ques_id in questions_dict:
        if ques_id in pickled:
            continue

        if ques_id in shared.QUESTION_SEGMENT_MAP:
            # Question is already indexed: update its existing segment.
            segment = shared.QUESTION_SEGMENT_MAP[ques_id]
            try:
                # Only the read is guarded, so a failure while writing is not
                # misreported as "segment does not exist".
                segment_questions = read_questions_segment(segment)
            except FileNotFoundError:
                log.warn(f"Segment {segment} in map but does not exist. Removing", module="pickler")
                # Forget every question mapped to the missing segment; the
                # current one is then picked up as new by the check below.
                shared.QUESTION_SEGMENT_MAP = {
                    k: v for (k, v) in shared.QUESTION_SEGMENT_MAP.items() if v != segment
                }
            else:
                # Pending questions stored in this segment. The triggering
                # question is always included: the map says it belongs here,
                # so it must not be dropped if the file happens to lack it.
                q_seg_ids = [
                    qid for qid in questions_dict
                    if qid not in pickled and (qid == ques_id or qid in segment_questions)
                ]
                for qid in q_seg_ids:
                    segment_questions[qid] = questions_dict[qid]

                write_questions_segment(segment, segment_questions)
                modified_segments += 1
                pickled.update(q_seg_ids)

        if ques_id not in shared.QUESTION_SEGMENT_MAP:
            # Question is not indexed yet: queue it for a new segment.
            new_segment_questions_ids.add(ques_id)
            pickled.add(ques_id)

            if len(new_segment_questions_ids) >= constants.questions_per_segment:
                # Batch is full: write it out and start a new one.
                _write_new_questions_segment(questions_dict, new_segment_questions_ids)
                new_segments += 1
                new_segment_questions_ids.clear()

    if new_segment_questions_ids:
        # Some questions are yet to be written: flush the last partial batch.
        _write_new_questions_segment(questions_dict, new_segment_questions_ids)
        new_segments += 1
        new_segment_questions_ids.clear()

    log.log(f"Questions Written to disk. Modified Segments:{modified_segments}, New Segments:{new_segments}",
            module="pickler")
def read_question_segment_map():
    """Load the pickled question-to-segment map into ``shared.QUESTION_SEGMENT_MAP``.

    The map is read from ``"qsmap" + constants.pickle_files_extension`` inside
    ``constants.pickled_questions_dir``. When that file does not exist, a
    warning is logged and the shared map is reset to an empty dict.
    """
    log.log(f" Reading question segment map", module="pickler")

    map_path = os.path.join(
        constants.pickled_questions_dir,
        "qsmap" + constants.pickle_files_extension,
    )
    try:
        # NOTE(review): pickle.load can execute arbitrary code while
        # unpickling; only safe for a map file the application wrote itself.
        with open(map_path, 'rb') as map_file:
            shared.QUESTION_SEGMENT_MAP = pickle.load(map_file)
    except FileNotFoundError:
        log.warn(f" Question-Segment Map pickle file not found", module="pickler")
        shared.QUESTION_SEGMENT_MAP = {}
def write_question_segment_map():
    """Pickle ``shared.QUESTION_SEGMENT_MAP`` to the question-segment map file.

    The map is written to ``"qsmap" + constants.pickle_files_extension``
    inside ``constants.pickled_questions_dir``. The file is opened in ``'wb'``
    mode, so an existing map file is overwritten.
    """
    log.log(f" Writing question segment map", module="pickler")

    map_path = os.path.join(
        constants.pickled_questions_dir,
        "qsmap" + constants.pickle_files_extension,
    )
    with open(map_path, 'wb') as map_file:
        pickle.dump(shared.QUESTION_SEGMENT_MAP, map_file)