-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmongodb.py
More file actions
127 lines (94 loc) · 4.84 KB
/
mongodb.py
File metadata and controls
127 lines (94 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from pymongo import MongoClient
from dotenv import load_dotenv
import os
class MongoDB:
    """Thin wrapper around a MongoDB database exposing the collections used by
    the crawler, the indexer and the query handler.

    Collections:
        crawler_records    - raw pages collected by the crawler
        documents          - snapshot of crawler_records used to build the index
        index              - inverted index (one entry per word)
        query_documents    - query-time copy of ``documents``
        query_index        - query-time copy of ``index``
    """

    def __init__(self, username, password, ip, database):
        # NOTE: ``self.client`` actually holds the *database* handle (the
        # MongoClient is indexed by database name immediately). The attribute
        # name is kept as-is for backward compatibility with existing callers.
        self.database = database
        self.client = MongoClient(ip, username=username, password=password,
                                  authSource="admin")[database]
        self.crawler_db = self.client.crawler_records
        self.documents_db = self.client.documents
        self.indexer_db = self.client.index
        self.query_documents_db = self.client.query_documents
        self.query_indexer_db = self.client.query_index

    @staticmethod
    def connect_to_db():
        """Create a :class:`MongoDB` instance from environment variables.

        Reads credentials and connection info from a ``.env`` file (via
        python-dotenv) / the process environment.

        Returns:
            MongoDB: a connected wrapper instance.
        """
        load_dotenv()  # load environment variables from .env
        username = os.getenv("MONGO_INITDB_ROOT_USERNAME")
        password = os.getenv("MONGO_INITDB_ROOT_PASSWORD")
        database = os.getenv("MONGO_INITDB_DATABASE")
        ip = os.getenv("MONGO_IP")
        # return MongoDB connection object
        return MongoDB(username=username, password=password, database=database, ip=ip)

    # ------------------------------------ Crawler-related methods ------------------------------------
    def add_crawler_record(self, json):
        """Insert one crawler record (a dict) into ``crawler_records``."""
        self.crawler_db.insert_one(json)

    def find_all_crawler_records(self):
        """Return a cursor over every crawler record.

        ``no_cursor_timeout=True`` keeps the server-side cursor alive during
        long-running crawls; the caller is responsible for closing it.
        """
        return self.crawler_db.find({}, no_cursor_timeout=True)

    def crawler_record_exists(self, title, url):
        """Return True if a record with this exact title AND url exists."""
        return self.crawler_db.find_one({"title": title, "url": url}) is not None

    def reset_crawler(self):
        """Drop and recreate the ``crawler_records`` collection."""
        self.crawler_db.drop()
        self.crawler_db = self.client.crawler_records

    # ------------------------------------ Indexer-related methods ------------------------------------
    def build_documents_db(self):
        """Copy every crawler record into the ``documents`` collection.

        Guarded against an empty source collection: ``insert_many`` raises
        ``InvalidOperation`` when given no documents.
        """
        records = list(self.crawler_db.find({}))
        if records:
            self.documents_db.insert_many(records)

    def get_documents_count(self):
        """Return the number of documents in the ``documents`` collection."""
        # count_documents replaces Collection.count(), removed in PyMongo 4.0
        return self.documents_db.count_documents({})

    def find_document_record(self, doc_id):
        """Return the document with the given ``_id``, or None."""
        return self.documents_db.find_one({"_id": doc_id})

    def find_all_document_record_ids(self):
        """Return a list of every ``_id`` in the ``documents`` collection."""
        mongo_results = self.documents_db.find({}, {"_id": 1})
        return [item["_id"] for item in mongo_results]

    def add_length_to_document(self, doc_id, doc_length):
        """Set the ``length`` field on the document with the given ``_id``."""
        # update_one replaces Collection.update(), removed in PyMongo 4.0
        self.documents_db.update_one({"_id": doc_id}, {"$set": {"length": doc_length}})

    def add_index_entry(self, json):
        """Insert one inverted-index entry (a dict) into ``index``."""
        self.indexer_db.insert_one(json)

    def update_index_entry(self, word, new_data):
        """Append ``new_data`` to an existing index entry and bump its count.

        Precondition: an entry for ``word`` must already exist (check with
        :meth:`index_entry_exists` first); otherwise this raises TypeError.
        """
        entry = self.indexer_db.find_one({"word": word})
        w_freq = entry["w_freq"] + 1
        documents = entry["documents"]
        documents.append(new_data)
        # update_one replaces Collection.update(), removed in PyMongo 4.0
        self.indexer_db.update_one({"word": word},
                                   {"$set": {"w_freq": w_freq, "documents": documents}})

    def find_index_entry_by_keyword(self, word):
        """Return the index entry for ``word``, or None."""
        return self.indexer_db.find_one({"word": word})

    def index_entry_exists(self, word):
        """Return True if an index entry for ``word`` exists."""
        return self.indexer_db.find_one({"word": word}) is not None

    def reset_index(self):
        """Drop and recreate the ``index`` and ``documents`` collections."""
        self.indexer_db.drop()
        self.documents_db.drop()
        self.indexer_db = self.client.index
        self.documents_db = self.client.documents

    # ------------------------------------ Query Handler-related methods ------------------------------------
    def reset_query_handler(self):
        """Drop and recreate both query-time collections."""
        self.query_documents_db.drop()
        self.query_indexer_db.drop()
        self.query_documents_db = self.client.query_documents
        self.query_indexer_db = self.client.query_index

    def build_query_documents_db(self):
        """Copy all records from ``documents`` into ``query_documents``."""
        records = list(self.documents_db.find({}))
        if records:  # insert_many rejects an empty document list
            self.query_documents_db.insert_many(records)

    def build_query_indexer_db(self):
        """Copy all records from ``index`` into ``query_index``."""
        records = list(self.indexer_db.find({}))
        if records:  # insert_many rejects an empty document list
            self.query_indexer_db.insert_many(records)

    def update_query_handler_db(self):
        """Rebuild the query-time collections from the current index state."""
        # Reset query handler-related database collections
        self.reset_query_handler()
        # Copy all records from "documents" db collection to "query_documents" db collection.
        self.build_query_documents_db()
        # Copy all records from "indexer" db collection to "query_indexer" db collection.
        self.build_query_indexer_db()

    # Used to determine the status of the Query Database collections. If the Index has finished initializing
    # for the first time, the status returned is always True, since the Query collections always contain the
    # copy of the last completed Index. If the first index build is in progress, the status returned is False,
    # since the Query collections have not yet been initialized.
    def is_initialized(self):
        """Return True once both query-time collections exist in the database."""
        collections = self.client.list_collection_names()
        return "query_documents" in collections and "query_index" in collections

    def get_query_documents_count(self):
        """Return the number of documents in ``query_documents``."""
        # count_documents replaces Collection.count(), removed in PyMongo 4.0
        return self.query_documents_db.count_documents({})

    def find_query_index_entry(self, term):
        """Return the query-time index entry for ``term``, or None."""
        return self.query_indexer_db.find_one({"word": term})

    def find_query_document_record(self, doc_id):
        """Return the query-time document with the given ``_id``, or None."""
        return self.query_documents_db.find_one({"_id": doc_id})