-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvector_database.py
More file actions
68 lines (53 loc) · 2.42 KB
/
vector_database.py
File metadata and controls
68 lines (53 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# For faiss you have two options !pip install faiss-gpu or !pip install faiss-cpu
# The faiss-gpu package provides CUDA-enabled indices, either package should be installed, but not both.
#It is standalone file check strings_ranked_by_relatedness_vector args!
#Add this to your main file: from vector_database import save_index, load_index, strings_ranked_by_relatedness_vector
import faiss # for vector database
import scipy.spatial.distance as spatial
import pandas as pd
import numpy as np
import time
def save_index(embeddings: list, index_path: str = 'vector_database.index') -> faiss.IndexFlatL2:
    """Build a FAISS L2 index from embedding vectors, persist it to disk, and return it.

    Args:
        embeddings: Non-empty list of embedding vectors; all vectors are assumed
            to share the same dimension (the first vector's length is used).
        index_path: File path the serialized index is written to.

    Returns:
        The populated faiss.IndexFlatL2 index (also saved at `index_path`).

    Raises:
        ValueError: If `embeddings` is empty, since the dimension cannot be inferred.
    """
    if not embeddings:
        raise ValueError("embeddings must be a non-empty list of vectors")
    dimension = len(embeddings[0])  # dimension inferred from the first vector
    index = faiss.IndexFlatL2(dimension)  # exact (brute-force) L2 distance search
    # FAISS requires float32 input; build the array directly in float32
    # instead of creating a float64 intermediate and converting.
    embeddings_np = np.array(embeddings, dtype='float32')
    index.add(embeddings_np)
    faiss.write_index(index, index_path)
    return index
def load_index(index_path: str = 'vector_database.index') -> faiss.IndexFlatL2:
    """Deserialize and return a FAISS index previously written by `save_index`.

    Args:
        index_path: Path of the serialized index file on disk.

    Returns:
        The loaded faiss.IndexFlatL2 index.
    """
    loaded_index = faiss.read_index(index_path)
    return loaded_index
# Search function using the vector index and DataFrame
def strings_ranked_by_relatedness_vector(
    query: str,
    EMBEDDING_MODEL : str,
    index: faiss.IndexFlatL2,
    df: pd.DataFrame,
    openai_api,
    relatedness_fn=lambda x, y: 1 - spatial.cosine(x, y),
    top_n: int = 5,
    timeit: bool = False
) -> tuple[list[str], list[float]]:
    """Returns a list of top_n strings and relatednesses, sorted from most related to least.

    Args:
        query: Natural-language query to embed and search with.
        EMBEDDING_MODEL: Name of the OpenAI embedding model to use.
        index: FAISS index built over the embeddings in `df` (same row order).
        df: DataFrame with "text" and "embedding" columns, where row position i
            corresponds to vector i in `index`.
        openai_api: OpenAI module/client exposing `Embedding.create`.
        relatedness_fn: Scoring function over (query_embedding, row_embedding);
            defaults to cosine similarity.
        top_n: Maximum number of results to return.
        timeit: If True, print the elapsed search time.

    Returns:
        (strings, relatednesses) for the nearest neighbors found by the index
        (nearest-first by L2 distance; scores come from `relatedness_fn`).
    """
    query_embedding_response = openai_api.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    # Start the timer
    start_time = time.time()
    # FAISS requires float32 queries; a plain np.array(...) would be float64
    # and make index.search fail. Matches the float32 index built in save_index.
    query_np = np.array([query_embedding], dtype='float32')
    _, indices = index.search(query_np, top_n)
    # FAISS pads results with -1 when the index holds fewer than top_n vectors;
    # drop those placeholders before looking up rows.
    positions = [int(i) for i in indices[0] if i != -1]
    # FAISS returns *positions* (0-based row numbers), so use iloc, not loc:
    # loc would misbehave on any DataFrame without a default RangeIndex.
    strings = df.iloc[positions]["text"].tolist()
    embeddings = df.iloc[positions]["embedding"].tolist()
    relatednesses = [relatedness_fn(query_embedding, emb) for emb in embeddings]
    # End the timer
    end_time = time.time()
    if timeit:
        print(f'Elapsed time: {end_time - start_time} seconds')
    return strings, relatednesses