-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDocProcessor.py
More file actions
32 lines (26 loc) · 1.01 KB
/
DocProcessor.py
File metadata and controls
32 lines (26 loc) · 1.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources; each item is handed unchanged to
            ``PdfReader`` (e.g. uploaded file objects -- whatever the caller
            supplies must be something ``PdfReader`` accepts).

    Returns:
        One string in which each page's text is followed by a single space,
        in document and page order. Returns ``""`` when ``pdf_docs`` is empty.
    """
    page_texts = []
    for pdf in pdf_docs:
        for page in PdfReader(pdf).pages:
            # Defensive: treat a falsy/None extraction result as empty text so
            # a single text-less page cannot abort the run with a TypeError.
            page_texts.append((page.extract_text() or "") + " ")
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)
def get_text_chunks(text, *, chunk_size=1000, chunk_overlap=200):
    """Split a long text into smaller, overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length as measured by ``len`` (characters).
            Defaults to 1000.
        chunk_overlap: Number of characters shared between consecutive
            chunks. Defaults to 200.

    Returns:
        A list of chunk strings produced by ``RecursiveCharacterTextSplitter``;
        an empty list when ``text`` is empty.
    """
    # Nothing to split: skip building the splitter altogether.
    if not text:
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_text(text)
def get_vectorstore(text_chunks, *, model_name="all-MiniLM-L6-v2"):
    """Create a FAISS vector store from text chunks.

    Args:
        text_chunks: Non-empty list of strings to embed and index.
        model_name: Name of the embedding model passed to
            ``HuggingFaceEmbeddings``. Defaults to ``"all-MiniLM-L6-v2"``.

    Returns:
        The vector store returned by ``FAISS.from_texts`` for the chunks.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    # Fail fast with a clear message, before the embedding model is loaded,
    # rather than letting index construction fail on an empty input.
    if not text_chunks:
        raise ValueError("text_chunks must contain at least one chunk")

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)