-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdocument_processor.py
More file actions
65 lines (53 loc) · 1.86 KB
/
Copy pathdocument_processor.py
File metadata and controls
65 lines (53 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import fitz # PyMuPDF
import streamlit as st
from typing import List, Dict, Any
def extract_text_from_pdf(pdf_file) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Binary file-like object; its ``read()`` result is handed to
            PyMuPDF as an in-memory PDF stream.

    Returns:
        The concatenated text of all pages in page order, or ``""`` if the
        PDF could not be opened or read (the error is shown via ``st.error``).
    """
    try:
        pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
        try:
            # Join once instead of repeated ``+=`` so large PDFs stay linear.
            return "".join(page.get_text() for page in pdf_document)
        finally:
            # Close even when a page fails to extract, so the document
            # handle is not leaked on the error path.
            pdf_document.close()
    except Exception as e:
        # UI boundary: report the failure to the user and fall back to "".
        st.error(f"Error extracting text from PDF: {e}")
        return ""
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """Split text into fixed-size chunks where consecutive chunks overlap.

    Every character of ``text`` ends up in at least one chunk. The final
    chunk may be shorter than ``chunk_size``; it is kept whenever it contains
    text that the previous chunk did not already cover.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared between consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        The list of chunks in order. Text no longer than ``chunk_size``
        (including the empty string) is returned as a single chunk.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either fail inside range() or silently
        # produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if len(text) <= chunk_size:
        return [text]

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This chunk reached the end of the text; any later window would
            # only repeat characters that are already covered.
            break
    return chunks
def process_document(file, metadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
    """Turn an uploaded file into a list of text chunks with metadata.

    PDF files (``application/pdf``) are read with ``extract_text_from_pdf``;
    any ``text/*`` file is decoded as UTF-8. Other types are rejected.

    Args:
        file: Uploaded file object exposing ``type``, ``name`` and
            ``getvalue()`` (and ``read()`` for PDFs).
        metadata: Optional base metadata copied into every chunk. When
            omitted or empty, ``{"source": file.name}`` is used instead.

    Returns:
        One dict per chunk, shaped ``{"text": ..., "metadata": {...}}``, where
        the metadata carries the base metadata plus the chunk index under
        ``"chunk"``. Returns ``[]`` when the file type is unsupported, the
        text cannot be decoded, or no text was extracted; the first two cases
        are also reported via ``st.error``.
    """
    if file.type == "application/pdf":
        text = extract_text_from_pdf(file)
    elif file.type.startswith("text/"):
        try:
            text = file.getvalue().decode("utf-8")
        except UnicodeDecodeError as e:
            # Report bad encodings the same way as the other failure paths
            # instead of crashing the app.
            st.error(f"Error decoding text file: {e}")
            return []
    else:
        st.error(f"Unsupported file type: {file.type}")
        return []

    # A failed PDF extraction yields "", which would otherwise become a
    # single chunk with no content.
    if not text.strip():
        return []

    base_metadata = metadata or {"source": file.name}
    return [
        {
            "text": chunk,
            "metadata": {
                **base_metadata,
                "chunk": i,
            },
        }
        for i, chunk in enumerate(chunk_text(text))
    ]