-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathembed_lambda.py
More file actions
164 lines (139 loc) · 5 KB
/
Copy pathembed_lambda.py
File metadata and controls
164 lines (139 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import os
import json
import boto3
import faiss
import numpy as np
from langchain_text_splitters import RecursiveCharacterTextSplitter
# ===== CONFIG =====
# Region passed to both boto3 clients created below.
REGION = "us-east-1"
# Bucket that holds the persisted index files.
BUCKET_NAME = "agentic-docs-repo-joseph"
# Key prefix under which faiss_index.bin and metadata.json are stored.
INDEX_PREFIX = "indexes/"
# Bedrock model ID used to embed each text chunk.
EMBED_MODEL_ID = "amazon.titan-embed-text-v2:0"
# Dimension of the FAISS index created when no index exists yet.
# NOTE(review): presumably equals the embedding model's output size; the
# embedding request does not set a dimension explicitly, so confirm.
EMBEDDING_DIM = 1024
# ===== /tmp PATHS =====
# Lambda writes here during execution
# Persists across warm invocations
FAISS_LOCAL = "/tmp/faiss_index.bin"
METADATA_LOCAL = "/tmp/metadata.json"
# ===== AWS CLIENTS =====
# Outside handler for warm start performance
s3 = boto3.client("s3", region_name=REGION)
bedrock = boto3.client("bedrock-runtime", region_name=REGION)
# ===== TEXT SPLITTER =====
# Module-level splitter shared by every invocation.
# Same settings as your original script
splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=100
)
# ===== HELPER FUNCTIONS =====
def get_embedding(text):
    """
    Return the embedding Bedrock produces for a single text chunk.

    Sends ``text`` as ``inputText`` to the model named by EMBED_MODEL_ID
    and returns the ``embedding`` field of the JSON reply. The vector is
    expected to have EMBEDDING_DIM entries; the request itself does not
    specify a dimension.
    """
    request = {
        "modelId": EMBED_MODEL_ID,
        "body": json.dumps({"inputText": text}),
        "contentType": "application/json",
        "accept": "application/json",
    }
    reply = bedrock.invoke_model(**request)
    # The reply body is a stream holding a JSON document.
    payload = json.loads(reply["body"].read())
    return payload["embedding"]
def load_existing_index():
    """
    Download the existing FAISS index and metadata from S3.

    Only a "not found" response from S3 is treated as the first run, in
    which case a fresh empty index and an empty metadata list are
    returned. Every other failure (access denied, throttling, network
    errors, an unreadable index file, invalid metadata JSON) is allowed
    to propagate: falling back to an empty index in those cases would
    make the caller upload it over the real one and lose its vectors.

    Returns:
        tuple: ``(index, metadata)`` where ``index`` is a FAISS index and
        ``metadata`` is the list loaded from metadata.json (or ``[]``).

    Raises:
        Any exception raised by the S3 download other than a not-found
        error, and any exception raised while reading the local files.
    """
    try:
        s3.download_file(
            BUCKET_NAME,
            f"{INDEX_PREFIX}faiss_index.bin",
            FAISS_LOCAL,
        )
        s3.download_file(
            BUCKET_NAME,
            f"{INDEX_PREFIX}metadata.json",
            METADATA_LOCAL,
        )
    except s3.exceptions.ClientError as err:
        # A missing object surfaces as code "404" from the transfer layer;
        # the other spellings cover direct GetObject/HeadObject responses.
        code = str(err.response.get("Error", {}).get("Code", ""))
        if code not in ("404", "NoSuchKey", "NotFound"):
            raise
        # First time running — no index exists yet
        print("No existing index found — creating fresh index")
        return faiss.IndexFlatL2(EMBEDDING_DIM), []

    index = faiss.read_index(FAISS_LOCAL)
    with open(METADATA_LOCAL, "r", encoding="utf-8") as f:
        metadata = json.load(f)
    print(f"Loaded existing index with {index.ntotal} vectors")
    return index, metadata
def save_and_upload_index(index, metadata):
    """
    Persist the index and metadata to /tmp, then upload both to S3.

    The FAISS index is written to FAISS_LOCAL and the metadata list is
    written as indented UTF-8 JSON to METADATA_LOCAL. Each file is then
    uploaded to BUCKET_NAME under INDEX_PREFIX, index first.
    """
    faiss.write_index(index, FAISS_LOCAL)
    with open(METADATA_LOCAL, "w", encoding="utf-8") as handle:
        json.dump(metadata, handle, indent=2, ensure_ascii=False)

    # (local path, destination key) pairs, uploaded in this order.
    uploads = (
        (FAISS_LOCAL, f"{INDEX_PREFIX}faiss_index.bin"),
        (METADATA_LOCAL, f"{INDEX_PREFIX}metadata.json"),
    )
    for local_path, object_key in uploads:
        s3.upload_file(local_path, BUCKET_NAME, object_key)

    print(f"Index uploaded to S3 — total vectors: {index.ntotal}")
# ===== LAMBDA HANDLER =====
def lambda_handler(event, context):
    """
    Embed newly uploaded .txt objects and append them to the FAISS index.

    Triggered by S3 when a new .txt file is uploaded.
    S3 event payload looks like:
    {
      "Records": [{
        "s3": {
          "bucket": {"name": "agentic-docs-repo-joseph"},
          "object": {"key": "what_is_rag.txt"}
        }
      }]
    }

    Args:
        event: S3 notification payload; must contain a "Records" list.
        context: Lambda context object (unused).

    Returns:
        dict: ``{"statusCode": 200, "body": <JSON string>}`` whose body
        lists "processed" files (with the number of chunks added) and
        "skipped" keys, or ``{"statusCode": 500, "body": <JSON string>}``
        with an "error" message if any exception escapes the processing
        loop. Exceptions are reported this way, not raised.
    """
    # Function-scope import so the block carries its own dependency.
    from urllib.parse import unquote_plus

    try:
        processed = []
        skipped = []
        # Loaded lazily on the first .txt record and reused for the rest
        # of the batch, so the index is downloaded at most once.
        index = None
        metadata = None

        # Step 1 — extract file info from S3 event
        # S3 can batch multiple records in one event
        for record in event["Records"]:
            bucket = record["s3"]["bucket"]["name"]
            # Keys arrive URL-encoded in event notifications
            # ("my file.txt" -> "my+file.txt"); decode before use.
            key = unquote_plus(record["s3"]["object"]["key"])

            # Only process .txt files
            if not key.endswith(".txt"):
                print(f"Skipping non-txt file: {key}")
                skipped.append(key)
                continue

            print(f"Processing: {key}")

            # Step 2 — download the new txt file from S3
            response = s3.get_object(Bucket=bucket, Key=key)
            text = response["Body"].read().decode("utf-8")

            # Step 3 — chunk it
            chunks = splitter.split_text(text)
            print(f"Split into {len(chunks)} chunks")

            # Step 4 — load existing index so we ADD to it
            # not overwrite everything from scratch
            if index is None:
                index, metadata = load_existing_index()

            # Step 5 — embed each chunk and add to index
            added = 0
            for chunk in chunks:
                try:
                    vector = get_embedding(chunk)
                    index.add(np.array([vector], dtype="float32"))
                    metadata.append({
                        "text": chunk,
                        "source": key,
                    })
                    added += 1
                except Exception as e:
                    # Best effort: one failed chunk must not drop the file.
                    print(f"Skipped chunk due to error: {e}")

            # Step 6 — save and push back to S3
            # Saved per file so earlier files survive a later failure;
            # skipped when nothing was added to avoid a pointless upload.
            if added:
                save_and_upload_index(index, metadata)

            processed.append({"file": key, "chunks_added": added})

        return {
            "statusCode": 200,
            "body": json.dumps({
                "processed": processed,
                "skipped": skipped,
            }),
        }
    except Exception as e:
        print(f"Error: {e}")
        return {
            "statusCode": 500,
            "body": json.dumps({"error": str(e)}),
        }