I am using VectorstoreIndexCreator as shown below, with the SageMaker JumpStart gpt-j-6b embedding model and FAISS. However, I get an error while creating the index.
1. Code for VectorstoreIndex
from langchain.indexes import VectorstoreIndexCreator
# Configure an index creator that uses FAISS as the vector store class,
# `embeddings` as the embedding model and `text_splitter` to split documents.
# NOTE(review): FAISS, text_splitter and loader are not defined in the
# snippets shown here; `embeddings` is built in the embedding-model snippet.
index_creator = VectorstoreIndexCreator(
vectorstore_cls=FAISS,
embedding=embeddings,
text_splitter=text_splitter
)
# Build the index from the single loader; per the stack trace below, this is
# the call that raises the ValueError.
index = index_creator.from_loaders([loader])
2. Code for Embedding model
My embedding model is the SageMaker JumpStart embedding model for gpt-j-6b. My embedding model code is below.
`from typing import Dict
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.llms.sagemaker_endpoint import ContentHandlerBase
import json
# Content handler that turns a prompt into the JSON request body sent to the
# endpoint and pulls the "embedding" entry out of the JSON response.
# NOTE(review): indentation was lost when this snippet was pasted; the two
# `def`s below are presumably methods of ContentHandler.
class ContentHandler(ContentHandlerBase):
# Both the request and the accepted response are declared as JSON.
content_type = "application/json"
accepts = "application/json"
# Serialize `prompt` as UTF-8 encoded JSON under the "text_inputs" key.
# `model_kwargs` is accepted but not used in this body.
def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
test = {"text_inputs": prompt}
# Same payload as `test`, kept as a str; only used for the debug print.
input_str = json.dumps({"text_inputs": prompt})
encoded_json = json.dumps(test).encode("utf-8")
# Debug output of the request payload (str form, then bytes form).
print(input_str)
print(encoded_json)
return encoded_json
# NOTE(review): presumably unreachable, since it follows the return above.
print(input_str)
#return input_str.encode('utf-8')
# Decode the response body as UTF-8 JSON and return its "embedding" entry.
# NOTE(review): annotated `output: bytes` but `.read()` is called on it, and
# annotated `-> str` although the value under "embedding" is returned as-is.
def transform_output(self, output: bytes) -> str:
#print(output)
response_json = json.loads(output.read().decode("utf-8"))
#print(response_json)
# NOTE(review): the traceback below fails at `n, d = x.shape`, i.e. the array
# built from the embeddings has more than two dimensions. That would happen if
# this value is itself a list of vectors (e.g. [[...]]) for a single prompt --
# TODO confirm by printing response_json and, if so, return one flat vector.
return response_json["embedding"]
#return response_json["embeddings"]
#response_json = json.loads(output.read().decode("utf-8")).get('generated_texts')
# NOTE(review): presumably unreachable, since it follows the return above.
print("response" , response_json)
#return "".join(response_json)
# Build the embeddings client for the SageMaker endpoint, using the custom
# content handler to encode requests and decode responses.
content_handler = ContentHandler()
embeddings = SagemakerEndpointEmbeddings(
    # `endpoint_name` was previously passed twice (a placeholder plus the real
    # name), which Python rejects with "SyntaxError: keyword argument
    # repeated"; only the JumpStart endpoint name is kept.
    credentials_profile_name="credentials-profile-name",
    endpoint_name="jumpstart-dft-hf-textembedding-gpt-j-6b-fp16",  # huggingface-pytorch-inference-2023-03-21-16-14-03-834
    region_name="us-east-1",
    content_handler=content_handler,
)
#print(embeddings)`
3. Error I get on creating index
index = index_creator.from_loaders([loader])
I get the error below on the index creation line above. The stack trace follows.
ValueError Traceback (most recent call last)
Cell In[10], line 7
1 from langchain.indexes import VectorstoreIndexCreator
2 index_creator = VectorstoreIndexCreator(
3 vectorstore_cls=FAISS,
4 embedding=embeddings,
5 text_splitter=text_splitter
6 )
----> 7 index = index_creator.from_loaders([loader])
File /opt/conda/lib/python3.10/site-packages/langchain/indexes/vectorstore.py:71, in VectorstoreIndexCreator.from_loaders(self, loaders)
69 docs.extend(loader.load())
70 sub_docs = self.text_splitter.split_documents(docs)
---> 71 vectorstore = self.vectorstore_cls.from_documents(
72 sub_docs, self.embedding, **self.vectorstore_kwargs
73 )
74 return VectorStoreIndexWrapper(vectorstore=vectorstore)
File /opt/conda/lib/python3.10/site-packages/langchain/vectorstores/base.py:164, in VectorStore.from_documents(cls, documents, embedding, **kwargs)
162 texts = [d.page_content for d in documents]
163 metadatas = [d.metadata for d in documents]
--> 164 return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs)
File /opt/conda/lib/python3.10/site-packages/langchain/vectorstores/faiss.py:345, in FAISS.from_texts(cls, texts, embedding, metadatas, **kwargs)
327 """Construct FAISS wrapper from raw documents.
328
329 This is a user friendly interface that:
(...)
342 faiss = FAISS.from_texts(texts, embeddings)
343 """
344 embeddings = embedding.embed_documents(texts)
--> 345 return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)
File /opt/conda/lib/python3.10/site-packages/langchain/vectorstores/faiss.py:308, in FAISS.__from(cls, texts, embeddings, embedding, metadatas, **kwargs)
306 faiss = dependable_faiss_import()
307 index = faiss.IndexFlatL2(len(embeddings[0]))
--> 308 index.add(np.array(embeddings, dtype=np.float32))
309 documents = []
310 for i, text in enumerate(texts):
File /opt/conda/lib/python3.10/site-packages/faiss/class_wrappers.py:227, in handle_Index..replacement_add(self, x)
214 def replacement_add(self, x):
215 """Adds vectors to the index.
216 The index must be trained before vectors can be added to it.
217 The vectors are implicitly numbered in sequence. When n vectors are
(...)
224 dtype must be float32.
225 """
--> 227 n, d = x.shape
228 assert d == self.d
229 x = np.ascontiguousarray(x, dtype='float32')
ValueError: too many values to unpack (expected 2)
I am using VectorstoreIndexCreator as shown below, with the SageMaker JumpStart gpt-j-6b embedding model and FAISS. However, I get an error while creating the index.
from langchain.indexes import VectorstoreIndexCreator
# Configure an index creator that uses FAISS as the vector store class,
# `embeddings` as the embedding model and `text_splitter` to split documents.
# NOTE(review): FAISS, text_splitter and loader are not defined in the
# snippets shown here; `embeddings` is built in the embedding-model snippet.
index_creator = VectorstoreIndexCreator(
vectorstore_cls=FAISS,
embedding=embeddings,
text_splitter=text_splitter
)
# Build the index from the single loader; per the stack trace below, this is
# the call that raises the ValueError.
index = index_creator.from_loaders([loader])
2. Code for Embedding model
My embedding model is the SageMaker JumpStart embedding model for gpt-j-6b. My embedding model code is below.
`from typing import Dict
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.llms.sagemaker_endpoint import ContentHandlerBase
import json
# Content handler that turns a prompt into the JSON request body sent to the
# endpoint and pulls the "embedding" entry out of the JSON response.
# NOTE(review): indentation was lost when this snippet was pasted; the two
# `def`s below are presumably methods of ContentHandler.
class ContentHandler(ContentHandlerBase):
# Both the request and the accepted response are declared as JSON.
content_type = "application/json"
accepts = "application/json"
# Serialize `prompt` as UTF-8 encoded JSON under the "text_inputs" key.
# `model_kwargs` is accepted but not used in this body.
def transform_input(self, prompt: str, model_kwargs: Dict) -> bytes:
test = {"text_inputs": prompt}
# Same payload as `test`, kept as a str; only used for the debug print.
input_str = json.dumps({"text_inputs": prompt})
encoded_json = json.dumps(test).encode("utf-8")
# Debug output of the request payload (str form, then bytes form).
print(input_str)
print(encoded_json)
return encoded_json
# NOTE(review): presumably unreachable, since it follows the return above.
print(input_str)
# Decode the response body as UTF-8 JSON and return its "embedding" entry.
# NOTE(review): annotated `output: bytes` but `.read()` is called on it, and
# annotated `-> str` although the value under "embedding" is returned as-is.
def transform_output(self, output: bytes) -> str:
#print(output)
response_json = json.loads(output.read().decode("utf-8"))
#print(response_json)
# NOTE(review): the traceback below fails at `n, d = x.shape`, i.e. the array
# built from the embeddings has more than two dimensions. That would happen if
# this value is itself a list of vectors (e.g. [[...]]) for a single prompt --
# TODO confirm by printing response_json and, if so, return one flat vector.
return response_json["embedding"]
#return response_json["embeddings"]
#response_json = json.loads(output.read().decode("utf-8")).get('generated_texts')
# NOTE(review): presumably unreachable, since it follows the return above.
print("response" , response_json)
# Build the embeddings client for the SageMaker endpoint, using the custom
# content handler to encode requests and decode responses.
content_handler = ContentHandler()
embeddings = SagemakerEndpointEmbeddings(
    # `endpoint_name` was previously passed twice (a placeholder plus the real
    # name), which Python rejects with "SyntaxError: keyword argument
    # repeated"; only the JumpStart endpoint name is kept.
    credentials_profile_name="credentials-profile-name",
    endpoint_name="jumpstart-dft-hf-textembedding-gpt-j-6b-fp16",  # huggingface-pytorch-inference-2023-03-21-16-14-03-834
    region_name="us-east-1",
    content_handler=content_handler,
)
#print(embeddings)`
index = index_creator.from_loaders([loader])
I get the error below on the index creation line above. The stack trace follows.
ValueError Traceback (most recent call last)
Cell In[10], line 7
1 from langchain.indexes import VectorstoreIndexCreator
2 index_creator = VectorstoreIndexCreator(
3 vectorstore_cls=FAISS,
4 embedding=embeddings,
5 text_splitter=text_splitter
6 )
----> 7 index = index_creator.from_loaders([loader])
File /opt/conda/lib/python3.10/site-packages/langchain/indexes/vectorstore.py:71, in VectorstoreIndexCreator.from_loaders(self, loaders)
69 docs.extend(loader.load())
70 sub_docs = self.text_splitter.split_documents(docs)
---> 71 vectorstore = self.vectorstore_cls.from_documents(
72 sub_docs, self.embedding, **self.vectorstore_kwargs
73 )
74 return VectorStoreIndexWrapper(vectorstore=vectorstore)
File /opt/conda/lib/python3.10/site-packages/langchain/vectorstores/base.py:164, in VectorStore.from_documents(cls, documents, embedding, **kwargs)
162 texts = [d.page_content for d in documents]
163 metadatas = [d.metadata for d in documents]
--> 164 return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs)
File /opt/conda/lib/python3.10/site-packages/langchain/vectorstores/faiss.py:345, in FAISS.from_texts(cls, texts, embedding, metadatas, **kwargs)
327 """Construct FAISS wrapper from raw documents.
328
329 This is a user friendly interface that:
(...)
342 faiss = FAISS.from_texts(texts, embeddings)
343 """
344 embeddings = embedding.embed_documents(texts)
--> 345 return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)
File /opt/conda/lib/python3.10/site-packages/langchain/vectorstores/faiss.py:308, in FAISS.__from(cls, texts, embeddings, embedding, metadatas, **kwargs)
306 faiss = dependable_faiss_import()
307 index = faiss.IndexFlatL2(len(embeddings[0]))
--> 308 index.add(np.array(embeddings, dtype=np.float32))
309 documents = []
310 for i, text in enumerate(texts):
File /opt/conda/lib/python3.10/site-packages/faiss/class_wrappers.py:227, in handle_Index..replacement_add(self, x)
214 def replacement_add(self, x):
215 """Adds vectors to the index.
216 The index must be trained before vectors can be added to it.
217 The vectors are implicitly numbered in sequence. When n vectors are
(...)
224 dtype must be float32.
225 """
--> 227 n, d = x.shape
228 assert d == self.d
229 x = np.ascontiguousarray(x, dtype='float32')
ValueError: too many values to unpack (expected 2)