Skip to content

Instantly share code, notes, and snippets.

@sukanyabag
Created September 10, 2023 15:38
Show Gist options
  • Save sukanyabag/1f5c0aa9a85ee7d634c6fd84af8b3e04 to your computer and use it in GitHub Desktop.
Save sukanyabag/1f5c0aa9a85ee7d634c6fd84af8b3e04 to your computer and use it in GitHub Desktop.
def create_and_add_embeddings(self, file):
    """Index ``file`` into the DeepLake text store and attach a retriever.

    Steps, in order:
      1. Make sure a local ``data`` directory exists.
      2. Build an ``OpenAIEmbeddings`` client from ``cfg`` and keep it on
         ``self.embeddings``.
      3. Load ``file`` through ``PyMuPDFLoader`` and split the resulting
         documents with ``CharacterTextSplitter`` (no overlap).
      4. Create a ``DeepLake`` store at ``cfg.TEXT_VECTORSTORE_PATH`` with
         ``overwrite=True``, keep it on ``self.text_deeplake_schema`` and
         add the split documents to it.
      5. Derive a retriever from the store, keep it on
         ``self.text_retriever`` and set its search options.

    Args:
        file: Value handed straight to ``PyMuPDFLoader``
            (presumably a path to a PDF -- confirm against the caller).

    Returns:
        None. Results are exposed through ``self.embeddings``,
        ``self.text_deeplake_schema`` and ``self.text_retriever``.
    """
    os.makedirs("data", exist_ok=True)

    embedder = OpenAIEmbeddings(
        openai_api_key=cfg.OPENAI_API_KEY,
        chunk_size=cfg.OPENAI_EMBEDDINGS_CHUNK_SIZE,
    )
    self.embeddings = embedder

    # Load the source document, then cut it into non-overlapping chunks.
    pages = PyMuPDFLoader(file).load()
    splitter = CharacterTextSplitter(
        chunk_size=cfg.CHARACTER_SPLITTER_CHUNK_SIZE,
        chunk_overlap=0,
    )
    chunks = splitter.split_documents(pages)

    # NOTE(review): overwrite=True presumably replaces any dataset already
    # stored at this path -- confirm this is intended on every call.
    store = DeepLake(
        dataset_path=cfg.TEXT_VECTORSTORE_PATH,
        embedding_function=embedder,
        overwrite=True,
    )
    self.text_deeplake_schema = store
    store.add_documents(chunks)

    retriever = store.as_retriever(search_type="similarity")
    self.text_retriever = retriever

    # Search options are applied one key at a time, in this order.
    search_options = (
        ("distance_metric", "cos"),
        ("fetch_k", 15),
        ("maximal_marginal_relevance", True),
        ("k", 3),
    )
    for option, setting in search_options:
        retriever.search_kwargs[option] = setting
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment