Created
June 14, 2024 21:28
-
-
Save parkerfoshay/cdb4dea677550307b387419a11886715 to your computer and use it in GitHub Desktop.
Semantic caching
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pymongo import MongoClient
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)
import key_param

# Ingestion: load a PDF, tag each page with LLM-extracted metadata, split the
# pages into overlapping chunks, embed them, and store them in MongoDB Atlas
# for vector search.

# Set the MongoDB URI, DB, Collection Names
client = MongoClient(key_param.MONGODB_URI)
dbName = "book_mongodb-chunks"
collectionName = "chunked_data"
collection = client[dbName][collectionName]

# Raw string: the original '.\sample_files\mongodb.pdf' relied on '\s' and
# '\m' not being recognized escape sequences, which is deprecated (warning
# since Python 3.12) and fragile. r"..." keeps the backslashes literal.
loader = PyPDFLoader(r".\sample_files\mongodb.pdf")
pages = loader.load()

# Drop near-empty pages (covers, blanks): keep only pages with > 20
# space-separated tokens.
cleaned_pages = [
    page for page in pages if len(page.page_content.split(" ")) > 20
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
)

# JSON schema the tagger LLM fills in for every page.
schema = {
    "properties": {
        "title": {"type": "string"},
        "keywords": {"type": "array", "items": {"type": "string"}},
        "hasCode": {"type": "boolean"},
    },
    "required": ["title", "keywords", "hasCode"],
}

# NOTE(review): "gpt-3.5-turbo-0613" is a retired OpenAI model snapshot;
# substitute a current model name if the API rejects it.
llm = ChatOpenAI(openai_api_key=key_param.openai_api_key, temperature=0, model="gpt-3.5-turbo-0613")
document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

# Tag first, then split, so every chunk inherits its page's metadata.
docs = document_transformer.transform_documents(cleaned_pages)
split_docs = text_splitter.split_documents(docs)
print(split_docs[0])

# Embed the chunks and persist them to the Atlas collection.
embeddings = OpenAIEmbeddings(openai_api_key=key_param.openai_api_key)
vectorStore = MongoDBAtlasVectorSearch.from_documents(
    split_docs, embeddings, collection=collection
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pip install langchain langchain_community langchain_core langchain_openai langchain_mongodb pymongo pypdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pymongo import MongoClient
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_mongodb.cache import MongoDBAtlasSemanticCache
from langchain_core.globals import set_llm_cache
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langsmith import traceable, Client
import key_param
import time
from dotenv import load_dotenv
import os

# Load environment variables before anything talks to external services.
load_dotenv()

# Database layout: one collection of pre-chunked documents, one collection
# backing the semantic cache, and the name of the Atlas vector index.
dbName = "book_mongodb-chunks"
collectionName = "chunked_data"
cacheCollectionName = "semantic_cache"
index = "vector_index"

client = MongoClient(key_param.MONGODB_URI)
collection = client[dbName][collectionName]

# One embedding model serves both the retriever and the semantic cache.
embeddingModel = OpenAIEmbeddings(
    disallowed_special=(),
    openai_api_key=key_param.openai_api_key,
)

# Vector store over the already-ingested chunks.
vectorStore = MongoDBAtlasVectorSearch.from_connection_string(
    key_param.MONGODB_URI,
    f"{dbName}.{collectionName}",
    embeddingModel,
    index_name=index,
)

# Route every LLM call through an Atlas-backed semantic cache: a query whose
# embedding is close enough to a previously cached one reuses that answer
# instead of calling the model again.
set_llm_cache(
    MongoDBAtlasSemanticCache(
        embedding=embeddingModel,
        connection_string=key_param.MONGODB_URI,
        collection_name=cacheCollectionName,
        database_name=dbName,
        index_name=index,
    )
)
def query_data(query):
    """Answer *query* with retrieval-augmented generation over the vector store.

    Pulls the 5 most similar chunks from the Atlas vector store, stuffs them
    into a grounded prompt, and returns a dict containing the retrieved
    context, the question, and the model's answer. Elapsed wall-clock time is
    printed so semantic-cache hits are easy to spot.
    """
    # Top-5 similarity search over the chunked collection.
    retriever = vectorStore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    )

    template = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Do not answer the question if there is no given context.
Do not answer the question if it is not related to the context.
Context:
{context}
Question: {question}
"""
    custom_rag_prompt = PromptTemplate.from_template(template)

    # Fan the question out: retrieve docs and flatten them into a single
    # newline-separated context string, while passing the question through.
    retrieve = {
        "context": retriever
        | (lambda docs: "\n\n".join(d.page_content for d in docs)),
        "question": RunnablePassthrough(),
    }

    llm = ChatOpenAI(openai_api_key=key_param.openai_api_key, temperature=0)

    # NOTE(review): the assign below maps "context" through an identity lambda
    # (the context string is already built by `retrieve`); kept to preserve the
    # original chain exactly -- confirm before removing.
    rag_chain = (
        RunnablePassthrough.assign(context=(lambda x: x["context"]))
        | custom_rag_prompt
        | llm
        | StrOutputParser()
    )

    # Run retrieval and answering together so the caller gets sources too.
    rag_chain_with_source = RunnableParallel(retrieve).assign(answer=rag_chain)

    started = time.time()
    answer = rag_chain_with_source.invoke(query)
    print(f"Time taken: {time.time() - started}")
    return answer
# Demo query: a cold call hits the LLM; repeating a semantically similar
# question should return much faster via the semantic cache.
result = query_data("When did MongoDB begin supporting multi-document transactions?")
print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment