Skip to content

Instantly share code, notes, and snippets.

@jeffchuber
Created July 21, 2023 04:43
Show Gist options
  • Save jeffchuber/a9ebc0ad5c7b053b8d1c50449c07f893 to your computer and use it in GitHub Desktop.
Save jeffchuber/a9ebc0ad5c7b053b8d1c50449c07f893 to your computer and use it in GitHub Desktop.
# import
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
# Read the source text from disk.
raw_documents = TextLoader("./state_of_the_union.txt").load()

# Cut the loaded text into chunks (chunk_size=1000, no overlap between chunks).
chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(
    raw_documents
)

# Build the open-source sentence-transformers embedding function, then index
# the chunks in Chroma with ./db_metadata_v5 as the persist directory.
# NOTE(review): re-running against the same directory presumably adds the same
# chunks again -- confirm against the Chroma version in use.
embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(
    chunks,
    embedder,
    persist_directory="./db_metadata_v5",
)

# Run a similarity search for the question and print the text of the first hit.
question = "What did the president say about Ketanji Brown Jackson"
top_hit = db.similarity_search(question)[0]
print(top_hit.page_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment