Skip to content

Instantly share code, notes, and snippets.

@amotl
Created October 27, 2023 20:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amotl/75f27244951f201b89db0d8394f97a0e to your computer and use it in GitHub Desktop.
Save amotl/75f27244951f201b89db0d8394f97a0e to your computer and use it in GitHub Desktop.
import logging
import sys
import requests
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import CrateDBVectorSearch
from langchain.embeddings import OpenAIEmbeddings
# Name of the vector-store collection; passed as `collection_name` in `main()`.
COLLECTION_NAME = "state_of_the_union_test"
# Database URL passed as `connection_string` in `main()`.
# NOTE(review): reads like an SQLAlchemy-style CrateDB URL (user `crate`,
# host `localhost`, schema `testdrive`) -- confirm against the driver docs.
CONNECTION_STRING = "crate://crate@localhost/?schema=testdrive"
def get_state_of_the_union():
    """
    Load the `state_of_the_union.txt` document, and split it into chunks.

    The text is downloaded over HTTP from the LangChain repository (pinned to
    tag v0.0.325), wrapped into a single `Document` whose metadata records the
    source file name, and then split with a `CharacterTextSplitter`
    (chunk_size=1000, chunk_overlap=0).

    Returns:
        The result of `CharacterTextSplitter.split_documents()` for the
        downloaded document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
    # Bound the request so a stalled connection cannot hang the script forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of embedding an error page as content.
    response.raise_for_status()
    state_of_the_union = response.text

    raw_documents = [
        Document(
            page_content=state_of_the_union,
            metadata={"source": "state_of_the_union.txt"},
        )
    ]
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(raw_documents)
def main():
    """
    Load document chunks, embed each chunk, and load it into the vector store.

    Creates an `OpenAIEmbeddings` instance, fetches the chunked document via
    `get_state_of_the_union()`, and hands both to
    `CrateDBVectorSearch.from_documents()` together with the module-level
    collection name and connection string.
    """
    embeddings = OpenAIEmbeddings()
    documents = get_state_of_the_union()
    # The returned store object is not needed afterwards, so it is not bound
    # to a name; the call is made for its effect on the database.
    # NOTE(review): `pre_delete_collection=True` presumably drops an existing
    # collection of the same name before loading -- confirm in the adapter docs.
    CrateDBVectorSearch.from_documents(
        embedding=embeddings,
        documents=documents,
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
def setup_logging(level=logging.DEBUG):
    """
    Configure the root logger through `logging.basicConfig`.

    Records are written to `sys.stderr`, formatted as timestamp, logger name,
    level name, and message in padded columns.

    Args:
        level: Log level handed to `basicConfig`; defaults to `logging.DEBUG`.
    """
    logging.basicConfig(
        level=level,
        stream=sys.stderr,
        format="%(asctime)-15s [%(name)-35s] %(levelname)-8s: %(message)s",
    )
# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # Configure logging first so messages emitted during `main()` are shown.
    setup_logging()
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment