Skip to content

Instantly share code, notes, and snippets.

@amotl
Created October 30, 2023 00:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amotl/f8a62404e23a172f0671842e965dae48 to your computer and use it in GitHub Desktop.
"""
pip install requests 'requests-cache<2'
"""
import os
import requests_cache
import typing as t
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cratedb import CrateDBVectorSearch
from unstructured.partition.html import partition_html
# Connect to CrateDB instance defined by `CRATEDB_CONNECTION_STRING`.
CONNECTION_STRING = os.environ.get(
"CRATEDB_CONNECTION_STRING",
"crate://crate@localhost/?schema=notebook",
)
# Vector-store collection name used for both ingest and query below.
COLLECTION_NAME = "state_of_the_union_test"
# Embedding model client; presumably requires OpenAI credentials in the
# environment — confirm (nothing in this file configures them).
embeddings = OpenAIEmbeddings()
# HTTP session with on-disk response caching (backed by `requests-cache`,
# pinned `<2` in the module docstring above).
http = requests_cache.CachedSession(".httpcache")
def document_from_url(url: str) -> Document:
    """
    Convert the resource at `url` into a LangChain `Document`.

    The response body is parsed as HTML; every extracted element is
    stringified and joined into one text blob separated by blank lines.

    :param url: Location of the resource to fetch.
    :return: `Document` carrying the extracted text, with the URL
             recorded as `source` metadata.
    :raises requests.HTTPError: When the server answers with a 4xx/5xx
             status (previously such error pages were silently ingested).
    """
    response = http.get(url)
    # Fail loudly on HTTP errors instead of partitioning an error page.
    response.raise_for_status()
    elements = partition_html(text=response.text)
    text = "\n\n".join(str(el) for el in elements)
    metadata = {"source": url}
    return Document(page_content=text, metadata=metadata)
def load_documents(url: str) -> t.List[Document]:
    """
    Fetch the resource at `url` and split it into chunk-sized documents.

    :param url: Location of the resource to ingest.
    :return: List of `Document` chunks (~1000 characters each, no overlap).
    """
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return splitter.split_documents([document_from_url(url)])
def main():
    """
    End-to-end demo: fetch a text document, embed it into CrateDB,
    then run a sample similarity query against the collection.
    """
    print("Acquiring")
    url = "https://github.com/langchain-ai/langchain/raw/v0.0.325/docs/docs/modules/state_of_the_union.txt"
    chunks = load_documents(url)

    print("Loading")
    store = CrateDBVectorSearch.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        connection_string=CONNECTION_STRING,
        # Drop any pre-existing collection so every run starts clean.
        pre_delete_collection=True,
    )

    print("Querying")
    results = store.similarity_search_with_score("foo", k=10)
    print(f"Result count: {len(results)}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment