Skip to content

Instantly share code, notes, and snippets.

@PrashantSaikia
Created October 29, 2023 23:30
Show Gist options
  • Save PrashantSaikia/d6a5be8a0e7e236207e759366e2becc5 to your computer and use it in GitHub Desktop.
Save PrashantSaikia/d6a5be8a0e7e236207e759366e2becc5 to your computer and use it in GitHub Desktop.
Weaviate Schema for Multiple Document Query Bot
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.weaviate import Weaviate
import weaviate, os
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Get environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# The class created further down is configured with the "text2vec-openai"
# vectorizer, which runs inside Weaviate and needs an OpenAI key of its own.
# Forward the key as a request header when one is configured; when it is not
# set, connect without extra headers exactly as before.
client = weaviate.Client(
    'http://localhost:80',
    additional_headers=(
        {"X-OpenAI-Api-Key": OPENAI_API_KEY} if OPENAI_API_KEY else None
    ),
)

# Load the documents: the glob matches every PDF below the given folder.
doc_loader = DirectoryLoader(
    r'C:\Users\username\Docs',
    glob='**/*.pdf',
    show_progress=True,
)
docs = doc_loader.load()

# Split the docs into chunks (chunk_size=1024, with an overlap of 50 between
# neighbouring chunks).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=50,
)
splitted_docs_list = splitter.split_documents(docs)
# Create schema
# Weaviate class names must start with an uppercase letter and may contain
# only letters, digits and underscores, so a hyphenated name is rejected.
classname = 'Multiple_PDF_query_class'

# Start from a clean slate: drop any previous copy of the class.
if client.schema.exists(classname):
    client.schema.delete_class(classname)

class_obj = {
    "class": classname,
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": True,
        },
    },
}

try:
    # Add the class to the schema
    client.schema.create_class(class_obj)
except Exception as err:
    # Any existing class was deleted just above, so a failure here is a real
    # problem (rejected definition, unreachable server, ...). Report the
    # actual cause and stop instead of continuing without the class.
    print(f"Could not create class {classname!r}: {err}")
    raise
# Embedding model handed to the vector store below.
embeddings = OpenAIEmbeddings()

# We use 'classname' for index_name and 'text' for text_key
vectorstore = Weaviate(
    client=client,
    index_name=classname,
    text_key="text",
    embedding=embeddings,
)

# Add the text chunks (and their metadata) to the Weaviate vector database.
texts = [chunk.page_content for chunk in splitted_docs_list]
metadatas = [chunk.metadata for chunk in splitted_docs_list]

# NOTE(review): the store was already constructed with `embeddings`; the extra
# `embedding` keyword is passed through unchanged here - confirm whether
# add_texts actually reads it.
vectorstore.add_texts(
    texts,
    metadatas=metadatas,
    embedding=embeddings,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment