Last active
February 27, 2024 16:26
-
-
Save mikesparr/7b8c6360d884770393c3e88376309ed1 to your computer and use it in GitHub Desktop.
Experiment with Langchain, OpenAI, and Datastax AstraDB (vector database) for custom LLM Q&A bot in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Reference: https://www.youtube.com/watch?v=yfHHvmaMkcA (Tutorial - some dated libs fixed in here)
"""Custom LLM Q&A bot: embeds sample Onion headlines into a DataStax Astra DB
(Cassandra) vector store and answers questions over them via OpenAI/LangChain."""

# imports
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from datasets import load_dataset

# set constants
# NOTE(review): these are placeholder secrets -- load real credentials from
# environment variables (os.environ) rather than committing them to source.
ASTRA_DB_APPLICATION_TOKEN = "<< YOUR APP TOKEN (starts with AstraCS:) >>"
ASTRA_DB_SECURE_BUNDLE_PATH = "<< FULL PATH TO YOUR ASTRA DB BUNDLE >>"
ASTRA_DB_CLIENT_ID = "<< YOUR ASTRA DB CLIENT ID >>"
ASTRA_DB_CLIENT_SECRET = "<< YOUR ASTRA DB CLIENT SECRET >>"
ASTRA_DB_KEYSPACE = "search"
OPENAI_API_KEY = "<< YOUR OPENAI KEY (starts with sk-) >>"


def _connect_astra():
    """Open a connection to Astra DB via the secure-connect bundle.

    Returns:
        (cluster, session) -- the cluster is returned so the caller can
        shut it down when finished.
    """
    cloud_config = {
        'secure_connect_bundle': ASTRA_DB_SECURE_BUNDLE_PATH
    }
    auth_provider = PlainTextAuthProvider(ASTRA_DB_CLIENT_ID, ASTRA_DB_CLIENT_SECRET)
    cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
    return cluster, cluster.connect()


def _load_headlines(limit=50):
    """Fetch the first *limit* sample headlines from HuggingFace.

    Args:
        limit: number of headlines to keep (default 50, as in the tutorial).

    Returns:
        list[str] of headline texts.
    """
    print("Loading data from huggingface")
    dataset = load_dataset("Biddls/Onion_News", split="train")
    return dataset["text"][:limit]


def _qa_loop(llm, vector_index, vector_store):
    """Run the interactive CLI Q&A loop; the user types 'quit' to exit."""
    first_question = True
    while True:
        if first_question:
            query_text = input("\nEnter your question (or type 'quit' to exit): ")
            first_question = False
        else:
            query_text = input("\nWhat's your next question (or type 'quit' to exit): ")
        if query_text.lower() == 'quit':
            break
        print("QUESTION: \"%s\"" % query_text)
        # Answer via the index wrapper (retrieval + LLM synthesis).
        answer = vector_index.query(query_text, llm=llm).strip()
        print("ANSWER: \"%s\"\n" % answer)
        # Also show the raw top-4 matches with their similarity scores.
        print("DOCUMENTS BY RELEVANCE:")
        for doc, score in vector_store.similarity_search_with_score(query_text, k=4):
            print(" %0.4f \"%s ...\"" % (score, doc.page_content[:60]))


def main():
    """Wire up LLM, embeddings, and vector store, then run the CLI bot."""
    cluster, astra_session = _connect_astra()
    try:
        llm = OpenAI(openai_api_key=OPENAI_API_KEY)
        my_embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
        vector_store = Cassandra(
            embedding=my_embedding,
            session=astra_session,
            keyspace=ASTRA_DB_KEYSPACE,
            table_name="qa_mini_demo",
        )
        # load data from sample source
        headlines = _load_headlines()
        # generate vector embeddings from the text above and insert them into
        # the Cassandra store (Astra DB serverless vector DB)
        print("\nGenerating embeddings and storing in AstraDB")
        vector_store.add_texts(headlines)
        print("Inserted %i headlines.\n" % len(headlines))
        vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)
        # create simple CLI Q&A bot ('quit' to end)
        _qa_loop(llm, vector_index, vector_store)
    finally:
        cluster.shutdown()  # fix: original leaked the driver connection pool


if __name__ == "__main__":
    main()
Additional learnings about Word2Vec and NLP
The freeCodeCamp tutorial on YouTube briefly covers word-embedding concepts in natural language processing (NLP) and mentions Word2Vec, even though the code itself uses OpenAI's embeddings API. Out of curiosity, I searched for Word2Vec to learn more about it; interestingly, the tutorial's illustrative example — king − man + woman = queen —
is the exact example used in this Medium article on Word2Vec.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Generative AI chat bot using large language model (LLM)
This is the working example code based on this YouTube tutorial, updated to use the correct libraries (some of the references in the video are now deprecated).
Result