"""
Created April 9, 2023.

Builds a private-data QA bot entirely from open-source LLM projects.
"""
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import LlamaCppEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
# The embedding model and the LLM are both loaded from the same weights file,
# so the path is spelled out once and shared.
_MODEL_PATH = "path/models/ggml-model-q4_0.bin"

# Source of the documents that will be indexed.
loader = UnstructuredHTMLLoader("langchain/docs/_build/html/index.html")

# Embedding model (used when building the vector index) and the LLM
# (used by the chain created further down).
embedding = LlamaCppEmbeddings(model_path=_MODEL_PATH)
llm = LlamaCpp(model_path=_MODEL_PATH)
def split_chunks(sources: list) -> list:
    """Split the given documents into small, slightly overlapping chunks.

    Args:
        sources: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        A list with every chunk the splitter produced, in the order the
        splitter yielded them.
    """
    # NOTE(review): RecursiveCharacterTextSplitter is usually configured with
    # a ``separators`` list rather than a ``separator`` string -- confirm this
    # keyword is accepted by the installed langchain version.
    splitter = RecursiveCharacterTextSplitter(separator="", chunk_size=256, chunk_overlap=16)
    # The original loop over the splitter output had no body, so the result
    # list was never filled; materialise the splitter output directly.
    return list(splitter.split_documents(sources))
def generate_embedding(chunks: list):
    """Build a FAISS index from the text and metadata of ``chunks``.

    Args:
        chunks: Items exposing ``page_content`` and ``metadata`` attributes.

    Returns:
        The object returned by ``FAISS.from_texts`` for the collected texts,
        embedded with the module-level ``embedding`` model.
    """
    page_texts = []
    page_metadatas = []
    # Keep text and metadata aligned by index: entry i of each list comes
    # from the same chunk.
    for document in chunks:
        page_texts.append(document.page_content)
        page_metadatas.append(document.metadata)
    return FAISS.from_texts(page_texts, embedding, metadatas=page_metadatas)
def similarity_search(
    query: str, index: FAISS, k: int = 4
) -> (list, list):
    """Look up the documents in ``index`` that are most similar to ``query``.

    Args:
        query: Text to search for.
        index: Vector store to query.
        k: Number of documents to request from the index. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        A ``(matched_docs, sources)`` pair: the documents returned by
        ``index.similarity_search`` and, for each of them in the same order,
        a dict with its ``"page_content"`` and ``"metadata"``.
    """
    matched_docs = index.similarity_search(query, k=k)
    # One plain-dict summary per matched document. The original loop had lost
    # the ``sources.append({...})`` wrapper around these two entries, so
    # ``sources`` was never populated.
    sources = [
        {
            "page_content": doc.page_content,
            "metadata": doc.metadata,
        }
        for doc in matched_docs
    ]
    return matched_docs, sources
# The question and the prompt skeleton do not depend on the index, so they
# are set up first.
question = "What are the use cases of LangChain?"
template = """
Please use the following context to answer questions.
Context: {context}
Question: {question}
Answer: Let's think step by step."""

# Load the source documents, cut them into chunks, and index the chunks.
docs = loader.load()
chunks = split_chunks(docs)
embeddings = generate_embedding(chunks)

# Retrieve the chunks closest to the question and join their text, one chunk
# per line, to form the prompt context.
matched_docs, sources = similarity_search(question, embeddings)
context = "\n".join(doc.page_content for doc in matched_docs)

# Pre-fill {context} in the template; {question} stays as an open variable.
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
).partial(context=context)
llm_chain = LLMChain(prompt=prompt, llm=llm)
