
@vamsigutta
Created August 18, 2023 14:25
This script lets you query the documents in a directory on your PC: it loads PDF, Markdown, text, and CSV files, splits them into chunks, embeds the chunks into a FAISS index, retrieves the chunks most similar to your query (typed or spoken), and has Flan-T5 answer from that context.
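# Dependencies (approximate, not listed in the original gist): langchain,
# faiss-cpu, InstructorEmbedding, sentence-transformers, transformers,
# sentencepiece, SpeechRecognition; voice input additionally needs
# PyAudio and openai-whisper.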
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from transformers import T5Tokenizer, T5ForConditionalGeneration
import speech_recognition as sr
import sys
def get_context_faiss(query):
    # Return the contents of the chunks most similar to `query`, joined into
    # a single context string. Uses the module-level FAISS index `db`
    # created in the __main__ block below.
    search_results = db.similarity_search(query)
    context_doc = ' '.join(doc.page_content for doc in search_results)
    return context_doc
if __name__ == "__main__":
    documents = []
    # Point this at the directory you want to query.
    path = "./test/"
    # Load every PDF, Markdown, text, and CSV file in the directory.
    for extension in ["pdf", "md", "txt", "csv"]:
        loader = DirectoryLoader(path, glob=f'*.{extension}')
        documents.extend(loader.load())
    # Split the documents into 500-character chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    all_splits = text_splitter.split_documents(documents)
    embeddings = HuggingFaceInstructEmbeddings(
        model_name="hkunlp/instructor-large",
        model_kwargs={"device": "cpu"}
    )
    # Build a FAISS index over the chunks, persist it, and load it back.
    vectorstore = FAISS.from_documents(all_splits, embeddings)
    vectorstore.save_local("vector_store/faiss_index_local_data")
    db = FAISS.load_local("vector_store/faiss_index_local_data", embeddings)

    # In this example Flan-T5 generates the answers. Load the tokenizer and
    # model once, outside the query loop, so they are not reloaded per query.
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl")

    # Pass "speech" as the last command-line argument to use voice input.
    parameter = sys.argv[-1]
    while True:
        if parameter == "speech":
            # Capture a spoken query from the microphone and transcribe it
            # with Whisper via the SpeechRecognition library.
            r = sr.Recognizer()
            with sr.Microphone() as source:
                print("Say something!")
                audio = r.listen(source)
            query = r.recognize_whisper(audio, language="english")
        else:
            query = input("Enter your query: ")
        print(f"Query is: {query}")
        if query == 'quit':
            break
        # Retrieve the most relevant chunks and prepend them to the question.
        context = get_context_faiss(query)
        prompt_txt = f"""{context}
Q: {query}
A:
"""
        # Generate an answer from the retrieved context. Without a limit,
        # T5's default generation length cuts answers short, so a generous
        # cap is set here.
        input_ids = tokenizer(prompt_txt, return_tensors="pt").input_ids
        outputs = model.generate(input_ids, max_new_tokens=256)
        print(tokenizer.decode(outputs[0], skip_special_tokens=True))
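
Run the script normally for typed queries, or pass "speech" as the last command-line argument to dictate queries through the microphone; type "quit" to exit the loop.

Note that the script rebuilds the index from scratch on every run, even though it already saves it to vector_store/faiss_index_local_data. A later run could reload the saved index and skip re-embedding; here is a minimal sketch (the os.path.exists guard is my addition, not part of the gist):

import os
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={"device": "cpu"}
)
index_path = "vector_store/faiss_index_local_data"
if os.path.exists(index_path):
    # Reuse the index persisted by a previous run instead of re-embedding.
    db = FAISS.load_local(index_path, embeddings)
else:
    # First run: build the index as in the script above.
    ...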