Skip to content

Instantly share code, notes, and snippets.

View sukanyabag's full-sized avatar
🌴
On vacation

Sukanya Bag sukanyabag

🌴
On vacation
View GitHub Profile
import os
from controller import Controller
import gradio as gr
# Disable HuggingFace tokenizers fork-parallelism to silence the
# "tokenizers parallelism" warning when the server forks workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Single app-wide Controller instance shared by the Gradio handlers.
controller = Controller()
def process_pdf(file):
import os

# OpenAI API key, read from the environment.
# BUG FIX: the original called os.getenv(OPENAI_API_KEY) — passing the bare
# (undefined) name instead of the string "OPENAI_API_KEY" — which raised
# NameError at import time.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# On-disk location of the DeepLake text vector store.
# FIX: the original used "data\deeplake_text_vectorstore"; "\d" is an invalid
# escape sequence in a non-raw string and the backslash is Windows-only.
# Forward slashes work on every OS.
TEXT_VECTORSTORE_PATH = "data/deeplake_text_vectorstore"

# Chunk size used by the character text splitter when slicing documents.
CHARACTER_SPLITTER_CHUNK_SIZE = 75
# Batch size for texts sent per OpenAI embeddings request.
OPENAI_EMBEDDINGS_CHUNK_SIZE = 16
from langchain.callbacks import get_openai_callback
def save(query, qa):
    """Run *query* through the QA chain *qa* and return the answer text.

    The call is wrapped in ``get_openai_callback`` so OpenAI token/cost
    usage is tracked for the duration of the request.
    """
    with get_openai_callback() as cb:  # cb accumulates token-usage stats
        result = qa({"query": query}, return_only_outputs=True)["result"]
    return result
def retrieve_text(self, query):
self.text_deeplake_schema = DeepLake(
dataset_path=cfg.TEXT_VECTORSTORE_PATH,
read_only=True,
embedding_function=self.embeddings,
)
prompt_template = """You are an intelligent AI which analyses text from documents and
answers the user's questions. Please answer in as much detail as possible, so that the user does not have to
revisit the document. If you don't know the answer, say that you don't know, and avoid making up things.
def create_and_add_embeddings(self, file):
    """Prepare OpenAI embeddings and load *file* (a PDF path) into documents.

    NOTE(review): the visible fragment ends right after ``loader.load()``;
    the text splitting and vector-store insertion presumably follow in the
    full source — confirm against the original file.
    """
    # Ensure the local "data" directory (vector-store target) exists.
    os.makedirs("data", exist_ok=True)
    # Embedding client; cfg is presumably the project's config module
    # (not imported in this visible fragment) — verify.
    self.embeddings = OpenAIEmbeddings(
        openai_api_key=cfg.OPENAI_API_KEY,
        chunk_size=cfg.OPENAI_EMBEDDINGS_CHUNK_SIZE,
    )
    # Load the PDF via PyMuPDF into LangChain documents.
    loader = PyMuPDFLoader(file)
    documents = loader.load()
class Retriever:
    """Holds the embeddings, DeepLake schema and retriever used for QA."""

    def __init__(self):
        # Short conversational context: keep only the last 2 exchanges.
        self.memory = ConversationBufferWindowMemory(k=2, return_messages=True)
        # The following are populated lazily by the setup/retrieval methods.
        self.embeddings = None
        self.text_deeplake_schema = None
        self.text_retriever = None
import os
from langchain import PromptTemplate
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.deeplake import DeepLake
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyMuPDFLoader
from langchain.chat_models.openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferWindowMemory
from retriever.retrieval import Retriever
class Controller:
    """Coordinates document embedding for the app via a Retriever."""

    def __init__(self):
        # Retriever is built on the first uploaded document; None until then.
        self.retriever = None
        # Last user query; empty string until a question is asked.
        self.query = ""

    def embed_document(self, file):
        """Build a Retriever for an uploaded *file*; no-op when file is None.

        NOTE(review): the visible fragment stops after constructing the
        Retriever — the call that actually embeds the file presumably
        follows in the full source; confirm.
        """
        if file is not None:
            self.retriever = Retriever()
# Handle class imbalance with combined over/under-sampling:
# SMOTE oversampling followed by Tomek-link cleaning.
# NOTE(review): X and Y are assumed to be defined earlier in the notebook.
from collections import Counter

from imblearn.combine import SMOTETomek

# FIX: imbalanced-learn >= 0.4 renamed fit_sample -> fit_resample and
# ratio -> sampling_strategy; the old names were removed entirely.
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(X, Y)

# Resample again, targeting a minority class half the size of the majority.
os_us = SMOTETomek(sampling_strategy=0.5)
X_train_res1, y_train_res1 = os_us.fit_resample(X, Y)

# FIX: these prints must run AFTER resampling — the original fragment had
# them before y_train_res1 existed, which raised NameError.
print('Original dataset shape {}'.format(Counter(Y)))
print('Resampled dataset shape {}'.format(Counter(y_train_res1)))
X_train_res1.shape, y_train_res1.shape