This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from flask import Flask, request, send_from_directory, send_file | |
import os | |
from flask_cors import CORS, cross_origin | |
# serve static files from the URL root (empty static_url_path); the static folder itself is left at Flask's default | |
app = Flask(__name__, static_url_path='') | |
CORS(app) | |
app.config["CORS_HEADERS"]= 'Content-Type' | |
## flask endpoint to serve image using filename |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import json | |
from datetime import datetime, timedelta | |
import locale | |
import traceback | |
from matplotlib import dates |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## helps to retrieve the most similar question for each test query, based on the input vectors/embeddings | |
def retrieveSimilarFAQ(train_question_vectors, test_question_vectors, train_QA_df, train_column_name, test_QA_df, test_column_name): | |
similar_question_index = [] | |
for test_index, test_vector in enumerate(test_question_vectors): | |
sim, sim_Q_index = -1, -1 | |
for train_index, train_vector in enumerate(train_question_vectors): | |
sim_score = cosine_similarity(train_vector, test_vector)[0][0] | |
if sim < sim_score: | |
sim = sim_score |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Embeddings(): | |
def __init__(self, model_path): | |
self.model_path = model_path | |
self.model = None | |
self.__load_model__() | |
def __load_model__(self): | |
#word_vectors = api.load("glove-wiki-gigaword-100") | |
model_name = 'glove-twitter-25' #'word2vec-google-news-50' #'glove-twitter-25' | |
if not os.path.exists(self.model_path+ model_name): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TF_IDF(): | |
def __init__(self): | |
self.dictionary = None | |
self.model = None | |
self.bow_corpus = None | |
def create_tf_idf_model(self, data_df, column_name): | |
## create sentence token list | |
sentence_token_list = [sentence.split(" ") for sentence in data_df[column_name]] | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Data Preprocessing | |
class TextPreprocessor(): | |
def __init__(self, data_df, column_name=None): | |
self.data_df = data_df | |
if not column_name and type(colum_name) == str: | |
raise Exception("column name is mandatory. Make sure type is string format") | |
self.column = column_name | |
self.convert_lowercase() | |
self.applied_stopword = False | |
self.processed_column_name = f"processed_{self.column}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## QA will be stored as a .csv file | |
def extract_QA_from_text_file(INPUT_DIR, text_file_name): | |
output_file_name = 'covid_19faq.csv' | |
with open(os.path.join(INPUT_DIR, text_file_name), 'r', encoding='latin') as obj: | |
text = obj.read() | |
text = text.strip() | |
## extract the question by following pattern | |
pattern = '\n+\s*\d+[.](.*?)\?' | |
question_pattern = re.compile(pattern,re.MULTILINE|re.IGNORECASE|re.DOTALL) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## The data is taken from https://www.un.org/sites/un2.un.org/files/new_dhmosh_covid-19_faq.pdf | |
## It contains FAQ-style questions and answers about COVID-19 | |
def download_pdf_url(dataset_url, file_name): | |
response = requests.get(dataset_url) | |
pdf_content_output = None | |
with io.BytesIO(response.content) as open_pdf_file: | |
with open(file_name,'w') as obj: | |
obj.write(str(open_pdf_file)) |