Created
May 14, 2020 12:39
-
-
Save narendraprasath/1b9a80e5ce8fd540e660e8d7b54904f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TF_IDF():
    """Build a gensim TF-IDF model over a text column and extract vectors.

    Workflow: call ``create_tf_idf_model`` once on the training DataFrame,
    then use the getters to obtain TF-IDF vectors for the training set,
    a test set, or the learned vocabulary. Tokenization is a plain
    whitespace split; no lowercasing or punctuation handling is done here.
    """

    def __init__(self):
        # gensim Dictionary mapping token <-> integer id (set by create_tf_idf_model)
        self.dictionary = None
        # trained gensim TfidfModel (set by create_tf_idf_model)
        self.model = None
        # bag-of-words representation of the training corpus (set by create_tf_idf_model)
        self.bow_corpus = None

    def create_tf_idf_model(self, data_df, column_name):
        """Fit the TF-IDF model on ``data_df[column_name]``.

        Args:
            data_df: DataFrame (or mapping) whose ``column_name`` entry is an
                iterable of sentence strings.
            column_name: column holding the training sentences.

        Side effects: populates ``self.dictionary``, ``self.bow_corpus`` and
        ``self.model``, and prints the first 10 sentences with their vectors.
        """
        ## tokenize each sentence by whitespace
        sentence_token_list = [sentence.split(" ") for sentence in data_df[column_name]]
        ## build the dataset vocabulary
        self.dictionary = Dictionary(sentence_token_list)
        ## bag-of-words representation of the dataset
        self.bow_corpus = [self.dictionary.doc2bow(tokens) for tokens in sentence_token_list]
        ## compute TF-IDF scores for the corpus
        self.model = TfidfModel(self.bow_corpus)
        ## preview: first 10 sentences with their TF-IDF vectors
        print("First 10 question representation of TF-IDF vector")
        for index, sentence in enumerate(data_df[column_name]):
            # fix: original condition `index <= 10` printed 11 rows,
            # contradicting the "First 10" message above
            if index >= 10:
                break
            print(f"{sentence} {self.model[self.bow_corpus[index]]}")

    def get_vector_for_test_set(self, test_df, column_name):
        """Return TF-IDF vectors for ``test_df[column_name]`` sentences.

        Tokens unseen during training are dropped by ``doc2bow`` (gensim
        default), so out-of-vocabulary words contribute nothing.
        """
        sentence_token_list = [sentence.split(" ") for sentence in test_df[column_name]]
        test_bow_corpus = [self.dictionary.doc2bow(tokens) for tokens in sentence_token_list]
        return [self.model[bow] for bow in test_bow_corpus]

    def get_training_QA_vectors(self):
        """Return the TF-IDF vectors of the training corpus, in input order."""
        return [self.model[bow] for bow in self.bow_corpus]

    def get_train_vocabulary(self):
        """Return the learned vocabulary as a list of token strings."""
        # iterating a gensim Dictionary yields token ids; index back to tokens
        return [self.dictionary[token_id] for token_id in self.dictionary]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment