Skip to content

Instantly share code, notes, and snippets.

@narendraprasath
Created May 14, 2020 12:39
Show Gist options
  • Save narendraprasath/1b9a80e5ce8fd540e660e8d7b54904f3 to your computer and use it in GitHub Desktop.
Save narendraprasath/1b9a80e5ce8fd540e660e8d7b54904f3 to your computer and use it in GitHub Desktop.
class TF_IDF:
    """TF-IDF vectoriser for a column of space-separated sentences.

    Relies on ``Dictionary`` and ``TfidfModel`` being available at module
    level (presumably gensim's ``corpora.Dictionary`` and
    ``models.TfidfModel`` -- confirm against the file's imports).
    """

    # Maximum number of training sentences echoed to stdout after fitting.
    _PREVIEW_LIMIT = 10

    def __init__(self):
        # All three stay None until create_tf_idf_model() has been called.
        self.dictionary = None  # vocabulary built from the training sentences
        self.model = None  # TF-IDF model built from ``bow_corpus``
        self.bow_corpus = None  # list of per-sentence ``doc2bow`` results

    @staticmethod
    def _tokenize(sentences):
        """Return one token list per sentence, splitting on single spaces.

        Uses ``split(" ")`` rather than ``split()``, so runs of spaces yield
        empty-string tokens. Training and test data share this helper so
        both are tokenised identically.
        """
        return [sentence.split(" ") for sentence in sentences]

    def create_tf_idf_model(self, data_df, column_name):
        """Build the vocabulary, bag-of-words corpus and TF-IDF model.

        Args:
            data_df: Container indexable by ``column_name`` that yields an
                iterable of sentence strings (presumably a pandas DataFrame
                -- verify against the caller).
            column_name: Key of the sentence column inside ``data_df``.

        Side effects:
            Sets ``self.dictionary``, ``self.bow_corpus`` and ``self.model``
            and prints a preview of at most the first 10 sentences together
            with their TF-IDF representation.
        """
        sentence_token_list = self._tokenize(data_df[column_name])

        # Dataset vocabulary.
        self.dictionary = Dictionary(sentence_token_list)

        # Bag-of-words representation of every training sentence.
        self.bow_corpus = [
            self.dictionary.doc2bow(sentence_tokens)
            for sentence_tokens in sentence_token_list
        ]

        # TF-IDF model computed over the whole training corpus.
        self.model = TfidfModel(self.bow_corpus)

        # Preview: the sliced corpus bounds the zip, so exactly the first
        # _PREVIEW_LIMIT sentences are shown (the former ``index <= 10``
        # check printed 11 rows under a "First 10" header).
        print("First 10 question representation of TF-IDF vector")
        preview_bows = self.bow_corpus[: self._PREVIEW_LIMIT]
        for sentence, bow in zip(data_df[column_name], preview_bows):
            print(f"{sentence} {self.model[bow]}")

    def get_vector_for_test_set(self, test_df, column_name):
        """Return the TF-IDF vector of every sentence in ``test_df``.

        The sentences are tokenised like the training data and converted
        with the dictionary and model built by create_tf_idf_model(), which
        therefore has to be called first.

        Args:
            test_df: Container indexable by ``column_name`` that yields an
                iterable of sentence strings.
            column_name: Key of the sentence column inside ``test_df``.

        Returns:
            A list with one ``self.model[...]`` result per test sentence, in
            input order.
        """
        test_bow_corpus = [
            self.dictionary.doc2bow(sentence_tokens)
            for sentence_tokens in self._tokenize(test_df[column_name])
        ]
        return [self.model[test_sentence] for test_sentence in test_bow_corpus]

    def get_training_QA_vectors(self):
        """Return the TF-IDF vector of every training sentence, in order.

        Requires create_tf_idf_model() to have populated ``self.bow_corpus``
        and ``self.model``.
        """
        return [self.model[sentence_vector] for sentence_vector in self.bow_corpus]

    def get_train_vocabulary(self):
        """Return the training vocabulary as a list of tokens.

        Iterates ``self.dictionary`` and looks every yielded key up again, so
        the result follows the dictionary's own iteration order.
        """
        return [self.dictionary[index] for index in self.dictionary]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment