Created
May 14, 2020 12:39
-
-
Save narendraprasath/1b9a80e5ce8fd540e660e8d7b54904f3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TF_IDF():
    """Build a gensim TF-IDF model over a text column and extract vectors.

    Workflow: call ``create_tf_idf_model`` once on the training DataFrame,
    then use the getters to obtain TF-IDF vectors for the training set,
    a test set, or the learned vocabulary. Tokenization is a plain
    whitespace split; no lowercasing or punctuation handling is done here.
    """

    def __init__(self):
        # gensim Dictionary mapping token <-> integer id (set by create_tf_idf_model)
        self.dictionary = None
        # trained gensim TfidfModel (set by create_tf_idf_model)
        self.model = None
        # bag-of-words representation of the training corpus (set by create_tf_idf_model)
        self.bow_corpus = None

    def create_tf_idf_model(self, data_df, column_name):
        """Fit the TF-IDF model on ``data_df[column_name]``.

        Args:
            data_df: DataFrame (or mapping) whose ``column_name`` entry is an
                iterable of sentence strings.
            column_name: column holding the training sentences.

        Side effects: populates ``self.dictionary``, ``self.bow_corpus`` and
        ``self.model``, and prints the first 10 sentences with their vectors.
        """
        ## tokenize each sentence by whitespace
        sentence_token_list = [sentence.split(" ") for sentence in data_df[column_name]]
        ## build the dataset vocabulary
        self.dictionary = Dictionary(sentence_token_list)
        ## bag-of-words representation of the dataset
        self.bow_corpus = [self.dictionary.doc2bow(tokens) for tokens in sentence_token_list]
        ## compute TF-IDF scores for the corpus
        self.model = TfidfModel(self.bow_corpus)
        ## preview: first 10 sentences with their TF-IDF vectors
        print("First 10 question representation of TF-IDF vector")
        for index, sentence in enumerate(data_df[column_name]):
            # fix: original condition `index <= 10` printed 11 rows,
            # contradicting the "First 10" message above
            if index >= 10:
                break
            print(f"{sentence} {self.model[self.bow_corpus[index]]}")

    def get_vector_for_test_set(self, test_df, column_name):
        """Return TF-IDF vectors for ``test_df[column_name]`` sentences.

        Tokens unseen during training are dropped by ``doc2bow`` (gensim
        default), so out-of-vocabulary words contribute nothing.
        """
        sentence_token_list = [sentence.split(" ") for sentence in test_df[column_name]]
        test_bow_corpus = [self.dictionary.doc2bow(tokens) for tokens in sentence_token_list]
        return [self.model[bow] for bow in test_bow_corpus]

    def get_training_QA_vectors(self):
        """Return the TF-IDF vectors of the training corpus, in input order."""
        return [self.model[bow] for bow in self.bow_corpus]

    def get_train_vocabulary(self):
        """Return the learned vocabulary as a list of token strings."""
        # iterating a gensim Dictionary yields token ids; index back to tokens
        return [self.dictionary[token_id] for token_id in self.dictionary]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment