# ravi07bec/tf_idf.py

## tf_idf.py
#Term Frequency
def termfreq(document, word):
    """Return the relative frequency of *word* within *document*.

    Args:
        document: Sized iterable of tokens (it must support ``len`` and
            iteration), e.g. a list of words.
        word: Token to look for; compared to each token with ``==``.

    Returns:
        The number of tokens equal to *word* divided by ``len(document)``,
        or ``0.0`` when *document* is empty.
    """
    total_tokens = len(document)
    if total_tokens == 0:
        # An empty document contains no occurrence of any word; returning
        # 0.0 avoids the ZeroDivisionError the plain division would raise.
        return 0.0
    # Count lazily instead of materialising a list only to take its length.
    occurrences = sum(1 for token in document if token == word)
    return occurrences / total_tokens

#Inverse Document Frequency

def inverse_doc_freq(word):
    """Return the smoothed inverse document frequency of *word*.

    Reads the module-level names ``word_count`` (subscripted by *word*;
    presumably a mapping from word to the number of documents containing
    it -- confirm against where it is built) and ``total_documents``.

    Args:
        word: Token whose inverse document frequency is wanted.

    Returns:
        ``np.log(total_documents / (word_count[word] + 1))``. A word that
        is missing from ``word_count`` uses a denominator of 1.
    """
    try:
        word_occurrence = word_count[word] + 1
    except KeyError:
        # Only a missing word is an expected condition here. A bare
        # ``except`` would also hide unrelated failures (and even
        # KeyboardInterrupt), so catch just the lookup miss.
        word_occurrence = 1
    return np.log(total_documents / word_occurrence)

#Combining the functions
def tf_idf(sentence):
    """Return the TF-IDF vector of *sentence* over the module vocabulary.

    Reads the module-level names ``word_set`` (its length sets the vector
    size) and ``index_dict`` (maps a word to its position in the vector),
    and calls ``termfreq`` and ``inverse_doc_freq`` for the scores.

    Args:
        sentence: Sized iterable of hashable tokens.

    Returns:
        A 1-D NumPy array of length ``len(word_set)``. The slot of each
        word present in *sentence* holds ``tf * idf``; every other slot
        is 0. Words without an entry in ``index_dict`` are skipped.
    """
    tf_idf_vec = np.zeros((len(word_set),))
    # Every occurrence of a word produces the same tf * idf value, so
    # scoring each distinct word once yields the same vector without
    # rescanning the sentence per repeated token. dict.fromkeys keeps the
    # first-seen order of the tokens.
    for word in dict.fromkeys(sentence):
        position = index_dict.get(word)
        if position is None:
            # Out-of-vocabulary word: it has no slot in the vector, so
            # leave the vector untouched instead of raising KeyError.
            continue
        tf = termfreq(sentence, word)
        idf = inverse_doc_freq(word)
        tf_idf_vec[position] = tf * idf
    return tf_idf_vec
	#Term Frequency
	def termfreq(document, word):
		"""Return the relative frequency of *word* within *document*.

		Args:
			document: Sized iterable of tokens (it must support ``len``
				and iteration), e.g. a list of words.
			word: Token to look for; compared to each token with ``==``.

		Returns:
			The number of tokens equal to *word* divided by
			``len(document)``, or ``0.0`` when *document* is empty.
		"""
		total_tokens = len(document)
		if total_tokens == 0:
			# An empty document contains no occurrence of any word;
			# returning 0.0 avoids a ZeroDivisionError.
			return 0.0
		# Count lazily instead of building a list only to measure it.
		occurrences = sum(1 for token in document if token == word)
		return occurrences / total_tokens

	#Inverse Document Frequency

	def inverse_doc_freq(word):
		"""Return the smoothed inverse document frequency of *word*.

		Reads the names ``word_count`` (subscripted by *word*; presumably
		a mapping from word to the number of documents containing it --
		confirm against where it is built) and ``total_documents`` from
		the enclosing scope.

		Args:
			word: Token whose inverse document frequency is wanted.

		Returns:
			``np.log(total_documents / (word_count[word] + 1))``. A word
			that is missing from ``word_count`` uses a denominator of 1.
		"""
		try:
			word_occurrence = word_count[word] + 1
		except KeyError:
			# Only a missing word is expected here; a bare ``except``
			# would also hide unrelated failures.
			word_occurrence = 1
		return np.log(total_documents / word_occurrence)

	#Combining the functions
	def tf_idf(sentence):
		"""Return the TF-IDF vector of *sentence* over the vocabulary.

		Reads the names ``word_set`` (its length sets the vector size)
		and ``index_dict`` (maps a word to its position in the vector)
		from the enclosing scope, and calls ``termfreq`` and
		``inverse_doc_freq`` for the scores.

		Args:
			sentence: Sized iterable of hashable tokens.

		Returns:
			A 1-D NumPy array of length ``len(word_set)``. The slot of
			each word present in *sentence* holds ``tf * idf``; every
			other slot is 0. Words without an entry in ``index_dict``
			are skipped.
		"""
		tf_idf_vec = np.zeros((len(word_set),))
		# Each occurrence of a word yields the same tf * idf value, so
		# scoring every distinct word once gives the same vector without
		# rescanning the sentence per repeated token.
		for word in dict.fromkeys(sentence):
			position = index_dict.get(word)
			if position is None:
				# Out-of-vocabulary word: no slot in the vector, so skip
				# it instead of raising KeyError.
				continue
			tf = termfreq(sentence, word)
			idf = inverse_doc_freq(word)
			tf_idf_vec[position] = tf * idf
		return tf_idf_vec