Skip to content

Instantly share code, notes, and snippets.

@Renien
Last active May 14, 2016 13:11
Show Gist options
  • Save Renien/af591c38baf99f07e8f2e959b91a2666 to your computer and use it in GitHub Desktop.
Save Renien/af591c38baf99f07e8f2e959b91a2666 to your computer and use it in GitHub Desktop.
The measure called TF.IDF lets us identify words in a collection of documents that are useful for determining the topic of each document. A word has a high TF.IDF score in a document if it appears in relatively few documents overall but does appear in this one — and, when it appears in a document, it tends to appear many times.
__author__ = 'renienj'
import numpy as np
import pandas as pd
import math as math
def compute_tfidf(tf_list, idf_list):
    """Combine term frequencies with inverse document frequencies.

    tfidf(w) = tf(w) * idf(w)

    :param tf_list: mapping of word -> term frequency for one document
    :param idf_list: mapping of word -> inverse document frequency; must
        contain every word present in tf_list
    :return: dict mapping word -> tf-idf score
    """
    # .items() works on both Python 2 and 3 (iteritems() was removed in 3).
    return {word: tf * idf_list[word] for word, tf in tf_list.items()}
def compute_inverse_document_frequency(doc_unique_list):
    """Compute the inverse document frequency of every word in a collection.

    idf(w) = log(# of documents / # of documents that contain word w)

    :param doc_unique_list: sequence of documents, each an iterable of the
        *unique* words it contains (duplicate words would inflate counts)
    :return: dict mapping word -> idf score
    """
    doc_count = len(doc_unique_list)
    # Number of documents each word appears in.
    containing = {}
    for doc in doc_unique_list:
        for word in doc:
            containing[word] = containing.get(word, 0) + 1
    # float() keeps the division true (non-truncating) on Python 2 as well.
    return {word: math.log(doc_count / float(count))
            for word, count in containing.items()}
def compute_term_frequency(document, word_counts):
    """Compute the term frequency of each word in a single document.

    tf(w) = (# of times the word appears in the document) /
            (total # of words in the document)

    :param document: list of word tokens; only its length is used
    :param word_counts: mapping of word -> occurrence count in the document
        (renamed from ``dict``, which shadowed the builtin)
    :return: dict mapping word -> term frequency
    """
    total = len(document)
    # float() keeps the division true (non-truncating) on Python 2 as well.
    return {word: count / float(total) for word, count in word_counts.items()}
def count_the_words(document, counts):
    """Count the words in the bag.

    Iterates the document's tokens and increments the matching entry in
    *counts*, which is mutated in place.

    :param document: list of word tokens
    :param counts: dict pre-seeded with 0 for every expected word (renamed
        from ``dict``, which shadowed the builtin); a word missing from it
        raises KeyError
    :return: the same dict, updated
    """
    for word in document:
        counts[word] += 1
    return counts
if __name__ == "__main__":
# Sample documents
document_a = "the cat sat on my face"
document_b = "the dog sat on my bed"
#Tokenizing
bag_of_words_a = document_a.split(" ")
bag_of_words_b = document_b.split(" ")
# All words in the dictionaries
total_words = set(bag_of_words_a).union(set(bag_of_words_b))
# Set the default values for the dictionaries
words_dic_a = dict.fromkeys(total_words, 0)
words_dic_b = dict.fromkeys(total_words, 0)
words_dic_a = count_the_words(bag_of_words_a, words_dic_a)
words_dic_b = count_the_words(bag_of_words_b, words_dic_b)
print "\n*** numpy view ***\n"
print np.array([words_dic_a, words_dic_b])
print "\n*** panda view ***\n"
print pd.DataFrame([words_dic_a, words_dic_b])
# Calculate the Term Frequency
tf_document_a = compute_term_frequency(bag_of_words_a, words_dic_a)
tf_document_b = compute_term_frequency(bag_of_words_b, words_dic_b)
idf_documents = compute_inverse_document_frequency([{k: v for k, v in words_dic_a.items() if v > 0},
{k: v for k, v in words_dic_b.items() if v > 0}])
tfidf_document_a = compute_tfidf(tf_document_a, idf_documents)
tfidf_document_b = compute_tfidf(tf_document_b, idf_documents)
print "\n---------- TF.IDF Results ----------\n"
print "\n*** numpy view ***\n"
print np.array([tfidf_document_a, tfidf_document_b])
print "\n*** panda view ***\n"
print pd.DataFrame([tfidf_document_a, tfidf_document_b])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment