Last active
May 14, 2016 13:11
-
-
Save Renien/af591c38baf99f07e8f2e959b91a2666 to your computer and use it in GitHub Desktop.
The measure called TF.IDF lets us identify words in a collection of documents that are useful for determining the topic of each document. A word has a high TF.IDF score in a document if it appears many times in that document while appearing in relatively few of the other documents in the collection.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'renienj' | |
import numpy as np | |
import pandas as pd | |
import math as math | |
def compute_tfidf(tf_list, idf_list):
    """Combine term-frequency and inverse-document-frequency scores.

    tfidf(w) = tf(w) * idf(w)

    :param tf_list: dict mapping word -> term frequency in one document
    :param idf_list: dict mapping word -> inverse document frequency
    :return: dict mapping word -> tf-idf score
    :raises KeyError: if a word in tf_list is missing from idf_list
    """
    # .items() works on both Python 2 and 3; the original .iteritems()
    # is Python-2-only.  (Also fixes the 'tfidi' local-name typo.)
    return {word: tf_val * idf_list[word] for word, tf_val in tf_list.items()}
def compute_inverse_document_frequency(doc_unique_list):
    """Compute idf(w) = log(N / df(w)) over a small corpus.

    N is the number of documents; df(w) is the number of documents that
    contain word w.

    :param doc_unique_list: list of per-document collections whose
        iteration yields each distinct word of that document exactly once
        (e.g. dicts keyed by word, or sets of words)
    :return: dict mapping word -> idf score
    """
    doc_count = len(doc_unique_list)
    idf_dict = {}
    # df(w): each document contributes at most 1 per word, because the
    # caller passes per-document *unique* word collections.
    for doc in doc_unique_list:
        for word in doc:
            idf_dict[word] = idf_dict.get(word, 0) + 1
    # Convert document frequencies to idf in place.  float() keeps true
    # division under Python 2; .items() replaces Python-2-only .iteritems().
    for word, count in idf_dict.items():
        idf_dict[word] = math.log(doc_count / float(count))
    return idf_dict
def compute_term_frequency(document, dict):
    """Compute tf(w) = count(w in document) / (total words in document).

    :param document: list of word tokens for one document
    :param dict: mapping of word -> raw occurrence count in the document
        (NOTE(review): this parameter shadows the builtin ``dict``; the
        name is kept unchanged for caller compatibility)
    :return: dict mapping word -> term frequency
    """
    words_count = len(document)
    # Empty document: report zero frequency for every word instead of
    # raising ZeroDivisionError.
    if words_count == 0:
        return {word: 0.0 for word in dict}
    # .items() replaces Python-2-only .iteritems(); float() forces true
    # division under Python 2.
    return {word: count / float(words_count) for word, count in dict.items()}
def count_the_words(document, dict):
    """Tally every token of *document* into the pre-seeded count mapping.

    Each token must already exist as a key in ``dict`` (the caller seeds
    it with ``dict.fromkeys(vocabulary, 0)``); each occurrence bumps the
    corresponding count by one.

    :param document: list of word tokens
    :param dict: mapping word -> running count, updated in place
    :return: the same mapping, for call-chaining convenience
    """
    for token in document:
        dict[token] = dict[token] + 1
    return dict
if __name__ == "__main__": | |
# Sample documents | |
document_a = "the cat sat on my face" | |
document_b = "the dog sat on my bed" | |
#Tokenizing | |
bag_of_words_a = document_a.split(" ") | |
bag_of_words_b = document_b.split(" ") | |
# All words in the dictionaries | |
total_words = set(bag_of_words_a).union(set(bag_of_words_b)) | |
# Set the default values for the dictionaries | |
words_dic_a = dict.fromkeys(total_words, 0) | |
words_dic_b = dict.fromkeys(total_words, 0) | |
words_dic_a = count_the_words(bag_of_words_a, words_dic_a) | |
words_dic_b = count_the_words(bag_of_words_b, words_dic_b) | |
print "\n*** numpy view ***\n" | |
print np.array([words_dic_a, words_dic_b]) | |
print "\n*** panda view ***\n" | |
print pd.DataFrame([words_dic_a, words_dic_b]) | |
# Calculate the Term Frequency | |
tf_document_a = compute_term_frequency(bag_of_words_a, words_dic_a) | |
tf_document_b = compute_term_frequency(bag_of_words_b, words_dic_b) | |
idf_documents = compute_inverse_document_frequency([{k: v for k, v in words_dic_a.items() if v > 0}, | |
{k: v for k, v in words_dic_b.items() if v > 0}]) | |
tfidf_document_a = compute_tfidf(tf_document_a, idf_documents) | |
tfidf_document_b = compute_tfidf(tf_document_b, idf_documents) | |
print "\n---------- TF.IDF Results ----------\n" | |
print "\n*** numpy view ***\n" | |
print np.array([tfidf_document_a, tfidf_document_b]) | |
print "\n*** panda view ***\n" | |
print pd.DataFrame([tfidf_document_a, tfidf_document_b]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment