Last active
May 14, 2016 13:11
-
-
Save Renien/af591c38baf99f07e8f2e959b91a2666 to your computer and use it in GitHub Desktop.
The measure called TF.IDF lets us identify words in a collection of documents that are useful for determining the topic of each document. A word has a high TF.IDF score in a document if it appears many times in that document while appearing in relatively few of the other documents in the collection.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'renienj' | |
import numpy as np | |
import pandas as pd | |
import math as math | |
def compute_tfidf(tf_list, idf_list):
    """Combine term-frequency and inverse-document-frequency scores.

    tfidf(w) = tf(w) * idf(w)

    :param tf_list: dict mapping word -> term frequency in one document
    :param idf_list: dict mapping word -> inverse document frequency
    :return: dict mapping word -> tf-idf score
    :raises KeyError: if a word in tf_list is missing from idf_list
    """
    # .items() works on both Python 2 and 3; the original .iteritems()
    # is Python-2-only.  (Also fixes the 'tfidi' local-name typo.)
    return {word: tf_val * idf_list[word] for word, tf_val in tf_list.items()}
def compute_inverse_document_frequency(doc_unique_list):
    """Compute idf(w) = log(N / df(w)) over a small corpus.

    N is the number of documents; df(w) is the number of documents that
    contain word w.

    :param doc_unique_list: list of per-document collections whose
        iteration yields each distinct word of that document exactly once
        (e.g. dicts keyed by word, or sets of words)
    :return: dict mapping word -> idf score
    """
    doc_count = len(doc_unique_list)
    idf_dict = {}
    # df(w): each document contributes at most 1 per word, because the
    # caller passes per-document *unique* word collections.
    for doc in doc_unique_list:
        for word in doc:
            idf_dict[word] = idf_dict.get(word, 0) + 1
    # Convert document frequencies to idf in place.  float() keeps true
    # division under Python 2; .items() replaces Python-2-only .iteritems().
    for word, count in idf_dict.items():
        idf_dict[word] = math.log(doc_count / float(count))
    return idf_dict
def compute_term_frequency(document, dict):
    """Compute tf(w) = count(w in document) / (total words in document).

    :param document: list of word tokens for one document
    :param dict: mapping of word -> raw occurrence count in the document
        (NOTE(review): this parameter shadows the builtin ``dict``; the
        name is kept unchanged for caller compatibility)
    :return: dict mapping word -> term frequency
    """
    words_count = len(document)
    # Empty document: report zero frequency for every word instead of
    # raising ZeroDivisionError.
    if words_count == 0:
        return {word: 0.0 for word in dict}
    # .items() replaces Python-2-only .iteritems(); float() forces true
    # division under Python 2.
    return {word: count / float(words_count) for word, count in dict.items()}
def count_the_words(document, dict):
    """Tally every token of *document* into the pre-seeded count mapping.

    Each token must already exist as a key in ``dict`` (the caller seeds
    it with ``dict.fromkeys(vocabulary, 0)``); each occurrence bumps the
    corresponding count by one.

    :param document: list of word tokens
    :param dict: mapping word -> running count, updated in place
    :return: the same mapping, for call-chaining convenience
    """
    for token in document:
        dict[token] = dict[token] + 1
    return dict
if __name__ == "__main__": | |
# Sample documents | |
document_a = "the cat sat on my face" | |
document_b = "the dog sat on my bed" | |
#Tokenizing | |
bag_of_words_a = document_a.split(" ") | |
bag_of_words_b = document_b.split(" ") | |
# All words in the dictionaries | |
total_words = set(bag_of_words_a).union(set(bag_of_words_b)) | |
# Set the default values for the dictionaries | |
words_dic_a = dict.fromkeys(total_words, 0) | |
words_dic_b = dict.fromkeys(total_words, 0) | |
words_dic_a = count_the_words(bag_of_words_a, words_dic_a) | |
words_dic_b = count_the_words(bag_of_words_b, words_dic_b) | |
print "\n*** numpy view ***\n" | |
print np.array([words_dic_a, words_dic_b]) | |
print "\n*** panda view ***\n" | |
print pd.DataFrame([words_dic_a, words_dic_b]) | |
# Calculate the Term Frequency | |
tf_document_a = compute_term_frequency(bag_of_words_a, words_dic_a) | |
tf_document_b = compute_term_frequency(bag_of_words_b, words_dic_b) | |
idf_documents = compute_inverse_document_frequency([{k: v for k, v in words_dic_a.items() if v > 0}, | |
{k: v for k, v in words_dic_b.items() if v > 0}]) | |
tfidf_document_a = compute_tfidf(tf_document_a, idf_documents) | |
tfidf_document_b = compute_tfidf(tf_document_b, idf_documents) | |
print "\n---------- TF.IDF Results ----------\n" | |
print "\n*** numpy view ***\n" | |
print np.array([tfidf_document_a, tfidf_document_b]) | |
print "\n*** panda view ***\n" | |
print pd.DataFrame([tfidf_document_a, tfidf_document_b]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment