thinrhino/tf-idf.py

## tf-idf.py
# ref: http://www.tfidf.com/
# Example:
# Consider a document containing 100 words wherein the word cat appears 3 times.
# The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we
# have 10 million documents and the word cat appears in one thousand of these.
# Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4.
# Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
#
# Hence:
# 1. Calculate term frequency
# - Find total number of words in each document (w_count_per_doc)
# - { word_id : (count/w_count_per_doc) }
#
# 2. Inverse Document Fequency
# - Find total number of documents = D (given constant)
# - Find count of doc_id for every word_id
# - IDF = log(D/count) (this is per word_id)
#
# 3. TF-IDF
# - tf-idf = tf * idf

from collections import defaultdict
data_file = open('<data_file>', 'r')

data = defaultdict(list)
w_count_per_doc = defaultdict(int)
w_count = defaultdict(int)

while True:
    l = data_file.readline()
    if l == '':
        break
    doc_id, word_id, word_count = l.split(' ')
    data[doc_id].append({word_id : int(word_count)})
    w_count_per_doc[doc_id] += int(word_count)
    w_count[word_id] += int(word_count)

# Calculate TF
tf_dict = defaultdict(list)

for key in data.keys():
    for i in data[key]:
        #pass
        tf = float(i.values()[0])/float(w_count_per_doc[key])
        tf_dict[key].append({i.keys()[0]: tf})

# Calculate IDF
import math

# Document Count
D = 300000

idf_dict = defaultdict(float)
for key in w_count.keys():
    idf_dict[key] = math.log(float(D) / float(w_count[key]))

# Calculate tf-idf
tfidf_dict = defaultdict(list)
for key in data.keys():
    for i in data[key]:
        tfidf = float(i.values()[0]) * float(idf_dict[i.keys()[0]])
        tfidf_dict[key].append({i.keys()[0] : tfidf})
	# ref: http://www.tfidf.com/
	# Example:
	# Consider a document containing 100 words wherein the word cat appears 3 times.
	# The term frequency (i.e., tf) for cat is then (3 / 100) = 0.03. Now, assume we
	# have 10 million documents and the word cat appears in one thousand of these.
	# Then, the inverse document frequency (i.e., idf) is calculated as log(10,000,000 / 1,000) = 4.
	# Thus, the Tf-idf weight is the product of these quantities: 0.03 * 4 = 0.12.
	#
	# Hence:
	# 1. Calculate term frequency
	# - Find total number of words in each document (w_count_per_doc)
	# - { word_id : (count/w_count_per_doc) }
	#
	# 2. Inverse Document Fequency
	# - Find total number of documents = D (given constant)
	# - Find count of doc_id for every word_id
	# - IDF = log(D/count) (this is per word_id)
	#
	# 3. TF-IDF
	# - tf-idf = tf * idf

	from collections import defaultdict
	data_file = open('<data_file>', 'r')

	data = defaultdict(list)
	w_count_per_doc = defaultdict(int)
	w_count = defaultdict(int)

	while True:
	l = data_file.readline()
	if l == '':
	break
	doc_id, word_id, word_count = l.split(' ')
	data[doc_id].append({word_id : int(word_count)})
	w_count_per_doc[doc_id] += int(word_count)
	w_count[word_id] += int(word_count)

	# Calculate TF
	tf_dict = defaultdict(list)

	for key in data.keys():
	for i in data[key]:
	#pass
	tf = float(i.values()[0])/float(w_count_per_doc[key])
	tf_dict[key].append({i.keys()[0]: tf})

	# Calculate IDF
	import math

	# Document Count
	D = 300000

	idf_dict = defaultdict(float)
	for key in w_count.keys():
	idf_dict[key] = math.log(float(D) / float(w_count[key]))

	# Calculate tf-idf
	tfidf_dict = defaultdict(list)
	for key in data.keys():
	for i in data[key]:
	tfidf = float(i.values()[0]) * float(idf_dict[i.keys()[0]])
	tfidf_dict[key].append({i.keys()[0] : tfidf})