TF-IDF & Jaccard Similarity Explained for my Blog Post
from __future__ import division
import string
import math

# Simple whitespace tokenizer: lowercase the document and split on spaces.
tokenize = lambda doc: doc.lower().split(" ")
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin is riding a horse while hunting deer. Vladimir Putin always seems so serious about things - even riding horses. Is he crazy?"

all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]
def jaccard_similarity(query, document):
    # Jaccard similarity: size of the intersection of the two token sets
    # divided by the size of their union.
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)
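# Illustrative usage (not part of the original gist): compare two of the sample
# documents with Jaccard similarity. The documents must be tokenized first,
# since the function operates on token sets.
example_jaccard = jaccard_similarity(tokenize(document_2), tokenize(document_4))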
def term_frequency(term, tokenized_document):
    # Raw term frequency: how many times the term appears in the document.
    return tokenized_document.count(term)

def sublinear_term_frequency(term, tokenized_document):
    # Sublinear scaling: 1 + log(count), which damps the influence of very
    # frequent terms.
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return 1 + math.log(count)

def augmented_term_frequency(term, tokenized_document):
    # Augmented frequency: normalize by the count of the most frequent term in
    # the document to reduce the bias toward longer documents.
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))
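# Illustration (not part of the original gist): the three term-frequency
# variants computed for the same token in the first sample document; each
# returns a different scale for the same underlying count.
tokens_0 = tokenize(document_0)
tf_variants = (term_frequency("economy", tokens_0),
               sublinear_term_frequency("economy", tokens_0),
               augmented_term_frequency("economy", tokens_0))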
def inverse_document_frequencies(tokenized_documents):
    # Inverse document frequency for every token in the corpus:
    # 1 + log(number of documents / number of documents containing the token).
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values
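# Illustration (not part of the original gist): computing the IDF table for the
# sample corpus; tokens that appear in many documents receive smaller weights
# than rare tokens.
corpus_idf = inverse_document_frequencies([tokenize(d) for d in all_documents])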
def tfidf(documents):
    # Build one TF-IDF vector per document, with one dimension per vocabulary
    # term (sublinear term frequency multiplied by the term's IDF).
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
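# Illustrative usage (not part of the original gist): the vectors returned by
# tfidf() can be compared with cosine similarity. The helper below is a sketch
# of one way to do that in plain Python; it is an assumption, not code from the
# original post.
def cosine_similarity(vector_1, vector_2):
    # Cosine similarity: dot product divided by the product of the vector norms.
    dot_product = sum(a * b for a, b in zip(vector_1, vector_2))
    magnitude_1 = math.sqrt(sum(a * a for a in vector_1))
    magnitude_2 = math.sqrt(sum(b * b for b in vector_2))
    if magnitude_1 == 0 or magnitude_2 == 0:
        return 0
    return dot_product / (magnitude_1 * magnitude_2)

tfidf_representation = tfidf(all_documents)
# e.g. similarity between two of the documents:
# print(cosine_similarity(tfidf_representation[2], tfidf_representation[4]))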
# The same representation in scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer

sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)
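# Illustrative comparison (not part of the original gist): scikit-learn's
# pairwise cosine_similarity can be applied directly to the sparse matrix
# produced above. It is imported under an alias so it does not shadow the
# hand-rolled helper defined earlier.
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

pairwise_similarities = sklearn_cosine_similarity(sklearn_representation)
# pairwise_similarities[i, j] is the cosine similarity between document i and document j.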