# tfidf_explaine
import string
import math
tokenize = lambda doc: doc.lower().split(" ")
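# Note: this tokenizer only lowercases and splits on single spaces, so
# punctuation stays attached to tokens (e.g. "economy." and "economy" are
# treated as different terms).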
document_0 = "China has a strong economy that is growing at a rapid pace. However politically it differs greatly from the US Economy."
document_1 = "At last, China seems serious about confronting an endemic problem: domestic violence and corruption."
document_2 = "Japan's prime minister, Shinzo Abe, is working towards healing the economic turmoil in his own country for his view on the future of his people."
document_3 = "Vladimir Putin is working hard to fix the economy in Russia as the Ruble has tumbled."
document_4 = "What's the future of Abenomics? We asked Shinzo Abe for his views"
document_5 = "Obama has eased sanctions on Cuba while accelerating those against the Russian Economy, even as the Ruble's value falls almost daily."
document_6 = "Vladimir Putin was found to be riding a horse, again, without a shirt on while hunting deer. Vladimir Putin always seems so serious about things - even riding horses."
all_documents = [document_0, document_1, document_2, document_3, document_4, document_5, document_6]
tokenized_documents = [tokenize(d) for d in all_documents] # tokenized docs
all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
def jaccard_similarity(query,document):
    intersection = set(document).intersection(set(query))
    union = set(document).union(set(query))
    return len(intersection)/len(union)
# print (jaccard_similarity(tokenized_documents[2],tokenized_documents[4]))
# we get a score of 0.64
# problems with this approach:
# 1. document length is influencing our score
# print (set(tokenized_documents[2]))
# print (set(tokenized_documents[4]))
# print (set(tokenized_documents[2]).intersection(set(tokenized_documents[4])))
# 2. common words are affecting our score
# print (jaccard_similarity(tokenized_documents[1],tokenized_documents[6]))
# 0.08571428571428572
# print (set(tokenized_documents[1]).intersection(set(tokenized_documents[6])))
# {'about', 'seems', 'serious'}
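# Illustrative check with toy token lists: the intersection is {'china'} and
# the union has three tokens, so the Jaccard score is 1/3.
# print (jaccard_similarity(['china', 'economy'], ['china', 'politics']))
# 0.3333333333333333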
def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)
# test functions
# print (term_frequency('china',tokenized_documents[0]))
# note: matching is case-sensitive, so terms must be lowercase (tokenize() lowercases every document)
def sublinear_term_frequency(term,tokenized_document):
    return 1 + math.log(max(1,tokenized_document.count(term)))
def augmented_term_frequency(term,tokenized_document):
    max_count = max(term_frequency(t,tokenized_document) for t in tokenized_document)
    return (0.5 + ((0.5*term_frequency(term,tokenized_document))/max_count))
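# Illustrative comparison of the three TF variants on document_0, where 'china'
# appears once and the most frequent token ('a') appears twice:
# term_frequency('china',tokenized_documents[0])            # 1 (raw count)
# sublinear_term_frequency('china',tokenized_documents[0])  # 1 + log(1) = 1.0
# augmented_term_frequency('china',tokenized_documents[0])  # 0.5 + 0.5*1/2 = 0.75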
def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for token in all_tokens_set:
        contains_token = map(lambda doc: token in doc,tokenized_documents)
        idf_values[token] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values
idf_values = inverse_document_frequencies(tokenized_documents)
print (idf_values['abenomics?'])
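# Worked example: 'abenomics?' appears in 1 of the 7 documents, so its idf is
# 1 + ln(7/1) ≈ 2.946, which is what the print above should show.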
# token = 'china'
# print (list(map(lambda doc: token in doc,tokenized_documents)))
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term,document)
            doc_tfidf.append(tf*idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
#
tfidf_representation = tfidf(all_documents)
print (tfidf_representation)
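# Note: each row has one weight per vocabulary term, so
# len(tfidf_representation[0]) == len(all_tokens_set). Because
# sublinear_term_frequency() returns 1 even when a term's count is 0, a term
# that is absent from a document still gets a weight equal to its idf here;
# this is one reason these raw vectors will not match sklearn's exactly.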
# def get_count_term(documents,idf):
# for document in documents:
# for term in idf.keys():
# print ((term,term_frequency(term,document)))
#
# get_count_term(all_documents,idf_values)
from sklearn.feature_extraction.text import TfidfVectorizer
sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
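# Note: these arguments are meant to mirror the hand-rolled version above:
# sublinear_tf=True applies 1 + log(tf), smooth_idf=False gives idf = 1 + ln(n/df),
# and tokenizer=tokenize reuses the same whitespace tokenizer. norm='l2'
# additionally scales each document vector to unit length, which our tfidf()
# does not do, so the raw weights will still differ.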
#
sklearn_representation = sklearn_tfidf.fit_transform(all_documents)
# print (tfidf_representation[0])
# print (sklearn_representation.toarray()[0].tolist())
# print (document_0)
def cosine_similarity(vector1,vector2):
    dot_product = sum(p*q for p,q in zip(vector1,vector2))
    magnitude = math.sqrt(sum(p*p for p in vector1))*math.sqrt(sum(q*q for q in vector2))
    if not magnitude:
        return 0
    return dot_product/magnitude
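# Quick sanity checks with toy vectors: parallel vectors score 1.0 (up to
# floating-point rounding), orthogonal vectors score 0.0.
# print (cosine_similarity([1, 2, 3], [2, 4, 6]))  # 1.0
# print (cosine_similarity([1, 0], [0, 1]))        # 0.0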
our_tfidf_comparisons = []
for count_0,doc_0 in enumerate(tfidf_representation):
    for count_1,doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0,doc_1),count_0,count_1))
skl_tfidf_comparisons = []
for count_0,doc_0 in enumerate(sklearn_representation.toarray()):
    for count_1,doc_1 in enumerate(sklearn_representation.toarray()):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0,doc_1),count_0,count_1))
for x in zip(sorted(our_tfidf_comparisons, reverse = True), sorted(skl_tfidf_comparisons, reverse = True)):
    print (x)
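# Each printed tuple pairs one entry of our ranking with the corresponding entry
# of sklearn's ranking, both sorted by cosine similarity in descending order.
# The first pairs should be the self-comparisons (scores at or near 1.0); after
# that, the document pairings should broadly agree even though the raw weights
# of the two representations differ.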