Skip to content

Instantly share code, notes, and snippets.

@veekaybee
Created October 4, 2022 18:29
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save veekaybee/3f05047a179384888c235b409ea528a5 to your computer and use it in GitHub Desktop.
Save veekaybee/3f05047a179384888c235b409ea528a5 to your computer and use it in GitHub Desktop.
TF-IDF
import math
# Two tiny, already-tokenized example documents used by the demo below.
documentA = [
    'the', 'man', 'went', 'out', 'for', 'a', 'walk',
]
documentB = [
    'the', 'children', 'sat', 'around', 'the', 'fire',
]
def tf(term, document):
    '''
    Term frequency of a word in a document:
    occurrences of `term` over total words in `document`.

    `document` is an iterable of tokens (e.g. a list of words).
    Returns a float in [0, 1]. An empty document yields 0.0
    rather than raising ZeroDivisionError.
    '''
    # Materialize once so any iterable (including a generator) can be
    # both counted and measured.
    words = list(document)
    if not words:
        return 0.0
    return words.count(term) / len(words)
def idf(term, doc_list):
    '''
    Inverse document frequency of a term across a set of documents
    (the more documents it appears in, the less important it is).

    Computed as the natural log of
    (number of documents / number of documents containing `term`).

    `doc_list` is an iterable of documents, each supporting `in`.
    Returns 0.0 when `doc_list` is empty or no document contains
    `term`, instead of raising ZeroDivisionError; the ratio is
    undefined in those cases.
    '''
    docs = list(doc_list)
    docs_with_term = sum(1 for doc in docs if term in doc)
    if docs_with_term == 0:
        # Covers both an empty corpus and a term absent from every document.
        return 0.0
    # Local name deliberately differs from the function name to avoid
    # shadowing `idf` inside its own body.
    return math.log(len(docs) / docs_with_term)
def tf_idf(tf, idf):
    '''
    Combine a term frequency and an inverse document frequency.

    `tf` and `idf` are the numeric scores (not the functions of the
    same name). Prints the product formatted to three decimals and
    also returns it so callers can use the value.
    '''
    tfidf = tf * idf
    print("tf-idf:{:0.3f}".format(tfidf))
    return tfidf
# Demo: score the word 'fire' against the two sample documents.
# NOTE(review): 'fire' does not occur in documentA (only in documentB),
# so the term frequency here is 0 and the script prints "tf-idf:0.000".
# Confirm whether documentB was the intended document.
tf_fire = tf(term='fire', document=documentA)
idf_docs = idf(term='fire', doc_list=[documentA, documentB])
tf_idf(tf=tf_fire, idf=idf_docs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment