Skip to content

Instantly share code, notes, and snippets.

@JackHowa
Created May 1, 2021 15:46
Show Gist options
  • Save JackHowa/500bf7683aee658487abcac247c0491a to your computer and use it in GitHub Desktop.
Save JackHowa/500bf7683aee658487abcac247c0491a to your computer and use it in GitHub Desktop.
Exercise 18: TF-IDF
# DATA BLOCK
# Sample corpus: every line is one document, words are separated by spaces.
text = (
    "he really really loves coffee\n"
    "my sister dislikes coffee\n"
    "my sister loves tea"
)
import math
def main(text):
    """Print the TF-IDF vector of every document in *text*.

    Each non-blank line of *text* is one document and words are separated
    by whitespace. One list is printed per document; it holds the TF-IDF
    value of every vocabulary word, with the words in sorted order:

        tf-idf(w, d) = tf(w, d) * log10(1 / df(w))

    where tf(w, d) is the number of occurrences of w in d divided by the
    length of d, and df(w) is the fraction of documents containing w.

    Args:
        text: The corpus as one string, one document per line.

    Returns:
        None. The vectors are written to standard output, one per line.
        Nothing is printed when *text* contains no words.
    """
    # split the text first into lines and then into lists of words;
    # blank lines are dropped because an empty document has length 0 and
    # would raise ZeroDivisionError in the term-frequency division below
    docs = [words for words in map(str.split, text.splitlines()) if words]
    N = len(docs)

    # create the vocabulary: the words that appear at least once.
    # Sorted so the column order is identical on every run; iterating a
    # plain set of strings can change order between interpreter runs.
    vocabulary = sorted(set(text.split()))

    # idf[word] = log10(1 / df), where df is the fraction of documents
    # containing the word. It does not depend on the document, so it is
    # computed once per word instead of once per (document, word) pair.
    # Every vocabulary word occurs in at least one document, so df > 0.
    idf = {}
    for word in vocabulary:
        df = sum(word in doc for doc in docs) / N
        idf[word] = math.log(1 / df, 10)

    # loop through documents to calculate the tf-idf values
    for doc in docs:
        # tf: number of occurrences of the word in the document divided
        # by the document length (term frequency)
        tfidf = [doc.count(word) / len(doc) * idf[word] for word in vocabulary]
        print(tfidf)
# Run the TF-IDF computation on the sample corpus defined in the data block;
# main() prints one list of TF-IDF values per document.
main(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment