Skip to content

Instantly share code, notes, and snippets.

@chrisfalter
Created June 13, 2022 22:19
Show Gist options
  • Save chrisfalter/28647555e618a6f4ed39796d95636d4e to your computer and use it in GitHub Desktop.
Save chrisfalter/28647555e618a6f4ed39796d95636d4e to your computer and use it in GitHub Desktop.
How to Implement TF-IDF in Python
from collections import Counter
import math
from string import punctuation
from typing import List
from nltk.tokenize import word_tokenize
import nltk
# Fetch the "punkt" resource through NLTK when this module is loaded.
# NOTE(review): presumably this is the tokenizer data word_tokenize needs —
# confirm the resource name against the installed NLTK version.
nltk.download("punkt")
# Two-document corpus shared by the self-check functions in this file.
test_corpus = ["The quick, brown fox jumps, over the lazy dog.", "Never jump over the lazy dog quickly."]
def test_term_frequency() -> bool:
    """Self-check for term_frequency.

    Each case is a (document, token, expected_result) triple. Returns True
    only when every case produces the expected value as an exact ``float``;
    checking stops at the first case that fails.
    """
    cases = (
        (test_corpus[0], "the", 1),
        (test_corpus[1], "never", 1),
    )

    def passes(text, word, wanted):
        score = term_frequency(word, text)
        return score == wanted and type(score) is float

    return all(passes(*case) for case in cases)
def term_frequency(token: str, doc: str) -> float:
    """Return the augmented term frequency of *token* within *doc*.

    The document is split with ``word_tokenize``, tokens made up solely of
    punctuation characters are dropped, and the remaining tokens are
    lowercased. The score is
    ``0.5 + 0.5 * count(token) / count(most frequent term)``, so it ranges
    from 0.5 (token absent) to 1.0 (token is the most frequent term).

    Args:
        token: Term to score; matched case-insensitively.
        doc: Raw document text.

    Returns:
        The augmented frequency as a float. A document that yields no word
        tokens returns the 0.5 baseline instead of raising.
    """
    words = [
        t.lower()
        for t in word_tokenize(doc)
        # Drop tokens consisting only of punctuation characters. A plain
        # substring test against string.punctuation would keep
        # multi-character tokens such as "..." that are not a contiguous
        # slice of that string.
        if not all(ch in punctuation for ch in t)
    ]
    counts = Counter(words)
    # default=0 keeps max() from raising ValueError on an empty or
    # punctuation-only document.
    max_term_count = max(counts.values(), default=0)
    if max_term_count == 0:
        return 0.5
    # Lowercase the query so it matches the lowercased document tokens.
    return 0.5 + 0.5 * counts[token.lower()] / max_term_count
# Run the term-frequency self-check; the call returns True when every case passes.
test_term_frequency() # "True"
def test_inverse_doc_frequency() -> bool:
    """Self-check for inverse_doc_frequency.

    Each case is a (corpus, token, expected_result) triple. Returns True
    only when every case produces the expected value as an exact ``float``;
    checking stops at the first case that fails.
    """
    cases = (
        (test_corpus, "the", 0.0),
        (test_corpus, "never", math.log(2)),
    )

    def passes(docs, word, wanted):
        score = inverse_doc_frequency(word, docs)
        return score == wanted and type(score) is float

    return all(passes(*case) for case in cases)
def num_docs_with_token(token: str, corpus: List[str]) -> int:
    """Count the documents in *corpus* that contain *token*.

    Both the query token and the tokens produced by ``word_tokenize`` are
    lowercased before comparison, so matching ignores case.
    """
    needle = token.lower()
    return sum(
        1
        for text in corpus
        if needle in (piece.lower() for piece in word_tokenize(text))
    )
def inverse_doc_frequency(token: str, corpus: List[str]) -> float:
    """Return ``log(N / df)`` for *token* over *corpus*.

    ``N`` is the number of documents and ``df`` is the number of documents
    containing the lowercased token. When no document contains the token the
    result is 0.0, which also avoids dividing by zero.
    """
    total_docs = len(corpus)
    containing = num_docs_with_token(token.lower(), corpus)
    if not containing:
        return 0.0
    return math.log(total_docs / containing)
# Run the inverse-document-frequency self-check; the call returns True when every case passes.
test_inverse_doc_frequency() # "True"
def test_tfidf() -> bool:
    """Self-check for tfidf.

    Each case is a (corpus, doc, token, expected_result) tuple. Returns True
    only when every case produces the expected value as an exact ``float``;
    checking stops at the first case that fails.
    """
    cases = (
        (test_corpus, test_corpus[0], "the", 0.0),
        (test_corpus, test_corpus[1], "never", math.log(2)),
    )

    def passes(docs, text, word, wanted):
        score = tfidf(word, text, docs)
        return score == wanted and type(score) is float

    return all(passes(*case) for case in cases)
def tfidf(token: str, doc: str, corpus: List[str]) -> float:
    """Return the TF-IDF weight of *token* for *doc* relative to *corpus*.

    The weight is the product of ``term_frequency(token, doc)`` and
    ``inverse_doc_frequency(token, corpus)``.
    """
    tf = term_frequency(token, doc)
    idf = inverse_doc_frequency(token, corpus)
    return tf * idf
# Run the TF-IDF self-check; the call returns True when every case passes.
test_tfidf()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment