Created
June 13, 2022 22:19
-
-
Save chrisfalter/28647555e618a6f4ed39796d95636d4e to your computer and use it in GitHub Desktop.
How to Implement TF-IDF in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Minimal TF-IDF implementation with inline self-tests."""
from collections import Counter
import math
from string import punctuation
from typing import List

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize requires the "punkt" tokenizer models; the download is a
# no-op if they are already installed locally.
nltk.download("punkt")

# Tiny two-document corpus exercised by the self-tests below.
test_corpus = [
    "The quick, brown fox jumps, over the lazy dog.",
    "Never jump over the lazy dog quickly.",
]
def test_term_frequency() -> bool:
    """Smoke-test term_frequency.

    Each case is (document, token, expected_result); returns True only if
    every case matches and the result is a float.
    """
    test_data = [(test_corpus[0], "the", 1), (test_corpus[1], "never", 1)]
    for doc, token, expected_result in test_data:
        actual = term_frequency(token, doc)
        # isinstance is the idiomatic type check (type(x) != float is a smell).
        if actual != expected_result or not isinstance(actual, float):
            return False
    return True
def term_frequency(token: str, doc: str) -> float:
    """Return the augmented term frequency of `token` in `doc`.

    Uses the "double normalization 0.5" variant:
        tf = 0.5 + 0.5 * count(token) / count(most frequent term)
    which damps the bias toward longer documents.

    Matching is case-insensitive, consistent with inverse_doc_frequency.
    """
    # Bug fix: the original never lowercased `token`, so a query such as
    # "The" scored 0.5 even though document tokens are lowercased.
    token = token.lower()
    tokens = [t.lower() for t in word_tokenize(doc) if t not in punctuation]
    if not tokens:
        # Empty document: no term occurs; also avoids max() on an empty sequence.
        return 0.5
    term_counts = Counter(tokens)
    max_term_count = max(term_counts.values())
    # Counter returns 0 for absent tokens, so no KeyError is possible here.
    return 0.5 + 0.5 * term_counts[token] / max_term_count
# Fail loudly: the original called the test and silently discarded the result.
assert test_term_frequency(), "term_frequency self-test failed"
def test_inverse_doc_frequency() -> bool:
    """Smoke-test inverse_doc_frequency.

    Each case is (corpus, token, expected_result); returns True only if
    every case matches (within float tolerance) and the result is a float.
    """
    test_data = [(test_corpus, "the", 0.0), (test_corpus, "never", math.log(2))]
    for corpus, token, expected in test_data:
        actual = inverse_doc_frequency(token, corpus)
        # isclose guards against float representation surprises; isinstance is
        # the idiomatic type check.
        if not math.isclose(actual, expected, abs_tol=1e-12) or not isinstance(actual, float):
            return False
    return True
def num_docs_with_token(token: str, corpus: List[str]) -> int:
    """Count how many documents in `corpus` contain `token` (case-insensitive)."""
    token = token.lower()
    # Generator inside `in` short-circuits per document: tokenization of the
    # remainder stops as soon as a match is found.
    return sum(
        1
        for doc in corpus
        if token in (t.lower() for t in word_tokenize(doc))
    )
def inverse_doc_frequency(token: str, corpus: List[str]) -> float:
    """Return idf = log(N / df) for `token`.

    N is the number of documents in `corpus`; df is the number of documents
    containing the token (case-insensitive). Returns 0.0 when the token
    appears nowhere, which both avoids division by zero and assigns unseen
    terms no weight.
    """
    doc_count = num_docs_with_token(token.lower(), corpus)
    if doc_count == 0:
        return 0.0
    return math.log(len(corpus) / doc_count)
# Fail loudly: the original called the test and silently discarded the result.
assert test_inverse_doc_frequency(), "inverse_doc_frequency self-test failed"
def test_tfidf() -> bool:
    """Smoke-test tfidf.

    Each case is (corpus, doc, token, expected_result); returns True only if
    every case matches (within float tolerance) and the result is a float.
    """
    test_data = [
        (test_corpus, test_corpus[0], "the", 0.0),
        (test_corpus, test_corpus[1], "never", math.log(2)),
    ]
    for corpus, doc, token, expected in test_data:
        actual = tfidf(token, doc, corpus)
        # isclose guards against float representation surprises; isinstance is
        # the idiomatic type check.
        if not math.isclose(actual, expected, abs_tol=1e-12) or not isinstance(actual, float):
            return False
    return True
def tfidf(token: str, doc: str, corpus: List[str]) -> float:
    """Return the TF-IDF score of `token` for `doc` within `corpus` (tf * idf)."""
    return term_frequency(token, doc) * inverse_doc_frequency(token, corpus)
# Fail loudly: the original called the test and silently discarded the result.
assert test_tfidf(), "tfidf self-test failed"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment