Skip to content

Instantly share code, notes, and snippets.

@sevperez
Created October 14, 2020 08:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sevperez/3f3d3398733e125a8695adc5c21734df to your computer and use it in GitHub Desktop.
Save sevperez/3f3d3398733e125a8695adc5c21734df to your computer and use it in GitHub Desktop.
def document_frequency(td_df, term: str):
    """
    Count the documents (rows) of a term-document matrix that contain a term.

    - Parameters: td_df (Pandas DataFrame) representing a term-document
      matrix, and term (string) naming one of its columns.
    - Returns: The document frequency value (int) showing the number of
      documents in td_df where term occurs at least once.
    - Raises: KeyError if term is not a column of td_df.
    """
    # Summing the boolean mask counts the matching rows directly, instead of
    # materialising a filtered copy of the whole matrix just to read its
    # row count. int() keeps the return type a plain Python int.
    return int((td_df[term] > 0).sum())
def inverse_document_frequency(td_df, term: str):
    """
    Compute the inverse document frequency (IDF) weight of a term.

    - Parameters: td_df (Pandas DataFrame) representing a term-document
      matrix, and term (string).
    - Returns: The inverse document frequency value for term, calculated
      as log10(N / dft) where N is the number of documents in td_df and
      dft is the document frequency value for term. Returns 0.0 when
      term occurs in no document (dft == 0).
    """
    N = td_df.shape[0]
    dft = document_frequency(td_df, term)
    if dft == 0:
        # A term that appears nowhere has no meaningful IDF, and N / 0 would
        # raise. Its term frequencies are all zero anyway, so a zero weight
        # keeps the resulting TF-IDF values at zero.
        return 0.0
    # Standard IDF is the log of the ratio N / dft. Dividing N by log10(dft)
    # instead would divide by zero for a term found in exactly one document,
    # because log10(1) == 0.
    return np.log10(N / dft)
def build_tfidf_df(td_df):
    """
    Weight every column of a term-document matrix by its term's IDF.

    - Parameters: td_df (Pandas DataFrame) representing a term-document
      matrix.
    - Returns: A Pandas DataFrame with the same rows (documents) and
      columns (tokens) as td_df, where each column has been multiplied by
      the value inverse_document_frequency returns for that column's name.
    """
    def scale_by_idf(term_counts):
        # apply() hands over one column at a time as a Series; the Series
        # name is the column label, i.e. the term to look up.
        weight = inverse_document_frequency(td_df, term_counts.name)
        return term_counts * weight

    return td_df.apply(scale_by_idf)
# Build the TF-IDF matrix from sotu_td_df, which is defined outside this
# excerpt and is expected to be a term-document DataFrame.
sotu_tfidf_df = build_tfidf_df(sotu_td_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment