Skip to content

Instantly share code, notes, and snippets.

@organisciak
Created December 14, 2016 05:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save organisciak/9bffe454030344510e71e56ae409f6af to your computer and use it in GitHub Desktop.
Save organisciak/9bffe454030344510e71e56ae409f6af to your computer and use it in GitHub Desktop.
def calculate_tfidf(tokencounts, idf_df, df='PF', case=True, log_tf=True):
    '''Take a 'token, count' DataFrame and return TF*IDF weights.

    Parameters
    ----------
    tokencounts : DataFrame with a 'token' column and a 'count' column.
        It is not modified.
    idf_df : DataFrame indexed by token that holds the inverse-frequency
        column named 'I' + df (e.g. 'IPF').
    df : suffix selecting the inverse-frequency column; the weight column
        in the result is named 'TF*I' + df (e.g. 'TF*IPF').
    case : when False, tokens are lowercased and the rows of tokens that
        become identical are summed before weighting.
    log_tf : when True, TF is log10(count + 1); otherwise TF is the raw
        count.

    Returns
    -------
    DataFrame indexed by token containing the merged columns plus 'TF'
    and 'TF*I' + df, sorted by the weight column in descending order.
    Tokens absent from either input are dropped by the merge.
    '''
    # Work through a local name; the original body referenced an unbound
    # `tc` instead of the `tokencounts` parameter.
    tc = tokencounts
    if not case:
        # assign() returns a new frame, so the caller's data keeps its case.
        tc = tc.assign(token=tc['token'].str.lower())
        # Lowercasing can create duplicate tokens; collapse them by summing.
        tc = tc.groupby('token', as_index=False).sum()

    tfidf = pd.merge(tc.set_index('token'), idf_df,
                     left_index=True, right_index=True)

    if log_tf:
        # The +1 keeps a zero count at TF == 0 instead of -inf.
        tfidf['TF'] = np.log10(tfidf['count'].add(1))
    else:
        tfidf['TF'] = tfidf['count']

    idf_col = 'I' + df
    weight_col = 'TF*I' + df
    tfidf[weight_col] = tfidf['TF'] * tfidf[idf_col]
    return tfidf.sort_values(weight_col, ascending=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment