Created
April 28, 2022 10:54
-
-
Save alexcpn/cc27002743c0bcd8c65ba0420588b306 to your computer and use it in GitHub Desktop.
TF-IDF Vectorizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a TF-IDF matrix from syslog messages with scikit-learn's TfidfVectorizer,
# append a per-term document-frequency row and a per-row maximum column, and
# dump the result to 'tfidf_df.csv'.
# https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html

# The token pattern only accepts tokens that contain at least one letter a-z
# (case-insensitive), so purely numeric tokens are dropped; English stop words
# are removed as well.
tfidf_vectorizer = TfidfVectorizer(
    token_pattern=r'(?ui)\b\w*[a-z]+\w*\b',
    stop_words='english',
)

# The first command-line argument is handed to read_syslog.
# NOTE(review): presumably a syslog file path, and the result is presumably a
# DataFrame with 'y_org' (message text) and 'ds_org' (timestamp) columns --
# confirm against read_syslog.
df = read_syslog(sys.argv[1])

tfidf_vector = tfidf_vectorizer.fit_transform(df['y_org'])

# Fetch the vocabulary once and reuse it. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.
feature_names = tfidf_vectorizer.get_feature_names_out()
print(feature_names)

# One row per document (indexed by 'ds_org'), one column per term.
tfidf_df = pd.DataFrame(
    tfidf_vector.toarray(),
    index=df['ds_org'],
    columns=feature_names,
)

# Add a row holding, for every term, the number of documents whose TF-IDF
# weight for that term is non-zero (i.e. the term's document frequency).
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

# Highest value in each row. NOTE(review): this is computed after the
# document-frequency row was added, so for that row 'max' is the largest
# document count rather than a TF-IDF score -- confirm this is intended.
tfidf_df['max'] = tfidf_df.max(axis=1)

tfidf_df.to_csv('tfidf_df.csv')

print("tfidf_df.head()")
print(tfidf_df.head())

# From here on `df` is the document-frequency row (a Series keyed by term,
# plus the 'max' entry added above), no longer the parsed syslog frame.
df = tfidf_df.loc['00_Document Frequency']
print("Pandas Series Sort Values", df.sort_values())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment