alexcpn/tfidf_vectorizer .py

## tfidf_vectorizer .py
    # Build a TF-IDF matrix of the syslog messages with scikit-learn's
    # TfidfVectorizer, dump it to CSV and print per-term document frequencies.
    # Reference:
    # https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html
    #
    # The token pattern only accepts tokens that contain at least one a-z
    # letter (case-insensitive), so tokens made up solely of digits and/or
    # underscores are skipped. English stop words are removed as well.
    tfidf_vectorizer = TfidfVectorizer(
        token_pattern=r'(?ui)\b\w*[a-z]+\w*\b',
        stop_words='english',
    )

    # The path of the syslog file is taken from the first command-line argument.
    df = read_syslog(sys.argv[1])
    tfidf_vector = tfidf_vectorizer.fit_transform(df['y_org'])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; get_feature_names_out() is the supported replacement. Fetch the
    # vocabulary once and reuse it for both the print and the column labels.
    feature_names = tfidf_vectorizer.get_feature_names_out()
    print(feature_names)

    # One row per entry of df['ds_org'], one column per vocabulary term.
    tfidf_df = pd.DataFrame(
        tfidf_vector.toarray(),
        index=df['ds_org'],
        columns=feature_names,
    )

    # Append a row holding, for every term, the number of rows (documents) in
    # which that term has a non-zero TF-IDF weight.
    tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

    # Row-wise maximum across all term columns. This is computed after the
    # '00_Document Frequency' row was appended, so that row gets a value too.
    # NOTE(review): if the vocabulary itself contains the token "max", this
    # assignment overwrites that term's column - confirm whether that matters.
    tfidf_df['max'] = tfidf_df.max(axis=1)

    # The frame is not modified after this point, so a single write suffices.
    tfidf_df.to_csv('tfidf_df.csv')
    print("tfidf_df.head()")
    print(tfidf_df.head())

    # NOTE: `df` is rebound here from the syslog frame to the document
    # frequency Series (which also includes the 'max' entry added above).
    df = tfidf_df.loc['00_Document Frequency']
    print("Pandas Series Sort Values", df.sort_values())
	# Build a TF-IDF matrix of the syslog messages with scikit-learn's
	# TfidfVectorizer, dump it to CSV and print per-term document frequencies.
	# Reference:
	# https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html
	#
	# As written, the token pattern needs one word character, then one or more
	# a-z letters (case-insensitive), then one more word character, i.e. tokens
	# of at least three characters. English stop words are removed as well.
	# NOTE(review): the other copy of this snippet in this file uses `\w*` on
	# both sides of `[a-z]+`; the `*` quantifiers may have been lost here -
	# confirm which pattern is intended.
	tfidf_vectorizer = TfidfVectorizer(
		token_pattern=r'(?ui)\b\w[a-z]+\w\b',
		stop_words='english',
	)

	# The path of the syslog file is taken from the first command-line argument.
	df = read_syslog(sys.argv[1])
	tfidf_vector = tfidf_vectorizer.fit_transform(df['y_org'])

	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
	# 1.2; get_feature_names_out() is the supported replacement. Fetch the
	# vocabulary once and reuse it for both the print and the column labels.
	feature_names = tfidf_vectorizer.get_feature_names_out()
	print(feature_names)

	# One row per entry of df['ds_org'], one column per vocabulary term.
	tfidf_df = pd.DataFrame(
		tfidf_vector.toarray(),
		index=df['ds_org'],
		columns=feature_names,
	)

	# Append a row holding, for every term, the number of rows (documents) in
	# which that term has a non-zero TF-IDF weight.
	tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

	# Row-wise maximum across all term columns. This is computed after the
	# '00_Document Frequency' row was appended, so that row gets a value too.
	# NOTE(review): if the vocabulary itself contains the token "max", this
	# assignment overwrites that term's column - confirm whether that matters.
	tfidf_df['max'] = tfidf_df.max(axis=1)

	# The frame is not modified after this point, so a single write suffices.
	tfidf_df.to_csv('tfidf_df.csv')
	print("tfidf_df.head()")
	print(tfidf_df.head())

	# NOTE: `df` is rebound here from the syslog frame to the document
	# frequency Series (which also includes the 'max' entry added above).
	df = tfidf_df.loc['00_Document Frequency']
	print("Pandas Series Sort Values", df.sort_values())