Skip to content

Instantly share code, notes, and snippets.

@tharna
Created January 15, 2019 06:09
Show Gist options
  • Save tharna/c41cdfa8cb5b7dcea521307faa309517 to your computer and use it in GitHub Desktop.
Save tharna/c41cdfa8cb5b7dcea521307faa309517 to your computer and use it in GitHub Desktop.
Script to categorize notes
#!/usr/bin/python3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np
import csv
# Tunable knobs: how many topics to extract, how many keywords label each
# topic, and how many notes are listed under each topic.
NUMBER_OF_TOPICS = 10
TOPICS_PER_CATEGORY = 3
NOTES_PER_TOPIC = 10

# Read the CSV exactly once and close it deterministically. The same parsed
# rows feed both the vectorizer and the final printout, so row i of the
# document-topic matrix always refers to results[i] -- even when a quoted
# field contains a line break (iterating the raw file would split such a
# record into several "documents" and shift every later index).
with open("offering.csv", newline='') as csvfile:
    results = list(csv.reader(csvfile))

# One text document per CSV record; the columns are joined with a space so
# words from neighbouring fields stay separate tokens.
documents = [' '.join(row) for row in results]

tfidf_vectorizer = TfidfVectorizer(max_df=0.50, min_df=2, stop_words='english', max_features=1000)
tfidf = tfidf_vectorizer.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older installations.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

nmf_params = dict(n_components=NUMBER_OF_TOPICS, random_state=1, l1_ratio=.5, init='nndsvd')
try:
    # Older scikit-learn: a single `alpha` regularises both factors.
    nmf = NMF(alpha=.1, **nmf_params)
except TypeError:
    # scikit-learn >= 1.2 dropped `alpha` in favour of alpha_W / alpha_H,
    # whose penalty terms are additionally multiplied by n_features and
    # n_samples respectively. Dividing by those sizes keeps the effective
    # regularisation strength the same as the legacy alpha=.1 setting.
    n_samples, n_features = tfidf.shape
    nmf = NMF(alpha_W=.1 / n_features, alpha_H=.1 / n_samples, **nmf_params)
nmf = nmf.fit(tfidf)

# W: one row per document, one column per topic.
nmf_W = nmf.transform(tfidf)
# H: one row per topic, one column per vocabulary term.
nmf_H = nmf.components_
def get_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print each topic's top terms followed by its best-matching documents.

    For every row of ``H`` one comma-separated line of keywords is printed,
    then up to ``no_top_documents`` entries of ``documents``, one per line,
    ordered from the highest to the lowest weight in the matching column
    of ``W``.

    Args:
        H: Iterable of per-topic weight arrays (each must support
            ``.argsort()``); position ``i`` in a row corresponds to
            ``feature_names[i]``.
        W: 2-D array indexed as ``W[:, topic_index]``; row ``j`` corresponds
            to ``documents[j]``.
        feature_names: Sequence of strings indexed by term position.
        documents: Sequence indexed by document position; each selected
            entry is passed to ``print`` as-is.
        no_top_words: Number of keywords printed per topic.
        no_top_documents: Maximum number of documents printed per topic.

    Returns:
        None. All output goes to standard output.
    """
    for index, topic in enumerate(H):
        # argsort() is ascending, so walking the tail backwards yields the
        # no_top_words heaviest terms, strongest first.
        main_topics = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]
        # Print topic categories
        print(', '.join(main_topics))
        # Print out notes per topic
        top_sn_indices = np.argsort(W[:, index])[::-1][0:no_top_documents]
        for sn_index in top_sn_indices:
            print(documents[sn_index])
# Report the keywords and the best-matching notes for every extracted topic.
get_topics(
    H=nmf_H,
    W=nmf_W,
    feature_names=feature_names,
    documents=results,
    no_top_words=TOPICS_PER_CATEGORY,
    no_top_documents=NOTES_PER_TOPIC,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment