tharna/notecat.py

## notecat.py
#!/usr/bin/python3

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np
import csv

# Fit the TF-IDF model on the raw lines of the CSV: iterating the file handle
# yields one physical line per document.  The context manager closes the handle
# once fitting is done; the name ``documents`` stays bound afterwards.
with open("offering.csv") as documents:
    tfidf_vectorizer = TfidfVectorizer(max_df=0.50, min_df=2, stop_words='english', max_features=1000)
    tfidf = tfidf_vectorizer.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old accessor on releases that predate it.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

NUMBER_OF_TOPICS = 10
TOPICS_PER_CATEGORY = 3
NOTES_PER_TOPIC = 10

# NMF's ``alpha`` keyword was removed in scikit-learn 1.2 in favour of
# ``alpha_W``/``alpha_H``.  Try the original spelling first so older releases
# behave exactly as before, and only fall back when the constructor rejects it.
# NOTE(review): the newer penalty may be scaled differently from the old one,
# so factorizations need not match across scikit-learn versions -- confirm.
nmf_options = dict(n_components=NUMBER_OF_TOPICS, random_state=1, l1_ratio=.5, init='nndsvd')
try:
    nmf_model = NMF(alpha=.1, **nmf_options)
except TypeError:
    nmf_model = NMF(alpha_W=.1, **nmf_options)
nmf = nmf_model.fit(tfidf)
nmf_W = nmf.transform(tfidf)
nmf_H = nmf.components_

# The vectorizer consumed the file as plain lines; for printing we parse it
# again into a list of rows.  ``newline=''`` is what the csv module expects.
# NOTE(review): row i here matches document i above only if no CSV field
# contains an embedded newline.
with open("offering.csv", newline='') as csvfile:
    results = list(csv.reader(csvfile))

def get_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print each topic's leading terms followed by its best-matching notes.

    ``H`` is walked row by row (one row per topic) and ``W`` is indexed by
    column with the same topic index.  For every topic, the ``no_top_words``
    highest-valued positions of its ``H`` row are looked up in
    ``feature_names`` and printed as one comma-separated line.  Then the
    ``no_top_documents`` entries of ``documents`` with the largest values in
    the matching ``W`` column are printed, one per line, highest first.
    """
    for topic_idx, weights in enumerate(H):
        # Positions of the largest term weights, in descending order.
        ranked_terms = weights.argsort()[::-1][:no_top_words]
        print(', '.join([feature_names[pos] for pos in ranked_terms]))

        # Rows of W with the largest score for this topic, best first.
        ranked_docs = np.argsort(W[:, topic_idx])[::-1]
        for doc_pos in ranked_docs[:no_top_documents]:
            print(documents[doc_pos])

# Run the report: print every topic's keywords followed by its top-ranked notes.
get_topics(
    H=nmf_H,
    W=nmf_W,
    feature_names=feature_names,
    documents=results,
    no_top_words=TOPICS_PER_CATEGORY,
    no_top_documents=NOTES_PER_TOPIC,
)
	#!/usr/bin/python3

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.decomposition import NMF
	import numpy as np
	import csv

	# Fit the TF-IDF model on the raw lines of the CSV: iterating the file handle
	# yields one physical line per document.  The context manager closes the
	# handle once fitting is done; the name ``documents`` stays bound afterwards.
	with open("offering.csv") as documents:
	    tfidf_vectorizer = TfidfVectorizer(max_df=0.50, min_df=2, stop_words='english', max_features=1000)
	    tfidf = tfidf_vectorizer.fit_transform(documents)

	# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
	# and fall back to the old accessor on releases that predate it.
	if hasattr(tfidf_vectorizer, "get_feature_names_out"):
	    feature_names = tfidf_vectorizer.get_feature_names_out()
	else:
	    feature_names = tfidf_vectorizer.get_feature_names()

	NUMBER_OF_TOPICS = 10
	TOPICS_PER_CATEGORY = 3
	NOTES_PER_TOPIC = 10

	# NMF's ``alpha`` keyword was removed in scikit-learn 1.2 in favour of
	# ``alpha_W``/``alpha_H``.  Try the original spelling first so older releases
	# behave exactly as before, and only fall back when the constructor rejects it.
	# NOTE(review): the newer penalty may be scaled differently from the old one,
	# so factorizations need not match across scikit-learn versions -- confirm.
	nmf_options = dict(n_components=NUMBER_OF_TOPICS, random_state=1, l1_ratio=.5, init='nndsvd')
	try:
	    nmf_model = NMF(alpha=.1, **nmf_options)
	except TypeError:
	    nmf_model = NMF(alpha_W=.1, **nmf_options)
	nmf = nmf_model.fit(tfidf)
	nmf_W = nmf.transform(tfidf)
	nmf_H = nmf.components_

	# The vectorizer consumed the file as plain lines; for printing we parse it
	# again into a list of rows.  ``newline=''`` is what the csv module expects.
	# NOTE(review): row i here matches document i above only if no CSV field
	# contains an embedded newline.
	with open("offering.csv", newline='') as csvfile:
	    results = list(csv.reader(csvfile))

	def get_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
	    """Print each topic's leading terms followed by its best-matching notes.

	    ``H`` is walked row by row (one row per topic) and ``W`` is indexed by
	    column with the same topic index.  For every topic, the ``no_top_words``
	    highest-valued positions of its ``H`` row are looked up in
	    ``feature_names`` and printed as one comma-separated line.  Then the
	    ``no_top_documents`` entries of ``documents`` with the largest values in
	    the matching ``W`` column are printed, one per line, highest first.
	    """
	    for index, topic in enumerate(H):
	        # Walk the sorted positions backwards to get the largest weights first.
	        main_topics = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]

	        # Print topic categories
	        print(', '.join(main_topics))

	        # Print out notes per topic
	        top_sn_indices = np.argsort(W[:, index])[::-1][0:no_top_documents]
	        for sn_index in top_sn_indices:
	            print(documents[sn_index])

	# Run the report: print every topic's keywords followed by its top-ranked notes.
	get_topics(
	    H=nmf_H,
	    W=nmf_W,
	    feature_names=feature_names,
	    documents=results,
	    no_top_words=TOPICS_PER_CATEGORY,
	    no_top_documents=NOTES_PER_TOPIC,
	)