Created
September 26, 2017 18:30
-
-
Save edsu/6158b975fb3754f0fae3080e7bb66e29 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python

# topic-model.py - given a directory of plain text files, compute t topics with d dimensions
# see -> https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730

# Eric Lease Morgan <emorgan@nd.edu>
# September 25, 2017 - first cut; needs to list documents


# require
from os import listdir
from os.path import isfile, join
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

# sanity check; exit non-zero so calling shells and scripts can detect misuse
usage = 'Usage: ' + sys.argv[0] + " <directory> <number of topics> <number of dimensions>\n"
if len(sys.argv) != 4:
    sys.stderr.write(usage)
    sys.exit(1)

# get input; non-numeric counts are reported as a usage error, not a traceback
directory = sys.argv[1]
try:
    topics = int(sys.argv[2])
    dimensions = int(sys.argv[3])
except ValueError:
    sys.stderr.write(usage)
    sys.exit(1)

# initialize; listdir() returns names in arbitrary order, so sort them to feed
# the model the same corpus, in the same order, from run to run
files = sorted(name for name in listdir(directory) if isfile(join(directory, name)))
if not files:
    sys.stderr.write('No files found in ' + directory + "\n")
    sys.exit(1)
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# read each file; build the list of documents (one string per file)
documents = []
for name in files:
    with open(join(directory, name)) as handle:
        documents.append(handle.read())

# vectorize the corpus
tfidf = vectorizer.fit_transform(documents)

# create a model against the corpus
try:
    # original call; still accepted by older scikit-learn releases
    nmf = NMF(n_components=topics, random_state=1, alpha=.1, l1_ratio=.5)
except TypeError:
    # newer scikit-learn no longer accepts `alpha`; its replacements alpha_W and
    # alpha_H are, per the NMF documentation, scaled by the number of features
    # and samples respectively, so divide to keep the effective penalty
    # comparable to the original alpha of .1
    # NOTE(review): confirm the scaling against the installed scikit-learn version
    n_documents, n_terms = tfidf.shape
    nmf = NMF(
        n_components=topics,
        random_state=1,
        alpha_W=.1 / n_terms,
        alpha_H=.1 / n_documents,
        l1_ratio=.5,
    )
model = nmf.fit(tfidf)

# extract the features; get_feature_names() was replaced by get_feature_names_out()
# in newer scikit-learn, so use whichever one this installation provides
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

# output; one line per topic, listing its `dimensions` highest-weighted terms
# (argsort() is ascending, so the reversed tail slice yields heaviest first)
for topic in model.components_:
    top_terms = [features[i] for i in topic.argsort()[:-dimensions - 1:-1]]
    print(' * ' + "; ".join(top_terms))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment