Skip to content

Instantly share code, notes, and snippets.

@edsu
Created September 26, 2017 18:30
Show Gist options
  • Save edsu/6158b975fb3754f0fae3080e7bb66e29 to your computer and use it in GitHub Desktop.
Save edsu/6158b975fb3754f0fae3080e7bb66e29 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# topic-model.py - given a directory of plain text files, compute t topics with d dimensions
# see -> https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
# Eric Lease Morgan <emorgan@nd.edu>
# September 25, 2017 - first cut; needs to list documents

# require
from os import listdir
from os.path import isfile, join
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

# sanity check; exit non-zero so shell pipelines can detect misuse
if len( sys.argv ) != 4 :
    sys.stderr.write( 'Usage: ' + sys.argv[ 0 ] + " <directory> <number of topics> <number of dimensions>\n" )
    sys.exit( 1 )

# get input
directory = sys.argv[ 1 ]
topics = int( sys.argv[ 2 ] )
dimensions = int( sys.argv[ 3 ] )

# initialize; collect only plain files (skip subdirectories)
# NOTE: avoid shadowing the builtins `file` and `input`
filenames = [ name for name in listdir( directory ) if isfile( join( directory, name ) ) ]
documents = []
vectorizer = TfidfVectorizer( max_df = 0.95, min_df = 2, stop_words = 'english' )

# read each file; update the list of documents
for filename in filenames :
    with open( join( directory, filename ), encoding = 'utf-8' ) as handle :
        documents.append( handle.read() )

# vectorize and create a model against the corpus; extract the features
tfidf = vectorizer.fit_transform( documents )
model = NMF( n_components = topics, random_state = 1, alpha = .1, l1_ratio = .5 ).fit( tfidf )
features = vectorizer.get_feature_names()

# output: for each topic, list the `dimensions` highest-weighted terms
# (argsort is ascending, so take the last `dimensions` indices in reverse)
for item, topic in enumerate( model.components_ ) :
    print( ' * ' + "; ".join( [ features[ i ] for i in topic.argsort()[ :-dimensions - 1:-1 ] ] ) )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment