@garyForeman
Created November 30, 2016 21:43
from __future__ import print_function
from nltk import wordpunct_tokenize
from nltk.corpus import movie_reviews
from nltk.stem import SnowballStemmer
import re
import string
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline

sklearn_version = sklearn.__version__
print("Scikit-learn version: {0}".format(sklearn_version))

# train_test_split moved to sklearn.model_selection in 0.18; fall back to the
# old location on 0.17. (An ImportError check is more robust than matching
# one exact version string such as '0.18.1'.)
try:
    from sklearn.model_selection import train_test_split   # sklearn >= 0.18
except ImportError:
    from sklearn.cross_validation import train_test_split  # sklearn < 0.18
# Suppress stderr logging, which annoyingly outputs parallel job info after
# each LDA iteration.
import sys
sys.stderr = open('lda_test.err', 'w')

STEMMER = SnowballStemmer('english')
# Use the public ENGLISH_STOP_WORDS frozenset instead of the private
# _check_stop_list helper; for the 'english' argument they are equivalent.
STOP_WORDS = ENGLISH_STOP_WORDS
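
# NOTE (added): this script assumes NLTK's movie_reviews corpus is already
# present locally; if it is not, fetch it once with:
#     python -c "import nltk; nltk.download('movie_reviews')"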

def stem_tokens(tokens, stemmer):
    """Stem each token, dropping English stop words along the way."""
    stemmed = []
    for item in tokens:
        if item not in STOP_WORDS:
            stemmed.append(stemmer.stem(item))
    return stemmed

def tokenizer(text):
    text = text.lower()  # put everything in lower case
    # Strip punctuation and digits. re.sub works on both Python 2 and 3,
    # unlike the bytes-only str.translate(None, ...) idiom.
    remove_chars = ''.join(set(string.punctuation) | set(string.digits))
    text = re.sub('[{0}]'.format(re.escape(remove_chars)), '', text)
    tokens = wordpunct_tokenize(text)  # tokenize
    return stem_tokens(tokens, STEMMER)
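
# Example (added): tokenizer(u"The movies were great!") returns
# ['movi', 'great']: 'the' and 'were' are stop words, and the surviving
# tokens are Snowball-stemmed.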

def main():
    # Load the raw text of every review in the movie_reviews corpus.
    documents = [movie_reviews.raw(fileid)
                 for fileid in movie_reviews.fileids()]

    # Train/test split.
    documents_train, documents_test = train_test_split(documents,
                                                       test_size=0.25,
                                                       random_state=42)

    # Initialize the LDA model pipeline: bag-of-words counts feeding LDA.
    count_vec = CountVectorizer(max_df=0.95, min_df=2, tokenizer=tokenizer,
                                strip_accents='ascii')
    lda = LatentDirichletAllocation(learning_method='batch', evaluate_every=1,
                                    verbose=1, max_iter=100, random_state=42,
                                    doc_topic_prior=0.1, topic_word_prior=0.01)
    lda_pipe = Pipeline([('count_vec', count_vec), ('lda', lda)])

    # Fit the model.
    lda_pipe.fit(documents_train)

    # Evaluate on the train set.
    X_vectorized = lda_pipe.named_steps['count_vec'].transform(documents_train)
    perplexity_train = lda_pipe.named_steps['lda'].perplexity(X_vectorized)
    print("Train set perplexity: {0}".format(perplexity_train))

    # Evaluate on the test set.
    X_vectorized = lda_pipe.named_steps['count_vec'].transform(documents_test)
    perplexity_test = lda_pipe.named_steps['lda'].perplexity(X_vectorized)
    print("Test set perplexity: {0}".format(perplexity_test))

if __name__ == '__main__':
    main()