Created
November 30, 2016 21:43
-
-
Save garyForeman/321a10ebe29215a0c1acbcb4b320fb8e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function

import re
import string
import sys

from nltk import wordpunct_tokenize
from nltk.corpus import movie_reviews
from nltk.stem import SnowballStemmer
import sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline

sklearn_version = sklearn.__version__
print("Scikit-learn version: {0}".format(sklearn_version))

# EAFP: probe for the modern module layout instead of comparing against one
# exact version string (the old `== '0.18.1'` check missed 0.18.0, 0.19, ...,
# which all have sklearn.model_selection).
try:
    from sklearn.model_selection import train_test_split   # sklearn >= 0.18
except ImportError:
    from sklearn.cross_validation import train_test_split  # sklearn < 0.18

# Suppress stderr logging, which annoyingly outputs parallel job info after
# each LDA iteration.
sys.stderr = open('lda_test.err', 'w')

# Shared Snowball stemmer used by tokenizer()/stem_tokens().
STEMMER = SnowballStemmer('english')
# Public frozenset of English stop words; replaces the private
# _check_stop_list('english') helper, which returns the same object but is
# not a stable API across sklearn versions.
STOP_WORDS = ENGLISH_STOP_WORDS
def stem_tokens(tokens, stemmer):
    """Stem every token that is not an English stop word.

    Tokens present in the module-level STOP_WORDS set are dropped; the
    remaining tokens are passed through *stemmer* and returned in order.
    """
    return [stemmer.stem(token) for token in tokens
            if token not in STOP_WORDS]
def tokenizer(text):
    """Turn raw review text into a list of stemmed, stop-word-free tokens.

    Lower-cases the text, deletes punctuation and digits, splits it with
    NLTK's wordpunct tokenizer, and stems via the module-level STEMMER.
    """
    text = text.lower()  # put everything in lower case
    # Delete punctuation and digits in a single regex pass.  This works on
    # unicode text under both Python 2 and 3, unlike the Python-2-only
    # str.translate(None, deletechars) signature, and avoids the lossy
    # .encode('utf-8', errors='ignore') bytes round-trip the original did.
    text = re.sub('[{0}{1}]'.format(re.escape(string.punctuation),
                                    re.escape(string.digits)),
                  '', text)
    tokens = wordpunct_tokenize(text)  # tokenize
    return stem_tokens(tokens, STEMMER)
def _report_perplexity(lda_pipe, documents, label):
    """Vectorize *documents* with the pipeline's fitted CountVectorizer and
    print the fitted LDA model's perplexity on them, tagged with *label*."""
    X_vectorized = lda_pipe.named_steps['count_vec'].transform(documents)
    perplexity = lda_pipe.named_steps['lda'].perplexity(X_vectorized)
    print("{0} set perplexity: {1}".format(label, perplexity))

def main():
    """Fit an LDA topic model on the NLTK movie-review corpus and report
    perplexity on both the train and the held-out test split."""
    # Load the raw text of every review in the corpus.
    documents = [movie_reviews.raw(fileid) for fileid \
                 in movie_reviews.fileids()]

    # Hold out 25% of the documents for evaluation (fixed seed for
    # reproducibility).
    documents_train, documents_test = train_test_split(documents,
                                                       test_size=0.25,
                                                       random_state=42)

    # CountVectorizer drops near-ubiquitous (>95% of docs) and rare
    # (<2 docs) terms; LDA runs full-batch with symmetric priors.
    count_vec = CountVectorizer(max_df=0.95, min_df=2, tokenizer=tokenizer,
                                strip_accents='ascii')
    lda = LatentDirichletAllocation(learning_method='batch', evaluate_every=1,
                                    verbose=1, max_iter=100, random_state=42,
                                    doc_topic_prior=0.1, topic_word_prior=0.01)
    lda_pipe = Pipeline([('count_vec', count_vec), ('lda', lda)])

    # Fit vectorizer + LDA on the training split only.
    lda_pipe.fit(documents_train)

    # Same evaluation code path for both splits (was duplicated inline).
    _report_perplexity(lda_pipe, documents_train, "Train")
    _report_perplexity(lda_pipe, documents_test, "Test")
# Script entry point: fit and evaluate the model only when run directly,
# not when this module is imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment