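"""Benchmark scikit-learn's LatentDirichletAllocation against gensim's LdaMulticore.

Fits both implementations on the 20 newsgroups corpus in batch or online
variational Bayes mode and compares training time and train/test perplexity.
Forked from chyikwei/gist:34b97d4d443a0cc38a2f. The original gist appears to
have targeted the pre-merge scikit-learn LDA branch (alpha/eta/kappa/tau/
n_docs); parameter names below follow the released scikit-learn API.
"""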
from time import time
import logging
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.matutils import Sparse2Corpus
#from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
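# gensim reports training progress (and eval_every perplexity estimates)
# through the logging module, so configure it up front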
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def load_dataset():
    train = fetch_20newsgroups(subset='train', random_state=1,
                               remove=('headers', 'footers', 'quotes')).data
    test = fetch_20newsgroups(subset='test', random_state=1,
                              remove=('headers', 'footers', 'quotes')).data
    return train, test


def main():
    # test mode can be 'batch' or 'online'
    test_mode = 'batch'
    # test_mode = 'online'

    # params
    n_features = 2000
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics
    n_jobs = 3
    # for the batch update setting
    max_iterations = 5
    # for the online update setting
    kappa = 0.5        # 'decay' in gensim
    tau0 = 1.          # 'offset' in gensim
    batch_size = 2000  # 'chunksize' in gensim
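    # rough parameter mapping between the two libraries (sklearn -> gensim):
    #   learning_decay  -> decay
    #   learning_offset -> offset
    #   batch_size      -> chunksize
    #   max_iter        -> passes (batch mode)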

    train_data, test_data = load_dataset()

    # sklearn format
    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')
    train_X = vectorizer.fit_transform(train_data)
    test_X = vectorizer.transform(test_data)

    # convert the sparse matrices to gensim corpora; gensim also needs the
    # inverse of vectorizer.vocabulary_ as an id -> word mapping
    id2words = {v: k for k, v in vectorizer.vocabulary_.items()}
    train_corpus = Sparse2Corpus(train_X, documents_columns=False)
    test_corpus = Sparse2Corpus(test_X, documents_columns=False)

    # sklearn
    # note: the original gist used the pre-merge constructor (alpha, eta,
    # kappa, tau, n_docs, normalize_doc) and fit(X, max_iters=...); below are
    # the equivalent parameters of the released LatentDirichletAllocation
    # (normalize_doc has no released equivalent and is dropped)
    lda_sklearn = LatentDirichletAllocation(n_components=n_topics,
                                            doc_topic_prior=alpha,
                                            topic_word_prior=eta,
                                            learning_method=test_mode,
                                            learning_decay=kappa,
                                            learning_offset=tau0,
                                            batch_size=batch_size,
                                            max_iter=max_iterations,
                                            total_samples=1e4,
                                            n_jobs=n_jobs,
                                            random_state=0, verbose=1)
    print('run test in %s mode' % test_mode)
    t0 = time()
    if test_mode == 'batch':
        # batch mode: full variational EM over the whole training set
        lda_sklearn.fit(train_X)
    else:
        # online mode: a single mini-batch pass over the training set
        lda_sklearn.partial_fit(train_X)
    print("sklearn fit in %0.3fs." % (time() - t0))

    # transform returns the normalized document-topic distribution (gamma);
    # in the released API, perplexity is computed from X directly
    train_gamma = lda_sklearn.transform(train_X)
    # bound = lda_sklearn._approx_bound(train_X, train_gamma, False)  # private API
    train_perplexity = lda_sklearn.perplexity(train_X)
    test_gamma = lda_sklearn.transform(test_X)
    test_perplexity = lda_sklearn.perplexity(test_X)
    print('sklearn perplexity: train=%.3f, test=%.3f'
          % (train_perplexity, test_perplexity))

    # gensim
    t0 = time()
    if test_mode == 'batch':
        # batch mode: full-corpus updates, evaluating perplexity every pass
        lda_gensim = LdaMulticore(train_corpus, id2word=id2words,
                                  batch=True, eval_every=1,
                                  workers=n_jobs, num_topics=n_topics,
                                  passes=max_iterations)
    else:
        # online mode: decay/offset/chunksize mirror sklearn's
        # kappa (learning_decay), tau0 (learning_offset) and batch_size
        lda_gensim = LdaMulticore(train_corpus, id2word=id2words,
                                  batch=False, eval_every=20,
                                  decay=kappa, offset=tau0,
                                  chunksize=batch_size,
                                  workers=n_jobs, num_topics=n_topics,
                                  passes=1)
    print("gensim done in %0.3fs." % (time() - t0))
    # lda_gensim.print_topics()
    # gensim's log_perplexity returns a per-word variational bound, so
    # exponentiate its negative to get a perplexity comparable to sklearn's
    train_log_perp_gensim = lda_gensim.log_perplexity(train_corpus)
    test_log_perp_gensim = lda_gensim.log_perplexity(test_corpus)
    train_perplexity_gensim = np.exp(-1. * train_log_perp_gensim)
    test_perplexity_gensim = np.exp(-1. * test_log_perp_gensim)
    print('gensim perplexity: train=%.3f, test=%.3f'
          % (train_perplexity_gensim, test_perplexity_gensim))


if __name__ == '__main__':
    main()