Scripts comparing online LDA in gensim and scikit-learn, with tests.
comparison.py (the module name is implied by the test file's `import comparison` below):
from time import time
import logging

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.matutils import Sparse2Corpus
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.DEBUG)
logger = logging.getLogger("comp_clean")

gs_single_core = True
test_mode = 'online'
max_e_steps = 100
n_docs = 10000
n_test = 1000
n_topics = 10
n_features = 2000
# n_docs_per_job = 10000
n_jobs = 1
sk_batch_size = 2000
gs_batch_size = 2000
n_passes = 5
max_iterations = n_passes
gensim_update_after = n_docs
kappa = 0.5
tau0 = 1.
eval_every = 1
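
# Parameter correspondence between the two libraries (a summary of how the
# constants above are wired through the calls below, added for orientation):
#   kappa         -> sklearn learning_decay      / gensim decay
#   tau0          -> sklearn learning_offset     / gensim offset
#   sk_batch_size -> sklearn batch_size; gs_batch_size -> gensim chunksize
#   max_e_steps   -> sklearn max_doc_update_iter / gensim iterations
#   eval_every    -> sklearn evaluate_every      / gensim eval_every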


def load_dataset():
    train = fetch_20newsgroups(
        subset='train',
        random_state=1,
        remove=('headers', 'footers', 'quotes')).data[0:n_docs]
    test = fetch_20newsgroups(
        subset='test',
        random_state=1,
        remove=('headers', 'footers', 'quotes')).data[0:n_test]
    return train, test


def sklearn_run(test_mode=None,
                train_X=None,
                test_X=None,
                n_topics=None,
                n_jobs=None,
                max_iterations=None,
                vectorizer=None,
                decay=None,
                offset=None,
                total_samples=None,
                batch_size=None):
    logger.info("=================starting sklearn==================")
    alpha = None
    eta = None
    # Difference from gensim: in gensim (and in Hoffman's reference code) the
    # first iteration is numbered 0, while in sklearn it is 1 (a None guard is
    # added here so batch-mode calls that omit offset do not crash).
    if offset is not None:
        offset = offset - 1
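    # Both implementations follow the Hoffman et al. online LDA schedule, in
    # which the step size at update t is rho_t = (tau0 + t) ** -kappa; shifting
    # the offset by one is intended to keep the two step-size sequences aligned.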
    # sklearn
    lda_sklearn = LatentDirichletAllocation(
        n_topics=n_topics,  # renamed to n_components in later sklearn releases
        doc_topic_prior=alpha,
        topic_word_prior=eta,
        batch_size=batch_size,
        learning_decay=decay,
        learning_offset=offset,
        n_jobs=n_jobs,
        total_samples=total_samples,
        random_state=0,
        verbose=1,
        max_iter=max_iterations,
        learning_method=test_mode,
        max_doc_update_iter=max_e_steps,
        evaluate_every=eval_every)
    logger_sk = logging.getLogger('sklearn.online_lda')
    logger_sk.setLevel(logging.DEBUG)
    print('run test in %s mode' % test_mode)
    t0 = time()
    if test_mode == 'batch':
        lda_sklearn.fit(train_X)
    else:
        if n_passes == 1:
            # a single online pass can use partial_fit directly
            lda_sklearn.partial_fit(train_X)
        else:
            lda_sklearn.fit(train_X)
    sk_time = (time() - t0)
    print("sklearn fit in %0.3fs." % sk_time)
    # transform, then evaluate perplexity; perplexity(X, doc_topic_distr) is
    # the older sklearn signature (newer releases take only X)
    train_gamma = lda_sklearn.transform(train_X)
    train_perplexity = lda_sklearn.perplexity(train_X, train_gamma)
    test_gamma = lda_sklearn.transform(test_X)
    test_perplexity = lda_sklearn.perplexity(test_X, test_gamma)
    print('sklearn perplexity: train=%.3f, test=%.3f' %
          (train_perplexity, test_perplexity))
    return (train_perplexity, test_perplexity, sk_time)


def pre_processing(
        train_data=None,
        test_data=None,
        max_features=None,
        max_df=0.8,
        min_df=3):
    # sklearn bag-of-words format
    vectorizer = CountVectorizer(max_df=max_df, max_features=max_features,
                                 min_df=min_df, stop_words='english')
    train_X = vectorizer.fit_transform(train_data)
    test_X = vectorizer.transform(test_data)
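    # Worked example (not part of the original gist): for the 3-document toy
    # corpus in the tests below, each document is a single word, so train_X is
    # a (3, 3) sparse count matrix and test_X is (1, 3).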
    return (train_X, test_X, vectorizer)


def gensim_prep(train_X=None, test_X=None, vectorizer=None):
    # gensim needs an id -> word mapping; vocabulary_ maps word -> id
    id2words = dict()
    for k, v in vectorizer.vocabulary_.items():  # iteritems() was Python 2 only
        id2words[v] = k
    train_corpus = Sparse2Corpus(train_X, documents_columns=False)
    test_corpus = Sparse2Corpus(test_X, documents_columns=False)
    return (train_corpus, test_corpus, id2words)
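
# Note on the corpus format gensim_prep above produces: Sparse2Corpus yields
# one list of (token_id, count) pairs per document, e.g. the toy corpus in the
# tests below becomes [[(0, 1)], [(1, 1)], [(2, 1)]].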


def gensim_run(test_mode=None,
               train_corpus=None,
               test_corpus=None,
               n_topics=None,
               n_jobs=None,
               max_iterations=None,
               id2words=None,
               decay=None,
               offset=None,
               total_samples=None,
               batch_size=None, update_after=None):
    np.random.seed(0)
    logger.info("=================starting gensim==================")
    logger_gs = logging.getLogger('gensim.models.ldamodel')
    logger_gs.setLevel(logging.DEBUG)
    t0 = time()
    if test_mode == 'batch':
        gs_batch = True
    else:
        logger.info("gensim online mode")
        gs_batch = False
    # NOTE: the original call also passed updateafter=update_after, but
    # LdaMulticore has no such keyword argument (its update interval is derived
    # from chunksize and workers), so that kwarg is dropped here and
    # update_after is unused.
    lda_gensim = LdaMulticore(
        train_corpus,
        id2word=id2words,
        batch=gs_batch,
        decay=decay,
        offset=offset,
        workers=n_jobs,
        num_topics=n_topics,
        passes=max_iterations,
        chunksize=batch_size,
        iterations=max_e_steps,
        eval_every=eval_every)  # how to pass total_samples?
    gs_time = (time() - t0)
    print("gensim done in %0.3fs." % gs_time)
    # lda_gensim.print_topics()
    train_log_prep_gensim = lda_gensim.log_perplexity(train_corpus)
    test_log_prep_gensim = lda_gensim.log_perplexity(test_corpus)
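    # log_perplexity returns gensim's per-word variational likelihood bound,
    # so exponentiating its negative below gives a perplexity intended to be
    # comparable to sklearn's perplexity() (both treated as per-word values).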
    train_perplexity_gensim = np.exp(-1. * train_log_prep_gensim)
    test_perplexity_gensim = np.exp(-1. * test_log_prep_gensim)
    print('gensim perplexity: train=%.3f, test=%.3f' %
          (train_perplexity_gensim, test_perplexity_gensim))
    return (train_perplexity_gensim, test_perplexity_gensim, gs_time)


def gensim_single_core_run(test_mode=None,
                           train_corpus=None,
                           test_corpus=None,
                           n_topics=None,
                           n_jobs=None,
                           max_iterations=None,
                           # special for gensim
                           id2words=None,
                           decay=None,
                           offset=None,
                           total_samples=None,
                           batch_size=None, update_after=None):
    # test_mode, n_jobs and update_after are accepted for interface symmetry
    # with gensim_run but are unused here
    np.random.seed(0)
    logger.info("=================starting gensim sc==================")
    logger_gs = logging.getLogger('gensim.models.ldamodel')
    logger_gs.setLevel(logging.DEBUG)
    t0 = time()
    lda_gensim = LdaModel(
        train_corpus,
        id2word=id2words,
        decay=decay,
        offset=offset,
        num_topics=n_topics,
        passes=max_iterations,
        chunksize=batch_size,
        iterations=max_e_steps,
        eval_every=eval_every)  # how to pass total_samples?
    gs_time = (time() - t0)
    print("gensim sc done in %0.3fs." % gs_time)
    # lda_gensim.print_topics()
    train_log_prep_gensim = lda_gensim.log_perplexity(train_corpus)
    test_log_prep_gensim = lda_gensim.log_perplexity(test_corpus)
    train_perplexity_gensim = np.exp(-1. * train_log_prep_gensim)
    test_perplexity_gensim = np.exp(-1. * test_log_prep_gensim)
    print('gensim sc perplexity: train=%.3f, test=%.3f' %
          (train_perplexity_gensim, test_perplexity_gensim))
    return (train_perplexity_gensim, test_perplexity_gensim, gs_time)


def run_all(load_data=None):
    train_data, test_data = load_data()
    train_X, test_X, vectorizer = pre_processing(
        train_data=train_data, test_data=test_data, max_features=n_features)
    sklearn_results = sklearn_run(
        test_mode=test_mode,
        train_X=train_X,
        test_X=test_X,
        n_topics=n_topics,
        n_jobs=n_jobs,
        max_iterations=max_iterations,
        vectorizer=vectorizer,
        decay=kappa,
        offset=tau0,
        total_samples=n_docs,
        batch_size=sk_batch_size)
    train_corpus, test_corpus, id2words = gensim_prep(
        train_X=train_X, test_X=test_X, vectorizer=vectorizer)
    if gs_single_core:
        gensim_results = gensim_single_core_run(
            test_mode=test_mode,
            train_corpus=train_corpus,
            test_corpus=test_corpus,
            n_topics=n_topics,
            n_jobs=n_jobs,
            max_iterations=max_iterations,
            id2words=id2words,
            decay=kappa,
            offset=tau0,
            total_samples=n_docs,
            batch_size=gs_batch_size,
            update_after=gensim_update_after)
    else:
        gensim_results = gensim_run(
            test_mode=test_mode,
            train_corpus=train_corpus,
            test_corpus=test_corpus,
            n_topics=n_topics,
            n_jobs=n_jobs,
            max_iterations=max_iterations,
            id2words=id2words,
            decay=kappa,
            offset=tau0,
            total_samples=n_docs,
            batch_size=gs_batch_size,
            update_after=gensim_update_after)
    # the original prints passed the values as extra print() arguments instead
    # of %-formatting them, which printed tuples; fixed here
    print('n_jobs = %s' % n_jobs)
    print('sklearn perplexity: train=%.3f, test=%.3f in %.3f seconds' %
          (sklearn_results[0], sklearn_results[1], sklearn_results[2]))
    print('gensim perplexity: train=%.3f, test=%.3f in %.3f seconds' %
          (gensim_results[0], gensim_results[1], gensim_results[2]))


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(topic)
        print("normalised:", topic / topic.sum())
        print(" ".join([feature_names[i] + '#' + str(i)
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()
    print('all topics')
    print(model.components_)


def main():
    run_all(load_data=load_dataset)


if __name__ == '__main__':
    main()
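
A minimal way to exercise the script above, assuming it is saved as comparison.py (the filename is implied by the test file's import below):

# Run the full 20 newsgroups comparison, either from the shell
# ($ python comparison.py) or interactively:
import comparison
comparison.run_all(load_data=comparison.load_dataset)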
test_comparison.py (assumed filename; it imports the file above as comparison):
__author__ = 'lev'

import comparison

n_docs = 3
n_features = n_docs
n_topics = 3
n_jobs = n_docs
max_iterations = 5  # 50
batch_size = 3  # 1 for gs, 3 for skl
update_after = n_docs
kappa = 0.5  # batch 0.5; 'decay' in gensim
tau0 = 1.  # 'offset' in gensim


def load_small_dataset():
    train = [u"apples", u"bananas", u"raspberries"]
    test = [u"bananas"]
    return train, test


def test_preprocessing():
    train, test = load_small_dataset()
    (train_X, test_X, vectorizer) = comparison.pre_processing(
        train_data=train, test_data=test, max_features=3, max_df=0.8, min_df=1)
    assert train_X.shape[0] == len(train)
    assert test_X.shape[0] == len(test)


def test_sklearn_batch_run():
    test_mode = 'batch'
    train_data, test_data = load_small_dataset()
    train_X, test_X, vectorizer = comparison.pre_processing(
        train_data=train_data, test_data=test_data, max_features=n_features,
        max_df=0.8, min_df=1)
    sklearn_perplexity = comparison.sklearn_run(test_mode=test_mode,
                                                train_X=train_X,
                                                test_X=test_X,
                                                n_topics=n_topics,
                                                n_jobs=n_jobs,
                                                max_iterations=max_iterations,
                                                vectorizer=vectorizer)
    assert sklearn_perplexity[0] == 11.182026161374607
    assert sklearn_perplexity[1] == 19.259947095857502
    # The original third assert compared sklearn_perplexity[2] (the elapsed
    # time) against a topic-word array with a bare `.all` (always truthy), so
    # it never actually checked anything; it is preserved only as a comment:
    # assert (sklearn_perplexity[2] == [[0.33492697, 1.3241937, 0.34087933]]).all
    # in gensim it is [[ 0.33800402  1.3144455   0.34755048]]


def test_gensim_preprocessing():
    train, test = load_small_dataset()
    (train_X, test_X, vectorizer) = comparison.pre_processing(
        train_data=train, test_data=test, max_features=3, max_df=0.8, min_df=1)
    train_corpus, test_corpus, id2words = comparison.gensim_prep(
        train_X=train_X, test_X=test_X, vectorizer=vectorizer)
    assert len(train_corpus) == len(train)
    assert list(train_corpus) == [[(0, 1)], [(1, 1)], [(2, 1)]]
    assert len(test_corpus) == len(test)
    assert list(test_corpus) == [[(1, 1)]]


def test_gensim_batch_run():
    test_mode = 'batch'
    train_data, test_data = load_small_dataset()
    train_X, test_X, vectorizer = comparison.pre_processing(
        train_data=train_data, test_data=test_data, max_features=n_features,
        max_df=0.8, min_df=1)
    train_corpus, test_corpus, id2words = comparison.gensim_prep(
        train_X=train_X, test_X=test_X, vectorizer=vectorizer)
    gensim_perplexity = comparison.gensim_run(test_mode=test_mode,
                                              train_corpus=train_corpus,
                                              test_corpus=test_corpus,
                                              n_topics=n_topics,
                                              n_jobs=n_jobs,
                                              max_iterations=max_iterations,
                                              id2words=id2words)
    print(gensim_perplexity)
    assert gensim_perplexity[0] == 11.18202581586212
    assert gensim_perplexity[1] == 19.259947213026244
    # [[ 0.33800402  1.3144455   0.34755048]]


def test_sklearn_online_run():
    test_mode = 'online'
    train_data, test_data = load_small_dataset()
    train_X, test_X, vectorizer = comparison.pre_processing(
        train_data=train_data, test_data=test_data, max_features=n_features,
        max_df=0.8, min_df=1)
    sklearn_perplexity = comparison.sklearn_run(
        test_mode=test_mode,
        train_X=train_X,
        test_X=test_X,
        n_topics=n_topics,
        n_jobs=n_jobs,
        max_iterations=max_iterations,
        vectorizer=vectorizer,
        decay=kappa,
        offset=tau0,
        total_samples=n_docs,
        batch_size=batch_size)
    assert sklearn_perplexity[0] == 10.476297933158504
    assert sklearn_perplexity[1] == 21.579590404225367
    # As in the batch test, the original `.all` assert on the timing element
    # was always truthy and is preserved only as a comment:
    # assert (sklearn_perplexity[2] == [[0.33492697, 1.3241937, 0.34087933]]).all


def test_gensim_online_run():
    test_mode = 'online'
    train_data, test_data = load_small_dataset()
    train_X, test_X, vectorizer = comparison.pre_processing(
        train_data=train_data, test_data=test_data, max_features=n_features,
        max_df=0.8, min_df=1)
    train_corpus, test_corpus, id2words = comparison.gensim_prep(
        train_X=train_X, test_X=test_X, vectorizer=vectorizer)
    gensim_perplexity = comparison.gensim_run(test_mode=test_mode,
                                              train_corpus=train_corpus,
                                              test_corpus=test_corpus,
                                              n_topics=n_topics,
                                              n_jobs=n_jobs,
                                              max_iterations=max_iterations,
                                              id2words=id2words,
                                              decay=kappa,
                                              offset=tau0,
                                              total_samples=n_docs,
                                              batch_size=1,  # one m-step per chunk
                                              update_after=update_after)
    print(gensim_perplexity)
    assert gensim_perplexity[0] == 12.074197413443118
    assert gensim_perplexity[1] == 19.414047426272397


def test_gensim_online_singlecore_run():
    test_mode = 'online'
    train_data, test_data = load_small_dataset()
    train_X, test_X, vectorizer = comparison.pre_processing(
        train_data=train_data, test_data=test_data, max_features=n_features,
        max_df=0.8, min_df=1)
    train_corpus, test_corpus, id2words = comparison.gensim_prep(
        train_X=train_X, test_X=test_X, vectorizer=vectorizer)
    gensim_perplexity = comparison.gensim_single_core_run(
        test_mode=test_mode,
        train_corpus=train_corpus,
        test_corpus=test_corpus,
        n_topics=n_topics,
        n_jobs=n_jobs,
        max_iterations=max_iterations,
        id2words=id2words,
        decay=kappa,
        offset=tau0,
        total_samples=n_docs,
        batch_size=1)
    print(gensim_perplexity)
    assert gensim_perplexity[0] == 10.474984476417523
    assert gensim_perplexity[1] == 21.580473227971478
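
The asserts above pin exact floating-point values, so they should only be expected to pass with the library versions the gist was written against. A sketch of running the quick deterministic tests by hand, assuming this file is saved as test_comparison.py next to comparison.py (both filenames are assumptions):

# Either run the module with a test runner ($ pytest test_comparison.py)
# or call individual tests directly:
import test_comparison
test_comparison.test_preprocessing()
test_comparison.test_gensim_preprocessing()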