Skip to content

Instantly share code, notes, and snippets.

@chyikwei
Last active August 29, 2015 14:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chyikwei/2b2c68b9f9c46e4866c6 to your computer and use it in GitHub Desktop.
Save chyikwei/2b2c68b9f9c46e4866c6 to your computer and use it in GitHub Desktop.
LDA profiling for the `mean_change` code: the first run below uses the `mean_change` helper (total 62.6 s), the second replaces it with a plain `np.mean(abs(last_d - doc_topic_d))` (total 91.5 s, where that line alone takes 32.8% of the time).
import line_profiler
from sklearn.decomposition.online_lda import _update_doc_distribution, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
n_features = 5000
n_topics = 100
n_top_words = 20
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
learning_method='online', learning_offset=50.,
random_state=0)
%lprun -f _update_doc_distribution lda.fit(tf)
File: /Users/chyikwei/github/scikit-learn/sklearn/decomposition/online_lda.py
Function: _update_doc_distribution at line 55
Total time: 62.5723 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
55 def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_iters,
56 mean_change_tol, cal_sstats, random_state):
57 """E-step: update document-topic distribution.
95 """
96 445 2072 4.7 0.0 is_sparse_x = sp.issparse(X)
97 445 1562 3.5 0.0 n_samples, n_features = X.shape
98 445 968 2.2 0.0 n_topics = exp_topic_word_distr.shape[0]
99
100 445 631 1.4 0.0 if random_state:
101 445 375408 843.6 0.6 doc_topic_distr = random_state.gamma(100., 0.01, (n_samples, n_topics))
102 else:
103 doc_topic_distr = np.ones((n_samples, n_topics))
104
105 # In the literature, this is `exp(E[log(theta)])`
106 445 406215 912.8 0.6 exp_doc_topic = np.exp(_log_dirichlet_expectation(doc_topic_distr))
107
108 # diff on `component_` (only calculate it when `cal_diff` is True)
109 445 272284 611.9 0.4 suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None
110
111 445 967 2.2 0.0 if is_sparse_x:
112 445 818 1.8 0.0 X_data = X.data
113 445 588 1.3 0.0 X_indices = X.indices
114 445 569 1.3 0.0 X_indptr = X.indptr
115
116 57015 104059 1.8 0.2 for idx_d in xrange(n_samples):
117 56570 66859 1.2 0.1 if is_sparse_x:
118 56570 197957 3.5 0.3 ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]
119 56570 123378 2.2 0.2 cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]
120 else:
121 ids = np.nonzero(X[idx_d, :])[0]
122 cnts = X[idx_d, ids]
123
124 56570 181113 3.2 0.3 doc_topic_d = doc_topic_distr[idx_d, :]
125 56570 112248 2.0 0.2 exp_doc_topic_d = exp_doc_topic[idx_d, :]
126 56570 2871002 50.8 4.6 exp_topic_word_d = exp_topic_word_distr[:, ids]
127
128 # The optimal phi_{dwk} is proportional to
129 # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).
130 56570 678083 12.0 1.1 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
131
132 # Iterate between `doc_topic_d` and `norm_phi` until convergence
133 970904 1252268 1.3 2.0 for _ in xrange(0, max_iters):
134 970806 1877461 1.9 3.0 last_d = doc_topic_d
135
136 970806 1141431 1.2 1.8 doc_topic_d = (doc_topic_prior + exp_doc_topic_d *
137 970806 13609752 14.0 21.8 np.dot(cnts / norm_phi, exp_topic_word_d.T))
138 970806 17626102 18.2 28.2 exp_doc_topic_d = np.exp(_log_dirichlet_expectation(doc_topic_d))
139 970806 8847251 9.1 14.1 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
140
141 970806 3845608 4.0 6.1 meanchange = mean_change(last_d, doc_topic_d)
142 970806 1356006 1.4 2.2 if meanchange < mean_change_tol:
143 56472 69706 1.2 0.1 break
144 56570 229828 4.1 0.4 doc_topic_distr[idx_d, :] = doc_topic_d
145
146 # Contribution of document d to the expected sufficient
147 # statistics for the M step.
148 56570 71634 1.3 0.1 if cal_sstats:
149 56570 7247878 128.1 11.6 suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)
150
151 445 580 1.3 0.0 return (doc_topic_distr, suff_stats)
File: /Users/chyikwei/github/scikit-learn/sklearn/decomposition/online_lda.py
Function: _update_doc_distribution at line 55
Total time: 91.5451 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
55 def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_iters,
56 mean_change_tol, cal_sstats, random_state):
57 """E-step: update document-topic distribution.
95 """
96 445 1970 4.4 0.0 is_sparse_x = sp.issparse(X)
97 445 1482 3.3 0.0 n_samples, n_features = X.shape
98 445 987 2.2 0.0 n_topics = exp_topic_word_distr.shape[0]
99
100 445 603 1.4 0.0 if random_state:
101 445 372312 836.7 0.4 doc_topic_distr = random_state.gamma(100., 0.01, (n_samples, n_topics))
102 else:
103 doc_topic_distr = np.ones((n_samples, n_topics))
104
105 # In the literature, this is `exp(E[log(theta)])`
106 445 403094 905.8 0.4 exp_doc_topic = np.exp(_log_dirichlet_expectation(doc_topic_distr))
107
108 # diff on `component_` (only calculate it when `cal_diff` is True)
109 445 246014 552.8 0.3 suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None
110
111 445 978 2.2 0.0 if is_sparse_x:
112 445 869 2.0 0.0 X_data = X.data
113 445 612 1.4 0.0 X_indices = X.indices
114 445 596 1.3 0.0 X_indptr = X.indptr
115
116 57015 103765 1.8 0.1 for idx_d in xrange(n_samples):
117 56570 68083 1.2 0.1 if is_sparse_x:
118 56570 195278 3.5 0.2 ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]
119 56570 126524 2.2 0.1 cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]
120 else:
121 ids = np.nonzero(X[idx_d, :])[0]
122 cnts = X[idx_d, ids]
123
124 56570 180917 3.2 0.2 doc_topic_d = doc_topic_distr[idx_d, :]
125 56570 116897 2.1 0.1 exp_doc_topic_d = exp_doc_topic[idx_d, :]
126 56570 2803589 49.6 3.1 exp_topic_word_d = exp_topic_word_distr[:, ids]
127
128 # The optimal phi_{dwk} is proportional to
129 # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).
130 56570 676765 12.0 0.7 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
131
132 # Iterate between `doc_topic_d` and `norm_phi` until convergence
133 970904 1300270 1.3 1.4 for _ in xrange(0, max_iters):
134 970806 2065190 2.1 2.3 last_d = doc_topic_d
135
136 970806 1181293 1.2 1.3 doc_topic_d = (doc_topic_prior + exp_doc_topic_d *
137 970806 14890168 15.3 16.3 np.dot(cnts / norm_phi, exp_topic_word_d.T))
138 970806 18474403 19.0 20.2 exp_doc_topic_d = np.exp(_log_dirichlet_expectation(doc_topic_d))
139 970806 9081853 9.4 9.9 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
140
141 #meanchange = mean_change(last_d, doc_topic_d)
142 970806 30065266 31.0 32.8 meanchange = np.mean(abs(last_d - doc_topic_d))
143 970806 1654795 1.7 1.8 if meanchange < mean_change_tol:
144 56472 72215 1.3 0.1 break
145 56570 241869 4.3 0.3 doc_topic_distr[idx_d, :] = doc_topic_d
146
147 # Contribution of document d to the expected sufficient
148 # statistics for the M step.
149 56570 74521 1.3 0.1 if cal_sstats:
150 56570 7141256 126.2 7.8 suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)
151
152 445 625 1.4 0.0 return (doc_topic_distr, suff_stats)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment