-
-
Save chyikwei/2b2c68b9f9c46e4866c6 to your computer and use it in GitHub Desktop.
LDA profiling for mean_change code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Line-profile scikit-learn's online LDA E-step on the 20 newsgroups corpus.

Fits :class:`LatentDirichletAllocation` on a bag-of-words matrix and profiles
``_update_doc_distribution`` (the per-document E-step) line by line, to compare
the cost of the ``mean_change`` convergence check implementations.
"""
import line_profiler

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
# NOTE(review): private module path from the sklearn version this gist targets;
# in modern releases the module is sklearn.decomposition._lda -- confirm against
# the installed version before running.
from sklearn.decomposition.online_lda import _update_doc_distribution
from sklearn.feature_extraction.text import CountVectorizer

n_features = 5000
n_topics = 100  # NOTE: `n_topics` was renamed `n_components` in sklearn >= 0.19.
n_top_words = 20  # Unused in this script; kept from the original gist setup.

# Downloads the corpus on first use (network I/O, cached afterwards).
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data

# Bag-of-words features: drop near-ubiquitous terms (>95% of docs), hapax-like
# terms (<2 docs), and English stop words; cap the vocabulary at n_features.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

# Plain-Python equivalent of the IPython magic
#   %lprun -f _update_doc_distribution lda.fit(tf)
# so the script runs outside an IPython session.
profiler = line_profiler.LineProfiler(_update_doc_distribution)
profiler.runcall(lda.fit, tf)
profiler.print_stats()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: /Users/chyikwei/github/scikit-learn/sklearn/decomposition/online_lda.py | |
Function: _update_doc_distribution at line 55 | |
Total time: 62.5723 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
55 def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_iters, | |
56 mean_change_tol, cal_sstats, random_state): | |
57 """E-step: update document-topic distribution. | |
95 """ | |
96 445 2072 4.7 0.0 is_sparse_x = sp.issparse(X) | |
97 445 1562 3.5 0.0 n_samples, n_features = X.shape | |
98 445 968 2.2 0.0 n_topics = exp_topic_word_distr.shape[0] | |
99 | |
100 445 631 1.4 0.0 if random_state: | |
101 445 375408 843.6 0.6 doc_topic_distr = random_state.gamma(100., 0.01, (n_samples, n_topics)) | |
102 else: | |
103 doc_topic_distr = np.ones((n_samples, n_topics)) | |
104 | |
105 # In the literature, this is `exp(E[log(theta)])` | |
106 445 406215 912.8 0.6 exp_doc_topic = np.exp(_log_dirichlet_expectation(doc_topic_distr)) | |
107 | |
108 # diff on `component_` (only calculate it when `cal_diff` is True) | |
109 445 272284 611.9 0.4 suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None | |
110 | |
111 445 967 2.2 0.0 if is_sparse_x: | |
112 445 818 1.8 0.0 X_data = X.data | |
113 445 588 1.3 0.0 X_indices = X.indices | |
114 445 569 1.3 0.0 X_indptr = X.indptr | |
115 | |
116 57015 104059 1.8 0.2 for idx_d in xrange(n_samples): | |
117 56570 66859 1.2 0.1 if is_sparse_x: | |
118 56570 197957 3.5 0.3 ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]] | |
119 56570 123378 2.2 0.2 cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]] | |
120 else: | |
121 ids = np.nonzero(X[idx_d, :])[0] | |
122 cnts = X[idx_d, ids] | |
123 | |
124 56570 181113 3.2 0.3 doc_topic_d = doc_topic_distr[idx_d, :] | |
125 56570 112248 2.0 0.2 exp_doc_topic_d = exp_doc_topic[idx_d, :] | |
126 56570 2871002 50.8 4.6 exp_topic_word_d = exp_topic_word_distr[:, ids] | |
127 | |
128 # The optimal phi_{dwk} is proportional to | |
129 # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]). | |
130 56570 678083 12.0 1.1 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS | |
131 | |
132 # Iterate between `doc_topic_d` and `norm_phi` until convergence | |
133 970904 1252268 1.3 2.0 for _ in xrange(0, max_iters): | |
134 970806 1877461 1.9 3.0 last_d = doc_topic_d | |
135 | |
136 970806 1141431 1.2 1.8 doc_topic_d = (doc_topic_prior + exp_doc_topic_d * | |
137 970806 13609752 14.0 21.8 np.dot(cnts / norm_phi, exp_topic_word_d.T)) | |
138 970806 17626102 18.2 28.2 exp_doc_topic_d = np.exp(_log_dirichlet_expectation(doc_topic_d)) | |
139 970806 8847251 9.1 14.1 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS | |
140 | |
141 970806 3845608 4.0 6.1 meanchange = mean_change(last_d, doc_topic_d) | |
142 970806 1356006 1.4 2.2 if meanchange < mean_change_tol: | |
143 56472 69706 1.2 0.1 break | |
144 56570 229828 4.1 0.4 doc_topic_distr[idx_d, :] = doc_topic_d | |
145 | |
146 # Contribution of document d to the expected sufficient | |
147 # statistics for the M step. | |
148 56570 71634 1.3 0.1 if cal_sstats: | |
149 56570 7247878 128.1 11.6 suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi) | |
150 | |
151 445 580 1.3 0.0 return (doc_topic_distr, suff_stats) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: /Users/chyikwei/github/scikit-learn/sklearn/decomposition/online_lda.py | |
Function: _update_doc_distribution at line 55 | |
Total time: 91.5451 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
55 def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_iters, | |
56 mean_change_tol, cal_sstats, random_state): | |
57 """E-step: update document-topic distribution. | |
95 """ | |
96 445 1970 4.4 0.0 is_sparse_x = sp.issparse(X) | |
97 445 1482 3.3 0.0 n_samples, n_features = X.shape | |
98 445 987 2.2 0.0 n_topics = exp_topic_word_distr.shape[0] | |
99 | |
100 445 603 1.4 0.0 if random_state: | |
101 445 372312 836.7 0.4 doc_topic_distr = random_state.gamma(100., 0.01, (n_samples, n_topics)) | |
102 else: | |
103 doc_topic_distr = np.ones((n_samples, n_topics)) | |
104 | |
105 # In the literature, this is `exp(E[log(theta)])` | |
106 445 403094 905.8 0.4 exp_doc_topic = np.exp(_log_dirichlet_expectation(doc_topic_distr)) | |
107 | |
108 # diff on `component_` (only calculate it when `cal_diff` is True) | |
109 445 246014 552.8 0.3 suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None | |
110 | |
111 445 978 2.2 0.0 if is_sparse_x: | |
112 445 869 2.0 0.0 X_data = X.data | |
113 445 612 1.4 0.0 X_indices = X.indices | |
114 445 596 1.3 0.0 X_indptr = X.indptr | |
115 | |
116 57015 103765 1.8 0.1 for idx_d in xrange(n_samples): | |
117 56570 68083 1.2 0.1 if is_sparse_x: | |
118 56570 195278 3.5 0.2 ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]] | |
119 56570 126524 2.2 0.1 cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]] | |
120 else: | |
121 ids = np.nonzero(X[idx_d, :])[0] | |
122 cnts = X[idx_d, ids] | |
123 | |
124 56570 180917 3.2 0.2 doc_topic_d = doc_topic_distr[idx_d, :] | |
125 56570 116897 2.1 0.1 exp_doc_topic_d = exp_doc_topic[idx_d, :] | |
126 56570 2803589 49.6 3.1 exp_topic_word_d = exp_topic_word_distr[:, ids] | |
127 | |
128 # The optimal phi_{dwk} is proportional to | |
129 # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]). | |
130 56570 676765 12.0 0.7 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS | |
131 | |
132 # Iterate between `doc_topic_d` and `norm_phi` until convergence | |
133 970904 1300270 1.3 1.4 for _ in xrange(0, max_iters): | |
134 970806 2065190 2.1 2.3 last_d = doc_topic_d | |
135 | |
136 970806 1181293 1.2 1.3 doc_topic_d = (doc_topic_prior + exp_doc_topic_d * | |
137 970806 14890168 15.3 16.3 np.dot(cnts / norm_phi, exp_topic_word_d.T)) | |
138 970806 18474403 19.0 20.2 exp_doc_topic_d = np.exp(_log_dirichlet_expectation(doc_topic_d)) | |
139 970806 9081853 9.4 9.9 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS | |
140 | |
141 #meanchange = mean_change(last_d, doc_topic_d) | |
142 970806 30065266 31.0 32.8 meanchange = np.mean(abs(last_d - doc_topic_d)) | |
143 970806 1654795 1.7 1.8 if meanchange < mean_change_tol: | |
144 56472 72215 1.3 0.1 break | |
145 56570 241869 4.3 0.3 doc_topic_distr[idx_d, :] = doc_topic_d | |
146 | |
147 # Contribution of document d to the expected sufficient | |
148 # statistics for the M step. | |
149 56570 74521 1.3 0.1 if cal_sstats: | |
150 56570 7141256 126.2 7.8 suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi) | |
151 | |
152 445 625 1.4 0.0 return (doc_topic_distr, suff_stats) |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.