Created
October 14, 2014 00:07
-
-
Save chyikwei/59c3f024ff3148efe1df to your computer and use it in GitHub Desktop.
online LDA profiling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: _em_step at line 233 | |
Total time: 144.169 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
233 @profile | |
234 def _em_step(self, X, batch_update): | |
235 """ | |
236 EM update for 1 iteration | |
237 | |
238 parameters | |
239 ---------- | |
240 X: sparse matrix | |
241 | |
242 batch_update: boolean | |
243         update `_component` by batch VB or online VB | |
244 """ | |
245 # e-step | |
246 10 144161358 14416135.8 100.0 gamma, delta_component = self._e_step(X, cal_delta=True) | |
247 | |
248 # m-step | |
249 10 9 0.9 0.0 if batch_update: | |
250 10 306 30.6 0.0 self.components_ = self.eta + delta_component | |
251 else: | |
252 # online update | |
253 rhot = np.power(self.tau + self.n_iter_, -self.kappa) | |
254 doc_ratio = float(self.n_docs) / X.shape[0] | |
255 self.components_ *= (1 - rhot) | |
256 self.components_ += (rhot * | |
257 (self.eta + doc_ratio * delta_component)) | |
258 | |
259 10 6337 633.7 0.0 self.Elogbeta = _dirichlet_expectation(self.components_) | |
260 10 1068 106.8 0.0 self.expElogbeta = np.exp(self.Elogbeta) | |
261 10 17 1.7 0.0 self.n_iter_ += 1 | |
262 | |
263 10 12 1.2 0.0 return gamma |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: _update_gamma at line 32 | |
Total time: 122.966 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
32 @profile | |
33 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, | |
34 meanchangethresh, cal_delta): | |
35 """ | |
36 E-step: update latent variable gamma | |
37 """ | |
38 | |
39 10 28 2.8 0.0 n_docs, n_vocabs = X.shape | |
40 10 21 2.1 0.0 n_topics = expElogbeta.shape[0] | |
41 | |
42         # gamma is non-normalized topic distribution | |
43 10 25453 2545.3 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics)) | |
44 10 28540 2854.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) | |
45         # diff on component (only calculated when cal_delta is True) | |
46 10 95 9.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None | |
47 | |
48 10 17 1.7 0.0 X_data = X.data | |
49 10 11 1.1 0.0 X_indices = X.indices | |
50 10 15 1.5 0.0 X_indptr = X.indptr | |
51 | |
52 40010 64327 1.6 0.1 for d in xrange(n_docs): | |
53 40000 131201 3.3 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] | |
54 40000 101176 2.5 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] | |
55 40000 157978 3.9 0.1 gammad = gamma[d, :] | |
56 40000 140439 3.5 0.1 expElogthetad = expElogtheta[d, :] | |
57 40000 558147 14.0 0.5 expElogbetad = expElogbeta[:, ids] | |
58 # The optimal phi_{dwk} is proportional to | |
59 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. | |
60 40000 412485 10.3 0.3 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
61 | |
62 # Iterate between gamma and phi until convergence | |
63 1270009 1590740 1.3 1.3 for it in xrange(0, max_iters): | |
64 1267843 2018700 1.6 1.6 lastgamma = gammad | |
65 # We represent phi implicitly to save memory and time. | |
66 # Substituting the value of the optimal phi back into | |
67 # the update for gamma gives this update. Cf. Lee&Seung 2001. | |
68 1267843 1415554 1.1 1.2 gammad = alpha + expElogthetad * \ | |
69 1267843 19351733 15.3 15.7 np.dot(cnts / phinorm, expElogbetad.T) | |
70 1267843 32689880 25.8 26.6 tmp = _dirichlet_expectation(gammad) | |
71 1267843 5131640 4.0 4.2 expElogthetad = np.exp(tmp) | |
72 1267843 5214700 4.1 4.2 phinorm = np.dot(expElogthetad, expElogbetad) | |
73 1267843 5520713 4.4 4.5 phinorm += 1e-100 | |
74 1267843 6854410 5.4 5.6 tmp2 = np.absolute(gammad - lastgamma) | |
75 1267843 36903876 29.1 30.0 meanchange = np.mean(tmp2) | |
76 1267843 1988535 1.6 1.6 if (meanchange < meanchangethresh): | |
77 37834 45863 1.2 0.0 break | |
78 40000 262022 6.6 0.2 gamma[d, :] = gammad | |
79 # Contribution of document d to the expected sufficient | |
80 # statistics for the M step. | |
81 40000 51594 1.3 0.0 if cal_delta: | |
82 40000 2305939 57.6 1.9 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) | |
83 | |
84 10 14 1.4 0.0 return (gamma, delta_component) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: fit at line 350 | |
Total time: 199.511 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
350 @profile | |
351 def fit(self, X, y=None, max_iters=10): | |
352 """ | |
353 Learn model from X. This function is for batch learning. | |
354 So it will override the old _component variables | |
355 | |
356 Parameters | |
357 ---------- | |
358 X: sparse matrix, shape = (n_docs, n_vocabs) | |
359 Data matrix to be transformed by the model | |
360 | |
361 max_iters: int, (default: 10) | |
362 Max number of iterations | |
363 | |
364 Returns | |
365 ------- | |
366 self | |
367 """ | |
368 | |
369 1 55 55.0 0.0 X = self._to_csr(X) | |
370 1 3 3.0 0.0 n_docs, n_vocabs = X.shape | |
371 | |
372 # initialize parameters | |
373 1 1466 1466.0 0.0 self._init_latent_vars(n_vocabs) | |
374 | |
375         # change to perplexity later | |
376 1 1 1.0 0.0 last_bound = None | |
377 11 34 3.1 0.0 for i in xrange(max_iters): | |
378 10 144169860 14416986.0 72.3 gamma = self._em_step(X, batch_update=True) | |
379 | |
380         # check perplexity | |
381 10 55338838 5533883.8 27.7 bound = self.preplexity(X, gamma, sub_sampling=False) | |
382 10 15 1.5 0.0 if self.verbose: | |
383 10 304 30.4 0.0 print('iteration: %d, preplexity: %.4f' % (i, bound)) | |
384 | |
385 10 43 4.3 0.0 if i > 0 and abs(last_bound - bound) < self.prex_tol: | |
386 break | |
387 10 13 1.3 0.0 last_bound = bound | |
388 | |
389 1 1 1.0 0.0 return self |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment