@chyikwei
Created October 14, 2014 00:07
online LDA profiling
File: lda.py
Function: _em_step at line 233
Total time: 144.169 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
233 @profile
234 def _em_step(self, X, batch_update):
235 """
236 EM update for 1 iteration
237
238 parameters
239 ----------
240 X: sparse matrix
241
242 batch_update: boolean
243 update `_component` by batch VB or online VB
244 """
245 # e-step
246 10 144161358 14416135.8 100.0 gamma, delta_component = self._e_step(X, cal_delta=True)
247
248 # m-step
249 10 9 0.9 0.0 if batch_update:
250 10 306 30.6 0.0 self.components_ = self.eta + delta_component
251 else:
252 # online update
253 rhot = np.power(self.tau + self.n_iter_, -self.kappa)
254 doc_ratio = float(self.n_docs) / X.shape[0]
255 self.components_ *= (1 - rhot)
256 self.components_ += (rhot *
257 (self.eta + doc_ratio * delta_component))
258
259 10 6337 633.7 0.0 self.Elogbeta = _dirichlet_expectation(self.components_)
260 10 1068 106.8 0.0 self.expElogbeta = np.exp(self.Elogbeta)
261 10 17 1.7 0.0 self.n_iter_ += 1
262
263 10 12 1.2 0.0 return gamma
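Nearly all of the 144.2 s in _em_step is the single call to self._e_step on line 246; both M-step branches are negligible, and the online branch (lines 253-257) is not exercised in this batch run. For reference, that online branch blends the old topic-word statistics with the new batch statistics using a decaying weight. A minimal standalone sketch of the same update, written as a pure function rather than the in-place form above; the default tau and kappa values are illustrative, not taken from this gist:

import numpy as np

def online_m_step(components, delta_component, eta, n_iter,
                  n_docs, batch_size, tau=1.0, kappa=0.7):
    # Sketch of the online VB update used when batch_update is False.
    rhot = np.power(tau + n_iter, -kappa)       # decaying learning rate
    doc_ratio = float(n_docs) / batch_size      # rescale batch stats to corpus size
    # convex combination of old statistics and the rescaled new statistics
    return (1.0 - rhot) * components + rhot * (eta + doc_ratio * delta_component)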
File: lda.py
Function: _update_gamma at line 32
Total time: 122.966 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
32 @profile
33 def _update_gamma(X, expElogbeta, alpha, rng, max_iters,
34 meanchangethresh, cal_delta):
35 """
36 E-step: update latent variable gamma
37 """
38
39 10 28 2.8 0.0 n_docs, n_vocabs = X.shape
40 10 21 2.1 0.0 n_topics = expElogbeta.shape[0]
41
42 # gamma is non-normalized topic distribution
43 10 25453 2545.3 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics))
44 10 28540 2854.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma))
45 # diff on component (only calculate it when cal_delta is True)
46 10 95 9.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None
47
48 10 17 1.7 0.0 X_data = X.data
49 10 11 1.1 0.0 X_indices = X.indices
50 10 15 1.5 0.0 X_indptr = X.indptr
51
52 40010 64327 1.6 0.1 for d in xrange(n_docs):
53 40000 131201 3.3 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]]
54 40000 101176 2.5 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]]
55 40000 157978 3.9 0.1 gammad = gamma[d, :]
56 40000 140439 3.5 0.1 expElogthetad = expElogtheta[d, :]
57 40000 558147 14.0 0.5 expElogbetad = expElogbeta[:, ids]
58 # The optimal phi_{dwk} is proportional to
59 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer.
60 40000 412485 10.3 0.3 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100
61
62 # Iterate between gamma and phi until convergence
63 1270009 1590740 1.3 1.3 for it in xrange(0, max_iters):
64 1267843 2018700 1.6 1.6 lastgamma = gammad
65 # We represent phi implicitly to save memory and time.
66 # Substituting the value of the optimal phi back into
67 # the update for gamma gives this update. Cf. Lee&Seung 2001.
68 1267843 1415554 1.1 1.2 gammad = alpha + expElogthetad * \
69 1267843 19351733 15.3 15.7 np.dot(cnts / phinorm, expElogbetad.T)
70 1267843 32689880 25.8 26.6 tmp = _dirichlet_expectation(gammad)
71 1267843 5131640 4.0 4.2 expElogthetad = np.exp(tmp)
72 1267843 5214700 4.1 4.2 phinorm = np.dot(expElogthetad, expElogbetad)
73 1267843 5520713 4.4 4.5 phinorm += 1e-100
74 1267843 6854410 5.4 5.6 tmp2 = np.absolute(gammad - lastgamma)
75 1267843 36903876 29.1 30.0 meanchange = np.mean(tmp2)
76 1267843 1988535 1.6 1.6 if (meanchange < meanchangethresh):
77 37834 45863 1.2 0.0 break
78 40000 262022 6.6 0.2 gamma[d, :] = gammad
79 # Contribution of document d to the expected sufficient
80 # statistics for the M step.
81 40000 51594 1.3 0.0 if cal_delta:
82 40000 2305939 57.6 1.9 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm)
83
84 10 14 1.4 0.0 return (gamma, delta_component)
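The two dominant lines in the inner loop are _dirichlet_expectation(gammad) at 26.6% (line 70) and the np.mean convergence check at 30.0% (line 75). Both operate on length-n_topics vectors, so the cost is largely NumPy call overhead repeated about 1.27 million times; they are natural candidates for a small compiled helper. The body of _dirichlet_expectation is not shown in this gist; a sketch of the usual definition (E[log theta] for a Dirichlet, via the digamma function, as in Hoffman et al.'s onlineldavb code) is given here as an assumption:

import numpy as np
from scipy.special import psi   # digamma function

def _dirichlet_expectation(alpha):
    # E[log theta] for theta ~ Dirichlet(alpha); handles 1-D and 2-D input.
    if alpha.ndim == 1:
        return psi(alpha) - psi(np.sum(alpha))
    return psi(alpha) - psi(np.sum(alpha, axis=1))[:, np.newaxis]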
File: lda.py
Function: fit at line 350
Total time: 199.511 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
350 @profile
351 def fit(self, X, y=None, max_iters=10):
352 """
353 Learn a model from X. This function is for batch learning,
354 so it will override the old _component variables.
355
356 Parameters
357 ----------
358 X: sparse matrix, shape = (n_docs, n_vocabs)
359 Data matrix to be transformed by the model
360
361 max_iters: int, (default: 10)
362 Max number of iterations
363
364 Returns
365 -------
366 self
367 """
368
369 1 55 55.0 0.0 X = self._to_csr(X)
370 1 3 3.0 0.0 n_docs, n_vocabs = X.shape
371
372 # initialize parameters
373 1 1466 1466.0 0.0 self._init_latent_vars(n_vocabs)
374
375 # change to perplexity later
376 1 1 1.0 0.0 last_bound = None
377 11 34 3.1 0.0 for i in xrange(max_iters):
378 10 144169860 14416986.0 72.3 gamma = self._em_step(X, batch_update=True)
379
380 # check perplexity
381 10 55338838 5533883.8 27.7 bound = self.preplexity(X, gamma, sub_sampling=False)
382 10 15 1.5 0.0 if self.verbose:
383 10 304 30.4 0.0 print('iteration: %d, preplexity: %.4f' % (i, bound))
384
385 10 43 4.3 0.0 if i > 0 and abs(last_bound - bound) < self.prex_tol:
386 break
387 10 13 1.3 0.0 last_bound = bound
388
389 1 1 1.0 0.0 return self
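Of the 199.5 s spent in fit, about 72.3% goes to the ten _em_step calls and 27.7% to the perplexity bound computed every iteration. From the counts in _update_gamma (40,000 document iterations over 10 E-steps), the input appears to be a batch of 4,000 documents. A hypothetical driver consistent with those numbers is sketched below; the class name OnlineLDA, its constructor arguments, and the vocabulary size are assumptions, not taken from the gist:

import numpy as np
import scipy.sparse as sp
from lda import OnlineLDA       # class name is assumed; only lda.py methods are shown

rng = np.random.RandomState(0)
# 4000 documents to match the 40000-hit document loop over 10 EM iterations;
# vocabulary size and sparsity are arbitrary.
X = sp.random(4000, 2000, density=0.01, format='csr', random_state=rng)
X.data = np.ceil(X.data * 5.0)  # turn uniform floats into small word counts

lda = OnlineLDA(n_topics=10, verbose=1)   # constructor arguments are assumed
lda.fit(X, max_iters=10)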