Created
October 14, 2014 00:07
-
-
Save chyikwei/59c3f024ff3148efe1df to your computer and use it in GitHub Desktop.
online LDA profiling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: _em_step at line 233 | |
Total time: 144.169 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
233 @profile | |
234 def _em_step(self, X, batch_update): | |
235 """ | |
236 EM update for 1 iteration | |
237 | |
238 parameters | |
239 ---------- | |
240 X: sparse matrix | |
241 | |
242 batch_update: boolean | |
243         update `_component` by batch VB or online VB | |
244 """ | |
245 # e-step | |
246 10 144161358 14416135.8 100.0 gamma, delta_component = self._e_step(X, cal_delta=True) | |
247 | |
248 # m-step | |
249 10 9 0.9 0.0 if batch_update: | |
250 10 306 30.6 0.0 self.components_ = self.eta + delta_component | |
251 else: | |
252 # online update | |
253 rhot = np.power(self.tau + self.n_iter_, -self.kappa) | |
254 doc_ratio = float(self.n_docs) / X.shape[0] | |
255 self.components_ *= (1 - rhot) | |
256 self.components_ += (rhot * | |
257 (self.eta + doc_ratio * delta_component)) | |
258 | |
259 10 6337 633.7 0.0 self.Elogbeta = _dirichlet_expectation(self.components_) | |
260 10 1068 106.8 0.0 self.expElogbeta = np.exp(self.Elogbeta) | |
261 10 17 1.7 0.0 self.n_iter_ += 1 | |
262 | |
263 10 12 1.2 0.0 return gamma |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: _update_gamma at line 32 | |
Total time: 122.966 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
32 @profile | |
33 def _update_gamma(X, expElogbeta, alpha, rng, max_iters, | |
34 meanchangethresh, cal_delta): | |
35 """ | |
36 E-step: update latent variable gamma | |
37 """ | |
38 | |
39 10 28 2.8 0.0 n_docs, n_vocabs = X.shape | |
40 10 21 2.1 0.0 n_topics = expElogbeta.shape[0] | |
41 | |
42         # gamma is non-normalized topic distribution | |
43 10 25453 2545.3 0.0 gamma = rng.gamma(100., 1. / 100., (n_docs, n_topics)) | |
44 10 28540 2854.0 0.0 expElogtheta = np.exp(_dirichlet_expectation(gamma)) | |
45         # diff on component (only calculated when cal_delta is True) | |
46 10 95 9.5 0.0 delta_component = np.zeros(expElogbeta.shape) if cal_delta else None | |
47 | |
48 10 17 1.7 0.0 X_data = X.data | |
49 10 11 1.1 0.0 X_indices = X.indices | |
50 10 15 1.5 0.0 X_indptr = X.indptr | |
51 | |
52 40010 64327 1.6 0.1 for d in xrange(n_docs): | |
53 40000 131201 3.3 0.1 ids = X_indices[X_indptr[d]:X_indptr[d + 1]] | |
54 40000 101176 2.5 0.1 cnts = X_data[X_indptr[d]:X_indptr[d + 1]] | |
55 40000 157978 3.9 0.1 gammad = gamma[d, :] | |
56 40000 140439 3.5 0.1 expElogthetad = expElogtheta[d, :] | |
57 40000 558147 14.0 0.5 expElogbetad = expElogbeta[:, ids] | |
58 # The optimal phi_{dwk} is proportional to | |
59 # expElogthetad_k * expElogbetad_w. phinorm is the normalizer. | |
60 40000 412485 10.3 0.3 phinorm = np.dot(expElogthetad, expElogbetad) + 1e-100 | |
61 | |
62 # Iterate between gamma and phi until convergence | |
63 1270009 1590740 1.3 1.3 for it in xrange(0, max_iters): | |
64 1267843 2018700 1.6 1.6 lastgamma = gammad | |
65 # We represent phi implicitly to save memory and time. | |
66 # Substituting the value of the optimal phi back into | |
67 # the update for gamma gives this update. Cf. Lee&Seung 2001. | |
68 1267843 1415554 1.1 1.2 gammad = alpha + expElogthetad * \ | |
69 1267843 19351733 15.3 15.7 np.dot(cnts / phinorm, expElogbetad.T) | |
70 1267843 32689880 25.8 26.6 tmp = _dirichlet_expectation(gammad) | |
71 1267843 5131640 4.0 4.2 expElogthetad = np.exp(tmp) | |
72 1267843 5214700 4.1 4.2 phinorm = np.dot(expElogthetad, expElogbetad) | |
73 1267843 5520713 4.4 4.5 phinorm += 1e-100 | |
74 1267843 6854410 5.4 5.6 tmp2 = np.absolute(gammad - lastgamma) | |
75 1267843 36903876 29.1 30.0 meanchange = np.mean(tmp2) | |
76 1267843 1988535 1.6 1.6 if (meanchange < meanchangethresh): | |
77 37834 45863 1.2 0.0 break | |
78 40000 262022 6.6 0.2 gamma[d, :] = gammad | |
79 # Contribution of document d to the expected sufficient | |
80 # statistics for the M step. | |
81 40000 51594 1.3 0.0 if cal_delta: | |
82 40000 2305939 57.6 1.9 delta_component[:, ids] += np.outer(expElogthetad, cnts / phinorm) | |
83 | |
84 10 14 1.4 0.0 return (gamma, delta_component) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File: lda.py | |
Function: fit at line 350 | |
Total time: 199.511 s | |
Line # Hits Time Per Hit % Time Line Contents | |
============================================================== | |
350 @profile | |
351 def fit(self, X, y=None, max_iters=10): | |
352 """ | |
353 Learn model from X. This function is for batch learning. | |
354 So it will override the old _component variables | |
355 | |
356 Parameters | |
357 ---------- | |
358 X: sparse matrix, shape = (n_docs, n_vocabs) | |
359 Data matrix to be transformed by the model | |
360 | |
361 max_iters: int, (default: 10) | |
362 Max number of iterations | |
363 | |
364 Returns | |
365 ------- | |
366 self | |
367 """ | |
368 | |
369 1 55 55.0 0.0 X = self._to_csr(X) | |
370 1 3 3.0 0.0 n_docs, n_vocabs = X.shape | |
371 | |
372 # initialize parameters | |
373 1 1466 1466.0 0.0 self._init_latent_vars(n_vocabs) | |
374 | |
375         # change to perplexity later | |
376 1 1 1.0 0.0 last_bound = None | |
377 11 34 3.1 0.0 for i in xrange(max_iters): | |
378 10 144169860 14416986.0 72.3 gamma = self._em_step(X, batch_update=True) | |
379 | |
380         # check perplexity | |
381 10 55338838 5533883.8 27.7 bound = self.preplexity(X, gamma, sub_sampling=False) | |
382 10 15 1.5 0.0 if self.verbose: | |
383 10 304 30.4 0.0 print('iteration: %d, preplexity: %.4f' % (i, bound)) | |
384 | |
385 10 43 4.3 0.0 if i > 0 and abs(last_bound - bound) < self.prex_tol: | |
386 break | |
387 10 13 1.3 0.0 last_bound = bound | |
388 | |
389 1 1 1.0 0.0 return self |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment