Skip to content

Instantly share code, notes, and snippets.

@chyikwei
Last active August 29, 2015 14:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chyikwei/2b2c68b9f9c46e4866c6 to your computer and use it in GitHub Desktop.
Save chyikwei/2b2c68b9f9c46e4866c6 to your computer and use it in GitHub Desktop.
LDA profiling for the `mean_change` code: the first run below uses the `mean_change` helper (total 62.6 s), the second replaces it with a plain `np.mean(abs(last_d - doc_topic_d))` (total 91.5 s, where that line alone takes 32.8% of the time).
import line_profiler
from sklearn.decomposition.online_lda import _update_doc_distribution, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
n_features = 5000
n_topics = 100
n_top_words = 20
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
learning_method='online', learning_offset=50.,
random_state=0)
%lprun -f _update_doc_distribution lda.fit(tf)
File: /Users/chyikwei/github/scikit-learn/sklearn/decomposition/online_lda.py
Function: _update_doc_distribution at line 55
Total time: 62.5723 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
55 def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_iters,
56 mean_change_tol, cal_sstats, random_state):
57 """E-step: update document-topic distribution.
95 """
96 445 2072 4.7 0.0 is_sparse_x = sp.issparse(X)
97 445 1562 3.5 0.0 n_samples, n_features = X.shape
98 445 968 2.2 0.0 n_topics = exp_topic_word_distr.shape[0]
99
100 445 631 1.4 0.0 if random_state:
101 445 375408 843.6 0.6 doc_topic_distr = random_state.gamma(100., 0.01, (n_samples, n_topics))
102 else:
103 doc_topic_distr = np.ones((n_samples, n_topics))
104
105 # In the literature, this is `exp(E[log(theta)])`
106 445 406215 912.8 0.6 exp_doc_topic = np.exp(_log_dirichlet_expectation(doc_topic_distr))
107
108 # diff on `component_` (only calculate it when `cal_diff` is True)
109 445 272284 611.9 0.4 suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None
110
111 445 967 2.2 0.0 if is_sparse_x:
112 445 818 1.8 0.0 X_data = X.data
113 445 588 1.3 0.0 X_indices = X.indices
114 445 569 1.3 0.0 X_indptr = X.indptr
115
116 57015 104059 1.8 0.2 for idx_d in xrange(n_samples):
117 56570 66859 1.2 0.1 if is_sparse_x:
118 56570 197957 3.5 0.3 ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]
119 56570 123378 2.2 0.2 cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]
120 else:
121 ids = np.nonzero(X[idx_d, :])[0]
122 cnts = X[idx_d, ids]
123
124 56570 181113 3.2 0.3 doc_topic_d = doc_topic_distr[idx_d, :]
125 56570 112248 2.0 0.2 exp_doc_topic_d = exp_doc_topic[idx_d, :]
126 56570 2871002 50.8 4.6 exp_topic_word_d = exp_topic_word_distr[:, ids]
127
128 # The optimal phi_{dwk} is proportional to
129 # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).
130 56570 678083 12.0 1.1 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
131
132 # Iterate between `doc_topic_d` and `norm_phi` until convergence
133 970904 1252268 1.3 2.0 for _ in xrange(0, max_iters):
134 970806 1877461 1.9 3.0 last_d = doc_topic_d
135
136 970806 1141431 1.2 1.8 doc_topic_d = (doc_topic_prior + exp_doc_topic_d *
137 970806 13609752 14.0 21.8 np.dot(cnts / norm_phi, exp_topic_word_d.T))
138 970806 17626102 18.2 28.2 exp_doc_topic_d = np.exp(_log_dirichlet_expectation(doc_topic_d))
139 970806 8847251 9.1 14.1 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
140
141 970806 3845608 4.0 6.1 meanchange = mean_change(last_d, doc_topic_d)
142 970806 1356006 1.4 2.2 if meanchange < mean_change_tol:
143 56472 69706 1.2 0.1 break
144 56570 229828 4.1 0.4 doc_topic_distr[idx_d, :] = doc_topic_d
145
146 # Contribution of document d to the expected sufficient
147 # statistics for the M step.
148 56570 71634 1.3 0.1 if cal_sstats:
149 56570 7247878 128.1 11.6 suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)
150
151 445 580 1.3 0.0 return (doc_topic_distr, suff_stats)
File: /Users/chyikwei/github/scikit-learn/sklearn/decomposition/online_lda.py
Function: _update_doc_distribution at line 55
Total time: 91.5451 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
55 def _update_doc_distribution(X, exp_topic_word_distr, doc_topic_prior, max_iters,
56 mean_change_tol, cal_sstats, random_state):
57 """E-step: update document-topic distribution.
95 """
96 445 1970 4.4 0.0 is_sparse_x = sp.issparse(X)
97 445 1482 3.3 0.0 n_samples, n_features = X.shape
98 445 987 2.2 0.0 n_topics = exp_topic_word_distr.shape[0]
99
100 445 603 1.4 0.0 if random_state:
101 445 372312 836.7 0.4 doc_topic_distr = random_state.gamma(100., 0.01, (n_samples, n_topics))
102 else:
103 doc_topic_distr = np.ones((n_samples, n_topics))
104
105 # In the literature, this is `exp(E[log(theta)])`
106 445 403094 905.8 0.4 exp_doc_topic = np.exp(_log_dirichlet_expectation(doc_topic_distr))
107
108 # diff on `component_` (only calculate it when `cal_diff` is True)
109 445 246014 552.8 0.3 suff_stats = np.zeros(exp_topic_word_distr.shape) if cal_sstats else None
110
111 445 978 2.2 0.0 if is_sparse_x:
112 445 869 2.0 0.0 X_data = X.data
113 445 612 1.4 0.0 X_indices = X.indices
114 445 596 1.3 0.0 X_indptr = X.indptr
115
116 57015 103765 1.8 0.1 for idx_d in xrange(n_samples):
117 56570 68083 1.2 0.1 if is_sparse_x:
118 56570 195278 3.5 0.2 ids = X_indices[X_indptr[idx_d]:X_indptr[idx_d + 1]]
119 56570 126524 2.2 0.1 cnts = X_data[X_indptr[idx_d]:X_indptr[idx_d + 1]]
120 else:
121 ids = np.nonzero(X[idx_d, :])[0]
122 cnts = X[idx_d, ids]
123
124 56570 180917 3.2 0.2 doc_topic_d = doc_topic_distr[idx_d, :]
125 56570 116897 2.1 0.1 exp_doc_topic_d = exp_doc_topic[idx_d, :]
126 56570 2803589 49.6 3.1 exp_topic_word_d = exp_topic_word_distr[:, ids]
127
128 # The optimal phi_{dwk} is proportional to
129 # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).
130 56570 676765 12.0 0.7 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
131
132 # Iterate between `doc_topic_d` and `norm_phi` until convergence
133 970904 1300270 1.3 1.4 for _ in xrange(0, max_iters):
134 970806 2065190 2.1 2.3 last_d = doc_topic_d
135
136 970806 1181293 1.2 1.3 doc_topic_d = (doc_topic_prior + exp_doc_topic_d *
137 970806 14890168 15.3 16.3 np.dot(cnts / norm_phi, exp_topic_word_d.T))
138 970806 18474403 19.0 20.2 exp_doc_topic_d = np.exp(_log_dirichlet_expectation(doc_topic_d))
139 970806 9081853 9.4 9.9 norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + EPS
140
141 #meanchange = mean_change(last_d, doc_topic_d)
142 970806 30065266 31.0 32.8 meanchange = np.mean(abs(last_d - doc_topic_d))
143 970806 1654795 1.7 1.8 if meanchange < mean_change_tol:
144 56472 72215 1.3 0.1 break
145 56570 241869 4.3 0.3 doc_topic_distr[idx_d, :] = doc_topic_d
146
147 # Contribution of document d to the expected sufficient
148 # statistics for the M step.
149 56570 74521 1.3 0.1 if cal_sstats:
150 56570 7141256 126.2 7.8 suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)
151
152 445 625 1.4 0.0 return (doc_topic_distr, suff_stats)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment