@arjoly, last active August 29, 2015 14:03
Bench sparse multilabel ovr
import argparse
import gc
import sys
import time
import warnings

import joblib
import numpy as np
import scipy.sparse as sp

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import fit_ovr
from sklearn.multiclass import predict_ovr
from sklearn.multiclass import _fit_binary
from sklearn.naive_bayes import MultinomialNB

# `profile` is injected as a builtin by `python -m memory_profiler`;
# swap the commented lines below to profile other entry points instead.
# OneVsRestClassifier.fit = profile(OneVsRestClassifier.fit)
# OneVsRestClassifier.predict = profile(OneVsRestClassifier.predict)
fit_ovr = profile(fit_ovr)
# _fit_binary = profile(_fit_binary)
# predict_ovr = profile(predict_ovr)


@profile
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str, required=True,
                        help='Path to the dataset')
    args = vars(parser.parse_args(argv))

    data = joblib.load(args["dataset"])
    X = data["X"]
    y = data["y"]

    print("X.shape =", X.shape)
    print("y.shape =", y.shape)
    if sp.issparse(y):
        print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gc.collect()
        start_time = time.time()
        est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
        chrono_fit = time.time() - start_time

        gc.collect()
        start_time = time.time()
        est.predict(X)
        chrono_predict = time.time() - start_time

    print(args)
    print("Time to fit = %s" % chrono_fit)
    print("Time to predict = %s" % chrono_predict)


if __name__ == "__main__":
    main()
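All the runs below execute this script as `python -m memory_profiler bench_ovr.py -d <dataset>.joblib`; memory_profiler injects `profile` into the builtins, which is why the bare `@profile` above resolves without an import. A minimal self-contained sketch of the same line-by-line profiling pattern (assuming only that memory_profiler is installed; the file name is hypothetical):

# profile_sketch.py - run as: python -m memory_profiler profile_sketch.py
# `profile` is provided by memory_profiler, so no import is needed for it.
import numpy as np

@profile  # noqa: F821 - injected by `python -m memory_profiler`
def allocate():
    a = np.ones((1000, 1000))  # ~7.6 MB; shows up as an Increment in the report
    return a.sum()

if __name__ == "__main__":
    allocate()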
Branch / dataset      | Fit (s)       | Predict (s)
----------------------|---------------|--------------
master / dense        | 68.4454979897 | 42.3201339245
3276 / dense          | 67.2968831062 | 35.926846981
3276 / sparse         | 60.5505950451 | 34.7806410789
sprs-ovr-gc / dense   | 116.219926119 | 37.6857318878
sprs-ovr-gc / sparse  | 105.955276012 | 36.4771659374
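Reading the table against the profiler logs below: on master, LabelBinarizer materializes the full dense indicator (a 766 MB increment at `Y = lb.fit_transform(y)`), while PR 3276 with sparse targets binarizes with a sub-megabyte increment and also fits fastest; the sprs-ovr-gc variant gets similar memory behavior on sparse targets but is markedly slower to fit. A back-of-the-envelope check of those magnitudes (a sketch; assumes a float64 dense indicator and CSR with int32 indices):

# Rough memory arithmetic for the 20000 x 4000 indicator used in this bench.
n_samples, n_classes, density = 20000, 4000, 0.001

dense_bytes = n_samples * n_classes * 8          # float64 indicator
nnz = int(n_samples * n_classes * density)       # ~80000 stored labels
csr_bytes = nnz * (8 + 4) + (n_samples + 1) * 4  # data + indices + indptr

print(dense_bytes / 2.0**20)  # ~610 MB; copies explain the ~766 MB increment
print(csr_bytes / 2.0**20)    # ~1 MB, the same order as the sparse-run increments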
###############################################################################
Branch master
ajoly at Arnauds-MacBook-Pro-2 in ~/git/scikit-learn on master!
(sklearn) ± python -m memory_profiler bench_ovr.py -d dense_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
{'dataset': 'dense_output.joblib'}
Time to fit = 68.4454979897
Time to predict = 42.3201339245
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
83 def fit_ovr(estimator, X, y, n_jobs=1):
84 649.430 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
85 649.430 MB 0.000 MB _check_estimator(estimator)
86
87 649.430 MB 0.000 MB lb = LabelBinarizer()
88 1415.551 MB 766.121 MB Y = lb.fit_transform(y)
89
90 1415.551 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(
91 1415.551 MB 0.000 MB delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i])
92 1439.605 MB 24.055 MB for i in range(Y.shape[1]))
93 1439.605 MB 0.000 MB return estimators, lb
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
19 @profile
20 28.832 MB 0.000 MB def main(argv=None):
21 28.836 MB 0.004 MB if argv is None:
22 28.836 MB 0.000 MB argv = sys.argv[1:]
23
24 28.875 MB 0.039 MB parser = argparse.ArgumentParser()
25 28.875 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
26 28.875 MB 0.000 MB help='Path to the dataset')
27 28.883 MB 0.008 MB args = vars(parser.parse_args(argv))
28
29 649.430 MB 620.547 MB data = joblib.load(args["dataset"])
30 649.430 MB 0.000 MB X = data["X"]
31 649.430 MB 0.000 MB y = data["y"]
32
33 649.430 MB 0.000 MB print("X.shape =", X.shape)
34 649.430 MB 0.000 MB print("y.shape =", y.shape)
35 649.430 MB 0.000 MB if sp.issparse(y):
36 print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
37
38 649.430 MB 0.000 MB with warnings.catch_warnings():
39 649.430 MB 0.000 MB warnings.simplefilter("ignore")
40 649.430 MB 0.000 MB gc.collect()
41 649.430 MB 0.000 MB start_time = time.time()
42 829.254 MB 179.824 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
43 829.254 MB 0.000 MB chrono_fit = time.time() - start_time
44
45 829.254 MB 0.000 MB gc.collect()
46 829.254 MB 0.000 MB start_time = time.time()
47 755.285 MB -73.969 MB est.predict(X)
48 755.285 MB 0.000 MB chrono_predict = time.time() - start_time
49
50 755.285 MB 0.000 MB print(args)
51 755.285 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
52 755.285 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
###############################################################################
Branch https://github.com/scikit-learn/scikit-learn/pull/3276
ajoly at Arnauds-MacBook-Pro-2 in ~/git/scikit-learn on 9f29711!
(sklearn) ± python -m memory_profiler bench_ovr.py -d dense_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
{'dataset': 'dense_output.joblib'}
Time to fit = 67.2968831062
Time to predict = 35.926846981
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 30.418 MB 0.000 MB def main(argv=None):
23 30.418 MB 0.000 MB if argv is None:
24 30.418 MB 0.000 MB argv = sys.argv[1:]
25
26 30.441 MB 0.023 MB parser = argparse.ArgumentParser()
27 30.441 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 30.441 MB 0.000 MB help='Path to the dataset')
29 30.453 MB 0.012 MB args = vars(parser.parse_args(argv))
30
31 650.160 MB 619.707 MB data = joblib.load(args["dataset"])
32 650.160 MB 0.000 MB X = data["X"]
33 650.160 MB 0.000 MB y = data["y"]
34
35 650.160 MB 0.000 MB print("X.shape =", X.shape)
36 650.160 MB 0.000 MB print("y.shape =", y.shape)
37 650.160 MB 0.000 MB if sp.issparse(y):
38 print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 650.160 MB 0.000 MB with warnings.catch_warnings():
41 650.160 MB 0.000 MB warnings.simplefilter("ignore")
42 650.160 MB 0.000 MB gc.collect()
43 650.160 MB 0.000 MB start_time = time.time()
44 830.082 MB 179.922 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 830.082 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 830.082 MB 0.000 MB gc.collect()
48 830.082 MB 0.000 MB start_time = time.time()
49 830.164 MB 0.082 MB est.predict(X)
50 830.164 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 830.164 MB 0.000 MB print(args)
53 830.164 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 830.164 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
87 def fit_ovr(estimator, X, y, n_jobs=1):
88 650.160 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
89 650.160 MB 0.000 MB _check_estimator(estimator)
90 650.160 MB 0.000 MB lb = LabelBinarizer(sparse_output=True)
91 805.719 MB 155.559 MB Y = lb.fit_transform(y)
92
93 805.719 MB 0.000 MB if sp.issparse(Y):
94 805.758 MB 0.039 MB Y = Y.tocsc()
95 805.777 MB 0.020 MB columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1]))
96 else:
97 columns = Y.T
98 805.777 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
99 (estimator,
100 X,
101 column,
102 classes=["not %s" % i, i])
103 830.082 MB 24.305 MB for i, column in enumerate(columns))
104 830.082 MB 0.000 MB return estimators, lb
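The key change in PR 3276 is visible in lines 93-95 of the profile above: the sparse indicator is converted to CSC and densified one column at a time through a generator, so only a single 20000-element column is ever dense at once. A standalone sketch of that pattern (scipy only; `sp.random` stands in for the real binarized target):

import numpy as np
import scipy.sparse as sp

# Stand-in for the binarized target: 20000 x 4000 at ~0.1% density.
Y = sp.random(20000, 4000, density=0.001, format='csr', random_state=0)

Y = Y.tocsc()  # CSC makes single-column slicing cheap
columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1]))

col0 = next(columns)            # dense 1-D array for one label only
print(col0.shape, col0.nbytes)  # (20000,) 160000 -> ~0.15 MB per column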
###############################################################################
Branch https://github.com/scikit-learn/scikit-learn/pull/3276
(sklearn) ± python -m memory_profiler bench_ovr.py -d sparse_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
('Target Data Sparsity', 0.0010003124999999999)
{'dataset': 'sparse_output.joblib'}
Time to fit = 60.5505950451
Time to predict = 34.7806410789
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 28.980 MB 0.000 MB def main(argv=None):
23 28.984 MB 0.004 MB if argv is None:
24 28.984 MB 0.000 MB argv = sys.argv[1:]
25
26 29.012 MB 0.027 MB parser = argparse.ArgumentParser()
27 29.012 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 29.016 MB 0.004 MB help='Path to the dataset')
29 29.027 MB 0.012 MB args = vars(parser.parse_args(argv))
30
31 40.316 MB 11.289 MB data = joblib.load(args["dataset"])
32 40.316 MB 0.000 MB X = data["X"]
33 40.316 MB 0.000 MB y = data["y"]
34
35 40.316 MB 0.000 MB print("X.shape =", X.shape)
36 40.316 MB 0.000 MB print("y.shape =", y.shape)
37 40.316 MB 0.000 MB if sp.issparse(y):
38 40.324 MB 0.008 MB print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 40.324 MB 0.000 MB with warnings.catch_warnings():
41 40.324 MB 0.000 MB warnings.simplefilter("ignore")
42 40.324 MB 0.000 MB gc.collect()
43 40.324 MB 0.000 MB start_time = time.time()
44 67.750 MB 27.426 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 67.750 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 67.750 MB 0.000 MB gc.collect()
48 67.750 MB 0.000 MB start_time = time.time()
49 67.930 MB 0.180 MB est.predict(X)
50 67.930 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 67.930 MB 0.000 MB print(args)
53 67.930 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 67.930 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
87 def fit_ovr(estimator, X, y, n_jobs=1):
88 40.324 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
89 40.324 MB 0.000 MB _check_estimator(estimator)
90 40.324 MB 0.000 MB lb = LabelBinarizer(sparse_output=True)
91 40.625 MB 0.301 MB Y = lb.fit_transform(y)
92
93 40.625 MB 0.000 MB if sp.issparse(Y):
94 41.586 MB 0.961 MB Y = Y.tocsc()
95 41.586 MB 0.000 MB columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1]))
96 else:
97 columns = Y.T
98 41.586 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
99 (estimator,
100 X,
101 column,
102 classes=["not %s" % i, i])
103 67.750 MB 26.164 MB for i, column in enumerate(columns))
104 67.750 MB 0.000 MB return estimators, lb
###############################################################################
Branch sprs-ovr-gc
(sklearn) ± python -m memory_profiler bench_ovr.py -d sparse_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
('Target Data Sparsity', 0.0010003124999999999)
{'dataset': 'sparse_output.joblib'}
Time to fit = 105.955276012
Time to predict = 36.4771659374
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
94 def fit_ovr(estimator, X, y, n_jobs=-1):
95 40.246 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
96 40.246 MB 0.000 MB _check_estimator(estimator)
97
98 # Choose first line to do dense vs sparse target data bench
99 40.246 MB 0.000 MB lb = LabelBinarizer(sparse_output=sp.issparse(y))
100 # lb = LabelBinarizer(sparse_output=True)
101
102 40.582 MB 0.336 MB Y = lb.fit_transform(y)
103
104 40.582 MB 0.000 MB if sp.issparse(Y):
105 41.543 MB 0.961 MB Y = Y.tocsc()
106 41.543 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
107 (estimator,
108 X,
109 y=None,
110 classes=["not %s" % i, i],
111 Y=Y,
112 i=i)
113 67.734 MB 26.191 MB for i in range(Y.shape[1]))
114 else:
115 columns = Y.T
116 estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
117 (estimator,
118 X,
119 column,
120 classes=["not %s" % i, i],
121 Y=None,
122 i=None)
123 for i, column in enumerate(columns))
124
125 67.734 MB 0.000 MB return estimators, lb
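Two differences from the PR 3276 version stand out in this branch's fit_ovr: the default is n_jobs=-1 rather than 1, and in the sparse path the whole CSC matrix Y plus an index i are handed to every _fit_binary call instead of a pre-extracted column. With joblib's multiprocessing backend each delayed argument is serialized per task, so shipping the full Y 4000 times is one plausible contributor to the slower fit times. A sketch comparing the per-task payloads (assumes pickle-based serialization):

import pickle
import scipy.sparse as sp

Y = sp.random(20000, 4000, density=0.001, format='csc', random_state=0)
col = Y.getcol(0).toarray().ravel()

print(len(pickle.dumps(Y, -1)))    # full indicator, ~1 MB, sent per task here
print(len(pickle.dumps(col, -1)))  # one dense column, ~160 kB, as in PR 3276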
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 28.875 MB 0.000 MB def main(argv=None):
23 28.875 MB 0.000 MB if argv is None:
24 28.875 MB 0.000 MB argv = sys.argv[1:]
25
26 28.906 MB 0.031 MB parser = argparse.ArgumentParser()
27 28.906 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 28.906 MB 0.000 MB help='Path to the dataset')
29 28.922 MB 0.016 MB args = vars(parser.parse_args(argv))
30
31 40.238 MB 11.316 MB data = joblib.load(args["dataset"])
32 40.238 MB 0.000 MB X = data["X"]
33 40.238 MB 0.000 MB y = data["y"]
34
35 40.238 MB 0.000 MB print("X.shape =", X.shape)
36 40.238 MB 0.000 MB print("y.shape =", y.shape)
37 40.238 MB 0.000 MB if sp.issparse(y):
38 40.246 MB 0.008 MB print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 40.246 MB 0.000 MB with warnings.catch_warnings():
41 40.246 MB 0.000 MB warnings.simplefilter("ignore")
42 40.246 MB 0.000 MB gc.collect()
43 40.246 MB 0.000 MB start_time = time.time()
44 67.734 MB 27.488 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 67.734 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 67.734 MB 0.000 MB gc.collect()
48 67.734 MB 0.000 MB start_time = time.time()
49 67.906 MB 0.172 MB est.predict(X)
50 67.906 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 67.906 MB 0.000 MB print(args)
53 67.906 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 67.906 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
###############################################################################
Branch sprs-ovr-gc
(sklearn) ± python -m memory_profiler bench_ovr.py -d dense_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
{'dataset': 'dense_output.joblib'}
Time to fit = 116.219926119
Time to predict = 37.6857318878
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
94 def fit_ovr(estimator, X, y, n_jobs=-1):
95 649.539 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
96 649.539 MB 0.000 MB _check_estimator(estimator)
97
98 # Choose first line to do dense vs sparse target data bench
99 649.539 MB 0.000 MB lb = LabelBinarizer(sparse_output=sp.issparse(y))
100 # lb = LabelBinarizer(sparse_output=True)
101
102 1415.641 MB 766.102 MB Y = lb.fit_transform(y)
103
104 1415.641 MB 0.000 MB if sp.issparse(Y):
105 Y = Y.tocsc()
106 estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
107 (estimator,
108 X,
109 y=None,
110 classes=["not %s" % i, i],
111 Y=Y,
112 i=i)
113 for i in range(Y.shape[1]))
114 else:
115 1415.645 MB 0.004 MB columns = Y.T
116 1415.645 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
117 (estimator,
118 X,
119 column,
120 classes=["not %s" % i, i],
121 Y=None,
122 i=None)
123 1439.578 MB 23.934 MB for i, column in enumerate(columns))
124
125 1439.578 MB 0.000 MB return estimators, lb
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 28.898 MB 0.000 MB def main(argv=None):
23 28.898 MB 0.000 MB if argv is None:
24 28.898 MB 0.000 MB argv = sys.argv[1:]
25
26 28.930 MB 0.031 MB parser = argparse.ArgumentParser()
27 28.930 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 28.934 MB 0.004 MB help='Path to the dataset')
29 28.945 MB 0.012 MB args = vars(parser.parse_args(argv))
30
31 649.539 MB 620.594 MB data = joblib.load(args["dataset"])
32 649.539 MB 0.000 MB X = data["X"]
33 649.539 MB 0.000 MB y = data["y"]
34
35 649.539 MB 0.000 MB print("X.shape =", X.shape)
36 649.539 MB 0.000 MB print("y.shape =", y.shape)
37 649.539 MB 0.000 MB if sp.issparse(y):
38 print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 649.539 MB 0.000 MB with warnings.catch_warnings():
41 649.539 MB 0.000 MB warnings.simplefilter("ignore")
42 649.539 MB 0.000 MB gc.collect()
43 649.539 MB 0.000 MB start_time = time.time()
44 829.227 MB 179.688 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 829.227 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 829.227 MB 0.000 MB gc.collect()
48 829.227 MB 0.000 MB start_time = time.time()
49 829.309 MB 0.082 MB est.predict(X)
50 829.309 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 829.309 MB 0.000 MB print(args)
53 829.309 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 829.309 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
###############################################################################
Dataset generation script
from scipy.sparse import csr_matrix
from sklearn.datasets import make_multilabel_classification
import joblib

# sparse=True makes X sparse; return_indicator=True makes y a dense
# label-indicator matrix of shape (n_samples, n_classes).
X, y = make_multilabel_classification(sparse=True, return_indicator=True,
                                      n_samples=20000, n_features=100,
                                      n_classes=4000, n_labels=4,
                                      random_state=0)

# Same X in both files; only the target representation differs.
joblib.dump({"X": X, "y": y}, "dense_output.joblib")
joblib.dump({"X": X, "y": csr_matrix(y)}, "sparse_output.joblib")