@arjoly, last active August 29, 2015 14:03
Bench sparse multilabel ovr
import argparse
import gc
import sys
import time
import warnings

import joblib
import numpy as np
import scipy.sparse as sp

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import fit_ovr
from sklearn.multiclass import predict_ovr
from sklearn.multiclass import _fit_binary
from sklearn.naive_bayes import MultinomialNB

# `profile` is injected as a builtin by `python -m memory_profiler`;
# swap the commented lines below to profile other entry points instead.
# OneVsRestClassifier.fit = profile(OneVsRestClassifier.fit)
# OneVsRestClassifier.predict = profile(OneVsRestClassifier.predict)
fit_ovr = profile(fit_ovr)
# _fit_binary = profile(_fit_binary)
# predict_ovr = profile(predict_ovr)


@profile
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str, required=True,
                        help='Path to the dataset')
    args = vars(parser.parse_args(argv))

    data = joblib.load(args["dataset"])
    X = data["X"]
    y = data["y"]

    print("X.shape =", X.shape)
    print("y.shape =", y.shape)
    if sp.issparse(y):
        print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gc.collect()
        start_time = time.time()
        est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
        chrono_fit = time.time() - start_time

        gc.collect()
        start_time = time.time()
        est.predict(X)
        chrono_predict = time.time() - start_time

    print(args)
    print("Time to fit = %s" % chrono_fit)
    print("Time to predict = %s" % chrono_predict)


if __name__ == "__main__":
    main()
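All the runs below execute this script as `python -m memory_profiler bench_ovr.py -d <dataset>.joblib`; memory_profiler injects `profile` into the builtins, which is why the bare `@profile` above resolves without an import. A minimal self-contained sketch of the same line-by-line profiling pattern (assuming only that memory_profiler is installed; the file name is hypothetical):

# profile_sketch.py - run as: python -m memory_profiler profile_sketch.py
# `profile` is provided by memory_profiler, so no import is needed for it.
import numpy as np

@profile  # noqa: F821 - injected by `python -m memory_profiler`
def allocate():
    a = np.ones((1000, 1000))  # ~7.6 MB; shows up as an Increment in the report
    return a.sum()

if __name__ == "__main__":
    allocate()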
Branch / dataset      | Fit (s)       | Predict (s)
----------------------|---------------|--------------
master / dense        | 68.4454979897 | 42.3201339245
3276 / dense          | 67.2968831062 | 35.926846981
3276 / sparse         | 60.5505950451 | 34.7806410789
sprs-ovr-gc / dense   | 116.219926119 | 37.6857318878
sprs-ovr-gc / sparse  | 105.955276012 | 36.4771659374
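Reading the table against the profiler logs below: on master, LabelBinarizer materializes the full dense indicator (a 766 MB increment at `Y = lb.fit_transform(y)`), while PR 3276 with sparse targets binarizes with a sub-megabyte increment and also fits fastest; the sprs-ovr-gc variant gets similar memory behavior on sparse targets but is markedly slower to fit. A back-of-the-envelope check of those magnitudes (a sketch; assumes a float64 dense indicator and CSR with int32 indices):

# Rough memory arithmetic for the 20000 x 4000 indicator used in this bench.
n_samples, n_classes, density = 20000, 4000, 0.001

dense_bytes = n_samples * n_classes * 8          # float64 indicator
nnz = int(n_samples * n_classes * density)       # ~80000 stored labels
csr_bytes = nnz * (8 + 4) + (n_samples + 1) * 4  # data + indices + indptr

print(dense_bytes / 2.0**20)  # ~610 MB; copies explain the ~766 MB increment
print(csr_bytes / 2.0**20)    # ~1 MB, the same order as the sparse-run increments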
###############################################################################
Branch master
ajoly at Arnauds-MacBook-Pro-2 in ~/git/scikit-learn on master!
(sklearn) ± python -m memory_profiler bench_ovr.py -d dense_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
{'dataset': 'dense_output.joblib'}
Time to fit = 68.4454979897
Time to predict = 42.3201339245
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
83 def fit_ovr(estimator, X, y, n_jobs=1):
84 649.430 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
85 649.430 MB 0.000 MB _check_estimator(estimator)
86
87 649.430 MB 0.000 MB lb = LabelBinarizer()
88 1415.551 MB 766.121 MB Y = lb.fit_transform(y)
89
90 1415.551 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(
91 1415.551 MB 0.000 MB delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i])
92 1439.605 MB 24.055 MB for i in range(Y.shape[1]))
93 1439.605 MB 0.000 MB return estimators, lb
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
19 @profile
20 28.832 MB 0.000 MB def main(argv=None):
21 28.836 MB 0.004 MB if argv is None:
22 28.836 MB 0.000 MB argv = sys.argv[1:]
23
24 28.875 MB 0.039 MB parser = argparse.ArgumentParser()
25 28.875 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
26 28.875 MB 0.000 MB help='Path to the dataset')
27 28.883 MB 0.008 MB args = vars(parser.parse_args(argv))
28
29 649.430 MB 620.547 MB data = joblib.load(args["dataset"])
30 649.430 MB 0.000 MB X = data["X"]
31 649.430 MB 0.000 MB y = data["y"]
32
33 649.430 MB 0.000 MB print("X.shape =", X.shape)
34 649.430 MB 0.000 MB print("y.shape =", y.shape)
35 649.430 MB 0.000 MB if sp.issparse(y):
36 print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
37
38 649.430 MB 0.000 MB with warnings.catch_warnings():
39 649.430 MB 0.000 MB warnings.simplefilter("ignore")
40 649.430 MB 0.000 MB gc.collect()
41 649.430 MB 0.000 MB start_time = time.time()
42 829.254 MB 179.824 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
43 829.254 MB 0.000 MB chrono_fit = time.time() - start_time
44
45 829.254 MB 0.000 MB gc.collect()
46 829.254 MB 0.000 MB start_time = time.time()
47 755.285 MB -73.969 MB est.predict(X)
48 755.285 MB 0.000 MB chrono_predict = time.time() - start_time
49
50 755.285 MB 0.000 MB print(args)
51 755.285 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
52 755.285 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
###############################################################################
Branch https://github.com/scikit-learn/scikit-learn/pull/3276
ajoly at Arnauds-MacBook-Pro-2 in ~/git/scikit-learn on 9f29711!
(sklearn) ± python -m memory_profiler bench_ovr.py -d dense_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
{'dataset': 'dense_output.joblib'}
Time to fit = 67.2968831062
Time to predict = 35.926846981
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 30.418 MB 0.000 MB def main(argv=None):
23 30.418 MB 0.000 MB if argv is None:
24 30.418 MB 0.000 MB argv = sys.argv[1:]
25
26 30.441 MB 0.023 MB parser = argparse.ArgumentParser()
27 30.441 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 30.441 MB 0.000 MB help='Path to the dataset')
29 30.453 MB 0.012 MB args = vars(parser.parse_args(argv))
30
31 650.160 MB 619.707 MB data = joblib.load(args["dataset"])
32 650.160 MB 0.000 MB X = data["X"]
33 650.160 MB 0.000 MB y = data["y"]
34
35 650.160 MB 0.000 MB print("X.shape =", X.shape)
36 650.160 MB 0.000 MB print("y.shape =", y.shape)
37 650.160 MB 0.000 MB if sp.issparse(y):
38 print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 650.160 MB 0.000 MB with warnings.catch_warnings():
41 650.160 MB 0.000 MB warnings.simplefilter("ignore")
42 650.160 MB 0.000 MB gc.collect()
43 650.160 MB 0.000 MB start_time = time.time()
44 830.082 MB 179.922 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 830.082 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 830.082 MB 0.000 MB gc.collect()
48 830.082 MB 0.000 MB start_time = time.time()
49 830.164 MB 0.082 MB est.predict(X)
50 830.164 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 830.164 MB 0.000 MB print(args)
53 830.164 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 830.164 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
87 def fit_ovr(estimator, X, y, n_jobs=1):
88 650.160 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
89 650.160 MB 0.000 MB _check_estimator(estimator)
90 650.160 MB 0.000 MB lb = LabelBinarizer(sparse_output=True)
91 805.719 MB 155.559 MB Y = lb.fit_transform(y)
92
93 805.719 MB 0.000 MB if sp.issparse(Y):
94 805.758 MB 0.039 MB Y = Y.tocsc()
95 805.777 MB 0.020 MB columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1]))
96 else:
97 columns = Y.T
98 805.777 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
99 (estimator,
100 X,
101 column,
102 classes=["not %s" % i, i])
103 830.082 MB 24.305 MB for i, column in enumerate(columns))
104 830.082 MB 0.000 MB return estimators, lb
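The key change in PR 3276 is visible in lines 93-95 of the profile above: the sparse indicator is converted to CSC and densified one column at a time through a generator, so only a single 20000-element column is ever dense at once. A standalone sketch of that pattern (scipy only; `sp.random` stands in for the real binarized target):

import numpy as np
import scipy.sparse as sp

# Stand-in for the binarized target: 20000 x 4000 at ~0.1% density.
Y = sp.random(20000, 4000, density=0.001, format='csr', random_state=0)

Y = Y.tocsc()  # CSC makes single-column slicing cheap
columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1]))

col0 = next(columns)            # dense 1-D array for one label only
print(col0.shape, col0.nbytes)  # (20000,) 160000 -> ~0.15 MB per column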
###############################################################################
Branch https://github.com/scikit-learn/scikit-learn/pull/3276
(sklearn) ± python -m memory_profiler bench_ovr.py -d sparse_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
('Target Data Sparsity', 0.0010003124999999999)
{'dataset': 'sparse_output.joblib'}
Time to fit = 60.5505950451
Time to predict = 34.7806410789
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 28.980 MB 0.000 MB def main(argv=None):
23 28.984 MB 0.004 MB if argv is None:
24 28.984 MB 0.000 MB argv = sys.argv[1:]
25
26 29.012 MB 0.027 MB parser = argparse.ArgumentParser()
27 29.012 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 29.016 MB 0.004 MB help='Path to the dataset')
29 29.027 MB 0.012 MB args = vars(parser.parse_args(argv))
30
31 40.316 MB 11.289 MB data = joblib.load(args["dataset"])
32 40.316 MB 0.000 MB X = data["X"]
33 40.316 MB 0.000 MB y = data["y"]
34
35 40.316 MB 0.000 MB print("X.shape =", X.shape)
36 40.316 MB 0.000 MB print("y.shape =", y.shape)
37 40.316 MB 0.000 MB if sp.issparse(y):
38 40.324 MB 0.008 MB print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 40.324 MB 0.000 MB with warnings.catch_warnings():
41 40.324 MB 0.000 MB warnings.simplefilter("ignore")
42 40.324 MB 0.000 MB gc.collect()
43 40.324 MB 0.000 MB start_time = time.time()
44 67.750 MB 27.426 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 67.750 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 67.750 MB 0.000 MB gc.collect()
48 67.750 MB 0.000 MB start_time = time.time()
49 67.930 MB 0.180 MB est.predict(X)
50 67.930 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 67.930 MB 0.000 MB print(args)
53 67.930 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 67.930 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
87 def fit_ovr(estimator, X, y, n_jobs=1):
88 40.324 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
89 40.324 MB 0.000 MB _check_estimator(estimator)
90 40.324 MB 0.000 MB lb = LabelBinarizer(sparse_output=True)
91 40.625 MB 0.301 MB Y = lb.fit_transform(y)
92
93 40.625 MB 0.000 MB if sp.issparse(Y):
94 41.586 MB 0.961 MB Y = Y.tocsc()
95 41.586 MB 0.000 MB columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1]))
96 else:
97 columns = Y.T
98 41.586 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
99 (estimator,
100 X,
101 column,
102 classes=["not %s" % i, i])
103 67.750 MB 26.164 MB for i, column in enumerate(columns))
104 67.750 MB 0.000 MB return estimators, lb
###############################################################################
Branch sprs-ovr-gc
(sklearn) ± python -m memory_profiler bench_ovr.py -d sparse_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
('Target Data Sparsity', 0.0010003124999999999)
{'dataset': 'sparse_output.joblib'}
Time to fit = 105.955276012
Time to predict = 36.4771659374
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
94 def fit_ovr(estimator, X, y, n_jobs=-1):
95 40.246 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
96 40.246 MB 0.000 MB _check_estimator(estimator)
97
98 # Choose first line to do dense vs sparse target data bench
99 40.246 MB 0.000 MB lb = LabelBinarizer(sparse_output=sp.issparse(y))
100 # lb = LabelBinarizer(sparse_output=True)
101
102 40.582 MB 0.336 MB Y = lb.fit_transform(y)
103
104 40.582 MB 0.000 MB if sp.issparse(Y):
105 41.543 MB 0.961 MB Y = Y.tocsc()
106 41.543 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
107 (estimator,
108 X,
109 y=None,
110 classes=["not %s" % i, i],
111 Y=Y,
112 i=i)
113 67.734 MB 26.191 MB for i in range(Y.shape[1]))
114 else:
115 columns = Y.T
116 estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
117 (estimator,
118 X,
119 column,
120 classes=["not %s" % i, i],
121 Y=None,
122 i=None)
123 for i, column in enumerate(columns))
124
125 67.734 MB 0.000 MB return estimators, lb
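Two differences from the PR 3276 version stand out in this branch's fit_ovr: the default is n_jobs=-1 rather than 1, and in the sparse path the whole CSC matrix Y plus an index i are handed to every _fit_binary call instead of a pre-extracted column. With joblib's multiprocessing backend each delayed argument is serialized per task, so shipping the full Y 4000 times is one plausible contributor to the slower fit times. A sketch comparing the per-task payloads (assumes pickle-based serialization):

import pickle
import scipy.sparse as sp

Y = sp.random(20000, 4000, density=0.001, format='csc', random_state=0)
col = Y.getcol(0).toarray().ravel()

print(len(pickle.dumps(Y, -1)))    # full indicator, ~1 MB, sent per task here
print(len(pickle.dumps(col, -1)))  # one dense column, ~160 kB, as in PR 3276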
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 28.875 MB 0.000 MB def main(argv=None):
23 28.875 MB 0.000 MB if argv is None:
24 28.875 MB 0.000 MB argv = sys.argv[1:]
25
26 28.906 MB 0.031 MB parser = argparse.ArgumentParser()
27 28.906 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 28.906 MB 0.000 MB help='Path to the dataset')
29 28.922 MB 0.016 MB args = vars(parser.parse_args(argv))
30
31 40.238 MB 11.316 MB data = joblib.load(args["dataset"])
32 40.238 MB 0.000 MB X = data["X"]
33 40.238 MB 0.000 MB y = data["y"]
34
35 40.238 MB 0.000 MB print("X.shape =", X.shape)
36 40.238 MB 0.000 MB print("y.shape =", y.shape)
37 40.238 MB 0.000 MB if sp.issparse(y):
38 40.246 MB 0.008 MB print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 40.246 MB 0.000 MB with warnings.catch_warnings():
41 40.246 MB 0.000 MB warnings.simplefilter("ignore")
42 40.246 MB 0.000 MB gc.collect()
43 40.246 MB 0.000 MB start_time = time.time()
44 67.734 MB 27.488 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 67.734 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 67.734 MB 0.000 MB gc.collect()
48 67.734 MB 0.000 MB start_time = time.time()
49 67.906 MB 0.172 MB est.predict(X)
50 67.906 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 67.906 MB 0.000 MB print(args)
53 67.906 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 67.906 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
###############################################################################
Branch sprs-ovr-gc
(sklearn) ± python -m memory_profiler bench_ovr.py -d dense_output.joblib
('X.shape =', (20000, 100))
('y.shape =', (20000, 4000))
{'dataset': 'dense_output.joblib'}
Time to fit = 116.219926119
Time to predict = 37.6857318878
Filename: sklearn/multiclass.py
Line # Mem usage Increment Line Contents
================================================
94 def fit_ovr(estimator, X, y, n_jobs=-1):
95 649.539 MB 0.000 MB """Fit a one-vs-the-rest strategy."""
96 649.539 MB 0.000 MB _check_estimator(estimator)
97
98 # Choose first line to do dense vs sparse target data bench
99 649.539 MB 0.000 MB lb = LabelBinarizer(sparse_output=sp.issparse(y))
100 # lb = LabelBinarizer(sparse_output=True)
101
102 1415.641 MB 766.102 MB Y = lb.fit_transform(y)
103
104 1415.641 MB 0.000 MB if sp.issparse(Y):
105 Y = Y.tocsc()
106 estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
107 (estimator,
108 X,
109 y=None,
110 classes=["not %s" % i, i],
111 Y=Y,
112 i=i)
113 for i in range(Y.shape[1]))
114 else:
115 1415.645 MB 0.004 MB columns = Y.T
116 1415.645 MB 0.000 MB estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
117 (estimator,
118 X,
119 column,
120 classes=["not %s" % i, i],
121 Y=None,
122 i=None)
123 1439.578 MB 23.934 MB for i, column in enumerate(columns))
124
125 1439.578 MB 0.000 MB return estimators, lb
Filename: bench_ovr.py
Line # Mem usage Increment Line Contents
================================================
21 @profile
22 28.898 MB 0.000 MB def main(argv=None):
23 28.898 MB 0.000 MB if argv is None:
24 28.898 MB 0.000 MB argv = sys.argv[1:]
25
26 28.930 MB 0.031 MB parser = argparse.ArgumentParser()
27 28.930 MB 0.000 MB parser.add_argument('-d', '--dataset', type=str, required=True,
28 28.934 MB 0.004 MB help='Path to the dataset')
29 28.945 MB 0.012 MB args = vars(parser.parse_args(argv))
30
31 649.539 MB 620.594 MB data = joblib.load(args["dataset"])
32 649.539 MB 0.000 MB X = data["X"]
33 649.539 MB 0.000 MB y = data["y"]
34
35 649.539 MB 0.000 MB print("X.shape =", X.shape)
36 649.539 MB 0.000 MB print("y.shape =", y.shape)
37 649.539 MB 0.000 MB if sp.issparse(y):
38 print("Target Data Sparsity", float(y.nnz) / np.prod(y.shape))
39
40 649.539 MB 0.000 MB with warnings.catch_warnings():
41 649.539 MB 0.000 MB warnings.simplefilter("ignore")
42 649.539 MB 0.000 MB gc.collect()
43 649.539 MB 0.000 MB start_time = time.time()
44 829.227 MB 179.688 MB est = OneVsRestClassifier(MultinomialNB(alpha=1)).fit(X, y)
45 829.227 MB 0.000 MB chrono_fit = time.time() - start_time
46
47 829.227 MB 0.000 MB gc.collect()
48 829.227 MB 0.000 MB start_time = time.time()
49 829.309 MB 0.082 MB est.predict(X)
50 829.309 MB 0.000 MB chrono_predict = time.time() - start_time
51
52 829.309 MB 0.000 MB print(args)
53 829.309 MB 0.000 MB print("Time to fit = %s" % chrono_fit)
54 829.309 MB 0.000 MB print("Time to predict = %s" % chrono_predict)
###############################################################################
Dataset generation script
from scipy.sparse import csr_matrix
from sklearn.datasets import make_multilabel_classification
import joblib

# sparse=True makes X sparse; return_indicator=True makes y a dense
# label-indicator matrix of shape (n_samples, n_classes).
X, y = make_multilabel_classification(sparse=True, return_indicator=True,
                                      n_samples=20000, n_features=100,
                                      n_classes=4000, n_labels=4,
                                      random_state=0)

# Same X in both files; only the target representation differs.
joblib.dump({"X": X, "y": y}, "dense_output.joblib")
joblib.dump({"X": X, "y": csr_matrix(y)}, "sparse_output.joblib")