
@tokoroten
Last active August 29, 2015 14:21
scikit-learn RandomForest serialization problem
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os.path
import shutil
import random
import math
import cPickle
import pickle
import sklearn.ensemble
import sklearn.externals.joblib
import sklearn.tree

def dump_pickle(model, base_path):
    path = base_path + "/model.pkl"
    # close the file before measuring, otherwise the size can be under-reported
    with open(path, "wb") as fp:
        pickle.dump(model, fp)
    return os.path.getsize(path)

def dump_cPickle(model, base_path):
    path = base_path + "/model.pkl"
    with open(path, "wb") as fp:
        cPickle.dump(model, fp)
    return os.path.getsize(path)

def dump_joblib(model, base_path):
    joblib_path = base_path + "/model.pkl"
    sklearn.externals.joblib.dump(model, joblib_path)
    # joblib writes the numpy arrays as separate side files, so sum the whole folder
    size = 0
    for filename in os.listdir(base_path):
        size += os.path.getsize(base_path + '/' + filename)
    return size

def dump_dot(model, base_path):
    estimators = model.estimators_
    file_list = []
    for i in xrange(len(estimators)):
        filename = base_path + "/tree_%d.dot" % i
        file_list.append(filename)
        sklearn.tree.export_graphviz(
            estimators[i],
            out_file=filename,
            feature_names=['x'])
    size = 0
    for p in file_list:
        size += os.path.getsize(p)
    return size

def generate_data(sample_num=10000, random_seed=None):
    rnd = random.Random(random_seed)
    label = []
    value = []
    for i in xrange(sample_num):
        x = rnd.random() * math.pi * 2
        y = math.sin(x)
        value.append([x])
        label.append(y)
    return label, value

def get_trained_model():
    model = sklearn.ensemble.RandomForestRegressor(
        n_estimators=1000,
        n_jobs=-1,
        max_depth=3,
        bootstrap=True,
    )
    label, value = generate_data(100000, 1)
    model.fit(value, label)
    return model

def model_test(model):
    label, value = generate_data(100, 2)
    predicted_labels = model.predict(value)
    score = sum(abs(label - predicted_labels))  # sum of absolute errors
    print "score =", score

def reset_folder():
    folder_list = [
        'pickle_1',
        'pickle_2',
        'cpickle_1',
        'cpickle_2',
        'joblib_1',
        'joblib_2',
        'dot_1',
        'dot_2',
    ]
    for folder in folder_list:
        try:
            shutil.rmtree(folder)
        except OSError:
            pass
        os.mkdir(folder)

def main():
    reset_folder()
    model = get_trained_model()
    model_test(model)

    print "normal dump size"
    cpickle_size_1 = dump_cPickle(model, 'cpickle_1')
    print "dump_cPickle_1=", cpickle_size_1, "%.2fMB" % (cpickle_size_1 / 2.0**20)
    pickle_size_1 = dump_pickle(model, 'pickle_1')
    print "dump_pickle_1=", pickle_size_1, "%.2fMB" % (pickle_size_1 / 2.0**20)
    dump_joblib_1 = dump_joblib(model, 'joblib_1')
    print "dump_joblib_1=", dump_joblib_1, "%.2fMB" % (dump_joblib_1 / 2.0**20)
    dump_dot_1 = dump_dot(model, 'dot_1')
    print "dump_dot_1=", dump_dot_1, "%.2fMB" % (dump_dot_1 / 2.0**20)

    # drop the bootstrap-sample masks, which are only needed for oob_score
    for e in model.estimators_:
        del e.indices_
    model_test(model)  # the model still predicts exactly as before the deletion

    print "delete indices_ dump size"
    cpickle_size_2 = dump_cPickle(model, 'cpickle_2')
    print "dump_cPickle_2=", cpickle_size_2, "%.2fMB" % (cpickle_size_2 / 2.0**20)
    pickle_size_2 = dump_pickle(model, 'pickle_2')
    print "dump_pickle_2=", pickle_size_2, "%.2fMB" % (pickle_size_2 / 2.0**20)
    dump_joblib_2 = dump_joblib(model, 'joblib_2')
    print "dump_joblib_2=", dump_joblib_2, "%.2fMB" % (dump_joblib_2 / 2.0**20)
    dump_dot_2 = dump_dot(model, 'dot_2')
    print "dump_dot_2=", dump_dot_2, "%.2fMB" % (dump_dot_2 / 2.0**20)

    print "compress_rate"
    print "cpickle_compress", "%.2f%%" % (float(cpickle_size_2) / float(cpickle_size_1) * 100)
    print "pickle_compress", "%.2f%%" % (float(pickle_size_2) / float(pickle_size_1) * 100)
    print "joblib_compress", "%.2f%%" % (float(dump_joblib_2) / float(dump_joblib_1) * 100)
    print "dot_compress", "%.2f%%" % (float(dump_dot_2) / float(dump_dot_1) * 100)
    return model

if __name__ == "__main__":
    model = main()

Output:

score = 9.8706322981
normal dump size
dump_cPickle_1= 403963904 385.25MB
dump_pickle_1= 404131840 385.41MB
dump_joblib_1= 101802336 97.09MB
dump_dot_1= 1336226 1.27MB
score = 9.8706322981
delete indices_ dump size
dump_cPickle_2= 3903488 3.72MB
dump_pickle_2= 4034560 3.85MB
dump_joblib_2= 1671323 1.59MB
dump_dot_2= 1336226 1.27MB
compress_rate
cpickle_compress 0.97%
pickle_compress 1.00%
joblib_compress 1.64%
dot_compress 100.00%
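
As a sanity check, the slimmed dump loads back and predicts normally. A minimal sketch, assuming the cpickle_2/model.pkl file written by the script above:

#! /usr/bin/env python
# -*- coding: utf-8 -*-
import cPickle
with open("cpickle_2/model.pkl", "rb") as fp:
    model = cPickle.load(fp)
# the stripped forest still approximates sin(x)
print model.predict([[1.0]])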
@tokoroten (Author)
The problem is here:
https://github.com/scikit-learn/scikit-learn/blob/64e553398b3873de3d9d1e67f0cccb20e01824bc/sklearn/ensemble/forest.py#L96

If the bootstrap option is True, every fitted tree keeps an indices_ member variable (a boolean mask of its bootstrap sample) so that oob_score can be computed later. Its total size is (number_of_trees * len(train_dataset) * sizeof(bool)), so a model trained on a large dataset consumes a lot of memory when serialized.
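
With the numbers above the formula works out: 1000 trees * 100,000 training samples * 1 byte ≈ 95 MB of boolean masks, which matches the ~97 MB joblib dump (joblib stores numpy arrays as raw binary). Protocol-0 pickle escapes each mask byte into roughly four characters, which is consistent with the ~385 MB pickle dumps.

Deleting indices_ (as the script does) is one workaround. If you would rather not mutate the fitted model, joblib's compress option also shrinks the dump, at the cost of slower save/load. A minimal sketch, assuming the model object from the script above (the joblib_c folder name is just an example):

import os
import sklearn.externals.joblib
# compress trades dump/load speed for size; level 3 is a common middle ground
if not os.path.isdir("joblib_c"):
    os.mkdir("joblib_c")
sklearn.externals.joblib.dump(model, "joblib_c/model.pkl", compress=3)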
