Skip to content

Instantly share code, notes, and snippets.

@ogrisel
Created July 10, 2012 15:36
Show Gist options
  • Save ogrisel/3084146 to your computer and use it in GitHub Desktop.
Save ogrisel/3084146 to your computer and use it in GitHub Desktop.
memmapping for random forests
/Users/oliviergrisel/coding/scikit-learn/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=2), iterable=<generator object <genexpr> at 0x10467b3c0>)
470 self.n_dispatched = 0
471 try:
472 for function, args, kwargs in iterable:
473 self.dispatch(function, args, kwargs)
474
--> 475 self.retrieve()
self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=2)>
476 # Make sure that we get a last message telling us we are done
477 elapsed_time = time.time() - self._start_time
478 self._print('Done %3i out of %3i | elapsed: %s finished',
479 (len(self._output),
---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
AttributeError Tue Jul 10 17:50:08 2012
PID: 46155 Python 2.6.1: /usr/bin/python
...........................................................................
/Users/oliviergrisel/coding/scikit-learn/sklearn/ensemble/forest.pyc in _parallel_build_trees(n_trees=50, forest=RandomForestClassifier(bootstrap=True, compute_i...te object at 0x1004b30f0>,
verbose=0), X=<class 'numpy.core.memmap.memmap'> instance, y=array([[ 1.],
[ 0.],
[ 1.],
...,
[ 0.],
[ 1.],
[ 1.]]), sample_mask=None, X_argsorted=None, seed=445760040, verbose=0)
73 tree.set_params(random_state=check_random_state(seed))
74
75 if forest.bootstrap:
76 n_samples = X.shape[0]
77 indices = random_state.randint(0, n_samples, n_samples)
---> 78 tree.fit(X[indices], y[indices],
i = 0
79 sample_mask=sample_mask, X_argsorted=X_argsorted)
80 tree.indices_ = indices
81
82 else:
...........................................................................
/Library/Python/2.6/site-packages/numpy/core/memmap.pyc in __array_finalize__(self=<class 'numpy.core.memmap.memmap'> instance, obj=<class 'numpy.core.memmap.memmap'> instance)
252 return self
253
254 def __array_finalize__(self, obj):
255 if hasattr(obj, '_mmap'):
256 self._mmap = obj._mmap
--> 257 self.filename = obj.filename
258 self.offset = obj.offset
259 self.mode = obj.mode
260 else:
261 self._mmap = None
AttributeError: 'memmap' object has no attribute 'filename'
___________________________________________________________________________
#!/usr/bin/env python
import numpy as np
from sklearn.datasets.samples_generator import make_classification
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
# Reproduction script: fit a parallel random forest on memory-mapped data.
# NOTE(review): presumably this is the script that produced the sub-process
# AttributeError shown in the traceback above (indexing a numpy memmap with
# an index array inside _parallel_build_trees) -- confirm against the
# library versions in use.

# Single-argument print(...) calls produce the same output under the
# Python 2 print statement and the Python 3 print function, so the script
# parses on both interpreters.
print("generating dataset")
X, y = make_classification(n_samples=100000, n_features=500)

filename = '/tmp/dataset.joblib'
print("memory mapping to " + filename)

# Persist X as a Fortran-ordered float32 array, then reload it with
# mmap_mode so that X is a memmap backed by the dumped file rather than a
# plain in-memory array.
joblib.dump(np.asarray(X, dtype=np.float32, order='F'), filename)
X = joblib.load(filename, mmap_mode='c')

print("fitting random forest:")
# n_jobs=2 sends the tree building through joblib's Parallel(n_jobs=2),
# which is where the memmap-backed X reaches the worker code.
clf = RandomForestClassifier(n_estimators=100, n_jobs=2)
print(clf.fit(X, y).score(X, y))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment