@jonathan-taylor
Last active February 25, 2020 22:59
Segfault with pandas conversion and joblib?
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"\n",
"import numpy as np, pandas as pd\n",
"\n",
"import rpy2.robjects as rpy\n",
"from rpy2.robjects.packages import importr\n",
"from rpy2.robjects import pandas2ri\n",
"from rpy2.robjects.conversion import localconverter\n",
"\n",
"from sklearn.base import BaseEstimator, RegressorMixin\n",
"from sklearn.model_selection import cross_validate, check_cv\n",
"from sklearn.metrics import mean_squared_error\n",
"from joblib.parallel import Parallel, delayed\n",
"\n",
"rpy.r('library(leaps)')\n",
"\n",
"class Subset(BaseEstimator, RegressorMixin):\n",
"\n",
" def __init__(self,\n",
" formula_str,\n",
" nvar,\n",
" method='exhaustive'):\n",
"\n",
" self.formula_str = formula_str\n",
" self.formula = rpy.r(formula_str)\n",
" self.method = method\n",
" self.nvar = nvar\n",
"\n",
" def fit(self, X, y):\n",
" \"\"\"Fit best subsets regression for a given `nvar` \n",
" \n",
" Parameters \n",
" ---------- \n",
" X : {array-like}, shape (n_samples, n_features) \n",
" Training data. Pass directly as Fortran-contiguous data \n",
" to avoid unnecessary memory duplication. If y is mono-output, \n",
" X can be sparse. \n",
" \n",
" y : array-like, shape (n_samples,) or (n_samples, n_targets) \n",
" Target values \n",
" \"\"\"\n",
" leaps = importr('leaps')\n",
" base = importr('base')\n",
"\n",
" D = _convert_df(pd.concat([X, y], axis=1).copy()) # reconstitute data \n",
" _regfit = rpy.r['regsubsets'](self.formula,\n",
" data=D,\n",
" method=self.method,\n",
" nvmax=self.nvar)\n",
" self._which = rpy.r['summary'](_regfit).rx2('which').astype(np.bool)\n",
" _names = _regfit.rx2('xnames')\n",
" self._nz_coef = rpy.r['coef'](_regfit, self.nvar)\n",
" self.coef_ = pd.Series(self._nz_coef, index=_names[self._which[self.nvar-1]])\n",
"\n",
" def predict(self, X):\n",
" stats = importr('stats')\n",
" with localconverter(rpy.default_converter + pandas2ri.converter):\n",
" _X = stats.model_matrix(self.formula, data=X)\n",
" _X = _X[:, self._which[self.nvar-1]] # 0-based indexing \n",
" return _X.dot(self._nz_coef)\n",
" \n",
"class SubsetCV(BaseEstimator, RegressorMixin): \n",
"\n",
" def __init__(self, \n",
" formula_str, \n",
" nvmax, \n",
" method='exhaustive', \n",
" cv=3, \n",
" n_jobs=-1,\n",
" verbose=False):\n",
" self.formula_str = formula_str\n",
" self.formula = rpy.r(formula_str)\n",
" self.method = method\n",
" self.cv = cv\n",
" self.nvmax = nvmax\n",
" self.n_jobs = n_jobs\n",
" self.verbose = verbose\n",
"\n",
" def fit(self, X, y):\n",
" \"\"\"Fit cross-validated best subsets regression\n",
"\n",
" Fit is over a grid of sizes up to `nvmax`.\n",
"\n",
" Parameters\n",
" ----------\n",
" X : {array-like}, shape (n_samples, n_features)\n",
" Training data. Pass directly as Fortran-contiguous data\n",
" to avoid unnecessary memory duplication. If y is mono-output,\n",
" X can be sparse.\n",
"\n",
" y : array-like, shape (n_samples,) or (n_samples, n_targets)\n",
" Target values\n",
" \"\"\"\n",
"\n",
" if X.shape[0] != y.shape[0]:\n",
" raise ValueError(\"X and y have inconsistent dimensions (%d != %d)\"\n",
" % (X.shape[0], y.shape[0]))\n",
"\n",
" cv = check_cv(self.cv)\n",
"\n",
" # Compute path for all folds and compute MSE to get the best alpha\n",
" folds = list(cv.split(X, y))\n",
" best_mse = np.inf\n",
"\n",
" # Loop over folds, computing mse path\n",
" # for each (train, test)\n",
" D = pd.concat([X, y], axis=1) # reconstitute data\n",
"\n",
" jobs = (delayed(_regsubsets_MSE)(X,\n",
" y,\n",
" self.formula, \n",
" self.method, \n",
" self.nvmax, \n",
" train, \n",
" test)\n",
" for train, test in folds)\n",
"\n",
" # Execute the jobs in parallel using joblib\n",
" MSEs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,\n",
" backend=\"threading\")(jobs)\n",
" MSEs = np.asarray(MSEs)\n",
" self.mse_path = MSEs.mean(0)\n",
"\n",
" # Find best index by minimizing MSEs\n",
"\n",
" self.best_index = int(np.argmin(self.mse_path) + 1)\n",
"\n",
" # Refit with best index hyperparameter \n",
" self.best_estimator = Subset(self.formula_str,\n",
" self.best_index,\n",
" method=self.method)\n",
" print(self.best_index)\n",
" self.best_estimator.fit(X, y)\n",
" self.coef_ = self.best_estimator.coef_\n",
"\n",
" \n",
" return self\n",
"\n",
" def predict(self, D):\n",
" return self.best_estimator.predict(D)\n",
" \n",
"\n"
]
},
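{
"cell_type": "markdown",
"metadata": {},
"source": [
"`Subset` wraps R's `regsubsets` (from the `leaps` package) for a single model size `nvar`, while `SubsetCV` cross-validates over sizes 1..`nvmax` and dispatches the per-fold work through joblib. As a quick illustrative check: constructing the estimator only stores the hyperparameters and parses the formula string with R; the actual fitting happens in `fit`. The small cell below simply echoes the stored parameters via scikit-learn's `get_params`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: instantiation only stores hyperparameters and parses the\n",
"# formula; regsubsets is not called until fit() is invoked with data.\n",
"Subset('Y ~ .', nvar=3).get_params()"
]
},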
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def _regsubsets_MSE(X,\n",
" y,\n",
" formula, \n",
" method, \n",
" nvar, \n",
" train, \n",
" test):\n",
"\n",
" D = pd.concat([X, y], axis=1) # reconstitute data\n",
"\n",
" Dtrain = _convert_df(D.loc[D.index[train]].copy())\n",
" Dtest = _convert_df(D.loc[D.index[test]].copy())\n",
"\n",
" _regfit = rpy.r['regsubsets'](formula, \n",
" data=Dtrain, \n",
" method=method,\n",
" nvmax=nvar)\n",
" _which = np.asarray(rpy.r['summary'](_regfit).rx2('which')).astype(np.bool)\n",
" _Xtest = np.asarray(rpy.r['model.matrix'](formula, data=Dtest))\n",
" _coef = np.zeros(_Xtest.shape[1])\n",
"\n",
" _MSEs = []\n",
"\n",
" _y_test = y.loc[y.index[test]]\n",
" for ivar in range(1, nvar+1):\n",
" _nz_coef = rpy.r['coef'](_regfit, ivar)\n",
" _mask = _which[ivar-1]\n",
" _coef *= 0\n",
" _coef[_mask] = _nz_coef\n",
" _y_hat = _Xtest.dot(_coef)\n",
" _MSEs.append(((_y_test - _y_hat)**2).mean())\n",
"\n",
" return _MSEs\n",
"\n",
"def _convert_df(pd_df):\n",
" with localconverter(rpy.default_converter + pandas2ri.converter):\n",
" r_from_pd_df = rpy.conversion.py2rpy(pd_df)\n",
" return r_from_pd_df\n",
"\n",
"def _predict(formula, D, _which, nvar, _nz_coef):\n",
" D_ = _convert_df(D)\n",
" _X = np.asarray(rpy.r['model_matrix'](formula, data=D_))\n",
" _X = _X[:, _which[nvar-1]] # 0-based indexing \n",
" return _X.dot(_nz_coef)"
]
},
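{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small, single-threaded sanity check of the pandas-to-R conversion the title refers to (an illustrative sketch, with a made-up `toy` frame): `_convert_df` wraps `py2rpy` in a `localconverter`, so a DataFrame becomes an R data.frame only for the duration of that call. The round trip back uses `rpy.conversion.rpy2py` under the same converter, mirroring the rpy2 3.x pattern already used above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch of the pandas <-> R data.frame conversion used above,\n",
"# run in a single thread. `toy` is a hypothetical example frame.\n",
"toy = pd.DataFrame({'x': np.arange(3.), 'y': np.arange(3.) ** 2})\n",
"r_toy = _convert_df(toy)   # pandas -> R via py2rpy inside a localconverter\n",
"print(rpy.r['class'](r_toy))   # should report 'data.frame'\n",
"with localconverter(rpy.default_converter + pandas2ri.converter):\n",
"    pd_back = rpy.conversion.rpy2py(r_toy)   # R -> pandas, same converter context\n",
"print(pd_back)"
]
},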
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'Y'], dtype='object')\n",
"[1.1044207361816287, 1.1127754845670763, 1.1162538883158355, 1.1969607794847699, 1.1971463482157412, 1.2241621673072327, 1.2152457283584275, 1.231690457342606]\n"
]
}
],
"source": [
"D = pd.DataFrame(dict([('X%d' % i, np.random.standard_normal(200)) for i in range(10)] + \n",
" [('Y', np.random.standard_normal(200))]))\n",
"print(D.columns)\n",
"test = np.ones(200, np.bool)\n",
"test[:50] = 0\n",
"train = ~test\n",
"subset_CV = SubsetCV('Y ~ .', 12, method='exhaustive', cv=10)\n",
"print(_regsubsets_MSE(D, D['Y'], subset_CV.formula, 'exhaustive', 8, train, test))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10\n"
]
},
{
"data": {
"text/plain": [
"SubsetCV(cv=10, formula_str='Y ~ .', method='exhaustive', n_jobs=-1, nvmax=10,\n",
" verbose=False)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset_CV = SubsetCV('Y ~ .', 10, method='exhaustive', cv=10)\n",
"subset_CV.fit(D, D['Y'])"
]
},
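{
"cell_type": "markdown",
"metadata": {},
"source": [
"The run above pushes the per-fold `regsubsets` calls through joblib's `threading` backend, so several threads call into embedded R and the pandas/rpy2 converters at once; embedded R is generally not thread-safe, which makes this a plausible place for the segfault in the title to originate. The sketch below keeps the same `SubsetCV` interface but passes `n_jobs=1`, so joblib runs the folds sequentially and only one thread ever talks to R. This is an illustrative workaround attempt, not a verified fix for the crash."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Workaround sketch (untested against the original crash): n_jobs=1 makes joblib\n",
"# execute the fold jobs sequentially, so R is never entered from multiple threads.\n",
"subset_CV_seq = SubsetCV('Y ~ .', 10, method='exhaustive', cv=10, n_jobs=1)\n",
"subset_CV_seq.fit(D, D['Y'])\n",
"subset_CV_seq.mse_path"
]
},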
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}