Skip to content

Instantly share code, notes, and snippets.

@Polegar22
Last active April 16, 2018 22:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Polegar22/f9019bf80803758a6b1323217d31a99a to your computer and use it in GitHub Desktop.
Save Polegar22/f9019bf80803758a6b1323217d31a99a to your computer and use it in GitHub Desktop.
fastai/courses/ml1/FeatureImportanceScratch.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "%load_ext autoreload\n%autoreload 2",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "%matplotlib inline\n\nfrom fastai.imports import *\nfrom fastai.structured import *\nfrom sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\nfrom IPython.display import display\nfrom sklearn import metrics\nfrom numpy import array\nfrom numpy import reshape",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "class TreeEnsemble():\n    \"\"\"Bagged ensemble of DecisionTree regressors, each fit on a random subsample of rows.\n\n    x: DataFrame of features (rows are selected with .iloc).\n    y: targets, indexed with an integer array -- assumed to be a NumPy array; TODO confirm.\n    n_trees: number of trees to build.\n    sample_sz: number of rows drawn (without replacement) for each tree.\n    min_leaf: passed to every DecisionTree.\n    \"\"\"\n    def __init__(self, x, y, n_trees, sample_sz, min_leaf=5):\n        # Reseeds NumPy's global RNG, so the row subsamples are reproducible.\n        np.random.seed(42)\n        self.x,self.y,self.sample_sz,self.min_leaf = x,y,sample_sz,min_leaf\n        self.trees = [self.create_tree() for i in range(n_trees)]\n\n    def create_tree(self):\n        \"\"\"Fit one DecisionTree on a random subsample of at most sample_sz rows.\"\"\"\n        idxs = np.random.permutation(len(self.y))[:self.sample_sz]\n        # Size the tree's index array from the rows actually drawn: the slice above\n        # yields fewer than sample_sz rows when sample_sz > len(self.y).\n        return DecisionTree(self.x.iloc[idxs], self.y[idxs],\n                            idxs=np.arange(len(idxs)), min_leaf=self.min_leaf)\n\n    def predict(self, x):\n        \"\"\"Average the per-tree predictions for each row of x.\"\"\"\n        return np.mean([t.predict(x) for t in self.trees], axis=0)\n\ndef std_agg(cnt, s1, s2):\n    \"\"\"Population standard deviation from a count, a sum (s1) and a sum of squares (s2).\n\n    Rounding error can push the variance estimate slightly below zero, which makes\n    math.sqrt raise ValueError, so the estimate is clamped at zero first.\n    \"\"\"\n    return math.sqrt(max((s2/cnt) - (s1/cnt)**2, 0.))",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "class DecisionTree():\n    \"\"\"Regression tree that grows itself recursively inside the constructor.\n\n    x: DataFrame of features.\n    y: targets, indexed positionally by idxs.\n    idxs: integer positions of the rows belonging to this node.\n    min_leaf: split-size threshold used by find_better_split; forwarded to both children.\n    \"\"\"\n    def __init__(self, x, y, idxs, min_leaf=5):\n        self.x,self.y,self.idxs,self.min_leaf = x,y,idxs,min_leaf\n        self.n,self.c = len(idxs), x.shape[1]\n        self.val = np.mean(y[idxs])\n        self.score = float('inf')  # stays inf when no split is found, i.e. the node is a leaf\n        self.find_varsplit()\n\n    def find_varsplit(self):\n        \"\"\"Pick the best split across all columns, then build the two child nodes.\"\"\"\n        for i in range(self.c): self.find_better_split(i)\n        if self.is_leaf: return\n        x = self.split_col\n        lhs = np.nonzero(x<=self.split)[0]\n        rhs = np.nonzero(x>self.split)[0]\n        # Forward min_leaf: without it the children silently fall back to the default of 5.\n        self.lhs = DecisionTree(self.x, self.y, self.idxs[lhs], min_leaf=self.min_leaf)\n        self.rhs = DecisionTree(self.x, self.y, self.idxs[rhs], min_leaf=self.min_leaf)\n\n    def find_better_split(self, var_idx):\n        \"\"\"Scan column var_idx in sorted order and record the split if it beats self.score.\n\n        A split's score is lhs_std*lhs_cnt + rhs_std*rhs_cnt, computed by std_agg from running sums.\n        \"\"\"\n        x,y = self.x.values[self.idxs,var_idx], self.y[self.idxs]\n        sort_idx = np.argsort(x)\n        sort_y,sort_x = y[sort_idx], x[sort_idx]\n        rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()\n        lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0.\n\n        # With these bounds, every split that gets scored leaves more than min_leaf rows on each side.\n        for i in range(0,self.n-self.min_leaf-1):\n            xi,yi = sort_x[i],sort_y[i]\n            # Move row i from the right-hand side to the left-hand side.\n            lhs_cnt += 1; rhs_cnt -= 1\n            lhs_sum += yi; rhs_sum -= yi\n            lhs_sum2 += yi**2; rhs_sum2 -= yi**2\n            # Skip while the left side is too small, and never split between equal values.\n            if i<self.min_leaf or xi==sort_x[i+1]:\n                continue\n\n            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)\n            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)\n            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt\n            if curr_score<self.score:\n                self.var_idx,self.score,self.split = var_idx,curr_score,xi\n\n    @property\n    def split_name(self): return self.x.columns[self.var_idx]\n\n    @property\n    def split_col(self): return self.x.values[self.idxs,self.var_idx]\n\n    @property\n    def is_leaf(self): return self.score == float('inf')\n\n    def __repr__(self):\n        s = f'n: {self.n}; val:{self.val}'\n        if not self.is_leaf:\n            s += f'; score:{self.score}; split:{self.split}; var:{self.split_name}'\n        return s\n\n    def predict(self, x):\n        \"\"\"Predict for every row of x; each row is indexed by column position.\"\"\"\n        return np.array([self.predict_row(xi) for xi in x])\n\n    def predict_row(self, xi):\n        \"\"\"Walk down the tree for a single row and return the mean target of the leaf reached.\"\"\"\n        if self.is_leaf: return self.val\n        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs\n        return t.predict_row(xi)",
"execution_count": 11,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Load in our data from last lesson"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "PATH = \"data/bulldozers/\"\n\n# Read the raw frame from its feather file.\ndf_raw = pd.read_feather('tmp/bulldozers-raw')\n# Add a column of random integers next to the real features.\ndf_raw['random'] = np.random.randint(low=0, high=10000, size=len(df_raw))\ndf_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')",
"execution_count": 19,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def split_vals(a, n):\n    \"\"\"Split a into its first n items and everything after them.\"\"\"\n    head, tail = a[:n], a[n:]\n    return head, tail\n\n# Hold out the last n_valid rows for validation.\nn_valid = 12000\nn_trn = len(df_trn) - n_valid\nX_train, X_valid = split_vals(df_trn, n_trn)\ny_train, y_valid = split_vals(y_trn, n_trn)\nraw_train, raw_valid = split_vals(df_raw, n_trn)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Columns used by both models below; 'random' is the random-integer column added when loading.\ncols = [\n    'MachineID',\n    'YearMade',\n    'MachineHoursCurrentMeter',\n    'ProductSize',\n    'Enclosure',\n    'Coupler_System',\n    'saleYear',\n    'random',\n]",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Fit the hand-rolled ensemble, then predict on the validation rows.\ntree = TreeEnsemble(X_train[cols], y_train, n_trees=5, sample_sz=1000)\npreds = tree.predict(X_valid[cols].values)",
"execution_count": 22,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Feature importance"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def rmse(x, y):\n    \"\"\"Root mean squared error between x and y.\n\n    x - y must support ** and .mean(), e.g. NumPy arrays.\n    \"\"\"\n    squared_err = (x - y) ** 2\n    return math.sqrt(squared_err.mean())",
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def shuffle_df_column(df, column):\n    \"\"\"Randomly permute one column of df in place and return df.\n\n    column is the integer position of the column. The permuted values are written\n    back as a plain array, so they are placed purely by position. Assigning a\n    re-indexed DataFrame instead lets pandas versions that align on the index fill\n    the column with NaN whenever df's index is not 0..n-1.\n    \"\"\"\n    df.iloc[:, column] = df.iloc[:, column].sample(frac=1).values\n    return df\n",
"execution_count": 24,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def featureImportance(self, x,y):\n    \"\"\"Permutation importance of every column of x for the fitted model self.\n\n    Each column in turn is shuffled in a copy of x, and the absolute change in RMSE\n    against y, relative to the unshuffled baseline, is recorded.\n    Returns a DataFrame with columns 'cols' and 'imp', sorted by 'imp' descending.\n    self only needs a predict() that accepts x.values.\n    \"\"\"\n    benchRmse = rmse(self.predict(x.values), y)\n    records = []\n    for column in range(x.shape[1]):\n        shuffled = shuffle_df_column(x.copy(), column)\n        shuffledRmse = rmse(self.predict(shuffled.values), y)\n        records.append({'cols': x.columns[column], 'imp': np.absolute(benchRmse - shuffledRmse)})\n    # Build the frame once: growing it row by row with .loc is slow and, starting from\n    # empty columns, can leave 'imp' with a non-numeric dtype.\n    result = pd.DataFrame(records, columns=['cols', 'imp'])\n    return result.sort_values('imp', ascending=False)\n",
"execution_count": 25,
"outputs": []
},
{
"metadata": {
"scrolled": false,
"trusted": true
},
"cell_type": "code",
"source": "# Permutation importance of the hand-rolled ensemble, measured on the validation set.\nmy_fi = featureImportance(tree, X_valid[cols], y_valid)",
"execution_count": 35,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def plot_fi(fi):\n    \"\"\"Horizontal bar chart of the 'imp' column against the 'cols' column of fi.\"\"\"\n    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12,7), legend=False)",
"execution_count": 30,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Show the ten largest importances (my_fi is already sorted descending).\nplot_fi(my_fi.head(10));",
"execution_count": 33,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 864x504 with 1 Axes>",
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# sklearn results"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Reference model: scikit-learn random forest with the same number of trees as above.\nm = RandomForestRegressor(bootstrap=False, n_estimators=5)\nm.fit(X_train[cols], y_train)",
"execution_count": 27,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 27,
"data": {
"text/plain": "RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,\n oob_score=False, random_state=None, verbose=0, warm_start=False)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Importances reported for the scikit-learn model, plotted the same way for comparison.\nfi = rf_feat_importance(m, X_valid[cols])\nplot_fi(fi[:10])",
"execution_count": 34,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 34,
"data": {
"text/plain": "<matplotlib.axes._subplots.AxesSubplot at 0x2c50fd33b70>"
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 864x504 with 1 Axes>",
"image/png": "\n"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.5",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "f9019bf80803758a6b1323217d31a99a",
"data": {
"description": "fastai/courses/ml1/FeatureImportanceScratch.ipynb",
"public": true
}
},
"_draft": {
"nbviewer_url": "https://gist.github.com/f9019bf80803758a6b1323217d31a99a"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment