fonnesbeck/Black non-responders.ipynb

## Black non-responders.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "from redcap import Project\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pylab as plt\n",
    "sns.set_context('notebook')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "api_url = 'https://redcap.vanderbilt.edu/api/'\n",
    "\n",
    "# Read the API token from a local file so it never appears in the notebook.\n",
    "# The context manager closes the handle, and strip() removes the trailing\n",
    "# newline a text file usually ends with, which would otherwise be sent as\n",
    "# part of the token.\n",
    "with open(\"token.txt\") as token_file:\n",
    "    hospitalized_key = token_file.read().strip()\n",
    "\n",
    "hospitalized_proj = Project(api_url, hospitalized_key)\n",
    "# Export every record as a DataFrame indexed by the project's first field.\n",
    "hospitalized_raw = hospitalized_proj.export_records(\n",
    "    format='df',\n",
    "    df_kwargs={'index_col': hospitalized_proj.field_names[0]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Keep only the records whose race code is 3. The .copy() detaches the subset\n",
    "# so the column assignments in later cells do not touch hospitalized_raw.\n",
    "is_race_3 = hospitalized_raw['race'] == 3\n",
    "black_subset = hospitalized_raw.loc[is_race_3].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Fold the two therapy_response checkbox columns into one outcome column:\n",
    "# 1 where therapy_response___0 == 1, 0 where therapy_response___1 == 1 (assigned\n",
    "# last, so it wins if both are set), and missing where neither is set.\n",
    "option_0_checked = black_subset['therapy_response___0'] == 1\n",
    "option_1_checked = black_subset['therapy_response___1'] == 1\n",
    "\n",
    "black_subset['response'] = None\n",
    "black_subset.loc[option_0_checked, 'response'] = 1\n",
    "black_subset.loc[option_1_checked, 'response'] = 0\n",
    "# The None placeholders become NaN once the column is cast to float.\n",
    "black_subset['response'] = black_subset['response'].astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    63\n",
       "0    24\n",
       "Name: response, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of records for each value of the outcome column.\n",
    "black_subset['response'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dob = pd.to_datetime(black_subset.dob)\n",
    "adm = pd.to_datetime(black_subset.date_admission)\n",
    "# Age at admission in years (365.25 days per year). Dividing the timedelta by\n",
    "# one day gives float days and keeps a missing date as NaN. The earlier\n",
    "# astype(int) turned NaT into a huge negative integer (or raises on newer\n",
    "# pandas), producing a non-null 'age' that slipped past the missing-value\n",
    "# filter and the mean imputation further down.\n",
    "black_subset['age'] = (adm - dob) / np.timedelta64(1, 'D') / 365.25"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "column_names = black_subset.columns\n",
    "response_cols = column_names[column_names.str.startswith('therapy_response')]\n",
    "echo_cols = column_names[column_names.str.startswith('echo_')]\n",
    "meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']\n",
    "# One flat list of labels to discard: the therapy_response* columns, the\n",
    "# explicitly listed columns above, and every echo_* column.\n",
    "drop_cols = list(response_cols) + meaningless_cols + list(echo_cols)\n",
    "\n",
    "# Keep only rows with a non-missing response, then remove the unwanted columns.\n",
    "rows_with_response = black_subset.dropna(subset=['response'])\n",
    "black_subset = rows_with_response.drop(drop_cols, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Keep the columns that have fewer than 5 missing values and a non-object dtype.\n",
    "missing_per_column = black_subset.isnull().sum(axis=0)\n",
    "keep_column = (missing_per_column < 5) & (black_subset.dtypes != object)\n",
    "black_subset_low_missing = black_subset.loc[:, keep_column]\n",
    "\n",
    "\n",
    "def _fill_with_column_mean(column):\n",
    "    \"\"\"Return `column` with its missing values replaced by the column mean.\"\"\"\n",
    "    return column.fillna(column.mean())\n",
    "\n",
    "\n",
    "black_subset_complete = black_subset_low_missing.apply(_fill_with_column_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(87, 70)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "black_subset_complete.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "\n",
    "# Separate the outcome from the predictors without modifying\n",
    "# black_subset_complete itself.\n",
    "y = black_subset_complete['response'].copy()\n",
    "X = black_subset_complete.drop('response', axis=1)\n",
    "\n",
    "# Scaled version of the predictors, produced by sklearn's preprocessing.scale.\n",
    "X_scaled = preprocessing.scale(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing._weights import _balance_weights\n",
    "\n",
    "w = _balance_weights(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    63\n",
       "0    24\n",
       "Name: response, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class sizes of the outcome, shown again next to the weights computed from them.\n",
    "black_subset['response'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import cross_validation\n",
    "\n",
    "# Hold out 40% of the rows for testing; random_state=0 fixes the shuffle.\n",
    "split = cross_validation.train_test_split(X_scaled, y, test_size=0.4, random_state=0)\n",
    "X_train, X_test, y_train, y_test = split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "best parameter choice: {'max_depth': 25}\n"
     ]
    }
   ],
   "source": [
    "from sklearn.grid_search import GridSearchCV\n",
    "\n",
    "# Class weights come from the per-sample weights w: the largest weight for\n",
    "# class 0 and the smallest for class 1. random_state pins the forest's\n",
    "# randomness so the selected depth is the same from run to run.\n",
    "rfc = RandomForestClassifier(n_jobs=4, class_weight={0:w.max(), 1:w.min()},\n",
    "                             random_state=0)\n",
    "\n",
    "# 5-fold search over tree depth, scored by precision.\n",
    "# NOTE(review): the search is fit on all of X, y, which includes the rows that\n",
    "# train_test_split set aside as X_test - consider fitting on the training split.\n",
    "grid = GridSearchCV(rfc,\n",
    "                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},\n",
    "                    scoring='precision', cv=5)\n",
    "grid.fit(X, y)\n",
    "\n",
    "# The function-call form prints the same text under Python 2 and Python 3.\n",
    "print(\"best parameter choice: %s\" % (grid.best_params_,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>prediction</th>\n",
       "      <th>0.0</th>\n",
       "      <th>1.0</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>actual</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "prediction  0   1\n",
       "actual           \n",
       "0           8   1\n",
       "1           1  25"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Weighted forest at the depth reported by the grid search. random_state makes\n",
    "# the confusion matrix below and rf.feature_importances_ repeatable.\n",
    "rf = RandomForestClassifier(n_jobs=4, max_depth=25,\n",
    "                            class_weight={0:w.max(), 1:w.min()},\n",
    "                            random_state=0)\n",
    "rf.fit(X_train, y_train)\n",
    "\n",
    "# Cross-tabulate actual against predicted labels on the held-out rows.\n",
    "preds = rf.predict(X_test)\n",
    "pd.crosstab(y_test, preds, rownames=['actual'],\n",
    "            colnames=['prediction'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.91304348,  0.83333333,  0.91304348])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.cross_validation import cross_val_score\n",
    "\n",
    "# 3-fold cross-validated precision of a 50-tree weighted forest with depth 5.\n",
    "# random_state fixes the forest's randomness so the scores are repeatable.\n",
    "cv_forest = RandomForestClassifier(class_weight={0:w.max(), 1:w.min()},\n",
    "                                   n_estimators=50, max_depth=5,\n",
    "                                   random_state=0)\n",
    "cross_val_score(cv_forest, X, y, cv=3, scoring='precision')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "kd_therapy___1                    0.319578\n",
       "hd_num_days                       0.132306\n",
       "age                               0.085432\n",
       "kd_therapy___9                    0.050139\n",
       "abnormality___5                   0.044890\n",
       "abnormality___3                   0.039355\n",
       "lab_criteria___8                  0.038673\n",
       "num_echo_post_eval                0.038341\n",
       "illness_day_at_rx                 0.036826\n",
       "lab_criteria___7                  0.023252\n",
       "subsequent_diagnosis              0.018851\n",
       "abnormality___1                   0.018535\n",
       "abnormality___8                   0.016046\n",
       "lab_criteria___5                  0.012319\n",
       "lab_criteria___2                  0.012073\n",
       "clinical_criteria___3             0.011801\n",
       "clinical_criteria___5             0.009152\n",
       "clinical_criteria___1             0.008939\n",
       "picu_admission                    0.008713\n",
       "lab_criteria___6                  0.008495\n",
       "clinical_criteria___4             0.008262\n",
       "subsequent_diagnosis_of_cv___2    0.007222\n",
       "lab_criteria___1                  0.007044\n",
       "type_of_subsequent_diagnos___4    0.005527\n",
       "sex                               0.005014\n",
       "dtype: float64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label each importance from the fitted forest with its predictor name and\n",
    "# display the 25 largest.\n",
    "importance = pd.Series(rf.feature_importances_, index=X.columns)\n",
    "top_importance = importance.sort_values(ascending=False).head(25)\n",
    "top_importance"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"%matplotlib inline\n",
	"from redcap import Project\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"import seaborn as sns\n",
	"import matplotlib.pylab as plt\n",
	"sns.set_context('notebook')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"api_url = 'https://redcap.vanderbilt.edu/api/'\n",
	"\n",
	"# Read the API token from a local file so it never appears in the notebook.\n",
	"# The context manager closes the handle, and strip() removes the trailing\n",
	"# newline a text file usually ends with, which would otherwise be sent as\n",
	"# part of the token.\n",
	"with open(\"token.txt\") as token_file:\n",
	"    hospitalized_key = token_file.read().strip()\n",
	"\n",
	"hospitalized_proj = Project(api_url, hospitalized_key)\n",
	"# Export every record as a DataFrame indexed by the project's first field.\n",
	"hospitalized_raw = hospitalized_proj.export_records(\n",
	"    format='df',\n",
	"    df_kwargs={'index_col': hospitalized_proj.field_names[0]})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Keep only the records whose race code is 3. The .copy() detaches the subset\n",
	"# so the column assignments in later cells do not touch hospitalized_raw.\n",
	"is_race_3 = hospitalized_raw['race'] == 3\n",
	"black_subset = hospitalized_raw.loc[is_race_3].copy()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Fold the two therapy_response checkbox columns into one outcome column:\n",
	"# 1 where therapy_response___0 == 1, 0 where therapy_response___1 == 1 (assigned\n",
	"# last, so it wins if both are set), and missing where neither is set.\n",
	"option_0_checked = black_subset['therapy_response___0'] == 1\n",
	"option_1_checked = black_subset['therapy_response___1'] == 1\n",
	"\n",
	"black_subset['response'] = None\n",
	"black_subset.loc[option_0_checked, 'response'] = 1\n",
	"black_subset.loc[option_1_checked, 'response'] = 0\n",
	"# The None placeholders become NaN once the column is cast to float.\n",
	"black_subset['response'] = black_subset['response'].astype(float)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1 63\n",
	"0 24\n",
	"Name: response, dtype: int64"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Number of records for each value of the outcome column.\n",
	"black_subset['response'].value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"dob = pd.to_datetime(black_subset.dob)\n",
	"adm = pd.to_datetime(black_subset.date_admission)\n",
	"# Age at admission in years (365.25 days per year). Dividing the timedelta by\n",
	"# one day gives float days and keeps a missing date as NaN. The earlier\n",
	"# astype(int) turned NaT into a huge negative integer (or raises on newer\n",
	"# pandas), producing a non-null 'age' that slipped past the missing-value\n",
	"# filter and the mean imputation further down.\n",
	"black_subset['age'] = (adm - dob) / np.timedelta64(1, 'D') / 365.25"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"column_names = black_subset.columns\n",
	"response_cols = column_names[column_names.str.startswith('therapy_response')]\n",
	"echo_cols = column_names[column_names.str.startswith('echo_')]\n",
	"meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']\n",
	"# One flat list of labels to discard: the therapy_response* columns, the\n",
	"# explicitly listed columns above, and every echo_* column.\n",
	"drop_cols = list(response_cols) + meaningless_cols + list(echo_cols)\n",
	"\n",
	"# Keep only rows with a non-missing response, then remove the unwanted columns.\n",
	"rows_with_response = black_subset.dropna(subset=['response'])\n",
	"black_subset = rows_with_response.drop(drop_cols, axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Keep the columns that have fewer than 5 missing values and a non-object dtype.\n",
	"missing_per_column = black_subset.isnull().sum(axis=0)\n",
	"keep_column = (missing_per_column < 5) & (black_subset.dtypes != object)\n",
	"black_subset_low_missing = black_subset.loc[:, keep_column]\n",
	"\n",
	"\n",
	"def _fill_with_column_mean(column):\n",
	"    \"\"\"Return `column` with its missing values replaced by the column mean.\"\"\"\n",
	"    return column.fillna(column.mean())\n",
	"\n",
	"\n",
	"black_subset_complete = black_subset_low_missing.apply(_fill_with_column_mean)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(87, 70)"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"black_subset_complete.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn import preprocessing\n",
	"\n",
	"# Separate the outcome from the predictors without modifying\n",
	"# black_subset_complete itself.\n",
	"y = black_subset_complete['response'].copy()\n",
	"X = black_subset_complete.drop('response', axis=1)\n",
	"\n",
	"# Scaled version of the predictors, produced by sklearn's preprocessing.scale.\n",
	"X_scaled = preprocessing.scale(X)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn.preprocessing._weights import _balance_weights\n",
	"\n",
	"w = _balance_weights(y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1 63\n",
	"0 24\n",
	"Name: response, dtype: int64"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Class sizes of the outcome, shown again next to the weights computed from them.\n",
	"black_subset['response'].value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn import cross_validation\n",
	"\n",
	"# Hold out 40% of the rows for testing; random_state=0 fixes the shuffle.\n",
	"split = cross_validation.train_test_split(X_scaled, y, test_size=0.4, random_state=0)\n",
	"X_train, X_test, y_train, y_test = split"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.ensemble import RandomForestClassifier\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"best parameter choice: {'max_depth': 25}\n"
	]
	}
	],
	"source": [
	"from sklearn.grid_search import GridSearchCV\n",
	"\n",
	"# Class weights come from the per-sample weights w: the largest weight for\n",
	"# class 0 and the smallest for class 1. random_state pins the forest's\n",
	"# randomness so the selected depth is the same from run to run.\n",
	"rfc = RandomForestClassifier(n_jobs=4, class_weight={0:w.max(), 1:w.min()},\n",
	"                             random_state=0)\n",
	"\n",
	"# 5-fold search over tree depth, scored by precision.\n",
	"# NOTE(review): the search is fit on all of X, y, which includes the rows that\n",
	"# train_test_split set aside as X_test - consider fitting on the training split.\n",
	"grid = GridSearchCV(rfc,\n",
	"                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},\n",
	"                    scoring='precision', cv=5)\n",
	"grid.fit(X, y)\n",
	"\n",
	"# The function-call form prints the same text under Python 2 and Python 3.\n",
	"print(\"best parameter choice: %s\" % (grid.best_params_,))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th>prediction</th>\n",
	" <th>0.0</th>\n",
	" <th>1.0</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>actual</th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>8</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>25</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	"prediction 0 1\n",
	"actual \n",
	"0 8 1\n",
	"1 1 25"
	]
	},
	"execution_count": 26,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Weighted forest at the depth reported by the grid search. random_state makes\n",
	"# the confusion matrix below and rf.feature_importances_ repeatable.\n",
	"rf = RandomForestClassifier(n_jobs=4, max_depth=25,\n",
	"                            class_weight={0:w.max(), 1:w.min()},\n",
	"                            random_state=0)\n",
	"rf.fit(X_train, y_train)\n",
	"\n",
	"# Cross-tabulate actual against predicted labels on the held-out rows.\n",
	"preds = rf.predict(X_test)\n",
	"pd.crosstab(y_test, preds, rownames=['actual'],\n",
	"            colnames=['prediction'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([ 0.91304348, 0.83333333, 0.91304348])"
	]
	},
	"execution_count": 27,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.cross_validation import cross_val_score\n",
	"\n",
	"# 3-fold cross-validated precision of a 50-tree weighted forest with depth 5.\n",
	"# random_state fixes the forest's randomness so the scores are repeatable.\n",
	"cv_forest = RandomForestClassifier(class_weight={0:w.max(), 1:w.min()},\n",
	"                                   n_estimators=50, max_depth=5,\n",
	"                                   random_state=0)\n",
	"cross_val_score(cv_forest, X, y, cv=3, scoring='precision')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"kd_therapy___1 0.319578\n",
	"hd_num_days 0.132306\n",
	"age 0.085432\n",
	"kd_therapy___9 0.050139\n",
	"abnormality___5 0.044890\n",
	"abnormality___3 0.039355\n",
	"lab_criteria___8 0.038673\n",
	"num_echo_post_eval 0.038341\n",
	"illness_day_at_rx 0.036826\n",
	"lab_criteria___7 0.023252\n",
	"subsequent_diagnosis 0.018851\n",
	"abnormality___1 0.018535\n",
	"abnormality___8 0.016046\n",
	"lab_criteria___5 0.012319\n",
	"lab_criteria___2 0.012073\n",
	"clinical_criteria___3 0.011801\n",
	"clinical_criteria___5 0.009152\n",
	"clinical_criteria___1 0.008939\n",
	"picu_admission 0.008713\n",
	"lab_criteria___6 0.008495\n",
	"clinical_criteria___4 0.008262\n",
	"subsequent_diagnosis_of_cv___2 0.007222\n",
	"lab_criteria___1 0.007044\n",
	"type_of_subsequent_diagnos___4 0.005527\n",
	"sex 0.005014\n",
	"dtype: float64"
	]
	},
	"execution_count": 32,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Label each importance from the fitted forest with its predictor name and\n",
	"# display the 25 largest.\n",
	"importance = pd.Series(rf.feature_importances_, index=X.columns)\n",
	"top_importance = importance.sort_values(ascending=False).head(25)\n",
	"top_importance"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}