Skip to content

Instantly share code, notes, and snippets.

@fonnesbeck
Last active October 13, 2015 03:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fonnesbeck/7bb6a08c4fae71a21a2c to your computer and use it in GitHub Desktop.
Save fonnesbeck/7bb6a08c4fae71a21a2c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"from redcap import Project\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pylab as plt\n",
"sns.set_context('notebook')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"api_url = 'https://redcap.vanderbilt.edu/api/'\n",
"\n",
"# Read the REDCap API token from a local file so it never appears in the notebook.\n",
"# The context manager closes the file handle, and strip() removes the trailing\n",
"# newline that would otherwise be sent to the API as part of the token.\n",
"with open(\"token.txt\") as token_file:\n",
"    hospitalized_key = token_file.read().strip()\n",
"\n",
"hospitalized_proj = Project(api_url, hospitalized_key)\n",
"# Export all records as a DataFrame indexed by the first entry of the\n",
"# project's field_names.\n",
"hospitalized_raw = hospitalized_proj.export_records(format='df', \n",
"    df_kwargs={'index_col': hospitalized_proj.field_names[0]})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep only the records whose race code equals 3. The copy means later column\n",
"# assignments do not write into a slice of hospitalized_raw.\n",
"black_subset = hospitalized_raw.loc[hospitalized_raw['race'] == 3].copy()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Collapse the two therapy_response checkbox columns into one float column:\n",
"# 0.0 where therapy_response___1 is checked, otherwise 1.0 where\n",
"# therapy_response___0 is checked, otherwise NaN. The ___1 condition is listed\n",
"# first so that it takes precedence when both boxes are checked.\n",
"black_subset['response'] = np.select(\n",
"    [(black_subset['therapy_response___1'] == 1).values,\n",
"     (black_subset['therapy_response___0'] == 1).values],\n",
"    [0.0, 1.0],\n",
"    default=np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1 63\n",
"0 24\n",
"Name: response, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of records per response value (missing responses are not counted).\n",
"black_subset['response'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Age at admission, in years of 365.25 days.\n",
"dob = pd.to_datetime(black_subset.dob)\n",
"adm = pd.to_datetime(black_subset.date_admission)\n",
"# Divide the timedelta by a 365.25-day Timedelta instead of casting it to integer\n",
"# nanoseconds: a missing date (NaT) then gives NaN, rather than a sentinel\n",
"# integer (or an error, depending on the pandas version) from astype(int).\n",
"black_subset['age'] = (adm - dob) / pd.Timedelta(days=365.25)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Columns to exclude from the feature set: every column whose name starts with\n",
"# 'therapy_response' or 'echo_', plus the three names in meaningless_cols.\n",
"response_cols = black_subset.filter(regex='^therapy_response').columns\n",
"echo_cols = black_subset.filter(regex='^echo_').columns\n",
"meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']\n",
"drop_cols = np.concatenate([response_cols.values, meaningless_cols, echo_cols])\n",
"\n",
"# Discard rows without a response first, then remove the excluded columns.\n",
"black_subset = (black_subset\n",
"                .dropna(subset=['response'])\n",
"                .drop(drop_cols, axis=1))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep columns that have fewer than 5 missing values and a non-object dtype.\n",
"few_missing = black_subset.isnull().sum(axis=0) < 5\n",
"not_object = black_subset.dtypes != object\n",
"black_subset_low_missing = black_subset.loc[:, few_missing & not_object]\n",
"\n",
"# Fill the remaining gaps in each kept column with that column's mean.\n",
"black_subset_complete = black_subset_low_missing.apply(\n",
"    lambda column: column.fillna(column.mean()))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(87, 70)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) of the mean-imputed frame.\n",
"black_subset_complete.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn import preprocessing\n",
"\n",
"# Split the outcome from the predictors, leaving black_subset_complete untouched.\n",
"y = black_subset_complete['response'].copy()\n",
"X = black_subset_complete.drop('response', axis=1)\n",
"\n",
"# Standardize the predictors using sklearn's scale() with its defaults.\n",
"X_scaled = preprocessing.scale(X)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Per-sample weights that balance the classes: each sample is weighted by\n",
"# (size of the smallest class) / (size of its own class), so the smallest class\n",
"# gets weight 1 and larger classes are down-weighted. This is computed with NumPy\n",
"# rather than by importing sklearn.preprocessing._weights._balance_weights, whose\n",
"# leading underscores mark it as private to scikit-learn.\n",
"_, class_codes = np.unique(y, return_inverse=True)\n",
"class_counts = np.bincount(class_codes)\n",
"# astype(float) keeps this a true division under Python 2 as well.\n",
"w = class_counts.min() / class_counts[class_codes].astype(float)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1 63\n",
"0 24\n",
"Name: response, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Class sizes of the outcome that the balancing weights are based on.\n",
"black_subset['response'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn import cross_validation\n",
"\n",
"# Hold out 40% of the rows for testing; the fixed random_state keeps the split\n",
"# identical from run to run.\n",
"split_options = {'test_size': 0.4, 'random_state': 0}\n",
"X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n",
"    X_scaled, y, **split_options)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"best parameter choice: {'max_depth': 25}\n"
]
}
],
"source": [
"from sklearn.grid_search import GridSearchCV\n",
"\n",
"# Base forest for the search. class_weight gives class 0 (the rarer outcome in\n",
"# the counts shown earlier) the larger balancing weight; random_state makes the\n",
"# search give the same answer on every run.\n",
"rfc = RandomForestClassifier(n_jobs=4, random_state=0,\n",
"                             class_weight={0:w.max(), 1:w.min()})\n",
"\n",
"# 5-fold search over tree depth, scored by precision.\n",
"grid = GridSearchCV(rfc, \n",
"                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},\n",
"                    scoring='precision', cv=5)\n",
"# Tune on the training split only, so the held-out rows used to evaluate the\n",
"# final model play no part in choosing max_depth.\n",
"grid.fit(X_train, y_train)\n",
"\n",
"# A single formatted string prints the same text whether print is the\n",
"# Python 2 statement or the Python 3 function.\n",
"print(\"best parameter choice: {}\".format(grid.best_params_))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>prediction</th>\n",
" <th>0.0</th>\n",
" <th>1.0</th>\n",
" </tr>\n",
" <tr>\n",
" <th>actual</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"prediction 0 1\n",
"actual \n",
"0 8 1\n",
"1 1 25"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the weighted forest on the training split with max_depth=25.\n",
"# random_state fixes the forest's internal randomness so the confusion table\n",
"# below, and anything later derived from rf, is repeatable, in line with the\n",
"# seeded train/test split.\n",
"rf = RandomForestClassifier(n_jobs=4, max_depth=25, random_state=0,\n",
"                            class_weight={0:w.max(), 1:w.min()})\n",
"rf.fit(X_train, y_train)\n",
"\n",
"# Cross-tabulate the held-out outcomes against the predictions.\n",
"preds = rf.predict(X_test)\n",
"pd.crosstab(y_test, preds, rownames=['actual'], \n",
"            colnames=['prediction'])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.91304348, 0.83333333, 0.91304348])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.cross_validation import cross_val_score\n",
"\n",
"# 3-fold cross-validated precision for a 50-tree forest limited to depth 5.\n",
"# random_state fixes the forest's internal randomness so the reported scores can\n",
"# be reproduced on a re-run.\n",
"cross_val_score(RandomForestClassifier(class_weight={0:w.max(), 1:w.min()}, \n",
"                                       n_estimators=50, max_depth=5,\n",
"                                       random_state=0), \n",
"                X, y, cv=3, \n",
"                scoring='precision')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"kd_therapy___1 0.319578\n",
"hd_num_days 0.132306\n",
"age 0.085432\n",
"kd_therapy___9 0.050139\n",
"abnormality___5 0.044890\n",
"abnormality___3 0.039355\n",
"lab_criteria___8 0.038673\n",
"num_echo_post_eval 0.038341\n",
"illness_day_at_rx 0.036826\n",
"lab_criteria___7 0.023252\n",
"subsequent_diagnosis 0.018851\n",
"abnormality___1 0.018535\n",
"abnormality___8 0.016046\n",
"lab_criteria___5 0.012319\n",
"lab_criteria___2 0.012073\n",
"clinical_criteria___3 0.011801\n",
"clinical_criteria___5 0.009152\n",
"clinical_criteria___1 0.008939\n",
"picu_admission 0.008713\n",
"lab_criteria___6 0.008495\n",
"clinical_criteria___4 0.008262\n",
"subsequent_diagnosis_of_cv___2 0.007222\n",
"lab_criteria___1 0.007044\n",
"type_of_subsequent_diagnos___4 0.005527\n",
"sex 0.005014\n",
"dtype: float64"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Label each importance score from the fitted forest with its predictor name,\n",
"# then show the 25 largest.\n",
"importance = pd.Series(rf.feature_importances_, index=X.columns)\n",
"importance.sort_values(ascending=False).head(25)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment