Last active
October 13, 2015 03:00
-
-
Save fonnesbeck/7bb6a08c4fae71a21a2c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"from redcap import Project\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import seaborn as sns\n", | |
"import matplotlib.pylab as plt\n", | |
"sns.set_context('notebook')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"api_url = 'https://redcap.vanderbilt.edu/api/'\n", | |
"\n", | |
"# Read the REDCap API token from a local file instead of hardcoding it.\n", | |
"# The context manager closes the file handle; strip() drops surrounding\n", | |
"# whitespace (e.g. a trailing newline) that would otherwise be sent as part\n", | |
"# of the token.\n", | |
"with open(\"token.txt\") as token_file:\n", | |
"    hospitalized_key = token_file.read().strip()\n", | |
"\n", | |
"hospitalized_proj = Project(api_url, hospitalized_key)\n", | |
"# Export every record as a DataFrame indexed by the project's first field.\n", | |
"hospitalized_raw = hospitalized_proj.export_records(format='df', \n", | |
"    df_kwargs={'index_col': hospitalized_proj.field_names[0]})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Keep only the records whose `race` field equals 3, as an independent copy.\n", | |
"is_race_3 = hospitalized_raw['race'] == 3\n", | |
"black_subset = hospitalized_raw.loc[is_race_3].copy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Binary outcome: 1 where therapy_response___0 is checked, 0 where\n", | |
"# therapy_response___1 is checked. Rows with neither stay NaN; if both are\n", | |
"# checked the later assignment (0) wins.\n", | |
"# NOTE(review): confirm the ___0 -> 1 / ___1 -> 0 mapping against the data dictionary.\n", | |
"responded = black_subset['therapy_response___0'] == 1\n", | |
"not_responded = black_subset['therapy_response___1'] == 1\n", | |
"\n", | |
"black_subset['response'] = np.nan\n", | |
"black_subset.loc[responded, 'response'] = 1\n", | |
"black_subset.loc[not_responded, 'response'] = 0\n", | |
"black_subset['response'] = black_subset['response'].astype(float)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1 63\n", | |
"0 24\n", | |
"Name: response, dtype: int64" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Tabulate the number of records for each response value.\n", | |
"black_subset.response.value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"dob = pd.to_datetime(black_subset.dob)\n", | |
"adm = pd.to_datetime(black_subset.date_admission)\n", | |
"# Age at admission in years (365.25 days per year). Dividing the timedelta by\n", | |
"# one day keeps a missing date as NaN; casting the timedelta to int would turn\n", | |
"# NaT into a huge negative number that later steps would treat as a real age.\n", | |
"black_subset['age'] = (adm - dob) / np.timedelta64(1, 'D') / 365.25\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Collect the columns to discard: the raw therapy_response* checkboxes (already\n", | |
"# folded into `response`), every echo_* column, and the columns listed in\n", | |
"# `meaningless_cols`.\n", | |
"all_cols = black_subset.columns\n", | |
"response_cols = all_cols[all_cols.str.startswith('therapy_response')]\n", | |
"echo_cols = all_cols[all_cols.str.startswith('echo_')]\n", | |
"meaningless_cols = ['mrn', 'mrn_and_treatment_date', 'deceased']\n", | |
"drop_cols = np.concatenate([response_cols.values, meaningless_cols, echo_cols])\n", | |
"\n", | |
"# Discard rows without an outcome first, then remove the unwanted columns.\n", | |
"has_response = black_subset.dropna(subset=['response'])\n", | |
"black_subset = has_response.drop(drop_cols, axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Keep only non-object columns with fewer than five missing values.\n", | |
"few_missing = black_subset.isnull().sum(0) < 5\n", | |
"not_object = black_subset.dtypes != object\n", | |
"black_subset_low_missing = black_subset.loc[:, few_missing & not_object]\n", | |
"\n", | |
"# Fill each remaining gap with the mean of its own column.\n", | |
"def _fill_with_mean(col):\n", | |
"    return col.fillna(col.mean())\n", | |
"\n", | |
"black_subset_complete = black_subset_low_missing.apply(_fill_with_mean)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(87, 70)" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Show the (rows, columns) dimensions of the imputed table.\n", | |
"black_subset_complete.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn import preprocessing\n", | |
"\n", | |
"# Split the imputed table into the outcome column and the predictor matrix.\n", | |
"y = black_subset_complete['response'].copy()\n", | |
"X = black_subset_complete.drop('response', axis=1)\n", | |
"\n", | |
"# Scale the predictors using scikit-learn's default `scale` settings.\n", | |
"X_scaled = preprocessing.scale(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Per-sample weights inversely proportional to class frequency, rescaled so\n", | |
"# that samples of the rarest class get weight 1. Computed directly with pandas\n", | |
"# because `sklearn.preprocessing._weights._balance_weights` is a private helper\n", | |
"# that is not part of scikit-learn's public API and is gone in later releases.\n", | |
"class_counts = y.map(y.value_counts())\n", | |
"w = (1.0 / class_counts * class_counts.min()).values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1 63\n", | |
"0 24\n", | |
"Name: response, dtype: int64" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Tabulate the response values again, after rows without a response were dropped.\n", | |
"black_subset.response.value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# `sklearn.cross_validation` was replaced by `sklearn.model_selection`;\n", | |
"# fall back to the old module so the cell runs on either generation.\n", | |
"try:\n", | |
"    from sklearn.model_selection import train_test_split\n", | |
"except ImportError:\n", | |
"    from sklearn.cross_validation import train_test_split\n", | |
"\n", | |
"# Hold out 40% of the rows for evaluation; the fixed seed keeps the split repeatable.\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
"    X_scaled, y, test_size=0.4, random_state=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"best parameter choice: {'max_depth': 25}\n" | |
] | |
} | |
], | |
"source": [ | |
"# `sklearn.grid_search` was replaced by `sklearn.model_selection`;\n", | |
"# fall back to the old module so the cell runs on either generation.\n", | |
"try:\n", | |
"    from sklearn.model_selection import GridSearchCV\n", | |
"except ImportError:\n", | |
"    from sklearn.grid_search import GridSearchCV\n", | |
"\n", | |
"# Class weights counter the class imbalance; the fixed seed makes the search repeatable.\n", | |
"rfc = RandomForestClassifier(n_jobs=4, random_state=0,\n", | |
"                             class_weight={0:w.max(), 1:w.min()})\n", | |
"\n", | |
"grid = GridSearchCV(rfc, \n", | |
"                    param_grid={'max_depth': [2, 5, 10, 25, 50, 100]},\n", | |
"                    scoring='precision', cv=5)\n", | |
"# Tune on the training split only, so the held-out rows never influence\n", | |
"# which depth is selected.\n", | |
"grid.fit(X_train, y_train)\n", | |
"\n", | |
"# str.format prints the same text under Python 2 and Python 3.\n", | |
"print('best parameter choice: {}'.format(grid.best_params_))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>prediction</th>\n", | |
" <th>0.0</th>\n", | |
" <th>1.0</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>actual</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>8</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>25</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"prediction 0 1\n", | |
"actual \n", | |
"0 8 1\n", | |
"1 1 25" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit the final forest with the depth chosen by the grid search (rather than a\n", | |
"# hand-copied number that can go stale) and a fixed seed so the confusion\n", | |
"# table below is reproducible.\n", | |
"rf = RandomForestClassifier(n_jobs=4, random_state=0,\n", | |
"                            max_depth=grid.best_params_['max_depth'],\n", | |
"                            class_weight={0:w.max(), 1:w.min()})\n", | |
"rf.fit(X_train, y_train)\n", | |
"\n", | |
"# Confusion table of held-out outcomes (rows) against predictions (columns).\n", | |
"preds = rf.predict(X_test)\n", | |
"pd.crosstab(y_test, preds, rownames=['actual'], \n", | |
"            colnames=['prediction'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.91304348, 0.83333333, 0.91304348])" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# `sklearn.cross_validation` was replaced by `sklearn.model_selection`;\n", | |
"# fall back to the old module so the cell runs on either generation.\n", | |
"try:\n", | |
"    from sklearn.model_selection import cross_val_score\n", | |
"except ImportError:\n", | |
"    from sklearn.cross_validation import cross_val_score\n", | |
"\n", | |
"# 3-fold cross-validated precision of a shallower, 50-tree weighted forest;\n", | |
"# the fixed seed makes the scores repeatable.\n", | |
"cross_val_score(RandomForestClassifier(class_weight={0:w.max(), 1:w.min()}, \n", | |
"                                       n_estimators=50, max_depth=5,\n", | |
"                                       random_state=0), \n", | |
"                X, y, cv=3, \n", | |
"                scoring='precision')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"kd_therapy___1 0.319578\n", | |
"hd_num_days 0.132306\n", | |
"age 0.085432\n", | |
"kd_therapy___9 0.050139\n", | |
"abnormality___5 0.044890\n", | |
"abnormality___3 0.039355\n", | |
"lab_criteria___8 0.038673\n", | |
"num_echo_post_eval 0.038341\n", | |
"illness_day_at_rx 0.036826\n", | |
"lab_criteria___7 0.023252\n", | |
"subsequent_diagnosis 0.018851\n", | |
"abnormality___1 0.018535\n", | |
"abnormality___8 0.016046\n", | |
"lab_criteria___5 0.012319\n", | |
"lab_criteria___2 0.012073\n", | |
"clinical_criteria___3 0.011801\n", | |
"clinical_criteria___5 0.009152\n", | |
"clinical_criteria___1 0.008939\n", | |
"picu_admission 0.008713\n", | |
"lab_criteria___6 0.008495\n", | |
"clinical_criteria___4 0.008262\n", | |
"subsequent_diagnosis_of_cv___2 0.007222\n", | |
"lab_criteria___1 0.007044\n", | |
"type_of_subsequent_diagnos___4 0.005527\n", | |
"sex 0.005014\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Label the fitted forest's feature importances with the predictor names and\n", | |
"# show the 25 largest.\n", | |
"importance = pd.Series(rf.feature_importances_, index=X.columns)\n", | |
"ranked_importance = importance.sort_values(ascending=False)\n", | |
"ranked_importance.head(25)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment