from sklearn import metrics


def binary_cv_metrics(y, preds, m):
    """Fill the dict m with binary classification metrics for true labels y
    and hard (0/1) predictions preds."""
    cm = metrics.confusion_matrix(y, preds)  # rows = true class, columns = predicted class
    m['confusion_matrix'] = cm
    m['Accuracy'] = metrics.accuracy_score(y, preds)
    m['F1 score'] = metrics.f1_score(y, preds)
    m['FPR'] = cm[0, 1] / float(cm[0, :].sum())
    m['FNR'] = cm[1, 0] / float(cm[1, :].sum())
    m['Specificity (TNR)'] = cm[0, 0] / float(cm[0, :].sum())
    m['Sensitivity (TPR, Recall)'] = cm[1, 1] / float(cm[1, :].sum())
    m['PPV (Precision)'] = cm[1, 1] / float(cm[:, 1].sum())
    m['NPV'] = cm[0, 0] / float(cm[:, 0].sum())
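# A quick usage sketch (the labels and predictions below are hypothetical,
# not from the original gist):
#   m = {}
#   binary_cv_metrics([0, 0, 1, 1], [0, 1, 1, 1], m)
#   print(m['Sensitivity (TPR, Recall)'])  # 1.0: both positives were caught
#   print(m['FPR'])                        # 0.5: one of two negatives flagged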
def plt_auc(pred, actual, ax):
    """Plot a ROC curve on ax. pred is expected to hold per-class scores in
    its columns, with the positive-class score in column 1."""
    fpr, tpr, thresholds = metrics.roc_curve(actual, pred[:, 1])
    auc = metrics.auc(fpr, tpr)
    ax.plot(fpr, tpr)
    ax.plot([0, 1], [0, 1], '--')  # chance diagonal
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.text(0.7, 0.2, 'AUC = %0.2f' % auc)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Score the held-out set with the fitted model (see the training script below)
test_norm['Pred'] = result.predict(test_norm[in_vars])
preds = test_norm['Pred'].values
# plt_auc reads the positive-class score from column 1, so duplicate the
# single score column to give the array that shape
preds = np.array([preds, preds]).T

# Sweep decision thresholds and collect the metrics at each one
ms = []
threshes = np.linspace(0, test_norm['Pred'].max(), 100)
for thresh in threshes:
    m = {}
    criteria = test_norm['Pred'] > thresh
    binary_cv_metrics(test_norm[label_name], criteria, m)
    m['threshold'] = thresh  # the original stored m[dx_class] = n here, but both names are undefined in this snippet
    ms.append(m)
ms_df = pd.DataFrame(ms)

# Top panel: metrics vs. threshold; bottom panel: ROC curve
fig, ax = plt.subplots(2, 1, figsize=(8, 8))
for metric in ['F1 score', 'NPV', 'PPV (Precision)', 'Sensitivity (TPR, Recall)', 'Specificity (TNR)']:
    ax[0].plot(threshes, ms_df[metric], '-', label=metric)
ax[0].legend(loc=0)
plt_auc(preds, test_norm[label_name], ax=ax[1])  # defined above; the original called ps_analysis_utils.plt_auc
plt.show()
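# If you want a single operating point out of the sweep, one common choice
# (a sketch, not part of the original gist) is the threshold that maximizes
# F1, using the 'threshold' column recorded above:
best = ms_df['F1 score'].idxmax()
print('Best threshold: %0.3f (F1 = %0.3f)'
      % (ms_df.loc[best, 'threshold'], ms_df.loc[best, 'F1 score']))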
import pandas as pd
import statsmodels.api as sm
from sklearn import preprocessing

# You can use sklearn train_test_split to create your train and test sets: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
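# For example (a hypothetical split, not part of the original gist):
#   from sklearn.model_selection import train_test_split
#   train, test = train_test_split(X, test_size=0.2, random_state=42)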
train = X[train_idx]
test = X[test_idx]
# Scale your features so that the coefficients will be easily comparable: http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-normalization
scaler = preprocessing.StandardScaler().fit(train)
# StandardScaler returns numpy arrays; turn them back into DataFrames so the
# column assignments below work (the original left this as a note)
train_norm = pd.DataFrame(scaler.transform(train), columns=train.columns, index=train.index)
test_norm = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
train_norm['intercept'] = 1
test_norm['intercept'] = 1
# list of columns in your DataFrame to use in training
in_vars = ['intercept'] + in_vars_no_intercept
# Fit the full model; label_name is assumed to hold the raw 0/1 labels
# (don't include it among the scaled features)
logit = sm.Logit(train_norm[label_name], train_norm[in_vars])
result = logit.fit()
print(result.summary())
# Drop variables with high p-values, keeping only the stronger predictors (this gives a parsimonious model: https://stats.stackexchange.com/a/17570)
in_vars = result.pvalues.index[result.pvalues < 0.1]
logit = sm.Logit(train_norm[label_name], train_norm[in_vars])
result = logit.fit()
print(result.summary())
# Tighten the cutoff and again drop variables with high p-values
in_vars = result.pvalues.index[result.pvalues < 0.05]
logit = sm.Logit(train_norm[label_name], train_norm[in_vars])
result = logit.fit()
print(result.summary())
# Score the test set with the final model (used by the evaluation script above)
test_norm['Pred'] = result.predict(test_norm[in_vars])
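# Because the features were standardized, the fitted coefficients are
# directly comparable. A small follow-up sketch (not in the original gist)
# to view them as odds ratios per one-standard-deviation increase:
import numpy as np
coefs = result.params.drop('intercept', errors='ignore').sort_values()
print(np.exp(coefs))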