import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     cross_val_score, StratifiedKFold,
                                     validation_curve, learning_curve)
from sklearn.metrics import (confusion_matrix, f1_score, matthews_corrcoef,
                             classification_report, cohen_kappa_score,
                             make_scorer, roc_auc_score)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
# Load features and target
XX = pd.read_csv('who_X_1.csv')
y = pd.read_csv('who_Y_1.csv', header=None).values.ravel()
# Binarise the target: values above -0.50 become class 0, the rest class 1
y = np.where(y > -0.50, 0, 1)
# Use get_dummies to convert the categorical feature into dummy columns
features = list(XX)
dis_features = ['X121']  # the discrete/categorical column
# Positions of the discrete columns (kept for reference; not used below)
index = [12, 120, 124, 125, 126, 127, 128, 129, 130, 131]
con_features = [f for i, f in enumerate(features) if i not in index]
XX = XX.iloc[:, 0:124]   # keep the first 124 raw columns
X = pd.get_dummies(XX, columns=dis_features)
# Divide data into train and test sets, stratified on the class label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

# Custom scorers. These are not used by the grid search below (which passes
# the built-in 'f1' string) but are available as drop-in alternatives.
kappa_scorer = make_scorer(cohen_kappa_score)
auc_scorer = make_scorer(roc_auc_score)
F_measure_scorer = make_scorer(f1_score)
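# A minimal sketch (not part of the original flow) of plugging one of these
# scorers into cross-validation; `pipeline` and `cv` are defined further below:
#
#     scores = cross_val_score(pipeline, X_train, y_train, cv=cv,
#                              scoring=kappa_scorer)
#     print('CV Cohen Kappa: %.3f +/- %.3f' % (scores.mean(), scores.std()))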
rg = LogisticRegression(class_weight={0: 1, 1: 9}, random_state=42,
                        solver='saga', max_iter=100, n_jobs=-1,
                        intercept_scaling=1, C=0.0005)
# The grid is empty, so GridSearchCV just cross-validates the estimator above;
# uncomment the entries to search over C and/or the class weights.
param_grid = {
    # 'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, 0.0006,
    #            0.0005, 0.0009, 0.0008, 0.0004],
    # 'clf__class_weight': [{0: 1, 1: 8}, {0: 1, 1: 9}, {0: 1, 1: 10},
    #                       {0: 1, 1: 10.5}, {0: 1, 1: 11}, {0: 1, 1: 11.5},
    #                       {0: 1, 1: 12}, {0: 1, 1: 13}],
}
# Column positions after get_dummies: the 37 dummy columns land at the end
cat_indices = list(range(123, 160))
num_indices = [i for i in range(160) if i not in cat_indices]
pipeline = Pipeline(steps=[
    ('feature_processing', FeatureUnion(transformer_list=[
        # categorical: pass the dummy columns through unchanged.
        # validate=True converts the DataFrame to an ndarray so positional
        # slicing works (newer scikit-learn defaults validate to False).
        ('categorical', FunctionTransformer(
            lambda data: data[:, cat_indices], validate=True)),
        # numeric: select the numeric columns and standardise them
        ('numeric', Pipeline(steps=[
            ('select', FunctionTransformer(
                lambda data: data[:, num_indices], validate=True)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])
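# Note: on scikit-learn >= 0.20 the same preprocessing can be expressed with
# ColumnTransformer, which avoids the index-slicing lambdas above (lambdas
# also make the pipeline unpicklable). A minimal sketch, assuming the same
# column indices as the FeatureUnion version:
#
#     from sklearn.compose import ColumnTransformer
#     preprocess = ColumnTransformer([
#         ('categorical', 'passthrough', cat_indices),  # dummy columns as-is
#         ('numeric', StandardScaler(), num_indices),   # standardise the rest
#     ])
#     pipeline = Pipeline(steps=[('feature_processing', preprocess),
#                                ('clf', rg)])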
# shuffle=True is required for random_state to take effect (and newer
# scikit-learn raises an error if random_state is set without it)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
print("Tuned rg best params: {}".format(rg_cv.best_params_))

# Evaluate on the training set
ypred = rg_cv.predict(X_train)
print('Cohen Kappa:', cohen_kappa_score(y_train, ypred))
print('Matthews corrcoef:', matthews_corrcoef(y_train, ypred))
print(confusion_matrix(y_train, ypred))
print(classification_report(y_train, ypred))
print('######################')

# Evaluate on the held-out test set
ypred2 = rg_cv.predict(X_test)
print('Cohen Kappa:', cohen_kappa_score(y_test, ypred2))
print('Matthews corrcoef:', matthews_corrcoef(y_test, ypred2))
print(confusion_matrix(y_test, ypred2))
print(classification_report(y_test, ypred2))
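# Threshold-free ranking quality on the held-out set. A small addition, not in
# the original flow; it assumes the final estimator exposes predict_proba
# (true for LogisticRegression).
yscore = rg_cv.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, yscore))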
def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    """Plot mean train/test scores with +/- one standard deviation bands."""
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std,
                     test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()
def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    """Plot mean train/test scores against a swept parameter value."""
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()
if __name__ == '__main__':
    plt.figure(figsize=(9, 6))
    # linspace avoids float drift past 1.0 that np.arange(0.1, 1.1, 0.1)
    # can produce, which learning_curve rejects
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=cv, scoring='f1', n_jobs=1)
    plot_learning_curve(train_sizes, train_scores, test_scores,
                        title='Learning curve for Logistic Regression')
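    # plot_validation_curve is defined above but never exercised; a hedged
    # sketch that sweeps C over values taken from the commented-out grid:
    param_range = [0.0004, 0.0005, 0.001, 0.005, 0.01, 0.1]
    train_scores_vc, test_scores_vc = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        param_name='clf__C', param_range=param_range,
        cv=cv, scoring='f1', n_jobs=1)
    plt.figure(figsize=(9, 6))
    plot_validation_curve(param_range, train_scores_vc, test_scores_vc,
                          title='Validation curve for Logistic Regression (C)')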