Skip to content

Instantly share code, notes, and snippets.

@prerakmody
Last active May 9, 2019 15:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prerakmody/f1dad0062c6002b7932fcb4719d12c1a to your computer and use it in GitHub Desktop.
Save prerakmody/f1dad0062c6002b7932fcb4719d12c1a to your computer and use it in GitHub Desktop.
Scikit Learn
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
def roc_plot(Y_test, Y_pred, title=''):
    """Plot a ROC curve with its AUC in the legend and display it.

    Args:
        Y_test: True labels, forwarded to ``metrics.roc_curve``.
        Y_pred: Scores for the positive class, forwarded to
            ``metrics.roc_curve``.
        title: Text used as the figure title.

    NOTE(review): relies on ``plt`` (presumably ``matplotlib.pyplot``) being
    available at module scope - the visible imports do not provide it.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(Y_test, Y_pred)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)
    line_width = 2

    plt.figure()
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=line_width,
        label='ROC curve (area = %0.2f)' % area_under_curve,
    )
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()
def classifier_train(clf, X_train, Y_train, X_test, Y_test):
    """Fit ``clf`` on the training split and report scores on the test split.

    Prints the confusion matrix followed by F1, precision and recall
    (computed with ``pos_label=1`` and rounded to three decimals).

    Args:
        clf: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train, Y_train: Training features and labels.
        X_test, Y_test: Held-out features and labels.

    Returns:
        Tuple ``(clf, conf_matrix, Y_pred_probab, Y_pred)``: the fitted
        estimator, the test confusion matrix, the ``predict_proba`` output
        and the ``predict`` output for ``X_test``.
    """
    clf.fit(X_train, Y_train)
    predicted_labels = clf.predict(X_test)
    matrix = confusion_matrix(Y_test, predicted_labels)
    predicted_probabilities = clf.predict_proba(X_test)

    print (' - Conf Matrix : ')
    print (matrix)

    # (label, scorer) pairs, reported in this order.
    report = (
        (' - F1 score : ', metrics.f1_score),
        (' - Precision : ', metrics.precision_score),
        (' - Recall : ', metrics.recall_score),
    )
    for label, scorer in report:
        print (label, round(scorer(Y_test, predicted_labels, pos_label=1), 3))

    return clf, matrix, predicted_probabilities, predicted_labels
def _is_enabled(args, name):
    """Return True when ``args[name]['bool']`` exists and is truthy."""
    return bool(args.get(name, {}).get('bool'))


def _build_ensemble():
    """Return a soft-voting ensemble of XGB, LR, RandomForest and AdaBoost."""
    # Kept in a local list: reusing the caller's results list for these
    # (name, estimator) pairs would clobber the fitted models it collects.
    estimators = [
        ('clg_xg', xgb.XGBClassifier(objective="binary:logistic", subsample=0.5, random_state=42)),
        ('clf_log', LogisticRegression(C=100, max_iter=500, random_state=42, solver='lbfgs')),
        ('clf_randfor', RandomForestClassifier(n_estimators=250, random_state=42)),
        ('clf_ada', AdaBoostClassifier(n_estimators=250, learning_rate=1, random_state=42)),
    ]
    return VotingClassifier(estimators, voting='soft', n_jobs=3)  # voting='hard'


def _build_classifier(args):
    """Create the (unfitted) classifier selected by the ``bool`` flags.

    Flags are checked in a fixed order; when several are enabled the last
    one built is the one returned, while the title suffix names all of them.

    Returns:
        Tuple ``(clf, title_suffix)``.

    Raises:
        ValueError: if no classifier flag is enabled.
    """
    clf = None
    title_suffix = ''

    if _is_enabled(args, 'LogisticRegression'):
        print (' - LogisticRegression')
        clf_args = args['LogisticRegression']
        print (' - Clf Args : ', clf_args)
        clf = LogisticRegression(C=clf_args['C'], max_iter=clf_args['max_iter'], solver='lbfgs', random_state=42)
        title_suffix += ' Logistic Regression'

    if _is_enabled(args, 'RandomForestClassifier'):
        print (' - RandomForestClassifier')
        clf_args = args['RandomForestClassifier']
        print (' - Clf Args : ', clf_args)
        clf = RandomForestClassifier(n_estimators=clf_args['n_estimators'], n_jobs=3, random_state=42)
        title_suffix += ' Random Forest'

    if _is_enabled(args, 'SVC'):
        print (' - SVC')
        clf_args = args['SVC']
        print (' - Clf Args : ', clf_args)
        # probability=True is required because training calls predict_proba.
        clf = SVC(probability=True, random_state=42)
        title_suffix += ' SVC'

    if _is_enabled(args, 'AdaBoostClassifier'):
        print (' - AdaBoostClassifier')
        clf_args = args['AdaBoostClassifier']
        print (' - Clf Args : ', clf_args)
        clf = AdaBoostClassifier(n_estimators=clf_args['n_estimators'], learning_rate=clf_args['learning_rate'], random_state=42)
        title_suffix += ' AdaBoost'

    if _is_enabled(args, 'XGBClassifier'):
        print (' - XGBClassifier')
        clf_args = args['XGBClassifier']
        print (' - Clf Args : ', clf_args)
        clf = xgb.XGBClassifier(subsample=clf_args['subsample'], objective=clf_args['objective'], random_state=42)
        title_suffix += ' XGB'

    if _is_enabled(args, 'ensemble'):
        print ('')
        print (' - Ensembling')
        clf = _build_ensemble()
        title_suffix += ' Ensemble'

    if clf is None:
        raise ValueError("No classifier enabled: set 'bool' to 1 for at least one entry in args")
    return clf, title_suffix


def experiments(X, Y, cv, smote, args):
    """Train and evaluate the configured classifier over one or more splits.

    For every split the training part is optionally oversampled with SMOTE,
    a fresh classifier is built from ``args`` and fitted via
    ``classifier_train``. Afterwards the per-split confusion matrices are
    summed, F1 / precision / recall are printed over the pooled test
    predictions, and a ROC curve of the pooled positive-class probabilities
    is drawn with ``roc_plot``.

    Args:
        X: Feature array, indexed with integer index arrays per fold.
        Y: Label array aligned with ``X``.
        cv: ``1`` for a single 70/30 ``train_test_split``; otherwise the
            number of ``StratifiedKFold`` splits.
        smote: ``1`` to apply SMOTE to each training split, ``0`` to train
            on the data as is.
        args: Configuration dict. Reads ``random_state``, ``sampling_ratio``
            and one sub-dict per classifier whose ``bool`` entry enables it.

    Returns:
        Tuple ``(classifiers, conf_matrixes, Y_tests, Y_tests_preds)`` with
        one entry per evaluated split.

    Raises:
        ValueError: if no classifier is enabled in ``args``.
    """
    # Local import: the visible module-level imports do not provide Counter.
    from collections import Counter

    classifiers = []
    conf_matrixes = []
    Y_tests = []
    Y_tests_preds = []
    Y_tests_preds_probabs = []

    roc_title = ''
    if smote == 0:
        roc_title = 'ROC - unSMOTEd - '
    elif smote == 1:
        roc_title = 'ROC - SMOTEd (%.1f)' % (args['sampling_ratio'])

    if cv == 1:
        # Single hold-out split; train_test_split already returns
        # (X_train, X_test, Y_train, Y_test) in the order unpacked below.
        splits = [train_test_split(X, Y, test_size=0.30, random_state=args['random_state'])]
    else:
        # random_state is deliberately not passed: without shuffle=True it has
        # no effect, and recent scikit-learn releases raise ValueError for
        # that combination.
        folds = StratifiedKFold(n_splits=cv)
        splits = (
            (X[train], X[test], Y[train], Y[test])
            for train, test in folds.split(X, Y)
        )

    for i, (X_train, X_test, Y_train, Y_test) in enumerate(splits):
        print ('')
        print (' -------------------- Fold : ', i, ' --------------------')
        print (' - Y_train : ', Counter(Y_train))
        print (' - Y_test : ', Counter(Y_test))
        print ('')

        if smote == 1:
            # Oversample the training split only; the test split is untouched.
            sm = SMOTE(sampling_strategy=args['sampling_ratio'], k_neighbors=5,
                       random_state=args['random_state'])
            X_train, Y_train = sm.fit_resample(X_train, Y_train)
            print (' - SMOTEd Y : ', Counter(Y_train))

        clf, title_suffix = _build_classifier(args)
        if i == 0:
            roc_title += title_suffix

        # TRAIN
        clf, conf_matrix, Y_pred_probab, Y_pred = classifier_train(clf, X_train, Y_train, X_test, Y_test)
        classifiers.append(clf)
        conf_matrixes.append(conf_matrix)
        Y_tests.append(Y_test)
        Y_tests_preds.append(Y_pred)
        Y_tests_preds_probabs.append(Y_pred_probab)

    print ('')
    print (' -------------------------------------------------------------- ')

    # Confusion Matrices: element-wise sum over all splits.
    conf_matrix_final = conf_matrixes[0].copy()
    for each in conf_matrixes[1:]:
        conf_matrix_final += each
    print (' - Final Conf Matrix : ')
    print (conf_matrix_final)

    # ROC-CURVEs: pool labels, predictions and column-1 probabilities.
    Y_tests_final = [y for each in Y_tests for y in each.tolist()]
    Y_tests_preds_final = [y for each in Y_tests_preds for y in each.tolist()]
    Y_tests_preds_probabs_final = [
        p for each in Y_tests_preds_probabs for p in each[:, 1].tolist()
    ]

    print (' - F1 score : ', round(metrics.f1_score(Y_tests_final, Y_tests_preds_final, pos_label=1),3))
    print (' - Precision : ', round(metrics.precision_score(Y_tests_final, Y_tests_preds_final, pos_label=1),3))
    print (' - Recall : ', round(metrics.recall_score(Y_tests_final, Y_tests_preds_final, pos_label=1),3))
    roc_plot(Y_tests_final, Y_tests_preds_probabs_final, roc_title)

    return classifiers, conf_matrixes, Y_tests, Y_tests_preds
if __name__ == "__main__":
    # Script entry point: load 'file.csv' (last column = integer label, all
    # other columns = features), shuffle the rows, run the configured
    # experiment and plot the first fitted model's coefficients or feature
    # importances.

    # Imported here because the visible module-level imports do not provide
    # these names.
    from collections import Counter
    import numpy as np
    import pandas as pd
    # NOTE(review): ``plt`` (presumably matplotlib.pyplot) is used below and in
    # roc_plot but is not imported anywhere in the visible file - confirm it
    # is made available before running.

    df = pd.read_csv('file.csv')
    data = df.values  # DataFrame.as_matrix() was removed in pandas 1.0
    X_orig = data[:, :-1]
    Y_orig = data[:, -1].astype(int)
    print (' - Original Y : ', Counter(Y_orig), ' || Type : ', Y_orig.dtype)

    # Shuffle rows: a full-length draw without replacement is a permutation.
    rand_idx = np.random.choice(len(X_orig), len(X_orig), replace=False)
    X_orig = X_orig[rand_idx]
    Y_orig = Y_orig[rand_idx]
    print (' - Original Y : ', Counter(Y_orig), ' || Type : ', Y_orig.dtype)

    # --------------------------- #
    args = { 'random_state' : 25,
        'sampling_ratio' : 0.3,
        'LogisticRegression' : {'bool':0, 'C':1, 'max_iter':500} # C = [1,10,100,500]
        , 'RandomForestClassifier': {'bool':0, 'n_estimators' : 200} #n_estimators = [100, 200, 500, 750, 1000]
        , 'AdaBoostClassifier' : {'bool':1, 'n_estimators': 250, 'learning_rate':0.9} #n_estimators = [100, 200, 500, 750, 1000]
        , 'SVC' : {'bool':0}
        , 'XGBClassifier' : {'bool':0, 'objective' :'binary:logistic', 'subsample' : 0.5}
        , 'ensemble' : {'bool':0}
    }
    smote = 1
    cv = 5

    print (' -------- PARAMS ----------- ')
    print (' -- Total Features : ', len(df.columns) - 1)
    print (' -- Sampling Ratio : ', args['sampling_ratio'])
    print (' -- CV : ', cv)

    classifiers, conf_matrixes, Y_tests, Y_tests_preds = \
        experiments(X_orig, Y_orig, cv, smote, args)

    # The plots below inspect the model fitted on the first split and assume
    # exactly one classifier flag is enabled in ``args``.
    feature_names = df.columns[:-1]
    tick_positions = range(len(feature_names))

    if args['LogisticRegression']['bool']:
        clf = classifiers[0]  # if logistic regression
        coeffs = clf.coef_[0]
        plt.title('LogisticRegression - Coefficients')
        plt.bar(range(len(coeffs)), coeffs)
        plt.xticks(tick_positions, feature_names, rotation=90)
        plt.show()

    if (args['RandomForestClassifier']['bool']
            or args['AdaBoostClassifier']['bool']
            or args['XGBClassifier']['bool']):
        clf = classifiers[0]  # tree-based model exposing feature_importances_
        coeffs = clf.feature_importances_
        if args['RandomForestClassifier']['bool'] == 1:
            plt.title('RandomForest - Feature Importances')
        elif args['AdaBoostClassifier']['bool']:
            plt.title('AdaBoost - Feature Importances')
        else:
            plt.title('XGBoost - Feature Importances')
        plt.bar(range(len(coeffs)), coeffs)
        plt.xticks(tick_positions, feature_names, rotation=90)
        plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment