Created September 4, 2019 12:00
Hands-on session 2 for SOSC
#!/usr/bin/env python3
import warnings
# import sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
# import numpy
import numpy as np
# fix random seed for reproducibility
SEED = 7
def first_model():
    """Train one decision tree on the iris data and print its test accuracy.

    The data set is split in half (train/test) with ``SEED`` as the random
    state of the split. Nothing is returned; the accuracy is printed.
    """
    iris = load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=.5, random_state=SEED)
    # Seed the tree as well as the split so repeated runs print the same
    # number (the module fixes SEED for reproducibility).
    model = DecisionTreeClassifier(random_state=SEED)
    model = model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds)*100)+"%")
# Task: load more classifiers and compare their accuracies
# RandomForestClassifier, KNeighborsClassifier, GradientBoostingClassifier
# QA: what will happen if you change test_size and/or add random_state value
# Task: write simple ensemble model which will combine multiple classifiers
# and check its accuracy with y_test
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
def ensemble_model():
    """Average the predictions of two classifiers and print the accuracies.

    A decision tree and a Gaussian naive Bayes model are trained on the same
    50/50 split of the iris data. Their predicted class labels are averaged
    element-wise; because the average can be a half-integer (e.g. 1.5), the
    ambiguous value is then resolved both ways and each variant is scored.
    Nothing is returned; all results are printed.
    """
    iris = load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=.5, random_state=SEED)

    # Seed the tree so repeated runs give the same predictions.
    model_1 = DecisionTreeClassifier(random_state=SEED)
    model_1 = model_1.fit(x_train, y_train)
    preds_1 = model_1.predict(x_test)
    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds_1)*100)+"%")

    model_2 = GaussianNB()
    model_2 = model_2.fit(x_train, y_train)
    preds_2 = model_2.predict(x_test)
    # Prediction accuracy
    print("Accuracy for GaussianNB: " + str(accuracy_score(y_test, preds_2)*100)+"%")

    # make an average among two predictions
    preds_a = (preds_1+preds_2)/2.
    print("Ensemble preds", preds_a)
    try:
        # Only the scoring can fail: the metric rejects non-integer labels.
        print("Accuracy for ensemble model: " + str(accuracy_score(y_test, preds_a)*100)+"%")
    except ValueError as exp:
        print("ERROR: %s" % exp)
        print("We need to handle continuous valies")

    # Resolve the 1.5 tie (one model says class 1, the other class 2) upwards.
    # NOTE: only the value 1.5 is handled; any other half-integer average
    # (e.g. 0.5) is left as is.
    preds_a = (preds_1+preds_2)/2.
    preds_a[preds_a==1.5] = 2
    print("Accuracy for ensemble model (1.5->2): " + str(accuracy_score(y_test, preds_a)*100)+"%")

    # Same tie, resolved downwards.
    preds_a = (preds_1+preds_2)/2.
    preds_a[preds_a==1.5] = 1
    print("Accuracy for ensemble model (1.5->1): " + str(accuracy_score(y_test, preds_a)*100)+"%")
# Task: take 3 different classifiers and create ensemble with votes
# votes can be assigned as most common predictions among two classifiers
import numpy as np
def vote_preds(preds1, preds2, preds3):
    """Combine three prediction sequences by majority vote.

    For every position the label chosen by at least two of the three inputs
    wins. When all three disagree there is no majority, so the mean of the
    three labels, rounded to the nearest integer, is used instead.

    Parameters
    ----------
    preds1, preds2, preds3 : sequences of numeric class labels, expected to
        have the same length (iteration stops at the shortest one).

    Returns
    -------
    numpy.ndarray with one voted label per position.
    """
    votes = []
    for p1, p2, p3 in zip(preds1, preds2, preds3):
        if p1 == p2 or p1 == p3:
            # p1 agrees with at least one other prediction
            votes.append(p1)
        elif p2 == p3:
            votes.append(p2)
        else:  # no consistency we'll average
            ap = (p1+p2+p3)/3.
            # round so the result is still a usable class label
            votes.append(int(round(ap)))
    return np.array(votes)
def ensemble_votes():
    """Train three classifiers on iris and score their majority-vote ensemble.

    A decision tree, Gaussian naive Bayes and k-nearest-neighbours model are
    fitted on the same 50/50 split. Each model's accuracy is printed, then
    their predictions are merged with ``vote_preds`` and the ensemble
    accuracy is printed. Nothing is returned.
    """
    iris = load_iris()
    x = iris.data
    y = iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=.5, random_state=SEED)

    # Seed the tree so repeated runs give the same predictions.
    model_1 = DecisionTreeClassifier(random_state=SEED)
    model_1 = model_1.fit(x_train, y_train)
    preds_1 = model_1.predict(x_test)
    print("DecisionTree preds", preds_1)
    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds_1)*100)+"%")

    model_2 = GaussianNB()
    model_2 = model_2.fit(x_train, y_train)
    preds_2 = model_2.predict(x_test)
    # Prediction accuracy
    print("Accuracy for GaussianNB: " + str(accuracy_score(y_test, preds_2)*100)+"%")

    model_3 = KNeighborsClassifier()
    model_3 = model_3.fit(x_train, y_train)
    preds_3 = model_3.predict(x_test)
    # Prediction accuracy
    print("Accuracy for KNeighborsClassifier: " + str(accuracy_score(y_test, preds_3)*100)+"%")

    # Majority vote across the three models
    preds_a = vote_preds(preds_1,preds_2,preds_3)
    print("Accuracy for ensemble model with votes: " + str(accuracy_score(y_test, preds_a)*100)+"%")
# Introduce concept of scaling and cross validation
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.svm.classes import SVC
from sklearn.metrics import confusion_matrix
def cross_val_predict():
    """Evaluate an SVC on standardized iris data with 4-fold cross validation.

    For every fold the classifier is refitted on the training indices and
    scored on the held-out indices; the fold number, its accuracy and its
    confusion matrix are printed. Nothing is returned.

    NOTE: this function has the same name as ``cross_val_predict`` imported
    from ``sklearn.model_selection`` earlier in the file and therefore
    replaces that name at module level.
    """
    X, y = load_iris(return_X_y=True)
    # NOTE(review): the scaler is fitted on the full data set before the
    # folds are made, so test rows influence the scaling - confirm this is
    # acceptable for the demo.
    X = StandardScaler().fit_transform(X)
    clf = SVC()
    cv = KFold(n_splits=4, random_state=SEED, shuffle=True)
    for idx, (train_index, test_index) in enumerate(cv.split(X), start=1):
        clf.fit(X[train_index], y[train_index])
        ypred = clf.predict(X[test_index])
        # accuracy_score is used here, so label the value as accuracy
        acc = accuracy_score(y[test_index], ypred)
        print("Fold: %s, accuracy: %s" % (idx, acc))
        conf_matrix = confusion_matrix(y[test_index], ypred)
        print(conf_matrix)
# Final task: write ensemble model (voting or not) which will perform
# best using cross validation techniques
# Bonus:
# If time permits, introduce a Keras NN
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
def nn_model():
    """Cross-validate a small Keras network on the standardized iris data.

    The labels are integer-encoded and then one-hot encoded, a two-layer
    dense network is wrapped in ``KerasClassifier`` and scored with shuffled
    5-fold cross validation. The mean and standard deviation of the fold
    scores are printed. Nothing is returned.
    """
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    # encode class values as integers; the encoder has to be fitted before
    # it can transform, hence fit_transform
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(y)
    # convert integers to categorical variables (i.e. one hot encoded)
    cat_y = np_utils.to_categorical(encoded_y)
    print("input dataset labels : %s ... %s" % (y[0], y[-1]))
    print("categorical variables: %s ... %s" % (cat_y[0], cat_y[-1]))

    # create Keras NN model
    def base_model():
        """Build and compile the network: 4 inputs -> 8 relu -> 3 softmax."""
        model = Sequential()
        model.add(Dense(8, input_dim=4, activation='relu'))
        model.add(Dense(3, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    clf = KerasClassifier(build_fn=base_model, epochs=100, batch_size=5, verbose=0)
    # evaluate the model using kFold cross validation with 20% of the data for testing and 80% for training
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    results = cross_val_score(clf, X, cat_y, cv=cv)
    print("NN validation accuracy: %.2f%% +- (%.2f%%)" % (results.mean()*100, results.std()*100))
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
# based on
def meta_model():
    """Compare three base classifiers with a stacked ensemble built on them.

    KNN (1 neighbour), a random forest and Gaussian naive Bayes are combined
    in a ``StackingCVClassifier`` whose meta-classifier is a logistic
    regression. Each of the four estimators is scored with 3-fold cross
    validation on the standardized iris data, and the mean and standard
    deviation of the accuracy are printed per estimator.
    """
    features, targets = load_iris(return_X_y=True)
    features = StandardScaler().fit_transform(features)

    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=SEED)
    bayes = GaussianNB()
    stacked = StackingCVClassifier(
        classifiers=[knn, forest, bayes],
        meta_classifier=LogisticRegression(),
    )

    print('3-fold cross validation:\n')

    # (label, estimator) pairs in the order they are reported
    candidates = (
        ('KNN', knn),
        ('Random Forest', forest),
        ('Naive Bayes', bayes),
        ('Stacking Classifier (LogisticRegression)', stacked),
    )
    for title, estimator in candidates:
        scores = model_selection.cross_val_score(
            estimator, features, targets, cv=3, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), title))
def main():
    """Run every demo of the session in order, printing a header before each."""
    print("\n+++ first model")
    first_model()
    print("\n+++ ensemble model")
    ensemble_model()
    print("\n+++ ensemble votes")
    ensemble_votes()
    print("\n+++ cross validation technique")
    cross_val_predict()
    print("\n+++ neural networks")
    nn_model()
    print("\n+++ meta-classifier model")
    meta_model()
# Run the demos only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()