-
-
Save vkuznet/d49d7f84008ef0b436f1ba7e511c7305 to your computer and use it in GitHub Desktop.
Hands-on session 2 for SOSC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import warnings | |
warnings.filterwarnings("ignore") | |
# import sklearn modules | |
from sklearn.model_selection import train_test_split | |
from sklearn.datasets import load_iris | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.metrics import accuracy_score, roc_auc_score | |
# import numpy | |
import numpy as np | |
# One fixed seed for the whole tutorial: it seeds NumPy's global RNG here and
# is also passed explicitly as random_state by the functions below.
SEED = 7
np.random.seed(SEED)
def first_model():
    """Train a decision tree on the iris data and print its hold-out accuracy.

    Half of the samples are held out for testing (test_size=.5) and the
    split is seeded with SEED.
    """
    features, labels = load_iris(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=.5, random_state=SEED)

    tree = DecisionTreeClassifier().fit(train_x, train_y)

    # Accuracy is reported as a percentage of correctly classified test rows.
    percent = accuracy_score(test_y, tree.predict(test_x)) * 100
    print("Accuracy for Decision Tree Classifier: " + str(percent) + "%")
# Task: load more classifiers and compare their accuracies | |
# RandomForestClassifier, KNeighborsClassifier, GradientBoostingClassifier | |
# QA: what will happen if you change test_size and/or add random_state value | |
# Task: write a simple ensemble model which will combine multiple classifiers
# and check its accuracy with y_test | |
from sklearn.ensemble import GradientBoostingClassifier | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.naive_bayes import GaussianNB | |
def ensemble_model():
    """Average the predictions of two classifiers and print the accuracies.

    A decision tree and a Gaussian naive Bayes model are trained on the same
    50/50 iris split. Their predicted labels are averaged, first as-is and
    then with the half-way value 1.5 resolved to class 2 and to class 1.
    """
    dataset = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=.5, random_state=SEED)

    tree_preds = DecisionTreeClassifier().fit(x_train, y_train).predict(x_test)
    tree_pct = accuracy_score(y_test, tree_preds) * 100
    print("Accuracy for Decision Tree Classifier: " + str(tree_pct) + "%")

    bayes_preds = GaussianNB().fit(x_train, y_train).predict(x_test)
    bayes_pct = accuracy_score(y_test, bayes_preds) * 100
    print("Accuracy for GaussianNB: " + str(bayes_pct) + "%")

    # Plain average of the two label vectors. It can contain half-way values
    # such as 1.5 that accuracy_score may reject, so any error is printed
    # instead of being raised.
    try:
        averaged = (tree_preds + bayes_preds) / 2.
        print("Ensemble preds", averaged)
        averaged_pct = accuracy_score(y_test, averaged) * 100
        print("Accuracy for ensemble model: " + str(averaged_pct) + "%")
    except Exception as exp:
        print("ERROR: %s" % exp)
        print("We need to handle continuous valies")

    # Resolve the 1.5 tie both ways and compare the outcomes.
    tie_rules = (
        (2, "Accuracy for ensemble model (1.5->2): "),
        (1, "Accuracy for ensemble model (1.5->1): "),
    )
    for tie_value, title in tie_rules:
        halves = (tree_preds + bayes_preds) / 2.
        resolved = np.where(halves == 1.5, tie_value, halves)
        print(title + str(accuracy_score(y_test, resolved) * 100) + "%")
# Task: take 3 different classifiers and create ensemble with votes | |
# votes can be assigned as most common predictions among two classifiers | |
import numpy as np | |
def vote_preds(preds1, preds2, preds3):
    """Combine three classifiers' predictions by per-sample majority vote.

    For every sample the label chosen by at least two of the three inputs
    wins. When all three disagree, the rounded mean of the three labels is
    used instead, so labels are expected to be numeric.

    Args:
        preds1: sequence (list or 1-D array) of predicted class labels.
        preds2: predictions of the second classifier, same length.
        preds3: predictions of the third classifier, same length.

    Returns:
        numpy.ndarray holding one voted label per sample.

    Raises:
        ValueError: if the three inputs do not have the same length.
    """
    # Misaligned inputs would pair predictions of different samples, so
    # reject them up front instead of truncating or failing mid-loop.
    if not len(preds1) == len(preds2) == len(preds3):
        raise ValueError(
            "prediction sequences must have equal length, got %d, %d and %d"
            % (len(preds1), len(preds2), len(preds3)))

    votes = []
    for p1, p2, p3 in zip(preds1, preds2, preds3):
        if p1 == p2 or p1 == p3:
            votes.append(p1)
        elif p2 == p3:
            votes.append(p2)
        else:
            # No two classifiers agree: fall back to the rounded mean.
            # int() keeps the vote an integer label whatever type round()
            # returns for the scalar at hand.
            votes.append(int(round((p1 + p2 + p3) / 3.)))
    return np.array(votes)
def ensemble_votes():
    """Compare three classifiers with their majority-vote ensemble on iris.

    A decision tree, Gaussian naive Bayes and k-nearest-neighbours model are
    trained on the same 50/50 split. Each model's accuracy is printed,
    followed by the accuracy of the labels produced by vote_preds().
    """
    features, labels = load_iris(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=.5, random_state=SEED)

    def report(title, predicted):
        # Print the accuracy of `predicted` against the held-out labels.
        print(title + str(accuracy_score(y_test, predicted) * 100) + "%")

    tree_preds = DecisionTreeClassifier().fit(x_train, y_train).predict(x_test)
    print("DecisionTree preds", tree_preds)
    report("Accuracy for Decision Tree Classifier: ", tree_preds)

    bayes_preds = GaussianNB().fit(x_train, y_train).predict(x_test)
    report("Accuracy for GaussianNB: ", bayes_preds)

    knn_preds = KNeighborsClassifier().fit(x_train, y_train).predict(x_test)
    report("Accuracy for KNeighborsClassifier: ", knn_preds)

    voted = vote_preds(tree_preds, bayes_preds, knn_preds)
    report("Accuracy for ensemble model with votes: ", voted)
# Introduce concept of scaling and cross validation | |
from sklearn.preprocessing import LabelBinarizer, StandardScaler | |
from sklearn.model_selection import cross_val_predict, KFold | |
from sklearn.svm.classes import SVC | |
from sklearn.metrics import confusion_matrix | |
def cross_val_predict():
    """Run 4-fold cross validation of an SVC on standardized iris data.

    For each fold the classifier is refit on the training indices, and the
    fold number, accuracy and confusion matrix for the held-out indices are
    printed.

    NOTE: this function has the same name as sklearn's ``cross_val_predict``
    imported just above it, so it replaces that import at module level. The
    name is kept because main() calls it.

    NOTE(review): the scaler is fit on all samples before the folds are
    split, so its statistics include the held-out rows. Consider fitting it
    on the training indices of each fold instead.
    """
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)
    clf = SVC()
    cv = KFold(n_splits=4, random_state=SEED, shuffle=True)
    for fold, (train_index, test_index) in enumerate(cv.split(X), start=1):
        clf.fit(X[train_index], y[train_index])
        ypred = clf.predict(X[test_index])
        # The value comes from accuracy_score, so it is reported as accuracy;
        # the previous "AUC" label did not match the metric computed.
        accuracy = accuracy_score(y[test_index], ypred)
        print("Fold: %s, accuracy: %s" % (fold, accuracy))
        print(confusion_matrix(y[test_index], ypred))
# Final task: write ensemble model (voting or not) which will perform | |
# best using cross validation techniques | |
# Bonus: https://www.programcreek.com/python/example/81062/sklearn.datasets.load_iris | |
# If time permits, introduce a Keras NN
from sklearn.preprocessing import LabelEncoder | |
from sklearn.model_selection import cross_val_score | |
from keras.models import Sequential | |
from keras.layers import Dense | |
from keras.wrappers.scikit_learn import KerasClassifier | |
from keras.utils import np_utils | |
def nn_model():
    """Cross-validate a small Keras network on standardized iris data.

    The labels are integer-encoded and then one-hot encoded. The network
    (one 8-unit ReLU layer and a 3-unit softmax output) is wrapped in
    KerasClassifier and scored with 5-fold cross validation; the mean and
    standard deviation of the fold scores are printed.
    """
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    # Integer-encode the class values, then expand them to one-hot vectors.
    label_encoder = LabelEncoder()
    encoded_y = label_encoder.fit(y).transform(y)
    cat_y = np_utils.to_categorical(encoded_y)
    print("input dataset labels : %s ... %s" % (y[0], y[-1]))
    print("categorical variables: %s ... %s" % (cat_y[0], cat_y[-1]))

    def base_model():
        # Factory passed to KerasClassifier; it returns a compiled model.
        network = Sequential()
        for layer in (Dense(8, input_dim=4, activation='relu'),
                      Dense(3, activation='softmax')):
            network.add(layer)
        network.compile(loss='categorical_crossentropy',
                        optimizer='adam',
                        metrics=['accuracy'])
        return network

    clf = KerasClassifier(build_fn=base_model, epochs=100, batch_size=5, verbose=0)

    # Five shuffled folds: each round trains on 80% and tests on 20%.
    folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
    results = cross_val_score(clf, X, cat_y, cv=folds)
    mean_pct = results.mean() * 100
    std_pct = results.std() * 100
    print("NN validation accuracy: %.2f%% +- (%.2f%%)" % (mean_pct, std_pct))
from sklearn import model_selection | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.ensemble import RandomForestClassifier | |
from mlxtend.classifier import StackingCVClassifier | |
# based on https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/ | |
def meta_model():
    """Compare three base classifiers with a stacking meta-classifier on iris.

    KNN, random forest and Gaussian naive Bayes are each scored with 3-fold
    cross validation, followed by a StackingCVClassifier that combines them
    through a logistic-regression meta-classifier.
    """
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=SEED)
    bayes = GaussianNB()
    meta = LogisticRegression()
    stacked = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                   meta_classifier=meta)

    print('3-fold cross validation:\n')

    # Each estimator is paired with the label used in its report line.
    candidates = (
        (knn, 'KNN'),
        (forest, 'Random Forest'),
        (bayes, 'Naive Bayes'),
        (stacked, 'Stacking Classifier (LogisticRegression)'),
    )
    for estimator, label in candidates:
        scores = model_selection.cross_val_score(
            estimator, X, y, cv=3, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
def main():
    """Run every tutorial step in order, printing a banner before each one."""
    steps = (
        ("\n+++ first model", first_model),
        ("\n+++ ensemble model", ensemble_model),
        ("\n+++ ensemble votes", ensemble_votes),
        ("\n+++ cross validation technique", cross_val_predict),
        ("\n+++ neural networks", nn_model),
        ("\n+++ meta-classifier model", meta_model),
    )
    for banner, step in steps:
        print(banner)
        step()
# Run the tutorial only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment