vkuznet/HandOnSession2.py Secret

## HandOnSession2.py
#!/usr/bin/env python3

import warnings
warnings.filterwarnings("ignore")

# import sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# import numpy
import numpy as np

# Fix the random seed for reproducibility: SEED seeds numpy's global RNG here
# and is also passed explicitly as random_state to the train/test splits,
# the KFold splitters and the RandomForestClassifier used in this file.
SEED = 7
np.random.seed(SEED)

def first_model():
    """Train a decision tree on half of the iris data and print its accuracy.

    The dataset is split 50/50 into a train and a test part with
    ``random_state=SEED``; the accuracy on the test part is printed as a
    percentage.
    """
    dataset = load_iris()
    features, labels = dataset.data, dataset.target

    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=.5, random_state=SEED)

    # fit() hands back the estimator, so training and prediction can chain
    predicted = DecisionTreeClassifier().fit(x_train, y_train).predict(x_test)

    # Prediction accuracy
    percent = accuracy_score(y_test, predicted) * 100
    print("Accuracy for Decision Tree Classifier: " + str(percent) + "%")

# Task: load more classifiers and compare their accuracies
# RandomForestClassifier, KNeighborsClassifier, GradientBoostingClassifier

# QA: what happens if you change test_size and/or the random_state value?

# Task: write a simple ensemble model which combines multiple classifiers
# and check its accuracy against y_test

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

def ensemble_model():
    """Compare two classifiers with a naive label-averaging ensemble.

    Fits a decision tree and a Gaussian naive Bayes model on half of the iris
    data and prints the test accuracy of each.  The two label predictions are
    then averaged.  Scoring the raw average is attempted inside a try block
    because the average can contain half-integer ("continuous") values; any
    resulting error is printed instead of aborting.  Finally the average is
    scored twice more, with half-integer values resolved to the higher and to
    the lower neighbouring label.
    """
    iris = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=.5, random_state=SEED)

    preds_1 = DecisionTreeClassifier().fit(x_train, y_train).predict(x_test)

    # Prediction accuracy
    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds_1)*100)+"%")

    preds_2 = GaussianNB().fit(x_train, y_train).predict(x_test)

    # Prediction accuracy
    print("Accuracy for GaussianNB: " + str(accuracy_score(y_test, preds_2)*100)+"%")

    # make an average among the two predictions
    # NOTE: when the models predict classes 0 and 2 the average is 1, a label
    # neither of them chose; this is inherent to averaging class labels.
    preds_a = (preds_1+preds_2)/2.
    print("Ensemble preds", preds_a)
    try:
        # only the metric call can fail, so only it is guarded
        print("Accuracy for ensemble model: " + str(accuracy_score(y_test, preds_a)*100)+"%")
    except ValueError as exp:
        print("ERROR: %s" % exp)
        print("We need to handle continuous valies")

    # Resolve disagreements upwards (1.5 -> 2).  np.ceil handles every
    # half-integer (0.5 as well as 1.5), so the metric never receives a
    # continuous value; whole numbers are left untouched.
    preds_up = np.ceil(preds_a)
    print("Accuracy for ensemble model (1.5->2): " + str(accuracy_score(y_test, preds_up)*100)+"%")

    # Resolve disagreements downwards (1.5 -> 1), again for every half-integer.
    preds_down = np.floor(preds_a)
    print("Accuracy for ensemble model (1.5->1): " + str(accuracy_score(y_test, preds_down)*100)+"%")

# Task: take 3 different classifiers and create an ensemble with votes;
# a vote is the prediction shared by at least two of the three classifiers

import numpy as np

def vote_preds(preds1, preds2, preds3):
    """Combine three prediction sequences by per-sample majority vote.

    For every sample the label predicted by at least two of the three inputs
    wins.  When all three disagree, the mean of the three labels rounded to
    the nearest integer is used instead.

    Args:
        preds1: sequence of numeric class labels from the first classifier.
        preds2: sequence of numeric class labels from the second classifier.
        preds3: sequence of numeric class labels from the third classifier.

    Returns:
        numpy array holding one voted label per sample.

    Raises:
        ValueError: if the three sequences differ in length.
    """
    if not (len(preds1) == len(preds2) == len(preds3)):
        raise ValueError("prediction sequences must have the same length")

    votes = []
    for p1, p2, p3 in zip(preds1, preds2, preds3):
        if p1 == p2 or p1 == p3:
            votes.append(p1)
        elif p2 == p3:
            votes.append(p2)
        else:  # no consistency, we'll average
            # int() keeps the tie-break an integer label so the resulting
            # array does not silently become floating point
            votes.append(int(round((p1 + p2 + p3) / 3.)))
    return np.array(votes)

def ensemble_votes():
    """Score a three-classifier voting ensemble on the iris data.

    A decision tree, a Gaussian naive Bayes and a k-nearest-neighbours model
    are each fitted on half of the data.  Their individual test accuracies
    are printed, followed by the accuracy of the labels that vote_preds()
    derives from the three prediction arrays.
    """
    dataset = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=.5, random_state=SEED)

    def report(prefix, predicted):
        # Prediction accuracy, printed as a percentage after the given prefix
        print(prefix + str(accuracy_score(y_test, predicted)*100) + "%")

    tree_preds = DecisionTreeClassifier().fit(x_train, y_train).predict(x_test)
    print("DecisionTree preds", tree_preds)
    report("Accuracy for Decision Tree Classifier: ", tree_preds)

    bayes_preds = GaussianNB().fit(x_train, y_train).predict(x_test)
    report("Accuracy for GaussianNB: ", bayes_preds)

    knn_preds = KNeighborsClassifier().fit(x_train, y_train).predict(x_test)
    report("Accuracy for KNeighborsClassifier: ", knn_preds)

    voted = vote_preds(tree_preds, bayes_preds, knn_preds)
    report("Accuracy for ensemble model with votes: ", voted)

# Introduce concept of scaling and cross validation

from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.svm.classes import SVC
from sklearn.metrics import confusion_matrix

def cross_val_predict():
    """Run 4-fold cross validation of an SVC on the standardised iris data.

    For every fold the classifier is refitted on the training indices, and
    the accuracy and the confusion matrix on the held-out indices are printed.

    NOTE(review): this function has the same name as, and therefore shadows,
    the ``cross_val_predict`` imported from sklearn.model_selection.
    NOTE(review): the scaler is fitted on the full dataset before splitting,
    so the held-out folds influence the scaling - confirm this is intended.
    """
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    clf = SVC()
    cv = KFold(n_splits=4, random_state=SEED, shuffle=True)

    # folds are numbered from 1 in the printed report
    for fold, (train_index, test_index) in enumerate(cv.split(X), start=1):
        clf.fit(X[train_index], y[train_index])
        ypred = clf.predict(X[test_index])
        # the value is plain accuracy, so it is labelled as such (not AUC)
        acc = accuracy_score(y[test_index], ypred)
        print("Fold: %s, accuracy: %s" % (fold, acc))
        print(confusion_matrix(y[test_index], ypred))

# Final task: write ensemble model (voting or not) which will perform
# best using cross validation techniques

# Bonus: https://www.programcreek.com/python/example/81062/sklearn.datasets.load_iris

# If time permits, introduce a Keras NN


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

def nn_model():
    """Cross-validate a small Keras network on the standardised iris data.

    The labels are integer-encoded and then one-hot encoded.  A network with
    one hidden layer of 8 ReLU units and a 3-unit softmax output is wrapped
    in KerasClassifier, and its 5-fold cross-validation score is printed as
    mean and standard deviation in percent.
    """
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    # encode class values as integers
    int_labels = LabelEncoder().fit_transform(y)
    # convert integers to categorical variables (i.e. one hot encoded)
    onehot_labels = np_utils.to_categorical(int_labels)
    print("input dataset labels : %s ... %s" % (y[0], y[-1]))
    print("categorical variables: %s ... %s" % (onehot_labels[0], onehot_labels[-1]))

    # factory passed to KerasClassifier as build_fn
    def base_model():
        net = Sequential([
            Dense(8, input_dim=4, activation='relu'),
            Dense(3, activation='softmax'),
        ])
        net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return net

    wrapped = KerasClassifier(build_fn=base_model, epochs=100, batch_size=5, verbose=0)

    # 5 shuffled folds: each round tests on 20% of the data and trains on 80%
    folds = KFold(n_splits=5, shuffle=True, random_state=SEED)
    scores = cross_val_score(wrapped, X, onehot_labels, cv=folds)
    print("NN validation accuracy: %.2f%% +- (%.2f%%)" % (scores.mean()*100, scores.std()*100))

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier

# based on https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/
def meta_model():
    """Compare three base classifiers with a stacked ensemble built on them.

    A 1-nearest-neighbour, a random forest and a Gaussian naive Bayes model
    are combined through mlxtend's StackingCVClassifier with a logistic
    regression meta-classifier.  Each of the four estimators is scored with
    3-fold cross validation on the standardised iris data, and the mean and
    standard deviation of the accuracy are printed.
    """
    X, y = load_iris(return_X_y=True)
    X = StandardScaler().fit_transform(X)

    knn = KNeighborsClassifier(n_neighbors=1)
    forest = RandomForestClassifier(random_state=SEED)
    bayes = GaussianNB()
    stacked = StackingCVClassifier(classifiers=[knn, forest, bayes],
                                   meta_classifier=LogisticRegression())

    print('3-fold cross validation:\n')

    # evaluated in this order: the base models first, the stack last
    candidates = [
        ('KNN', knn),
        ('Random Forest', forest),
        ('Naive Bayes', bayes),
        ('Stacking Classifier (LogisticRegression)', stacked),
    ]
    for label, estimator in candidates:
        scores = model_selection.cross_val_score(estimator, X, y, cv=3, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

def main():
    """Run every demo of the session in order, printing a banner before each."""
    stages = (
        ("\n+++ first model", first_model),
        ("\n+++ ensemble model", ensemble_model),
        ("\n+++ ensemble votes", ensemble_votes),
        ("\n+++ cross validation technique", cross_val_predict),
        ("\n+++ neural networks", nn_model),
        ("\n+++ meta-classifier model", meta_model),
    )
    for banner, stage in stages:
        print(banner)
        stage()

if __name__ == '__main__':
    main()
	#!/usr/bin/env python3

	import warnings
	warnings.filterwarnings("ignore")

	# import sklearn modules
	from sklearn.model_selection import train_test_split
	from sklearn.datasets import load_iris
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.metrics import accuracy_score, roc_auc_score

	# import numpy
	import numpy as np

	# Fix the random seed for reproducibility: SEED seeds numpy's global RNG
	# here and is also passed explicitly as random_state further down.
	SEED = 7
	np.random.seed(SEED)

	def first_model():
	    """Train a decision tree on half of the iris data and print its accuracy.

	    The dataset is split 50/50 into a train and a test part with
	    ``random_state=SEED``; the accuracy on the test part is printed as a
	    percentage.
	    """
	    iris = load_iris()
	    x = iris.data
	    y = iris.target

	    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5, random_state=SEED)

	    model = DecisionTreeClassifier()
	    model = model.fit(x_train, y_train)
	    preds = model.predict(x_test)

	    # Prediction accuracy
	    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds)*100)+"%")

	# Task: load more classifiers and compare their accuracies
	# RandomForestClassifier, KNeighborsClassifier, GradientBoostingClassifier

	# QA: what happens if you change test_size and/or the random_state value?

	# Task: write a simple ensemble model which combines multiple classifiers
	# and check its accuracy against y_test

	from sklearn.ensemble import GradientBoostingClassifier
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.naive_bayes import GaussianNB

	def ensemble_model():
	    """Compare two classifiers with a naive label-averaging ensemble.

	    Fits a decision tree and a Gaussian naive Bayes model on half of the
	    iris data and prints the test accuracy of each.  The two label
	    predictions are then averaged.  Scoring the raw average is attempted
	    inside a try block because the average can contain half-integer
	    ("continuous") values; any resulting error is printed instead of
	    aborting.  Finally the average is scored twice more, with half-integer
	    values resolved to the higher and to the lower neighbouring label.
	    """
	    iris = load_iris()
	    x = iris.data
	    y = iris.target

	    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5, random_state=SEED)

	    model_1 = DecisionTreeClassifier()
	    model_1 = model_1.fit(x_train, y_train)
	    preds_1 = model_1.predict(x_test)

	    # Prediction accuracy
	    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds_1)*100)+"%")

	    model_2 = GaussianNB()
	    model_2 = model_2.fit(x_train, y_train)
	    preds_2 = model_2.predict(x_test)

	    # Prediction accuracy
	    print("Accuracy for GaussianNB: " + str(accuracy_score(y_test, preds_2)*100)+"%")

	    # make an average among the two predictions
	    preds_a = (preds_1+preds_2)/2.
	    print("Ensemble preds", preds_a)
	    try:
	        # only the metric call can fail, so only it is guarded
	        print("Accuracy for ensemble model: " + str(accuracy_score(y_test, preds_a)*100)+"%")
	    except ValueError as exp:
	        print("ERROR: %s" % exp)
	        print("We need to handle continuous valies")

	    # Resolve disagreements upwards (1.5 -> 2).  np.ceil handles every
	    # half-integer (0.5 as well as 1.5); whole numbers are left untouched.
	    preds_up = np.ceil(preds_a)
	    print("Accuracy for ensemble model (1.5->2): " + str(accuracy_score(y_test, preds_up)*100)+"%")

	    # Resolve disagreements downwards (1.5 -> 1), again for every half-integer.
	    preds_down = np.floor(preds_a)
	    print("Accuracy for ensemble model (1.5->1): " + str(accuracy_score(y_test, preds_down)*100)+"%")

	# Task: take 3 different classifiers and create an ensemble with votes;
	# a vote is the prediction shared by at least two of the three classifiers

	import numpy as np

	def vote_preds(preds1, preds2, preds3):
	    """Combine three prediction sequences by per-sample majority vote.

	    For every sample the label predicted by at least two of the three
	    inputs wins.  When all three disagree, the mean of the three labels
	    rounded to the nearest integer is used instead.

	    Args:
	        preds1: sequence of numeric class labels from the first classifier.
	        preds2: sequence of numeric class labels from the second classifier.
	        preds3: sequence of numeric class labels from the third classifier.

	    Returns:
	        numpy array holding one voted label per sample.

	    Raises:
	        ValueError: if the three sequences differ in length.
	    """
	    if not (len(preds1) == len(preds2) == len(preds3)):
	        raise ValueError("prediction sequences must have the same length")

	    votes = []
	    for p1, p2, p3 in zip(preds1, preds2, preds3):
	        if p1 == p2 or p1 == p3:
	            votes.append(p1)
	        elif p2 == p3:
	            votes.append(p2)
	        else:  # no consistency, we'll average
	            # int() keeps the tie-break an integer label
	            votes.append(int(round((p1 + p2 + p3) / 3.)))
	    return np.array(votes)

	def ensemble_votes():
	    """Score a three-classifier voting ensemble on the iris data.

	    A decision tree, a Gaussian naive Bayes and a k-nearest-neighbours
	    model are each fitted on half of the data.  Their individual test
	    accuracies are printed, followed by the accuracy of the labels that
	    vote_preds() derives from the three prediction arrays.
	    """
	    iris = load_iris()
	    x = iris.data
	    y = iris.target

	    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5, random_state=SEED)

	    model_1 = DecisionTreeClassifier()
	    model_1 = model_1.fit(x_train, y_train)
	    preds_1 = model_1.predict(x_test)
	    print("DecisionTree preds", preds_1)

	    # Prediction accuracy
	    print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, preds_1)*100)+"%")

	    model_2 = GaussianNB()
	    model_2 = model_2.fit(x_train, y_train)
	    preds_2 = model_2.predict(x_test)

	    # Prediction accuracy
	    print("Accuracy for GaussianNB: " + str(accuracy_score(y_test, preds_2)*100)+"%")

	    model_3 = KNeighborsClassifier()
	    model_3 = model_3.fit(x_train, y_train)
	    preds_3 = model_3.predict(x_test)

	    # Prediction accuracy
	    print("Accuracy for KNeighborsClassifier: " + str(accuracy_score(y_test, preds_3)*100)+"%")

	    preds_a = vote_preds(preds_1, preds_2, preds_3)
	    print("Accuracy for ensemble model with votes: " + str(accuracy_score(y_test, preds_a)*100)+"%")

	# Introduce concept of scaling and cross validation

	from sklearn.preprocessing import LabelBinarizer, StandardScaler
	from sklearn.model_selection import cross_val_predict, KFold
	from sklearn.svm.classes import SVC
	from sklearn.metrics import confusion_matrix

	def cross_val_predict():
	    """Run 4-fold cross validation of an SVC on the standardised iris data.

	    For every fold the classifier is refitted on the training indices, and
	    the accuracy and the confusion matrix on the held-out indices are
	    printed.

	    NOTE(review): this function has the same name as, and therefore
	    shadows, the ``cross_val_predict`` imported from
	    sklearn.model_selection.
	    """
	    X, y = load_iris(return_X_y=True)
	    X = StandardScaler().fit_transform(X)

	    clf = SVC()
	    cv = KFold(n_splits=4, random_state=SEED, shuffle=True)

	    # folds are numbered from 1 in the printed report
	    for fold, (train_index, test_index) in enumerate(cv.split(X), start=1):
	        clf.fit(X[train_index], y[train_index])
	        ypred = clf.predict(X[test_index])
	        # the value is plain accuracy, so it is labelled as such (not AUC)
	        acc = accuracy_score(y[test_index], ypred)
	        print("Fold: %s, accuracy: %s" % (fold, acc))
	        print(confusion_matrix(y[test_index], ypred))

	# Final task: write ensemble model (voting or not) which will perform
	# best using cross validation techniques

	# Bonus: https://www.programcreek.com/python/example/81062/sklearn.datasets.load_iris

	# If time permits, introduce a Keras NN


	from sklearn.preprocessing import LabelEncoder
	from sklearn.model_selection import cross_val_score

	from keras.models import Sequential
	from keras.layers import Dense
	from keras.wrappers.scikit_learn import KerasClassifier
	from keras.utils import np_utils

	def nn_model():
	    """Cross-validate a small Keras network on the standardised iris data.

	    The labels are integer-encoded and then one-hot encoded.  A network
	    with one hidden layer of 8 ReLU units and a 3-unit softmax output is
	    wrapped in KerasClassifier, and its 5-fold cross-validation score is
	    printed as mean and standard deviation in percent.
	    """
	    X, y = load_iris(return_X_y=True)
	    X = StandardScaler().fit_transform(X)

	    # encode class values as integers
	    encoder = LabelEncoder()
	    encoder.fit(y)
	    encoded_y = encoder.transform(y)
	    # convert integers to categorical variables (i.e. one hot encoded)
	    cat_y = np_utils.to_categorical(encoded_y)
	    print("input dataset labels : %s ... %s" % (y[0], y[-1]))
	    print("categorical variables: %s ... %s" % (cat_y[0], cat_y[-1]))

	    # create Keras NN model; this factory is passed to KerasClassifier
	    def base_model():
	        model = Sequential()
	        model.add(Dense(8, input_dim=4, activation='relu'))
	        model.add(Dense(3, activation='softmax'))
	        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	        return model

	    clf = KerasClassifier(build_fn=base_model, epochs=100, batch_size=5, verbose=0)

	    # evaluate the model using kFold cross validation with 20% of the data for testing and 80% for training
	    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
	    results = cross_val_score(clf, X, cat_y, cv=cv)
	    # scale the fractions to percentages
	    print("NN validation accuracy: %.2f%% +- (%.2f%%)" % (results.mean()*100, results.std()*100))

	from sklearn import model_selection
	from sklearn.linear_model import LogisticRegression
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.naive_bayes import GaussianNB
	from sklearn.ensemble import RandomForestClassifier
	from mlxtend.classifier import StackingCVClassifier

	# based on https://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/
	def meta_model():
	    """Compare three base classifiers with a stacked ensemble built on them.

	    A 1-nearest-neighbour, a random forest and a Gaussian naive Bayes
	    model are combined through mlxtend's StackingCVClassifier with a
	    logistic regression meta-classifier.  Each of the four estimators is
	    scored with 3-fold cross validation on the standardised iris data, and
	    the mean and standard deviation of the accuracy are printed.
	    """
	    X, y = load_iris(return_X_y=True)
	    X = StandardScaler().fit_transform(X)

	    clf1 = KNeighborsClassifier(n_neighbors=1)
	    clf2 = RandomForestClassifier(random_state=SEED)
	    clf3 = GaussianNB()
	    lr = LogisticRegression()

	    sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

	    print('3-fold cross validation:\n')

	    names = ['KNN', 'Random Forest', 'Naive Bayes', 'Stacking Classifier (LogisticRegression)']
	    for clf, label in zip([clf1, clf2, clf3, sclf], names):
	        scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')
	        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

	def main():
	    """Run every demo of the session in order, printing a banner before each."""
	    print("\n+++ first model")
	    first_model()
	    print("\n+++ ensemble model")
	    ensemble_model()
	    print("\n+++ ensemble votes")
	    ensemble_votes()
	    print("\n+++ cross validation technique")
	    cross_val_predict()
	    print("\n+++ neural networks")
	    nn_model()
	    print("\n+++ meta-classifier model")
	    meta_model()

	if __name__ == '__main__':
	    main()