sh16ma/evaluation_classfier_practical.py

## evaluation_classfier_practical.py
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# algorithm
from sklearn.svm import SVC # サポートベクトルマシン
from sklearn.linear_model import LogisticRegression # ロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier # K近傍方
from sklearn.ensemble import RandomForestClassifier # ランダムフォレスト
from sklearn.ensemble import GradientBoostingClassifier # 勾配ブースティング
from sklearn.neural_network import MLPClassifier
# evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


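'''
# Datasets
Dataset loading is omitted here.
`X` (features) and `y` (labels) must be defined before the code below is run.
'''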

# (Setup) Split the data into a training set and a test set: test_size=0.2
# holds out 20% of the samples, and the fixed random_state keeps the split
# repeatable. X and y come from the dataset step that is omitted in this file.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


# (Setup) Build one pipeline per algorithm. Each pipeline first rescales the
# features with StandardScaler ("scl") and then applies the classifier ("clf").
pipe_names = [
    "SVC",
    "KNeighbors",
    "LogisticRegression",
    "RandomForest",
    "GradientBoosting",
    "MLP",
]
pipe_lines = [
    Pipeline([("scl", StandardScaler()), ("clf", estimator)])
    for estimator in (
        SVC(random_state=1),
        KNeighborsClassifier(n_neighbors=10),
        LogisticRegression(),
        RandomForestClassifier(random_state=1),
        GradientBoostingClassifier(random_state=1),
        MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=500, random_state=1),
    )
]
# Keep the individual pipelines reachable by name; order matches pipe_names.
pipe_svc, pipe_knn, pipe_logistic, pipe_rf, pipe_gb, pipe_mlp = pipe_lines


# For every algorithm: plot the confusion matrix, then print the accuracy and
# the per-class precision, recall, F1 score and support.
for model_name, model in zip(pipe_names, pipe_lines):
    # Train on the training split, then predict the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Confusion matrix comparing the actual classes with the predicted ones.
    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)

    # Render the matrix as a pale heat map and write each cell's count on it.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for row, counts in enumerate(confmat):
        for col, count in enumerate(counts):
            # x picks the column and y the row, so the text lands on cell
            # confmat[row, col].
            ax.text(x=col, y=row, s=count, va="center", ha="center", fontsize=25)
    ax.set_title(model_name, fontsize=20)
    ax.set_xlabel("predicted label", fontsize=17)
    ax.set_ylabel("true label", fontsize=17)
    plt.show()

    # Accuracy of this algorithm on the test split.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} accuracy : {accuracy:.3f}")
    print()
    # Precision, recall, F1 score and support for each class.
    # NOTE(review): the display names presumably map to the labels in sorted
    # order, i.e. the lower label is "良性" (benign) and the higher one is
    # "悪性" (malignant) - confirm against the dataset's label encoding.
    print(classification_report(y_test, y_pred, target_names=["良性", "悪性"]))
    print("_" * 40)
    print()
	%matplotlib inline
	import pandas as pd
	import matplotlib.pyplot as plt
	# preprocessing
	from sklearn.preprocessing import StandardScaler
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline
	# algorithm
	from sklearn.svm import SVC # サポートベクトルマシン
	from sklearn.linear_model import LogisticRegression # ロジスティック回帰
	from sklearn.neighbors import KNeighborsClassifier # K近傍方
	from sklearn.ensemble import RandomForestClassifier # ランダムフォレスト
	from sklearn.ensemble import GradientBoostingClassifier # 勾配ブースティング
	from sklearn.neural_network import MLPClassifier
	# evaluation
	from sklearn.metrics import confusion_matrix
	from sklearn.metrics import classification_report
	from sklearn.metrics import accuracy_score


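	'''
	# Datasets
	Dataset loading is omitted here.
	`X` (features) and `y` (labels) must be defined before the code below is run.
	'''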

	# (Setup) Split the data into a training set and a test set: test_size=0.2
	# holds out 20% of the samples, and the fixed random_state keeps the split
	# repeatable. X and y come from the dataset step that is omitted in this file.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=1,
	)


	# (Setup) Build one pipeline per algorithm. Each pipeline first rescales the
	# features with StandardScaler ("scl") and then applies the classifier ("clf").
	pipe_names = [
	    "SVC",
	    "KNeighbors",
	    "LogisticRegression",
	    "RandomForest",
	    "GradientBoosting",
	    "MLP",
	]
	pipe_lines = [
	    Pipeline([("scl", StandardScaler()), ("clf", estimator)])
	    for estimator in (
	        SVC(random_state=1),
	        KNeighborsClassifier(n_neighbors=10),
	        LogisticRegression(),
	        RandomForestClassifier(random_state=1),
	        GradientBoostingClassifier(random_state=1),
	        MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=500, random_state=1),
	    )
	]
	# Keep the individual pipelines reachable by name; order matches pipe_names.
	pipe_svc, pipe_knn, pipe_logistic, pipe_rf, pipe_gb, pipe_mlp = pipe_lines


	# For every algorithm: plot the confusion matrix, then print the accuracy and
	# the per-class precision, recall, F1 score and support.
	# The loop bodies below are nested under their `for` headers; without that
	# nesting the loops have no body and the code does not parse.
	for model_name, model in zip(pipe_names, pipe_lines):
	    # Train on the training split, then predict the held-out test split.
	    model.fit(X_train, y_train)
	    y_pred = model.predict(X_test)

	    # Confusion matrix comparing the actual classes with the predicted ones.
	    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)

	    # Render the matrix as a pale heat map and write each cell's count on it.
	    fig, ax = plt.subplots(figsize=(5, 5))
	    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
	    for row, counts in enumerate(confmat):
	        for col, count in enumerate(counts):
	            # x picks the column and y the row, so the text lands on cell
	            # confmat[row, col].
	            ax.text(x=col, y=row, s=count, va="center", ha="center", fontsize=25)
	    ax.set_title(model_name, fontsize=20)
	    ax.set_xlabel("predicted label", fontsize=17)
	    ax.set_ylabel("true label", fontsize=17)
	    plt.show()

	    # Accuracy of this algorithm on the test split.
	    accuracy = accuracy_score(y_test, y_pred)
	    print(f"{model_name} accuracy : {accuracy:.3f}")
	    print()
	    # Precision, recall, F1 score and support for each class.
	    # NOTE(review): the display names presumably map to the labels in sorted
	    # order, i.e. the lower label is "良性" (benign) and the higher one is
	    # "悪性" (malignant) - confirm against the dataset's label encoding.
	    print(classification_report(y_test, y_pred, target_names=["良性", "悪性"]))
	    print("_" * 40)
	    print()