Last active
January 19, 2022 10:13
-
-
Save sh16ma/c8ca409a18aa05a49471700eaae28151 to your computer and use it in GitHub Desktop.
#🐍 #Python #評価指標 #二値分類 #混同行列 #confusion_matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%matplotlib inline | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
# preprocessing | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.model_selection import train_test_split | |
from sklearn.pipeline import Pipeline | |
# algorithm | |
from sklearn.svm import SVC # サポートベクトルマシン | |
from sklearn.linear_model import LogisticRegression # ロジスティック回帰 | |
from sklearn.neighbors import KNeighborsClassifier # K近傍法 | |
from sklearn.ensemble import RandomForestClassifier # ランダムフォレスト | |
from sklearn.ensemble import GradientBoostingClassifier # 勾配ブースティング | |
from sklearn.neural_network import MLPClassifier | |
# evaluation | |
from sklearn.metrics import confusion_matrix | |
from sklearn.metrics import classification_report | |
from sklearn.metrics import accuracy_score | |
''' | |
# Datasets | |
データセットは省略 | |
''' | |
# (Setup) Hold out 20% of the samples as a test set; the fixed seed keeps the
# split reproducible. X and y come from the dataset-loading step, which is
# omitted from this snippet.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)


def _scaled_pipeline(classifier):
    """Return a Pipeline that standardizes the features, then applies *classifier*."""
    return Pipeline([("scl", StandardScaler()), ("clf", classifier)])


# (Setup) One scaling + classifier pipeline per algorithm under comparison.
pipe_svc = _scaled_pipeline(SVC(random_state=1))
pipe_knn = _scaled_pipeline(KNeighborsClassifier(n_neighbors=10))
pipe_logistic = _scaled_pipeline(LogisticRegression())
pipe_rf = _scaled_pipeline(RandomForestClassifier(random_state=1))
pipe_gb = _scaled_pipeline(GradientBoostingClassifier(random_state=1))
pipe_mlp = _scaled_pipeline(
    MLPClassifier(hidden_layer_sizes=(5, 2), max_iter=500, random_state=1)
)

# Display names and pipelines are declared side by side so the two lists
# derived below always stay in the same order.
_named_pipelines = [
    ("SVC", pipe_svc),
    ("KNeighbors", pipe_knn),
    ("LogisticRegression", pipe_logistic),
    ("RandomForest", pipe_rf),
    ("GradientBoosting", pipe_gb),
    ("MLP", pipe_mlp),
]
pipe_names = [label for label, _ in _named_pipelines]
pipe_lines = [pipeline for _, pipeline in _named_pipelines]
# For each algorithm: fit on the training split, predict on the test split,
# then show the confusion matrix and print accuracy, precision, recall and F1.
# zip pairs every display name with its pipeline directly, so there is no
# index into a parallel list to keep in sync.
for name, pipe in zip(pipe_names, pipe_lines):
    # Train the model.
    pipe.fit(X_train, y_train)
    # Predict on the held-out test data.
    y_pred = pipe.predict(X_test)

    # Confusion matrix comparing the actual classes with the predicted ones.
    confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)

    # Draw the confusion matrix as a heat map with the count written in each cell.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    for row, counts in enumerate(confmat):
        for col, count in enumerate(counts):
            # Matrix rows run along the y axis and columns along the x axis.
            ax.text(x=col, y=row, s=count, va="center", ha="center", fontsize=25)
    # Use the Axes methods so the labels are bound to this figure explicitly
    # instead of relying on pyplot's "current axes" state.
    ax.set_title(name, fontsize=20)
    ax.set_xlabel("predicted label", fontsize=17)
    ax.set_ylabel("true label", fontsize=17)
    plt.show()

    # Accuracy for this algorithm.
    print(f"{name} accuracy : {accuracy_score(y_test, y_pred):.3f}")
    print()

    # Precision, recall, F1 and support per class.
    # NOTE(review): target_names are matched to the labels in sorted order, so
    # this assumes the lower label is 良性 (benign) and the higher one is
    # 悪性 (malignant), and that there are exactly two classes — confirm
    # against how the omitted dataset encodes y.
    print(classification_report(y_test, y_pred, target_names=["良性", "悪性"]))
    print("_" * 40)
    print()
Author
sh16ma
commented
Mar 1, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment