ML workshop competition solution
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold
xgb_best_params = {
    'objective': 'binary:logistic',
    'max_depth': 8,
    'n_estimators': 300,
    'learning_rate': 0.1,
    'eta': 0.01,  # note: 'eta' is XGBoost's alias for 'learning_rate', so this conflicts with the 0.1 above
    'gamma': 0.12,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
}
lgbm_best_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 20,
    'n_estimators': 300,
    'learning_rate': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'verbosity': -1,
    'max_bin': 255,
    'feature_fraction': 1.0,  # note: alias of 'colsample_bytree' in LightGBM; both are left at 1.0
}
logistic_best_params = {
    'C': 0.1,
    'penalty': 'l2',
    'max_iter': 1000,
}
nn_best_params = {
    'activation': 'logistic',
    'hidden_layer_sizes': (64, 128),
    'max_iter': 10000,
    'early_stopping': True,
}
rf_best_params = {
    'n_estimators': 400,
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'bootstrap': True,
}
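# The gist doesn't say how the "best params" above were found, presumably manual
# tuning. As a minimal sketch, one common way to search for such values is
# scikit-learn's GridSearchCV (the grid below is a hypothetical example, and the
# block is commented out so the script's behavior is unchanged):
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier
# search = GridSearchCV(
#     XGBClassifier(random_state=71, objective='binary:logistic'),
#     param_grid={'max_depth': [6, 8, 10], 'learning_rate': [0.05, 0.1]},  # assumed search space
#     scoring='neg_log_loss',  # same metric the competition evaluation uses
#     cv=5,
# )
# search.fit(train_x, train_y)
# print(search.best_params_)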
# Define functions that take training data and params and return a trained model
# They use the ready-made (and mysterious) xgboost and lightgbm libraries,
# which performed better than anything hand-written. Impressive!
def get_xgboost_model(train_x, train_y, params):
    from xgboost import XGBClassifier
    model = XGBClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_lgbm_model(train_x, train_y, params):
    from lightgbm import LGBMClassifier
    model = LGBMClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_logistic_model(train_x, train_y, params):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_nn_model(train_x, train_y, params):
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_rf_model(train_x, train_y, params):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model
# Evaluation function
# Splits the data into 5 folds, using 4 for training and 1 for validation
# Repeats this 5 times and returns the scores averaged over the validation folds
# logloss measures the size of the errors; accuracy is the fraction of correct answers
def kfold(model_generator, x, y, params):
    scores_logloss = []
    scores_accuracy = []
    kf = KFold(n_splits=5, shuffle=True, random_state=71)
    for tr_idx, va_idx in kf.split(x):
        tr_x, va_x = x.iloc[tr_idx], x.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        model = model_generator(tr_x, tr_y, params)
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        accuracy = accuracy_score(va_y, va_pred > 0.5)
        scores_logloss.append(logloss)
        scores_accuracy.append(accuracy)
    logloss = np.mean(scores_logloss)
    accuracy = np.mean(scores_accuracy)
    return (logloss, accuracy)
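# Example usage of kfold (illustrative, commented out since the original run
# never calls it): it works with any of the factory functions above, once
# train_x/train_y have been built further down.
# cv_logloss, cv_accuracy = kfold(get_xgboost_model, train_x, train_y, xgb_best_params)
# print(f'xgb cv logloss: {cv_logloss:.5f}, accuracy: {cv_accuracy:.5f}')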
print("データの読み込みをしています...") | |
train = pd.read_csv('train.csv') | |
test = pd.read_csv('test.csv') | |
print("データの読み込みが完了しました。") | |
print("データの前処理をしています...") | |
# ignore=['urgent'] | |
# train = train.drop(ignore, axis=1) | |
# test = test.drop(ignore, axis=1) | |
# classはstring型なので数値に変換 | |
train['class'] = train['class'].map({ | |
'normal': 0, | |
'attack': 1 | |
}) | |
# The remaining string columns need to become numeric somehow
# protocol_type has few distinct values (3), so add a "1 if this value, else 0" column per value
# service and flag have many distinct values, so just map each string to an integer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import warnings
warnings.simplefilter('ignore', FutureWarning)
for c in ['protocol_type', 'service', 'flag']:
    le = LabelEncoder()
    le.fit(pd.concat([train[c], test[c]]).fillna('NA'))
    train[c] = le.transform(train[c].fillna('NA'))
    test[c] = le.transform(test[c].fillna('NA'))
for c in ['protocol_type']:
    ohe = OneHotEncoder()
    ohe.fit(pd.DataFrame(pd.concat([train[c], test[c]])))
    train_enc = pd.DataFrame(ohe.transform(pd.DataFrame(train[c])).toarray(), columns=ohe.get_feature_names_out())
    train = pd.concat([train, train_enc], axis=1)
    test_enc = pd.DataFrame(ohe.transform(pd.DataFrame(test[c])).toarray(), columns=ohe.get_feature_names_out())
    test = pd.concat([test, test_enc], axis=1)
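# For reference, pandas can produce the same kind of 0/1 indicator columns in one
# call; a sketch of the similar approach (commented out, not used here):
# train = pd.get_dummies(train, columns=['protocol_type'])
# test = pd.get_dummies(test, columns=['protocol_type'])
# Two differences: get_dummies drops the original column while the code above keeps
# it, and the OneHotEncoder is fit on train+test combined, which guarantees both
# frames end up with identical indicator columns.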
# Model-based imputation: for each of these sparse columns, train an XGBoost model
# on the rows where the value is known, then fill the NaNs with its predictions
for col in ['root_shell', 'num_shells', 'num_access_files', 'num_failed_logins', 'wrong_fragment', 'urgent']:
    train_dropped = train.dropna(subset=[col]).drop([col, 'class'], axis=1)
    train_target = train.dropna(subset=[col])[col]
    model = get_xgboost_model(train_dropped.fillna(train_dropped.mean()), train_target, xgb_best_params)
    train_predict = model.predict(train.drop([col, 'class'], axis=1).fillna(train.drop([col], axis=1).mean()))
    test_predict = model.predict(test.drop([col], axis=1).fillna(test.drop([col], axis=1).mean()))
    train[col] = pd.DataFrame(train[col]).fillna(pd.DataFrame(train_predict)).to_numpy()
    test[col] = pd.DataFrame(test[col]).fillna(pd.DataFrame(test_predict)).to_numpy()
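# A simpler baseline for comparison would be plain mean imputation; a sketch
# (commented out, and note the mean is taken from train only to avoid leaking
# test statistics):
# for col in ['root_shell', 'num_shells', 'num_access_files', 'num_failed_logins', 'wrong_fragment', 'urgent']:
#     col_mean = train[col].mean()
#     train[col] = train[col].fillna(col_mean)
#     test[col] = test[col].fillna(col_mean)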
print("Finished preprocessing data.")
print("Adding features...")
# Give this function a dataframe and it adds engineered features:
# rank-style buckets based on value ranges, plus some log transforms
def add_data(data):
    data['count_rank'] = [0 if 65 < val < 80 or 150 < val < 170 or 240 < val < 255 else 1 for val in data['count']]
    data['count_one_rank'] = [min(val - (val // 85) * 85, val - (val // 85 + 1) * 85) for val in data['count']]
    data['count_two_rank'] = [min((val+30) - ((val+30) // 102) * 102, (val+30) - ((val+30) // 102 + 1) * 102)-30 for val in data['count']]
    data['serror_rate_rank'] = [0 if val < 0.01 else 1 if val < 0.79 else 2 for val in data['serror_rate']]
    data['srv_serror_rate_rank'] = [0 if val < 0.8 else 1 for val in data['srv_serror_rate']]
    data['same_srv_rate_rank'] = [0 if val < 0.2 else 1 if val < 0.8 else 2 for val in data['same_srv_rate']]
    data['dst_host_srv_count_rank'] = [0 if val < 30 else 1 if val < 220 else 2 for val in data['dst_host_srv_count']]
    data['src_bytes'] = data['src_bytes'].apply(lambda x: np.log1p(x))
    data['dst_bytes'] = data['dst_bytes'].apply(lambda x: np.log1p(x))
    data['duration'] = data['duration'].apply(lambda x: np.log1p(x))
    data['num_compromised'] = data['num_compromised'].apply(lambda x: np.log1p(x))
    data['srv_count'] = data['srv_count'].apply(lambda x: np.log1p(x))
    data['srv_rerror_rate_log'] = data['srv_rerror_rate'].apply(lambda x: np.log1p(x))
    data['diff_srv_rate'] = data['diff_srv_rate'].apply(lambda x: np.log1p(x))
    # additions by natsumeguru from here
    data['root_shell_rank'] = np.where(data['root_shell'] < 1, 0.45, 0.2)
    data['num_shell_rank'] = np.where(data['num_shells'] < 2, 0.4, 0)
    data['num_access_files_rank'] = np.where(data['num_access_files'] < 1, 0.45, 0)
    data['num_failed_logins_rank'] = np.where(data['num_failed_logins'] < 1, 0.45, 0)
    data['wrong_fragment_rank'] = np.where(data['wrong_fragment'] < 1, 0.45, 1)
    data['urgent_rank'] = np.where(data['urgent'] < 2, 0.55, 0)
    return data
train = add_data(train)
test = add_data(test)
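# Why log1p in add_data above: those features are heavy-tailed counts, and
# log1p compresses the tail while keeping zero at zero, e.g.:
# np.log1p(0) == 0.0; np.log1p(100) ≈ 4.615; np.log1p(1_000_000) ≈ 13.816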
# import seaborn as sns
# import matplotlib.pyplot as plt
# sns.set()
# for col in train.drop(['protocol_type', 'service', 'flag'], axis=1).columns:
#     print(f"col: {col}")
#     sns.displot(data=train, x=col, hue='class', kind='kde', bw_adjust=0.01).savefig(f'analytics/{col}.png')
#     plt.close()
#     plt.clf()
# The id column clearly has nothing to do with the answer, so drop it
# From the raw train data, build train_x (id and class dropped) and train_y (class only)
train_x = train.drop(['id', 'class'], axis=1)
train_y = train['class']
# From the raw test data, build test_x with id dropped
test_x = test.copy().drop('id', axis=1)
train_x = train_x.astype(np.float32)
train_y = train_y.astype(np.float32)
test_x = test_x.astype(np.float32)
print("Finished adding features.")
import matplotlib.pyplot as plt  # only used by the commented-out analytics code below
# Evaluation
print("Evaluating...")
# Evaluation helper
# Given three models' predictions on the training data, the true labels, and the same
# models' predictions on other data, fit a logistic regression that predicts the labels
# from the training predictions and return its probabilities for the other predictions
# Used to combine the xgboost, lightgbm and random forest results
def get_logistic_pred(xgb_tr, lgbm_tr, rf_tr, y_tr, xgb_va, lgbm_va, rf_va):
    xgb_va = xgb_va.reshape(-1, 1)
    lgbm_va = lgbm_va.reshape(-1, 1)
    rf_va = rf_va.reshape(-1, 1)
    xgb_tr = xgb_tr.reshape(-1, 1)
    lgbm_tr = lgbm_tr.reshape(-1, 1)
    rf_tr = rf_tr.reshape(-1, 1)
    x_va = np.concatenate([xgb_va, lgbm_va, rf_va], axis=1)
    x_tr = np.concatenate([xgb_tr, lgbm_tr, rf_tr], axis=1)
    logistic_model = get_logistic_model(x_tr, y_tr, logistic_best_params)
    return logistic_model.predict_proba(x_va)[:, 1]
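# What get_logistic_pred does by hand is "stacking": base-model probabilities become
# features for a logistic-regression meta-model. scikit-learn ships the same pattern;
# a minimal sketch of the equivalent (commented out, and note it refits the base
# models itself rather than reusing the fitted models above):
# from sklearn.ensemble import StackingClassifier, RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# stack = StackingClassifier(
#     estimators=[('xgb', XGBClassifier(**xgb_best_params)),
#                 ('lgbm', LGBMClassifier(**lgbm_best_params)),
#                 ('rf', RandomForestClassifier(**rf_best_params))],
#     final_estimator=LogisticRegression(**logistic_best_params),
#     stack_method='predict_proba',
# )
# stack.fit(train_x.fillna(train_x.mean()), train_y)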
# Given training features, training labels, and test features, return predictions for the test data
# With analytics=True, plot the distribution of predictions on the training data
def get_merged_pred(tr_x, tr_y, va_x, analytics=False):
    xgb_model = get_xgboost_model(tr_x, tr_y, xgb_best_params)
    xgb_tr_pred = xgb_model.predict_proba(tr_x)[:, 1]
    xgb_va_pred = xgb_model.predict_proba(va_x)[:, 1]
    lgbm_model = get_lgbm_model(tr_x, tr_y, lgbm_best_params)
    lgbm_tr_pred = lgbm_model.predict_proba(tr_x)[:, 1]
    lgbm_va_pred = lgbm_model.predict_proba(va_x)[:, 1]
    # nn_model = get_nn_model(tr_x.fillna(tr_x.mean()), tr_y, nn_best_params)
    # nn_tr_pred = nn_model.predict_proba(tr_x.fillna(tr_x.mean()))[:, 1]
    # nn_va_pred = nn_model.predict_proba(va_x.fillna(tr_x.mean()))[:, 1]
    # logistic_model = get_logistic_model(tr_x.fillna(tr_x.mean()), tr_y, logistic_best_params)
    # logistic_tr_pred = logistic_model.predict_proba(tr_x.fillna(tr_x.mean()))[:, 1]
    # logistic_va_pred = logistic_model.predict_proba(va_x.fillna(tr_x.mean()))[:, 1]
    rf_model = get_rf_model(tr_x.fillna(tr_x.mean()), tr_y, rf_best_params)
    rf_tr_pred = rf_model.predict_proba(tr_x.fillna(tr_x.mean()))[:, 1]
    rf_va_pred = rf_model.predict_proba(va_x.fillna(tr_x.mean()))[:, 1]
    # overall_va_pred = get_logistic_pred(xgb_tr_pred, lgbm_tr_pred, tr_y, xgb_va_pred, lgbm_va_pred)
    overall_va_pred = get_logistic_pred(xgb_tr_pred, lgbm_tr_pred, rf_tr_pred, tr_y, xgb_va_pred, lgbm_va_pred, rf_va_pred)
    # if analytics:
    #     plt.figure(figsize=(10, 10))
    #     plt.scatter(xgb_tr_pred, nn_tr_pred, c=tr_y, cmap='viridis', s=1, alpha=0.5)
    #     plt.colorbar()
    #     plt.xlabel('xgb')
    #     plt.ylabel('nn')
    #     plt.savefig('analytics/plot.png')
    #
    #     plt.figure(figsize=(10, 10))
    #     plt.scatter(xgb_va_pred, nn_va_pred, c=overall_va_pred, cmap='viridis', s=1, alpha=0.5)
    #     plt.colorbar()
    #     plt.xlabel('xgb')
    #     plt.ylabel('nn')
    #     plt.savefig('analytics/plot2.png')
    # import xgboost as xgb
    # import lightgbm as lgb
    # xgb.plot_importance(xgb_model).figure.savefig('analytics/xgb_importance.png')
    # lgb.plot_importance(lgbm_model).figure.savefig('analytics/lgbm_importance.png')
    # Alternatively, average the probabilities to get the combined answer:
    # overall_va_pred = (xgb_va_pred + lgbm_va_pred) / 2
    return (overall_va_pred, xgb_va_pred, lgbm_va_pred, rf_va_pred)
eval_accuracy = []
eval_xgb_accuracy = []
eval_lgbm_accuracy = []
eval_nn_accuracy = []
eval_logistic_accuracy = []
eval_rf_accuracy = []
eval_logloss = []
eval_xgb_logloss = []
eval_lgbm_logloss = []
eval_nn_logloss = []
eval_logistic_logloss = []
eval_rf_logloss = []
# Finally, check the accuracy for reference
# Even if this is high it could just be overfitting, so it can't be fully trusted...
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    (va_pred, va_xgb_pred, va_lgbm_pred, va_rf_pred) = get_merged_pred(tr_x, tr_y, va_x)
    # collect the per-fold scores and average them afterwards
    eval_logloss.append(log_loss(va_y, va_pred))
    eval_xgb_logloss.append(log_loss(va_y, va_xgb_pred))
    eval_lgbm_logloss.append(log_loss(va_y, va_lgbm_pred))
    # eval_nn_logloss.append(log_loss(va_y, va_nn_pred))
    # eval_logistic_logloss.append(log_loss(va_y, va_logistic_pred))
    eval_rf_logloss.append(log_loss(va_y, va_rf_pred))
    eval_accuracy.append(accuracy_score(va_y, va_pred > 0.5))
    eval_xgb_accuracy.append(accuracy_score(va_y, va_xgb_pred > 0.5))
    eval_lgbm_accuracy.append(accuracy_score(va_y, va_lgbm_pred > 0.5))
    # eval_nn_accuracy.append(accuracy_score(va_y, va_nn_pred > 0.5))
    # eval_logistic_accuracy.append(accuracy_score(va_y, va_logistic_pred > 0.5))
    eval_rf_accuracy.append(accuracy_score(va_y, va_rf_pred > 0.5))
print("Evaluation complete.")
print(f'logloss: {np.mean(eval_logloss):.8f}, accuracy: {np.mean(eval_accuracy):.8f}')
print("Per-model scores")
print(f'xgb logloss: {np.mean(eval_xgb_logloss):.8f}, accuracy: {np.mean(eval_xgb_accuracy):.8f}')
print(f'lgbm logloss: {np.mean(eval_lgbm_logloss):.8f}, accuracy: {np.mean(eval_lgbm_accuracy):.8f}')
# print(f'nn logloss: {np.mean(eval_nn_logloss):.8f}, accuracy: {np.mean(eval_nn_accuracy):.8f}')
# print(f'logistic logloss: {np.mean(eval_logistic_logloss):.8f}, accuracy: {np.mean(eval_logistic_accuracy):.8f}')
print(f'rf logloss: {np.mean(eval_rf_logloss):.8f}, accuracy: {np.mean(eval_rf_accuracy):.8f}')
# Final prediction
# Predict on the real test data and build the submission csv
print("Running the final prediction...")
(pred, xgb_pred, lgbm_pred, rf_pred) = get_merged_pred(train_x, train_y, test_x, True)
pred_label = np.where(pred > 0.5, "attack", "normal")
# Just in case, also write out each model's predictions from before they were combined
# xgb_raw_data = pd.DataFrame({'id': test['id'], 'pred': xgb_pred})
# xgb_raw_data.to_csv('submission_xdg_raw.csv', index=False)
# xgb_data = pd.DataFrame({'id': test['id'], 'pred': np.where(xgb_pred > .5, "attack", "normal")})
# xgb_data.to_csv('submission_xdg.csv', index=False)
#
#
# lgbm_raw_data = pd.DataFrame({'id': test['id'], 'pred': lgbm_pred})
# lgbm_raw_data.to_csv('submission_lgbm_raw.csv', index=False)
# lgbm_data = pd.DataFrame({'id': test['id'], 'pred': np.where(lgbm_pred > .5, "attack", "normal")})
# lgbm_data.to_csv('submission_lgbm.csv', index=False)
#
# nn_raw_data = pd.DataFrame({'id': test['id'], 'pred': nn_pred})
# nn_raw_data.to_csv('submission_nn_raw.csv', index=False)
# nn_data = pd.DataFrame({'id': test['id'], 'pred': np.where(nn_pred > .5, "attack", "normal")})
# nn_data.to_csv('submission_nn.csv', index=False)
submission = pd.DataFrame({'id': test['id'], 'pred': pred_label})
submission.to_csv('submission.csv', index=False)
print("Final prediction complete.")