@ZOI-dayo, created July 17, 2024
ML workshop competition solution (ml講習会コンペ解法)
import pandas as pd
xgb_best_params = {
    'objective': 'binary:logistic',
    'max_depth': 8,
    'n_estimators': 300,
    'learning_rate': 0.1,
    'eta': 0.01,
    'gamma': 0.12,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
}
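# Note: 'eta' is XGBoost's alias for 'learning_rate', so the two entries above target the same
# setting; if they disagree, which value wins may depend on the installed XGBoost version.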
lgbm_best_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 20,
    'n_estimators': 300,
    'learning_rate': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'verbosity': -1,
    'max_bin': 255,
    'feature_fraction': 1.0,
}
logistic_best_params = {
    'C': 0.1,
    'penalty': 'l2',
    'max_iter': 1000,
}
nn_best_params = {
    'activation': 'logistic',
    'hidden_layer_sizes': (64, 128),
    'max_iter': 10000,
    'early_stopping': True,
}
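# logistic_best_params is reused below by the stacking step (get_logistic_pred), while
# nn_best_params is only referenced by the commented-out MLP branch in get_merged_pred.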
rf_best_params = {
    'n_estimators': 400,
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'bootstrap': True,
}
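# The dicts above look like the result of a hyperparameter search. A minimal sketch of how
# values like these could be found (hypothetical, not part of the original run) with
# scikit-learn's GridSearchCV:
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier
# grid = GridSearchCV(
#     XGBClassifier(random_state=71),
#     param_grid={'max_depth': [4, 8, 12], 'learning_rate': [0.01, 0.1], 'subsample': [0.5, 1.0]},
#     scoring='neg_log_loss',
#     cv=5,
# )
# grid.fit(train_x, train_y)
# xgb_best_params = grid.best_params_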
# Define functions that take training data and hyperparameters and return a fitted model.
# They wrap off-the-shelf libraries (xgboost, lightgbm, scikit-learn),
# which performed better than anything hand-written. Impressive!
def get_xgboost_model(train_x, train_y, params):
    from xgboost import XGBClassifier
    model = XGBClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_lgbm_model(train_x, train_y, params):
    from lightgbm import LGBMClassifier
    model = LGBMClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_logistic_model(train_x, train_y, params):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_nn_model(train_x, train_y, params):
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model

def get_rf_model(train_x, train_y, params):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(random_state=71, **params)
    model.fit(train_x, train_y)
    return model
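# All five factories share the signature (train_x, train_y, params) -> fitted classifier,
# which is what lets kfold() below accept any of them as model_generator.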
# Evaluation function.
# Splits the data into 5 folds, trains on 4 of them and validates on the remaining 1,
# repeats this 5 times, and returns the scores averaged over the validation folds.
# logloss measures how far the predicted probabilities are from the truth; accuracy is the fraction correct.
def kfold(model_generator, x, y, params):
    scores_logloss = []
    scores_accuracy = []
    kf = KFold(n_splits=5, shuffle=True, random_state=71)
    for tr_idx, va_idx in kf.split(x):
        tr_x, va_x = x.iloc[tr_idx], x.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]
        model = model_generator(tr_x, tr_y, params)
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        accuracy = accuracy_score(va_y, va_pred > 0.5)
        scores_logloss.append(logloss)
        scores_accuracy.append(accuracy)
    logloss = np.mean(scores_logloss)
    accuracy = np.mean(scores_accuracy)
    return (logloss, accuracy)
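# kfold() is not actually called in this script; a typical (hypothetical) use, once train_x,
# train_y and the numpy/sklearn imports further down exist, would be:
# logloss, accuracy = kfold(get_xgboost_model, train_x, train_y, xgb_best_params)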
print("データの読み込みをしています...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print("データの読み込みが完了しました。")
print("データの前処理をしています...")
# ignore=['urgent']
# train = train.drop(ignore, axis=1)
# test = test.drop(ignore, axis=1)
# class is a string column, so map it to numbers.
train['class'] = train['class'].map({
    'normal': 0,
    'attack': 1,
})
# The remaining string columns need to be turned into numbers somehow.
# protocol_type has only a few categories (3), so add one indicator column per value
# ("1 if the row has that value, else 0").
# service and flag have many categories, so they simply get label-encoded to integers.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import warnings
warnings.simplefilter('ignore', FutureWarning)
for c in ['protocol_type', 'service', 'flag']:
    le = LabelEncoder()
    le.fit(pd.concat([train[c], test[c]]).fillna('NA'))
    train[c] = le.transform(train[c].fillna('NA'))
    test[c] = le.transform(test[c].fillna('NA'))
for c in ['protocol_type']:
    ohe = OneHotEncoder()
    ohe.fit(pd.DataFrame(pd.concat([train[c], test[c]])))
    train_enc = pd.DataFrame(ohe.transform(pd.DataFrame(train[c])).toarray(), columns=ohe.get_feature_names_out())
    train = pd.concat([train, train_enc], axis=1)
    test_enc = pd.DataFrame(ohe.transform(pd.DataFrame(test[c])).toarray(), columns=ohe.get_feature_names_out())
    test = pd.concat([test, test_enc], axis=1)
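# Since protocol_type was label-encoded first, the one-hot columns created here come out named
# protocol_type_0, protocol_type_1, ... (one per encoded value), and they are added alongside
# the original label-encoded protocol_type column.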
from math import isnan
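# Impute missing values in the columns below: train an XGBoost model on the rows where the
# column is known (using the other features, mean-filled), then fill the NaNs with its predictions.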
for col in ['root_shell', 'num_shells', 'num_access_files', 'num_failed_logins', 'wrong_fragment', 'urgent']:
    train_dropped = train.dropna(subset=[col]).drop([col, 'class'], axis=1)
    test_dropped = test.dropna(subset=[col]).drop([col], axis=1)
    train_target = train.dropna(subset=[col])[col]
    test_target = test.dropna(subset=[col])[col]
    model = get_xgboost_model(train_dropped.fillna(train_dropped.mean()), train_target, xgb_best_params)
    train_predict = model.predict(train.drop([col, 'class'], axis=1).fillna(train.drop([col], axis=1).mean()))
    test_predict = model.predict(test.drop([col], axis=1).fillna(test.drop([col], axis=1).mean()))
    # Fill only the missing entries with the model's predictions (aligned by row index).
    train[col] = train[col].fillna(pd.Series(train_predict, index=train.index))
    test[col] = test[col].fillna(pd.Series(test_predict, index=test.index))
import numpy as np
print("データの前処理が完了しました。")
print("特徴量の追加をしています...")
# Given a DataFrame, this function adds engineered features:
# bucketing values into ranks by range, and taking logs of heavy-tailed columns.
def add_data(data):
    data['count_rank'] = [0 if 65 < val < 80 or 150 < val < 170 or 240 < val < 255 else 1 for val in data['count']]
    data['count_one_rank'] = [min(val - (val // 85) * 85, val - (val // 85 + 1) * 85) for val in data['count']]
    data['count_two_rank'] = [min((val+30) - ((val+30) // 102) * 102, (val+30) - ((val+30) // 102 + 1) * 102)-30 for val in data['count']]
    data['serror_rate_rank'] = [0 if val < 0.01 else 1 if val < 0.79 else 2 for val in data['serror_rate']]
    data['srv_serror_rate_rank'] = [0 if val < 0.8 else 1 for val in data['srv_serror_rate']]
    data['same_srv_rate_rank'] = [0 if val < 0.2 else 1 if val < 0.8 else 2 for val in data['same_srv_rate']]
    data['dst_host_srv_count_rank'] = [0 if val < 30 else 1 if val < 220 else 2 for val in data['dst_host_srv_count']]
    data['src_bytes'] = data['src_bytes'].apply(lambda x: np.log1p(x))
    data['dst_bytes'] = data['dst_bytes'].apply(lambda x: np.log1p(x))
    data['duration'] = data['duration'].apply(lambda x: np.log1p(x))
    data['num_compromised'] = data['num_compromised'].apply(lambda x: np.log1p(x))
    data['srv_count'] = data['srv_count'].apply(lambda x: np.log1p(x))
    data['srv_rerror_rate_log'] = data['srv_rerror_rate'].apply(lambda x: np.log1p(x))
    data['diff_srv_rate'] = data['diff_srv_rate'].apply(lambda x: np.log1p(x))
    # Additions by なつめぐる from here on.
    data['root_shell_rank'] = np.where(data['root_shell'] < 1, 0.45, 0.2)
    data['num_shell_rank'] = np.where(data['num_shells'] < 2, 0.4, 0)
    data['num_access_files_rank'] = np.where(data['num_access_files'] < 1, 0.45, 0)
    data['num_failed_logins_rank'] = np.where(data['num_failed_logins'] < 1, 0.45, 0)
    data['wrong_fragment_rank'] = np.where(data['wrong_fragment'] < 1, 0.45, 1)
    data['urgent_rank'] = np.where(data['urgent'] < 2, 0.55, 0)
    return data
train = add_data(train)
test = add_data(test)
# import seaborn as sns
# import matplotlib.pyplot as plt
# sns.set()
# for col in train.drop(['protocol_type', 'service', 'flag'], axis=1).columns:
#     print(f"col: {col}")
#     sns.displot(data=train, x=col, hue='class', kind='kde', bw_adjust=0.01).savefig(f'analytics/{col}.png')
#     plt.close()
#     plt.clf()
# The id column obviously has nothing to do with the answer, so drop it.
# From train (the raw data), build train_x (id and class dropped) and train_y (class only).
train_x = train.drop(['id', 'class'], axis=1)
train_y = train['class']
# From test (the raw data), build test_x with id dropped.
test_x = test.copy().drop('id', axis=1)
train_x = train_x.astype(np.float32)
train_y = train_y.astype(np.float32)
test_x = test_x.astype(np.float32)
print("特徴量の追加が完了しました。")
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold
# Evaluation
print("Evaluating...")
# Stacking helper:
# given the xgboost/lightgbm/random-forest predictions for the training fold and the true labels,
# fit a logistic regression on them and return its predicted probabilities for the validation fold.
# This is used to combine the xgboost, lightgbm, and random forest outputs.
def get_logistic_pred(xgb_tr, lgbm_tr, rf_tr, y_tr, xgb_va, lgbm_va, rf_va):
    xgb_va = xgb_va.reshape(-1, 1)
    lgbm_va = lgbm_va.reshape(-1, 1)
    rf_va = rf_va.reshape(-1, 1)
    xgb_tr = xgb_tr.reshape(-1, 1)
    lgbm_tr = lgbm_tr.reshape(-1, 1)
    rf_tr = rf_tr.reshape(-1, 1)
    x_va = np.concatenate([xgb_va, lgbm_va, rf_va], axis=1)
    x_tr = np.concatenate([xgb_tr, lgbm_tr, rf_tr], axis=1)
    logistic_model = get_logistic_model(x_tr, y_tr, logistic_best_params)
    return logistic_model.predict_proba(x_va)[:, 1]
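# In other words, this is a simple stacking ensemble: each base model's predicted probability
# becomes one input feature and logistic regression acts as the meta-model that combines them.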
# Given training features, training labels, and the features to predict on, return predictions.
# analytics=True would also plot the prediction distributions (the plotting code is commented out).
def get_merged_pred(tr_x, tr_y, va_x, analytics=False):
    xgb_model = get_xgboost_model(tr_x, tr_y, xgb_best_params)
    xgb_tr_pred = xgb_model.predict_proba(tr_x)[:, 1]
    xgb_va_pred = xgb_model.predict_proba(va_x)[:, 1]
    lgbm_model = get_lgbm_model(tr_x, tr_y, lgbm_best_params)
    lgbm_tr_pred = lgbm_model.predict_proba(tr_x)[:, 1]
    lgbm_va_pred = lgbm_model.predict_proba(va_x)[:, 1]
    # nn_model = get_nn_model(tr_x.fillna(tr_x.mean()), tr_y, nn_best_params)
    # nn_tr_pred = nn_model.predict_proba(tr_x.fillna(tr_x.mean()))[:, 1]
    # nn_va_pred = nn_model.predict_proba(va_x.fillna(tr_x.mean()))[:, 1]
    # logistic_model = get_logistic_model(tr_x.fillna(tr_x.mean()), tr_y, logistic_best_params)
    # logistic_tr_pred = logistic_model.predict_proba(tr_x.fillna(tr_x.mean()))[:, 1]
    # logistic_va_pred = logistic_model.predict_proba(va_x.fillna(tr_x.mean()))[:, 1]
    rf_model = get_rf_model(tr_x.fillna(tr_x.mean()), tr_y, rf_best_params)
    rf_tr_pred = rf_model.predict_proba(tr_x.fillna(tr_x.mean()))[:, 1]
    rf_va_pred = rf_model.predict_proba(va_x.fillna(tr_x.mean()))[:, 1]
    # overall_va_pred = get_logistic_pred(xgb_tr_pred, lgbm_tr_pred, tr_y, xgb_va_pred, lgbm_va_pred)
    overall_va_pred = get_logistic_pred(xgb_tr_pred, lgbm_tr_pred, rf_tr_pred, tr_y, xgb_va_pred, lgbm_va_pred, rf_va_pred)
    # if analytics:
    #     plt.figure(figsize=(10, 10))
    #     plt.scatter(xgb_tr_pred, nn_tr_pred, c=tr_y, cmap='viridis', s=1, alpha=0.5)
    #     plt.colorbar()
    #     plt.xlabel('xgb')
    #     plt.ylabel('lgbm')
    #     plt.savefig('analytics/plot.png')
    #
    #     plt.figure(figsize=(10, 10))
    #     plt.scatter(xgb_va_pred, nn_va_pred, c=overall_va_pred, cmap='viridis', s=1, alpha=0.5)
    #     plt.colorbar()
    #     plt.xlabel('xgb')
    #     plt.ylabel('lgbm')
    #     plt.savefig('analytics/plot2.png')
    # import xgboost as xgb
    # import lightgbm as lgb
    # xgb.plot_importance(xgb_model).figure.savefig('analytics/xgb_importance.png')
    # lgb.plot_importance(lgbm_model).figure.savefig('analytics/lgbm_importance.png')
    # Alternative: average the probabilities to get the overall answer.
    # overall_va_pred = (xgb_va_pred + lgbm_va_pred) / 2
    return (overall_va_pred, xgb_va_pred, lgbm_va_pred, rf_va_pred)
eval_accuracy = []
eval_xgb_accuracy = []
eval_lgbm_accuracy = []
eval_nn_accuracy = []
eval_logistic_accuracy = []
eval_rf_accuracy = []
eval_logloss = []
eval_xgb_logloss = []
eval_lgbm_logloss = []
eval_nn_logloss = []
eval_logistic_logloss = []
eval_rf_logloss = []
# As a final check, measure the accuracy with cross-validation.
# Even a high value here could be overfitting, so it is not fully trustworthy...
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    (va_pred, va_xgb_pred, va_lgbm_pred, va_rf_pred) = get_merged_pred(tr_x, tr_y, va_x)
    # Collect the fold scores in lists and average them afterwards.
    eval_logloss.append(log_loss(va_y, va_pred))
    eval_xgb_logloss.append(log_loss(va_y, va_xgb_pred))
    eval_lgbm_logloss.append(log_loss(va_y, va_lgbm_pred))
    # eval_nn_logloss.append(log_loss(va_y, va_nn_pred))
    # eval_logistic_logloss.append(log_loss(va_y, va_logistic_pred))
    eval_rf_logloss.append(log_loss(va_y, va_rf_pred))
    eval_accuracy.append(accuracy_score(va_y, va_pred > 0.5))
    eval_xgb_accuracy.append(accuracy_score(va_y, va_xgb_pred > 0.5))
    eval_lgbm_accuracy.append(accuracy_score(va_y, va_lgbm_pred > 0.5))
    # eval_nn_accuracy.append(accuracy_score(va_y, va_nn_pred > 0.5))
    # eval_logistic_accuracy.append(accuracy_score(va_y, va_logistic_pred > 0.5))
    eval_rf_accuracy.append(accuracy_score(va_y, va_rf_pred > 0.5))
print("評価が完了しました。")
print(f'logloss: {np.mean(eval_logloss):.8f}, accuracy: {np.mean(eval_accuracy):.8f}')
print("各モデルの評価")
print(f'xgb logloss: {np.mean(eval_xgb_logloss):.8f}, accuracy: {np.mean(eval_xgb_accuracy):.8f}')
print(f'lgbm logloss: {np.mean(eval_lgbm_logloss):.8f}, accuracy: {np.mean(eval_lgbm_accuracy):.8f}')
# print(f'nn logloss: {np.mean(eval_nn_logloss):.8f}, accuracy: {np.mean(eval_nn_accuracy):.8f}')
# print(f'logistic logloss: {np.mean(eval_logistic_logloss):.8f}, accuracy: {np.mean(eval_logistic_accuracy):.8f}')
print(f'rf logloss: {np.mean(eval_rf_logloss):.8f}, accuracy: {np.mean(eval_rf_accuracy):.8f}')
# Final prediction:
# predict on the actual test data and build the submission csv.
print("Generating the final predictions...")
(pred, xgb_pred, lgbm_pred, rf_pred) = get_merged_pred(train_x, train_y, test_x, True)
pred_label = np.where(pred > 0.5, "attack", "normal")
# Just in case, also keep each model's predictions from before the merge (commented out below).
# xgb_raw_data = pd.DataFrame({'id': test['id'], 'pred': xgb_pred})
# xgb_raw_data.to_csv('submission_xdg_raw.csv', index=False)
# xgb_data = pd.DataFrame({'id': test['id'], 'pred': np.where(xgb_pred > .5, "attack", "normal")})
# xgb_data.to_csv('submission_xdg.csv', index=False)
#
#
# lgbm_raw_data = pd.DataFrame({'id': test['id'], 'pred': lgbm_pred})
# lgbm_raw_data.to_csv('submission_lgbm_raw.csv', index=False)
# lgbm_data = pd.DataFrame({'id': test['id'], 'pred': np.where(lgbm_pred > .5, "attack", "normal")})
# lgbm_data.to_csv('submission_lgbm.csv', index=False)
#
# nn_raw_data = pd.DataFrame({'id': test['id'], 'pred': nn_pred})
# nn_raw_data.to_csv('submission_nn_raw.csv', index=False)
# nn_data = pd.DataFrame({'id': test['id'], 'pred': np.where(nn_pred > .5, "attack", "normal")})
# nn_data.to_csv('submission_nn.csv', index=False)
submission = pd.DataFrame({'id': test['id'], 'pred': pred_label})
submission.to_csv('submission.csv', index=False)
print("実際の予測が完了しました。")