Created
April 5, 2021 13:22
-
-
Save yumaueno/d6d73ceeb02121d2551820eac7bf6dbf to your computer and use it in GitHub Desktop.
Nishika 中古マンション価格予測 XGBoost
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import pandas as pd | |
import numpy as np | |
import xgboost as xgb | |
import category_encoders as ce | |
from sklearn.model_selection import train_test_split | |
# Read every CSV under train/ (first column used as the index) and stack
# the resulting frames into one training DataFrame.
files = glob.glob("train/*.csv")
data_list = [pd.read_csv(csv_path, index_col=0) for csv_path in files]
df = pd.concat(data_list)
# Gregorian year immediately preceding year 1 of each era, so that
# gregorian_year = base + era_year (e.g. 平成1年 -> 1988 + 1 = 1989).
_ERA_BASE_YEARS = {
    "平成": 1988,
    "令和": 2018,
    "昭和": 1925,
}

# Year used for the "戦前" label; with reference_year=2021 this yields the
# age 76 that the original code hard-coded.
_PREWAR_YEAR = 1945


def _building_age(label, reference_year):
    """Return the building age in years for a Japanese-era year label.

    The age is ``reference_year`` minus the Gregorian year encoded by
    *label* (for example "平成10年").  "戦前" is treated as 1945.  Labels
    that match no known era, or whose year part is not a number, yield
    NaN instead of silently reusing another label's value.
    """
    if label == "戦前":
        return float(reference_year - _PREWAR_YEAR)
    for era, base_year in _ERA_BASE_YEARS.items():
        if era not in label:
            continue
        year_text = label.split(era)[1].split("年")[0]
        if year_text == "元":
            # "元年" denotes the first year of an era.
            era_year = 1.0
        else:
            try:
                era_year = float(year_text)
            except ValueError:
                return np.nan
        return reference_year - (base_year + era_year)
    return np.nan


def data_pre(df: pd.DataFrame, reference_year: int = 2021) -> pd.DataFrame:
    """Clean the raw transaction table and encode it numerically.

    Steps, in order:

    1. Drop columns that contain no non-null value, plus "市区町村名"
       and "種類".
    2. Convert "最寄駅:距離(分)" and "面積(㎡)" to float, replacing the
       bucket labels with fixed numbers.
    3. Convert "建築年" (era-year labels) to a building age in years
       relative to ``reference_year``.
    4. Convert "取引時点" ("<year>年第N四半期") to a float such as 2017.25.
    5. Ordinal-encode the remaining categorical columns.

    A new encoder is fitted on every call, so the integer codes produced
    by two separate calls are not comparable with each other.

    Args:
        df: Raw data.  It is not modified; a new frame is returned.
        reference_year: Gregorian year that building ages are measured
            from.  The default of 2021 reproduces the original offsets
            (33 - n for 平成, 3 - n for 令和, 96 - n for 昭和, 76 for 戦前).

    Returns:
        The preprocessed DataFrame.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If a distance, area or transaction-period value
            cannot be converted to float.
    """
    # Columns without a single non-null value carry no information.
    empty_cols = [name for name in df.columns if df[name].count() == 0]
    df = df.drop(empty_cols, axis=1)
    df = df.drop(["市区町村名", "種類"], axis=1)

    # Bucketed walking times are replaced by one representative number of
    # minutes each.
    # NOTE(review): the "?" in these keys presumably stands in for a range
    # separator lost in encoding -- confirm against the raw CSV values.
    distance_buckets = {
        "30分?60分": 45,
        "1H?1H30": 75,
        "2H?": 120,
        "1H30?2H": 105,
    }
    df["最寄駅:距離(分)"] = (
        df["最寄駅:距離(分)"].replace(distance_buckets).astype(float)
    )
    df["面積(㎡)"] = df["面積(㎡)"].replace("2000㎡以上", 2000).astype(float)

    # Build the label -> age table from the distinct non-null labels only,
    # then map; every non-null value is a key, nulls stay NaN.
    age_by_label = {
        label: _building_age(label, reference_year)
        for label in df["建築年"].dropna().unique()
    }
    df["建築年"] = df["建築年"].map(age_by_label).astype(float)

    # "2017年第1四半期" -> "2017.25" and so on, then parsed as float.
    quarter_suffixes = {
        "年第1四半期": ".25",
        "年第2四半期": ".50",
        "年第3四半期": ".75",
        "年第4四半期": ".99",
    }
    period_by_label = {}
    for label in df["取引時点"].dropna().unique():
        for suffix, fraction in quarter_suffixes.items():
            if suffix in label:
                period_by_label[label] = label.replace(suffix, fraction)
    df["取引時点"] = df["取引時点"].replace(period_by_label).astype(float)

    cols = [
        "都道府県名",
        "地区名",
        "最寄駅:名称",
        "間取り",
        "建物の構造",
        "用途",
        "今後の利用目的",
        "都市計画",
        "改装",
        "取引の事情等",
    ]
    # NOTE(review): 'impute' is kept exactly as written; confirm that the
    # installed category_encoders version still accepts this option.
    ce_df = ce.OrdinalEncoder(cols=cols, handle_unknown='impute')
    df = ce_df.fit_transform(df)
    return df
# Name of the regression target column.
col = "取引価格(総額)_log"

# Preprocess train and test in a single data_pre call.  data_pre fits a new
# ordinal encoder on every call, so encoding the two frames separately would
# give the same category different integer codes in train and test.
df_test_raw = pd.read_csv("test.csv", index_col=0)
n_train = len(df)
df_all = data_pre(pd.concat([df, df_test_raw]))

# data_pre only drops columns, so rows keep their order and the frames can
# be separated again by position.  Test rows have no target value; drop the
# (all-NaN for them) target column so it is not used as a feature.
df = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:].drop(col, axis=1)

# Hold out 20% of the training rows for validation / early stopping.
df_train, df_val = train_test_split(df, test_size=0.2)
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)
val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

train_data = xgb.DMatrix(train_x, label=train_y)
eval_data = xgb.DMatrix(val_x, label=val_y)

xgb_params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
}
evals = [(train_data, "train"), (eval_data, "eval")]
gbm = xgb.train(
    xgb_params,
    train_data,
    num_boost_round=100,
    early_stopping_rounds=10,
    evals=evals,
)

# Predictions on the hold-out split (the original referenced an undefined
# name here).
preds = gbm.predict(eval_data)

# A Booster predicts from a DMatrix, not from a DataFrame.
predict = gbm.predict(xgb.DMatrix(df_test))

# Write one prediction per test row, indexed exactly like test.csv.
submission = pd.DataFrame({col: predict}, index=df_test_raw.index)
submission.to_csv("submit_test.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment