Skip to content

Instantly share code, notes, and snippets.

@yumaueno
Created April 5, 2021 13:22
Show Gist options
  • Save yumaueno/d6d73ceeb02121d2551820eac7bf6dbf to your computer and use it in GitHub Desktop.
Save yumaueno/d6d73ceeb02121d2551820eac7bf6dbf to your computer and use it in GitHub Desktop.
Nishika 中古マンション価格予測 XGBoost
import glob
import pandas as pd
import numpy as np
import xgboost as xgb
import category_encoders as ce
from sklearn.model_selection import train_test_split
# Read every training CSV under train/ (first column becomes the index)
# and stack them into a single frame.
files = glob.glob("train/*.csv")
data_list = [pd.read_csv(path, index_col=0) for path in files]
df = pd.concat(data_list)
# Japanese era name -> Gregorian year immediately before the era's year 1,
# so that Gregorian year = offset + era year.
_ERA_OFFSETS = {"昭和": 1925, "平成": 1988, "令和": 2018}

# Gregorian year assumed for buildings labelled "戦前".
_PREWAR_YEAR = 1945

# Bucketed station-distance labels -> representative value in minutes.
_STATION_DISTANCE_BUCKETS = {
    "30分?60分": 45,
    "1H?1H30": 75,
    "2H?": 120,
    "1H30?2H": 105,
}

# Quarter suffix of a transaction period -> decimal fraction of the year.
_QUARTER_FRACTIONS = {
    "年第1四半期": ".25",
    "年第2四半期": ".50",
    "年第3四半期": ".75",
    "年第4四半期": ".99",
}

# Columns passed to the ordinal encoder.
_CATEGORICAL_COLUMNS = [
    "都道府県名", "地区名", "最寄駅:名称", "間取り", "建物の構造",
    "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等",
]


def _building_age(label, base_year):
    """Return the building age in years for an era-style label, or NaN."""
    if label == "戦前":
        return float(base_year - _PREWAR_YEAR)
    for era, offset in _ERA_OFFSETS.items():
        if era in label:
            era_year = label.split(era)[1].split("年")[0]
            # "元年" denotes the first year of an era.
            number = 1.0 if era_year == "元" else float(era_year)
            return base_year - offset - number
    # Unrecognised label: report it as missing instead of reusing the value
    # computed for a previously seen label.
    return float("nan")


def _quarter_to_float(label):
    """Return a label such as "2017年第1四半期" as a float (2017.25), or NaN."""
    for suffix, fraction in _QUARTER_FRACTIONS.items():
        if suffix in label:
            return float(label.replace(suffix, fraction))
    return float("nan")


def data_pre(df: pd.DataFrame, base_year: int = 2021) -> pd.DataFrame:
    """Clean the raw transaction frame and ordinal-encode its categoricals.

    Steps, in order:
      1. Drop columns that contain no non-null value.
      2. Drop the "市区町村名" and "種類" columns.
      3. Replace bucketed labels in "最寄駅:距離(分)" and "面積(㎡)" with
         numbers and cast both columns to float.
      4. Convert "建築年" era labels into a building age relative to
         ``base_year``.
      5. Convert "取引時点" quarter labels into a fractional year.
      6. Ordinal-encode the categorical columns.

    Args:
        df: Raw frame holding the columns named above.
        base_year: Gregorian year that building ages are measured from.
            The default reproduces the previously hard-coded constants.

    Returns:
        The transformed frame.

    Raises:
        ValueError: If an era label carries a year that is neither a number
            nor "元".

    Note:
        The encoder is fitted inside every call, so category codes are only
        consistent among rows processed in the same call.
    """
    empty_columns = [name for name in df.columns if df[name].count() == 0]
    df = df.drop(empty_columns, axis=1)
    df = df.drop("市区町村名", axis=1)
    df = df.drop("種類", axis=1)

    df["最寄駅:距離(分)"] = (
        df["最寄駅:距離(分)"].replace(_STATION_DISTANCE_BUCKETS).astype(float)
    )
    df["面積(㎡)"] = df["面積(㎡)"].replace("2000㎡以上", 2000).astype(float)

    # Compute each distinct label once, then map; missing values stay NaN.
    ages = {
        label: _building_age(label, base_year)
        for label in df["建築年"].dropna().unique()
    }
    df["建築年"] = df["建築年"].map(ages).astype(float)

    periods = {
        label: _quarter_to_float(label)
        for label in df["取引時点"].dropna().unique()
    }
    df["取引時点"] = df["取引時点"].map(periods).astype(float)

    ce_df = ce.OrdinalEncoder(cols=_CATEGORICAL_COLUMNS, handle_unknown='impute')
    df = ce_df.fit_transform(df)
    return df
col = "取引価格(総額)_log"

# data_pre fits its ordinal encoder (and decides which all-null columns to
# drop) on whatever frame it receives. Preprocess train and test in a single
# call so both end up with the same category codes and the same columns,
# then split them back apart by position.
df_test = pd.read_csv("test.csv", index_col=0)
test_index = df_test.index
n_train = len(df)
df_all = data_pre(pd.concat([df, df_test]))
df = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:].drop(col, axis=1)

# Hold out 20% of the training rows for validation / early stopping.
df_train, df_val = train_test_split(df, test_size=0.2)
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)
val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

train_data = xgb.DMatrix(train_x, label=train_y)
eval_data = xgb.DMatrix(val_x, label=val_y)
xgb_params = {
    "objective": "reg:squarederror",
    "eval_metric": "mae",
}
# The validation set is listed last so it is the one early stopping monitors.
evals = [(train_data, "train"), (eval_data, "eval")]
gbm = xgb.train(
    xgb_params,
    train_data,
    num_boost_round=100,
    early_stopping_rounds=10,
    evals=evals,
)

# Predictions on the held-out validation rows.
preds = gbm.predict(eval_data)

# Booster.predict expects a DMatrix, not a DataFrame.
predict = gbm.predict(xgb.DMatrix(df_test))
submission = pd.DataFrame({col: predict}, index=test_index)
submission.to_csv("submit_test.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment