Skip to content

Instantly share code, notes, and snippets.

@smly
Last active September 19, 2019 08:41
Show Gist options
  • Save smly/2fb2b87e4ba2eea4b6f8023d11953973 to your computer and use it in GitHub Desktop.
Save smly/2fb2b87e4ba2eea4b6f8023d11953973 to your computer and use it in GitHub Desktop.
"""
Public LB: 0.50456
"""
from collections import Counter
import lightgbm as lgb
import ml_metrics
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
def parse_host_verifications(df):
    """Expand ``host_verifications`` into one count column per verification.

    Each cell is expected to hold the text of a Python literal: a list of
    verification names, or ``None``.  For every distinct name found, a column
    ``host_verifications_<name>`` is added to ``df`` holding the number of
    times that name appears in the row's list.

    Args:
        df: DataFrame with a ``host_verifications`` column. Modified in
            place; nothing is returned.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python
            literal.
    """
    # Imported locally so the function brings its own dependency.
    import ast

    raw_value_list = []
    for val in df['host_verifications'].tolist():
        # The text comes straight from a CSV file, so it is parsed with
        # ast.literal_eval (literals only) instead of eval, which would
        # execute arbitrary expressions embedded in the data.
        # Non-string cells (e.g. NaN for a missing value) cannot be parsed
        # and are treated as "no verifications".
        values = ast.literal_eval(val) if isinstance(val, str) else None
        if values is not None:
            raw_value_list.append(Counter(values))
        else:
            raw_value_list.append({})

    # Dense output so each feature can be copied into a DataFrame column.
    vectorizer = DictVectorizer(sparse=False)
    X = vectorizer.fit_transform(raw_value_list)
    for idx, col in enumerate(vectorizer.feature_names_):
        df[f'host_verifications_{col}'] = X[:, idx]
def load_data():
    """Read the train/test CSV files and build one combined feature frame.

    Returns:
        A tuple ``(df, df_trn, df_tst, y_train, cols, categorical_cols)``:

        * ``df`` - train rows followed by test rows, with the derived
          columns added and every feature column converted to numbers.
        * ``df_trn`` / ``df_tst`` - the frames exactly as read from disk.
        * ``y_train`` - the ``price`` values of the train rows.
        * ``cols`` - every column name except ``listing_id`` and ``price``.
        * ``categorical_cols`` - the subset of ``cols`` that was non-numeric
          and has been replaced by integer codes.
    """
    train_frame = pd.read_csv('./train.csv')
    test_frame = pd.read_csv('./test.csv')
    n_train = len(train_frame)

    # Train rows come first, so positional slices separate the two sets.
    df = pd.concat([train_frame, test_frame], sort=False)
    y_train = df.iloc[:n_train]['price'].values

    # Split "YYYY-MM-DD" once; a missing date falls back to 2020-01-01.
    date_fields = [
        text.split('-') for text in df['host_since'].fillna('2020-01-01')
    ]
    for position, part in enumerate(('year', 'month', 'day')):
        df[f'host_since_{part}'] = [
            int(fields[position]) for fields in date_fields
        ]

    # Adds the host_verifications_* columns to df in place.
    parse_host_verifications(df)

    # Baseline features: mean-impute numeric columns, integer-encode the rest.
    non_features = ('listing_id', 'price')
    feature_names = []
    categorical_names = []
    for name in df.columns:
        if name in non_features:
            continue
        column = df[name]
        if not pd.api.types.is_numeric_dtype(column):
            df[name] = column.factorize()[0]
            categorical_names.append(name)
        else:
            df[name] = column.fillna(column.mean())
        feature_names.append(name)

    return (df, train_frame, test_frame, y_train,
            feature_names, categorical_names)
def cv(params, fit_params):
    """Estimate the model's RMSLE with 3-fold cross-validation.

    For each fold a LightGBM model is trained on ``log1p(price)`` with the
    held-out fold as the validation set, the predictions are mapped back
    with ``expm1`` and scored with RMSLE.  The per-fold scores and their
    average are printed.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``.
        fit_params: Extra keyword arguments for ``lgb.train``.  The dict is
            not modified; ``valid_sets`` is supplied per fold.

    Returns:
        The mean RMSLE over the folds.
    """
    df, _, _, y_train, cols, categorical_cols = load_data()
    X_train = df.iloc[:y_train.shape[0]][cols].values

    val_score_list = []
    kf = KFold(n_splits=3, random_state=11, shuffle=True)
    for idx_valtrn, idx_valtst in kf.split(X_train):
        X_valtrn, X_valtst = X_train[idx_valtrn], X_train[idx_valtst]
        y_valtrn, y_valtst = y_train[idx_valtrn], y_train[idx_valtst]

        lgb_valtrn = lgb.Dataset(X_valtrn, np.log1p(y_valtrn),
                                 feature_name=cols,
                                 categorical_feature=categorical_cols)
        lgb_eval = lgb.Dataset(X_valtst, np.log1p(y_valtst),
                               reference=lgb_valtrn,
                               feature_name=cols,
                               categorical_feature=categorical_cols)

        # Build a per-fold copy instead of writing 'valid_sets' into the
        # caller's dict, so the argument is left exactly as it was passed.
        train_kwargs = {**fit_params, 'valid_sets': lgb_eval}
        clf = lgb.train(params, lgb_valtrn, **train_kwargs)

        # The model predicts log1p(price); undo the transform before scoring.
        y_pred = np.expm1(clf.predict(X_valtst,
                                      num_iteration=clf.best_iteration))
        val_score = ml_metrics.rmsle(y_pred, y_valtst)
        print(f'RMSLE: {val_score:.6f}')
        val_score_list.append(val_score)

    avg_val_score = np.mean(val_score_list)
    print(f'Avg-RMSLE: {avg_val_score:.6f}')
    return avg_val_score
def main(params, fit_params):
    """Train the final model and write the submission file.

    Ten percent of the training rows are held out as the validation set for
    early stopping; the RMSLE on that hold-out is printed.  Predictions for
    the test rows are written to ``./baseline_v2.csv`` with the columns
    ``listing_id`` and ``price``.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``.
        fit_params: Extra keyword arguments for ``lgb.train``.  The dict is
            not modified; ``valid_sets`` is supplied internally.
    """
    # Use the feature lists exactly as load_data() produced them.  Rebuilding
    # them here from the already-encoded frame would see only numeric columns
    # and yield an empty categorical list, so the final model would silently
    # differ from the one scored in cv().
    df, _, df_tst, y_train, cols, categorical_cols = load_data()
    original_train_size = y_train.shape[0]

    X_train = df.iloc[:original_train_size][cols].values
    X_test = df.iloc[original_train_size:][cols].values

    # Create a validation split for early stopping.
    X_valtrn, X_valtst, y_valtrn, y_valtst = train_test_split(
        X_train, y_train, test_size=0.1, random_state=11)

    lgb_valtrn = lgb.Dataset(X_valtrn, np.log1p(y_valtrn),
                             feature_name=cols,
                             categorical_feature=categorical_cols)
    lgb_eval = lgb.Dataset(X_valtst, np.log1p(y_valtst),
                           reference=lgb_valtrn,
                           feature_name=cols,
                           categorical_feature=categorical_cols)

    # Copy rather than mutate the caller's dict.
    train_kwargs = {**fit_params, 'valid_sets': lgb_eval}
    clf = lgb.train(params, lgb_valtrn, **train_kwargs)

    # The model predicts log1p(price); undo the transform before use.
    y_pred = np.expm1(clf.predict(X_valtst, num_iteration=clf.best_iteration))
    print('RMSLE: {:.6f}'.format(ml_metrics.rmsle(y_pred, y_valtst)))

    y_pred = np.expm1(clf.predict(X_test, num_iteration=clf.best_iteration))
    df_tst.loc[:, 'price'] = y_pred
    df_tst[['listing_id', 'price']].to_csv('./baseline_v2.csv', index=False)
if __name__ == '__main__':
    # Keyword arguments forwarded to lgb.train(); see the learning-control
    # section of the LightGBM parameter reference:
    # https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst#learning-control-parameters
    fit_params = {'num_boost_round': 8, 'verbose_eval': 8, 'early_stopping_rounds': 3}

    # Core LightGBM parameters:
    # https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst#core-parameters
    # num_leaves, learning_rate, feature_fraction and bagging_fraction are
    # deliberately not set here, so the library defaults apply.
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': -1,
    }

    # Report the cross-validated score first, then fit and write predictions.
    cv(params, fit_params)
    main(params, fit_params)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment