import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import model_selection, preprocessing
import xgboost as xgb
import datetime
# read raw data
train = pd.read_csv('./data/raw/train.csv')
test = pd.read_csv('./data/raw/test.csv')
# prepare data: model price per square metre as the target
y_train = train["price_doc"] / train["full_sq"]
x_train = train.drop(["id", "price_doc"], axis=1)
x_test = test.drop(["id"], axis=1)
num_train = len(train)
df_all = pd.concat([x_train, x_test], ignore_index=True)
# Label-encode object columns so xgb.DMatrix accepts them
for c in df_all.columns[df_all.dtypes == 'object']:
    df_all[c] = preprocessing.LabelEncoder().fit_transform(df_all[c].astype(str))
x_train, x_test = df_all.iloc[:num_train], df_all.iloc[num_train:]
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
# XGBoost parameter settings
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
# Throw all training features into XGBoost to generate a feature importance ranking
cv_output = xgb.cv(xgb_params, dtrain,
                   num_boost_round=1000,
                   early_stopping_rounds=20,
                   verbose_eval=50, show_stdv=False)
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
# Feature importance ranking
importance = model.get_fscore()
importance = pd.DataFrame(sorted(importance.items()), columns=['feature', 'fscore'])
# Sort by fscore: the higher the fscore, the more important the feature
importance = importance.sort_values(by='fscore', ascending=False).reset_index(drop=True)
importance_features = list(importance['feature'])
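# Optional sanity check before the selection loop: visualise the top of the
# ranking. A minimal sketch; the cut at 20 features is an arbitrary choice here.
importance.head(20).plot(kind='barh', x='feature', y='fscore', legend=False)
plt.gca().invert_yaxis()
plt.title('Top 20 features by fscore')
plt.show()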
###
### Iterative loop to select features
###
rmse_result_dict = {}
for i in range(len(importance_features)):
    # create a subset of df_all with the top (i+1) features
    df_all_subset = df_all.loc[:, importance_features[:i + 1]]
    # prepare data
    x_train_subset = df_all_subset.iloc[:num_train, :]
    dtrain_subset = xgb.DMatrix(x_train_subset, y_train)
    # Train model / cross-validate
    cv_output = xgb.cv(xgb_params, dtrain_subset, num_boost_round=1000, early_stopping_rounds=20,
                       verbose_eval=50, show_stdv=False)
    # save the last CV round for this feature count
    rmse_result_dict[i] = cv_output.iloc[-1, :]
test_rmse_mean = []
top_n_features = []
for i in range(len(rmse_result_dict)):
    top_n_features.append(i + 1)
    test_rmse_mean.append(rmse_result_dict[i]['test-rmse-mean'])
rmse = pd.DataFrame(list(zip(top_n_features, test_rmse_mean)),
                    columns=['Top_n_features', 'test_rmse_mean'])
fig, ax = plt.subplots()
ax.plot(rmse.Top_n_features, rmse.test_rmse_mean)
ax.set(xlabel='Top n important features',
       ylabel='Test_rmse_mean',
       title='Choose n top features to get the best CV RMSE')
### Show the RMSE-vs-number-of-features curve
plt.show()
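# A small follow-up sketch: read the best feature count straight off the curve
# instead of eyeballing the plot. 'best_n' is a name introduced here.
best_n = int(rmse.loc[rmse.test_rmse_mean.idxmin(), 'Top_n_features'])
print('Best CV RMSE {:.1f} with the top {} features'.format(
    rmse.test_rmse_mean.min(), best_n))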
### Greedy search: refine 95 features to 40 features
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}
test_rmse_dict = {}
test_rmse_bag = []
# start from the top 95 features of df_all
feature_95 = importance_features[:95]
# prepare data
x_train_subset = df_all.iloc[:num_train][feature_95]
x_test_subset = df_all.iloc[num_train:][feature_95]
for col_name in list(reversed(feature_95)):
    # tentatively drop the least important remaining feature
    x_train_subset = x_train_subset.drop([col_name], axis=1)
    x_test_subset = x_test_subset.drop([col_name], axis=1)
    print('Drop column: {}'.format(col_name))
    dtrain_subset = xgb.DMatrix(x_train_subset, y_train)
    dtest_subset = xgb.DMatrix(x_test_subset)
    cv_output = xgb.cv(xgb_params, dtrain_subset,
                       num_boost_round=1000,
                       early_stopping_rounds=20,
                       verbose_eval=50, show_stdv=False)
    test_rmse = cv_output.loc[len(cv_output) - 1, 'test-rmse-mean']
    print('{}: {}'.format(col_name, test_rmse))
    test_rmse_dict[col_name] = test_rmse
    test_rmse_bag.append(test_rmse)
    # 40402.5 is the CV RMSE baseline obtained with all 95 features
    if test_rmse < 40402.5:
        print('We need to drop {} because the test_rmse improved'.format(col_name))
    else:
        print('We want to keep {}'.format(col_name))
        # restore the column so later iterations keep it in the feature set
        x_train_subset[col_name] = df_all.iloc[:num_train][col_name]
        x_test_subset[col_name] = df_all.iloc[num_train:][col_name]
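# A hedged final step, not in the original gist: assemble the surviving feature
# list from test_rmse_dict, retrain on it, and predict the test set.
# 'kept_features' and 'pred' are names introduced here; 40402.5 is the author's
# 95-feature baseline, and num_boost_rounds from the earlier full-feature CV is
# reused as an approximation.
kept_features = [c for c in feature_95 if test_rmse_dict[c] >= 40402.5]
dtrain_final = xgb.DMatrix(df_all.iloc[:num_train][kept_features], y_train)
dtest_final = xgb.DMatrix(df_all.iloc[num_train:][kept_features])
final_model = xgb.train(xgb_params, dtrain_final, num_boost_round=num_boost_rounds)
# the model predicts price per square metre; multiply by full_sq to recover price_doc
pred = final_model.predict(dtest_final) * test['full_sq'].values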