# weizhou2273/feature_selection.py

## feature_selection.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# `%matplotlib inline` is IPython/Jupyter-only syntax and a SyntaxError in a
# plain .py file, so request the inline backend through the IPython API and
# skip it when the script runs under a regular Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not running under IPython: keep matplotlib's default backend
from sklearn import model_selection, preprocessing
import xgboost as xgb
import datetime

# Load the raw train/test tables from disk.
train = pd.read_csv('./data/raw/train.csv')
test = pd.read_csv('./data/raw/test.csv')

# The regression target is `price_doc` divided by `full_sq`, not the raw price.
# NOTE(review): a zero or missing `full_sq` would yield an inf/NaN target --
# confirm the raw data cannot contain such rows.
y_train = train["price_doc"].div(train["full_sq"])

# Feature frames: everything except the row id (and, for train, the target).
x_train = train.drop(["id", "price_doc"], axis=1)
x_test = test.drop(["id"], axis=1)

# Train rows are stacked on top of test rows, so the first `num_train` rows
# (by position) of `df_all` are the training rows.
num_train = train.shape[0]
df_all = pd.concat([x_train, x_test], axis=0)

# XGBoost matrices over the full feature set.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Booster hyper-parameters shared by the cross-validation runs below.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Cross-validate on every training feature with early stopping; the length of
# the returned history is used as the number of boosting rounds for the fit.
cv_output = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
)
num_boost_rounds = cv_output.shape[0]

# Fit on all training rows for that many rounds, with `silent` switched to 0.
# This model is the source of the feature-importance ranking.
full_fit_params = xgb_params.copy()
full_fit_params['silent'] = 0
model = xgb.train(full_fit_params, dtrain, num_boost_round=num_boost_rounds)

# Rank features by the booster's f-score: the higher the score, the more
# important the feature.
fscore_pairs = sorted(model.get_fscore().items())  # (feature, fscore) by name
importance = pd.DataFrame(fscore_pairs, columns=['feature', 'fscore'])
importance = importance.sort_values('fscore', ascending=False)
importance = importance.reset_index(drop=True)
# Feature names ordered from most to least important.
importance_features = importance['feature'].tolist()

###
### Iterative loop to select features
###
# For n = 1 .. len(importance_features), cross-validate a model that uses only
# the n most important features and record the final row of the CV history.
# The sweep starts at one feature (an empty feature matrix is never trained)
# and ends with the complete ranked feature list.
rmse_result_dict = {}  # n (number of top features) -> last row of cv_output
for n_top in range(1, len(importance_features) + 1):
    # Keep only the n_top highest-ranked columns of the stacked frame.
    df_all_subset = df_all.loc[:, importance_features[:n_top]]
    # The first num_train rows (by position) are the training rows.
    x_train_subset = df_all_subset.iloc[:num_train, :]
    dtrain_subset = xgb.DMatrix(x_train_subset, y_train)
    # Train model / cross validation.
    cv_output = xgb.cv(xgb_params, dtrain_subset,
                       num_boost_round=1000,
                       early_stopping_rounds=20,
                       verbose_eval=50, show_stdv=False)
    # Save the last row of the CV history for this feature count.
    rmse_result_dict[n_top] = cv_output.iloc[-1, :]


# Collect the CV test RMSE per feature count, in increasing order of n.
top_n_features = sorted(rmse_result_dict)
test_rmse_mean = [rmse_result_dict[n_top]['test-rmse-mean']
                  for n_top in top_n_features]

# Built from a dict of lists so the constructor never receives a bare `zip`
# iterator.
rmse = pd.DataFrame({'Top_n_features': top_n_features,
                     'test_rmse_mean': test_rmse_mean},
                    columns=['Top_n_features', 'test_rmse_mean'])

# Plot CV test RMSE against the number of top-ranked features used.
fig, ax = plt.subplots()
ax.plot(rmse.Top_n_features, rmse.test_rmse_mean)
ax.set(xlabel='Top n important features',
       ylabel='Test_rmse_mean',
       title='Choose n top features to get best CV-rmse')
plt.show()


### Greedy search: refine 95 features to 40 features
#
# Backward elimination over the 95 most important features, walking from the
# least to the most important one.  Each column is tentatively removed and the
# model is re-cross-validated; the column is dropped for good only when the CV
# test RMSE beats the 95-feature baseline, otherwise it stays in the set.

xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0
}

# CV test RMSE of the model that uses all 95 features (value carried over
# from the original note).
# NOTE(review): this number depends on the data and the CV folds -- re-measure
# it whenever either changes.
baseline_rmse = 40402.5

test_rmse_dict = {}   # candidate column -> CV test RMSE without that column
test_rmse_bag = []    # the same RMSE values in evaluation order

# Restrict the stacked frame to the 95 top-ranked features.
feature_95 = importance_features[:95]
# Split by position: `df_all` is train stacked on test, so its index labels
# repeat and a label-based `.loc[:num_train]` slice cannot separate the two.
x_train_subset = df_all.iloc[:num_train].loc[:, feature_95]
x_test_subset = df_all.iloc[num_train:].loc[:, feature_95]

for col_name in reversed(feature_95):
    if x_train_subset.shape[1] <= 1:
        # Never cross-validate an empty feature matrix.
        break
    # Tentatively remove the column; the change is committed only if it helps.
    x_train_candidate = x_train_subset.drop([col_name], axis=1)
    print('Drop column: {}'.format(col_name))
    dtrain_subset = xgb.DMatrix(x_train_candidate, y_train)
    cv_output = xgb.cv(xgb_params, dtrain_subset,
                       num_boost_round=1000,
                       early_stopping_rounds=20,
                       verbose_eval=50, show_stdv=False)
    # Last row of the CV history.
    test_rmse = cv_output['test-rmse-mean'].iloc[-1]
    print('{}: {}'.format(col_name, test_rmse))
    test_rmse_dict[col_name] = test_rmse
    test_rmse_bag.append(test_rmse)
    if test_rmse < baseline_rmse:
        print('We need to drop {} because the test_rmse improved'.format(col_name))
        x_train_subset = x_train_candidate
        x_test_subset = x_test_subset.drop([col_name], axis=1)
    else:
        # Leave x_train_subset / x_test_subset untouched: the column stays.
        print('We want to keep {}'.format(col_name))

# Matrices for the surviving feature set.
dtrain_subset = xgb.DMatrix(x_train_subset, y_train)
dtest_subset = xgb.DMatrix(x_test_subset)
	# NOTE: the script ends here.  A tab-indented verbatim copy of the code
	# above (imports through the greedy-search loop) used to follow.  It was
	# removed because it mixed tabs with the space indentation of the `else:`
	# branch right before it (TabError), had lost the indentation of every
	# loop and `if` body, and would only have repeated the same pipeline.