# ashtom84/XGBoost

## XGBoost
import numpy as np
import pandas as pd
import random
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error
import sklearn.cross_validation as cv
from sklearn.cross_validation import KFold, train_test_split
from sklearn import preprocessing
import sklearn.metrics
from sklearn.grid_search import GridSearchCV
import time
from sklearn import ensemble
# Single MinMaxScaler instance shared by the whole script: it is fitted on the
# training features and later fitted again, independently, on the test features.
min_max_scaler = preprocessing.MinMaxScaler()


# In[88]:

#####################
# Read train and test
#####################
# Load the train and test CSV files and use the "ID" column as the row index.
train = pd.read_csv("train.csv", sep=',', encoding='utf-8').set_index("ID")
test = pd.read_csv("test.csv", sep=',', encoding='utf-8').set_index("ID")


# In[89]:

#############################
# Remove the similar features
#############################
# Feature matrix: the first 369 columns of `train`.  Work on a copy so the
# column drop below never writes through to `train` (and never triggers
# pandas' chained-assignment warning on a slice).
Full_train = train.iloc[:, :369].copy()

# ind_same maps the position of the first column of every group of identical
# columns to the positions of its later duplicates.
# The previous scan compared column i against a frame with column i removed,
# starting at offset i + 1, which silently skipped the neighbour i + 1; every
# later column is now compared, so adjacent duplicates are caught too.
ind_same = {}
already_duplicate = set()
column_values = [Full_train.iloc[:, pos].values for pos in range(Full_train.shape[1])]
for i, reference in enumerate(column_values):
    if i in already_duplicate:
        # Column i repeats an earlier column; its group is already recorded.
        continue
    for j in range(i + 1, len(column_values)):
        if j in already_duplicate:
            continue
        # Element-wise equality, so columns holding NaN never match
        # (NaN != NaN), exactly as with the Series comparison used before.
        if np.array_equal(reference, column_values[j]):
            ind_same.setdefault(i, []).append(j)
            already_duplicate.add(j)

print("Full_train shape before dropping similar features: " + str(Full_train.shape))
# Flatten the duplicate groups into one list of column positions to drop.
ind_drop = [pos for duplicates in ind_same.values() for pos in duplicates]
Full_train = Full_train.drop(Full_train.columns[ind_drop], axis=1)
print("Full_train shape after: " + str(Full_train.shape))


########################################################
##USING RANDOM FOREST TO WEED OUT ZERO VARIANCE FEATURES
########################################################
### Full train
#randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight = 'balanced', \
#                                               max_depth =10, oob_score=True)
#randomForest.fit(X_train, y_train)

### Imbalanced train
# NOTE(review): X_train / y_train are only assigned by the train/test split
# further down this script, so this section cannot run before that split has
# been executed at least once -- confirm the intended execution order.
randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight=None,
                                               max_depth=10, oob_score=True)
randomForest.fit(X_train, y_train)

### Balanced train
#randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight = None, \
#                                               max_depth =10, oob_score=True)
#randomForest.fit(X_train_bal, y_train_bal)

importances = randomForest.feature_importances_
print(str(np.sum(importances == 0)) + " useless features")

# Column positions, split by whether the forest gave the feature a non-zero
# importance.  Positions are read straight from the importance vector; the
# earlier version pushed the column labels through a 10-byte string field and
# int(), which only worked for short, purely numeric labels.
imp = np.flatnonzero(importances != 0)
useless = [int(pos) for pos in np.flatnonzero(importances == 0)]

# Keep only the informative columns, by position, in both feature tables.
# NOTE(review): Full_test is likewise only created further down, and nothing
# visible here applies the duplicate-column removal to it -- verify that
# position k means the same feature in Full_test as in Full_train.
Full_train = Full_train.iloc[:, imp]
Full_test = Full_test.iloc[:, imp]

print("Retaining {} features".format(len(Full_train.columns)))


# In[90]:

################################
## Outliers
################################
## adding the column TARGET to investigate the rows in Full_train
# Side-by-side view of the retained features and the label, aligned on the
# row index.
temp = pd.concat([Full_train, train['TARGET']], axis=1)
target = temp['TARGET']

# For every feature, look at the rows sitting exactly at the column maximum
# (resp. minimum): record the mean of TARGET over those rows and the row
# labels themselves.
prop_1_max, prop_1_min, row_ind_max, row_ind_min, feature = [], [], [], [], []
for name in Full_train.columns:
    column = temp[name]
    # Each extreme is computed once with the vectorised reduction; the
    # builtin max()/min() used before walked the column in Python four times
    # per feature and gives order-dependent results when NaN is present.
    at_max = (column == column.max()).values
    at_min = (column == column.min()).values
    prop_1_max.append(target[at_max].mean())
    row_ind_max.append(Full_train.index[at_max])
    prop_1_min.append(target[at_min].mean())
    row_ind_min.append(Full_train.index[at_min])
    feature.append(name)

# One row per feature: mean TARGET at the max / min and the matching row labels.
prop_1 = pd.DataFrame({'Max': prop_1_max, 'Min': prop_1_min, 'row_ind_max': row_ind_max, 'row_ind_min': row_ind_min, 'feature': feature})
print("Number of features for which only 1s hit the max: " + str(len(prop_1[prop_1.Max == 1])))
print("Number of features for which only 0s hit the max: " + str(len(prop_1[prop_1.Max == 0])))
print("Number of features for which at least one 1 hit the min: " + str(len(prop_1[prop_1.Min != 0])))
print("Full_train shape: " + str(Full_train.shape))


# In[91]:

################################
## Max Outliers
################################
## when the TARGET HAS to be 0 and the outlier is isolated
## Isolated maxima: features whose maximum is reached by exactly one row.
row_out = [len(rows) == 1 for rows in prop_1.row_ind_max]
single_max = np.array(row_out)

## Case 1: the single row at the maximum has TARGET 0 (mean of TARGET == 0).
isol_out0 = prop_1[(prop_1.Max == 0) & single_max]
feat0 = isol_out0.feature
row_feat_out0 = [rows[0] for rows in isol_out0.row_ind_max]

# Indicator column: 1 on every row that carries such an outlier, else 0.
new_col_0_out = pd.Series(0.0, index=Full_train.index)
for row in row_feat_out0:
    if row not in new_col_0_out.index:
        print("Nope")
    else:
        new_col_0_out[row] = 1
print(len(new_col_0_out))

# Overwrite each isolated maximum itself with zero.
for row, col in zip(row_feat_out0, feat0):
    Full_train.loc[row, col] = 0

Full_train = pd.concat([Full_train, pd.DataFrame({'Out_1': new_col_0_out})], axis=1)

## Case 2: the single row at the maximum has TARGET 1 (mean of TARGET == 1).
isol_out1 = prop_1[(prop_1.Max == 1) & single_max]
feat1 = isol_out1.feature
row_feat_out1 = [rows[0] for rows in isol_out1.row_ind_max]

# Second indicator column, built the same way as the first one.
new_col_1_out = pd.Series(0.0, index=Full_train.index)
for row in row_feat_out1:
    if row not in new_col_1_out.index:
        print("Nope")
    else:
        new_col_1_out[row] = 1

# Overwrite each isolated maximum itself with zero.
for row, col in zip(row_feat_out1, feat1):
    Full_train.loc[row, col] = 0

Full_train = pd.concat([Full_train, pd.DataFrame({'Out_2': new_col_1_out})], axis=1)

Full_train.describe()


# In[92]:

##################################
# Normalize train (with no target)
##################################
# Rescale the training features with the shared scaler; Full_train is
# whatever fit_transform returns from here on.
Full_train = min_max_scaler.fit_transform(Full_train)
# Labels: the column at position 369 of `train`.
train_target = train.iloc[:, 369]

# Rescale the test features and restore the original row index.
# NOTE(review): this re-fits the shared scaler on the test rows, so test
# values are scaled by the test set's own min/max rather than the training
# ones -- confirm that is intended.
Full_test = pd.DataFrame(min_max_scaler.fit_transform(test), index=test.index)

######### Full train ##################
# Stratified hold-out split: 75% for fitting, 25% for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    Full_train, train_target,
    test_size=0.25, random_state=0, stratify=train_target)


def evaluate_model(clf):
    """Fit ``clf`` on the training split and print its ROC AUC scores.

    The classifier is fitted on the module-level ``X_train`` / ``y_train``.
    The area under the ROC curve is then printed for the training split and
    for the held-out ``X_test`` / ``y_test`` split, followed by the
    wall-clock seconds spent on fitting and scoring together.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``; column 1 of the
        predicted probabilities is passed to the scorer.

    Returns
    -------
    None.  Results are only printed.
    """
    start = time.time()
    clf.fit(X_train, y_train)
    # Single-argument print() calls give the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Train score: %s" % sklearn.metrics.roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1]))
    print("Test score: %s" % sklearn.metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print("Total time: %s" % (time.time() - start))


# Report the shapes of the fitting and evaluation splits.
print("Full Training: {}{}".format(X_train.shape, y_train.shape))
print("Full Test: {}{}".format(X_test.shape, y_test.shape))


# In[96]:

################################
## Auto Grid search
################################

# Hyper-parameter grid handed to GridSearchCV: each value lists the
# candidates to try for that parameter.
param = {
    'max_depth': range(9, 11),
    'n_estimators': [50],
    'min_child_weight': [1],
    'colsample_bytree': [0.7],
    'base_score': [0.29, 0.3],
    # NOTE(review): np.arange with a fractional step may or may not include
    # a value close to the stop (0.11) because of floating-point rounding;
    # the count printed below shows what was actually generated.
    'learning_rate': np.arange(0.09, 0.11, 0.01),
    'scale_pos_weight': [1],
    'nthread': [1],
    'reg_lambda': [1],
    'reg_alpha': [4],
    'max_delta_step': np.arange(0, 3, 1),
}

# Size of the full grid = product of the number of candidates per parameter.
# np.prod replaces np.product, a deprecated alias that NumPy 2.0 removed.
print(str(np.prod([len(param[x]) for x in param])) + " models to test")

xgb_mod = XGBClassifier(objective='binary:logistic')
# 3-fold cross-validated search scored by ROC AUC.
clf = GridSearchCV(xgb_mod, param, verbose=2, scoring='roc_auc', cv=3)
clf.fit(X_train, y_train)

print(clf.best_score_)
print(clf.best_params_)
	import numpy as np
	import pandas as pd
	import random
	from xgboost import XGBClassifier
	from sklearn.metrics import confusion_matrix, mean_squared_error
	import sklearn.cross_validation as cv
	from sklearn.cross_validation import KFold, train_test_split
	from sklearn import preprocessing
	import sklearn.metrics
	from sklearn.grid_search import GridSearchCV
	import time
	from sklearn import ensemble
	# Single MinMaxScaler instance shared by the whole script: it is fitted on the
	# training features and later fitted again, independently, on the test features.
	min_max_scaler = preprocessing.MinMaxScaler()


	# In[88]:

	#####################
	# Read train and test
	#####################
	# Load the train and test CSV files and use the "ID" column as the row index.
	train = pd.read_csv("train.csv", sep=',', encoding='utf-8').set_index("ID")
	test = pd.read_csv("test.csv", sep=',', encoding='utf-8').set_index("ID")


	# In[89]:

	#############################
	# Remove the similar features
	#############################
	# Feature matrix: the first 369 columns of `train`.  Work on a copy so the
	# column drop below never writes through to `train` (and never triggers
	# pandas' chained-assignment warning on a slice).
	Full_train = train.iloc[:, :369].copy()

	# ind_same maps the position of the first column of every group of identical
	# columns to the positions of its later duplicates.
	# The previous scan compared column i against a frame with column i removed,
	# starting at offset i + 1, which silently skipped the neighbour i + 1; every
	# later column is now compared, so adjacent duplicates are caught too.
	ind_same = {}
	already_duplicate = set()
	column_values = [Full_train.iloc[:, pos].values for pos in range(Full_train.shape[1])]
	for i, reference in enumerate(column_values):
	    if i in already_duplicate:
	        # Column i repeats an earlier column; its group is already recorded.
	        continue
	    for j in range(i + 1, len(column_values)):
	        if j in already_duplicate:
	            continue
	        # Element-wise equality, so columns holding NaN never match
	        # (NaN != NaN), exactly as with the Series comparison used before.
	        if np.array_equal(reference, column_values[j]):
	            ind_same.setdefault(i, []).append(j)
	            already_duplicate.add(j)

	print("Full_train shape before dropping similar features: " + str(Full_train.shape))
	# Flatten the duplicate groups into one list of column positions to drop.
	ind_drop = [pos for duplicates in ind_same.values() for pos in duplicates]
	Full_train = Full_train.drop(Full_train.columns[ind_drop], axis=1)
	print("Full_train shape after: " + str(Full_train.shape))


	########################################################
	##USING RANDOM FOREST TO WEED OUT ZERO VARIANCE FEATURES
	########################################################
	### Full train
	#randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight = 'balanced', \
	# max_depth =10, oob_score=True)
	#randomForest.fit(X_train, y_train)

	### Imbalanced train
	# NOTE(review): X_train / y_train are only assigned by the train/test split
	# further down this script, so this section cannot run before that split has
	# been executed at least once -- confirm the intended execution order.
	randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight=None,
	                                               max_depth=10, oob_score=True)
	randomForest.fit(X_train, y_train)

	### Balanced train
	#randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight = None, \
	# max_depth =10, oob_score=True)
	#randomForest.fit(X_train_bal, y_train_bal)

	importances = randomForest.feature_importances_
	print(str(np.sum(importances == 0)) + " useless features")

	# Column positions, split by whether the forest gave the feature a non-zero
	# importance.  Positions are read straight from the importance vector; the
	# earlier version pushed the column labels through a 10-byte string field and
	# int(), which only worked for short, purely numeric labels.
	imp = np.flatnonzero(importances != 0)
	useless = [int(pos) for pos in np.flatnonzero(importances == 0)]

	# Keep only the informative columns, by position, in both feature tables.
	# NOTE(review): Full_test is likewise only created further down, and nothing
	# visible here applies the duplicate-column removal to it -- verify that
	# position k means the same feature in Full_test as in Full_train.
	Full_train = Full_train.iloc[:, imp]
	Full_test = Full_test.iloc[:, imp]

	print("Retaining {} features".format(len(Full_train.columns)))


	# In[90]:

	################################
	## Outliers
	################################
	## adding the column TARGET to investigate the rows in Full_train
	# Side-by-side view of the retained features and the label, aligned on the
	# row index.
	temp = pd.concat([Full_train, train['TARGET']], axis=1)
	target = temp['TARGET']

	# For every feature, look at the rows sitting exactly at the column maximum
	# (resp. minimum): record the mean of TARGET over those rows and the row
	# labels themselves.
	prop_1_max, prop_1_min, row_ind_max, row_ind_min, feature = [], [], [], [], []
	for name in Full_train.columns:
	    column = temp[name]
	    # Each extreme is computed once with the vectorised reduction; the
	    # builtin max()/min() used before walked the column in Python four times
	    # per feature and gives order-dependent results when NaN is present.
	    at_max = (column == column.max()).values
	    at_min = (column == column.min()).values
	    prop_1_max.append(target[at_max].mean())
	    row_ind_max.append(Full_train.index[at_max])
	    prop_1_min.append(target[at_min].mean())
	    row_ind_min.append(Full_train.index[at_min])
	    feature.append(name)

	# One row per feature: mean TARGET at the max / min and the matching row labels.
	prop_1 = pd.DataFrame({'Max': prop_1_max, 'Min': prop_1_min, 'row_ind_max': row_ind_max, 'row_ind_min': row_ind_min, 'feature': feature})
	print("Number of features for which only 1s hit the max: " + str(len(prop_1[prop_1.Max == 1])))
	print("Number of features for which only 0s hit the max: " + str(len(prop_1[prop_1.Max == 0])))
	print("Number of features for which at least one 1 hit the min: " + str(len(prop_1[prop_1.Min != 0])))
	print("Full_train shape: " + str(Full_train.shape))


	# In[91]:

	################################
	## Max Outliers
	################################
	## when the TARGET HAS to be 0 and the outlier is isolated
	## Isolated maxima: features whose maximum is reached by exactly one row.
	row_out = [len(rows) == 1 for rows in prop_1.row_ind_max]
	single_max = np.array(row_out)

	## Case 1: the single row at the maximum has TARGET 0 (mean of TARGET == 0).
	isol_out0 = prop_1[(prop_1.Max == 0) & single_max]
	feat0 = isol_out0.feature
	row_feat_out0 = [rows[0] for rows in isol_out0.row_ind_max]

	# Indicator column: 1 on every row that carries such an outlier, else 0.
	new_col_0_out = pd.Series(0.0, index=Full_train.index)
	for row in row_feat_out0:
	    if row not in new_col_0_out.index:
	        print("Nope")
	    else:
	        new_col_0_out[row] = 1
	print(len(new_col_0_out))

	# Overwrite each isolated maximum itself with zero.
	for row, col in zip(row_feat_out0, feat0):
	    Full_train.loc[row, col] = 0

	Full_train = pd.concat([Full_train, pd.DataFrame({'Out_1': new_col_0_out})], axis=1)

	## Case 2: the single row at the maximum has TARGET 1 (mean of TARGET == 1).
	isol_out1 = prop_1[(prop_1.Max == 1) & single_max]
	feat1 = isol_out1.feature
	row_feat_out1 = [rows[0] for rows in isol_out1.row_ind_max]

	# Second indicator column, built the same way as the first one.
	new_col_1_out = pd.Series(0.0, index=Full_train.index)
	for row in row_feat_out1:
	    if row not in new_col_1_out.index:
	        print("Nope")
	    else:
	        new_col_1_out[row] = 1

	# Overwrite each isolated maximum itself with zero.
	for row, col in zip(row_feat_out1, feat1):
	    Full_train.loc[row, col] = 0

	Full_train = pd.concat([Full_train, pd.DataFrame({'Out_2': new_col_1_out})], axis=1)

	Full_train.describe()


	# In[92]:

	##################################
	# Normalize train (with no target)
	##################################
	# Rescale the training features with the shared scaler; Full_train is
	# whatever fit_transform returns from here on.
	Full_train = min_max_scaler.fit_transform(Full_train)
	# Labels: the column at position 369 of `train`.
	train_target = train.iloc[:, 369]

	# Rescale the test features and restore the original row index.
	# NOTE(review): this re-fits the shared scaler on the test rows, so test
	# values are scaled by the test set's own min/max rather than the training
	# ones -- confirm that is intended.
	Full_test = pd.DataFrame(min_max_scaler.fit_transform(test), index=test.index)

	######### Full train ##################
	# Stratified hold-out split: 75% for fitting, 25% for evaluation.
	X_train, X_test, y_train, y_test = train_test_split(
	    Full_train, train_target,
	    test_size=0.25, random_state=0, stratify=train_target)


	def evaluate_model(clf):
	    """Fit ``clf`` on the training split and print its ROC AUC scores.

	    The classifier is fitted on the module-level ``X_train`` / ``y_train``.
	    The area under the ROC curve is then printed for the training split and
	    for the held-out ``X_test`` / ``y_test`` split, followed by the
	    wall-clock seconds spent on fitting and scoring together.

	    Parameters
	    ----------
	    clf : estimator exposing ``fit`` and ``predict_proba``; column 1 of the
	        predicted probabilities is passed to the scorer.

	    Returns
	    -------
	    None.  Results are only printed.
	    """
	    start = time.time()
	    clf.fit(X_train, y_train)
	    # Single-argument print() calls give the same text under the Python 2
	    # print statement and the Python 3 print function.
	    print("Train score: %s" % sklearn.metrics.roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1]))
	    print("Test score: %s" % sklearn.metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
	    print("Total time: %s" % (time.time() - start))



	# Report the shapes of the fitting and evaluation splits.
	print("Full Training: {}{}".format(X_train.shape, y_train.shape))
	print("Full Test: {}{}".format(X_test.shape, y_test.shape))


	# In[96]:

	################################
	## Auto Grid search
	################################

	# Hyper-parameter grid handed to GridSearchCV: each value lists the
	# candidates to try for that parameter.
	param = {
	    'max_depth': range(9, 11),
	    'n_estimators': [50],
	    'min_child_weight': [1],
	    'colsample_bytree': [0.7],
	    'base_score': [0.29, 0.3],
	    # NOTE(review): np.arange with a fractional step may or may not include
	    # a value close to the stop (0.11) because of floating-point rounding;
	    # the count printed below shows what was actually generated.
	    'learning_rate': np.arange(0.09, 0.11, 0.01),
	    'scale_pos_weight': [1],
	    'nthread': [1],
	    'reg_lambda': [1],
	    'reg_alpha': [4],
	    'max_delta_step': np.arange(0, 3, 1),
	}

	# Size of the full grid = product of the number of candidates per parameter.
	# np.prod replaces np.product, a deprecated alias that NumPy 2.0 removed.
	print(str(np.prod([len(param[x]) for x in param])) + " models to test")

	xgb_mod = XGBClassifier(objective='binary:logistic')
	# 3-fold cross-validated search scored by ROC AUC.
	clf = GridSearchCV(xgb_mod, param, verbose=2, scoring='roc_auc', cv=3)
	clf.fit(X_train, y_train)

	print(clf.best_score_)
	print(clf.best_params_)