Skip to content

Instantly share code, notes, and snippets.

@ashtom84
Last active March 22, 2016 22:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ashtom84/743edd03affb029f3b40 to your computer and use it in GitHub Desktop.
Save ashtom84/743edd03affb029f3b40 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import random
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error
import sklearn.cross_validation as cv
from sklearn.cross_validation import KFold, train_test_split
from sklearn import preprocessing
import sklearn.metrics
from sklearn.grid_search import GridSearchCV
import time
from sklearn import ensemble
# Single scaler instance shared by the normalisation section further down.
min_max_scaler = preprocessing.MinMaxScaler()
# In[88]:
#####################
# Read train and test
#####################
# Both CSV files are parsed the same way and then indexed by their "ID" column.
train = pd.read_csv("train.csv", sep=',', encoding='utf-8').set_index("ID")
test = pd.read_csv("test.csv", sep=',', encoding='utf-8').set_index("ID")
# In[89]:
################################
# Remove the duplicated features
################################


def _find_duplicate_columns(frame):
    """Map each retained column position to the later positions that repeat it.

    Two columns are duplicates when every row holds equal values (element-wise
    equality, so a column containing NaN never matches anything). Every column
    is compared with *all* later columns, including its immediate neighbour.

    frame: pandas.DataFrame whose columns are compared pairwise.
    Returns a dict ``{kept_position: [duplicate_position, ...]}`` of positional
    (``iloc``) column indices; columns without duplicates have no entry.
    """
    # Extract each column once instead of copying the frame per comparison.
    columns = [frame.iloc[:, k].values for k in range(frame.shape[1])]
    duplicates = {}
    claimed = set()  # positions already recorded as a copy of an earlier column
    for i, reference in enumerate(columns):
        if i in claimed:
            continue
        for j in range(i + 1, len(columns)):
            if j not in claimed and np.array_equal(reference, columns[j]):
                duplicates.setdefault(i, []).append(j)
                claimed.add(j)
    return duplicates


# The first 369 columns are the features (column 369 is read as the target
# further down). Copy them so the in-place drop below acts on an independent
# frame rather than on a slice of `train`.
Full_train = train.iloc[:, :369].copy()
temp = Full_train  # alias kept from the original cell
ind_same = _find_duplicate_columns(Full_train)
print("Full_train shape before dropping similar features: " + str(Full_train.shape))
# Flatten the per-column duplicate lists into one list of positions to drop.
ind_drop = [pos for dups in ind_same.values() for pos in dups]
if ind_drop:
    Full_train.drop(Full_train.columns[ind_drop], inplace=True, axis=1)
print("Full_train shape after: " + str(Full_train.shape))
###########################################################
##USING RANDOM FOREST TO WEED OUT ZERO IMPORTANCE FEATURES
###########################################################
# NOTE(review): X_train, y_train and Full_test are only assigned further down
# this file (split / normalisation sections) -- presumably this section was run
# after those cells in the notebook; confirm the intended execution order.
### Full train
#randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight = 'balanced', \
# max_depth =10, oob_score=True)
#randomForest.fit(X_train, y_train)
### Imbalanced train
randomForest = ensemble.RandomForestClassifier(
    n_estimators=50,
    class_weight=None,
    max_depth=10,
    oob_score=True,
)
randomForest.fit(X_train, y_train)
### Balanced train
#randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight = None, \
# max_depth =10, oob_score=True)
#randomForest.fit(X_train_bal, y_train_bal)
importances = randomForest.feature_importances_
print(str(np.sum(importances == 0)) + " useless features")
# Pair every column label with its importance in a structured array so the
# pairs can be ordered by importance (highest first).
dtype = [('feature', 'S10'), ('importance', 'float')]
feature_imprtance = np.array(list(zip(Full_train.columns, importances)), dtype=dtype)
feature_sort = np.sort(feature_imprtance, order='importance')[::-1]
# Split the labels (cast to int, so they are used as column positions) into
# those carrying some importance and those carrying none.
imp, useless = [], []
for label, weight in feature_sort:
    if weight != 0:
        imp.append(int(label))
    else:
        useless.append(int(label))
imp = np.sort(imp)
# filtering out the useless features in Full_train and Full_test
Full_train = Full_train.iloc[:, imp]
Full_test = Full_test.iloc[:, imp]
print("Retaining {} features".format(Full_train.shape[1]))
# In[90]:
################################
## Outliers
################################
## Full_train plus the TARGET column, so rows can be inspected by label
temp = pd.concat([Full_train, train['TARGET']], axis=1)
## For every feature: which rows sit at its max / min, and the mean TARGET
## (i.e. the share of 1s) among those rows.
prop_1_max, prop_1_min = [], []
row_ind_max, row_ind_min = [], []
feature = []
for name in Full_train.columns:
    values = temp[name]
    at_max = values == max(values)
    at_min = values == min(values)
    col_max = temp[at_max]['TARGET']
    col_min = temp[at_min]['TARGET']
    val_max, val_min = np.mean(col_max), np.mean(col_min)
    feature.append(name)
    prop_1_max.append(val_max)
    prop_1_min.append(val_min)
    row_ind_max.append(Full_train.index[np.where(at_max)])
    row_ind_min.append(Full_train.index[np.where(at_min)])
prop_1 = pd.DataFrame({
    'Max': prop_1_max,
    'Min': prop_1_min,
    'row_ind_max': row_ind_max,
    'row_ind_min': row_ind_min,
    'feature': feature,
})
print("Number of features for which only 1s hit the max: " + str((prop_1.Max == 1).sum()))
print("Number of features for which only 0s hit the max: " + str((prop_1.Max == 0).sum()))
print("Number of features for which at least one 1 hit the min: " + str((prop_1.Min != 0).sum()))
print("Full_train shape: " + str(Full_train.shape))
# In[91]:
################################
## Max Outliers
################################
## True for each feature whose maximum is reached by exactly one row
row_out = [len(rows) == 1 for rows in prop_1.row_ind_max]

## Isolated maxima whose single row has TARGET 0
isol_out0 = prop_1[(prop_1.Max == 0) & row_out]
feat0 = isol_out0.feature
row_feat_out0 = [rows[0] for rows in isol_out0.row_ind_max]
# Indicator column: 1 for rows holding such an isolated maximum, else 0.
new_col_0_out = pd.Series(0.0, index=Full_train.index)
for row in row_feat_out0:
    if row not in new_col_0_out.index:
        print("Nope")
        continue
    new_col_0_out[row] = 1
print(len(new_col_0_out))
#new_col_0_out = pd.Series(new_col_0_out, index = Full_train.index)
for row, col in zip(row_feat_out0, feat0):
    Full_train.loc[row, col] = 0  ## set outliers to zero
Full_train = pd.concat([Full_train, new_col_0_out.to_frame('Out_1')], axis=1)

## Isolated maxima whose single row has TARGET 1
isol_out1 = prop_1[(prop_1.Max == 1) & row_out]
feat1 = isol_out1.feature
row_feat_out1 = [rows[0] for rows in isol_out1.row_ind_max]
new_col_1_out = pd.Series(0.0, index=Full_train.index)
for row in row_feat_out1:
    if row not in new_col_1_out.index:
        print("Nope")
        continue
    new_col_1_out[row] = 1
#new_col_1_out = pd.Series(new_col_1_out, index = Full_train.index)
for row, col in zip(row_feat_out1, feat1):
    Full_train.loc[row, col] = 0  ## set outliers to zero
Full_train = pd.concat([Full_train, new_col_1_out.to_frame('Out_2')], axis=1)
# Bare expression: only displays something when run as a notebook cell.
Full_train.describe()
# In[92]:
##################################
# Normalize train (with no target)
##################################
# Column 369 of `train` is taken as the target vector.
train_target = train.iloc[:, 369]
# The scaled result is stored as returned by the scaler; unlike Full_test
# below it is not wrapped back into a DataFrame.
Full_train = min_max_scaler.fit_transform(Full_train)
# Normalize test
# NOTE(review): the scaler is fitted again on the test set here instead of
# reusing the fit obtained on the training features -- confirm this is intended.
Full_test = pd.DataFrame(min_max_scaler.fit_transform(test), index=test.index)
######### Full train ##################
## Slow CV on Full train set
# Stratified hold-out split used by evaluate_model: 75% train - 25% test
X_train, X_test, y_train, y_test = train_test_split(
    Full_train,
    train_target,
    test_size=0.25,
    random_state=0,
    stratify=train_target,
)
def evaluate_model(clf, X_fit=None, y_fit=None, X_eval=None, y_eval=None):
    """Fit ``clf`` and print its ROC AUC on the fitting and evaluation data.

    The metric is the area under the ROC curve computed from the predicted
    probability of the positive class (column 1 of ``predict_proba``).

    clf: estimator exposing ``fit`` and ``predict_proba``.
    X_fit, y_fit: data used to fit and to compute the "Train score";
        default to the module-level ``X_train`` / ``y_train``.
    X_eval, y_eval: data used for the "Test score"; default to the
        module-level ``X_test`` / ``y_test``.
    Returns None; the two scores and the elapsed wall-clock time (fit plus
    scoring) are printed.
    """
    # Defaults are resolved at call time so the current module-level split is used.
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    start = time.time()
    clf.fit(X_fit, y_fit)
    train_auc = sklearn.metrics.roc_auc_score(y_fit, clf.predict_proba(X_fit)[:, 1])
    print("Train score:" + " " + str(train_auc))
    test_auc = sklearn.metrics.roc_auc_score(y_eval, clf.predict_proba(X_eval)[:, 1])
    print("Test score:" + " " + str(test_auc))
    print("Total time:" + " " + str(time.time() - start))
# Report the feature / label shapes of both halves of the hold-out split.
for caption, features, labels in (
    ("Full Training: ", X_train, y_train),
    ("Full Test: ", X_test, y_test),
):
    print(caption + str(features.shape) + str(labels.shape))
# In[96]:
################################
## Auto Grid search
################################
# Candidate values per XGBClassifier hyper-parameter; the search tries every
# combination, so the grid size is the product of the list lengths.
param = {
    'max_depth': range(9, 11),
    'n_estimators': [50],
    'min_child_weight': [1],
    'colsample_bytree': [0.7],
    'base_score': [0.29, 0.3],
    'learning_rate': np.arange(0.09, 0.11, 0.01),
    'scale_pos_weight': [1],
    'nthread': [1],
    'reg_lambda': [1],
    'reg_alpha': [4],
    'max_delta_step': np.arange(0, 3, 1),
}
print(str(np.prod([len(candidates) for candidates in param.values()])) + " models to test")
xgb_mod = XGBClassifier(objective='binary:logistic')
# 3-fold cross-validated search scored by ROC AUC.
clf = GridSearchCV(estimator=xgb_mod, param_grid=param, verbose=2, scoring='roc_auc', cv=3)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment