Last active
March 22, 2016 22:16
-
-
Save ashtom84/743edd03affb029f3b40 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import random | |
from xgboost import XGBClassifier | |
from sklearn.metrics import confusion_matrix, mean_squared_error | |
import sklearn.cross_validation as cv | |
from sklearn.cross_validation import KFold, train_test_split | |
from sklearn import preprocessing | |
import sklearn.metrics | |
from sklearn.grid_search import GridSearchCV | |
import time | |
from sklearn import ensemble | |
# Single scaler instance shared by the normalisation cell further down.
min_max_scaler = preprocessing.MinMaxScaler()

# In[88]:

#####################
# Read train and test
#####################
def _read_by_id(path):
    """Load a comma-separated UTF-8 file and index it by its "ID" column."""
    frame = pd.read_csv(path, sep=',', encoding='utf-8')
    return frame.set_index("ID")


train = _read_by_id("train.csv")
test = _read_by_id("test.csv")
# In[89]:

#############################
# Remove the similar features
#############################
def _group_identical_columns(frame):
    """Group columns of *frame* that hold exactly the same values.

    Returns a dict mapping the position of the first column of each group
    to the list of positions of its later, element-wise identical copies.
    Columns without a copy do not appear in the result.

    Every pair of columns is compared, including direct neighbours.  The
    previous implementation compared column ``i`` against a frame from which
    column ``i`` had already been dropped while still starting at ``i + 1``,
    so the neighbour at position ``i + 1`` was never checked.
    """
    # Pull the raw arrays out once instead of copying the frame per column.
    arrays = [frame.iloc[:, pos].values for pos in range(frame.shape[1])]
    known_copies = set()
    groups = {}
    for first, reference in enumerate(arrays):
        if first in known_copies:
            # Already recorded as a copy of an earlier column.
            continue
        for later in range(first + 1, len(arrays)):
            if later in known_copies:
                continue
            # Same test as all(a == b): NaN never compares equal.
            if np.array_equal(reference, arrays[later]):
                groups.setdefault(first, []).append(later)
                known_copies.add(later)
    return groups


# Explicit copy so later edits never write through to a slice of `train`.
# NOTE(review): assumes the first 369 columns of train.csv are the features
# and the next one is TARGET -- confirm against the data file.
Full_train = train.iloc[:, :369].copy()
print("Full_train shape before dropping similar features: " + str(Full_train.shape))

ind_same = _group_identical_columns(Full_train)
ind_drop = sorted(pos for copies in ind_same.values() for pos in copies)

# Select by position so repeated column labels cannot drop extra columns.
_dropped = set(ind_drop)
Full_train = Full_train.iloc[
    :, [pos for pos in range(Full_train.shape[1]) if pos not in _dropped]]
print("Full_train shape after: " + str(Full_train.shape))
###########################################################
## USING RANDOM FOREST TO WEED OUT ZERO IMPORTANCE FEATURES
###########################################################
# NOTE(review): X_train / y_train and Full_test are assigned in later cells
# of this notebook export, so this cell can only run after those cells have
# been executed at least once -- confirm the intended execution order.

### Full train
# randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight='balanced',
#                                                max_depth=10, oob_score=True)
# randomForest.fit(X_train, y_train)

### Imbalanced train
randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight=None,
                                               max_depth=10, oob_score=True)
randomForest.fit(X_train, y_train)

### Balanced train
# randomForest = ensemble.RandomForestClassifier(n_estimators=50, class_weight=None,
#                                                max_depth=10, oob_score=True)
# randomForest.fit(X_train_bal, y_train_bal)

importances = randomForest.feature_importances_
print(str(np.sum(importances == 0)) + " useless features")

# The positional selection below is only meaningful if there is exactly one
# importance per column; fail loudly instead of silently pairing them up.
if len(importances) != Full_train.shape[1]:
    raise ValueError(
        "randomForest was fitted on {} features but Full_train has {} columns".format(
            len(importances), Full_train.shape[1]))

# (label, importance) table sorted from most to least important.  The text
# field is sized to the longest label so that no name gets truncated.
_labels = [u"{}".format(label) for label in Full_train.columns]
_width = max([len(label) for label in _labels] + [1])
dtype = [('feature', 'U{}'.format(_width)), ('importance', 'float')]
feature_imprtance = np.array(list(zip(_labels, importances)), dtype=dtype)
feature_sort = np.sort(feature_imprtance, order='importance')[::-1]

# Column positions (not labels) of the features to keep / to discard; the
# previous int(label) conversion only worked for purely numeric labels.
imp = np.flatnonzero(importances != 0)
useless = np.flatnonzero(importances == 0).tolist()

# filtering out the useless features in Full_train and Full_test
Full_train = Full_train.iloc[:, imp]
Full_test = Full_test.iloc[:, imp]
print("Retaining {} features".format(len(Full_train.columns)))
# In[90]:

################################
## Outliers
################################
## adding the column TARGET to investigate the rows in Full_train
temp = pd.concat([Full_train, train['TARGET']], axis=1)

## finding out behavior of features' outliers in temp
# For every feature, record the mean TARGET among the rows that reach the
# column maximum (resp. minimum) together with the labels of those rows.
prop_1_max, prop_1_min, row_ind_max, row_ind_min, feature = [], [], [], [], []
_target = temp['TARGET']
for name in Full_train.columns:
    column = temp[name]
    # Compute each extreme once, vectorised, instead of scanning the column
    # with the builtin max()/min() several times per feature.
    hits_max = (column == column.max()).values
    hits_min = (column == column.min()).values
    prop_1_max.append(_target[hits_max].mean())
    prop_1_min.append(_target[hits_min].mean())
    # Labels are read from the same frame the masks were computed on.
    row_ind_max.append(temp.index[hits_max])
    row_ind_min.append(temp.index[hits_min])
    feature.append(name)

prop_1 = pd.DataFrame({'Max': prop_1_max, 'Min': prop_1_min,
                       'row_ind_max': row_ind_max, 'row_ind_min': row_ind_min,
                       'feature': feature})
print("Number of features for which only 1s hit the max: " + str(len(prop_1[prop_1.Max == 1]['Max'])))
print("Number of features for which only 0s hit the max: " + str(len(prop_1[prop_1.Max == 0]['Max'])))
print("Number of features for which at least one 1 hit the min: " + str(len(prop_1[prop_1.Min != 0]['Min'])))
print("Full_train shape: " + str(Full_train.shape))
# In[91]:

################################
## Max Outliers
################################
def _flag_rows(rows, index):
    """Build a 0/1 float Series over *index* holding 1 at every label in *rows*.

    A label that is not part of *index* is reported by printing "Nope".
    """
    flags = pd.Series(np.zeros(len(index)), index=index)
    for row in rows:
        if row not in flags.index:
            print("Nope")
            continue
        flags[row] = 1
    return flags


# True for each feature whose maximum is reached by exactly one row.
row_out = [len(rows) == 1 for rows in prop_1.row_ind_max]

## when the TARGET HAS to be 0 and the outlier is isolated
isol_out0 = prop_1[(prop_1.Max == 0) & row_out]
feat0 = isol_out0.feature
row_feat_out0 = [rows[0] for rows in isol_out0.row_ind_max]
new_col_0_out = _flag_rows(row_feat_out0, Full_train.index)
print(len(new_col_0_out))
for row, col in zip(row_feat_out0, feat0):
    Full_train.loc[row, col] = 0  ## set outliers to zero
Full_train = pd.concat([Full_train, pd.DataFrame({'Out_1': new_col_0_out})], axis=1)

## when the TARGET HAS to be 1 and the outlier is isolated
isol_out1 = prop_1[(prop_1.Max == 1) & row_out]
feat1 = isol_out1.feature
row_feat_out1 = [rows[0] for rows in isol_out1.row_ind_max]
new_col_1_out = _flag_rows(row_feat_out1, Full_train.index)
for row, col in zip(row_feat_out1, feat1):
    Full_train.loc[row, col] = 0  ## set outliers to zero
Full_train = pd.concat([Full_train, pd.DataFrame({'Out_2': new_col_1_out})], axis=1)

Full_train.describe()
# In[92]:

##################################
# Normalize train (with no target)
##################################
# The label column is taken by position from the untouched `train` frame.
train_target = train.iloc[:, 369]
Full_train = min_max_scaler.fit_transform(Full_train)

# Normalize test
# NOTE(review): the scaler is fitted a second time here, on the test set, so
# test features are scaled by the test set's own min/max rather than the
# training set's, and the fit on Full_train is overwritten.  Switching to
# transform() would first require `test` to carry the same columns as
# Full_train -- confirm what is intended.
Full_test = pd.DataFrame(min_max_scaler.fit_transform(test), index=test.index)
######### Full train ##################
## Slow CV on Full train set
# Hold-out split for evaluate_model: 75% train - 25% test, stratified on the
# target, with a fixed seed.
_split_options = dict(test_size=0.25, random_state=0, stratify=train_target)
X_train, X_test, y_train, y_test = cv.train_test_split(
    Full_train, train_target, **_split_options)
def evaluate_model(clf):
    """Fit *clf* on the hold-out split and print its ROC AUC scores.

    The classifier is trained on the module-level ``X_train`` / ``y_train``.
    The ROC AUC of its positive-class probabilities is then printed for the
    training part and for ``X_test`` / ``y_test``, followed by the elapsed
    wall-clock time (fitting plus scoring).  Nothing is returned.
    """
    start = time.time()
    clf.fit(X_train, y_train)
    splits = (("Train", X_train, y_train), ("Test", X_test, y_test))
    for label, features, truth in splits:
        # Column 1 of predict_proba is used as the score for class 1.
        auc = sklearn.metrics.roc_auc_score(truth, clf.predict_proba(features)[:, 1])
        print("%s score: %s" % (label, auc))
    print("Total time: %s" % (time.time() - start))
# Show the shapes of the feature matrix and target vector of each split.
print("Full Training: {}{}".format(X_train.shape, y_train.shape))
print("Full Test: {}{}".format(X_test.shape, y_test.shape))
# In[96]:

################################
## Auto Grid search
################################
# Candidate values per XGBClassifier parameter; the search tries every
# combination.
# NOTE(review): np.arange with a float step does not reliably exclude the end
# point, so 'learning_rate' may or may not contain 0.11 -- confirm the
# intended values.
param = {
    'max_depth': range(9, 11),
    'n_estimators': [50],
    'min_child_weight': [1],
    'colsample_bytree': [0.7],
    'base_score': [0.29, 0.3],
    'learning_rate': np.arange(0.09, 0.11, 0.01),
    'scale_pos_weight': [1],
    'nthread': [1],
    'reg_lambda': [1],
    'reg_alpha': [4],
    'max_delta_step': np.arange(0, 3, 1),
}

# Grid size = product of the number of candidates of every parameter.
print(str(np.prod([len(values) for values in param.values()])) + " models to test")

xgb_mod = XGBClassifier(objective='binary:logistic')
# 3-fold cross-validated search scored by ROC AUC.
clf = GridSearchCV(xgb_mod, param, verbose=2, scoring='roc_auc', cv=3)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment