Created January 18, 2017 00:39
Save TomHortons/dd672a4323f42aed59316c9f56f72574 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from scipy import sparse as ssp | |
import pylab as plt | |
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,MinMaxScaler,OneHotEncoder | |
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer | |
from sklearn.decomposition import TruncatedSVD,NMF,PCA,FactorAnalysis | |
from sklearn.feature_selection import SelectFromModel,SelectPercentile,f_classif | |
from sklearn.decomposition import TruncatedSVD | |
from sklearn.metrics import log_loss,roc_auc_score | |
from sklearn.pipeline import Pipeline,make_pipeline | |
from sklearn.cross_validation import StratifiedKFold,KFold | |
from keras.preprocessing import sequence | |
from keras.callbacks import ModelCheckpoint,Callback | |
from keras import backend as K | |
from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge,Convolution1D,MaxPooling1D,Lambda,AveragePooling1D | |
from keras.layers.normalization import BatchNormalization | |
from keras.optimizers import SGD | |
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,SReLU | |
from keras.models import Model | |
# Directory prefix prepended to every CSV file name read by main().
path = "../input/"

# Width of the hidden Dense layer and output size of each per-column Embedding.
hidden = 64
dim = 32

# One seed shared by NumPy's global RNG and the StratifiedKFold split in main().
seed = 1
np.random.seed(seed)
class AucCallback(Callback):
    """Keras callback: score validation ROC AUC each epoch, keep the best
    weights on disk, and request early stopping when the score stalls.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)``. It is unpacked unconditionally, so the empty
        default raises ``ValueError``; callers must supply a pair.
    patience : int
        Number of non-improving epochs tolerated since the last best score
        before a further non-improving epoch sets ``model.stop_training``.
    is_regression : bool
        Stored on the instance; not read anywhere in this class.
    best_model_name : str
        File path handed to ``model.save_weights`` whenever the AUC improves.
    feval : str
        Name of the metric. Only ``'roc_auc_score'`` is implemented.
    batch_size : int
        Batch size used for the per-epoch validation prediction.
    """

    def __init__(self, validation_data=(), patience=25, is_regression=True,
                 best_model_name='best_keras.mdl', feval='roc_auc_score',
                 batch_size=1024 * 8):
        # The original called super(Callback, self), which skips
        # Callback.__init__ entirely; name this class instead.
        super(AucCallback, self).__init__()
        self.patience = patience
        self.X_val, self.y_val = validation_data
        self.best = -np.inf
        self.wait = 0  # non-improving epochs since the last best score
        self.best_model = None
        self.best_model_name = best_model_name
        self.is_regression = is_regression
        self.feval = feval
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate validation AUC, checkpoint on improvement, maybe stop.

        Raises
        ------
        ValueError
            If ``feval`` is not ``'roc_auc_score'`` (previously this surfaced
            as an unbound-local error on ``current``).
        """
        if self.feval != 'roc_auc_score':
            raise ValueError(
                "AucCallback only supports feval='roc_auc_score', got %r"
                % (self.feval,))

        p = self.model.predict(self.X_val, batch_size=self.batch_size,
                               verbose=0)
        current = roc_auc_score(self.y_val, p)

        if current > self.best:
            self.best = current
            self.wait = 0
            self.model.save_weights(self.best_model_name, overwrite=True)
        else:
            # The patience check runs before the increment, so training is
            # stopped on the (patience + 1)-th consecutive stale epoch.
            if self.wait >= self.patience:
                self.model.stop_training = True
                print('Epoch %05d: early stopping' % (epoch))
            self.wait += 1
        print('Epoch %d Auc: %f | Best Auc: %f \n' % (epoch, current, self.best))
def make_batches(size, batch_size):
    """Return ``(start, stop)`` index pairs that tile ``range(size)``.

    Every pair spans ``batch_size`` items except possibly the last one,
    whose ``stop`` is clipped to ``size``.
    """
    batch_count = int(np.ceil(size / float(batch_size)))
    bounds = []
    for index in range(batch_count):
        start = index * batch_size
        stop = min(size, (index + 1) * batch_size)
        bounds.append((start, stop))
    return bounds
def _load_encoded_frames():
    """Read the CSVs, join the people table, and label-encode every feature.

    Returns
    -------
    (train, test, columns, cardinalities)
        ``train`` / ``test`` are the encoded rows that came from
        ``act_train.csv`` / ``act_test.csv``; ``columns`` lists the feature
        column names (everything except ``activity_id`` and ``outcome``);
        ``cardinalities[i]`` is the number of distinct values of
        ``columns[i]`` over train and test together.
    """
    train = pd.read_csv(path + 'act_train.csv')
    test = pd.read_csv(path + 'act_test.csv')
    people = pd.read_csv(path + 'people.csv')

    n_train = train.shape[0]
    # Give test the same columns as train so both can be encoded together.
    test['outcome'] = np.nan
    data = pd.concat([train, test])
    data = pd.merge(data, people, how='left', on='people_id').fillna('missing')

    columns = [c for c in data.columns if c not in ('activity_id', 'outcome')]

    # Fit each encoder on train + test jointly so both share one code space.
    cardinalities = []
    for c in columns:
        encoder = LabelEncoder()
        data[c] = encoder.fit_transform(data[c].values)
        cardinalities.append(len(encoder.classes_))

    return data[:n_train], data[n_train:], columns, cardinalities


def _build_model(cardinalities):
    """Build and compile the embedding network (Keras 1.x functional API).

    Each feature gets its own integer input and a ``dim``-wide Embedding;
    the flattened embeddings are concatenated and fed through one ReLU Dense
    layer of ``hidden`` units, dropout, and a single sigmoid output.
    """
    inputs = []
    flattened = []
    for num_c in cardinalities:
        inputs_c = Input(shape=(1,), dtype='int32')
        embed_c = Embedding(num_c, dim, dropout=0.2, input_length=1)(inputs_c)
        inputs.append(inputs_c)
        flattened.append(Flatten()(embed_c))

    features = merge(flattened, mode='concat')
    fc1 = Dense(hidden, activation='relu')(features)
    dp1 = Dropout(0.5)(fc1)
    outputs = Dense(1, activation='sigmoid')(dp1)

    model = Model(input=inputs, output=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model


def _split_columns(matrix):
    """Return the columns of a 2-D array as a list, one array per model input."""
    return [matrix[:, i] for i in range(matrix.shape[1])]


def main():
    """Train the embedding model on one stratified fold and write predictions.

    Reads the three CSVs under ``path``, fits for 10 epochs using the first
    of four stratified folds as validation data, then writes
    ``submission_residual_<dim>_<hidden>.csv`` to the working directory.
    """
    train, test, columns, cardinalities = _load_encoded_frames()
    model = _build_model(cardinalities)

    X = train[columns].values
    X_t = test[columns].values
    # The frame-wide fillna('missing') in _load_encoded_frames puts strings
    # into the test rows of `outcome`, leaving the column as object dtype;
    # cast the training labels back to floats.
    # Assumes every training row has a numeric outcome -- TODO confirm.
    y = train['outcome'].values.astype('float32')
    activity_id = test['activity_id'].values
    del train, test  # release the frames before training

    # Only the first of the four folds is used, as a hold-out set.
    skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=seed)
    ind_tr, ind_te = next(iter(skf))
    X_train, y_train = X[ind_tr], y[ind_tr]
    X_test, y_test = X[ind_te], y[ind_te]
    del X

    X_train = _split_columns(X_train)
    X_test = _split_columns(X_test)

    model_name = 'mlp_residual_%s_%s.hdf5' % (dim, hidden)
    nb_epoch = 10
    batch_size = 1024 * 8

    # Manual switch: set to True to start from previously saved weights.
    load_model = False
    if load_model:
        print('Load Model')
        model.load_weights(path + model_name)

    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        nb_epoch=nb_epoch,
        verbose=1,
        shuffle=True,
        validation_data=(X_test, y_test),
    )

    outcome = model.predict(_split_columns(X_t), batch_size=batch_size)

    # predict() returns one column per output unit; flatten it so the
    # submission column is one-dimensional.
    submission = pd.DataFrame(
        {'activity_id': activity_id, 'outcome': outcome.ravel()},
        columns=['activity_id', 'outcome'],
    )
    submission.to_csv('submission_residual_%s_%s.csv' % (dim, hidden),
                      index=False)
# Run the pipeline only when executed as a script, so importing this module
# does not start a training run.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.