Kaggle Summary: BOSCH (kernels) ref: http://qiita.com/TomHortons/items/359f8e39b9cd424c2360
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# I'm limited by RAM here and taking the first N rows is likely to be
# a bad idea for the date data since it is ordered.
# Sample the data in a roundabout way:
date_chunks = pd.read_csv(TRAIN_DATE, index_col=0, chunksize=100000, dtype=np.float32)
num_chunks = pd.read_csv(TRAIN_NUMERIC, index_col=0,
                         usecols=list(range(969)), chunksize=100000, dtype=np.float32)
X = pd.concat([pd.concat([dchunk, nchunk], axis=1).sample(frac=0.05)
               for dchunk, nchunk in zip(date_chunks, num_chunks)])
y = pd.read_csv(TRAIN_NUMERIC, index_col=0, usecols=[0,969], dtype=np.float32).loc[X.index].values.ravel()
X = X.values
clf = XGBClassifier(max_depth=5, base_score=0.005)
cv = StratifiedKFold(y, n_folds=3)
preds = np.ones(y.shape[0])
for i, (train, test) in enumerate(cv):
    preds[test] = clf.fit(X[train], y[train]).predict_proba(X[test])[:, 1]
    print("fold {}, ROC AUC: {:.3f}".format(i, roc_auc_score(y[test], preds[test])))
print(roc_auc_score(y, preds))
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
feature_names = ['L3_S38_F3960', 'L3_S33_F3865', 'L3_S38_F3956', 'L3_S33_F3857',
                 'L3_S29_F3321', 'L1_S24_F1846', 'L3_S32_F3850', 'L3_S29_F3354',
                 'L3_S29_F3324', 'L3_S35_F3889', 'L0_S1_F28', 'L1_S24_F1844',
                 'L3_S29_F3376', 'L0_S0_F22', 'L3_S33_F3859', 'L3_S38_F3952',
                 'L3_S30_F3754', 'L2_S26_F3113', 'L3_S30_F3759', 'L0_S5_F114']
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
train_date_part = pd.read_csv(TRAIN_DATE, nrows=10000)
print(train_date_part.shape)
print(1.0 * train_date_part.count().sum() / train_date_part.size)
(10000, 1157)
0.177920570441
[ 14 23 41 50 385 1019 1029 1034 1042 1056 1156 1161 1166 1171 1172
1183 1203 1221 1294 1327 1350 1363 1403 1404 1482 1501 1507 1512 1535 1549
1550 1843 1846 1849 1858 1879 1885 1887 1888 1891 1911 1940 1948 1951 1959
1974 1975 1982 1985 1988 1993 1994 1995 1999 2006 2007 2010 2028 2040 2046
2075 2093]
fold 0, ROC AUC: 0.718
fold 1, ROC AUC: 0.704
fold 2, ROC AUC: 0.698
0.706413059353
0.213120930917
# -*- coding: utf-8 -*-
"""
@author: Faron
"""
import pandas as pd
import numpy as np
import xgboost as xgb
DATA_DIR = "../input"
ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'
SEED = 0
CHUNKSIZE = 50000
NROWS = 250000
TRAIN_NUMERIC = "{0}/train_numeric.csv".format(DATA_DIR)
TRAIN_DATE = "{0}/train_date.csv".format(DATA_DIR)
TEST_NUMERIC = "{0}/test_numeric.csv".format(DATA_DIR)
TEST_DATE = "{0}/test_date.csv".format(DATA_DIR)
FILENAME = "etimelhoods"
train = pd.read_csv(TRAIN_NUMERIC, usecols=[ID_COLUMN, TARGET_COLUMN], nrows=NROWS)
test = pd.read_csv(TEST_NUMERIC, usecols=[ID_COLUMN], nrows=NROWS)
train["StartTime"] = -1
test["StartTime"] = -1
nrows = 0
for tr, te in zip(pd.read_csv(TRAIN_DATE, chunksize=CHUNKSIZE), pd.read_csv(TEST_DATE, chunksize=CHUNKSIZE)):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])
    stime_tr = tr[feats].min(axis=1).values
    stime_te = te[feats].min(axis=1).values
    train.loc[train.Id.isin(tr.Id), 'StartTime'] = stime_tr
    test.loc[test.Id.isin(te.Id), 'StartTime'] = stime_te
    nrows += CHUNKSIZE
    if nrows >= NROWS:
        break
ntrain = train.shape[0]
train_test = pd.concat((train, test)).reset_index(drop=True).reset_index(drop=False)
train_test['magic1'] = train_test[ID_COLUMN].diff().fillna(9999999).astype(int)
train_test['magic2'] = train_test[ID_COLUMN].iloc[::-1].diff().fillna(9999999).astype(int)
train_test = train_test.sort_values(by=['StartTime', 'Id'], ascending=True)
train_test['magic3'] = train_test[ID_COLUMN].diff().fillna(9999999).astype(int)
train_test['magic4'] = train_test[ID_COLUMN].iloc[::-1].diff().fillna(9999999).astype(int)
train_test = train_test.sort_values(by=['index']).drop(['index'], axis=1)
train = train_test.iloc[:ntrain, :]
features = np.setdiff1d(list(train.columns), [TARGET_COLUMN, ID_COLUMN])
y = train.Response.ravel()
train = np.array(train[features])
print('train: {0}'.format(train.shape))
prior = np.sum(y) / (1.*len(y))
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 2,
    'eval_metric': 'auc',
    'base_score': prior
}
dtrain = xgb.DMatrix(train, label=y)
res = xgb.cv(xgb_params, dtrain, num_boost_round=10, nfold=4, seed=0, stratified=True,
             early_stopping_rounds=1, verbose_eval=1, show_stdv=True)
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))
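# Sidebar (not part of Faron's script): a minimal sketch of what the four "magic"
# features above capture, using a tiny hypothetical frame. magic1/magic2 are the Id
# gaps to the previous/next record in the original Id order; magic3/magic4 are the
# same gaps after sorting by (StartTime, Id), so parts that entered the line at
# nearly the same time end up with small values.
toy = pd.DataFrame({'Id': [4, 6, 7, 9, 11],
                    'StartTime': [82.24, 1313.12, 1618.70, 82.24, 1149.20]})
toy = toy.reset_index(drop=True).reset_index(drop=False)
toy['magic1'] = toy['Id'].diff().fillna(9999999).astype(int)
toy['magic2'] = toy['Id'].iloc[::-1].diff().fillna(9999999).astype(int)
toy = toy.sort_values(by=['StartTime', 'Id'])
toy['magic3'] = toy['Id'].diff().fillna(9999999).astype(int)
toy['magic4'] = toy['Id'].iloc[::-1].diff().fillna(9999999).astype(int)
print(toy.sort_values(by=['index']).drop(['index'], axis=1))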
def twoplot(df, col, xaxis=None):
    ''' scatter plot a feature split into response values as two subgraphs '''
    if col not in df.columns.values:
        print('ERROR: %s not a column' % col)
        return
    ndf = pd.DataFrame(index=df.index)
    ndf[col] = df[col]
    if xaxis is None:
        # default to the row index as the x axis
        xaxis = '_index'
        ndf[xaxis] = df.index
    else:
        ndf[xaxis] = df[xaxis]
    ndf['Response'] = df['Response']
    g = sns.FacetGrid(ndf, col="Response", hue="Response")
    g.map(plt.scatter, xaxis, col, alpha=.7, s=1)
    g.add_legend()
    del ndf
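# Possible usage (not shown in the original kernel): assuming `train` holds the 20
# important numeric features plus 'Response', as loaded in the read_csv snippet
# further below, plot a few of them against the row index split by Response.
for col in feature_names[:3]:
    twoplot(train, col)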
BATCH_SIZE = 5
train_batch = [pd.melt(train[train.columns[batch: batch + BATCH_SIZE].append(np.array(['Response']))],
                       id_vars = 'Response', value_vars = feature_names[batch: batch + BATCH_SIZE])
               for batch in list(range(0, train.shape[1] - 1, BATCH_SIZE))]
FIGSIZE = (12,16)
_, axs = plt.subplots(len(train_batch), figsize = FIGSIZE)
plt.suptitle('Univariate distributions')
for data, ax in zip(train_batch, axs):
    sns.violinplot(x = 'variable', y = 'value', hue = 'Response', data = data, ax = ax, split = True)
non_missing = pd.DataFrame(pd.concat([(X_neg.count()/X_neg.shape[0]).to_frame('negative samples'),
                                      (X_pos.count()/X_pos.shape[0]).to_frame('positive samples'),
                                      ],
                                     axis = 1))
non_missing_sort = non_missing.sort_values(['negative samples'])
non_missing_sort.plot.barh(title = 'Proportion of non-missing values', figsize = FIGSIZE)
plt.gca().invert_yaxis()
FIGSIZE = (13,4)
_, (ax1, ax2) = plt.subplots(1,2, figsize = FIGSIZE)
MIN_PERIODS = 100
triang_mask = np.zeros((X_pos.shape[1], X_pos.shape[1]))
triang_mask[np.triu_indices_from(triang_mask)] = True
ax1.set_title('Negative Class')
sns.heatmap(X_neg.corr(min_periods = MIN_PERIODS), mask = triang_mask, square=True, ax = ax1)
ax2.set_title('Positive Class')
sns.heatmap(X_pos.corr(min_periods = MIN_PERIODS), mask = triang_mask, square=True, ax = ax2)
sns.heatmap(X_pos.corr(min_periods = MIN_PERIODS) - X_neg.corr(min_periods = MIN_PERIODS),
            mask = triang_mask, square=True)
nan_pos, nan_neg = np.isnan(X_pos), np.isnan(X_neg)
triang_mask = np.zeros((X_pos.shape[1], X_pos.shape[1]))
triang_mask[np.triu_indices_from(triang_mask)] = True
FIGSIZE = (13,4)
_, (ax1, ax2) = plt.subplots(1,2, figsize = FIGSIZE)
MIN_PERIODS = 100
ax1.set_title('Negative Class')
sns.heatmap(nan_neg.corr(), square=True, mask = triang_mask, ax = ax1)
ax2.set_title('Positive Class')
sns.heatmap(nan_pos.corr(), square=True, mask = triang_mask, ax = ax2)
sns.heatmap(nan_neg.corr() - nan_pos.corr(), mask = triang_mask, square=True)
library(data.table)
library(Rtsne)
library(ggplot2)
library(ggrepel)
cor_out <- as.matrix(fread("cor_train_numeric.csv", header = TRUE, sep = ","))
gc(verbose = FALSE)
set.seed(78)
tsne_model <- Rtsne(data.frame(cor_out),
                    dims = 2,
                    #initial_dims = 50,
                    initial_dims = ncol(cor_out),
                    perplexity = 322, #floor((ncol(cor_out)-1)/3)
                    theta = 0.00,
                    check_duplicates = FALSE,
                    pca = FALSE,
                    max_iter = 1350,
                    verbose = TRUE,
                    is_distance = FALSE)
corMatrix_out <- as.data.frame(tsne_model$Y)
cor_kmeans <- kmeans(corMatrix_out, centers = 5, iter.max = 10, nstart = 3)
corMatrix_outclust <- as.factor(c(cor_kmeans$cluster[1:968], 6))
corMatrix_names <- colnames(cor_out)
ggplot(corMatrix_out, aes(x = V1, y = V2, color = corMatrix_outclust, shape = corMatrix_outclust)) +
  geom_point(size = 2.5) +
  geom_rug() +
  stat_ellipse(type = "norm") +
  ggtitle("T-SNE of Features") +
  xlab("X") + ylab("Y") +
  labs(color = "Cluster", shape = "Cluster") +
  geom_text_repel(aes(x = V1, y = V2, label = corMatrix_names), size = 2.8)
library(propagate)
# Replace each of the 969 numeric columns with a presence indicator: 1 = measured, 0 = missing
for (i in 1:969) {
  cat("\rStep ", i, "\n", sep = "")
  train_data[!is.na(train_data[, i]), i] <- 1
  train_data[is.na(train_data[, i]), i] <- 0
  if (i %% 10) {gc(verbose = FALSE)}
}
gc(verbose = FALSE)
cor_table <- bigcor(train_data, fun = "cor", size = 194, verbose = TRUE)
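# For reference (not part of the original R script): roughly the same
# missingness-correlation idea in pandas, for readers staying in Python. This is
# only a sketch - the file path and column layout are assumed to match the Python
# snippets above, and a full 968x968 correlation over ~1.18M rows needs several GB of RAM.
import numpy as np
import pandas as pd
num = pd.read_csv("../input/train_numeric.csv", index_col=0, dtype=np.float32)
presence = num.drop('Response', axis=1).notnull().astype(np.int8)  # 1 = measured, 0 = missing
cor_presence = presence.corr()
cor_presence.to_csv("cor_train_numeric.csv")  # analogue of the bigcor() output read by the Rtsne snippet above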
   Id   time station
0   4  82.24       0
0   4  82.24       0
0   4  82.24       0
0   4  82.24       0
0   4  82.24       0
(2048541, 3)
0.9821721580467314
options(warn=-1) #suppress warnings
#imports for data wrangling
library(data.table)
library(dtplyr)
library(dplyr)
#get the data - nrows set to 10000 to keep runtime manageable.
#one expansion option would be to select a time frame to visualize
dtNum <- fread("input/train_numeric.csv", select = c("Id", "Response"),nrows = 10000)
dtDate <- fread("input/train_date.csv", nrows = 10000)
#for each job identify which stations are passed through and for those store the minimum time
for (station in paste0("S", 0:51))
{
  cols = min(which((grepl(station, colnames(dtDate)))))
  if (!cols == Inf) {
    dtDate[, paste0(station) := dtDate[, cols, with = FALSE]]
  }
}
#keep only Id and the per-station time columns just created (drop the raw L*_S*_* date columns)
dtStations = dtDate[,!grepl("L",colnames(dtDate)),with=F]
#melt data to go from wide to long format
dtStationsM = melt(dtStations,id.vars=c("Id"))
#join with numeric to have Response
dtStationsM %>%
left_join(dtNum, by = "Id") -> dtStationsM
#remove NA entries - these are plentiful as after melting each station-job combination has its own row
dtStationsM %>%
filter(!is.na(value)) -> dtStationsMFiltered
#sort entries by ascending time
dtStationsMFiltered %>%
arrange(value) -> dtStationsMFiltered
#imports for plotting
require(GGally)
library(network)
library(sna)
library(ggplot2)
#plotting format
options(repr.plot.width=5, repr.plot.height=15)
#for each row obtain the subsequent station
dtStationsMFiltered %>%
group_by(Id) %>%
mutate(nextStation = lead(variable)) -> edgelistsComplete
#for each id find the first node to be entered
edgelistsComplete %>%
group_by(Id) %>%
filter(!(variable %in% nextStation)) %>%
ungroup() %>%
select(variable,Response) -> startingPoints
#prior to each starting point insert an edge from a common origin
colnames(startingPoints) = c("nextStation","Response")
startingPoints$variable = "S"
edgelistsComplete %>%
select(variable,nextStation,Response) -> paths
#for each id find the row where there is no next station (last station to be visited)
#fill this station with Response value
paths[is.na(nextStation)]$nextStation = paste("Result",paths[is.na(nextStation)]$Response)
#combine data
paths = rbind(startingPoints,paths)
paths = select(paths,-Response)
paths$nextStation = as.character(paths$nextStation)
paths$variable = as.character(paths$variable)
#rename columns for plotting
colnames(paths) <- c("Target","Source")
#flip columns in a costly way because ggnet is a little dumb and I am lazy
pathshelp = select(paths,Source)
pathshelp$Target = paths$Target
paths=pathshelp
#create network from edgelist
net = network(as.data.frame(na.omit(paths)),
directed = TRUE)
#create a station-line mapping lookup
LineStations = NULL
for (station in unique(paths$Source)){
  if (station != "S")
  {
    x = paste0("_", station, "_")
    y = head(colnames(dtDate)[which(grepl(x, colnames(dtDate)))], 1)
    y = strsplit(y, "_")[[1]][1]
    LineStations = rbind(LineStations, data.frame(Node = station, Line = y))
  }
}
LineStations = rbind(LineStations,data.frame(Node=c("Result 1","Result 0","S"),Line=c("Outcome","Outcome","START")))
#merge station-line mapping into graph for coloring purposes
x = data.frame(Node = network.vertex.names(net))
x = merge(x, LineStations, by = "Node", sort = FALSE)$Line
net %v% "line" = as.character(x)
#setup station coordinates analogue to @JohnM
nodeCoordinates=data.frame(label=c("S","S0","S1","S2","S3","S4","S5","S6",
"S7","S8","S9","S10","S11","S12","S13",
"S14","S15","S16","S17","S18","S19",
"S20","S21","S22","S23","S24","S25",
"S26","S27","S28","S29","S30","S31",
"S32","S33","S34","S35","S36","S37",
"S38","S39","S40","S41","S43",
"S44","S45","S47","S48","S49",
"S50","S51","Result 0","Result 1"),
y=c(0,
1,2,3,3,4,4,5,5,6,7,7,7,
1,2,3,3,4,4,5,5,6,7,7,7,
6,6,7,7,7,
8,9,10,10,10,11,11,12,13,14,
8,9,10,11,11,12,13,14,15,15,16,
17,17),
x=c(5,
9,9,10,8,10,8,10,8,7,10,9,8,
5,5,6,4,6,4,6,4,5,6,5,4,
2,0,2,1,0,
7,7,8,7,6,8,6,7,7,7,
3,3,3,4,2,3,3,3,4,2,3,
7,3))
nodeCoordinates$y = -3 * nodeCoordinates$y
#setup initial plot
network = ggnet2(net)
#grab node list from initial plot and attach coordinates
netCoordinates = select(network$data,label)
netCoordinates = left_join(netCoordinates,nodeCoordinates,by = "label")
netCoordinates = as.matrix(select(netCoordinates,x,y))
#setup plot with manual layout
network = ggnet2(net,
alpha = 0.75, size = "indegree",
label = T, size.cut = 4,
color = "line",palette = "Set1",
mode = netCoordinates,
edge.alpha = 0.5, edge.size = 1,
legend.position = "bottom")
#output plot on graphics device
print(network)
#obtain summary statistic of number of edges on each station pair
pathshelp %>%
mutate(stationpair=paste0(Source,"->",Target)) %>%
group_by(stationpair) %>%
summarize(count=n()) %>%
arrange(-count) %>%
print(n=10)
pathshelp %>%
mutate(stationpair=paste0(Source,"->",Target)) %>%
group_by(stationpair) %>%
summarize(count=n()) %>%
arrange(count) %>%
print(n=10)
pathshelp %>%
filter(Target=="Result 1") %>%
mutate(stationpair=paste0(Source,"->",Target)) %>%
group_by(stationpair) %>%
summarize(count=n()) %>%
arrange(count) %>%
print(n=20)
pathshelp %>%
filter(Target=="Result 0") %>%
mutate(stationpair=paste0(Source,"->",Target)) %>%
group_by(stationpair) %>%
summarize(count=n()) %>%
arrange(count) %>%
print(n=20)
# tbl_dt [162 × 2]
stationpair count
<chr> <int>
1 S29->S30 9452
2 S33->S34 9421
3 S37->Result 0 9211
4 S30->S33 8977
5 S->S0 5733
6 S0->S1 5726
7 S36->S37 4827
8 S34->S36 4801
9 S35->S37 4646
10 S34->S35 4635
# ... with 152 more rows
Source: local data table [162 x 2]
# tbl_dt [162 × 2]
stationpair count
<chr> <int>
1 S->S30 1
2 S22->S23 1
3 S6->S10 1
4 S10->S40 1
5 S16->S17 1
6 S28->S30 1
7 S21->S23 1
8 S2->S3 1
9 S24->S25 1
10 S4->S5 1
# ... with 152 more rows
Source: local data table [3 x 2]
# tbl_dt [3 × 2]
stationpair count
<chr> <int>
1 S51->Result 1 2
2 S38->Result 1 4
3 S37->Result 1 47
Source: local data table [6 x 2]
# tbl_dt [6 × 2]
stationpair count
<chr> <int>
1 S50->Result 0 1
2 S35->Result 0 3
3 S36->Result 0 3
4 S38->Result 0 227
5 S51->Result 0 499
6 S37->Result 0 9211
fig = plt.figure()
plt.plot(train_time_cnt['time'].values, train_time_cnt['cnt'].values, 'b.', alpha=0.1, label='train')
plt.plot(test_time_cnt['time'].values, test_time_cnt['cnt'].values, 'r.', alpha=0.1, label='test')
plt.title('Original date values')
plt.ylabel('Number of records')
plt.xlabel('Time')
fig.savefig('original_date_values.png', dpi=300)
plt.show()
print((train_time_cnt['time'].min(), train_time_cnt['time'].max()))
print((test_time_cnt['time'].min(), test_time_cnt['time'].max()))
(0.0, 1718.48)
(0.0, 1718.49)
time_ticks = np.arange(train_time_cnt['time'].min(), train_time_cnt['time'].max() + 0.01, 0.01)
time_ticks = pd.DataFrame({'time': time_ticks})
time_ticks = pd.merge(time_ticks, train_time_cnt, how='left', on='time')
time_ticks = time_ticks.fillna(0)
# Autocorrelation
x = time_ticks['cnt'].values
max_lag = 8000
auto_corr_ks = range(1, max_lag)
auto_corr = np.array([1] + [np.corrcoef(x[:-k], x[k:])[0, 1] for k in auto_corr_ks])
fig = plt.figure()
plt.plot(auto_corr, 'k.', label='autocorrelation by 0.01')
plt.title('Train Sensor Time Auto-correlation')
period = 25
auto_corr_ks = list(range(period, max_lag, period))
auto_corr = np.array([1] + [np.corrcoef(x[:-k], x[k:])[0, 1] for k in auto_corr_ks])
plt.plot([0] + auto_corr_ks, auto_corr, 'go', alpha=0.5, label='strange autocorrelation at 0.25')
period = 1675
auto_corr_ks = list(range(period, max_lag, period))
auto_corr = np.array([1] + [np.corrcoef(x[:-k], x[k:])[0, 1] for k in auto_corr_ks])
plt.plot([0] + auto_corr_ks, auto_corr, 'ro', markersize=10, alpha=0.5, label='one week = 16.75?')
plt.xlabel('k * 0.01 - autocorrelation lag')
plt.ylabel('autocorrelation')
plt.legend(loc=0)
fig.savefig('train_time_auto_correlation.png', dpi=300)
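# Back-of-the-envelope check (an assumption, not something stated in the data):
# if the ~1675-lag period (1675 * 0.01 = 16.75 time units) really is one week,
# then a single 0.01 tick corresponds to roughly six minutes of real time.
minutes_per_week = 7 * 24 * 60
print(minutes_per_week / 1675.0)  # ~6.02 minutes per 0.01 step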
# threshold for a manageable number of features
plt.hist(clf.feature_importances_[clf.feature_importances_>0])
important_indices = np.where(clf.feature_importances_>0.005)[0]
print(important_indices)
clf = XGBClassifier(base_score=0.005)
clf.fit(X, y)
# load test data
X = np.concatenate([
    pd.read_csv("../input/test_date.csv", index_col=0, dtype=np.float32,
                usecols=np.concatenate([[0], important_indices[important_indices < 1156] + 1])).values,
    pd.read_csv("../input/test_numeric.csv", index_col=0, dtype=np.float32,
                usecols=np.concatenate([[0], important_indices[important_indices >= 1156] + 1 - 1156])).values
], axis=1)
# generate predictions at the chosen threshold
preds = (clf.predict_proba(X)[:,1] > best_threshold).astype(np.int8)
# and submit
sub = pd.read_csv("../input/sample_submission.csv", index_col=0)
sub["Response"] = preds
sub.to_csv("submission.csv.gz", compression="gzip")
numeric_cols = pd.read_csv("../input/train_numeric.csv", nrows = 1).columns.values
imp_idxs = [np.argwhere(feature_name == numeric_cols)[0][0] for feature_name in feature_names]
train = pd.read_csv("../input/train_numeric.csv",
                    index_col = 0, header = 0, usecols = [0, len(numeric_cols) - 1] + imp_idxs)
train = train[feature_names + ['Response']]
X_neg, X_pos = train[train['Response'] == 0].iloc[:, :-1], train[train['Response']==1].iloc[:, :-1]
# load entire dataset for these features.
# note where the feature indices are split so we can load the correct ones straight from read_csv
n_date_features = 1156
X = np.concatenate([
    pd.read_csv(TRAIN_DATE, index_col=0, dtype=np.float32,
                usecols=np.concatenate([[0], important_indices[important_indices < n_date_features] + 1])).values,
    pd.read_csv(TRAIN_NUMERIC, index_col=0, dtype=np.float32,
                usecols=np.concatenate([[0], important_indices[important_indices >= n_date_features] + 1 - n_date_features])).values
], axis=1)
y = pd.read_csv(TRAIN_NUMERIC, index_col=0, dtype=np.float32, usecols=[0,969]).values.ravel()
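# Why the split at 1156: important_indices refer to columns of the concatenated
# [date | numeric] matrix, whose first 1156 columns come from train_date.csv and the
# rest from train_numeric.csv; the +1 skips the Id column in each file. A quick
# sanity check (a sketch, reusing the names defined above):
date_header = pd.read_csv(TRAIN_DATE, nrows=1).columns        # 'Id' + 1156 date features
num_header = pd.read_csv(TRAIN_NUMERIC, nrows=1).columns      # 'Id' + 968 features + 'Response'
all_feats = np.concatenate([date_header[1:], num_header[1:969]])
print(all_feats[important_indices][:5])  # should reproduce the top feature names listed earlier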
# Let's check the min and max times for each station
def get_station_times(dates, withId=False):
    times = []
    cols = list(dates.columns)
    if 'Id' in cols:
        cols.remove('Id')
    for feature_name in cols:
        if withId:
            df = dates[['Id', feature_name]].copy()
            df.columns = ['Id', 'time']
        else:
            df = dates[[feature_name]].copy()
            df.columns = ['time']
        df['station'] = feature_name.split('_')[1][1:]
        df = df.dropna()
        times.append(df)
    return pd.concat(times)
station_times = get_station_times(train_date_part, withId=True).sort_values(by=['Id', 'station'])
print(station_times[:5])
print(station_times.shape)
min_station_times = station_times.groupby(['Id', 'station']).min()['time']
max_station_times = station_times.groupby(['Id', 'station']).max()['time']
print(np.mean(1. * (min_station_times == max_station_times)))
# Read station times for train and test
date_cols = train_date_part.drop('Id', axis=1).count().reset_index().sort_values(by=0, ascending=False)
date_cols['station'] = date_cols['index'].apply(lambda s: s.split('_')[1])
date_cols = date_cols.drop_duplicates('station', keep='first')['index'].tolist()
train_date = pd.read_csv(TRAIN_DATE, usecols=date_cols)
train_station_times = get_station_times(train_date, withId=False)
train_time_cnt = train_station_times.groupby('time').count()[['station']].reset_index()
train_time_cnt.columns = ['time', 'cnt']
test_date = pd.read_csv(TEST_DATE, usecols=date_cols)
test_station_times = get_station_times(test_date, withId=False)
test_time_cnt = test_station_times.groupby('time').count()[['station']].reset_index()
test_time_cnt.columns = ['time', 'cnt']
# pick the best threshold out-of-fold
thresholds = np.linspace(0.01, 0.99, 50)
mcc = np.array([matthews_corrcoef(y, preds>thr) for thr in thresholds])
plt.plot(thresholds, mcc)
best_threshold = thresholds[mcc.argmax()]
print(mcc.max())
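# Follow-up sketch (not in the original kernel): the confusion counts at the chosen
# threshold show how few positives actually get predicted on data this imbalanced,
# which is why MCC is so sensitive to the cut-off.
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y, (preds > best_threshold).astype(int)).ravel()
print("threshold={:.3f}  TN={} FP={} FN={} TP={}".format(best_threshold, tn, fp, fn, tp))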