Last active
November 24, 2016 19:44
-
-
Save TomHortons/2520fdf349b7bfb1e791df12687b23b3 to your computer and use it in GitHub Desktop.
Kaggleまとめ:BOSCH(kernels) ref: http://qiita.com/TomHortons/items/359f8e39b9cd424c2360
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from xgboost import XGBClassifier | |
from sklearn.metrics import matthews_corrcoef, roc_auc_score | |
from sklearn.cross_validation import cross_val_score, StratifiedKFold | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
%matplotlib inline | |
# I'm limited by RAM here and taking the first N rows is likely to be | |
# a bad idea for the date data since it is ordered. | |
# Sample the data in a roundabout way: | |
date_chunks = pd.read_csv(TRAIN_DATE, index_col=0, chunksize=100000, dtype=np.float32) | |
num_chunks = pd.read_csv(TRAIN_NUMERIC, index_col=0, | |
usecols=list(range(969)), chunksize=100000, dtype=np.float32) | |
X = pd.concat([pd.concat([dchunk, nchunk], axis=1).sample(frac=0.05) | |
for dchunk, nchunk in zip(date_chunks, num_chunks)]) | |
y = pd.read_csv(TRAIN_NUMERIC, index_col=0, usecols=[0,969], dtype=np.float32).loc[X.index].values.ravel() | |
X = X.values |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
clf = XGBClassifier(max_depth=5, base_score=0.005) | |
cv = StratifiedKFold(y, n_folds=3) | |
preds = np.ones(y.shape[0]) | |
for i, (train, test) in enumerate(cv): | |
preds[test] = clf.fit(X[train], y[train]).predict_proba(X[test])[:,1] | |
print("fold {}, ROC AUC: {:.3f}".format(i, roc_auc_score(y[test], preds[test]))) | |
print(roc_auc_score(y, preds)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%matplotlib inline | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
feature_names = ['L3_S38_F3960', 'L3_S33_F3865', 'L3_S38_F3956', 'L3_S33_F3857', | |
'L3_S29_F3321', 'L1_S24_F1846', 'L3_S32_F3850', 'L3_S29_F3354', | |
'L3_S29_F3324', 'L3_S35_F3889', 'L0_S1_F28', 'L1_S24_F1844', | |
'L3_S29_F3376', 'L0_S0_F22', 'L3_S33_F3859', 'L3_S38_F3952', | |
'L3_S30_F3754', 'L2_S26_F3113', 'L3_S30_F3759', 'L0_S5_F114'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%matplotlib inline | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
train_date_part = pd.read_csv(TRAIN_DATE, nrows=10000) | |
print(train_date_part.shape) | |
print(1.0 * train_date_part.count().sum() / train_date_part.size) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(10000, 1157) | |
0.177920570441 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[ 14 23 41 50 385 1019 1029 1034 1042 1056 1156 1161 1166 1171 1172 | |
1183 1203 1221 1294 1327 1350 1363 1403 1404 1482 1501 1507 1512 1535 1549 | |
1550 1843 1846 1849 1858 1879 1885 1887 1888 1891 1911 1940 1948 1951 1959 | |
1974 1975 1982 1985 1988 1993 1994 1995 1999 2006 2007 2010 2028 2040 2046 | |
2075 2093] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fold 0, ROC AUC: 0.718 | |
fold 1, ROC AUC: 0.704 | |
fold 2, ROC AUC: 0.698 | |
0.706413059353 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
0.213120930917 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
@author: Faron | |
""" | |
import pandas as pd | |
import numpy as np | |
import xgboost as xgb | |
DATA_DIR = "../input" | |
ID_COLUMN = 'Id' | |
TARGET_COLUMN = 'Response' | |
SEED = 0 | |
CHUNKSIZE = 50000 | |
NROWS = 250000 | |
TRAIN_NUMERIC = "{0}/train_numeric.csv".format(DATA_DIR) | |
TRAIN_DATE = "{0}/train_date.csv".format(DATA_DIR) | |
TEST_NUMERIC = "{0}/test_numeric.csv".format(DATA_DIR) | |
TEST_DATE = "{0}/test_date.csv".format(DATA_DIR) | |
FILENAME = "etimelhoods" | |
train = pd.read_csv(TRAIN_NUMERIC, usecols=[ID_COLUMN, TARGET_COLUMN], nrows=NROWS) | |
test = pd.read_csv(TEST_NUMERIC, usecols=[ID_COLUMN], nrows=NROWS) | |
train["StartTime"] = -1 | |
test["StartTime"] = -1 | |
nrows = 0 | |
for tr, te in zip(pd.read_csv(TRAIN_DATE, chunksize=CHUNKSIZE), pd.read_csv(TEST_DATE, chunksize=CHUNKSIZE)): | |
feats = np.setdiff1d(tr.columns, [ID_COLUMN]) | |
stime_tr = tr[feats].min(axis=1).values | |
stime_te = te[feats].min(axis=1).values | |
train.loc[train.Id.isin(tr.Id), 'StartTime'] = stime_tr | |
test.loc[test.Id.isin(te.Id), 'StartTime'] = stime_te | |
nrows += CHUNKSIZE | |
if nrows >= NROWS: | |
break | |
ntrain = train.shape[0] | |
train_test = pd.concat((train, test)).reset_index(drop=True).reset_index(drop=False) | |
train_test['magic1'] = train_test[ID_COLUMN].diff().fillna(9999999).astype(int) | |
train_test['magic2'] = train_test[ID_COLUMN].iloc[::-1].diff().fillna(9999999).astype(int) | |
train_test = train_test.sort_values(by=['StartTime', 'Id'], ascending=True) | |
train_test['magic3'] = train_test[ID_COLUMN].diff().fillna(9999999).astype(int) | |
train_test['magic4'] = train_test[ID_COLUMN].iloc[::-1].diff().fillna(9999999).astype(int) | |
train_test = train_test.sort_values(by=['index']).drop(['index'], axis=1) | |
train = train_test.iloc[:ntrain, :] | |
features = np.setdiff1d(list(train.columns), [TARGET_COLUMN, ID_COLUMN]) | |
y = train.Response.ravel() | |
train = np.array(train[features]) | |
print('train: {0}'.format(train.shape)) | |
prior = np.sum(y) / (1.*len(y)) | |
xgb_params = { | |
'seed': 0, | |
'colsample_bytree': 0.7, | |
'silent': 1, | |
'subsample': 0.7, | |
'learning_rate': 0.1, | |
'objective': 'binary:logistic', | |
'max_depth': 4, | |
'num_parallel_tree': 1, | |
'min_child_weight': 2, | |
'eval_metric': 'auc', | |
'base_score': prior | |
} | |
dtrain = xgb.DMatrix(train, label=y) | |
res = xgb.cv(xgb_params, dtrain, num_boost_round=10, nfold=4, seed=0, stratified=True, | |
early_stopping_rounds=1, verbose_eval=1, show_stdv=True) | |
cv_mean = res.iloc[-1, 0] | |
cv_std = res.iloc[-1, 1] | |
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def twoplot(df, col, xaxis=None): | |
''' scatter plot a feature split into response values as two subgraphs ''' | |
if col not in df.columns.values: | |
print('ERROR: %s not a column' % col) | |
ndf = pd.DataFrame(index = df.index) | |
ndf[col] = df[col] | |
ndf[xaxis] = df[xaxis] if xaxis else df.index | |
ndf['Response'] = df['Response'] | |
g = sns.FacetGrid(ndf, col="Response", hue="Response") | |
g.map(plt.scatter, xaxis, col, alpha=.7, s=1) | |
g.add_legend(); | |
del ndf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BATCH_SIZE = 5 | |
train_batch =[pd.melt(train[train.columns[batch: batch + BATCH_SIZE].append(np.array(['Response']))], | |
id_vars = 'Response', value_vars = feature_names[batch: batch + BATCH_SIZE]) | |
for batch in list(range(0, train.shape[1] - 1, BATCH_SIZE))] | |
FIGSIZE = (12,16) | |
_, axs = plt.subplots(len(train_batch), figsize = FIGSIZE) | |
plt.suptitle('Univariate distributions') | |
for data, ax in zip(train_batch, axs): | |
sns.violinplot(x = 'variable', y = 'value', hue = 'Response', data = data, ax = ax, split =True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
non_missing = pd.DataFrame(pd.concat([(X_neg.count()/X_neg.shape[0]).to_frame('negative samples'), | |
(X_pos.count()/X_pos.shape[0]).to_frame('positive samples'), | |
], | |
axis = 1)) | |
non_missing_sort = non_missing.sort_values(['negative samples']) | |
non_missing_sort.plot.barh(title = 'Proportion of non-missing values', figsize = FIGSIZE) | |
plt.gca().invert_yaxis() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
FIGSIZE = (13,4) | |
_, (ax1, ax2) = plt.subplots(1,2, figsize = FIGSIZE) | |
MIN_PERIODS = 100 | |
triang_mask = np.zeros((X_pos.shape[1], X_pos.shape[1])) | |
triang_mask[np.triu_indices_from(triang_mask)] = True | |
ax1.set_title('Negative Class') | |
sns.heatmap(X_neg.corr(min_periods = MIN_PERIODS), mask = triang_mask, square=True, ax = ax1) | |
ax2.set_title('Positive Class') | |
sns.heatmap(X_pos.corr(min_periods = MIN_PERIODS), mask = triang_mask, square=True, ax = ax2) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sns.heatmap(X_pos.corr(min_periods = MIN_PERIODS) -X_neg.corr(min_periods = MIN_PERIODS), | |
mask = triang_mask, square=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nan_pos, nan_neg = np.isnan(X_pos), np.isnan(X_neg) | |
triang_mask = np.zeros((X_pos.shape[1], X_pos.shape[1])) | |
triang_mask[np.triu_indices_from(triang_mask)] = True | |
FIGSIZE = (13,4) | |
_, (ax1, ax2) = plt.subplots(1,2, figsize = FIGSIZE) | |
MIN_PERIODS = 100 | |
ax1.set_title('Negative Class') | |
sns.heatmap(nan_neg.corr(), square=True, mask = triang_mask, ax = ax1) | |
ax2.set_title('Positive Class') | |
sns.heatmap(nan_pos.corr(), square=True, mask = triang_mask, ax = ax2) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sns.heatmap(nan_neg.corr() - nan_pos.corr(), mask = triang_mask, square=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(Rtsne) | |
library(ggplot2) | |
library(ggrepel) | |
cor_out <- as.matrix(fread("cor_train_numeric.csv", header = TRUE, sep = ",")) | |
gc(verbose = FALSE) | |
set.seed(78) | |
tsne_model <- Rtsne(data.frame(cor_out), | |
dims = 2, | |
#initial_dims = 50, | |
initial_dims = ncol(cor_out), | |
perplexity = 322, #floor((ncol(cor_out)-1)/3) | |
theta = 0.00, | |
check_duplicates = FALSE, | |
pca = FALSE, | |
max_iter = 1350, | |
verbose = TRUE, | |
is_distance = FALSE) | |
corMatrix_out <- as.data.frame(tsne_model$Y) | |
cor_kmeans <- kmeans(corMatrix_out, centers = 5, iter.max = 10, nstart = 3) | |
corMatrix_outclust <- as.factor(c(cor_kmeans$cluster[1:968], 6)) | |
corMatrix_names <- colnames(cor_out) | |
ggplot(corMatrix_out, aes(x = V1, y = V2, color = corMatrix_outclust, shape = corMatrix_outclust)) + geom_point(size = 2.5) + geom_rug() + stat_ellipse(type = "norm") + ggtitle("T-SNE of Features") + xlab("X") + ylab("Y") + labs(color = "Cluster", shape = "Cluster") + geom_text_repel(aes(x = V1, y = V2, label = corMatrix_names), size = 2.8) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(propagate) | |
for (i in 1:969) { | |
cat("\rStep ", i, "\n", sep = "") | |
train_data[!is.na(train_data[, i]), i] <- 1 | |
train_data[is.na(train_data[, i]), i] <- 0 | |
if (i %% 10) {gc(verbose = FALSE)} | |
} | |
gc(verbose = FALSE) | |
cor_table <- bigcor(train_data, fun = "cor", size = 194, verbose = TRUE) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Id time station | |
0 4 82.24 0 | |
0 4 82.24 0 | |
0 4 82.24 0 | |
0 4 82.24 0 | |
0 4 82.24 0 | |
(2048541, 3) | |
0.9821721580467314 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
options(warn=-1) #surpress warnings | |
#imports for data wrangling | |
library(data.table) | |
library(dtplyr) | |
library(dplyr) | |
#get the data - nrows set to 10000 to keep runtime manageable. | |
#one expansion option would be to select a time frame to visualized | |
dtNum <- fread("input/train_numeric.csv", select = c("Id", "Response"),nrows = 10000) | |
dtDate <- fread("input/train_date.csv", nrows = 10000) | |
#for each job identify which stations are passed through and for those store the minimum time | |
for (station in paste0("S",0:51)) | |
{ | |
cols = min(which((grepl(station,colnames(dtDate))))) | |
if(!cols==Inf){ | |
dtDate[,paste0(station) := dtDate[,cols,with = FALSE]] | |
} | |
} | |
#limit data to only when passed through station X | |
dtStations = dtDate[,!grepl("L",colnames(dtDate)),with=F] | |
#melt data to go from wide to long format | |
dtStationsM = melt(dtStations,id.vars=c("Id")) | |
#join with numeric to have Response | |
dtStationsM %>% | |
left_join(dtNum, by = "Id") -> dtStationsM | |
#remove NA entries - these are plentiful as after melting each station-job combination has its own row | |
dtStationsM %>% | |
filter(!is.na(value)) -> dtStationsMFiltered | |
#sort entries by ascending time | |
dtStationsMFiltered %>% | |
arrange(value) -> dtStationsMFiltered | |
#imports for plotting | |
require(GGally) | |
library(network) | |
library(sna) | |
library(ggplot2) | |
#plotting format | |
options(repr.plot.width=5, repr.plot.height=15) | |
#for each row obtain the subsequent statoin | |
dtStationsMFiltered %>% | |
group_by(Id) %>% | |
mutate(nextStation = lead(variable)) -> edgelistsComplete | |
#for each id find the first node to be entered | |
edgelistsComplete %>% | |
group_by(Id) %>% | |
filter(!(variable %in% nextStation)) %>% | |
ungroup() %>% | |
select(variable,Response) -> startingPoints | |
#prior to each starting point insert an edge from a common origin | |
colnames(startingPoints) = c("nextStation","Response") | |
startingPoints$variable = "S" | |
edgelistsComplete %>% | |
select(variable,nextStation,Response) -> paths | |
#for each id find the row where there is no next station (last station to be visited) | |
#fill this station with Response value | |
paths[is.na(nextStation)]$nextStation = paste("Result",paths[is.na(nextStation)]$Response) | |
#combine data | |
paths = rbind(startingPoints,paths) | |
paths = select(paths,-Response) | |
paths$nextStation = as.character(paths$nextStation) | |
paths$variable = as.character(paths$variable) | |
#rename columns for plotting | |
colnames(paths) <- c("Target","Source") | |
#flip columns in a costly way because ggnet is a little dumb and I am lazy | |
pathshelp = select(paths,Source) | |
pathshelp$Target = paths$Target | |
paths=pathshelp | |
#create network from edgelist | |
net = network(as.data.frame(na.omit(paths)), | |
directed = TRUE) | |
#create a station-line mapping lookup | |
LineStations = NULL | |
for (station in unique(paths$Source)){ | |
if(station!="S") | |
{ | |
x=paste0("_",station,"_") | |
y=head(colnames(dtDate)[which(grepl(x,colnames(dtDate)))],1) | |
y=strsplit(y,"_")[[1]][1] | |
LineStations = rbind(LineStations,data.frame(Node=station,Line=y)) | |
} | |
} | |
LineStations = rbind(LineStations,data.frame(Node=c("Result 1","Result 0","S"),Line=c("Outcome","Outcome","START"))) | |
#merge station-line mapping into graph for coloring purposes | |
x = data.frame(Node = network.vertex.names(net)) | |
x = merge(x, LineStations, by = "Node", sort = FALSE)$Line | |
net %v% "line" = as.character(x) | |
#setup station coordinates analogue to @JohnM | |
nodeCoordinates=data.frame(label=c("S","S0","S1","S2","S3","S4","S5","S6", | |
"S7","S8","S9","S10","S11","S12","S13", | |
"S14","S15","S16","S17","S18","S19", | |
"S20","S21","S22","S23","S24","S25", | |
"S26","S27","S28","S29","S30","S31", | |
"S32","S33","S34","S35","S36","S37", | |
"S38","S39","S40","S41","S43", | |
"S44","S45","S47","S48","S49", | |
"S50","S51","Result 0","Result 1"), | |
y=c(0, | |
1,2,3,3,4,4,5,5,6,7,7,7, | |
1,2,3,3,4,4,5,5,6,7,7,7, | |
6,6,7,7,7, | |
8,9,10,10,10,11,11,12,13,14, | |
8,9,10,11,11,12,13,14,15,15,16, | |
17,17), | |
x=c(5, | |
9,9,10,8,10,8,10,8,7,10,9,8, | |
5,5,6,4,6,4,6,4,5,6,5,4, | |
2,0,2,1,0, | |
7,7,8,7,6,8,6,7,7,7, | |
3,3,3,4,2,3,3,3,4,2,3, | |
7,3)) | |
nodeCoordinates$y = -3 * nodeCoordinates$y | |
#setup initial plot | |
network = ggnet2(net) | |
#grab node list from initial plot and attach coordinates | |
netCoordinates = select(network$data,label) | |
netCoordinates = left_join(netCoordinates,nodeCoordinates,by = "label") | |
netCoordinates = as.matrix(select(netCoordinates,x,y)) | |
#setup plot with manual layout | |
network = ggnet2(net, | |
alpha = 0.75, size = "indegree", | |
label = T, size.cut = 4, | |
color = "line",palette = "Set1", | |
mode = netCoordinates, | |
edge.alpha = 0.5, edge.size = 1, | |
legend.position = "bottom") | |
#output plot on graphics device | |
print(network) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#obtain summary statistic of number of edges on each station pair | |
pathshelp %>% | |
mutate(stationpair=paste0(Source,"->",Target)) %>% | |
group_by(stationpair) %>% | |
summarize(count=n()) %>% | |
arrange(-count) %>% | |
print(n=10) | |
pathshelp %>% | |
mutate(stationpair=paste0(Source,"->",Target)) %>% | |
group_by(stationpair) %>% | |
summarize(count=n()) %>% | |
arrange(count) %>% | |
print(n=10) | |
pathshelp %>% | |
filter(Target=="Result 1") %>% | |
mutate(stationpair=paste0(Source,"->",Target)) %>% | |
group_by(stationpair) %>% | |
summarize(count=n()) %>% | |
arrange(count) %>% | |
print(n=20) | |
pathshelp %>% | |
filter(Target=="Result 0") %>% | |
mutate(stationpair=paste0(Source,"->",Target)) %>% | |
group_by(stationpair) %>% | |
summarize(count=n()) %>% | |
arrange(count) %>% | |
print(n=20) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# tbl_dt [162 × 2] | |
stationpair count | |
<chr> <int> | |
1 S29->S30 9452 | |
2 S33->S34 9421 | |
3 S37->Result 0 9211 | |
4 S30->S33 8977 | |
5 S->S0 5733 | |
6 S0->S1 5726 | |
7 S36->S37 4827 | |
8 S34->S36 4801 | |
9 S35->S37 4646 | |
10 S34->S35 4635 | |
# ... with 152 more rows | |
Source: local data table [162 x 2] | |
# tbl_dt [162 × 2] | |
stationpair count | |
<chr> <int> | |
1 S->S30 1 | |
2 S22->S23 1 | |
3 S6->S10 1 | |
4 S10->S40 1 | |
5 S16->S17 1 | |
6 S28->S30 1 | |
7 S21->S23 1 | |
8 S2->S3 1 | |
9 S24->S25 1 | |
10 S4->S5 1 | |
# ... with 152 more rows | |
Source: local data table [3 x 2] | |
# tbl_dt [3 × 2] | |
stationpair count | |
<chr> <int> | |
1 S51->Result 1 2 | |
2 S38->Result 1 4 | |
3 S37->Result 1 47 | |
Source: local data table [6 x 2] | |
# tbl_dt [6 × 2] | |
stationpair count | |
<chr> <int> | |
1 S50->Result 0 1 | |
2 S35->Result 0 3 | |
3 S36->Result 0 3 | |
4 S38->Result 0 227 | |
5 S51->Result 0 499 | |
6 S37->Result 0 9211 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fig = plt.figure() | |
plt.plot(train_time_cnt['time'].values, train_time_cnt['cnt'].values, 'b.', alpha=0.1, label='train') | |
plt.plot(test_time_cnt['time'].values, test_time_cnt['cnt'].values, 'r.', alpha=0.1, label='test') | |
plt.title('Original date values') | |
plt.ylabel('Number of records') | |
plt.xlabel('Time') | |
fig.savefig('original_date_values.png', dpi=300) | |
plt.show() | |
print((train_time_cnt['time'].min(), train_time_cnt['time'].max())) | |
print((test_time_cnt['time'].min(), test_time_cnt['time'].max())) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(0.0, 1718.48) | |
(0.0, 1718.49) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
time_ticks = np.arange(train_time_cnt['time'].min(), train_time_cnt['time'].max() + 0.01, 0.01) | |
time_ticks = pd.DataFrame({'time': time_ticks}) | |
time_ticks = pd.merge(time_ticks, train_time_cnt, how='left', on='time') | |
time_ticks = time_ticks.fillna(0) | |
# Autocorrelation | |
x = time_ticks['cnt'].values | |
max_lag = 8000 | |
auto_corr_ks = range(1, max_lag) | |
auto_corr = np.array([1] + [np.corrcoef(x[:-k], x[k:])[0, 1] for k in auto_corr_ks]) | |
fig = plt.figure() | |
plt.plot(auto_corr, 'k.', label='autocorrelation by 0.01') | |
plt.title('Train Sensor Time Auto-correlation') | |
period = 25 | |
auto_corr_ks = list(range(period, max_lag, period)) | |
auto_corr = np.array([1] + [np.corrcoef(x[:-k], x[k:])[0, 1] for k in auto_corr_ks]) | |
plt.plot([0] + auto_corr_ks, auto_corr, 'go', alpha=0.5, label='strange autocorrelation at 0.25') | |
period = 1675 | |
auto_corr_ks = list(range(period, max_lag, period)) | |
auto_corr = np.array([1] + [np.corrcoef(x[:-k], x[k:])[0, 1] for k in auto_corr_ks]) | |
plt.plot([0] + auto_corr_ks, auto_corr, 'ro', markersize=10, alpha=0.5, label='one week = 16.75?') | |
plt.xlabel('k * 0.01 - autocorrelation lag') | |
plt.ylabel('autocorrelation') | |
plt.legend(loc=0) | |
fig.savefig('train_time_auto_correlation.png', dpi=300) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# threshold for a manageable number of features | |
plt.hist(clf.feature_importances_[clf.feature_importances_>0]) | |
important_indices = np.where(clf.feature_importances_>0.005)[0] | |
print(important_indices) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
clf = XGBClassifier(base_score=0.005) | |
clf.fit(X, y) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load test data | |
X = np.concatenate([ | |
pd.read_csv("../input/test_date.csv", index_col=0, dtype=np.float32, | |
usecols=np.concatenate([[0], important_indices[important_indices<1156]+1])).values, | |
pd.read_csv("../input/test_numeric.csv", index_col=0, dtype=np.float32, | |
usecols=np.concatenate([[0], important_indices[important_indices>=1156] +1 - 1156])).values | |
], axis=1) | |
# generate predictions at the chosen threshold | |
preds = (clf.predict_proba(X)[:,1] > best_threshold).astype(np.int8) | |
# and submit | |
sub = pd.read_csv("../input/sample_submission.csv", index_col=0) | |
sub["Response"] = preds | |
sub.to_csv("submission.csv.gz", compression="gzip") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
numeric_cols = pd.read_csv("../input/train_numeric.csv", nrows = 1).columns.values | |
imp_idxs = [np.argwhere(feature_name == numeric_cols)[0][0] for feature_name in feature_names] | |
train = pd.read_csv("../input/train_numeric.csv", | |
index_col = 0, header = 0, usecols = [0, len(numeric_cols) - 1] + imp_idxs) | |
train = train[feature_names + ['Response']] | |
X_neg, X_pos = train[train['Response'] == 0].iloc[:, :-1], train[train['Response']==1].iloc[:, :-1] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load entire dataset for these features. | |
# note where the feature indices are split so we can load the correct ones straight from read_csv | |
n_date_features = 1156 | |
X = np.concatenate([ | |
pd.read_csv(TRAIN_DATE, index_col=0, dtype=np.float32, | |
usecols=np.concatenate([[0], important_indices[important_indices < n_date_features] + 1])).values, | |
pd.read_csv(TRAIN_NUMERIC, index_col=0, dtype=np.float32, | |
usecols=np.concatenate([[0], important_indices[important_indices >= n_date_features] + 1 - 1156])).values | |
], axis=1) | |
y = pd.read_csv(TRAIN_NUMERIC, index_col=0, dtype=np.float32, usecols=[0,969]).values.ravel() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's check the min and max times for each station | |
def get_station_times(dates, withId=False): | |
times = [] | |
cols = list(dates.columns) | |
if 'Id' in cols: | |
cols.remove('Id') | |
for feature_name in cols: | |
if withId: | |
df = dates[['Id', feature_name]].copy() | |
df.columns = ['Id', 'time'] | |
else: | |
df = dates[[feature_name]].copy() | |
df.columns = ['time'] | |
df['station'] = feature_name.split('_')[1][1:] | |
df = df.dropna() | |
times.append(df) | |
return pd.concat(times) | |
station_times = get_station_times(train_date_part, withId=True).sort_values(by=['Id', 'station']) | |
print(station_times[:5]) | |
print(station_times.shape) | |
min_station_times = station_times.groupby(['Id', 'station']).min()['time'] | |
max_station_times = station_times.groupby(['Id', 'station']).max()['time'] | |
print(np.mean(1. * (min_station_times == max_station_times))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read station times for train and test | |
date_cols = train_date_part.drop('Id', axis=1).count().reset_index().sort_values(by=0, ascending=False) | |
date_cols['station'] = date_cols['index'].apply(lambda s: s.split('_')[1]) | |
date_cols = date_cols.drop_duplicates('station', keep='first')['index'].tolist() | |
train_date = pd.read_csv(TRAIN_DATE, usecols=date_cols) | |
train_station_times = get_station_times(train_date, withId=False) | |
train_time_cnt = train_station_times.groupby('time').count()[['station']].reset_index() | |
train_time_cnt.columns = ['time', 'cnt'] | |
test_date = pd.read_csv(TEST_DATE, usecols=date_cols) | |
test_station_times = get_station_times(test_date, withId=False) | |
test_time_cnt = test_station_times.groupby('time').count()[['station']].reset_index() | |
test_time_cnt.columns = ['time', 'cnt'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pick the best threshold out-of-fold | |
thresholds = np.linspace(0.01, 0.99, 50) | |
mcc = np.array([matthews_corrcoef(y, preds>thr) for thr in thresholds]) | |
plt.plot(thresholds, mcc) | |
best_threshold = thresholds[mcc.argmax()] | |
print(mcc.max()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment