Skip to content

Instantly share code, notes, and snippets.

@TomHortons
Last active November 22, 2016 07:34
Show Gist options
  • Save TomHortons/0a861f4d731f15f2e7f9c2ab7a60e7cc to your computer and use it in GitHub Desktop.
Save TomHortons/0a861f4d731f15f2e7f9c2ab7a60e7cc to your computer and use it in GitHub Desktop.
Kaggle まとめ: BOSCH (intro + forum discussion) ref: http://qiita.com/TomHortons/items/e8a7cea90226bd5ed32f
# Preview the categorical and date training files.
# NOTE: this snippet relies on `pd` (pandas) and on the TRAIN_CAT / TRAIN_DATE
# path constants already being defined by the surrounding session; it does not
# set them itself.
# Output uses single-argument print(...) so it runs on Python 2 and Python 3
# alike while producing the same text as the old `print a, b` statements.

# One data row is enough to obtain the header, i.e. the column names.
cat_cols = pd.read_csv(TRAIN_CAT, nrows=1).columns.values
print('{0} {1}'.format('cat_cols: ', cat_cols))
print('{0} {1}'.format('cat_cols.shape: ', cat_cols.shape))

# Load only the first two header columns to keep the read cheap.
cats = pd.read_csv(TRAIN_CAT, usecols=cat_cols[:2].tolist())
print('{0} {1}'.format('cats.shape: ', cats.shape))
print(cats)

# Same procedure for the date file: header first, then the first two columns.
date_cols = pd.read_csv(TRAIN_DATE, nrows=1).columns.values
date = pd.read_csv(TRAIN_DATE, usecols=date_cols[:2].tolist())
print('{0} {1}'.format('date_cols.shape: ', date_cols.shape))
print(date_cols)
print('{0} {1}'.format('date.shape: ', date.shape))
print(date)
import numpy as np
import pandas as pd
# Preview the numeric training file.
# NOTE: relies on the TRAIN_NUMERIC path constant being defined by the
# surrounding session; it is not set in this snippet.
# Output uses single-argument print(...) so it runs on Python 2 and Python 3
# alike while producing the same text as the old `print a, b` statements.

# One data row is enough to obtain the header, i.e. the column names.
numeric_cols = pd.read_csv(TRAIN_NUMERIC, nrows=1).columns.values
print(numeric_cols)
print('{0} {1}'.format('cols.shape: ', numeric_cols.shape))

# Load only the first two header columns together with the 'Response' column.
F0 = pd.read_csv(
    TRAIN_NUMERIC,
    usecols=numeric_cols[:2].tolist() + ['Response'],
)
print('{0} {1}'.format('F0.shape: ', F0.shape))
Id,Response
1,0
2,1
3,0
etc.
array(['Id', 'L0_S0_F0', 'L0_S0_F2', 'L0_S0_F4', 'L0_S0_F6', 'L0_S0_F8',
'L0_S0_F10', 'L0_S0_F12', 'L0_S0_F14', 'L0_S0_F16', 'L0_S0_F18',
'L0_S0_F20', 'L0_S0_F22', 'L0_S1_F24', 'L0_S1_F28', 'L0_S2_F32',
'L0_S2_F36', 'L0_S2_F40', 'L0_S2_F44', 'L0_S2_F48', 'L0_S2_F52',
'L0_S2_F56', 'L0_S2_F60', 'L0_S2_F64', 'L0_S3_F68', 'L0_S3_F72',
.....
'L3_S50_F4245', 'L3_S50_F4247', 'L3_S50_F4249', 'L3_S50_F4251',
'L3_S50_F4253', 'L3_S51_F4256', 'L3_S51_F4258', 'L3_S51_F4260',
'L3_S51_F4262', 'Response'], dtype=object)
cols.shape: (970,)
F0.shape: (1183747, 2)
Id L0_S0_F0 Response
0 4 0.030 0
1 6 NaN 0
2 7 0.088 0
3 9 -0.036 0
cat_cols: ['Id' 'L0_S1_F25' 'L0_S1_F27' ..., 'L3_S49_F4237' 'L3_S49_F4239'
'L3_S49_F4240']
cat_cols.shape: (2141,)
cats.shape: (1183747, 2)
Id L0_S1_F25
0 4 NaN
1 6 NaN
2 7 NaN
3 9 NaN
4 11 NaN
5 13 NaN
6 14 NaN
7 16 NaN
8 18 NaN
date_cols.shape: (1157,)
['Id' 'L0_S0_D1' 'L0_S0_D3' ..., 'L3_S51_D4259' 'L3_S51_D4261'
'L3_S51_D4263']
date.shape: (1183747, 2)
Id L0_S0_D1
0 4 82.24
1 6 NaN
2 7 1618.70
3 9 1149.20
4 11 602.64
5 13 1331.66
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
# Draw split violin plots (Response 0 vs. 1) of the numeric features.
# The header is walked in slices of COL_BATCH columns; each slice is read from
# the CSV on its own and saved as one "violin_<n>.jpg" figure, so the full
# table is never loaded at once.
DATA_DIR = "../input"
TRAIN_NUMERIC = "{0}/train_numeric.csv".format(DATA_DIR)
TEST_NUMERIC = "{0}/test_numeric.csv".format(DATA_DIR)

COL_BATCH = 100  # header columns consumed per CSV read / per saved figure
BATCH_SIZE = 10  # features drawn on each subplot row
# `//` keeps the figure size an integer on both Python 2 and Python 3.
FIGSIZE = (3 * BATCH_SIZE, 4 * (COL_BATCH // BATCH_SIZE))

# One data row is enough to obtain the header, i.e. the column names.
numeric_cols = pd.read_csv(TRAIN_NUMERIC, nrows=1).columns.values

# Ceiling division so the trailing, partially filled slice is plotted too.
n_batches = (len(numeric_cols) + COL_BATCH - 1) // COL_BATCH
for n_ in range(n_batches):
    start = n_ * COL_BATCH
    # 'Id' and 'Response' are bookkeeping columns, not features: strip them
    # from the slice and request them explicitly below, so every batch is
    # indexed by 'Id' and 'Response' is never requested twice.
    cols = [
        c for c in numeric_cols[start:start + COL_BATCH].tolist()
        if c not in ('Id', 'Response')
    ]
    if not cols:
        continue
    train = pd.read_csv(
        TRAIN_NUMERIC,
        index_col='Id',
        usecols=['Id'] + cols + ['Response'],
    )

    # Long-format frames (Response, variable, value), one per subplot row.
    dummy = []
    for n in range(0, len(cols), BATCH_SIZE):
        data_cols = cols[n:n + BATCH_SIZE]
        dummy.append(pd.melt(train, id_vars='Response', value_vars=data_cols))

    # squeeze=False always yields an array of axes, even for a single row.
    fig, axs = plt.subplots(len(dummy), figsize=FIGSIZE, squeeze=False)
    for data, ax in zip(dummy, axs.ravel()):
        sns.violinplot(x='variable', y='value', hue='Response',
                       data=data, ax=ax, split=True)
    fig.savefig("violin_{0}.jpg".format(n_))
    # Release the figure; otherwise every batch keeps its canvas in memory.
    plt.close(fig)
import pandas as pd
import numpy as np
import seaborn as sns
# Pair plot of a hand-picked set of numeric features, coloured by Response.
# NOTE: relies on the TRAIN_NUMERIC path constant being defined beforehand.
features_names = [
    'L0_S11_F298', 'L1_S24_F1672', 'L1_S24_F766', 'L1_S24_F1844',
    'L1_S24_F1632', 'L1_S24_F1723', 'L1_S24_F1846', 'L1_S25_F2761',
    'L1_S25_F2193',
]

# 'Id' is not requested, so the frame simply gets a default integer index
# (the former index_col=0 + reset_index() round trip produced the same frame).
features = pd.read_csv(TRAIN_NUMERIC, usecols=features_names + ['Response'])

# Every column except the trailing 'Response' is a feature; replace missing
# values with that column's median. fillna avoids the chained-indexing
# assignment, which is not guaranteed to write back into the frame.
feature_cols = features.columns[:-1]
features[feature_cols] = features[feature_cols].fillna(
    features[feature_cols].median()
)

X_neg = features[features['Response'] == 0]
X_pos = features[features['Response'] == 1]
# Size of the smaller class. NOTE(review): computed but not used below --
# presumably meant for balancing the two classes; confirm intent.
volumes = min(len(X_pos), len(X_neg))

# Positive rows first, then negative rows, with a fresh integer index.
features = pd.concat([X_pos, X_neg]).reset_index(drop=True)

# Plot every feature column (all but 'Response'); the original referenced an
# undefined `test` frame here.
g = sns.pairplot(features, hue="Response",
                 vars=features.columns.tolist()[:-1], markers='.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment