Skip to content

Instantly share code, notes, and snippets.

@TomHortons
Created January 12, 2017 22:52
Show Gist options
  • Save TomHortons/6c78706b96077cbf6f23bdbd41c58bf9 to your computer and use it in GitHub Desktop.
Save TomHortons/6c78706b96077cbf6f23bdbd41c58bf9 to your computer and use it in GitHub Desktop.
Kaggleまとめ:Redhat(前編) ref: http://qiita.com/TomHortons/items/039852bca3714b43e887
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
activity_id date activity_category char_1 char_2 char_3 char_4 char_5 char_6 char_7 char_8 char_9 char_10
people_id
ppl_100004 act1_249281 2022-07-20 type 1 type 5 type 10 type 5 type 1 type 6 type 1 type 1 type 7 type 4 NaN
ppl_100004 act2_230855 2022-07-20 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 682
ppl_10001 act1_240724 2022-10-14 type 1 type 12 type 1 type 5 type 4 type 6 type 1 type 1 type 13 type 10 NaN
# Correlation between the 'Training set' and 'Testing set' count columns of the
# per-date frequency tables. np.corrcoef returns NaN if any input value is NaN,
# so missing counts are replaced by 0 first -- for date_x as well as date_y,
# so that all five figures are computed the same way.
# `i` is the length of one third of the date_y table (set where date_y is built).
print('Correlation of date_x distribution in training/testing sets: ' + str(np.corrcoef(date_x_freq.fillna(0).T)[0,1]))
print('Correlation of date_y distribution in training/testing sets: ' + str(np.corrcoef(date_y_freq.fillna(0).T)[0,1]))
# Same statistic on the first, middle and last third of the date_y rows.
print('date_y correlation in year 1: ' + str(np.corrcoef(date_y_freq[:i].fillna(0).T)[0,1]))
print('date_y correlation in year 2: ' + str(np.corrcoef(date_y_freq[i:2*i].fillna(0).T)[0,1]))
print('date_y correlation in year 3: ' + str(np.corrcoef(date_y_freq[2*i:].fillna(0).T)[0,1]))
# Per-date_x summary of the training outcome: its mean ('Class probability')
# and the number of rows ('Frequency'), drawn on one chart with the row count
# on a secondary y-axis.
outcome_by_date_x = df_train.groupby('date_x')['outcome']
date_x = outcome_by_date_x.mean().to_frame('Class probability')
date_x['Frequency'] = outcome_by_date_x.size()
del outcome_by_date_x
date_x.plot(secondary_y='Frequency', figsize=(20, 10))
activity_id,outcome
act1_1,0
act1_100006,0
act1_100050,0
etc.
Correlation of date_x distribution in training/testing sets: 0.853430807691
Correlation of date_y distribution in training/testing sets: 0.709589035055
date_y correlation in year 1: 0.237056344324
date_y correlation in year 2: 0.682344221229
date_y correlation in year 3: 0.807207224857
date_x_prob AUC: 0.626182
date_y_prob AUC: 0.720296
date_x_count AUC: 0.465697
date_y_count AUC: 0.475916
import pandas as pd
import numpy as np
import datetime
from itertools import product
from scipy import interpolate ## For other interpolation functions.
# ---- People table ----
ppl = pd.read_csv('../input/people.csv')
# Recast every boolean column as an integer column.
bool_cols = ppl.select_dtypes(include=['bool']).columns
ppl[bool_cols] = ppl[bool_cols].astype('int')
del bool_cols
# Parse the people-level date strings into datetimes.
ppl['date'] = pd.to_datetime(ppl['date'])

# ---- Activity table ----
# The test file gets an all-NaN 'outcome' column so it has the same columns as
# the training file, then both are stacked with the training rows first.
train_acts = pd.read_csv('../input/act_train.csv')
test_acts = pd.read_csv('../input/act_test.csv')
test_acts['outcome'] = np.nan
activs = pd.concat([train_acts, test_acts], axis=0)
del train_acts, test_acts
# Only these four columns are used from here on.
activs = activs[['people_id', 'outcome', 'activity_id', 'date']]

# ---- Join people onto activities ----
# Both frames carry a 'date' column, so after the merge the activity date is
# 'date_x' and the people date is 'date_y'.
# NOTE(review): how='right' retains every row of `ppl`; the previous comment
# here said the merge keeps all rows from activities, which would be
# how='left'. Behavior is left unchanged -- confirm which was intended.
d1 = activs.merge(ppl, on='people_id', how='right')
# Index labels of the `ppl` rows whose person has at least one merged row with
# a missing outcome (i.e. rows that came from the test file, given the NaN
# column added above).
no_outcome = d1['outcome'].isnull()
in_test = ppl['people_id'].isin(d1.loc[no_outcome, 'people_id'])
testset = ppl[in_test].index
del no_outcome, in_test
# Activity date as a datetime column.
d1['activdate'] = pd.to_datetime(d1['date_x'])
del activs
##
## 0 1 2 3
## 25646 3687 565 1
Start of date_x: 2022-07-17
End of date_x: 2023-08-31
Range of date_x: 410 days 00:00:00
Start of date_y: 2020-05-18
End of date_y: 2023-08-31
Range of date_y: 1200 days 00:00:00
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
# Read the three input tables; the 'date' column of each is parsed to datetime
# while reading.
ppl = pd.read_csv('../input/people.csv', parse_dates=['date'])
train = pd.read_csv('../input/act_train.csv', parse_dates=['date'])
test = pd.read_csv('../input/act_test.csv', parse_dates=['date'])
# Inner-join the people attributes onto each activity table. Both sides have a
# 'date' column, so the results hold 'date_x' (activity) and 'date_y' (people).
df_train = train.merge(ppl, how='inner', on='people_id')
df_test = test.merge(ppl, how='inner', on='people_id')
# The raw tables are no longer needed once merged.
del train
del test
del ppl
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
char_1 group_1 char_2 date char_3 char_4 char_5 char_6 char_7 char_8 ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100 type 2 group 17304 type 2 2021-06-29 type 5 type 5 type 5 type 3 type 11 type 2 ... False True True False False True True True False 36
ppl_100002 type 2 group 8688 type 3 2021-01-06 type 28 type 9 type 5 type 3 type 11 type 2 ... False True True True True True True True False 76
ppl_100003 type 2 group 33592 type 3 2022-06-10 type 4 type 8 type 5 type 2 type 5 type 2 ... False False True True True True False True True 99
from sklearn.metrics import roc_auc_score
# Build four per-row features from the two date columns of the training data:
# the mean outcome and the row count of all rows sharing the same date.
# Note the '*_prob' features are computed from the very outcome column they
# are scored against below.
features = pd.DataFrame()
features['date_x_prob'] = df_train.groupby('date_x')['outcome'].transform('mean')
features['date_y_prob'] = df_train.groupby('date_y')['outcome'].transform('mean')
features['date_x_count'] = df_train.groupby('date_x')['outcome'].transform('count')
features['date_y_count'] = df_train.groupby('date_y')['outcome'].transform('count')
# Report the ROC AUC of each single feature against the training outcome.
# A plain loop is used because the prints are side effects, not a collection.
for feature_name in features.columns:
    auc = roc_auc_score(df_train['outcome'], features[feature_name])
    print(feature_name.ljust(12) + ' AUC: ' + str(round(auc, 6)))
# Print the earliest date, latest date and total span of each date column in
# the training data. The loop body must be indented to be valid Python.
for d in ['date_x', 'date_y']:
    first_date, last_date = df_train[d].min(), df_train[d].max()
    print('Start of ' + d + ': ' + str(first_date.date()))
    print(' End of ' + d + ': ' + str(last_date.date()))
    print('Range of ' + d + ': ' + str(last_date - first_date) + '\n')
# Per-date_y summary of the training outcome: its mean ('Class probability')
# and the number of rows ('Frequency').
outcome_by_date_y = df_train.groupby('date_y')['outcome']
date_y = outcome_by_date_y.mean().to_frame('Class probability')
date_y['Frequency'] = outcome_by_date_y.size()
del outcome_by_date_y
# The time span is too long to read on one chart, so it is drawn as three.
# `i` is one third of the number of distinct dates; the charts are titled
# "Year 1..3" but the split is by row thirds, not by calendar year.
i = len(date_y) // 3
date_y.iloc[:i].plot(secondary_y='Frequency', figsize=(20, 5), title='date_y Year 1')
date_y.iloc[i:2 * i].plot(secondary_y='Frequency', figsize=(20, 5), title='date_y Year 2')
date_y.iloc[2 * i:].plot(secondary_y='Frequency', figsize=(20, 5), title='date_y Year 3')
# Number of activities per date_x in the training and testing sets.
# The table is indexed by the training dates; the testing counts are aligned
# to that index when assigned, so a date with no testing rows becomes NaN.
date_x_freq = df_train.groupby('date_x')['activity_id'].count().to_frame('Training set')
date_x_freq['Testing set'] = df_test.groupby('date_x')['activity_id'].count()
# Testing counts go on a secondary y-axis so both curves are readable.
date_x_freq.plot(
    secondary_y='Testing set',
    figsize=(20, 8),
    title='Comparison of date_x distribution between training/testing set',
)
# Number of activities per date_y in the training and testing sets.
# The table is indexed by the training dates; testing counts are aligned to
# that index on assignment, so dates without testing rows hold NaN.
date_y_freq = pd.DataFrame()
date_y_freq['Training set'] = df_train.groupby('date_y')['activity_id'].count()
date_y_freq['Testing set'] = df_test.groupby('date_y')['activity_id'].count()
# Plot the first and the last third of the dates (`i` is one third of the
# number of date_y rows, set where the date_y table is built). Testing counts
# use a secondary y-axis.
date_y_freq[:i].plot(secondary_y='Testing set', figsize=(20, 8),
                     title='Comparison of date_y distribution between training/testing set (first year)')
# The closing quote and parenthesis of this call were missing.
date_y_freq[2*i:].plot(secondary_y='Testing set', figsize=(20, 8),
                       title='Comparison of date_y distribution between training/testing set (last year)')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment