TomHortons/act_test.csv

## act_test.csv
activity_id date    activity_category   char_1  char_2  char_3  char_4  char_5  char_6  char_7  char_8  char_9  char_10
people_id
ppl_100004  act1_249281 2022-07-20  type 1  type 5  type 10 type 5  type 1  type 6  type 1  type 1  type 7  type 4  NaN
ppl_100004  act2_230855 2022-07-20  type 5  NaN NaN NaN NaN NaN NaN NaN NaN NaN type 682
ppl_10001   act1_240724 2022-10-14  type 1  type 12 type 1  type 5  type 4  type 6  type 1  type 1  type 13 type 10 NaN

## correlation.py
# Correlation between the 'Training set' and 'Testing set' columns of each
# frequency table. Transposing makes each column one variable for corrcoef,
# and [0, 1] picks the train-vs-test entry of the 2x2 correlation matrix.
corr_date_x = np.corrcoef(date_x_freq.T)[0, 1]
print(f'Correlation of date_x distribution in training/testing sets: {corr_date_x!s}')
# Dates missing from one of the sets are counted as zero activity for date_y.
corr_date_y = np.corrcoef(date_y_freq.fillna(0).T)[0, 1]
print(f'Correlation of date_y distribution in training/testing sets: {corr_date_y!s}')

## correlation_date_y.py
# Train/test correlation of the date_y frequencies, computed separately for
# the three consecutive row ranges labelled "year 1" to "year 3".
# `i` must already be defined; presumably it is one third of the number of
# distinct date_y values (see the date_y plotting snippet) -- confirm.
date_y_thirds = (date_y_freq[:i], date_y_freq[i:2*i], date_y_freq[2*i:])
for year_no, third in enumerate(date_y_thirds, start=1):
    # Missing counts are treated as zero before correlating.
    year_corr = np.corrcoef(third.fillna(0).T)[0, 1]
    print(f'date_y correlation in year {year_no}: {year_corr!s}')

## feature_structure.py
# Per-date_x summary of the training set: the mean of `outcome` and the number
# of rows for each distinct date_x value.
outcome_by_date_x = df_train.groupby('date_x')['outcome']
date_x = pd.DataFrame({
    'Class probability': outcome_by_date_x.mean(),
    'Frequency': outcome_by_date_x.size(),
})
# 'Frequency' is drawn against a secondary y-axis so both series stay readable.
date_x.plot(secondary_y='Frequency', figsize=(20, 10))

## file0.txt
activity_id,outcome
act1_1,0
act1_100006,0
act1_100050,0
etc.

## file10.txt
Correlation of date_x distribution in training/testing sets: 0.853430807691
Correlation of date_y distribution in training/testing sets: 0.709589035055

## file12.txt
date_y correlation in year 1: 0.237056344324
date_y correlation in year 2: 0.682344221229
date_y correlation in year 3: 0.807207224857

## file14.txt
date_x_prob  AUC: 0.626182
date_y_prob  AUC: 0.720296
date_x_count AUC: 0.465697
date_y_count AUC: 0.475916

## file15.txt
import pandas as pd
import numpy as np
import datetime
from itertools import product
from scipy import interpolate ## For other interpolation functions.

## file16.txt
# Load and transform people data.
ppl = pd.read_csv('../input/people.csv')

# Convert every boolean column to integers in one pass; all other columns are
# left untouched.
ppl = ppl.astype({col: 'int' for col in ppl.select_dtypes(include=['bool']).columns})

# Parse the `date` column into datetimes.
ppl['date'] = pd.to_datetime(ppl['date'])

## file17.txt
# Load activities.
# Read the train and test activity files and stack them (train rows first).
# The test frame gets an `outcome` column filled with NaN so that both frames
# share the same columns before they are appended.
activs = pd.concat(
    [
        pd.read_csv('../input/act_train.csv'),
        pd.read_csv('../input/act_test.csv').assign(outcome=np.nan),
    ],
    axis=0,
)

## file18.txt
# Extract only required variables.
activs = activs[['people_id', 'outcome', 'activity_id', 'date']] ## Let's look at these columns only.
# Merge people data into activities.
## NOTE(review): how='right' keeps every row of `ppl` (the right-hand frame),
## not every row of `activs`. A person with no activity would appear once with
## NaN activity fields, and an activity whose people_id is absent from `ppl`
## would be dropped. If the intent is to keep all activities, this should be
## how='left' -- confirm before changing, since it also changes row order.
## Both frames have a `date` column, so the result carries `date_x` (from
## activs) and `date_y` (from ppl).
d1 = pd.merge(activs, ppl, on='people_id', how='right')

## file19.txt
## Index labels of the rows in `ppl` whose person has at least one merged row
## with a null `outcome` (null outcome is how test-set activities are marked).
testset = ppl[ppl['people_id'].isin(d1[d1['outcome'].isnull()]['people_id'])].index

## Parse the activity date (`date_x` after the merge) into a datetime column.
## This statement and the `del` below were indented as if inside a block,
## which is an IndentationError at top level.
d1['activdate'] = pd.to_datetime(d1['date_x'])

## The unmerged activities frame is no longer needed.
del activs

## file20.txt
##
##     0     1     2     3
## 25646  3687   565     1

## file5.txt
Start of date_x: 2022-07-17
  End of date_x: 2023-08-31
Range of date_x: 410 days 00:00:00

Start of date_y: 2020-05-18
  End of date_y: 2023-08-31
Range of date_y: 1200 days 00:00:00

## import_data.py
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

# People attributes, joined onto both activity files below. `date` is parsed
# as a datetime in every file.
ppl = pd.read_csv('../input/people.csv', parse_dates=['date'])

# Join each activity file with the people table on people_id (default inner
# join). Both sides have a `date` column, so the merged frames expose them as
# `date_x` (activity) and `date_y` (people).
df_train = pd.read_csv('../input/act_train.csv', parse_dates=['date']).merge(ppl, on='people_id')
df_test = pd.read_csv('../input/act_test.csv', parse_dates=['date']).merge(ppl, on='people_id')

# Only the merged frames are kept.
del ppl

## people.csv
    char_1  group_1 char_2  date    char_3  char_4  char_5  char_6  char_7  char_8  ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100 type 2  group 17304 type 2  2021-06-29  type 5  type 5  type 5  type 3  type 11 type 2  ... False   True    True    False   False   True    True    True    False   36
ppl_100002  type 2  group 8688  type 3  2021-01-06  type 28 type 9  type 5  type 3  type 11 type 2  ... False   True    True    True    True    True    True    True    False   76
ppl_100003  type 2  group 33592 type 3  2022-06-10  type 4  type 8  type 5  type 2  type 5  type 2  ... False   False   True    True    True    True    False   True    True    99

## probability_features.py
from sklearn.metrics import roc_auc_score

# For every training row, attach the mean outcome and the non-null outcome
# count of its date_x / date_y group, then score each feature on its own
# against `outcome`.
# NOTE(review): the *_prob features are derived from the same labels they are
# scored against, so these AUCs are in-sample figures.
features = pd.DataFrame()
features['date_x_prob'] = df_train.groupby('date_x')['outcome'].transform('mean')
features['date_y_prob'] = df_train.groupby('date_y')['outcome'].transform('mean')
features['date_x_count'] = df_train.groupby('date_x')['outcome'].transform('count')
features['date_y_count'] = df_train.groupby('date_y')['outcome'].transform('count')

# A plain loop instead of a list comprehension: printing is a side effect and
# the list of None values the comprehension built was never used.
for feature_name in features.columns:
    auc = roc_auc_score(df_train['outcome'], features[feature_name])
    print(feature_name.ljust(12) + ' AUC: ' + str(round(auc, 6)))

## show_day.py
# Report the earliest value, latest value and overall span of each date column
# in the training set.
for d in ('date_x', 'date_y'):
    earliest = df_train[d].min()
    latest = df_train[d].max()
    print(f'Start of {d}: {earliest.date()!s}')
    print(f'  End of {d}: {latest.date()!s}')
    print(f'Range of {d}: {latest - earliest!s}\n')

## show_day_y.py
# Per-date_y summary of the training set: the mean of `outcome` and the number
# of rows for each distinct date_y value.
outcome_by_date_y = df_train.groupby('date_y')['outcome']
date_y = pd.DataFrame({
    'Class probability': outcome_by_date_y.mean(),
    'Frequency': outcome_by_date_y.size(),
})
# The time scale is too long to show well on one graph, so the rows are drawn
# as three consecutive thirds. `i` is the number of rows per third; other
# snippets slice with the same `i`, so the name is kept.
i = len(date_y) // 3
year_plots = (
    ('date_y Year 1', slice(None, i)),
    ('date_y Year 2', slice(i, 2 * i)),
    ('date_y Year 3', slice(2 * i, None)),
)
for plot_title, rows in year_plots:
    date_y[rows].plot(secondary_y='Frequency', figsize=(20, 5), title=plot_title)

## show_test.py
# Activity counts per date_x value in the training and testing sets.
# The frame takes its index from the training counts (first column assigned);
# the testing counts are aligned to it, so a date present only in the training
# set shows up as NaN in 'Testing set'.
date_x_freq = pd.DataFrame()
date_x_freq['Training set'] = df_train.groupby('date_x')['activity_id'].count()
date_x_freq['Testing set'] = df_test.groupby('date_x')['activity_id'].count()
date_x_freq.plot(secondary_y='Testing set', figsize=(20, 8),
                 title='Comparison of date_x distribution between training/testing set')

# Same comparison keyed by date_y.
date_y_freq = pd.DataFrame()
date_y_freq['Training set'] = df_train.groupby('date_y')['activity_id'].count()
date_y_freq['Testing set'] = df_test.groupby('date_y')['activity_id'].count()
# Only the first and last thirds are plotted; `i` (rows per third) must
# already be defined by the date_y plotting snippet.
date_y_freq[:i].plot(secondary_y='Testing set', figsize=(20, 8),
                 title='Comparison of date_y distribution between training/testing set (first year)')
# The title string and the call were left unterminated; both are closed here.
date_y_freq[2*i:].plot(secondary_y='Testing set', figsize=(20, 8),
                 title='Comparison of date_y distribution between training/testing set (last year)')
	activity_id date activity_category char_1 char_2 char_3 char_4 char_5 char_6 char_7 char_8 char_9 char_10
	people_id
	ppl_100004 act1_249281 2022-07-20 type 1 type 5 type 10 type 5 type 1 type 6 type 1 type 1 type 7 type 4 NaN
	ppl_100004 act2_230855 2022-07-20 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 682
	ppl_10001 act1_240724 2022-10-14 type 1 type 12 type 1 type 5 type 4 type 6 type 1 type 1 type 13 type 10 NaN
	print('Correlation of date_x distribution in training/testing sets: ' + str(np.corrcoef(date_x_freq.T)[0,1]))
	print('Correlation of date_y distribution in training/testing sets: ' + str(np.corrcoef(date_y_freq.fillna(0).T)[0,1]))
	print('date_y correlation in year 1: ' + str(np.corrcoef(date_y_freq[:i].fillna(0).T)[0,1]))
	print('date_y correlation in year 2: ' + str(np.corrcoef(date_y_freq[i:2*i].fillna(0).T)[0,1]))
	print('date_y correlation in year 3: ' + str(np.corrcoef(date_y_freq[2*i:].fillna(0).T)[0,1]))
	date_x = pd.DataFrame()
	date_x['Class probability'] = df_train.groupby('date_x')['outcome'].mean()
	date_x['Frequency'] = df_train.groupby('date_x')['outcome'].size()
	date_x.plot(secondary_y='Frequency', figsize=(20, 10))
	Correlation of date_x distribution in training/testing sets: 0.853430807691
	Correlation of date_y distribution in training/testing sets: 0.709589035055
	date_y correlation in year 1: 0.237056344324
	date_y correlation in year 2: 0.682344221229
	date_y correlation in year 3: 0.807207224857
	date_x_prob AUC: 0.626182
	date_y_prob AUC: 0.720296
	date_x_count AUC: 0.465697
	date_y_count AUC: 0.475916
	import pandas as pd
	import numpy as np
	import datetime
	from itertools import product
	from scipy import interpolate ## For other interpolation functions.
	# Load and transform people data.
	ppl = pd.read_csv('../input/people.csv')

	# Convert booleans to integers.
	p_logi = ppl.select_dtypes(include=['bool']).columns
	ppl[p_logi] = ppl[p_logi].astype('int')
	del p_logi

	# Transform date.
	ppl['date'] = pd.to_datetime(ppl['date'])
	# Load activities.
	# Read and combine.
	activs = pd.read_csv('../input/act_train.csv')
	TestActivs = pd.read_csv('../input/act_test.csv')
	TestActivs['outcome'] = np.nan ## Add the missing column to the test set.
	activs = pd.concat([activs, TestActivs], axis=0) ## Append train and test sets.
	del TestActivs