Skip to content

Instantly share code, notes, and snippets.

@lucdangelis
Last active June 10, 2020 06:26
Show Gist options
  • Save lucdangelis/602826161ee2b0e88b8922879b8ceadb to your computer and use it in GitHub Desktop.
Save lucdangelis/602826161ee2b0e88b8922879b8ceadb to your computer and use it in GitHub Desktop.
#Rolling cross-validation windows: every window starts on 1 April and spans
#three years (two years of calibration followed by a one-year holdout).
_first_years = range(2013, 2017)
date_start = ['{}-04-01'.format(year) for year in _first_years]
calibration_period_end = ['{}-03-31'.format(year + 2) for year in _first_years]
date_end = ['{}-03-31'.format(year + 3) for year in _first_years]
#Per-window results, appended to in window order by the cross-validation loop
accuracies_1y, holdouts_1y, predictions_1y = [], [], []
#Cross-validation settings
sample_size = ... #placeholder (Ellipsis): replace with the number of customers to sample in each iteration
t_12 = 12 #units of time in holdout period
#Run one calibration/holdout fit per cross-validation window.
#The parsed date column is identical for every window, so parse it once.
transaction_dates = pd.to_datetime(data['date'])
for i, (window_start, cal_end, window_end) in enumerate(zip(date_start, calibration_period_end, date_end)):
    print('Loop n: ', i)
    #Keep the transactions inside this window (both bounds inclusive) from the
    #transaction dataframe (data) containing all transactions from 2013 onward.
    in_window = ((transaction_dates >= pd.to_datetime(window_start))
                 & (transaction_dates <= pd.to_datetime(window_end)))
    filtered_df = data[in_window]
    #Draw sample_size distinct customers and keep only their transactions
    sample = pd.DataFrame({'customer_id': filtered_df['customer_id'].unique()}).sample(sample_size)
    cv_df = pd.merge(filtered_df, sample, on='customer_id', how='inner')
    #12 months holdout - 2 years calibration
    cal_hold = calibration_and_holdout_data(cv_df, 'customer_id', 'date',
                                            calibration_period_end=cal_end,
                                            observation_period_end=window_end,
                                            freq='M')
    print('Cal_hol dataset n:', i)
    #BG/NBD model
    #ModifiedBetaGeoFitter assigns probability being alive <> 1 to customers
    #who did 1 purchase: https://github.com/CamDavidsonPilon/lifetimes/issues/173
    bgf_loop = BetaGeoFitter(penalizer_coef=0.001)
    bgf_loop.fit(cal_hold['frequency_cal'], cal_hold['recency_cal'], cal_hold['T_cal'])
    print(bgf_loop)
    print(bgf_loop.summary)
    #Results: expected purchases per customer over the next t_12 periods
    cal_hold['predicted_purchases_12t'] = bgf_loop.conditional_expected_number_of_purchases_up_to_time(
        t_12,
        cal_hold['frequency_cal'],
        cal_hold['recency_cal'],
        cal_hold['T_cal'])
    holdout = cal_hold['frequency_holdout'].sum()
    prediction = cal_hold['predicted_purchases_12t'].sum()
    #Signed relative error of the total (positive = over-prediction). Computed
    #once so the printed figure and the stored/plotted figure always agree.
    relative_error = (prediction - holdout) / holdout
    accuracies_1y.append(relative_error)
    holdouts_1y.append(holdout)
    predictions_1y.append(prediction)
    print('Results: holdout: ', holdout, ', prediction: ', prediction, ', % Error: ', relative_error)
#Plot Cross Validation Results: holdout vs predicted totals as paired bars,
#with the relative prediction error as a line on a secondary y-axis.
plt.rcParams['figure.figsize'] = [15, 8]
plt.rcParams.update({'font.size': 20})
N = len(holdouts_1y)
t = ['2015','2016','2017','2018'] #one tick label per cross-validation window
ind = np.arange(N) #x position of the centre of each holdout/prediction pair
width = 0.15
fig, ax = plt.subplots()
#Bars are centre-aligned, so the two bars of a pair sit symmetrically around ind
rects1 = ax.bar(ind - width / 2, holdouts_1y, width, color='#4472C4')
rects2 = ax.bar(ind + width / 2, predictions_1y, width, color='#ED7C32')
ax.set_ylabel('Transactions')
ax.set_title('4 Periods Cross Validation - 2 years Calibration & 1 year Holdout')
#Tick at the centre of each pair (ind), not under the prediction bar
ax.set_xticks(ind)
ax.set_xticklabels(t)
ax.legend(('Holdout', 'Prediction'), loc=3)
#Secondary axis: the error has a different scale from the transaction counts
ax2 = ax.twinx()
ax2.plot(ind, accuracies_1y, marker='d', color='#8FAADC')
ax2.set_ylabel('Prediction Error')
ax2.legend(['Prediction Error'], loc=4)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment