Uses the BTYD package and Pareto/NBD model to predict customer behavior in R. Slides are at: http://www.slideshare.net/mattbagg/baggott-predict-customerinrpart1#
#
# PREDICTING LONG TERM CUSTOMER VALUE WITH BTYD PACKAGE
# Pareto/NBD (negative binomial distribution) modeling of
# repeat-buying behavior in a noncontractual setting
#
# Matthew Baggott, matt@baggott.net
#
# Accompanying slides at:
# http://www.slideshare.net/mattbagg/baggott-predict-customerinrpart1#
#
# Based on Schmittlein, Morrison, and Colombo (1987), “Counting Your Customers:
# Who Are They and What Will They Do Next?” Management Science, 33, 1–24.
#
# Required data for the model is a
# "customer-by-sufficient-statistic" (cbs) matrix
# with the 'sufficient' stats being:
#     frequency of transactions
#     recency (time of last transaction) and
#     total time observed
# Main model params are:
#     r      unobserved shape parameter for the NBD transaction process
#     alpha  unobserved scale parameter for the NBD transaction process
#     s      unobserved shape parameter for the dropout process
#     beta   unobserved scale parameter for the dropout process
# Data are divided into earlier calibration and later holdout segments,
# using a single date as the cut point. "cal" data are used
# to predict "holdout" data
#
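#
# For orientation (a hedged sketch following Schmittlein et al., not code
# from the original slides): with individual transaction rates
# lambda ~ gamma(r, alpha) and dropout rates mu ~ gamma(s, beta), the
# expected number of repeat transactions for a random customer over a
# period of length t is
#
#   E[X(t)] = (r * beta) / (alpha * (s - 1)) * (1 - (beta / (beta + t))^(s - 1))
#
# pnbd.Expectation() below evaluates this at the fitted parameters.
#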
############################################
############################################
#
# INSTALL AND LOAD NEEDED PACKAGES
#
############################################
toInstallCandidates <- c("ggplot2", "BTYD", "reshape2", "plyr", "lubridate")
# check if pkgs are already present
toInstall <- toInstallCandidates[!toInstallCandidates %in% rownames(installed.packages())]
if (length(toInstall) != 0) {
  install.packages(toInstall, repos = "http://cran.r-project.org")
}
# load pkgs
lapply(toInstallCandidates, library, character.only = TRUE)
############################################
#
# LOAD DATA
#
############################################
# Data are 10% of the cohort of customers who made their first transactions
# with online retailer CDNOW (founded 1994) in the first quarter of 1997.
cdnowElog <- system.file("data/cdnowElog.csv", package = "BTYD")
elog <- read.csv(cdnowElog)  # read data
head(elog)                   # take a look
elog <- elog[, c(2, 3, 5)]   # we need these columns
names(elog) <- c("cust", "date", "sales") # model functions expect these names
# format date
elog$date <- as.Date(as.character(elog$date), format = "%Y%m%d")
# Transaction-flow models, such as the Pareto/NBD, are concerned
# with interpurchase intervals.
# Since we only have dates and there may be multiple purchases on a day
# we merge all transactions that occurred on the same day
# using dc.MergeTransactionsOnSameDate()
elog <- dc.MergeTransactionsOnSameDate(elog)
head(elog)
summary(elog) # no NAs
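# optional sanity check (a small sketch): after merging, each customer
# should have at most one row per date
stopifnot(anyDuplicated(elog[, c("cust", "date")]) == 0)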
############################################
#
# EXAMINE DATA
#
############################################
# plot each customer's sales over time, with sales on a log scale
ggplot(elog, aes(x = date, y = sales, group = cust)) +
  geom_line(alpha = 0.1) +
  scale_x_date() +
  scale_y_log10() +
  ggtitle("Sales for individual customers") +
  ylab("Sales ($, US)") + xlab("") +
  theme_minimal()
# look at days between orders
# model describes rates via a gamma distribution across customers
purchaseFreq <- ddply(elog, .(cust), summarize,
                      daysBetween = as.numeric(diff(date)))
dev.new() # open a new plot device; dev.new() is cross-platform, windows() is Windows-only
ggplot(purchaseFreq, aes(x = daysBetween)) +
  geom_histogram(fill = "orange") +
  xlab("Time between purchases (days)") +
  theme_minimal()
# optional: fit a gamma distribution to the interpurchase times
# (requires the fitdistrplus package)
# library(fitdistrplus)
# (fitg <- fitdist(purchaseFreq$daysBetween, "gamma", method = "mme"))
# dev.new(); plot(fitg)
############################################
#
# DIVIDE DATA
#
############################################
# into a calibration phase
# and a holdout phase
# determine middle point for splitting
(end.of.cal.period <-
   min(elog$date) + as.numeric((max(elog$date) - min(elog$date)) / 2))
# split data into train (calibration) and test (holdout) and make matrices
data <- dc.ElogToCbsCbt(elog, per = "week",
                        T.cal = end.of.cal.period,
                        merge.same.date = TRUE, # not needed, we already did it
                        statistic = "freq")     # which CBT to return
# take a look
str(data)
# cbs is short for "customer-by-sufficient-statistic" matrix,
# with the sufficient stats being:
#     frequency
#     recency (time of last transaction) and
#     total time observed
# extract calibration matrix
cal2.cbs <- as.matrix(data[[1]][[1]])
str(cal2.cbs)
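# peek at a few rows (a quick sanity check); columns are
# x (repeat transactions), t.x (recency), T.cal (weeks observed)
head(cal2.cbs)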
############################################
#
# ESTIMATE PARAMETERS FOR MODEL
#
############################################
# initial estimate
(params2 <- pnbd.EstimateParameters(cal2.cbs))
# look at log likelihood
(LL <- pnbd.cbs.LL(params2, cal2.cbs))
# make a series of estimates, see if they converge
p.matrix <- c(params2, LL)
for (i in 1:20) {
  params2 <- pnbd.EstimateParameters(cal2.cbs, params2)
  LL <- pnbd.cbs.LL(params2, cal2.cbs)
  p.matrix.row <- c(params2, LL)
  p.matrix <- rbind(p.matrix, p.matrix.row)
}
# examine
p.matrix
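# optional convergence sketch: the last few rows of p.matrix should
# barely differ if the estimates have settled
round(apply(tail(p.matrix, 5), 2, function(col) diff(range(col))), 6)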
# use final set of values
(params2 <- p.matrix[dim(p.matrix)[1],1:4])
# Main model params are:
#     r      gamma shape parameter for the NBD transaction process
#     alpha  gamma scale parameter for the NBD transaction process
#     s      gamma shape parameter for the Pareto (exponential gamma) dropout process
#     beta   gamma scale parameter for the Pareto (exponential gamma) dropout process
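# a quick interpretation sketch (assumes the standard parameterization,
# where the gamma mean is shape/rate): r/alpha is roughly the average
# weekly transaction rate across customers, s/beta the average dropout rate
params2[1] / params2[2] # mean transaction rate (per week)
params2[3] / params2[4] # mean dropout rate (per week)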
############################################
#
# PLOT LOG-LIKELIHOOD ISO-CONTOURS FOR MAIN PARAMS
#
############################################
# set up parameter names for a more descriptive result
param.names <- c("r", "alpha", "s", "beta")
LL <- pnbd.cbs.LL(params2, cal2.cbs)
dc.PlotLogLikelihoodContours(pnbd.cbs.LL, params2, cal.cbs = cal2.cbs, n.divs = 5,
                             num.contour.lines = 7, zoom.percent = 0.3,
                             allow.neg.params = FALSE, param.names = param.names)
############################################
#
# PLOT GROUP DISTRIBUTION OF PROPENSITY TO PURCHASE, DROPOUT
#
############################################
# par to make two plots side by side
par(mfrow=c(1,2))
# Plot the estimated gamma distribution of lambda
# (customers' propensities to purchase)
pnbd.PlotTransactionRateHeterogeneity(params2, lim = NULL)
# lim is the upper xlim
# Plot the estimated gamma distribution of mu
# (customers' propensities to drop out)
pnbd.PlotDropoutRateHeterogeneity(params2)
# set par to normal
par(mfrow=c(1,1))
############################################
#
# EXAMINE INDIVIDUAL PREDICTIONS
#
############################################
# estimate the number of transactions a new customer
# will make in 52 weeks
pnbd.Expectation(params2, t = 52)
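# the same expectation over a few horizons (a quick sketch)
sapply(c(13, 26, 52), function(t) pnbd.Expectation(params2, t = t))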
# expected characteristics for a specific individual,
# conditional on their purchasing behavior during calibration
# calibration data for customer 1516
# frequency("x"), recency("t.x") and total time observed("T.cal")
cal2.cbs["1516",]
x <- cal2.cbs["1516", "x"]         # x is frequency
t.x <- cal2.cbs["1516", "t.x"]     # t.x is recency, i.e. time of last transaction
T.cal <- cal2.cbs["1516", "T.cal"] # T.cal is total time observed
# estimate transactions in a T.star-long duration for that customer
pnbd.ConditionalExpectedTransactions(params2, T.star = 52, # weeks
                                     x, t.x, T.cal)
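# an illustrative sketch (these x, t.x, T.cal values are made up): holding
# recency and observation time fixed, the expectation rises with frequency
sapply(c(1, 5, 10), function(x.try)
  pnbd.ConditionalExpectedTransactions(params2, T.star = 52,
                                       x = x.try, t.x = 30, T.cal = 39))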
############################################
#
# PROBABILITY A CUSTOMER IS ALIVE AT END OF CALIBRATION / TRAINING
#
############################################
x # freq of purchase
t.x # week of last purchase
T.cal <- 39 # week of end of cal, i.e. present
pnbd.PAlive(params2, x, t.x, T.cal)
# To visualize the distribution of P(Alive) across customers:
params3 <- pnbd.EstimateParameters(cal2.cbs)
p.alives <- pnbd.PAlive(params3, cal2.cbs[, "x"],
                        cal2.cbs[, "t.x"], cal2.cbs[, "T.cal"])
ggplot(as.data.frame(p.alives), aes(x = p.alives)) +
  geom_histogram(colour = "grey", fill = "orange") +
  ylab("Number of Customers") +
  xlab("Probability Customer is 'Live'") +
  theme_minimal()
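# quick numeric summary of the same distribution (a sketch)
mean(p.alives)      # average P(alive) across the cohort
sum(p.alives > 0.5) # customers more likely alive than not, at a 50% cutoff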
# plot actual & expected customers binned by num of repeat transactions
pnbd.PlotFrequencyInCalibration(params2, cal2.cbs, censor = 10,
                                title = "Model vs. Reality during Calibration")
############################################
#
# HOW WELL DOES MODEL DO IN HOLDOUT PERIOD?
#
############################################
# get holdout transactions from the data frame 'data', add them in as x.star
x.star <- data[[2]][[2]][, 1]
cal2.cbs <- cbind(cal2.cbs, x.star)
str(cal2.cbs)
holdoutdates <- attributes(data[[2]][[1]])[[2]][[2]]
holdoutlength <- round(as.numeric(max(as.Date(holdoutdates)) -
                                  min(as.Date(holdoutdates))) / 7)
# plot predicted vs. observed conditional frequencies and get matrix 'comp' of values
T.star <- holdoutlength
censor <- 10 # bin all frequencies at this value and above
comp <- pnbd.PlotFreqVsConditionalExpectedFrequency(params2, T.star,
                                                    cal2.cbs, x.star, censor)
rownames(comp) <- c("act", "exp", "bin")
comp
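# a small follow-up sketch: per-bin gap between actual and expected frequencies
round(comp["act", ] - comp["exp", ], 2)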
# plot predicted vs. actual transactions by week
# get data without each first transaction; this removes customers who buy only once
removedFirst.elog <- dc.SplitUpElogForRepeatTrans(elog)$repeat.trans.elog
removedFirst.cbt <- dc.CreateFreqCBT(removedFirst.elog)
# get all data, so we have the customers who buy only once
allCust.cbt <- dc.CreateFreqCBT(elog)
# add one-time customers back into the matrix
tot.cbt <- dc.MergeCustomers(data.correct = allCust.cbt,
                             data.to.correct = removedFirst.cbt)
lengthInDays <- as.numeric(max(as.Date(colnames(tot.cbt))) -
                           min(as.Date(colnames(tot.cbt))))
origin <- min(as.Date(colnames(tot.cbt)))
tot.cbt.df <- melt(tot.cbt, varnames = c("cust", "date"), value.name = "Freq")
tot.cbt.df$date <- as.Date(tot.cbt.df$date)
tot.cbt.df$week <- as.numeric(1 + floor((tot.cbt.df$date - origin + 1) / 7))
transactByDay <- ddply(tot.cbt.df, .(date), summarize, sum(Freq))
transactByWeek <- ddply(tot.cbt.df, .(week), summarize, sum(Freq))
names(transactByWeek) <- c("week", "Transactions")
names(transactByDay) <- c("date", "Transactions")
T.cal <- cal2.cbs[, "T.cal"]
T.tot <- 78 # end of holdout, in weeks
comparisonByWeek <- pnbd.PlotTrackingInc(params2, T.cal, T.tot,
                                         actual.inc.tracking.data = transactByWeek$Transactions)
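# the cumulative analogue, pnbd.PlotTrackingCum(), is often easier to read;
# a sketch using the weekly totals computed above
comparisonCum <- pnbd.PlotTrackingCum(params2, T.cal, T.tot,
                                      actual.cu.tracking.data = cumsum(transactByWeek$Transactions))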
############################################
#
# FORMAL MEASURE OF ACCURACY
#
############################################
# root mean squared error
rmse <- function(est, act) { return(sqrt(mean((est - act)^2))) }
# mean squared logarithmic error
msle <- function(est, act) { return(mean((log1p(est) - log1p(act))^2)) }
str(cal2.cbs)
cal2.cbs[, "x"]
predicted <- pnbd.ConditionalExpectedTransactions(params2, T.star = 38, # weeks
                                                  x = cal2.cbs[, "x"],
                                                  t.x = cal2.cbs[, "t.x"],
                                                  T.cal = cal2.cbs[, "T.cal"])
cal2.cbs[, "x.star"] # actual transactions for each person
rmse(act = cal2.cbs[, "x.star"], est = predicted)
msle(act = cal2.cbs[, "x.star"], est = predicted)
# not useful without a comparison: is this better than guessing?
# (see the baseline sketch below)
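# a crude baseline for comparison (a sketch; this scaling rule is an
# assumption, not part of the original analysis): project each customer's
# calibration frequency linearly onto the holdout length
naiveEst <- cal2.cbs[, "x"] * 38 / cal2.cbs[, "T.cal"]
rmse(act = cal2.cbs[, "x.star"], est = naiveEst)
msle(act = cal2.cbs[, "x.star"], est = naiveEst)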

dah33 commented Nov 19, 2018

Thanks for building this. Very useful!


ZellW commented Apr 26, 2019

Really good. Any chance you might share your slides? They are no longer available on SlideShare.

asyatikkanen commented

Thank you for the notebook, it's extremely helpful. It'd be amazing to get access to the slides since they're no longer available on SS.
