Skip to content

Instantly share code, notes, and snippets.

@xxyjoel
Created June 6, 2018 02:59
Show Gist options
  • Save xxyjoel/850aca8d9475d95168848fd624fa0835 to your computer and use it in GitHub Desktop.
Save xxyjoel/850aca8d9475d95168848fd624fa0835 to your computer and use it in GitHub Desktop.
require(dplyr)
require(stats)
require(data.table)
require(RODBC)
require(DBI)
require(RODBCext)
#PULL CONNECTION
my_connection <- odbcDriverConnect('driver={SQL Server};server=SERVER;database=DATABASE;trusted_connection=true')
#sales order table needed for instead
sales <- sqlQuery(my_connection, '
select
Qty
, Amt
, CustomerDimKey
, DateDimKey
from
TABLE WITH DATA
group by
Qty
, Amt
, CustomerDimKey
, DateDimKey
')
sales_customer <- sqlQuery(my_connection, '
select
SUM(Qty) as Sales
, CustomerDimKey
from
TABLE WITH DATA
group
by CustomerDimKey
')
top_filter <- 20
min_purchase_qty <- 100
#FILTER OUT TOP CUSTOMERS TO GET A LIST OF TARGET CUSTOMERS
customers_top <- sales_customer[order(-sales_customer$Sales), ]
customers_target <- customers_top[(top_filter + 1):(nrow(customers_top)),]
#FILTER OUT CUSOTMERS WHO HAVE NOT PURCHASED MORE THAN 5 ITEMS
customers_target <- customers_target[customers_target$Sales > min_purchase_qty,]
#FILTER TARGET CUSTOMERS IN TRANSACTION LIST
sales_transactions <- sales %>%
filter(CustomerDimKey %in% customers_target$CustomerDimKey)
#convert to date
sales_transactions$DateDimKey <- as.character(sales_transactions$DateDimKey)
sales_transactions$DateDimKey <- as.Date(sales_transactions$DateDimKey, "%Y %m %d")
#filter out 0 value sales qty and group by date
sales_transactions <- aggregate(.~ DateDimKey + CustomerDimKey, sales_transactions, sum)
sales_transactions <- sales_transactions[sales_transactions$Qty != 0,]
#locate first and last sold date by customer and merge to main transaction df
customer_first_sale <- aggregate(sales_transactions$DateDimKey, by=list(sales_transactions$CustomerDimKey), min)
colnames(customer_first_sale) <- c("CustomerDimKey", "FirstSoldDate")
customer_last_sale <- aggregate(sales_transactions$DateDimKey, by=list(sales_transactions$CustomerDimKey), max)
colnames(customer_last_sale) <- c("CustomerDimKey", "LastSoldDate")
sales_transactions <- merge(sales_transactions, customer_first_sale, by = c("CustomerDimKey"))
sales_transactions <- merge(sales_transactions, customer_last_sale, by = c("CustomerDimKey"))
#calculate cumsum qty ordered
sales_transactions <- arrange(sales_transactions,
CustomerDimKey,
DateDimKey)
sales_transactions$PurchasedQtyCumSum <- as.vector(ave(sales_transactions$Qty,
sales_transactions$CustomerDimKey,
FUN = cumsum))
sales_transactions$PurchasedAmtCumSum <- as.vector(ave(sales_transactions$Amt,
sales_transactions$CustomerDimKey,
FUN = cumsum))
#transaction count
sales_transactions <- data.table(sales_transactions)
sales_transactions <- sales_transactions[, TransactionCount := sequence(.N), by = CustomerDimKey]
#age
today <- Sys.Date()
sales_transactions$AgeAtTrans <- as.numeric(sales_transactions$DateDimKey - sales_transactions$FirstSoldDate)
sales_transactions$CustomerAge <- as.numeric(today - sales_transactions$FirstSoldDate)
#calculate metrics for segmentation
#recency
sales_transactions$DaysSinceLastTransaction <- today - sales_transactions$LastSoldDate
#frequency of order
sales_transactions$OrdersPerDay <- sales_transactions$TransactionCount / sales_transactions$AgeAtTrans
sales_transactions$TransFreq <- sales_transactions$AgeAtTrans / sales_transactions$TransactionCount
#monetary avg order size
sales_transactions$AvgOrderSize <- sales_transactions$PurchasedQtyCumSum / sales_transactions$TransactionCount
sales_transactions$AvgOrderAmt <- sales_transactions$PurchasedAmtCumSum / sales_transactions$TransactionCount
#LTV metrics
#can parameterize using quantile percentages
sales_transactions$AvgAmtPerDay <- sales_transactions$AvgOrderAmt * sales_transactions$OrdersPerDay
sales_transactions$LowerLTV <- as.numeric(quantile(sales_transactions$CustomerAge, 0.25)) *
sales_transactions$AvgAmtPerDay
sales_transactions$MidQLTV <- as.numeric(quantile(sales_transactions$CustomerAge, 0.50)) *
sales_transactions$AvgAmtPerDay
sales_transactions$UpperQLTV <- as.numeric(quantile(sales_transactions$CustomerAge, 0.75)) *
sales_transactions$AvgAmtPerDay
#scoring mechanism
Rscore <- function(sales_transactions) {
#browser()
score <- vector()
DaysSinceLastTransaction <- as.numeric(sales_transactions$DaysSinceLastTransaction)
highest <- as.numeric(quantile(DaysSinceLastTransaction, 0.50))
high <- as.numeric(quantile(DaysSinceLastTransaction, 0.60))
benchmark <- as.numeric(quantile(DaysSinceLastTransaction, 0.70))
low <- as.numeric(quantile(DaysSinceLastTransaction, 0.80))
lowest <- as.numeric(quantile(DaysSinceLastTransaction, 0.90))
for (i in 1:length(DaysSinceLastTransaction)) {
if(DaysSinceLastTransaction[i] < 0 ||
DaysSinceLastTransaction[i] >= 0 &
DaysSinceLastTransaction[i] <= highest) {
score[i] <- 5
} else if(DaysSinceLastTransaction[i] > highest &
DaysSinceLastTransaction[i] <= high) {
score[i] <- 4
} else if(DaysSinceLastTransaction[i] > high &
DaysSinceLastTransaction[i] <= benchmark) {
score[i] <- 3
} else if(DaysSinceLastTransaction[i] > benchmark &
DaysSinceLastTransaction[i] <= low) {
score[i] <- 2
} else if(DaysSinceLastTransaction[i] > low &
DaysSinceLastTransaction[i] <= lowest) {
score[i] <- 1
} else {
score[i] <- 0
}
}
sales_transactions$RScore <- score
return(sales_transactions)
}
sales_transactions <- Rscore(sales_transactions)
#scoring mechanism
Fscore <- function(sales_transactions) {
#browser()
score <- vector()
TransFreq <- as.numeric(sales_transactions$TransFreq)
highest <- as.numeric(quantile(TransFreq, 0.20))
high <- as.numeric(quantile(TransFreq, 0.40))
benchmark <- as.numeric(quantile(TransFreq, 0.60))
low <- as.numeric(quantile(TransFreq, 0.80))
lowest <- as.numeric(quantile(TransFreq, 0.95))
for (i in 1:length(TransFreq)) {
if(TransFreq[i] < highest) {
score[i] <- 5
} else if(TransFreq[i] < high &
TransFreq[i] >= highest) {
score[i] <- 4
} else if(TransFreq[i] < benchmark &
TransFreq[i] >= high) {
score[i] <- 3
} else if(TransFreq[i] < low &
TransFreq[i] >= benchmark) {
score[i] <- 2
} else if(TransFreq[i] < lowest &
TransFreq[i] >= low) {
score[i] <- 1
} else {
score[i] <- 0
}
}
sales_transactions$FScore <- score
return(sales_transactions)
}
sales_transactions <- Fscore(sales_transactions)
Mscore <- function(sales_transactions) {
#browser()
score <- vector()
AvgOrderSize <- as.numeric(sales_transactions$AvgOrderSize)
highest <- as.numeric(quantile(AvgOrderSize, 0.90))
high <- as.numeric(quantile(AvgOrderSize, 0.80))
benchmark <- as.numeric(quantile(AvgOrderSize, 0.70))
low <- as.numeric(quantile(AvgOrderSize, 0.60))
lowest <- as.numeric(quantile(AvgOrderSize, 0.50))
for (i in 1:length(AvgOrderSize)) {
if(AvgOrderSize[i] > highest) {
score[i] <- 5
} else if(AvgOrderSize[i] > high &
AvgOrderSize[i] <= highest) {
score[i] <- 4
} else if(AvgOrderSize[i] > benchmark &
AvgOrderSize[i] <= high) {
score[i] <- 3
} else if(AvgOrderSize[i] > low &
AvgOrderSize[i] <= benchmark) {
score[i] <- 2
} else if(AvgOrderSize[i] > lowest &
AvgOrderSize[i] <= low) {
score[i] <- 1
} else {
score[i] <- 0
}
}
sales_transactions$MScore <- score
return(sales_transactions)
}
sales_transactions <- Mscore(sales_transactions)
AssignCohort <- function(sales_transactions) {
#browser()
flag <- vector()
Rscore <- sales_transactions$RScore
Fscore <- sales_transactions$FScore
Mscore <- sales_transactions$MScore
DaysSinceLastTransaction <- sales_transactions$DaysSinceLastTransaction
TransFreq <- sales_transactions$TransFreq
for (i in 1:length(Rscore)) {
if (Rscore[i] <= 2 &
Fscore[i] >= 3 &
Mscore[i] >= 3 ) {
flag[i] <- 'At Risk'
} else if
(Rscore[i] >= 4 &
Fscore[i] >= 4 &
Mscore[i] >= 4) {
flag[i] <- 'VIPs'
} else if
(Rscore[i] <= 5 &
Fscore[i] <= 5 &
Fscore[i] > 2 &
Mscore[i] <= 2 ) {
flag[i] <- 'UpSell'
} else if
(Rscore[i] <= 2 &
Fscore[i] <= 2 &
Mscore[i] <= 2 ) {
flag[i] <- 'Lost'
} else if
(Rscore[i] >= 3 &
Fscore[i] <= 2 &
Mscore[i] >= 3 ) {
flag[i] <- 'Seasonal'
} else if
((Rscore[i] + Fscore[i] + Mscore[i]) >= 12) {
flag[i] <- 'Healthy'
} else if
((Rscore[i] + Fscore[i] + Mscore[i]) < 9) {
flag[i] <- 'Un-Healthy'
} else if
((Rscore[i] + Fscore[i] + Mscore[i]) >= 9 ||
(Rscore[i] + Fscore[i] + Mscore[i]) < 12) {
flag[i] <- 'Un-Assigned'
}
}
return(flag)
}
sales_transactions$Cohort <- AssignCohort(sales_transactions)
last_recorded_sales <- sales_transactions %>%
group_by(CustomerDimKey) %>%
filter(DateDimKey == max(DateDimKey))
AtRisk <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "At Risk", ]))
Healthy <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Healthy", ]))
UnHealthy <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Un-Healthy", ]))
Unassigned <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Un-Assigned", ]))
Lost <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Lost", ]))
Seasonal <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "Seasonal", ]))
UpSell <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "UpSell", ]))
VIPs <- as.numeric(nrow(last_recorded_sales[last_recorded_sales$Cohort == "VIPs", ]))
CohortCnts <- c(AtRisk, Healthy, UnHealthy, Unassigned, Lost, Seasonal, UpSell, VIPs)
CohortCntLabels <- data.frame(Cohorts=c('AtRisk','Healthy','UnHealthy','Unassigned', 'Lost', 'Seasonal', 'UpSell', 'VIPs'))
CohortCntLabels <- cbind(CohortCntLabels, CohortCnts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment