Skip to content

Instantly share code, notes, and snippets.

@lorinc
Created February 2, 2015 21:18
Show Gist options
  • Save lorinc/b9cd56990677d5453042 to your computer and use it in GitHub Desktop.
Save lorinc/b9cd56990677d5453042 to your computer and use it in GitHub Desktop.
400 lines of ETL, data cleansing, cross-validating, plotting and exploratory analysis in R using data.table and ggplot, using SFDC, Jira and accounting data
#### reading libraries ####
library(data.table)
library(ggplot2)
library(gridExtra)
#### reading sources ####
setwd("~/Desktop/consolidated")
# "SFaccountID","SFopportunityID","SFaccountName","SFcreateDate","SFendDate","SFindustry","SFproductName","SFtotalPriceCurrency","SFtotalPrice"
# Source: Salesforce report exported to CSV
SF <- data.table(read.csv("SF-data-consolidated-w-names.csv"))
# LESAcreateDate,LESAaccountID,LESAaccountName,LESAticketID,LESAisEscalated,LESAgrossResolutionDays,LESAcommentMass,LESAcommentCount,LESAhasLPP,LESAstatus
# Source: SQL query run on LRDC DB dump
LESA <- data.table(read.csv("LESA-data-consolidated.csv"))
# text-to-date in all sources
SF$SFcreateDate <- as.Date(SF$SFcreateDate, "%m/%d/%Y")
SF$SFendDate <- as.Date(SF$SFendDate, "%m/%d/%Y")
LESA$LESAcreateDate <- as.Date(LESA$LESAcreateDate, "%m/%d/%Y")
#### SF data processing ####
# subscription level data
SubscriptionLevels <- c("Basic",
"Limited",
"Gold",
"Platinum",
"OEM",
"Indemnification",
"Dev Workstation",
"Support Contact",
"Silver",
"Unlimited",
"Social Office",
"6.x License")
# creating regexp pattern from SubscriptionLevels data
ptn <- paste(".*(", paste(SubscriptionLevels, collapse="|"), ").*", sep="")
# deriving subscription levels from product name for each invoice
SF[, SFsubscriptionLevel := gsub(ptn, "\\1", SFproductName, ignore.case=FALSE)]
SF$SFsubscriptionLevel <- as.factor(SF$SFsubscriptionLevel)
# keep one item for each invoice, new table called SF.SUBSCRIPTIONS
SF <- SF[sort(-SF$SFtotalPrice)]
setkey(SF, SFopportunityID)
# dropping all but the most expensive item from all invoices (hoping that it will represent the rest well in terms of subscription level)
# ERROR PRONE FUNCTION: can not handle multiple level of subscriptions per invoice
SF.SUBSCRIPTIONS <- unique(SF)
# renewal flag applied if there is a future invoice for the same account
SF.SUBSCRIPTIONS[ , SFGotRenewed := SFendDate <= max(SFcreateDate), by=SFaccountID]
# setting up isSupported flag
SF.SUBSCRIPTIONS[ , isSupported := SFsubscriptionLevel %in% c("Gold", "Platinum", "OEM", "Indemnification", "Unlimited", "Social Office")]
# factorizing initial subscription year
SF.SUBSCRIPTIONS[, startYear := as.character.Date(SFcreateDate, format="%Y"), by=SFaccountID ]
#### LESA effort calculation ####
# global variables
escalationCostMultiplier <- 1
costForLppMultiplier <- 2
# calculating effort factor exchange rates
aRd <- mean(LESA$LESAgrossResolutionDays)
CmInRd <- aRd / mean(LESA$LESAcommentMass)
CcInRd <- aRd / mean(LESA$LESAcommentCount)
# averaging the 3 effort factors
LESA$LESAeffortInDays <- (LESA$LESAgrossResolutionDays +
LESA$LESAcommentMass * CmInRd +
LESA$LESAcommentCount * CcInRd) / 3
# applying escalation and LPS multipliers
LESA$LESAeffortInDays <- LESA$LESAeffortInDays +
LESA$LESAeffortInDays * escalationCostMultiplier * LESA$LESAisEscalated +
LESA$LESAeffortInDays * costForLppMultiplier * LESA$LESAhasLPP
#### DUAL DATA - creating the LESA-ID SF-ID account mapping table ####
# mapping is based on opportunity ID recorded in LESA account notes
setkey(SF, SFopportunityID)
# map tables
LESAID_SFOPP_LINK <- data.table(read.csv("LESA-SF-AUTOMATED.csv"))
SFOPP_SFACC_LINK <- SF.SUBSCRIPTIONS[, list(SFopportunityID, SFaccountID)]
# set up keys
setkey(LESAID_SFOPP_LINK, SFopportunityID)
setkey(SFOPP_SFACC_LINK, SFopportunityID)
# cartesian products
CARTP1 <- LESAID_SFOPP_LINK[SFOPP_SFACC_LINK,]
CARTP2 <- SFOPP_SFACC_LINK[LESAID_SFOPP_LINK,]
# joining tables based opportunityID
MAPPING1 <- CARTP1[ !is.na(LESAaccountID) & !is.na(SFaccountID.1), list(LESAaccountID, SFaccountID = SFaccountID.1)]
MAPPING2 <- CARTP2[ !is.na(LESAaccountID) & !is.na(SFaccountID), list(LESAaccountID, SFaccountID)]
# appending MAP tables
MAPPING <- rbind(MAPPING1, MAPPING2)
# removing duplicates
setkey(MAPPING, LESAaccountID, SFaccountID)
MAPPING <- MAPPING[!duplicated(MAPPING),]
# cleanup
remove(CARTP1, CARTP2, MAPPING1, MAPPING2)
#### DUAL DATA - Support Effort vs. SF Invoice Renewal ####
# getting data filled up from LESA data
EFFORT.RENEWAL.CORRELATION <- LESA[, list(LESAcreateDate,
LESAaccountID,
LESAticketID,
LESAeffortInDays)]
setkey(EFFORT.RENEWAL.CORRELATION, LESAaccountID)
# mapping SF account ID for each ticket
EFFORT.RENEWAL.CORRELATION$SFaccountID <- MAPPING[ match( EFFORT.RENEWAL.CORRELATION$LESAaccountID, MAPPING$LESAaccountID ), SFaccountID]
# removing those tickets whose SF mapping is not available (resulting in 10.696 valid records)
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[!is.na(SFaccountID),]
# creating cartesian product of LESA ticket - SF opportunity data
setkey(SFOPP_SFACC_LINK, SFaccountID)
setkey(EFFORT.RENEWAL.CORRELATION, SFaccountID)
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[SFOPP_SFACC_LINK,]
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[!is.na(LESAticketID)]
EFFORT.RENEWAL.CORRELATION$SFcreateDate <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFcreateDate]
EFFORT.RENEWAL.CORRELATION$SFendDate <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFendDate]
EFFORT.RENEWAL.CORRELATION$SFtotalPrice <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFtotalPrice]
EFFORT.RENEWAL.CORRELATION$SFsubscriptionLevel <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFsubscriptionLevel]
EFFORT.RENEWAL.CORRELATION$SFGotRenewed <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFGotRenewed]
EFFORT.RENEWAL.CORRELATION$isSupported <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), isSupported]
EFFORT.RENEWAL.CORRELATION$SFindustry <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFindustry]
# removing records, where ticket creation date does not match opportunity lifespan - resulting valid ticket-invoice mapping!
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[LESAcreateDate >= SFcreateDate & LESAcreateDate < SFendDate]
# aggregate by opportunity
EFFORT.RENEWAL.CORRELATION.AGGR <- EFFORT.RENEWAL.CORRELATION[, list( SFcreateDate = head(SFcreateDate),
SFendDate = head(SFendDate),
SFtotalPrice = head(SFtotalPrice),
SFdailyFee = sum(SFtotalPrice) / as.integer(head(SFendDate) - head(SFcreateDate)),
SFsubscriptionLevel = head(SFsubscriptionLevel),
SFGotRenewed = head(SFGotRenewed),
LESAeffort = sum(LESAeffortInDays),
LESAworkload = sum(LESAeffortInDays) / as.integer(head(SFendDate) - head(SFcreateDate)),
SFindustry = head(SFindustry)), by = SFopportunityID]
EFFORT.RENEWAL.CORRELATION.AGGR <- EFFORT.RENEWAL.CORRELATION.AGGR[!duplicated(SFopportunityID)]
#### SF data preprocessing for plotting ####
# subsetting to consistent data in SF by date
SF.SUBSCRIPTIONS.SANITIZED <- SF.SUBSCRIPTIONS[SFcreateDate >= as.Date("2009-01-01", format = "%Y-%m-%d")
& SFcreateDate < as.Date("2012-01-01", format = "%Y-%m-%d") ]
# subsetting to invoices that could have been renewed by 2013-01-01
# SF.SUBSCRIPTIONS.SANITIZED <- SF.SUBSCRIPTIONS.SANITIZED[SFendDate < as.Date("2013-01-01"), ]
SF.AGGR.SunSet <- SF.SUBSCRIPTIONS.SANITIZED[SFendDate < as.Date("2030-01-01"),
list(lifespan = as.integer(max(SFendDate)-min(SFcreateDate)),
start = min(SFcreateDate),
supported = head(isSupported)), by=SFaccountID]
SF.AGGR.Current <- SF.SUBSCRIPTIONS[SFendDate < as.Date("2030-01-01") & SFendDate > as.Date("2013-01-01"),
list(lifespan = as.integer(max(SFendDate)-min(SFcreateDate)),
start = min(SFcreateDate),
supported = head(isSupported)), by=SFaccountID]
SF.AGGR.Unsorted <- SF.SUBSCRIPTIONS[SFendDate < as.Date("2020-01-01"),
list(lifespan = as.integer(max(SFendDate)-min(SFcreateDate)),
start = min(SFcreateDate),
supported = head(isSupported),
startYear = min(startYear)), by=SFaccountID]
#### --------------------------- PLOTTING --------------------------- ####
#### SF data plotting - RENEWAL RATE ####
ggplot(SF.SUBSCRIPTIONS.SANITIZED,
aes(x=SFcreateDate,
y=as.integer(SFGotRenewed),
color=isSupported,
fill=isSupported))+
geom_smooth(method = "loess", span=.4, level = .9, size=1, alpha=.2)+
labs(y="Renewal Rate",
x="Date",
title="% of Subscriptions Renewed After Expiration\n(grouped by supported / unsupported products)",
fill = "is supported",
color = "is supported")
# ggplot(SF.SUBSCRIPTIONS.SANITIZED,
# aes(x=SFcreateDate,
# y=as.integer(SFGotRenewed)))+
# geom_smooth(method = "loess", span=.4, level = .9, size=1, alpha=.2)+geom_rug()+facet_wrap(~ SFindustry, ncol = 4)
#
#### SF data plotting - Lifespan of DISCONTINUED subscriptions ####
scatterplot <- ggplot(SF.AGGR.SunSet,
aes(x = start,
y = lifespan/365,
color = supported,
fill = supported)) +
geom_point() +
geom_smooth(method = "loess",
span = .4,
level = .9,
alpha = .3) +
scale_x_date(breaks = "1 year",
minor_breaks = "1 month",
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) +
scale_y_continuous(breaks = 0:6,
limits = c(0,6),
expand = c(.05,.05))+
scale_color_brewer(palette="Set1")+
scale_fill_brewer(palette="Set1")+
labs(x = "time of the initial subscription",
y = "how long the subscriber stayed with us",
color = "supported product",
fill = "supported product") +
theme(legend.position = "bottom",
legend.direction = "horizontal",
plot.margin = unit(c(-5,-5,3,4.5), "mm"))
right_hist <- ggplot(SF.AGGR.SunSet, aes(lifespan/365))+
geom_density(aes(y = ..count.., color = supported))+
scale_x_continuous(breaks = 0:6,
limits = c(0,6),
expand = c(.05,.05))+
scale_color_brewer(palette="Set1")+
coord_flip()+
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.title.y=element_blank(),
plot.margin = unit(c(-5,3,26,0), "mm"))
top_hist <- ggplot(SF.AGGR.SunSet, aes(start))+
geom_density(aes(y = ..count.., color = supported))+
scale_color_brewer(palette="Set1")+
scale_x_date(breaks = "1 year",
minor_breaks = "1 month",
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) +
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.title.y=element_blank(),
plot.margin = unit(c(3,-5.5,4,3), "mm")) +
labs(title = "Lengths of All DISCONTINUED Subscriptions")
empty <- ggplot()+geom_point(aes(1,1), colour="white")+
theme(axis.ticks=element_blank(),
panel.background=element_blank(),
axis.text.x=element_blank(), axis.text.y=element_blank(),
axis.title.x=element_blank(), axis.title.y=element_blank())
grid.arrange(top_hist,
empty,
scatterplot,
right_hist,
ncol=2,
nrow=2,
widths=c(6, 1),
heights=c(4, 9))
#### SF data plotting - Lifespan of ALL subscriptions ####
scatterplot <- ggplot(SF.AGGR.Unsorted[start >= as.Date("2009-01-01") &
start < as.Date("2013-01-01"),],
aes(x = start,
y = lifespan/365,
color = supported,
fill = supported)) +
geom_point() +
geom_smooth(method = "loess",
span = .4,
level = .9,
alpha = .3) +
scale_x_date(breaks = "1 year",
minor_breaks = "1 month",
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) +
scale_y_continuous(breaks = 0:6,
limits = c(0,6),
expand = c(.05,.05)) +
scale_color_brewer(palette="Set1")+
scale_fill_brewer(palette="Set1")+
labs(x = "time of the initial subscription",
y = "how long the subscriber stayed with us",
color = "supported product",
fill = "supported product") +
theme(legend.position = "bottom",
legend.direction = "horizontal",
plot.margin = unit(c(-5,-5,3,4.5), "mm"))
right_hist <- ggplot(SF.AGGR.Unsorted[start >= as.Date("2009-01-01") &
start < as.Date("2013-01-01"),],
aes(lifespan/365))+
geom_density(aes(y = ..count.., color = supported))+
scale_x_continuous(breaks = 0:6,
limits = c(0,6),
expand = c(.05,.05)) + scale_color_brewer(palette="Set1")+
coord_flip()+
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.title.y=element_blank(),
plot.margin = unit(c(-5,3,26,0), "mm"))
top_hist <- ggplot(SF.AGGR.Unsorted[start >= as.Date("2009-01-01") &
start < as.Date("2013-01-01"),],
aes(start))+
geom_density(aes(y = ..count.., color = supported))+
scale_color_brewer(palette="Set1")+
scale_x_date(breaks = "1 year",
minor_breaks = "1 month",
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) +
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.title.y=element_blank(),
plot.margin = unit(c(3,-5.5,4,5), "mm")) +
labs(title = "Lengths of ALL Subscriptions\n(active and discontinued together)")
empty <- ggplot()+geom_point(aes(1,1), colour="white")+
theme(axis.ticks=element_blank(),
panel.background=element_blank(),
axis.text.x=element_blank(), axis.text.y=element_blank(),
axis.title.x=element_blank(), axis.title.y=element_blank())
grid.arrange(top_hist,
empty,
scatterplot,
right_hist,
ncol=2,
nrow=2,
widths=c(6, 1),
heights=c(4, 9))
#### SF data plotting - Lifespan of CURRENTLY ACTIVE subscriptions ####
scatterplot <- ggplot(SF.AGGR.Current[start >= as.Date("2009-01-01") &
start < as.Date("2013-01-01"),],
aes(x = start,
y = lifespan/365,
color = supported,
fill = supported)) +
geom_point() +
geom_smooth(method = "loess",
span = .4,
level = .9,
alpha = .3) +
scale_x_date(breaks = "1 year",
minor_breaks = "1 month",
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) +
scale_y_continuous(breaks = 0:6,
limits = c(0,6),
expand = c(.05,.05)) +
scale_color_brewer(palette="Set1")+
scale_fill_brewer(palette="Set1")+
labs(x = "time of the initial subscription",
y = "how long the subscriber stayed with us",
color = "supported product",
fill = "supported product") +
theme(legend.position = "bottom",
legend.direction = "horizontal",
plot.margin = unit(c(-5,-5,3,4.5), "mm"))
right_hist <- ggplot(SF.AGGR.Current[start >= as.Date("2009-01-01") &
start < as.Date("2013-01-01"),],
aes(lifespan/365))+
geom_density(aes(y = ..count.., color = supported))+
scale_x_continuous(breaks = 0:6,
limits = c(0,6),
expand = c(.05,.05)) +
scale_color_brewer(palette="Set1")+
coord_flip()+
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.title.y=element_blank(),
plot.margin = unit(c(-5,3,26,0), "mm"))
top_hist <- ggplot(SF.AGGR.Current[start >= as.Date("2009-01-01") &
start < as.Date("2013-01-01"),],
aes(start))+
geom_density(aes(y = ..count.., color = supported))+
scale_color_brewer(palette="Set1")+
scale_x_date(breaks = "1 year",
minor_breaks = "1 month",
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) +
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.title.y=element_blank(),
plot.margin = unit(c(3,-5.5,4,5), "mm")) +
labs(title = "Lengths of CURRENTLY ACTIVE Subscriptions\n(including future end dates)")
empty <- ggplot()+geom_point(aes(1,1), colour="white")+
theme(axis.ticks=element_blank(),
panel.background=element_blank(),
axis.text.x=element_blank(), axis.text.y=element_blank(),
axis.title.x=element_blank(), axis.title.y=element_blank())
grid.arrange(top_hist,
empty,
scatterplot,
right_hist,
ncol=2,
nrow=2,
widths=c(6, 1),
heights=c(4, 9))
#### DUAL DATA plotting ####
# library(scales)
#
# ggplot(EFFORT.RENEWAL.CORRELATION.AGGR, aes(x = LESAeffort, y = as.integer(SFGotRenewed)))+
# geom_smooth(method = "loess", span = .7, level = 0)+
# scale_x_continuous(limits = c(7,2000))
# facet_wrap(~ SFindustry)
# breaks = trans_breaks("log10", function(x) 10^x),
# labels = trans_format("log10", math_format(10^.x))
# support as cost center graph
top <- ggplot(EFFORT.RENEWAL.CORRELATION.AGGR, aes(x = LESAworkload))+
geom_density(aes(y = ..density..))+
theme(axis.title.x = element_blank(),
plot.margin = unit(c(2,2,2,2), "mm")) +
labs(title = "",
y = "subsc. density") +
scale_x_continuous(limits = c(0,10))+
geom_vline(aes(xintercept = 1.75), color = "red")
mid <- ggplot(EFFORT.RENEWAL.CORRELATION.AGGR,
aes(x = LESAworkload,
y = as.integer(SFGotRenewed))) +
geom_smooth(method = "loess", span = .7, level = 0) +
theme(axis.title.x=element_blank(),
legend.position = "none",
plot.margin = unit(c(2,2,2,2), "mm")) +
scale_x_continuous(limits = c(0,10)) +
labs(y = "renewal rate")+
geom_vline(aes(xintercept = 1.75), color = "red")
bot <- ggplot(EFFORT.RENEWAL.CORRELATION.AGGR,
aes(x = LESAworkload,
y = SFdailyFee)) +
geom_smooth(method = "loess", span = .7, level = 0) +
theme(legend.position = "none",
plot.margin = unit(c(2,2,2,-0.5), "mm")) +
scale_x_continuous(limits = c(0,10)) +
# scale_y_continuous(limits = c(0,5000)) +
labs(x = "support load",
y = "daily fee")+
geom_vline(aes(xintercept = 1.75), color = "red")
grid.arrange(top, mid, bot,
ncol=1,
nrow=3,
heights=c(1,1,1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment