Created
February 2, 2015 21:18
-
-
Save lorinc/b9cd56990677d5453042 to your computer and use it in GitHub Desktop.
400 lines of ETL, data cleansing, cross-validating, plotting and exploratory analysis in R using data.table and ggplot, using SFDC, Jira and accounting data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### reading libraries #### | |
library(data.table) | |
library(ggplot2) | |
library(gridExtra) | |
#### reading sources #### | |
setwd("~/Desktop/consolidated") | |
# "SFaccountID","SFopportunityID","SFaccountName","SFcreateDate","SFendDate","SFindustry","SFproductName","SFtotalPriceCurrency","SFtotalPrice" | |
# Source: Salesforce report exported to CSV | |
SF <- data.table(read.csv("SF-data-consolidated-w-names.csv")) | |
# LESAcreateDate,LESAaccountID,LESAaccountName,LESAticketID,LESAisEscalated,LESAgrossResolutionDays,LESAcommentMass,LESAcommentCount,LESAhasLPP,LESAstatus | |
# Source: SQL query run on LRDC DB dump | |
LESA <- data.table(read.csv("LESA-data-consolidated.csv")) | |
# text-to-date in all sources | |
SF$SFcreateDate <- as.Date(SF$SFcreateDate, "%m/%d/%Y") | |
SF$SFendDate <- as.Date(SF$SFendDate, "%m/%d/%Y") | |
LESA$LESAcreateDate <- as.Date(LESA$LESAcreateDate, "%m/%d/%Y") | |
#### SF data processing #### | |
# subscription level data | |
SubscriptionLevels <- c("Basic", | |
"Limited", | |
"Gold", | |
"Platinum", | |
"OEM", | |
"Indemnification", | |
"Dev Workstation", | |
"Support Contact", | |
"Silver", | |
"Unlimited", | |
"Social Office", | |
"6.x License") | |
# creating regexp pattern from SubscriptionLevels data | |
ptn <- paste(".*(", paste(SubscriptionLevels, collapse="|"), ").*", sep="") | |
# deriving subscription levels from product name for each invoice | |
SF[, SFsubscriptionLevel := gsub(ptn, "\\1", SFproductName, ignore.case=FALSE)] | |
SF$SFsubscriptionLevel <- as.factor(SF$SFsubscriptionLevel) | |
# keep one item for each invoice, new table called SF.SUBSCRIPTIONS | |
SF <- SF[sort(-SF$SFtotalPrice)] | |
setkey(SF, SFopportunityID) | |
# dropping all but the most expensive item from all invoices (hoping that it will represent the rest well in terms of subscription level) | |
# ERROR PRONE FUNCTION: can not handle multiple level of subscriptions per invoice | |
SF.SUBSCRIPTIONS <- unique(SF) | |
# renewal flag applied if there is a future invoice for the same account | |
SF.SUBSCRIPTIONS[ , SFGotRenewed := SFendDate <= max(SFcreateDate), by=SFaccountID] | |
# setting up isSupported flag | |
SF.SUBSCRIPTIONS[ , isSupported := SFsubscriptionLevel %in% c("Gold", "Platinum", "OEM", "Indemnification", "Unlimited", "Social Office")] | |
# factorizing initial subscription year | |
SF.SUBSCRIPTIONS[, startYear := as.character.Date(SFcreateDate, format="%Y"), by=SFaccountID ] | |
#### LESA effort calculation #### | |
# global variables | |
escalationCostMultiplier <- 1 | |
costForLppMultiplier <- 2 | |
# calculating effort factor exchange rates | |
aRd <- mean(LESA$LESAgrossResolutionDays) | |
CmInRd <- aRd / mean(LESA$LESAcommentMass) | |
CcInRd <- aRd / mean(LESA$LESAcommentCount) | |
# averaging the 3 effort factors | |
LESA$LESAeffortInDays <- (LESA$LESAgrossResolutionDays + | |
LESA$LESAcommentMass * CmInRd + | |
LESA$LESAcommentCount * CcInRd) / 3 | |
# applying escalation and LPS multipliers | |
LESA$LESAeffortInDays <- LESA$LESAeffortInDays + | |
LESA$LESAeffortInDays * escalationCostMultiplier * LESA$LESAisEscalated + | |
LESA$LESAeffortInDays * costForLppMultiplier * LESA$LESAhasLPP | |
#### DUAL DATA - creating the LESA-ID SF-ID account mapping table #### | |
# mapping is based on opportunity ID recorded in LESA account notes | |
setkey(SF, SFopportunityID) | |
# map tables | |
LESAID_SFOPP_LINK <- data.table(read.csv("LESA-SF-AUTOMATED.csv")) | |
SFOPP_SFACC_LINK <- SF.SUBSCRIPTIONS[, list(SFopportunityID, SFaccountID)] | |
# set up keys | |
setkey(LESAID_SFOPP_LINK, SFopportunityID) | |
setkey(SFOPP_SFACC_LINK, SFopportunityID) | |
# cartesian products | |
CARTP1 <- LESAID_SFOPP_LINK[SFOPP_SFACC_LINK,] | |
CARTP2 <- SFOPP_SFACC_LINK[LESAID_SFOPP_LINK,] | |
# joining tables based opportunityID | |
MAPPING1 <- CARTP1[ !is.na(LESAaccountID) & !is.na(SFaccountID.1), list(LESAaccountID, SFaccountID = SFaccountID.1)] | |
MAPPING2 <- CARTP2[ !is.na(LESAaccountID) & !is.na(SFaccountID), list(LESAaccountID, SFaccountID)] | |
# appending MAP tables | |
MAPPING <- rbind(MAPPING1, MAPPING2) | |
# removing duplicates | |
setkey(MAPPING, LESAaccountID, SFaccountID) | |
MAPPING <- MAPPING[!duplicated(MAPPING),] | |
# cleanup | |
remove(CARTP1, CARTP2, MAPPING1, MAPPING2) | |
#### DUAL DATA - Support Effort vs. SF Invoice Renewal #### | |
# getting data filled up from LESA data | |
EFFORT.RENEWAL.CORRELATION <- LESA[, list(LESAcreateDate, | |
LESAaccountID, | |
LESAticketID, | |
LESAeffortInDays)] | |
setkey(EFFORT.RENEWAL.CORRELATION, LESAaccountID) | |
# mapping SF account ID for each ticket | |
EFFORT.RENEWAL.CORRELATION$SFaccountID <- MAPPING[ match( EFFORT.RENEWAL.CORRELATION$LESAaccountID, MAPPING$LESAaccountID ), SFaccountID] | |
# removing those tickets whose SF mapping is not available (resulting in 10.696 valid records) | |
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[!is.na(SFaccountID),] | |
# creating cartesian product of LESA ticket - SF opportunity data | |
setkey(SFOPP_SFACC_LINK, SFaccountID) | |
setkey(EFFORT.RENEWAL.CORRELATION, SFaccountID) | |
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[SFOPP_SFACC_LINK,] | |
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[!is.na(LESAticketID)] | |
EFFORT.RENEWAL.CORRELATION$SFcreateDate <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFcreateDate] | |
EFFORT.RENEWAL.CORRELATION$SFendDate <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFendDate] | |
EFFORT.RENEWAL.CORRELATION$SFtotalPrice <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFtotalPrice] | |
EFFORT.RENEWAL.CORRELATION$SFsubscriptionLevel <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFsubscriptionLevel] | |
EFFORT.RENEWAL.CORRELATION$SFGotRenewed <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFGotRenewed] | |
EFFORT.RENEWAL.CORRELATION$isSupported <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), isSupported] | |
EFFORT.RENEWAL.CORRELATION$SFindustry <- SF.SUBSCRIPTIONS[ match(EFFORT.RENEWAL.CORRELATION$SFopportunityID, SF.SUBSCRIPTIONS$SFopportunityID), SFindustry] | |
# removing records, where ticket creation date does not match opportunity lifespan - resulting valid ticket-invoice mapping! | |
EFFORT.RENEWAL.CORRELATION <- EFFORT.RENEWAL.CORRELATION[LESAcreateDate >= SFcreateDate & LESAcreateDate < SFendDate] | |
# aggregate by opportunity | |
EFFORT.RENEWAL.CORRELATION.AGGR <- EFFORT.RENEWAL.CORRELATION[, list( SFcreateDate = head(SFcreateDate), | |
SFendDate = head(SFendDate), | |
SFtotalPrice = head(SFtotalPrice), | |
SFdailyFee = sum(SFtotalPrice) / as.integer(head(SFendDate) - head(SFcreateDate)), | |
SFsubscriptionLevel = head(SFsubscriptionLevel), | |
SFGotRenewed = head(SFGotRenewed), | |
LESAeffort = sum(LESAeffortInDays), | |
LESAworkload = sum(LESAeffortInDays) / as.integer(head(SFendDate) - head(SFcreateDate)), | |
SFindustry = head(SFindustry)), by = SFopportunityID] | |
EFFORT.RENEWAL.CORRELATION.AGGR <- EFFORT.RENEWAL.CORRELATION.AGGR[!duplicated(SFopportunityID)] | |
#### SF data preprocessing for plotting #### | |
# subsetting to consistent data in SF by date | |
SF.SUBSCRIPTIONS.SANITIZED <- SF.SUBSCRIPTIONS[SFcreateDate >= as.Date("2009-01-01", format = "%Y-%m-%d") | |
& SFcreateDate < as.Date("2012-01-01", format = "%Y-%m-%d") ] | |
# subsetting to invoices that could have been renewed by 2013-01-01 | |
# SF.SUBSCRIPTIONS.SANITIZED <- SF.SUBSCRIPTIONS.SANITIZED[SFendDate < as.Date("2013-01-01"), ] | |
SF.AGGR.SunSet <- SF.SUBSCRIPTIONS.SANITIZED[SFendDate < as.Date("2030-01-01"), | |
list(lifespan = as.integer(max(SFendDate)-min(SFcreateDate)), | |
start = min(SFcreateDate), | |
supported = head(isSupported)), by=SFaccountID] | |
SF.AGGR.Current <- SF.SUBSCRIPTIONS[SFendDate < as.Date("2030-01-01") & SFendDate > as.Date("2013-01-01"), | |
list(lifespan = as.integer(max(SFendDate)-min(SFcreateDate)), | |
start = min(SFcreateDate), | |
supported = head(isSupported)), by=SFaccountID] | |
SF.AGGR.Unsorted <- SF.SUBSCRIPTIONS[SFendDate < as.Date("2020-01-01"), | |
list(lifespan = as.integer(max(SFendDate)-min(SFcreateDate)), | |
start = min(SFcreateDate), | |
supported = head(isSupported), | |
startYear = min(startYear)), by=SFaccountID] | |
#### --------------------------- PLOTTING --------------------------- #### | |
#### SF data plotting - RENEWAL RATE #### | |
ggplot(SF.SUBSCRIPTIONS.SANITIZED, | |
aes(x=SFcreateDate, | |
y=as.integer(SFGotRenewed), | |
color=isSupported, | |
fill=isSupported))+ | |
geom_smooth(method = "loess", span=.4, level = .9, size=1, alpha=.2)+ | |
labs(y="Renewal Rate", | |
x="Date", | |
title="% of Subscriptions Renewed After Expiration\n(grouped by supported / unsupported products)", | |
fill = "is supported", | |
color = "is supported") | |
# ggplot(SF.SUBSCRIPTIONS.SANITIZED, | |
# aes(x=SFcreateDate, | |
# y=as.integer(SFGotRenewed)))+ | |
# geom_smooth(method = "loess", span=.4, level = .9, size=1, alpha=.2)+geom_rug()+facet_wrap(~ SFindustry, ncol = 4) | |
# | |
#### SF data plotting - Lifespan of DISCONTINUED subscriptions #### | |
scatterplot <- ggplot(SF.AGGR.SunSet, | |
aes(x = start, | |
y = lifespan/365, | |
color = supported, | |
fill = supported)) + | |
geom_point() + | |
geom_smooth(method = "loess", | |
span = .4, | |
level = .9, | |
alpha = .3) + | |
scale_x_date(breaks = "1 year", | |
minor_breaks = "1 month", | |
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) + | |
scale_y_continuous(breaks = 0:6, | |
limits = c(0,6), | |
expand = c(.05,.05))+ | |
scale_color_brewer(palette="Set1")+ | |
scale_fill_brewer(palette="Set1")+ | |
labs(x = "time of the initial subscription", | |
y = "how long the subscriber stayed with us", | |
color = "supported product", | |
fill = "supported product") + | |
theme(legend.position = "bottom", | |
legend.direction = "horizontal", | |
plot.margin = unit(c(-5,-5,3,4.5), "mm")) | |
right_hist <- ggplot(SF.AGGR.SunSet, aes(lifespan/365))+ | |
geom_density(aes(y = ..count.., color = supported))+ | |
scale_x_continuous(breaks = 0:6, | |
limits = c(0,6), | |
expand = c(.05,.05))+ | |
scale_color_brewer(palette="Set1")+ | |
coord_flip()+ | |
theme(legend.position = "none", | |
axis.title.x=element_blank(), | |
axis.title.y=element_blank(), | |
plot.margin = unit(c(-5,3,26,0), "mm")) | |
top_hist <- ggplot(SF.AGGR.SunSet, aes(start))+ | |
geom_density(aes(y = ..count.., color = supported))+ | |
scale_color_brewer(palette="Set1")+ | |
scale_x_date(breaks = "1 year", | |
minor_breaks = "1 month", | |
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) + | |
theme(legend.position = "none", | |
axis.title.x=element_blank(), | |
axis.title.y=element_blank(), | |
plot.margin = unit(c(3,-5.5,4,3), "mm")) + | |
labs(title = "Lengths of All DISCONTINUED Subscriptions") | |
empty <- ggplot()+geom_point(aes(1,1), colour="white")+ | |
theme(axis.ticks=element_blank(), | |
panel.background=element_blank(), | |
axis.text.x=element_blank(), axis.text.y=element_blank(), | |
axis.title.x=element_blank(), axis.title.y=element_blank()) | |
grid.arrange(top_hist, | |
empty, | |
scatterplot, | |
right_hist, | |
ncol=2, | |
nrow=2, | |
widths=c(6, 1), | |
heights=c(4, 9)) | |
#### SF data plotting - Lifespan of ALL subscriptions #### | |
scatterplot <- ggplot(SF.AGGR.Unsorted[start >= as.Date("2009-01-01") & | |
start < as.Date("2013-01-01"),], | |
aes(x = start, | |
y = lifespan/365, | |
color = supported, | |
fill = supported)) + | |
geom_point() + | |
geom_smooth(method = "loess", | |
span = .4, | |
level = .9, | |
alpha = .3) + | |
scale_x_date(breaks = "1 year", | |
minor_breaks = "1 month", | |
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) + | |
scale_y_continuous(breaks = 0:6, | |
limits = c(0,6), | |
expand = c(.05,.05)) + | |
scale_color_brewer(palette="Set1")+ | |
scale_fill_brewer(palette="Set1")+ | |
labs(x = "time of the initial subscription", | |
y = "how long the subscriber stayed with us", | |
color = "supported product", | |
fill = "supported product") + | |
theme(legend.position = "bottom", | |
legend.direction = "horizontal", | |
plot.margin = unit(c(-5,-5,3,4.5), "mm")) | |
right_hist <- ggplot(SF.AGGR.Unsorted[start >= as.Date("2009-01-01") & | |
start < as.Date("2013-01-01"),], | |
aes(lifespan/365))+ | |
geom_density(aes(y = ..count.., color = supported))+ | |
scale_x_continuous(breaks = 0:6, | |
limits = c(0,6), | |
expand = c(.05,.05)) + scale_color_brewer(palette="Set1")+ | |
coord_flip()+ | |
theme(legend.position = "none", | |
axis.title.x=element_blank(), | |
axis.title.y=element_blank(), | |
plot.margin = unit(c(-5,3,26,0), "mm")) | |
top_hist <- ggplot(SF.AGGR.Unsorted[start >= as.Date("2009-01-01") & | |
start < as.Date("2013-01-01"),], | |
aes(start))+ | |
geom_density(aes(y = ..count.., color = supported))+ | |
scale_color_brewer(palette="Set1")+ | |
scale_x_date(breaks = "1 year", | |
minor_breaks = "1 month", | |
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) + | |
theme(legend.position = "none", | |
axis.title.x=element_blank(), | |
axis.title.y=element_blank(), | |
plot.margin = unit(c(3,-5.5,4,5), "mm")) + | |
labs(title = "Lengths of ALL Subscriptions\n(active and discontinued together)") | |
empty <- ggplot()+geom_point(aes(1,1), colour="white")+ | |
theme(axis.ticks=element_blank(), | |
panel.background=element_blank(), | |
axis.text.x=element_blank(), axis.text.y=element_blank(), | |
axis.title.x=element_blank(), axis.title.y=element_blank()) | |
grid.arrange(top_hist, | |
empty, | |
scatterplot, | |
right_hist, | |
ncol=2, | |
nrow=2, | |
widths=c(6, 1), | |
heights=c(4, 9)) | |
#### SF data plotting - Lifespan of CURRENTLY ACTIVE subscriptions #### | |
scatterplot <- ggplot(SF.AGGR.Current[start >= as.Date("2009-01-01") & | |
start < as.Date("2013-01-01"),], | |
aes(x = start, | |
y = lifespan/365, | |
color = supported, | |
fill = supported)) + | |
geom_point() + | |
geom_smooth(method = "loess", | |
span = .4, | |
level = .9, | |
alpha = .3) + | |
scale_x_date(breaks = "1 year", | |
minor_breaks = "1 month", | |
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) + | |
scale_y_continuous(breaks = 0:6, | |
limits = c(0,6), | |
expand = c(.05,.05)) + | |
scale_color_brewer(palette="Set1")+ | |
scale_fill_brewer(palette="Set1")+ | |
labs(x = "time of the initial subscription", | |
y = "how long the subscriber stayed with us", | |
color = "supported product", | |
fill = "supported product") + | |
theme(legend.position = "bottom", | |
legend.direction = "horizontal", | |
plot.margin = unit(c(-5,-5,3,4.5), "mm")) | |
right_hist <- ggplot(SF.AGGR.Current[start >= as.Date("2009-01-01") & | |
start < as.Date("2013-01-01"),], | |
aes(lifespan/365))+ | |
geom_density(aes(y = ..count.., color = supported))+ | |
scale_x_continuous(breaks = 0:6, | |
limits = c(0,6), | |
expand = c(.05,.05)) + | |
scale_color_brewer(palette="Set1")+ | |
coord_flip()+ | |
theme(legend.position = "none", | |
axis.title.x=element_blank(), | |
axis.title.y=element_blank(), | |
plot.margin = unit(c(-5,3,26,0), "mm")) | |
top_hist <- ggplot(SF.AGGR.Current[start >= as.Date("2009-01-01") & | |
start < as.Date("2013-01-01"),], | |
aes(start))+ | |
geom_density(aes(y = ..count.., color = supported))+ | |
scale_color_brewer(palette="Set1")+ | |
scale_x_date(breaks = "1 year", | |
minor_breaks = "1 month", | |
limits = c(as.Date("2009-01-01"), as.Date("2013-01-01"))) + | |
theme(legend.position = "none", | |
axis.title.x=element_blank(), | |
axis.title.y=element_blank(), | |
plot.margin = unit(c(3,-5.5,4,5), "mm")) + | |
labs(title = "Lengths of CURRENTLY ACTIVE Subscriptions\n(including future end dates)") | |
empty <- ggplot()+geom_point(aes(1,1), colour="white")+ | |
theme(axis.ticks=element_blank(), | |
panel.background=element_blank(), | |
axis.text.x=element_blank(), axis.text.y=element_blank(), | |
axis.title.x=element_blank(), axis.title.y=element_blank()) | |
grid.arrange(top_hist, | |
empty, | |
scatterplot, | |
right_hist, | |
ncol=2, | |
nrow=2, | |
widths=c(6, 1), | |
heights=c(4, 9)) | |
#### DUAL DATA plotting #### | |
# library(scales) | |
# | |
# ggplot(EFFORT.RENEWAL.CORRELATION.AGGR, aes(x = LESAeffort, y = as.integer(SFGotRenewed)))+ | |
# geom_smooth(method = "loess", span = .7, level = 0)+ | |
# scale_x_continuous(limits = c(7,2000)) | |
# facet_wrap(~ SFindustry) | |
# breaks = trans_breaks("log10", function(x) 10^x), | |
# labels = trans_format("log10", math_format(10^.x)) | |
# support as cost center graph | |
top <- ggplot(EFFORT.RENEWAL.CORRELATION.AGGR, aes(x = LESAworkload))+ | |
geom_density(aes(y = ..density..))+ | |
theme(axis.title.x = element_blank(), | |
plot.margin = unit(c(2,2,2,2), "mm")) + | |
labs(title = "", | |
y = "subsc. density") + | |
scale_x_continuous(limits = c(0,10))+ | |
geom_vline(aes(xintercept = 1.75), color = "red") | |
mid <- ggplot(EFFORT.RENEWAL.CORRELATION.AGGR, | |
aes(x = LESAworkload, | |
y = as.integer(SFGotRenewed))) + | |
geom_smooth(method = "loess", span = .7, level = 0) + | |
theme(axis.title.x=element_blank(), | |
legend.position = "none", | |
plot.margin = unit(c(2,2,2,2), "mm")) + | |
scale_x_continuous(limits = c(0,10)) + | |
labs(y = "renewal rate")+ | |
geom_vline(aes(xintercept = 1.75), color = "red") | |
bot <- ggplot(EFFORT.RENEWAL.CORRELATION.AGGR, | |
aes(x = LESAworkload, | |
y = SFdailyFee)) + | |
geom_smooth(method = "loess", span = .7, level = 0) + | |
theme(legend.position = "none", | |
plot.margin = unit(c(2,2,2,-0.5), "mm")) + | |
scale_x_continuous(limits = c(0,10)) + | |
# scale_y_continuous(limits = c(0,5000)) + | |
labs(x = "support load", | |
y = "daily fee")+ | |
geom_vline(aes(xintercept = 1.75), color = "red") | |
grid.arrange(top, mid, bot, | |
ncol=1, | |
nrow=3, | |
heights=c(1,1,1)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment