zjuul/cohort_chart.R

## cohort_chart.R
# From: https://stuifbergen.com/2018/03/cohort-analysis-with-snowplow-and-r/

########## 1: get data
appname <- "NAME" # used as selection criterium, replace that with the name of your app
q <- paste0("
  SELECT
  user_id,
  TO_CHAR(CONVERT_TIMEZONE('UTC', 'Europe/Amsterdam', derived_tstamp),'YYYY ww') AS yw
  FROM atomic.events
  WHERE app_id = '", appname, "' AND derived_tstamp >= '2018-01-01'
  GROUP BY 1,2;
")
conn <- dbConnect(driver, url) # make the database connection - this is from the RJDBC package (library(RJDBC))
dbdata <- dbGetQuery(conn, q)  # get the data
dbDisconnect(conn)             # close the connection

########## 2: make cohort table
library(tidyverse)

cohort <- dbdata %>%           # store in cohort table, get from dbdata
  group_by(user_id) %>%        # group all users together
  mutate(first = min(yw)) %>%  # for every user, find the first period
  group_by(first, yw) %>%      # group by this first period + the other periods
  summarise(users = n()) %>%   # for each combination, count the number of users
  spread(yw, users)            # and make columns with period names


########## 3: make pretty
shiftrow <- function(v) {
  # put a vector in, strip off leading NA values, and place that amount at the end
  first_na_index <- min( which(!is.na(v)) )

  # return that bit to the end,  and pad with NAs.
  c(v[first_na_index:length(v)], rep(NA, first_na_index-1))
}

# create a new dataframe, with shifted rows (and keep the first one)
shifted <- data.frame(
    cohort = cohort$first,
    t(apply( select(as.data.frame(cohort), 2:ncol(cohort)), # 2nd column to the end
           1, # for every row
           shiftrow ))
)

# and make column names readable
# first should be "cohort" and the rest week.<number>, (padded)
colnames(shifted) <- c("cohort", sub("","week.", str_pad(1:(ncol(shifted)-1),2,pad = "0")))

# percentages
shifted_pct <- data.frame(
  cohort = shifted$cohort, # first column
  shifted[,2:nrow(shifted)+1] / shifted[["week.01"]] # rest: divide by week.01
)

######### 4: prepare plot data
# ggplot loves long data. Let's melt it. One for the absolute values, one for the pcts
plotdata_abs <- gather(shifted,     "cohort_age", "people"  ,2:ncol(shifted    ))
plotdata_pct <- gather(shifted_pct, "cohort_age", "percent" ,2:ncol(shifted_pct))

# now add some data.. we need pretty labels..
# first bit is the length of the width of the wide column (minus 1, that's the cohort name)
# that contains the absolute numbers
# last bit is the rest, those are percentages.
labelnames <- c( plotdata_abs$people[1:(ncol(shifted)-1)],
                 plotdata_pct$percent[(ncol(shifted)):(nrow(plotdata_pct))])

# we need pretty labels.
pretty_print <- function(n) {
  case_when( n <= 1  ~ sprintf("%1.0f %%", n*100),
             n >  1  ~ as.character(n),
             TRUE    ~ " ") # for NA values, skip the label
}

# create the plot data
plotdata <- data.frame(
  cohort     = plotdata_pct$cohort,
  cohort_age = plotdata_pct$cohort_age,
  percentage = plotdata_pct$percent,
  label      = pretty_print(labelnames)
)

######### 5: plot!
# plot (with reordered y axis, oldest group on top)

# optional: if the percentages are really low, replace the 1.0 in the first column with zero
plotdata[which(plotdata$percentage == 1), "percentage"] <- 0

ggplot(plotdata, aes(x = cohort_age, y = reorder(cohort, desc(cohort)))) +
  geom_raster(aes(fill = percentage)) +
  scale_fill_continuous(guide = FALSE) + # no legend
  geom_text(aes(label = label), color = "white") +
  xlab("cohort age") + ylab("cohort") +
  ggtitle(paste("Retention table (cohort) for",appname, "app"))
	# From: https://stuifbergen.com/2018/03/cohort-analysis-with-snowplow-and-r/

	########## 1: get data
	appname <- "NAME" # used as selection criterium, replace that with the name of your app
	q <- paste0("
	SELECT
	user_id,
	TO_CHAR(CONVERT_TIMEZONE('UTC', 'Europe/Amsterdam', derived_tstamp),'YYYY ww') AS yw
	FROM atomic.events
	WHERE app_id = '", appname, "' AND derived_tstamp >= '2018-01-01'
	GROUP BY 1,2;
	")
	conn <- dbConnect(driver, url) # make the database connection - this is from the RJDBC package (library(RJDBC))
	dbdata <- dbGetQuery(conn, q) # get the data
	dbDisconnect(conn) # close the connection

	########## 2: make cohort table
	library(tidyverse)

	cohort <- dbdata %>% # store in cohort table, get from dbdata
	group_by(user_id) %>% # group all users together
	mutate(first = min(yw)) %>% # for every user, find the first period
	group_by(first, yw) %>% # group by this first period + the other periods
	summarise(users = n()) %>% # for each combination, count the number of users
	spread(yw, users) # and make columns with period names


	########## 3: make pretty
	shiftrow <- function(v) {
	# put a vector in, strip off leading NA values, and place that amount at the end
	first_na_index <- min( which(!is.na(v)) )

	# return that bit to the end, and pad with NAs.
	c(v[first_na_index:length(v)], rep(NA, first_na_index-1))
	}

	# create a new dataframe, with shifted rows (and keep the first one)
	shifted <- data.frame(
	cohort = cohort$first,
	t(apply( select(as.data.frame(cohort), 2:ncol(cohort)), # 2nd column to the end
	1, # for every row
	shiftrow ))
	)

	# and make column names readable
	# first should be "cohort" and the rest week.<number>, (padded)
	colnames(shifted) <- c("cohort", sub("","week.", str_pad(1:(ncol(shifted)-1),2,pad = "0")))

	# percentages
	shifted_pct <- data.frame(
	cohort = shifted$cohort, # first column
	shifted[,2:nrow(shifted)+1] / shifted[["week.01"]] # rest: divide by week.01
	)

	######### 4: prepare plot data
	# ggplot loves long data. Let's melt it. One for the absolute values, one for the pcts
	plotdata_abs <- gather(shifted, "cohort_age", "people" ,2:ncol(shifted ))
	plotdata_pct <- gather(shifted_pct, "cohort_age", "percent" ,2:ncol(shifted_pct))

	# now add some data.. we need pretty labels..
	# first bit is the length of the width of the wide column (minus 1, that's the cohort name)
	# that contains the absolute numbers
	# last bit is the rest, those are percentages.
	labelnames <- c( plotdata_abs$people[1:(ncol(shifted)-1)],
	plotdata_pct$percent[(ncol(shifted)):(nrow(plotdata_pct))])

	# we need pretty labels.
	pretty_print <- function(n) {
	case_when( n <= 1 ~ sprintf("%1.0f %%", n*100),
	n > 1 ~ as.character(n),
	TRUE ~ " ") # for NA values, skip the label
	}

	# create the plot data
	plotdata <- data.frame(
	cohort = plotdata_pct$cohort,
	cohort_age = plotdata_pct$cohort_age,
	percentage = plotdata_pct$percent,
	label = pretty_print(labelnames)
	)

	######### 5: plot!
	# plot (with reordered y axis, oldest group on top)

	# optional: if the percentages are really low, replace the 1.0 in the first column with zero
	plotdata[which(plotdata$percentage == 1), "percentage"] <- 0

	ggplot(plotdata, aes(x = cohort_age, y = reorder(cohort, desc(cohort)))) +
	geom_raster(aes(fill = percentage)) +
	scale_fill_continuous(guide = FALSE) + # no legend
	geom_text(aes(label = label), color = "white") +
	xlab("cohort age") + ylab("cohort") +
	ggtitle(paste("Retention table (cohort) for",appname, "app"))