Skip to content

Instantly share code, notes, and snippets.

@shedoesdatascience
Last active October 3, 2023 05:40
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save shedoesdatascience/de3c5d3c2c88132339347c7da838a126 to your computer and use it in GitHub Desktop.
Save shedoesdatascience/de3c5d3c2c88132339347c7da838a126 to your computer and use it in GitHub Desktop.
A/B Testing in R
#### 1. Import libraries and set working directory ####
library(readr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(pwr)
library(plyr)
setwd("C:\\Users\\sssssss\\Desktop\\TM\\")
#### 2. Read in files and merge ####
eng_df<-read.csv("email_engagement.csv")
length(unique(eng_df$user_id))
dim(eng_df) #all user ids are unique with no duplicates - 4,648 ids
test_conv_df<-read.csv("test_conversions.csv") #those that converted as a result of A/B test
length(unique(test_conv_df$user_id)) #only 2,210 ids
summary(test_conv_df) #only contains conversion data
variations_df<-read.csv("variations.csv")
visits_df<-read.csv("visits.csv")
length(unique(visits_df$user_id)) #more user_ids than in the email engagement file
merge_1<-merge(variations_df,visits_df,by.x="user_id",by.y="user_id")
merge_2<-merge(merge_1,test_conv_df,by.x="user_id",by.y="user_id",all.x=TRUE)
merge_3<-merge(merge_2,eng_df,by.x="user_id",by.y="user_id",all.x=TRUE)
#Assume that if NA for conversion then 0. i.e. user did not click
#Assume that if NA for clicked then 0. i.e. user did not click
merge_3$converted<-if_else(is.na(merge_3$converted),0,1)
merge_3$clicked_on_email<-if_else(is.na(merge_3$clicked_on_email),0,1)
merge_3$converted<-as.factor(merge_3$converted)
merge_3$clicked_on_email<-as.factor(merge_3$clicked_on_email)
#### 3. Data Transformation ####
#convert visit time into a useful variable
merge_3$timeofday<- mapvalues(hour(merge_3$visit_time),from=c(0:23),
to=c(rep("night",times=5), rep("morning",times=6),rep("afternoon",times=5),rep("night", times=8)))
merge_3$timeofday<-as.factor(merge_3$timeofday)
#### 4. Exploratory data analysis to understand users ####
#Income ranges from 14,000 to $1511958 - too much variation to see a trend
l <- ggplot(merge_3, aes(variation,fill = converted))
l <- l + geom_histogram(stat="count")
print(l)
print("Proportion that converted by variation converted")
tapply(as.numeric(merge_3$converted) - 1 ,merge_3$variation,mean)
#slightly greater proportion of those that viewed treatment variation converted
### 5.1 A/B testing - visits, variations and test converions
# Null Hypothesis: Assumption that there is no difference between the conversion rates for control & exp
#Alternative Hypothesis: There is a difference between the conversion rates for control & exp
control_sz<-length(which(merge_3$variation=="control"))
exp_sz<-length(which(merge_3$variation=="treatment"))
control_yes<-length(which(merge_3$variation=="control" & merge_3$converted=="1"))
exp_yes<-length(which(merge_3$variation=="treatment" & merge_3$converted=="1"))
prop.test(c (control_yes, exp_yes), c (control_sz, exp_sz))
#p-value<0.05 - so yes there is a difference in conversion rates. Slightly higher for treatment
#chi-sq test for conversion and variation
#Null hypothesis: There is no association between conversion and variation
ch_test<-chisq.test(merge_3$variation,merge_3$converted) #p-value < 0.05, There may be an association between
#likelihood of conversion and variation
ch_test$stdres
#Pearson's std residuals measure
# how large is the deviation from each cell to the null hypothesis (in this case, independence between row and column's).
#treatment is negatively associated with non-conversion, so it has fewer proportion of people who did not convert
#test which variables are key in determining conversion via logistic regression
## 5.2 Further analysis - logistic regression ##
glm_model<-glm(converted~variation+channel+age+gender+timeofday+income,data=merge_3,family = binomial(link="logit"))
summary(glm_model)
#only variation treatment is statistically signficant
#positive coefficient indicates that all other factors remaining fixed, if person has viewed treatment variation, they are more likely to convert
#viewing treatment rather than control increases odds by 0.0272 for conversion
anova(glm_model, test="Chisq")
#Finding: Only adding channel and income is statistically significant in reducing the residual deviance.
#How our model does against model without intercep
#assess model fit
library(pscl)
pR2(glm_model) #McFadden R2 index can be used to assess the model fit.
#only 12.94$ of variation in whether or not someone will convert is likely to be explained by current model
#### 5. Exploratory learning for engagemnt with emails ####
#Exploratory data analysis to understand relationship between attributes and engagement with emails
l <- ggplot(merge_3, aes(gender,fill = clicked_on_email))
l <- l + geom_histogram(stat="count")
print(l)
print("Greater proportion of males clicked on email")
tapply(as.numeric(merge_3$clicked_on_email) - 1 ,merge_3$gender,mean)
l <- ggplot(merge_3, aes(timeofday,fill = clicked_on_email))
l <- l + geom_histogram(stat="count")
print(l)
print("Greatest proportion of people that visited website visited in the nighttime. Similar proportions of those that clicked on email by time of day")
tapply(as.numeric(merge_3$clicked_on_email) - 1 ,merge_3$timeofday,mean)
l <- ggplot(merge_3, aes(channel,fill = clicked_on_email))
l <- l + geom_histogram(stat="count")
print(l)
print("Those that had PPC as their channel had the highest proportion of clicking on email (73%).
However, highest number of users were prompted to visit the website due to TV")
tapply(as.numeric(merge_3$clicked_on_email) - 1 ,merge_3$channel,mean)
#Age has 1,243 missing values
row.has.na <- apply(merge_3, 1, function(x){any(is.na(x))})
sum(row.has.na)
final.filtered <- merge_3[!row.has.na,]
ggplot(final.filtered,aes(x=clicked_on_email,y=age))+geom_boxplot()
print("On average, those that converted were on average 38 years old, while those that didn't were around 32 years of age")
dplyr::group_by(final.filtered, clicked_on_email) %>%
dplyr::summarise(mean=mean(age), sd=sd(age))
ggplot(final.filtered,aes(x=clicked_on_email,y=income))+geom_boxplot()
dplyr::group_by(final.filtered, clicked_on_email) %>%
dplyr::summarise(mean=mean(income), sd=sd(income))
print("On average, those that converted on average earned more than those that didn't. High variation in income, with a lot of very high income values in both groups")
#### 6. Machine learning model for engagement with emails ####
library(caTools)
library(e1071)
library(glmnet)
## drop columns not required
mydatanew = final.filtered[,-c(1,3,8)]
#randomly split data into training, test and validation
#Splitting data
split <- sample.split(mydatanew$clicked_on_email, SplitRatio = 0.70)
train <- subset(mydatanew, split == T) #ensure same proportion of clicks and non-clicks in both training and test sets to ensure balance
test <- subset(mydatanew, split == F)
## Testing model accuracy on test set
model_glm <- glm(clicked_on_email ~ ., data = train, family='binomial')
predicted_glm <- predict(model_glm, test, type='response')
predicted_glm <- ifelse(predicted_glm > 0.5,1,0)
misClasificError <- mean(predicted_glm != test$clicked_on_email)
print(paste('Accuracy',1-misClasificError))
p<-predicted_glm
pr <- prediction(p, test$clicked_on_email)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
summary(model_glm)
print("Statistically signficant predicturs are channel, age, and gender where males are
more likely to click on email and those that come via Facebook, followed by PPC and then TV. Also, an older user is more likely to click on an email than a younger user")
##Response to Q1: There is a statistically significant difference in those that converted based on variation; however, the difference in proportion is not very large. I would recommend re-running the test with a larger dataset to ensure reproducibility.
##Response to Q2: The model can predict whether or not someone will click on an email with 84% accuracy. Top predictors are gender, age, and channel.
@MarkDebono
Copy link

Hi Sheenal, thanks for the article, it is very interesting! By any chance, are you providing the data file "email_engagement.csv" in order to be able to go true and understand the code/method used? Regards, Mark

@datasciencevj
Copy link

Thanks Sheenal! Would be great if you can share the data file. You can mask your data file.

@shedoesdatascience
Copy link
Author

Thanks @MarkDebono and @datasciencevj for reading my article. Unfortunately, I no longer have access to these data files.

@jess8181
Copy link

jess8181 commented Dec 4, 2021

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment