Last active
October 3, 2023 05:40
-
-
Save shedoesdatascience/de3c5d3c2c88132339347c7da838a126 to your computer and use it in GitHub Desktop.
A/B Testing in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### 1. Import libraries and set working directory #### | |
library(readr) | |
library(dplyr) | |
library(ggplot2) | |
library(lubridate) | |
library(pwr) | |
library(plyr) | |
setwd("C:\\Users\\sssssss\\Desktop\\TM\\") | |
#### 2. Read in files and merge #### | |
eng_df<-read.csv("email_engagement.csv") | |
length(unique(eng_df$user_id)) | |
dim(eng_df) #all user ids are unique with no duplicates - 4,648 ids | |
test_conv_df<-read.csv("test_conversions.csv") #those that converted as a result of A/B test | |
length(unique(test_conv_df$user_id)) #only 2,210 ids | |
summary(test_conv_df) #only contains conversion data | |
variations_df<-read.csv("variations.csv") | |
visits_df<-read.csv("visits.csv") | |
length(unique(visits_df$user_id)) #more user_ids than in the email engagement file | |
merge_1<-merge(variations_df,visits_df,by.x="user_id",by.y="user_id") | |
merge_2<-merge(merge_1,test_conv_df,by.x="user_id",by.y="user_id",all.x=TRUE) | |
merge_3<-merge(merge_2,eng_df,by.x="user_id",by.y="user_id",all.x=TRUE) | |
#Assume that if NA for conversion then 0. i.e. user did not click | |
#Assume that if NA for clicked then 0. i.e. user did not click | |
merge_3$converted<-if_else(is.na(merge_3$converted),0,1) | |
merge_3$clicked_on_email<-if_else(is.na(merge_3$clicked_on_email),0,1) | |
merge_3$converted<-as.factor(merge_3$converted) | |
merge_3$clicked_on_email<-as.factor(merge_3$clicked_on_email) | |
#### 3. Data Transformation #### | |
#convert visit time into a useful variable | |
merge_3$timeofday<- mapvalues(hour(merge_3$visit_time),from=c(0:23), | |
to=c(rep("night",times=5), rep("morning",times=6),rep("afternoon",times=5),rep("night", times=8))) | |
merge_3$timeofday<-as.factor(merge_3$timeofday) | |
#### 4. Exploratory data analysis to understand users #### | |
#Income ranges from 14,000 to $1511958 - too much variation to see a trend | |
l <- ggplot(merge_3, aes(variation,fill = converted)) | |
l <- l + geom_histogram(stat="count") | |
print(l) | |
print("Proportion that converted by variation converted") | |
tapply(as.numeric(merge_3$converted) - 1 ,merge_3$variation,mean) | |
#slightly greater proportion of those that viewed treatment variation converted | |
### 5.1 A/B testing - visits, variations and test converions | |
# Null Hypothesis: Assumption that there is no difference between the conversion rates for control & exp | |
#Alternative Hypothesis: There is a difference between the conversion rates for control & exp | |
control_sz<-length(which(merge_3$variation=="control")) | |
exp_sz<-length(which(merge_3$variation=="treatment")) | |
control_yes<-length(which(merge_3$variation=="control" & merge_3$converted=="1")) | |
exp_yes<-length(which(merge_3$variation=="treatment" & merge_3$converted=="1")) | |
prop.test(c (control_yes, exp_yes), c (control_sz, exp_sz)) | |
#p-value<0.05 - so yes there is a difference in conversion rates. Slightly higher for treatment | |
#chi-sq test for conversion and variation | |
#Null hypothesis: There is no association between conversion and variation | |
ch_test<-chisq.test(merge_3$variation,merge_3$converted) #p-value < 0.05, There may be an association between | |
#likelihood of conversion and variation | |
ch_test$stdres | |
#Pearson's std residuals measure | |
# how large is the deviation from each cell to the null hypothesis (in this case, independence between row and column's). | |
#treatment is negatively associated with non-conversion, so it has fewer proportion of people who did not convert | |
#test which variables are key in determining conversion via logistic regression | |
## 5.2 Further analysis - logistic regression ## | |
glm_model<-glm(converted~variation+channel+age+gender+timeofday+income,data=merge_3,family = binomial(link="logit")) | |
summary(glm_model) | |
#only variation treatment is statistically signficant | |
#positive coefficient indicates that all other factors remaining fixed, if person has viewed treatment variation, they are more likely to convert | |
#viewing treatment rather than control increases odds by 0.0272 for conversion | |
anova(glm_model, test="Chisq") | |
#Finding: Only adding channel and income is statistically significant in reducing the residual deviance. | |
#How our model does against model without intercep | |
#assess model fit | |
library(pscl) | |
pR2(glm_model) #McFadden R2 index can be used to assess the model fit. | |
#only 12.94$ of variation in whether or not someone will convert is likely to be explained by current model | |
#### 5. Exploratory learning for engagemnt with emails #### | |
#Exploratory data analysis to understand relationship between attributes and engagement with emails | |
l <- ggplot(merge_3, aes(gender,fill = clicked_on_email)) | |
l <- l + geom_histogram(stat="count") | |
print(l) | |
print("Greater proportion of males clicked on email") | |
tapply(as.numeric(merge_3$clicked_on_email) - 1 ,merge_3$gender,mean) | |
l <- ggplot(merge_3, aes(timeofday,fill = clicked_on_email)) | |
l <- l + geom_histogram(stat="count") | |
print(l) | |
print("Greatest proportion of people that visited website visited in the nighttime. Similar proportions of those that clicked on email by time of day") | |
tapply(as.numeric(merge_3$clicked_on_email) - 1 ,merge_3$timeofday,mean) | |
l <- ggplot(merge_3, aes(channel,fill = clicked_on_email)) | |
l <- l + geom_histogram(stat="count") | |
print(l) | |
print("Those that had PPC as their channel had the highest proportion of clicking on email (73%). | |
However, highest number of users were prompted to visit the website due to TV") | |
tapply(as.numeric(merge_3$clicked_on_email) - 1 ,merge_3$channel,mean) | |
#Age has 1,243 missing values | |
row.has.na <- apply(merge_3, 1, function(x){any(is.na(x))}) | |
sum(row.has.na) | |
final.filtered <- merge_3[!row.has.na,] | |
ggplot(final.filtered,aes(x=clicked_on_email,y=age))+geom_boxplot() | |
print("On average, those that converted were on average 38 years old, while those that didn't were around 32 years of age") | |
dplyr::group_by(final.filtered, clicked_on_email) %>% | |
dplyr::summarise(mean=mean(age), sd=sd(age)) | |
ggplot(final.filtered,aes(x=clicked_on_email,y=income))+geom_boxplot() | |
dplyr::group_by(final.filtered, clicked_on_email) %>% | |
dplyr::summarise(mean=mean(income), sd=sd(income)) | |
print("On average, those that converted on average earned more than those that didn't. High variation in income, with a lot of very high income values in both groups") | |
#### 6. Machine learning model for engagement with emails #### | |
library(caTools) | |
library(e1071) | |
library(glmnet) | |
## drop columns not required | |
mydatanew = final.filtered[,-c(1,3,8)] | |
#randomly split data into training, test and validation | |
#Splitting data | |
split <- sample.split(mydatanew$clicked_on_email, SplitRatio = 0.70) | |
train <- subset(mydatanew, split == T) #ensure same proportion of clicks and non-clicks in both training and test sets to ensure balance | |
test <- subset(mydatanew, split == F) | |
## Testing model accuracy on test set | |
model_glm <- glm(clicked_on_email ~ ., data = train, family='binomial') | |
predicted_glm <- predict(model_glm, test, type='response') | |
predicted_glm <- ifelse(predicted_glm > 0.5,1,0) | |
misClasificError <- mean(predicted_glm != test$clicked_on_email) | |
print(paste('Accuracy',1-misClasificError)) | |
p<-predicted_glm | |
pr <- prediction(p, test$clicked_on_email) | |
prf <- performance(pr, measure = "tpr", x.measure = "fpr") | |
plot(prf) | |
auc <- performance(pr, measure = "auc") | |
auc <- auc@y.values[[1]] | |
auc | |
summary(model_glm) | |
print("Statistically signficant predicturs are channel, age, and gender where males are | |
more likely to click on email and those that come via Facebook, followed by PPC and then TV. Also, an older user is more likely to click on an email than a younger user") | |
##Response to Q1: There is a statistically significant difference in those that converted based on variation; however, the difference in proportion is not very large. I would recommend re-running the test with a larger dataset to ensure reproducibility. | |
##Response to Q2: The model can predict whether or not someone will click on an email with 84% accuracy. Top predictors are gender, age, and channel. |
Thanks @MarkDebono and @datasciencevj for reading my article. Unfortunately, I no longer have access to these data files.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks Sheenal! Would be great if you can share the data file. You can mask your data file.