Skip to content

Instantly share code, notes, and snippets.

@jwinternheimer
Last active August 29, 2015 14:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwinternheimer/4c36ce1efe29ecd48b89 to your computer and use it in GitHub Desktop.
Save jwinternheimer/4c36ce1efe29ecd48b89 to your computer and use it in GitHub Desktop.
churn_exploration.R
library(data.table); library(dplyr); library(tidyr)
library(ggplot2); library(scales); library(grid); library(RColorBrewer)
###############################################################
## MOST COMMON ACTIONS DURING FIRST TWO WEEKS
###############################################################
## IMPORT AND TIDY DATA
healthy_user_actions <- read.table("~/Downloads/small_user_actions.csv",header=T,sep=",")
churned_user_actions <- read.table("~/Downloads/churned_users_actions.csv",header=T,sep=",")
names(healthy_user_actions) <- c("user_id","action","count")
names(churned_user_actions) <- c("user_id","action","count")
## GROUP BY ACTION, BIND DATASETS
healthy_by_action <- healthy_user_actions %>%
group_by(action) %>%
summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
arrange(desc(count_per_user))
churned_by_action <- churned_user_actions %>%
group_by(action) %>%
summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
mutate(count_per_user = total_count/users, user_type=as.factor("churned"))
total_by_action <- rbind(healthy_by_action,churned_by_action)
save(total_by_action, file="~/Google Drive/total_by_action.Rda")
## TARGET ACTIONS - PLOT TOTAL ACTION COUNTS PER USER
target_actions1 <- arrange(total_by_action,desc(total_count))[0:10,]$action
target_actions2 <- arrange(total_by_action,desc(total_count))[11:20,]$action
target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action
## HORIZONTAL BAR PLOT OF TOP ACTIONS (BY 10)
action_bar_plot1 <- ggplot(filter(total_by_action,action %in% target_actions1),
aes(x=action,y=count_per_user,fill=user_type)) +
geom_bar(stat="identity",position="dodge") +
coord_flip() + buffer_theme() +
labs(x="",y="Average Actions Per User")
action_bar_plot2 <- ggplot(filter(total_by_action,action %in% target_actions2),
aes(x=action,y=count_per_user,fill=user_type)) +
geom_bar(stat="identity",position="dodge") +
coord_flip() + buffer_theme() +
labs(x="",y="Average Actions Per User")
action_bar_plot3 <- ggplot(filter(total_by_action,action %in% target_actions3),
aes(x=action,y=count_per_user,fill=user_type)) +
geom_bar(stat="identity",position="dodge") +
coord_flip() + buffer_theme() +
labs(x="",y="Average Actions Per User")
action_bar_plot4 <- ggplot(filter(total_by_action,action %in% target_actions4),
aes(x=action,y=count_per_user,fill=user_type)) +
geom_bar(stat="identity",position="dodge") +
coord_flip() + buffer_theme() +
labs(x="",y="Average Actions Per User")
action_bar_plot5 <- ggplot(filter(total_by_action,action %in% target_actions5),
aes(x=action,y=count_per_user,fill=user_type)) +
geom_bar(stat="identity",position="dodge") +
coord_flip() + buffer_theme() +
labs(x="",y="Average Actions Per User")
###############################################################
## UPDATES BEHAVIOR
###############################################################
## IMPORT AND TIDY DATA
churned_update_actions <- read.table("~/Downloads/churned_users_updates.csv",sep=",",header=T)
healthy_update_actions <- read.table("~/Downloads/healthy_users_updates.csv",sep=",",header=T)
names(churned_update_actions) <- c("user_id","action","count")
names(healthy_update_actions) <- c("user_id","action","count")
churned_update_actions$user_type <- as.factor("churned")
healthy_update_actions$user_type <- as.factor("healthy")
total_update_actions <- rbind(churned_update_actions,healthy_update_actions)
## GROUP BY ACTION, BIND DATASETS
healthy_updates_by_action <- healthy_update_actions %>%
group_by(action) %>%
summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
arrange(desc(count_per_user))
churned_updates_by_action <- churned_update_actions %>%
group_by(action) %>%
summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
arrange(desc(count_per_user))
total_updates_by_action <- rbind(healthy_updates_by_action,churned_updates_by_action)
save(total_updates_by_action, file="~/Google Drive/total_updates_by_action.Rda")
## TARGET ACTIONS - FIND MOST COMMON ACTIONS
target_update_actions1 <- arrange(total_updates_by_action,desc(total_count))[0:1,]$action
target_update_actions2 <- arrange(total_updates_by_action,desc(total_count))[2,]$action
target_update_actions3 <- arrange(total_updates_by_action,desc(total_count))[3,]$action
target_update_actions4 <- arrange(total_updates_by_action,desc(total_count))[4,]$action
target_update_actions5 <- arrange(total_updates_by_action,desc(total_count))[5,]$action
target_actions2 <- arrange(total_updates_by_action,desc(total_count))[11:20,]$action
target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action
## FIND ACTIONS WITH BIGGEST DIFFERENCES IN COUNTS
## CDF OF ACTIONS COUNT
updates_cdf <- ggplot(filter(total_update_actions,action %in% target_update_actions1),
aes(x=count,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,250)) +
labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Dashboard") +
scale_y_continuous(breaks=seq(0,1.00,0.2))
updates_cdf2 <- ggplot(filter(total_update_actions,action %in% target_update_actions2),
aes(x=count,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
labs(x="Number of Updates",y="Percent of Users",title="Suggestions Shared in First 2 Weeks") +
scale_y_continuous(breaks=seq(0,1.00,0.2))
updates_cdf4 <- ggplot(filter(total_update_actions,action %in% target_update_actions4),
aes(x=count,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
labs(x="Number of Updates",y="Percent of Users",title="Feeds Shared in First 2 Weeks") +
scale_y_continuous(breaks=seq(0,1.00,0.2))
updates_cdf5 <- ggplot(filter(total_update_actions,action %in% target_update_actions5),
aes(x=count,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Composer in First 2 Weeks") +
scale_y_continuous(breaks=seq(0,1.00,0.2))
###############################################################
## INTERACTIONS PER UPDATE
###############################################################
## IMPORT AND TIDY DATA
churned_interactions <- read.table("~/Downloads/churned_user_interaction.csv",sep=",",header=T)
healthy_interactions <- read.table("~/Downloads/healthy_user_interaction.csv",sep=",",header=T)
names(churned_interactions) <- c("user_id","service","interactions","updates")
names(healthy_interactions) <- c("user_id","service","interactions","updates")
churned_interactions$user_type <- as.factor("churned")
healthy_interactions$user_type <- as.factor("healthy")
total_interactions <- rbind(churned_interactions,healthy_interactions)
total_interactions <- total_interactions %>%
mutate(interaction_per_update = interactions/updates)
## CDFS OF INTERACTIONS
twitter_interaction_cdf <- ggplot(filter(total_interactions,service=="twitter"),
aes(x=interaction_per_update,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,50))+
labs(x="Interactions per Tweet",y="Percent of Users")
facebook_interaction_cdf <- ggplot(filter(total_interactions,service=="facebook"),
aes(x=interaction_per_update,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100))+
labs(x="Interactions per Facebook Post",y="Percent of Users")
linkedin_interaction_cdf <- ggplot(filter(total_interactions,service=="linkedin"),
aes(x=interaction_per_update,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
labs(x="Interactions per Linkedin Post",y="Percent of Users")
google_interaction_cdf <- ggplot(filter(total_interactions,service=="google"),
aes(x=interaction_per_update,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
labs(x="Interactions per Google+ Post",y="Percent of Users")
###############################################################
## EXTENSION UPDATES
###############################################################
## IMPORT AND TIDY DATA
healthy_extension <- read.csv("~/Downloads/extension_updates_healthy.csv",header=T)
churned_extension <- read.csv("~/Downloads/extension_updates_churned.csv",header=T)
names(healthy_extension) <- c("user_id","action","count")
names(churned_extension) <- c("user_id","action","count")
healthy_extension$user_type <- as.factor("healthy")
churned_extension$user_type <- as.factor("churned")
total_extension_use <- rbind(healthy_extension,churned_extension)
extension_by_user <- total_extension_use %>%
group_by(user_id, user_type) %>%
summarise(updates_shared = sum(count))
## CDF OF EXTENSION USAGE
extension_cdf <- ggplot(extension_by_user,aes(x=updates_shared,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,200)) +
scale_y_continuous(breaks=seq(0,1.00,0.2)) +
labs(x="Updates Shared From Extension",y="Percent of Users")
###############################################################
## NUMBER OF PROFILES CONNECTED
###############################################################
## IMPORT AND TIDY DATA
churned_profiles <- read.csv("~/Downloads/profiles_churned.csv",header=T)
healthy_profiles <- read.csv("~/Downloads/profiles_healthy.csv",header=T)
names(churned_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")
names(healthy_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")
churned_profiles$user_type <- as.factor("churned")
healthy_profiles$user_type <- as.factor("healthy")
total_profiles <- rbind(churned_profiles,healthy_profiles)
total_profiles <- total_profiles %>%
mutate(total_profiles = facebook_profiles + linkedin_profiles + pinterest_profiles + twitter_profiles)
## CDF OF PROFILES
profiles_cdf <- ggplot(total_profiles,aes(x=total_profiles,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
scale_y_continuous(breaks=seq(0,1.00,0.1)) +
labs(x="Number of Profiles",y="Percent of Users")
twitter_cdf <- ggplot(total_profiles,aes(x=twitter_profiles,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
scale_y_continuous(breaks=seq(0,1.00,0.1)) +
labs(x="Number of Twitter Profiles",y="Percent of Users")
fb_cdf <- ggplot(total_profiles,aes(x=facebook_profiles,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
scale_y_continuous(breaks=seq(0,1.00,0.1)) +
labs(x="Number of Facebook Profiles",y="Percent of Users")
linkedin_cdf <- ggplot(total_profiles,aes(x=linkedin_profiles,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
scale_y_continuous(breaks=seq(0,1.00,0.1)) +
labs(x="Number of Linkedin Profiles",y="Percent of Users")
###############################################################
## TEAMMATES
###############################################################
## IMPORT AND TIDY DATA
healthy_teammates <- read.csv("~/Downloads/healthy_teammates.csv",header=T)
churned_teammates <- read.csv("~/Downloads/churned_teammates.csv",header=T)
names(healthy_teammates) <- c("user_id","teammates")
names(churned_teammates) <- c("user_id","teammates")
healthy_teammates$user_type <- as.factor('healthy')
churned_teammates$user_type <- as.factor('churned')
total_teammates <- rbind(churned_teammates,healthy_teammates)
## MOBILE UPDATES CDF
teammates_cdf <- ggplot(total_teammates,aes(x=teammates,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,20)) +
scale_y_continuous(breaks=seq(0,1.00,0.1)) +
labs(x="Number of Teammates",y="Percent of Users")
###############################################################
## ANALYTICS ACTIONS
###############################################################
## IMPORT AND TIDY DATA
healthy_analytics <- read.csv("~/Downloads/healthy_analytics.csv",header=T)
churned_analytics <- read.csv("~/Downloads/churned_analytics.csv",header=T)
names(healthy_analytics) <- c("user_id","action_count")
names(churned_analytics) <- c("user_id","action_count")
healthy_analytics$user_type <- as.factor('healthy')
churned_analytics$user_type <- as.factor('churned')
total_analytics <- rbind(churned_analytics,healthy_analytics)
## ANALYTICS ACTIONS CDF
analytics_cdf <- ggplot(total_analytics,aes(x=action_count,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
scale_y_continuous(breaks=seq(0,1.00,0.1)) +
labs(x="Number of Analytics Actions",y="Percent of Users")
###############################################################
## MOBILE UPDATES
###############################################################
## IMPORT AND TIDY DATA
healthy_mobile <- read.csv("~/Downloads/healthy_mobile.csv",header=T)
churned_mobile <- read.csv("~/Downloads/churned_mobile.csv",header=T)
names(healthy_mobile) <- c("user_id","action","count")
names(churned_mobile) <- c("user_id","action","count")
healthy_mobile$user_type <- as.factor('healthy')
churned_mobile$user_type <- as.factor('churned')
total_mobile <- rbind(churned_mobile,healthy_mobile)
## MOBILE UPDATES CDF
mobile_cdf <- ggplot(total_mobile,aes(x=count,color=user_type)) +
stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
scale_y_continuous(breaks=seq(0,1.00,0.1)) +
labs(x="Number of Updates",y="Percent of Users")
###############################################################
## ACTION COUNT ANALYSIS
###############################################################
## IMPORT AND TIDY DATA
churn_actions <- read.csv("~/Downloads/user_actions_churned.csv",header=T)
healthy_actions <- read.csv("~/Downloads/user_actions_healthy.csv",header=T)
names(churn_actions) <- c("user_id","days_with_update","15_updates_first_week",
"changed_schedule","used_extension","used_optimal_timing",
"facebook_profiles","linkedin_profiles","pinterest_profiles",
"twitter_profiles","team_members","total_updates","action","count")
names(healthy_actions) <- c("user_id","days_with_update","15_updates_first_week",
"changed_schedule","used_extension","used_optimal_timing",
"facebook_profiles","linkedin_profiles","pinterest_profiles",
"twitter_profiles","team_members","total_updates","action","count")
churn_actions$user_type <- as.factor("churned")
healthy_actions$user_type <- as.factor("healthy")
all_actions <- rbind(churn_actions,healthy_actions)
## TARGET CERTAIN ACTIONS
target_actions <- c("dashboard loaded","dashboard profiles selected",
"dashboard schedule changed_schedule changed_time",
"dashboard schedule changed_schedule changed_day",
"dashboard schedule changed_schedule added time",
"dashboard analytics viewed basic",
"dashboard analytics viewed advanced",
"dashboard updates shared composer later",
"dashboard updates shared composer now",
"dashboard composer media added_photo",
"dashboard suggestions viewed",
"dashboard updates shared suggestions",
"dashboard updates shared composer schedule",
"dashboard viewed sent_posts",
"dashboard queue changed_update edited composer",
"dashboard queue changed_update edited inline",
"dashboard queue changed_update deleted",
"dashboard updates shared contributions",
"dashboard feeds viewed general",
"dashboard queue changed_update custom_scheduled",
"dashboard updates shared feeds",
"extension viewed","extension updates shared composer later",
"extension composer media added_photo",
"extension composer media added_link",
"share sent", "share cancelled")
filtered_actions <- all_actions %>%
filter(action %in% target_actions)
## CONVERT DATA FRAME FROM LONG TO WIDE
actions_wide <- spread(filtered_actions,action,count)
actions_wide[is.na(actions_wide)] <- 0
actions_wide <- select(actions_wide,-total_updates)
names(actions_wide) <- make.names(names(actions_wide))
###############################################################
## MACHINE LEARNING ALGORITHMS
###############################################################
library(caret); library(rattle)
inTrain <- createDataPartition(y=actions_wide$user_type,p=0.7,list=F)
training <- actions_wide[inTrain,]
testing <- actions_wide[-inTrain,]
names(training) <- make.names(names(training))
names(testing) <- make.names(names(testing))
###############################################################
## DECISION TREE
###############################################################
library(rpart)
treeFit <- rpart(user_type ~., data=select(training,-(user_id:days_with_update)), method="class")
## PLOT DECISION TREE
fancyRpartPlot(treeFit,cex=0.7)
## TEST DECISION TREE
actions_wide$tree_predictions <- predict(treeFit,actions_wide)
###############################################################
## RANDOM FORESTS
###############################################################
## FIT RANDOM FORESTS MODEL
rf_fit <- train(user_type ~., data=select(training,-(user_id:days_with_update)), method="rf", prox=T)
save(rf_fit,file="~/Google Drive/R_scripts/rf_model.Rda")
## FIT A SECOND MODEL, FOR KICKS
rf_fit2 <- randomForest(user_type~., data=select(training,-(user_id:days_with_update)))
## CHECK PREDICTIONS OF FIRST MODEL
rf_predictions <- predict(rf_fit,testing)
testing$rf_prediction_right <- rf_predictions == testing$user_type
table(rf_predictions,testing$rf_prediction_right)
## CHECK PREDICTIONS OF SECOND MODEL
rf2_predictions <- predict(rf_fit2,testing)
testing$predRight2 <- rf2_predictions == testing$user_type
table(rf2_predictions,testing$predRight2)
## SET PREDICTION COLUMN IN ORIGINAL DATA
actions_wide$rf_prediction <- predict(rf_fit,actions_wide)
## PRESENT USERS, PREDICTIONS
predicted_users <- actions_wide %>%
select(-(days_with_update:team_members)) %>%
select(-(dashboard.analytics.viewed.advanced:tree_predictions)) %>%
filter(rf_prediction == "churned" & user_type == "healthy")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment