jwinternheimer/churn_exploration.R

## churn_exploration.R
library(data.table); library(dplyr); library(tidyr)
library(ggplot2); library(scales); library(grid); library(RColorBrewer)

###############################################################
## MOST COMMON ACTIONS DURING FIRST TWO WEEKS
###############################################################

## IMPORT AND TIDY DATA
healthy_user_actions <- read.table("~/Downloads/small_user_actions.csv",header=T,sep=",")
churned_user_actions <- read.table("~/Downloads/churned_users_actions.csv",header=T,sep=",")

names(healthy_user_actions) <- c("user_id","action","count")
names(churned_user_actions) <- c("user_id","action","count")


## GROUP BY ACTION, BIND DATASETS
healthy_by_action <- healthy_user_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
  arrange(desc(count_per_user))

churned_by_action <- churned_user_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("churned"))

total_by_action <- rbind(healthy_by_action,churned_by_action)
save(total_by_action, file="~/Google Drive/total_by_action.Rda")

## TARGET ACTIONS - PLOT TOTAL ACTION COUNTS PER USER
target_actions1 <- arrange(total_by_action,desc(total_count))[0:10,]$action
target_actions2 <- arrange(total_by_action,desc(total_count))[11:20,]$action
target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


## HORIZONTAL BAR PLOT OF TOP ACTIONS (BY 10)
action_bar_plot1 <- ggplot(filter(total_by_action,action %in% target_actions1),
                          aes(x=action,y=count_per_user,fill=user_type)) +
                          geom_bar(stat="identity",position="dodge") +
                          coord_flip() + buffer_theme() +
                          labs(x="",y="Average Actions Per User")


action_bar_plot2 <- ggplot(filter(total_by_action,action %in% target_actions2),
                           aes(x=action,y=count_per_user,fill=user_type)) +
                           geom_bar(stat="identity",position="dodge") +
                           coord_flip() + buffer_theme() +
                           labs(x="",y="Average Actions Per User")


action_bar_plot3 <- ggplot(filter(total_by_action,action %in% target_actions3),
                           aes(x=action,y=count_per_user,fill=user_type)) +
  geom_bar(stat="identity",position="dodge") +
  coord_flip() + buffer_theme() +
  labs(x="",y="Average Actions Per User")


action_bar_plot4 <- ggplot(filter(total_by_action,action %in% target_actions4),
                           aes(x=action,y=count_per_user,fill=user_type)) +
  geom_bar(stat="identity",position="dodge") +
  coord_flip() + buffer_theme() +
  labs(x="",y="Average Actions Per User")


action_bar_plot5 <- ggplot(filter(total_by_action,action %in% target_actions5),
                           aes(x=action,y=count_per_user,fill=user_type)) +
  geom_bar(stat="identity",position="dodge") +
  coord_flip() + buffer_theme() +
  labs(x="",y="Average Actions Per User")


###############################################################
## UPDATES BEHAVIOR
###############################################################

## IMPORT AND TIDY DATA
churned_update_actions <- read.table("~/Downloads/churned_users_updates.csv",sep=",",header=T)
healthy_update_actions <- read.table("~/Downloads/healthy_users_updates.csv",sep=",",header=T)

names(churned_update_actions) <- c("user_id","action","count")
names(healthy_update_actions) <- c("user_id","action","count")

churned_update_actions$user_type <- as.factor("churned")
healthy_update_actions$user_type <- as.factor("healthy")

total_update_actions <- rbind(churned_update_actions,healthy_update_actions)


## GROUP BY ACTION, BIND DATASETS
healthy_updates_by_action <- healthy_update_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
  arrange(desc(count_per_user))


churned_updates_by_action <- churned_update_actions %>%
  group_by(action) %>%
  summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
  mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
  arrange(desc(count_per_user))

total_updates_by_action <- rbind(healthy_updates_by_action,churned_updates_by_action)
save(total_updates_by_action, file="~/Google Drive/total_updates_by_action.Rda")


## TARGET ACTIONS - FIND MOST COMMON ACTIONS
target_update_actions1 <- arrange(total_updates_by_action,desc(total_count))[0:1,]$action
target_update_actions2 <- arrange(total_updates_by_action,desc(total_count))[2,]$action
target_update_actions3 <- arrange(total_updates_by_action,desc(total_count))[3,]$action
target_update_actions4 <- arrange(total_updates_by_action,desc(total_count))[4,]$action
target_update_actions5 <- arrange(total_updates_by_action,desc(total_count))[5,]$action
target_actions2 <- arrange(total_updates_by_action,desc(total_count))[11:20,]$action
target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


## FIND ACTIONS WITH BIGGEST DIFFERENCES IN COUNTS

## CDF OF ACTIONS COUNT
updates_cdf <- ggplot(filter(total_update_actions,action %in% target_update_actions1),
                           aes(x=count,color=user_type)) +
                           stat_ecdf(size=1) + coord_cartesian(xlim=c(0,250)) +
                           labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Dashboard") +
                           scale_y_continuous(breaks=seq(0,1.00,0.2))


updates_cdf2 <- ggplot(filter(total_update_actions,action %in% target_update_actions2),
                      aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
  labs(x="Number of Updates",y="Percent of Users",title="Suggestions Shared in First 2 Weeks") +
  scale_y_continuous(breaks=seq(0,1.00,0.2))


updates_cdf4 <- ggplot(filter(total_update_actions,action %in% target_update_actions4),
                       aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
  labs(x="Number of Updates",y="Percent of Users",title="Feeds Shared in First 2 Weeks") +
  scale_y_continuous(breaks=seq(0,1.00,0.2))


updates_cdf5 <- ggplot(filter(total_update_actions,action %in% target_update_actions5),
                       aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
  labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Composer in First 2 Weeks") +
  scale_y_continuous(breaks=seq(0,1.00,0.2))

###############################################################
## INTERACTIONS PER UPDATE
###############################################################

## IMPORT AND TIDY DATA
churned_interactions <- read.table("~/Downloads/churned_user_interaction.csv",sep=",",header=T)
healthy_interactions <- read.table("~/Downloads/healthy_user_interaction.csv",sep=",",header=T)

names(churned_interactions) <- c("user_id","service","interactions","updates")
names(healthy_interactions) <- c("user_id","service","interactions","updates")

churned_interactions$user_type <- as.factor("churned")
healthy_interactions$user_type <- as.factor("healthy")

total_interactions <- rbind(churned_interactions,healthy_interactions)
total_interactions <- total_interactions %>%
  mutate(interaction_per_update = interactions/updates)


## CDFS OF INTERACTIONS
twitter_interaction_cdf <- ggplot(filter(total_interactions,service=="twitter"),
                                  aes(x=interaction_per_update,color=user_type)) +
                                  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,50))+
  labs(x="Interactions per Tweet",y="Percent of Users")


facebook_interaction_cdf <- ggplot(filter(total_interactions,service=="facebook"),
                                  aes(x=interaction_per_update,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100))+
  labs(x="Interactions per Facebook Post",y="Percent of Users")


linkedin_interaction_cdf <- ggplot(filter(total_interactions,service=="linkedin"),
                                  aes(x=interaction_per_update,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
  labs(x="Interactions per Linkedin Post",y="Percent of Users")


google_interaction_cdf <- ggplot(filter(total_interactions,service=="google"),
                                  aes(x=interaction_per_update,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
  labs(x="Interactions per Google+ Post",y="Percent of Users")


###############################################################
## EXTENSION UPDATES
###############################################################

## IMPORT AND TIDY DATA
healthy_extension <- read.csv("~/Downloads/extension_updates_healthy.csv",header=T)
churned_extension <- read.csv("~/Downloads/extension_updates_churned.csv",header=T)

names(healthy_extension) <- c("user_id","action","count")
names(churned_extension) <- c("user_id","action","count")

healthy_extension$user_type <- as.factor("healthy")
churned_extension$user_type <- as.factor("churned")

total_extension_use <- rbind(healthy_extension,churned_extension)

extension_by_user <- total_extension_use %>%
  group_by(user_id, user_type) %>%
  summarise(updates_shared = sum(count))


## CDF OF EXTENSION USAGE

extension_cdf <- ggplot(extension_by_user,aes(x=updates_shared,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,200)) +
  scale_y_continuous(breaks=seq(0,1.00,0.2)) +
  labs(x="Updates Shared From Extension",y="Percent of Users")


###############################################################
## NUMBER OF PROFILES CONNECTED
###############################################################


## IMPORT AND TIDY DATA
churned_profiles <- read.csv("~/Downloads/profiles_churned.csv",header=T)
healthy_profiles <- read.csv("~/Downloads/profiles_healthy.csv",header=T)

names(churned_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")
names(healthy_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")

churned_profiles$user_type <- as.factor("churned")
healthy_profiles$user_type <- as.factor("healthy")

total_profiles <- rbind(churned_profiles,healthy_profiles)
total_profiles <- total_profiles %>%
  mutate(total_profiles = facebook_profiles + linkedin_profiles + pinterest_profiles + twitter_profiles)


## CDF OF PROFILES
profiles_cdf <- ggplot(total_profiles,aes(x=total_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Profiles",y="Percent of Users")


twitter_cdf <- ggplot(total_profiles,aes(x=twitter_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Twitter Profiles",y="Percent of Users")


fb_cdf <- ggplot(total_profiles,aes(x=facebook_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Facebook Profiles",y="Percent of Users")


linkedin_cdf <- ggplot(total_profiles,aes(x=linkedin_profiles,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Linkedin Profiles",y="Percent of Users")


###############################################################
## TEAMMATES
###############################################################

## IMPORT AND TIDY DATA
healthy_teammates <- read.csv("~/Downloads/healthy_teammates.csv",header=T)
churned_teammates <- read.csv("~/Downloads/churned_teammates.csv",header=T)

names(healthy_teammates) <- c("user_id","teammates")
names(churned_teammates) <- c("user_id","teammates")

healthy_teammates$user_type <- as.factor('healthy')
churned_teammates$user_type <- as.factor('churned')

total_teammates <- rbind(churned_teammates,healthy_teammates)


## MOBILE UPDATES CDF
teammates_cdf <- ggplot(total_teammates,aes(x=teammates,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,20)) +
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Teammates",y="Percent of Users")


###############################################################
## ANALYTICS ACTIONS
###############################################################

## IMPORT AND TIDY DATA
healthy_analytics <- read.csv("~/Downloads/healthy_analytics.csv",header=T)
churned_analytics <- read.csv("~/Downloads/churned_analytics.csv",header=T)

names(healthy_analytics) <- c("user_id","action_count")
names(churned_analytics) <- c("user_id","action_count")

healthy_analytics$user_type <- as.factor('healthy')
churned_analytics$user_type <- as.factor('churned')

total_analytics <- rbind(churned_analytics,healthy_analytics)


## ANALYTICS ACTIONS CDF
analytics_cdf <- ggplot(total_analytics,aes(x=action_count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Analytics Actions",y="Percent of Users")


###############################################################
## MOBILE UPDATES
###############################################################

## IMPORT AND TIDY DATA
healthy_mobile <- read.csv("~/Downloads/healthy_mobile.csv",header=T)
churned_mobile <- read.csv("~/Downloads/churned_mobile.csv",header=T)

names(healthy_mobile) <- c("user_id","action","count")
names(churned_mobile) <- c("user_id","action","count")

healthy_mobile$user_type <- as.factor('healthy')
churned_mobile$user_type <- as.factor('churned')

total_mobile <- rbind(churned_mobile,healthy_mobile)


## MOBILE UPDATES CDF
mobile_cdf <- ggplot(total_mobile,aes(x=count,color=user_type)) +
  stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
  scale_y_continuous(breaks=seq(0,1.00,0.1)) +
  labs(x="Number of Updates",y="Percent of Users")


###############################################################
## ACTION COUNT ANALYSIS
###############################################################

## IMPORT AND TIDY DATA
churn_actions <- read.csv("~/Downloads/user_actions_churned.csv",header=T)
healthy_actions <- read.csv("~/Downloads/user_actions_healthy.csv",header=T)

names(churn_actions) <- c("user_id","days_with_update","15_updates_first_week",
                            "changed_schedule","used_extension","used_optimal_timing",
                            "facebook_profiles","linkedin_profiles","pinterest_profiles",
                            "twitter_profiles","team_members","total_updates","action","count")

names(healthy_actions) <- c("user_id","days_with_update","15_updates_first_week",
                          "changed_schedule","used_extension","used_optimal_timing",
                          "facebook_profiles","linkedin_profiles","pinterest_profiles",
                          "twitter_profiles","team_members","total_updates","action","count")

churn_actions$user_type <- as.factor("churned")
healthy_actions$user_type <- as.factor("healthy")

all_actions <- rbind(churn_actions,healthy_actions)


## TARGET CERTAIN ACTIONS
target_actions <- c("dashboard loaded","dashboard profiles selected",
                    "dashboard schedule changed_schedule changed_time",
                    "dashboard schedule changed_schedule changed_day",
                    "dashboard schedule changed_schedule added time",
                    "dashboard analytics viewed basic",
                    "dashboard analytics viewed advanced",
                    "dashboard updates shared composer later",
                    "dashboard updates shared composer now",
                    "dashboard composer media added_photo",
                    "dashboard suggestions viewed",
                    "dashboard updates shared suggestions",
                    "dashboard updates shared composer schedule",
                    "dashboard viewed sent_posts",
                    "dashboard queue changed_update edited composer",
                    "dashboard queue changed_update edited inline",
                    "dashboard queue changed_update deleted",
                    "dashboard updates shared contributions",
                    "dashboard feeds viewed general",
                    "dashboard queue changed_update custom_scheduled",
                    "dashboard updates shared feeds",
                    "extension viewed","extension updates shared composer later",
                    "extension composer media added_photo",
                    "extension composer media added_link",
                    "share sent", "share cancelled")

filtered_actions <- all_actions %>%
  filter(action %in% target_actions)

## CONVERT DATA FRAME FROM LONG TO WIDE
actions_wide <- spread(filtered_actions,action,count)
actions_wide[is.na(actions_wide)] <- 0
actions_wide <- select(actions_wide,-total_updates)
names(actions_wide) <- make.names(names(actions_wide))

###############################################################
## MACHINE LEARNING ALGORITHMS
###############################################################
library(caret); library(rattle)

inTrain <- createDataPartition(y=actions_wide$user_type,p=0.7,list=F)
training <- actions_wide[inTrain,]
testing <- actions_wide[-inTrain,]

names(training) <- make.names(names(training))
names(testing) <- make.names(names(testing))


###############################################################
## DECISION TREE
###############################################################
library(rpart)
treeFit <- rpart(user_type ~., data=select(training,-(user_id:days_with_update)), method="class")

## PLOT DECISION TREE
fancyRpartPlot(treeFit,cex=0.7)

## TEST DECISION TREE
actions_wide$tree_predictions <- predict(treeFit,actions_wide)


###############################################################
## RANDOM FORESTS
###############################################################

## FIT RANDOM FORESTS MODEL
rf_fit <- train(user_type ~., data=select(training,-(user_id:days_with_update)), method="rf", prox=T)
save(rf_fit,file="~/Google Drive/R_scripts/rf_model.Rda")

## FIT A SECOND MODEL, FOR KICKS
rf_fit2 <- randomForest(user_type~., data=select(training,-(user_id:days_with_update)))

## CHECK PREDICTIONS OF FIRST MODEL
rf_predictions <- predict(rf_fit,testing)
testing$rf_prediction_right <- rf_predictions == testing$user_type
table(rf_predictions,testing$rf_prediction_right)

## CHECK PREDICTIONS OF SECOND MODEL
rf2_predictions <- predict(rf_fit2,testing)
testing$predRight2 <- rf2_predictions == testing$user_type
table(rf2_predictions,testing$predRight2)

## SET PREDICTION COLUMN IN ORIGINAL DATA
actions_wide$rf_prediction <- predict(rf_fit,actions_wide)

## PRESENT USERS, PREDICTIONS
predicted_users <- actions_wide %>%
  select(-(days_with_update:team_members)) %>%
  select(-(dashboard.analytics.viewed.advanced:tree_predictions)) %>%
  filter(rf_prediction == "churned"  & user_type == "healthy")
	library(data.table); library(dplyr); library(tidyr)
	library(ggplot2); library(scales); library(grid); library(RColorBrewer)

	###############################################################
	## MOST COMMON ACTIONS DURING FIRST TWO WEEKS
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_user_actions <- read.table("~/Downloads/small_user_actions.csv",header=T,sep=",")
	churned_user_actions <- read.table("~/Downloads/churned_users_actions.csv",header=T,sep=",")

	names(healthy_user_actions) <- c("user_id","action","count")
	names(churned_user_actions) <- c("user_id","action","count")


	## GROUP BY ACTION, BIND DATASETS
	healthy_by_action <- healthy_user_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
	arrange(desc(count_per_user))

	churned_by_action <- churned_user_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("churned"))

	total_by_action <- rbind(healthy_by_action,churned_by_action)
	save(total_by_action, file="~/Google Drive/total_by_action.Rda")

	## TARGET ACTIONS - PLOT TOTAL ACTION COUNTS PER USER
	target_actions1 <- arrange(total_by_action,desc(total_count))[0:10,]$action
	target_actions2 <- arrange(total_by_action,desc(total_count))[11:20,]$action
	target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
	target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
	target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


	## HORIZONTAL BAR PLOT OF TOP ACTIONS (BY 10)
	action_bar_plot1 <- ggplot(filter(total_by_action,action %in% target_actions1),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")


	action_bar_plot2 <- ggplot(filter(total_by_action,action %in% target_actions2),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	action_bar_plot3 <- ggplot(filter(total_by_action,action %in% target_actions3),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	action_bar_plot4 <- ggplot(filter(total_by_action,action %in% target_actions4),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	action_bar_plot5 <- ggplot(filter(total_by_action,action %in% target_actions5),
	aes(x=action,y=count_per_user,fill=user_type)) +
	geom_bar(stat="identity",position="dodge") +
	coord_flip() + buffer_theme() +
	labs(x="",y="Average Actions Per User")



	###############################################################
	## UPDATES BEHAVIOR
	###############################################################

	## IMPORT AND TIDY DATA
	churned_update_actions <- read.table("~/Downloads/churned_users_updates.csv",sep=",",header=T)
	healthy_update_actions <- read.table("~/Downloads/healthy_users_updates.csv",sep=",",header=T)

	names(churned_update_actions) <- c("user_id","action","count")
	names(healthy_update_actions) <- c("user_id","action","count")

	churned_update_actions$user_type <- as.factor("churned")
	healthy_update_actions$user_type <- as.factor("healthy")

	total_update_actions <- rbind(churned_update_actions,healthy_update_actions)


	## GROUP BY ACTION, BIND DATASETS
	healthy_updates_by_action <- healthy_update_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
	arrange(desc(count_per_user))


	churned_updates_by_action <- churned_update_actions %>%
	group_by(action) %>%
	summarise(users=n_distinct(user_id),total_count = sum(count)) %>%
	mutate(count_per_user = total_count/users, user_type=as.factor("healthy")) %>%
	arrange(desc(count_per_user))

	total_updates_by_action <- rbind(healthy_updates_by_action,churned_updates_by_action)
	save(total_updates_by_action, file="~/Google Drive/total_updates_by_action.Rda")


	## TARGET ACTIONS - FIND MOST COMMON ACTIONS
	target_update_actions1 <- arrange(total_updates_by_action,desc(total_count))[0:1,]$action
	target_update_actions2 <- arrange(total_updates_by_action,desc(total_count))[2,]$action
	target_update_actions3 <- arrange(total_updates_by_action,desc(total_count))[3,]$action
	target_update_actions4 <- arrange(total_updates_by_action,desc(total_count))[4,]$action
	target_update_actions5 <- arrange(total_updates_by_action,desc(total_count))[5,]$action
	target_actions2 <- arrange(total_updates_by_action,desc(total_count))[11:20,]$action
	target_actions3 <- arrange(total_by_action,desc(total_count))[21:30,]$action
	target_actions4 <- arrange(total_by_action,desc(total_count))[31:40,]$action
	target_actions5 <- arrange(total_by_action,desc(total_count))[41:50,]$action


	## FIND ACTIONS WITH BIGGEST DIFFERENCES IN COUNTS

	## CDF OF ACTIONS COUNT
	updates_cdf <- ggplot(filter(total_update_actions,action %in% target_update_actions1),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,250)) +
	labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Dashboard") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))


	updates_cdf2 <- ggplot(filter(total_update_actions,action %in% target_update_actions2),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
	labs(x="Number of Updates",y="Percent of Users",title="Suggestions Shared in First 2 Weeks") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))


	updates_cdf4 <- ggplot(filter(total_update_actions,action %in% target_update_actions4),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
	labs(x="Number of Updates",y="Percent of Users",title="Feeds Shared in First 2 Weeks") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))


	updates_cdf5 <- ggplot(filter(total_update_actions,action %in% target_update_actions5),
	aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,150)) +
	labs(x="Number of Updates",y="Percent of Users",title="Updates Scheduled From Composer in First 2 Weeks") +
	scale_y_continuous(breaks=seq(0,1.00,0.2))

	###############################################################
	## INTERACTIONS PER UPDATE
	###############################################################

	## IMPORT AND TIDY DATA
	churned_interactions <- read.table("~/Downloads/churned_user_interaction.csv",sep=",",header=T)
	healthy_interactions <- read.table("~/Downloads/healthy_user_interaction.csv",sep=",",header=T)

	names(churned_interactions) <- c("user_id","service","interactions","updates")
	names(healthy_interactions) <- c("user_id","service","interactions","updates")

	churned_interactions$user_type <- as.factor("churned")
	healthy_interactions$user_type <- as.factor("healthy")

	total_interactions <- rbind(churned_interactions,healthy_interactions)
	total_interactions <- total_interactions %>%
	mutate(interaction_per_update = interactions/updates)


	## CDFS OF INTERACTIONS
	twitter_interaction_cdf <- ggplot(filter(total_interactions,service=="twitter"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,50))+
	labs(x="Interactions per Tweet",y="Percent of Users")


	facebook_interaction_cdf <- ggplot(filter(total_interactions,service=="facebook"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100))+
	labs(x="Interactions per Facebook Post",y="Percent of Users")


	linkedin_interaction_cdf <- ggplot(filter(total_interactions,service=="linkedin"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
	labs(x="Interactions per Linkedin Post",y="Percent of Users")


	google_interaction_cdf <- ggplot(filter(total_interactions,service=="google"),
	aes(x=interaction_per_update,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,10))+
	labs(x="Interactions per Google+ Post",y="Percent of Users")



	###############################################################
	## EXTENSION UPDATES
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_extension <- read.csv("~/Downloads/extension_updates_healthy.csv",header=T)
	churned_extension <- read.csv("~/Downloads/extension_updates_churned.csv",header=T)

	names(healthy_extension) <- c("user_id","action","count")
	names(churned_extension) <- c("user_id","action","count")

	healthy_extension$user_type <- as.factor("healthy")
	churned_extension$user_type <- as.factor("churned")

	total_extension_use <- rbind(healthy_extension,churned_extension)

	extension_by_user <- total_extension_use %>%
	group_by(user_id, user_type) %>%
	summarise(updates_shared = sum(count))


	## CDF OF EXTENSION USAGE

	extension_cdf <- ggplot(extension_by_user,aes(x=updates_shared,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,200)) +
	scale_y_continuous(breaks=seq(0,1.00,0.2)) +
	labs(x="Updates Shared From Extension",y="Percent of Users")


	###############################################################
	## NUMBER OF PROFILES CONNECTED
	###############################################################


	## IMPORT AND TIDY DATA
	churned_profiles <- read.csv("~/Downloads/profiles_churned.csv",header=T)
	healthy_profiles <- read.csv("~/Downloads/profiles_healthy.csv",header=T)

	names(churned_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")
	names(healthy_profiles) <- c("user_id","facebook_profiles","linkedin_profiles","pinterest_profiles","twitter_profiles")

	churned_profiles$user_type <- as.factor("churned")
	healthy_profiles$user_type <- as.factor("healthy")

	total_profiles <- rbind(churned_profiles,healthy_profiles)
	total_profiles <- total_profiles %>%
	mutate(total_profiles = facebook_profiles + linkedin_profiles + pinterest_profiles + twitter_profiles)


	## CDF OF PROFILES
	profiles_cdf <- ggplot(total_profiles,aes(x=total_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Profiles",y="Percent of Users")


	twitter_cdf <- ggplot(total_profiles,aes(x=twitter_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Twitter Profiles",y="Percent of Users")


	fb_cdf <- ggplot(total_profiles,aes(x=facebook_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Facebook Profiles",y="Percent of Users")


	linkedin_cdf <- ggplot(total_profiles,aes(x=linkedin_profiles,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,25)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Linkedin Profiles",y="Percent of Users")


	###############################################################
	## TEAMMATES
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_teammates <- read.csv("~/Downloads/healthy_teammates.csv",header=T)
	churned_teammates <- read.csv("~/Downloads/churned_teammates.csv",header=T)

	names(healthy_teammates) <- c("user_id","teammates")
	names(churned_teammates) <- c("user_id","teammates")

	healthy_teammates$user_type <- as.factor('healthy')
	churned_teammates$user_type <- as.factor('churned')

	total_teammates <- rbind(churned_teammates,healthy_teammates)


	## MOBILE UPDATES CDF
	teammates_cdf <- ggplot(total_teammates,aes(x=teammates,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,20)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Teammates",y="Percent of Users")


	###############################################################
	## ANALYTICS ACTIONS
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_analytics <- read.csv("~/Downloads/healthy_analytics.csv",header=T)
	churned_analytics <- read.csv("~/Downloads/churned_analytics.csv",header=T)

	names(healthy_analytics) <- c("user_id","action_count")
	names(churned_analytics) <- c("user_id","action_count")

	healthy_analytics$user_type <- as.factor('healthy')
	churned_analytics$user_type <- as.factor('churned')

	total_analytics <- rbind(churned_analytics,healthy_analytics)


	## ANALYTICS ACTIONS CDF
	analytics_cdf <- ggplot(total_analytics,aes(x=action_count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Analytics Actions",y="Percent of Users")


	###############################################################
	## MOBILE UPDATES
	###############################################################

	## IMPORT AND TIDY DATA
	healthy_mobile <- read.csv("~/Downloads/healthy_mobile.csv",header=T)
	churned_mobile <- read.csv("~/Downloads/churned_mobile.csv",header=T)

	names(healthy_mobile) <- c("user_id","action","count")
	names(churned_mobile) <- c("user_id","action","count")

	healthy_mobile$user_type <- as.factor('healthy')
	churned_mobile$user_type <- as.factor('churned')

	total_mobile <- rbind(churned_mobile,healthy_mobile)


	## MOBILE UPDATES CDF
	mobile_cdf <- ggplot(total_mobile,aes(x=count,color=user_type)) +
	stat_ecdf(size=1) + coord_cartesian(xlim=c(0,100)) +
	scale_y_continuous(breaks=seq(0,1.00,0.1)) +
	labs(x="Number of Updates",y="Percent of Users")




	###############################################################
	## ACTION COUNT ANALYSIS
	###############################################################

	## IMPORT AND TIDY DATA
	churn_actions <- read.csv("~/Downloads/user_actions_churned.csv",header=T)
	healthy_actions <- read.csv("~/Downloads/user_actions_healthy.csv",header=T)

	names(churn_actions) <- c("user_id","days_with_update","15_updates_first_week",
	"changed_schedule","used_extension","used_optimal_timing",
	"facebook_profiles","linkedin_profiles","pinterest_profiles",
	"twitter_profiles","team_members","total_updates","action","count")

	names(healthy_actions) <- c("user_id","days_with_update","15_updates_first_week",
	"changed_schedule","used_extension","used_optimal_timing",
	"facebook_profiles","linkedin_profiles","pinterest_profiles",
	"twitter_profiles","team_members","total_updates","action","count")

	churn_actions$user_type <- as.factor("churned")
	healthy_actions$user_type <- as.factor("healthy")

	all_actions <- rbind(churn_actions,healthy_actions)


	## TARGET CERTAIN ACTIONS
	target_actions <- c("dashboard loaded","dashboard profiles selected",
	"dashboard schedule changed_schedule changed_time",
	"dashboard schedule changed_schedule changed_day",
	"dashboard schedule changed_schedule added time",
	"dashboard analytics viewed basic",
	"dashboard analytics viewed advanced",
	"dashboard updates shared composer later",
	"dashboard updates shared composer now",
	"dashboard composer media added_photo",
	"dashboard suggestions viewed",
	"dashboard updates shared suggestions",
	"dashboard updates shared composer schedule",
	"dashboard viewed sent_posts",
	"dashboard queue changed_update edited composer",
	"dashboard queue changed_update edited inline",
	"dashboard queue changed_update deleted",
	"dashboard updates shared contributions",
	"dashboard feeds viewed general",
	"dashboard queue changed_update custom_scheduled",
	"dashboard updates shared feeds",
	"extension viewed","extension updates shared composer later",
	"extension composer media added_photo",
	"extension composer media added_link",
	"share sent", "share cancelled")

	filtered_actions <- all_actions %>%
	filter(action %in% target_actions)

	## CONVERT DATA FRAME FROM LONG TO WIDE
	actions_wide <- spread(filtered_actions,action,count)
	actions_wide[is.na(actions_wide)] <- 0
	actions_wide <- select(actions_wide,-total_updates)
	names(actions_wide) <- make.names(names(actions_wide))

	###############################################################
	## MACHINE LEARNING ALGORITHMS
	###############################################################
	library(caret); library(rattle)

	inTrain <- createDataPartition(y=actions_wide$user_type,p=0.7,list=F)
	training <- actions_wide[inTrain,]
	testing <- actions_wide[-inTrain,]

	names(training) <- make.names(names(training))
	names(testing) <- make.names(names(testing))


	###############################################################
	## DECISION TREE
	###############################################################
	library(rpart)
	treeFit <- rpart(user_type ~., data=select(training,-(user_id:days_with_update)), method="class")

	## PLOT DECISION TREE
	fancyRpartPlot(treeFit,cex=0.7)

	## TEST DECISION TREE
	actions_wide$tree_predictions <- predict(treeFit,actions_wide)


	###############################################################
	## RANDOM FORESTS
	###############################################################

	## FIT RANDOM FORESTS MODEL
	rf_fit <- train(user_type ~., data=select(training,-(user_id:days_with_update)), method="rf", prox=T)
	save(rf_fit,file="~/Google Drive/R_scripts/rf_model.Rda")

	## FIT A SECOND MODEL, FOR KICKS
	rf_fit2 <- randomForest(user_type~., data=select(training,-(user_id:days_with_update)))

	## CHECK PREDICTIONS OF FIRST MODEL
	rf_predictions <- predict(rf_fit,testing)
	testing$rf_prediction_right <- rf_predictions == testing$user_type
	table(rf_predictions,testing$rf_prediction_right)

	## CHECK PREDICTIONS OF SECOND MODEL
	rf2_predictions <- predict(rf_fit2,testing)
	testing$predRight2 <- rf2_predictions == testing$user_type
	table(rf2_predictions,testing$predRight2)

	## SET PREDICTION COLUMN IN ORIGINAL DATA
	actions_wide$rf_prediction <- predict(rf_fit,actions_wide)

	## PRESENT USERS, PREDICTIONS
	predicted_users <- actions_wide %>%
	select(-(days_with_update:team_members)) %>%
	select(-(dashboard.analytics.viewed.advanced:tree_predictions)) %>%
	filter(rf_prediction == "churned" & user_type == "healthy")