-
-
Save ocoyawale/a400261e03117df3b8fb4b7907831849 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#############################Telecom Solution################### | |
################################################################ | |
#Business Understanding | |
#Data Understanding | |
#Data Preparation & EDA | |
#Model Building | |
#Model Evaluation | |
################################################################ | |
### Business Understanding: | |
# Based on the past and current customer information, | |
# the company has maintained a database containing personal/demographic information, | |
# the services availed by a customer and the expense information related to each customer. | |
## AIM: | |
# The aim is to automate the process of predicting | |
# if a customer would telecom or not and to find the factors affecting the telecom. | |
# Whether a customer will telecom or not will depend on data from the following three buckets: | |
# 1. Demographic Information | |
# 2. Services Availed by the customer | |
# 3. Overall Expenses | |
################################################################ | |
### Data Understanding | |
# Install and Load the required packages | |
#install.packages("MASS") | |
#install.packages("car") | |
#install.packages("e1071") | |
#install.packages("caret", dependencies = c("Depends", "Suggests")) | |
#install.packages("cowplot") | |
#install.packages("GGally") | |
#install.packages("caTools") | |
library(MASS) | |
library(car) | |
library(e1071) | |
library(caret) | |
library(ggplot2) | |
library(cowplot) | |
library(caTools) | |
# Loading 3 files | |
churn_data<- read.csv("churn_data.csv", stringsAsFactors = F) | |
customer_data<- read.csv("customer_data.csv", stringsAsFactors = F) | |
internet_data<- read.csv("internet_data.csv", stringsAsFactors = F) | |
str(churn_data) # 7043 obs of 9 variables including the target variable | |
str(customer_data) # 7043 obs of 5 variables | |
str(internet_data) # 7043 obs of 9 variables | |
# Collate the data together in one single file | |
length(unique(tolower(churn_data$customerID))) # 7043, confirming customerID is key | |
length(unique(tolower(customer_data$customerID))) # 7043, confirming customerID is key | |
length(unique(tolower(internet_data$customerID))) # 7043, confirming customerID is key | |
setdiff(churn_data$customerID,customer_data$customerID) # Identical customerID across these datasets | |
setdiff(churn_data$customerID,internet_data$customerID) # Identical customerID across these datasets | |
telecom<- merge(churn_data,customer_data, by="customerID",all = F) | |
telecom<- merge(telecom,internet_data, by="customerID", all = F) | |
View(telecom) #master file | |
################################################################ | |
### Data Preparation & Exploratory Data Analysis | |
# Understanding the structure of the collated file | |
str(telecom) #7043 obs. of 21 variables; | |
# tenure, MonthlyCharges, TotalCharges are continuous | |
# SeniorCitizen need to be changed from integer to categorical | |
# Barcharts for categorical features with stacked telecom information | |
bar_theme1<- theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), | |
legend.position="none") | |
plot_grid(ggplot(telecom, aes(x=PhoneService,fill=Churn))+ geom_bar(), | |
ggplot(telecom, aes(x=Contract,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=PaperlessBilling,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=PaymentMethod,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=gender,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=factor(SeniorCitizen),fill=Churn))+ geom_bar() +bar_theme1, | |
align = "h") | |
plot_grid(ggplot(telecom, aes(x=Partner,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=Dependents,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=MultipleLines,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=InternetService,fill=Churn))+ geom_bar()+bar_theme1, | |
align = "h") | |
plot_grid(ggplot(telecom, aes(x=OnlineSecurity,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=OnlineBackup,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=DeviceProtection,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=TechSupport,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=StreamingTV,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=StreamingMovies,fill=Churn))+ geom_bar()+bar_theme1, | |
align = "h") | |
#reveals strong contrast for telecom wrt Contract,InternetServices, (not availing) OnlineSecurity and Techsupport and PaymentMethod, | |
#moderate wrt whether SeniorCitizen, having partner, OnlineBackup and DeviceProtection | |
# Histogram and Boxplots for numeric variables | |
box_theme<- theme(axis.line=element_blank(),axis.title=element_blank(), | |
axis.ticks=element_blank(), axis.text=element_blank()) | |
box_theme_y<- theme(axis.line.y=element_blank(),axis.title.y=element_blank(), | |
axis.ticks.y=element_blank(), axis.text.y=element_blank(), | |
legend.position="none") | |
plot_grid(ggplot(telecom, aes(tenure))+ geom_histogram(binwidth = 10), | |
ggplot(telecom, aes(x="",y=tenure))+ geom_boxplot(width=0.1)+coord_flip()+box_theme, | |
align = "v",ncol = 1) | |
plot_grid(ggplot(telecom, aes(MonthlyCharges))+ geom_histogram(binwidth = 20), | |
ggplot(telecom, aes(x="",y=MonthlyCharges))+ geom_boxplot(width=0.1)+coord_flip()+box_theme, | |
align = "v",ncol = 1) | |
plot_grid(ggplot(telecom, aes(TotalCharges))+ geom_histogram(), | |
ggplot(telecom, aes(x="",y=TotalCharges))+ geom_boxplot(width=0.1)+coord_flip()+box_theme, | |
align = "v",ncol = 1) | |
#No outliers in numeric variables | |
# Boxplots of numeric variables relative to telecom status | |
plot_grid(ggplot(telecom, aes(x=Churn,y=tenure, fill=Churn))+ geom_boxplot(width=0.2)+ | |
coord_flip() +theme(legend.position="none"), | |
ggplot(telecom, aes(x=Churn,y=MonthlyCharges, fill=Churn))+ geom_boxplot(width=0.2)+ | |
coord_flip() + box_theme_y, | |
ggplot(telecom, aes(x=Churn,y=TotalCharges, fill=Churn))+ geom_boxplot(width=0.2)+ | |
coord_flip() + box_theme_y, | |
align = "v",nrow = 1) | |
# Shorter tenure sees more telecom | |
# Correlation between numeric variables | |
library(GGally) | |
ggpairs(telecom[, c("tenure", "MonthlyCharges", "TotalCharges")]) | |
#As expected, tenure and TotalCharges are highly correlated (corr 0.83) | |
################################################################ | |
### Data Preparation | |
# De-Duplication | |
# not needed | |
# Bringing the variables in the correct format | |
telecom$SeniorCitizen<- ifelse(telecom$SeniorCitizen==1,"Yes","No") | |
# Outlier treatment and imputing missing value | |
#Boxplot showed no outlier, Nevertheless confirming it also with percentiles | |
sapply(telecom[,c("tenure","MonthlyCharges","TotalCharges")], | |
function(x) quantile(x,seq(0,1,.01),na.rm = T)) #no outlier | |
# Missing value | |
sapply(telecom, function(x) sum(is.na(x))) # shows all 11 NAs are in Totalcharges column | |
View(subset(telecom, is.na(TotalCharges))) # All 11 NAs in TotalCharges correspond to tenure 0 | |
View(subset(telecom, telecom$tenure==0)) #vice-versa of above is also true | |
# It means that 11/7043 = 0.001561834 i.e 0.1%, best is to remove these observations from the analysis | |
telecom <- telecom[!is.na(telecom$TotalCharges),] | |
################################################################ | |
# Feature standardisation | |
# Normalising continuous features | |
telecom$tenure<- scale(telecom$tenure) # scale used: mean 32.4, sd 24.6 | |
summary(telecom$tenure) | |
sd(telecom$tenure) | |
telecom$MonthlyCharges<- scale(telecom$MonthlyCharges) # scale used: mean 64.8, sd 30.1 | |
telecom$TotalCharges<- scale(telecom$TotalCharges) # scale used: mean 2280, sd 2267 | |
# converting target variable telecom from No/Yes character to factorwith levels 0/1 | |
telecom$Churn<- ifelse(telecom$Churn=="Yes",1,0) | |
# Checking churn rate of prospect customer | |
Churn <- sum(telecom$Churn)/nrow(telecom) | |
Churn # 26.57% churn rate. | |
# creating a dataframe of categorical features | |
telecom_chr<- telecom[,-c(1,2,7,8,9)] | |
str(telecom_chr) | |
# converting categorical attributes to factor | |
telecom_fact<- data.frame(sapply(telecom_chr, function(x) factor(x))) | |
str(telecom_fact) | |
# creating dummy variables for factor attributes | |
dummies<- data.frame(sapply(telecom_fact, | |
function(x) data.frame(model.matrix(~x-1,data =telecom_fact))[,-1])) | |
check <- data.frame(model.matrix(~telecom_fact$InternetService-1,data =telecom_fact)) | |
# For variables having only two levels, verified PhoneService, PaperlessBilling, Partner, Dependents "yes" is 1, | |
#gender "male" is 1 and SeniorCitizen "1" is 1 | |
# Final dataset | |
telecom_final<- cbind(telecom[,c(9,2,7,8)],dummies) | |
View(telecom_final) #7032 obs. of 31 variables | |
######################################################################## | |
# splitting the data between train and test | |
set.seed(100) | |
indices = sample.split(telecom_final$Churn, SplitRatio = 0.7) | |
train = telecom_final[indices,] | |
test = telecom_final[!(indices),] | |
######################################################################## | |
# Logistic Regression: | |
#Initial model | |
model_1 = glm(Churn ~ ., data = train, family = "binomial") | |
summary(model_1) #AIC 4150.1....31 coeff..nullDev 5699.5...resDev 4102.1 | |
# Stepwise selection | |
library("MASS") | |
model_2<- stepAIC(model_1, direction="both") | |
summary(model_2) | |
# Removing multicollinearity through VIF check | |
library(car) | |
vif(model_2) | |
#Excluding TotalCharges | |
model_3<- glm(formula = Churn ~ tenure + MonthlyCharges + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
OnlineBackup.xYes + DeviceProtection.xYes + StreamingTV.xYes + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_3) | |
vif(model_3) | |
# MonthlyCharges has high vif and but high significance, also, it came in the above model as well, even the VIF | |
# has not reduced much,so better is to remove this variable | |
# Excluding MonthlyCharges | |
model_4<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
OnlineBackup.xYes + DeviceProtection.xYes + StreamingTV.xYes + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_4) | |
vif(model_4) # cannot exclude any more variable based on vif | |
#as most of them have low vif; those with higher vif are very significant and not correlated | |
#Excluding OnlineBackup.xYes due to lower significance | |
model_5<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
DeviceProtection.xYes + StreamingTV.xYes + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_5) | |
#Excluding DeviceProtection.xYes due to lower significance | |
model_6<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
StreamingTV.xYes + StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_6) | |
#Excluding SeniorCitizen due to lower significance | |
model_7<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
StreamingTV.xYes + StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_7) | |
#Excluding StreamingTV.xYes due to lower significance with respect to other 10 variables | |
model_8<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_8) | |
######################################################################## | |
# With 10 significant variables in the model | |
final_model<- model_8 | |
####################################################################### | |
### Model Evaluation | |
### Test Data #### | |
#predicted probabilities of Churn 1 for test data | |
test_pred = predict(final_model, type = "response", | |
newdata = test[,-1]) | |
# Let's see the summary | |
summary(test_pred) | |
test$prob <- test_pred | |
View(test) | |
# Let's use the probability cutoff of 50%. | |
test_pred_churn <- factor(ifelse(test_pred >= 0.50, "Yes", "No")) | |
test_actual_churn <- factor(ifelse(test$Churn==1,"Yes","No")) | |
table(test_actual_churn,test_pred_churn) | |
####################################################################### | |
test_pred_churn <- factor(ifelse(test_pred >= 0.40, "Yes", "No")) | |
install.packages("e1071") | |
library(e1071) | |
test_conf <- confusionMatrix(test_pred_churn, test_actual_churn, positive = "Yes") | |
test_conf | |
####################################################################### | |
######################################################################################### | |
# Let's Choose the cutoff value. | |
# | |
# Let's find out the optimal probalility cutoff | |
perform_fn <- function(cutoff) | |
{ | |
predicted_churn <- factor(ifelse(test_pred >= cutoff, "Yes", "No")) | |
conf <- confusionMatrix(predicted_churn, test_actual_churn, positive = "Yes") | |
acc <- conf$overall[1] | |
sens <- conf$byClass[1] | |
spec <- conf$byClass[2] | |
out <- t(as.matrix(c(sens, spec, acc))) | |
colnames(out) <- c("sensitivity", "specificity", "accuracy") | |
return(out) | |
} | |
# Creating cutoff values from 0.003575 to 0.812100 for plotting and initiallizing a matrix of 100 X 3. | |
# Summary of test probability | |
summary(test_pred) | |
s = seq(.01,.80,length=100) | |
s | |
OUT = matrix(0,100,3) | |
for(i in 1:100) | |
{ | |
OUT[i,] = perform_fn(s[i]) | |
} | |
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2) | |
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5) | |
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5) | |
lines(s,OUT[,2],col="darkgreen",lwd=2) | |
lines(s,OUT[,3],col=4,lwd=2) | |
box() | |
legend(0,.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy")) | |
cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.01)] | |
cutoff | |
# Let's choose a cutoff value of 0.3132 for final model | |
test_cutoff_churn <- factor(ifelse(test_pred >=0.3132, "Yes", "No")) | |
conf_final <- confusionMatrix(test_cutoff_churn, test_actual_churn, positive = "Yes") | |
acc <- conf_final$overall[1] | |
sens <- conf_final$byClass[1] | |
spec <- conf_final$byClass[2] | |
acc | |
sens | |
spec | |
View(test) | |
################################################################################################## | |
### KS -statistic - Test Data ###### | |
test_cutoff_churn <- ifelse(test_cutoff_churn=="Yes",1,0) | |
test_actual_churn <- ifelse(test_actual_churn=="Yes",1,0) | |
library(ROCR) | |
#on testing data | |
pred_object_test<- prediction(test_cutoff_churn, test_actual_churn) | |
performance_measures_test<- performance(pred_object_test, "tpr", "fpr") | |
ks_table_test <- attr(performance_measures_test, "y.values")[[1]] - | |
(attr(performance_measures_test, "x.values")[[1]]) | |
max(ks_table_test) | |
#################################################################### | |
# Lift & Gain Chart | |
# plotting the lift chart | |
# Loading dplyr package | |
require(dplyr) | |
library(dplyr) | |
lift <- function(labels , predicted_prob,groups=10) { | |
if(is.factor(labels)) labels <- as.integer(as.character(labels )) | |
if(is.factor(predicted_prob)) predicted_prob <- as.integer(as.character(predicted_prob)) | |
helper = data.frame(cbind(labels , predicted_prob)) | |
helper[,"bucket"] = ntile(-helper[,"predicted_prob"], groups) | |
gaintable = helper %>% group_by(bucket) %>% | |
summarise_at(vars(labels ), funs(total = n(), | |
totalresp=sum(., na.rm = TRUE))) %>% | |
mutate(Cumresp = cumsum(totalresp), | |
Gain=Cumresp/sum(totalresp)*100, | |
Cumlift=Gain/(bucket*(100/groups))) | |
return(gaintable) | |
} | |
Churn_decile = lift(test_actual_churn, test_pred, groups = 10) | |
Churn_decile | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#############################Telecom Solution################### | |
################################################################ | |
#Business Understanding | |
#Data Understanding | |
#Data Preparation & EDA | |
#Model Building | |
#Model Evaluation | |
################################################################ | |
### Business Understanding: | |
# Based on the past and current customer information, | |
# the company has maintained a database containing personal/demographic information, | |
# the services availed by a customer and the expense information related to each customer. | |
## AIM: | |
# The aim is to automate the process of predicting | |
# if a customer would telecom or not and to find the factors affecting the telecom. | |
# Whether a customer will telecom or not will depend on data from the following three buckets: | |
# 1. Demographic Information | |
# 2. Services Availed by the customer | |
# 3. Overall Expenses | |
################################################################ | |
### Data Understanding | |
# Install and Load the required packages | |
#install.packages("MASS") | |
#install.packages("car") | |
#install.packages("e1071") | |
#install.packages("caret", dependencies = c("Depends", "Suggests")) | |
#install.packages("cowplot") | |
#install.packages("GGally") | |
#install.packages("caTools") | |
library(MASS) | |
library(car) | |
library(e1071) | |
library(caret) | |
library(ggplot2) | |
library(cowplot) | |
library(caTools) | |
# Loading 3 files | |
churn_data<- read.csv("churn_data.csv", stringsAsFactors = F) | |
customer_data<- read.csv("customer_data.csv", stringsAsFactors = F) | |
internet_data<- read.csv("internet_data.csv", stringsAsFactors = F) | |
str(churn_data) # 7043 obs of 9 variables including the target variable | |
str(customer_data) # 7043 obs of 5 variables | |
str(internet_data) # 7043 obs of 9 variables | |
# Collate the data together in one single file | |
length(unique(tolower(churn_data$customerID))) # 7043, confirming customerID is key | |
length(unique(tolower(customer_data$customerID))) # 7043, confirming customerID is key | |
length(unique(tolower(internet_data$customerID))) # 7043, confirming customerID is key | |
setdiff(churn_data$customerID,customer_data$customerID) # Identical customerID across these datasets | |
setdiff(churn_data$customerID,internet_data$customerID) # Identical customerID across these datasets | |
telecom<- merge(churn_data,customer_data, by="customerID",all = F) | |
telecom<- merge(telecom,internet_data, by="customerID", all = F) | |
View(telecom) #master file | |
################################################################ | |
### Data Preparation & Exploratory Data Analysis | |
# Understanding the structure of the collated file | |
str(telecom) #7043 obs. of 21 variables; | |
# tenure, MonthlyCharges, TotalCharges are continuous | |
# SeniorCitizen need to be changed from integer to categorical | |
# Barcharts for categorical features with stacked telecom information | |
bar_theme1<- theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), | |
legend.position="none") | |
plot_grid(ggplot(telecom, aes(x=PhoneService,fill=Churn))+ geom_bar(), | |
ggplot(telecom, aes(x=Contract,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=PaperlessBilling,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=PaymentMethod,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=gender,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=factor(SeniorCitizen),fill=Churn))+ geom_bar() +bar_theme1, | |
align = "h") | |
plot_grid(ggplot(telecom, aes(x=Partner,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=Dependents,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=MultipleLines,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=InternetService,fill=Churn))+ geom_bar()+bar_theme1, | |
align = "h") | |
plot_grid(ggplot(telecom, aes(x=OnlineSecurity,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=OnlineBackup,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=DeviceProtection,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=TechSupport,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=StreamingTV,fill=Churn))+ geom_bar()+bar_theme1, | |
ggplot(telecom, aes(x=StreamingMovies,fill=Churn))+ geom_bar()+bar_theme1, | |
align = "h") | |
#reveals strong contrast for telecom wrt Contract,InternetServices, (not availing) OnlineSecurity and Techsupport and PaymentMethod, | |
#moderate wrt whether SeniorCitizen, having partner, OnlineBackup and DeviceProtection | |
# Histogram and Boxplots for numeric variables | |
box_theme<- theme(axis.line=element_blank(),axis.title=element_blank(), | |
axis.ticks=element_blank(), axis.text=element_blank()) | |
box_theme_y<- theme(axis.line.y=element_blank(),axis.title.y=element_blank(), | |
axis.ticks.y=element_blank(), axis.text.y=element_blank(), | |
legend.position="none") | |
plot_grid(ggplot(telecom, aes(tenure))+ geom_histogram(binwidth = 10), | |
ggplot(telecom, aes(x="",y=tenure))+ geom_boxplot(width=0.1)+coord_flip()+box_theme, | |
align = "v",ncol = 1) | |
plot_grid(ggplot(telecom, aes(MonthlyCharges))+ geom_histogram(binwidth = 20), | |
ggplot(telecom, aes(x="",y=MonthlyCharges))+ geom_boxplot(width=0.1)+coord_flip()+box_theme, | |
align = "v",ncol = 1) | |
plot_grid(ggplot(telecom, aes(TotalCharges))+ geom_histogram(), | |
ggplot(telecom, aes(x="",y=TotalCharges))+ geom_boxplot(width=0.1)+coord_flip()+box_theme, | |
align = "v",ncol = 1) | |
#No outliers in numeric variables | |
# Boxplots of numeric variables relative to telecom status | |
plot_grid(ggplot(telecom, aes(x=Churn,y=tenure, fill=Churn))+ geom_boxplot(width=0.2)+ | |
coord_flip() +theme(legend.position="none"), | |
ggplot(telecom, aes(x=Churn,y=MonthlyCharges, fill=Churn))+ geom_boxplot(width=0.2)+ | |
coord_flip() + box_theme_y, | |
ggplot(telecom, aes(x=Churn,y=TotalCharges, fill=Churn))+ geom_boxplot(width=0.2)+ | |
coord_flip() + box_theme_y, | |
align = "v",nrow = 1) | |
# Shorter tenure sees more telecom | |
# Correlation between numeric variables | |
library(GGally) | |
ggpairs(telecom[, c("tenure", "MonthlyCharges", "TotalCharges")]) | |
#As expected, tenure and TotalCharges are highly correlated (corr 0.83) | |
################################################################ | |
### Data Preparation | |
# De-Duplication | |
# not needed | |
# Bringing the variables in the correct format | |
telecom$SeniorCitizen<- ifelse(telecom$SeniorCitizen==1,"Yes","No") | |
# Outlier treatment and imputing missing value | |
#Boxplot showed no outlier, Nevertheless confirming it also with percentiles | |
sapply(telecom[,c("tenure","MonthlyCharges","TotalCharges")], | |
function(x) quantile(x,seq(0,1,.01),na.rm = T)) #no outlier | |
# Missing value | |
sapply(telecom, function(x) sum(is.na(x))) # shows all 11 NAs are in Totalcharges column | |
View(subset(telecom, is.na(TotalCharges))) # All 11 NAs in TotalCharges correspond to tenure 0 | |
View(subset(telecom, telecom$tenure==0)) #vice-versa of above is also true | |
# It means that 11/7043 = 0.001561834 i.e 0.1%, best is to remove these observations from the analysis | |
telecom <- telecom[!is.na(telecom$TotalCharges),] | |
################################################################ | |
# Feature standardisation | |
# Normalising continuous features | |
telecom$tenure<- scale(telecom$tenure) # scale used: mean 32.4, sd 24.6 | |
summary(telecom$tenure) | |
sd(telecom$tenure) | |
telecom$MonthlyCharges<- scale(telecom$MonthlyCharges) # scale used: mean 64.8, sd 30.1 | |
telecom$TotalCharges<- scale(telecom$TotalCharges) # scale used: mean 2280, sd 2267 | |
# converting target variable telecom from No/Yes character to factorwith levels 0/1 | |
telecom$Churn<- ifelse(telecom$Churn=="Yes",1,0) | |
# Checking churn rate of prospect customer | |
Churn <- sum(telecom$Churn)/nrow(telecom) | |
Churn # 26.57% churn rate. | |
# creating a dataframe of categorical features | |
telecom_chr<- telecom[,-c(1,2,7,8,9)] | |
str(telecom_chr) | |
# converting categorical attributes to factor | |
telecom_fact<- data.frame(sapply(telecom_chr, function(x) factor(x))) | |
str(telecom_fact) | |
# creating dummy variables for factor attributes | |
dummies<- data.frame(sapply(telecom_fact, | |
function(x) data.frame(model.matrix(~x-1,data =telecom_fact))[,-1])) | |
check <- data.frame(model.matrix(~telecom_fact$InternetService-1,data =telecom_fact)) | |
# For variables having only two levels, verified PhoneService, PaperlessBilling, Partner, Dependents "yes" is 1, | |
#gender "male" is 1 and SeniorCitizen "1" is 1 | |
# Final dataset | |
telecom_final<- cbind(telecom[,c(9,2,7,8)],dummies) | |
View(telecom_final) #7032 obs. of 31 variables | |
######################################################################## | |
# splitting the data between train and test | |
set.seed(100) | |
indices = sample.split(telecom_final$Churn, SplitRatio = 0.7) | |
train = telecom_final[indices,] | |
test = telecom_final[!(indices),] | |
######################################################################## | |
# Logistic Regression: | |
#Initial model | |
model_1 = glm(Churn ~ ., data = train, family = "binomial") | |
summary(model_1) #AIC 4150.1....31 coeff..nullDev 5699.5...resDev 4102.1 | |
# Stepwise selection | |
library("MASS") | |
model_2<- stepAIC(model_1, direction="both") | |
summary(model_2) | |
# Removing multicollinearity through VIF check | |
library(car) | |
vif(model_2) | |
#Excluding TotalCharges | |
model_3<- glm(formula = Churn ~ tenure + MonthlyCharges + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
OnlineBackup.xYes + DeviceProtection.xYes + StreamingTV.xYes + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_3) | |
vif(model_3) | |
# MonthlyCharges has high vif and but high significance, also, it came in the above model as well, even the VIF | |
# has not reduced much,so better is to remove this variable | |
# Excluding MonthlyCharges | |
model_4<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
OnlineBackup.xYes + DeviceProtection.xYes + StreamingTV.xYes + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_4) | |
vif(model_4) # cannot exclude any more variable based on vif | |
#as most of them have low vif; those with higher vif are very significant and not correlated | |
#Excluding OnlineBackup.xYes due to lower significance | |
model_5<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
DeviceProtection.xYes + StreamingTV.xYes + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_5) | |
#Excluding DeviceProtection.xYes due to lower significance | |
model_6<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + SeniorCitizen + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
StreamingTV.xYes + StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_6) | |
#Excluding SeniorCitizen due to lower significance | |
model_7<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
StreamingTV.xYes + StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_7) | |
#Excluding StreamingTV.xYes due to lower significance with respect to other 10 variables | |
model_8<- glm(formula = Churn ~ tenure + | |
PhoneService + Contract.xOne.year + Contract.xTwo.year + | |
PaperlessBilling + PaymentMethod.xElectronic.check + | |
MultipleLines.xYes + InternetService.xFiber.optic + InternetService.xNo + | |
StreamingMovies.xYes, family = "binomial", data = train) | |
summary(model_8) | |
######################################################################## | |
# With 10 significant variables in the model | |
final_model<- model_8 | |
####################################################################### | |
### Model Evaluation | |
### Test Data #### | |
#predicted probabilities of Churn 1 for test data | |
test_pred = predict(final_model, type = "response", | |
newdata = test[,-1]) | |
# Let's see the summary | |
summary(test_pred) | |
test$prob <- test_pred | |
View(test) | |
# Let's use the probability cutoff of 50%. | |
test_pred_churn <- factor(ifelse(test_pred >= 0.50, "Yes", "No")) | |
test_actual_churn <- factor(ifelse(test$Churn==1,"Yes","No")) | |
table(test_actual_churn,test_pred_churn) | |
####################################################################### | |
test_pred_churn <- factor(ifelse(test_pred >= 0.40, "Yes", "No")) | |
install.packages("e1071") | |
library(e1071) | |
test_conf <- confusionMatrix(test_pred_churn, test_actual_churn, positive = "Yes") | |
test_conf | |
####################################################################### | |
######################################################################################### | |
# Let's Choose the cutoff value. | |
# | |
# Let's find out the optimal probalility cutoff | |
perform_fn <- function(cutoff) | |
{ | |
predicted_churn <- factor(ifelse(test_pred >= cutoff, "Yes", "No")) | |
conf <- confusionMatrix(predicted_churn, test_actual_churn, positive = "Yes") | |
acc <- conf$overall[1] | |
sens <- conf$byClass[1] | |
spec <- conf$byClass[2] | |
out <- t(as.matrix(c(sens, spec, acc))) | |
colnames(out) <- c("sensitivity", "specificity", "accuracy") | |
return(out) | |
} | |
# Creating cutoff values from 0.003575 to 0.812100 for plotting and initiallizing a matrix of 100 X 3. | |
# Summary of test probability | |
summary(test_pred) | |
s = seq(.01,.80,length=100) | |
s | |
OUT = matrix(0,100,3) | |
for(i in 1:100) | |
{ | |
OUT[i,] = perform_fn(s[i]) | |
} | |
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2) | |
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5) | |
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5) | |
lines(s,OUT[,2],col="darkgreen",lwd=2) | |
lines(s,OUT[,3],col=4,lwd=2) | |
box() | |
legend(0,.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy")) | |
cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.01)] | |
cutoff | |
# Let's choose a cutoff value of 0.3132 for final model | |
test_cutoff_churn <- factor(ifelse(test_pred >=0.3132, "Yes", "No")) | |
conf_final <- confusionMatrix(test_cutoff_churn, test_actual_churn, positive = "Yes") | |
acc <- conf_final$overall[1] | |
sens <- conf_final$byClass[1] | |
spec <- conf_final$byClass[2] | |
acc | |
sens | |
spec | |
View(test) | |
################################################################################################## | |
### KS -statistic - Test Data ###### | |
test_cutoff_churn <- ifelse(test_cutoff_churn=="Yes",1,0) | |
test_actual_churn <- ifelse(test_actual_churn=="Yes",1,0) | |
library(ROCR) | |
#on testing data | |
pred_object_test<- prediction(test_cutoff_churn, test_actual_churn) | |
performance_measures_test<- performance(pred_object_test, "tpr", "fpr") | |
ks_table_test <- attr(performance_measures_test, "y.values")[[1]] - | |
(attr(performance_measures_test, "x.values")[[1]]) | |
max(ks_table_test) | |
#################################################################### | |
# Lift & Gain Chart | |
# plotting the lift chart | |
# Loading dplyr package | |
require(dplyr) | |
library(dplyr) | |
lift <- function(labels , predicted_prob,groups=10) { | |
if(is.factor(labels)) labels <- as.integer(as.character(labels )) | |
if(is.factor(predicted_prob)) predicted_prob <- as.integer(as.character(predicted_prob)) | |
helper = data.frame(cbind(labels , predicted_prob)) | |
helper[,"bucket"] = ntile(-helper[,"predicted_prob"], groups) | |
gaintable = helper %>% group_by(bucket) %>% | |
summarise_at(vars(labels ), funs(total = n(), | |
totalresp=sum(., na.rm = TRUE))) %>% | |
mutate(Cumresp = cumsum(totalresp), | |
Gain=Cumresp/sum(totalresp)*100, | |
Cumlift=Gain/(bucket*(100/groups))) | |
return(gaintable) | |
} | |
Churn_decile = lift(test_actual_churn, test_pred, groups = 10) | |
Churn_decile | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment