########################################### I. BUSINESS UNDERSTANDING ####################################################
# Goal - To predict the probability of response of each prospect
# and target the ones most likely to respond to the next telemarketing campaign.
# Loading all the necessary libraries
library(caret)
library(caTools)
library(ggplot2)
library(MASS)
library(car)
library(dplyr)
library(dummies) # note: 'dummies' may need to be installed from the CRAN archive
# Loading the data into a data frame called bank_marketing
bank_marketing<- read.csv("bank_marketing.csv")
########################################### II. DATA UNDERSTANDING ####################################################
# Checking the structure of the given data
str(bank_marketing)
# Summary
summary(bank_marketing)
# Calculating the response rate
response_rate <- 4640/(36548+4640)
# The response rate is found to be approximately 11.27%
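# As a sanity check, the same rate can be computed directly from the data
# (a sketch, assuming response is still the original yes/no variable at this point):
prop.table(table(bank_marketing$response))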
# Checking missing values
sum(is.na(bank_marketing))
# There are no NA values; missing values have instead been recorded as "unknown"
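# Since missing values are coded as "unknown", counting them per column gives a better picture:
sapply(bank_marketing, function(x) sum(x == "unknown"))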
##### For Age Variable
# Plotting of age histogram
ggplot(bank_marketing,aes(age)) + geom_histogram()
# Age is spread from 17 to 98 years. This clearly needs outlier treatment.
# Checking the outlier in age
quantile(bank_marketing$age, seq(0,1,0.01))
# We see a jump at the 99th percentile, where age is 71. A box plot will give more insight into the outliers.
# Using box plot to check outliers in age
boxplot(bank_marketing$age)
# Clearly, values above 71 can be treated as outliers.
# So, let's cap the upper age limit at 71
bank_marketing[(which(bank_marketing$age>71)),]$age <- 71
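# Verifying the cap took effect (the maximum age should now be 71):
max(bank_marketing$age)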
#Binning the age variable will help ease the analysis and give better insights
bank_marketing$age_binned <- as.factor(cut(bank_marketing$age, breaks = c(16, 20, 30, 40, 50, 60, 70, 80)))
#Changing the response value to numbers
bank_marketing$response <- ifelse(bank_marketing$response == "yes", 1, 0)
# Check the numeric value of response rate in each bucket
aggregate_age <- merge(aggregate(response ~ age_binned, bank_marketing, mean),aggregate(response~age_binned, bank_marketing, sum),by = "age_binned")
# Adding No_of_prospect
count <- data.frame(table(bank_marketing$age_binned))
count <- count[,-1]
aggregate_age <- cbind(aggregate_age,count)
# Renaming the columns of the aggregate_age data frame
colnames(aggregate_age) <- c("age", "response_rate", "count_prospects","No_of_prospect")
# Round Off the values
aggregate_age$response_rate <- format(round(aggregate_age$response_rate, 2))
aggregate_age
# Plotting graph for response rate of each age bucket
ggplot(aggregate_age, aes(age, No_of_prospect,label = response_rate)) +
geom_bar(stat = 'identity') + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
geom_text(size = 3, vjust = -0.5)
# We see that most of the data set lies in the age range 20 to 60.
# Checking dataset of age less than 20 years.
bank_marketing_age20 <- subset(bank_marketing,age <20)
summary(bank_marketing_age20)
#### Writing a function to do the same task for each variable
plot_create <- function(category_variable, variable_name){
  a <- aggregate(response ~ category_variable, bank_marketing, mean)
  count <- data.frame(table(category_variable))
  count <- count[, -1]
  aggregate_response <- cbind(a, count)
  colnames(aggregate_response) <- c(variable_name, "response_rate", "No_of_Prospect")
  aggregate_response[, 2] <- format(round(aggregate_response[, 2], 2))
  # Plot against the renamed No_of_Prospect column (the original referenced the
  # local 'count' variable, which only worked by accident)
  ggplot(aggregate_response, aes(aggregate_response[, 1], No_of_Prospect, label = response_rate)) +
    geom_bar(stat = 'identity') +
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    geom_text(size = 3, vjust = -0.5) +
    xlab(variable_name)
}
##### For Job Variable
# Levels of job
levels(bank_marketing$job)
#Summary job
summary(bank_marketing$job)
# Plotting for job variable.
plot_create(bank_marketing$job, "job")
# Admin and blue-collar jobs are the most numerous. However, the response rate is highest
# among retired people and students
##### For Marital Status Variable
# Levels of Marital Status
levels(bank_marketing$marital)
# Summary of Marital Status
summary(bank_marketing$marital)
# Recoding the "unknown" level as "married"
levels(bank_marketing$marital)[4] <- "married"
# Plotting marital status
plot_create(bank_marketing$marital,"Marital Status")
#Clearly, no impact on the response rate.
##### For Education Variable
# Levels of education
levels(bank_marketing$education)
# Summary of Education
summary(bank_marketing$education)
# Reducing the levels of education variable
levels(bank_marketing$education)[c(1:3,5)] <- "Primary_Education"
levels(bank_marketing$education)[2] <- "Secondary_Education"
levels(bank_marketing$education)[4]<- "Tertiary_Education"
# Plotting Education
plot_create(bank_marketing$education,"Education")
#Clearly, no impact on the response rate.
##### For default variable
# Levels of default
levels(bank_marketing$default)
# Summary of default
summary(bank_marketing$default)
# Plotting default
plot_create(bank_marketing$default, "Default")
# Clearly, no impact on the response rate, so dropping the default variable (column 5)
bank_marketing <- bank_marketing[,-5]
##### For housing variable
# Levels of housing
levels(bank_marketing$housing)
# Summary of housing
summary(bank_marketing$housing)
#Plotting housing
plot_create(bank_marketing$housing, "Housing")
# Clearly, not much of an impact on the response.
##### For loan variable
# Levels of loan
levels(bank_marketing$loan)
# Summary of loan
summary(bank_marketing$loan)
#Plotting loan
plot_create(bank_marketing$loan, "Loan Status")
# Clearly, not much of an impact on the response.
##### For contact variable
# Levels of contact
levels(bank_marketing$contact)
# Summary of contact
summary(bank_marketing$contact)
#Plotting contact
plot_create(bank_marketing$contact, "Contact Mode")
# Cellular contact has more impact on the response than telephone does.
##### For month variable
# Levels of month
levels(bank_marketing$month)
# Summary of month
summary(bank_marketing$month)
#Plotting month
plot_create(bank_marketing$month, "Contact Month")
# The contact months seem to have an impact on the response.
##### For day_of_week variable
# Levels of day_of_week
levels(bank_marketing$day_of_week)
# Summary of day_of_week
summary(bank_marketing$day_of_week)
#Plotting day_of_week
plot_create(bank_marketing$day_of_week, "Day Of Week")
# The day of the week has no impact on response.
##### For duration variable
# Duration is a continuous (numeric) variable.
# Summary of duration
summary(bank_marketing$duration)
# Plotting histogram
ggplot(bank_marketing,aes(duration))+geom_histogram()
# Average duration
bank_marketing$response_1 <- as.factor(bank_marketing$response)
Avg_duration <- aggregate(duration~response_1,bank_marketing,mean)
# Dropping the helper columns (age_binned and response_1)
bank_marketing <- bank_marketing[,-(21:22)]
##### For campaign variable
#summary
summary(bank_marketing$campaign)
# Box plot to check outliers
boxplot(bank_marketing$campaign)
#quantile
quantile(bank_marketing$campaign,seq(0,1,0.01))
# We see a huge jump at the 99th percentile, whose value is 14.
# So, let's cap campaign at 14
bank_marketing[which(bank_marketing$campaign>14),]$campaign <- 14
# Plotting campaign
ggplot(bank_marketing,aes(campaign))+geom_histogram()
##### For pdays variable
#convert pdays to factor type
bank_marketing$pdays<- as.factor(bank_marketing$pdays)
# check levels
levels(bank_marketing$pdays)
# summary
summary(bank_marketing$pdays)
# Reducing the levels (the remaining levels re-index after each collapse).
levels(bank_marketing$pdays)[1:10] <- "Contacted_in_first_10days"
levels(bank_marketing$pdays)[2:17] <-"Contacted_after_10days"
levels(bank_marketing$pdays)[3] <- "First_time_contacted"
# Plotting pdays
plot_create(bank_marketing$pdays, "Pdays")
# Contacted_in_first_10days and Contacted_after_10days have a greater response rate
# Summary of each category
summary(bank_marketing$pdays)
##### For previous variable
#summary of previous
summary(bank_marketing$previous)
# The maximum is 7, so it is best to convert this variable to a factor
#to factor
bank_marketing$previous <- as.factor(bank_marketing$previous)
#levels of previous
levels(bank_marketing$previous)
#Reducing levels
levels(bank_marketing$previous)[1]<-"Never contacted"
levels(bank_marketing$previous)[2:4] <- "Less_than_3_times"
levels(bank_marketing$previous)[3:6] <- "More than_3_times"
#Summary
summary(bank_marketing$previous)
#Plotting Previous
plot_create(bank_marketing$previous,"Previous Contacts")
# More than_3_times seems to have a greater impact on response rates.
##### For Poutcome variable
# Levels of poutcome
levels(bank_marketing$poutcome)
# Summary of poutcome
summary(bank_marketing$poutcome)
#Plotting poutcome
plot_create(bank_marketing$poutcome, "Outcome_of_Previous_contacts")
# This seems to have an impact on response.
##### For emp.var.rate variable
#summary
summary(bank_marketing$emp.var.rate)
# Plotting histogram
ggplot(bank_marketing,aes(emp.var.rate))+geom_histogram()
##### For cons.price.idx variable
#summary
summary(bank_marketing$cons.price.idx)
# Plotting histogram
ggplot(bank_marketing,aes(cons.price.idx))+geom_histogram()
##### For cons.conf.idx variable
#summary
summary(bank_marketing$cons.conf.idx)
##### For euribor3m variable
#summary
summary(bank_marketing$euribor3m)
##### For nr.employed
#summary
summary(bank_marketing$nr.employed)
# Adding a prospect ID based on the row names
cust_num <- rownames(bank_marketing)
bank_marketing <- cbind(cust_num=cust_num, bank_marketing)
colnames(bank_marketing)[1] <- "prospectID"
#Creating two new data frames
bank_marketing_1 <- bank_marketing
bank_marketing_2 <- bank_marketing
########################################### III. MODEL BUILDING ####################################################
# We will be using Logistic Regression to build the model.
# Let's start with the creation of dummy variables
bank_marketing$response <- as.integer(bank_marketing$response)
bank_marketing <- cbind(bank_marketing[,1],dummy.data.frame(bank_marketing[,-1]))
colnames(bank_marketing)[1] <- "prospectID"
bank_marketing$response <- as.factor(ifelse(bank_marketing$response == 1, "yes", "no"))
# splitting into train and test data
set.seed(1)
split_indices <- sample.split(bank_marketing$response, SplitRatio = 0.70)
train <- bank_marketing[split_indices, ]
test <- bank_marketing[!split_indices, ]
nrow(train)/nrow(bank_marketing)
nrow(test)/nrow(bank_marketing)
# As per the requirement mentioned in the problem statement, let's remove the duration variable (column 46)
test <- test[,-46]
train <- train[,-46]
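# A more robust alternative (a sketch): drop the duration column by name
# rather than by its position, in case the column order changes
# train <- train[, colnames(train) != "duration"]
# test <- test[, colnames(test) != "duration"]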
# Next, let's build Model 1 with all variables
Model_1 <- glm(response ~ ., family = "binomial", data = train[,-1])
summary(Model_1)
# Using stepwise selection (stepAIC) to remove insignificant variables
Model_2 <- stepAIC(Model_1, direction = "both")
summary(Model_2)
# stepAIC has removed some variables and only the following ones remain
Model_3 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + monthnov + monthoct + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + emp.var.rate + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
# Let's now check the VIFs for Model_3
vif(Model_3)
# The emp.var.rate variable has a very high VIF of 139.73
summary(Model_3)
# Let's remove the emp.var.rate variable and rebuild the model.
# Rebuilding Model_3 without emp.var.rate
Model_3 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + monthnov + monthoct + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_3)
summary(Model_3)
# monthmay has VIF 2.96, monthjul has VIF 2.04, and monthjun has VIF 2.01, but all three are significant.
# So let's instead remove monthnov, which has VIF 1.50 but is not significant, and build the next model.
#Model 4
Model_4 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + monthoct + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_4)
summary(Model_4)
# monthoct has VIF 1.23 and is not significant. So let's remove it.
#Building Model 5 by removing monthoct
Model_5 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_5)
summary(Model_5)
# educationTertiary_Education has high VIF and low significance. Let's remove it.
#Model 6
Model_6 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_6)
summary(Model_6)
# day_of_weekfri has high VIF and no significance. Let's remove it.
#Model 7
Model_7 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_7)
summary(Model_7)
# cons.price.idx has high VIF and no significance. Let's remove it.
#Model 8
Model_8 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
summary(Model_8)
# previousMore than_3_times has no significance. Let's remove it.
#Model 9
Model_9 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_9)
# maritaldivorced has no significance. Let's remove it.
#Model 10
Model_10 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_10)
# jobadmin. has no significance. Let's remove it.
#Model 11
Model_11 <- glm(formula = response ~ jobretired + jobstudent + jobtechnician +
educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_11)
# jobtechnician has no significance. Let's remove it.
#Model 12
Model_12 <- glm(formula = response ~ jobretired + jobstudent +
educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_12)
# In Model_12, all the remaining variables are significant, so Model_12 is the final model.
########################################### IV. MODEL EVALUATION ####################################################
# Predicting on the test data
predictions_logit <- predict(Model_12, newdata = test[, -c(1,61)], type = "response")
summary(predictions_logit)
# First, we try a probability cutoff of 50%.
predicted_response <- factor(ifelse(predictions_logit >= 0.50, "yes", "no"))
# Creating a confusion matrix to evaluate the model.
conf <- confusionMatrix(predicted_response, test$response, positive = "yes")
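# Pulling the reported metrics out of the confusionMatrix object:
conf$overall["Accuracy"]
conf$byClass[c("Sensitivity", "Specificity")]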
# Accuracy - 89.96%
# Sensitivity - 23.42%
# Specificity - 98.41%
# Sensitivity is very low; we would like to improve it. So, let's find the optimal probability cutoff.
# Writing a function for the optimal probability cutoff
probability_cutoff_func <- function(cutoff)
{
  # Fixing both factor levels so confusionMatrix works even if one class is absent at an extreme cutoff
  predicted_response <- factor(ifelse(predictions_logit >= cutoff, "yes", "no"), levels = c("no", "yes"))
  conf <- confusionMatrix(predicted_response, test$response, positive = "yes")
  acc <- conf$overall[1]
  sens <- conf$byClass[1]
  spec <- conf$byClass[2]
  out <- t(as.matrix(c(sens, spec, acc)))
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}
# Creating 100 cutoff values from 0.01 to 0.93 for plotting and initializing a 100 x 3 matrix.
s <- seq(.01, .93, length = 100)
OUT <- matrix(0, 100, 3)
for(i in 1:100)
{
  OUT[i,] <- probability_cutoff_func(s[i])
}
# plotting cutoffs
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0,.50,col=c(2,"darkgreen",4),lwd=c(2,2,2),c("Sensitivity","Specificity","Accuracy"))
cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.03)]
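# Inspecting the candidate cutoff(s) where sensitivity and specificity differ by less than 0.03:
cutoff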
# From the above plot, we found a cutoff value of approximately 0.078 (7.8%) for our final model
predicted_response <- factor(ifelse(predictions_logit >= 0.078, "yes", "no"))
conf_final <- confusionMatrix(predicted_response, test$response, positive = "yes")
accuracy1 <- conf_final$overall[1]
sensitivity1 <- conf_final$byClass[1]
specificity1 <- conf_final$byClass[2]
# Accuracy - 73.74%
# Sensitivity - 70.11%
# Specificity - 74.20%
################### Lift and Gain Charts
prediction_1 <- predict(Model_12, newdata = bank_marketing[, -c(1,46,62)], type = "response")
summary(prediction_1)
prediction_response <- factor(ifelse(prediction_1 >= 0.078, "yes", "no"))
bank_marketing$Predicted_response <- prediction_response
bank_marketing$Predicted_probability <- prediction_1
# Creating a new data frame as per the given problem.
test_bank_marketing <- bank_marketing[, c("prospectID","duration", "response", "Predicted_response", "Predicted_probability")]
# Given in the problem statement: cost per call (INR) = 0.033*(duration_in_seconds) + 0.8
# Let's compute that now
test_bank_marketing$call_cost <- 0.033 * bank_marketing$duration + 0.8
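# A quick look at the distribution of per-call costs:
summary(test_bank_marketing$call_cost)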
test_bank_marketing <- test_bank_marketing[order(test_bank_marketing$Predicted_probability, decreasing = T), ]
# Lift function (the unused call_cost argument has been dropped, and the
# deprecated summarise_at/funs idiom replaced with a plain summarise)
Func_Lift <- function(labels, Predicted_probability, groups = 10) {
  if(is.factor(labels)) labels <- as.integer(as.character(labels))
  if(is.factor(Predicted_probability)) Predicted_probability <- as.integer(as.character(Predicted_probability))
  helper <- data.frame(cbind(labels, Predicted_probability))
  helper[,"bucket"] <- ntile(-helper[,"Predicted_probability"], groups)
  gaintable <- helper %>% group_by(bucket) %>%
    summarise(total = n(),
              totalresp = sum(labels, na.rm = TRUE)) %>%
    mutate(Cumresp = cumsum(totalresp),
           Gain = Cumresp/sum(totalresp)*100,
           Cumlift = Gain/(bucket*(100/groups)))
  return(gaintable)
}
# Getting cumulative gain and lift
test_bank_marketing$response <- as.factor(ifelse(test_bank_marketing$response=="yes",1,0))
liftandgain = Func_Lift(test_bank_marketing$response, test_bank_marketing$Predicted_probability, groups = 10)
test_bank_marketing$number <- 1:nrow(test_bank_marketing)
test_bank_marketing <- test_bank_marketing%>% mutate(bucket = ntile(number, 10))
Call_agg <- aggregate(call_cost~bucket,test_bank_marketing,sum)
liftandgain <- merge(liftandgain,Call_agg,by="bucket")
liftandgain$cum_cost <- cumsum(liftandgain$call_cost)
liftandgain$cost_percent <- (liftandgain$call_cost/sum(liftandgain$call_cost))*100
liftandgain <- merge(liftandgain,aggregate(duration~bucket,test_bank_marketing,mean),by="bucket")
names(liftandgain)[10] <- "Average_duration"
## Then we get the Average cost per call
liftandgain$Avg_call_cost <- liftandgain$call_cost/liftandgain$total
# Now, we can plot the gain chart
# It shows the percentage of responders captured (y-axis) against the prospect deciles targeted (x-axis)
plot(liftandgain$bucket,liftandgain$Gain,col="blue",type="l",main="Gain Chart",xlab="Prospects Targeted (decile)",ylab = "Positive Responses Captured (%)")
# Next, we plot the lift chart
# It compares the response rate with the model against the response rate without it
plot(liftandgain$bucket,liftandgain$Cumlift,col="blue",type="l",main="Lift Chart",xlab="Prospects Targeted (decile)",ylab = "Lift")
# From the gain chart, we conclude that our targeting captures about 80.79% (~81%) of the responders.
# The average cost per call for the targeted customers is about INR 9.77.
# The average duration of a call for the targeted customers is about 271.96 seconds.
# The total cost for all the targeted customers comes to approximately INR 2 lakh, which is better than a random model.