Created
January 17, 2021 20:15
-
-
Save 15Nik/1f59bd143984de9f403e4d094ccdeba8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################### I. BUSINESS UNDERSTANDING #################################################### | |
# Goal - To predict the probability of response of each prospect | |
# and target the ones most likely to respond to the next telemarketing campaign. | |
#Including all the necessary libraries | |
library(caret) | |
library(caTools) | |
library(ggplot2) | |
library(MASS) | |
library(car) | |
library(dplyr) | |
library(dummies) | |
# Loading the data into a data frame called bank_marketing | |
bank_marketing<- read.csv("bank_marketing.csv") | |
########################################### II. DATA UNDERSTANDING #################################################### | |
# Checking the structure of the given data | |
str(bank_marketing) | |
# Summary | |
summary(bank_marketing) | |
# Calculating the response rate | |
response_rate <- 4640/(36548+4640) | |
#The response rate is found to be 11.26% | |
# Checking missing values | |
sum(is.na(bank_marketing)) | |
# Clearly, there are no missing values found as all missing values have been reported as "unknown" | |
##### For Age Variable | |
# Plotting of age histogram | |
ggplot(bank_marketing,aes(age)) + geom_histogram() | |
# We can see that the age is spread between 17 to 98 years. This clearly needs outlier treatment. | |
# Checking the outlier in age | |
quantile(bank_marketing$age, seq(0,1,0.01)) | |
# We see a jump in age at 99% which has 71 age. Box plot will give more insight on the outliers. | |
# Using box plot to check outliers in age | |
boxplot(bank_marketing$age) | |
# clearly, after 71 age, the other values can be removed. | |
#So, lets Cap the upper age limit to 71 | |
bank_marketing[(which(bank_marketing$age>71)),]$age <- 71 | |
#Binning the age variable will help ease the analysis and give better insights | |
bank_marketing$age_binned <- as.factor(cut(bank_marketing$age, breaks = c(16, 20, 30, 40, 50, 60, 70, 80))) | |
#Changing the response value to numbers | |
bank_marketing$response <- ifelse(bank_marketing$response == "yes", 1, 0) | |
# Check the numeric value of response rate in each bucket | |
aggregate_age <- merge(aggregate(response ~ age_binned, bank_marketing, mean),aggregate(response~age_binned, bank_marketing, sum),by = "age_binned") | |
# Adding No_of_prospect | |
count <- data.frame(table(bank_marketing$age_binned)) | |
count <- count[,-1] | |
aggregate_age <- cbind(aggregate_age,count) | |
# Changing column name of each variables in aggregate_age dataframe | |
colnames(aggregate_age) <- c("age", "response_rate", "count_prospects","No_of_prospect") | |
# Round Off the values | |
aggregate_age$response_rate <- format(round(aggregate_age$response_rate, 2)) | |
aggregate_age | |
# Plotting graph for response rate of each age bucket | |
ggplot(aggregate_age, aes(age, No_of_prospect,label = response_rate)) + | |
geom_bar(stat = 'identity') + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + | |
geom_text(size = 3, vjust = -0.5) | |
# We see that most of the data set lies in the age range 20 to 60. | |
# Checking dataset of age less than 20 years. | |
bank_marketing_age20 <- subset(bank_marketing,age <20) | |
summary(bank_marketing_age20) | |
#### Writing a function to do the same task for each variable | |
plot_create <- function(category_variable, variable_name){ | |
a <- aggregate(response~category_variable, bank_marketing, mean) | |
count <- data.frame(table(category_variable)) | |
count <- count[,-1] | |
aggregate_response <- cbind(a, count) | |
colnames(aggregate_response) <- c(variable_name, "response_rate","No_of_Prospect") | |
aggregate_response[, 2] <- format(round(aggregate_response[, 2], 2)) | |
ggplot(aggregate_response, aes(aggregate_response[, 1], count, label = response_rate)) + geom_bar(stat = 'identity') + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + geom_text(size = 3, vjust = -0.5) + xlab(variable_name) | |
} | |
##### For Job Variable | |
# Levels of job | |
levels(bank_marketing$job) | |
#Summary job | |
summary(bank_marketing$job) | |
# Plotting for job variable. | |
plot_create(bank_marketing$job, "job") | |
# Admin and blue-collar jobs are high in number. However, the response rate is good | |
# among retired and students | |
##### For Martial Status Variable | |
# Levels of Marial Status | |
levels(bank_marketing$marital) | |
# Summary of Martial status | |
summary(bank_marketing$marital) | |
# Replacing Unknown level to married | |
levels(bank_marketing$marital)[4] <- "married" | |
# Plotting marital status | |
plot_create(bank_marketing$marital,"Marital Status") | |
#Clearly, no impact on the response rate. | |
##### For Education Variable | |
# Levels of education | |
levels(bank_marketing$education) | |
# Summary of Education | |
summary(bank_marketing$education) | |
# Reducing the levels of education variable | |
levels(bank_marketing$education)[c(1:3,5)] <- "Primary_Education" | |
levels(bank_marketing$education)[2] <- "Secondary_Education" | |
levels(bank_marketing$education)[4]<- "Tertiary_Education" | |
# Plotting Education | |
plot_create(bank_marketing$education,"Education") | |
#Clearly, no impact on the response rate. | |
##### For default variable | |
# Levels of default | |
levels(bank_marketing$default) | |
# Summary of default | |
summary(bank_marketing$default) | |
# Plotting default | |
plot_create(bank_marketing$default, "Default") | |
#Clearly, no impact on the response rate. | |
bank_marketing <- bank_marketing[,-5] | |
##### For housing variable | |
# Levels of housing | |
levels(bank_marketing$housing) | |
# Summary of housing | |
summary(bank_marketing$housing) | |
#Plotting housing | |
plot_create(bank_marketing$housing, "Housing") | |
# Clearly, not much of an impact on the response. | |
##### For loan variable | |
# Levels of loan | |
levels(bank_marketing$loan) | |
# Summary of loan | |
summary(bank_marketing$loan) | |
#Plotting loan | |
plot_create(bank_marketing$loan, "Loan Status") | |
# Clearly, not much of an impact on the response. | |
##### For contact variable | |
# Levels of contact | |
levels(bank_marketing$contact) | |
# Summary of contact | |
summary(bank_marketing$contact) | |
#Plotting contact | |
plot_create(bank_marketing$contact, "Contact Mode") | |
# Cellular has some impact on the response than telephone. | |
##### For month variable | |
# Levels of month | |
levels(bank_marketing$month) | |
# Summary of month | |
summary(bank_marketing$month) | |
#Plotting month | |
plot_create(bank_marketing$month, "Contact Month") | |
# The contact months seem to have an impact on the response. | |
##### For day_of_week variable | |
# Levels of day_of_week | |
levels(bank_marketing$day_of_week) | |
# Summary of day_of_week | |
summary(bank_marketing$day_of_week) | |
#Plotting day_of_week | |
plot_create(bank_marketing$day_of_week, "Day Of Week") | |
# The day of the week has no imapct on response. | |
##### For duration variable | |
# Duration is a quantile varibale. | |
# Summary of duration | |
summary(bank_marketing$duration) | |
# Plotting histogram | |
ggplot(bank_marketing,aes(duration))+geom_histogram() | |
# Average duration | |
bank_marketing$response_1 <- as.factor(bank_marketing$response) | |
Avg_duration <- aggregate(duration~response_1,bank_marketing,mean) | |
bank_marketing <- bank_marketing[,-(21:22)] | |
##### For campaign variable | |
#summary | |
summary(bank_marketing$campaign) | |
# percentile distribution | |
boxplot(bank_marketing$campaign) | |
#quantile | |
quantile(bank_marketing$campaign,seq(0,1,0.01)) | |
# We see a huge jump at 99%. So lets cap at it. | |
# Capping at 99%, value is 14 | |
bank_marketing[which(bank_marketing$campaign>14),]$campaign <- 14 | |
# Plotting campaign | |
ggplot(bank_marketing,aes(campaign))+geom_histogram() | |
##### For pdays variable | |
#convert pdays to factor type | |
bank_marketing$pdays<- as.factor(bank_marketing$pdays) | |
# check levels | |
levels(bank_marketing$pdays) | |
# summary | |
summary(bank_marketing$pdays) | |
# Reducing the levels. | |
levels(bank_marketing$pdays)[1:10] <- "Contacted_in_first_10days" | |
levels(bank_marketing$pdays)[2:17] <-"Contacted_after_10days" | |
levels(bank_marketing$pdays)[3] <- "First_time_contacted" | |
# Plotting campaign | |
plot_create(bank_marketing$pday,"Pday") | |
#Contacted_in_first_10days and Contacted_after_10days have greater response rate | |
# Summary each category | |
summary(bank_marketing$pdays) | |
##### For previous variable | |
#summary of previous | |
summary(bank_marketing$previous) | |
# Max=7, best is to convert this variable to factor | |
#to factor | |
bank_marketing$previous <- as.factor(bank_marketing$previous) | |
#levels of previous | |
levels(bank_marketing$previous) | |
#Reducing levels | |
levels(bank_marketing$previous)[1]<-"Never contacted" | |
levels(bank_marketing$previous)[2:4] <- "Less_than_3_times" | |
levels(bank_marketing$previous)[3:6] <- "More than_3_times" | |
#Summary | |
summary(bank_marketing$previous) | |
#Plotting Previous | |
plot_create(bank_marketing$previous,"Previous Contacts") | |
# More than_3_times seems to be having greater impact on response rates. | |
##### For Poutcome variable | |
# Levels of poutcome | |
levels(bank_marketing$poutcome) | |
# Summary of poutcome | |
summary(bank_marketing$poutcome) | |
#Plotting poutcome | |
plot_create(bank_marketing$poutcome, "Outcome_of_Previous_contacts") | |
# This seems to have an impact on response. | |
##### For emp.var.rate variable | |
#summary | |
summary(bank_marketing$emp.var.rate) | |
# Plotting histogram | |
ggplot(bank_marketing,aes(emp.var.rate))+geom_histogram() | |
##### For cons.price.idx variable | |
#summary | |
summary(bank_marketing$cons.price.idx) | |
# Plotting histogram | |
ggplot(bank_marketing,aes(cons.price.idx))+geom_histogram() | |
##### For cons.conf.idx variable | |
#summary | |
summary(bank_marketing$cons.conf.idx) | |
##### For ceuribor3m | |
#summary | |
summary(bank_marketing$euribor3m) | |
##### For nr.employed | |
#summary | |
summary(bank_marketing$nr.employed) | |
cust_num <- rownames(bank_marketing) | |
bank_marketing <- cbind(cust_num=cust_num, bank_marketing) | |
colnames(bank_marketing)[1] <- "prospectID" | |
#Creating two new data frames | |
bank_marketing_1 <- bank_marketing | |
bank_marketing_2 <- bank_marketing | |
########################################### III. MODEL BUILDING #################################################### | |
# We will be using Logistic Regression to build the model. | |
#Lets start with the creation of dummy variables | |
bank_marketing$response <- as.integer(bank_marketing$response) | |
bank_marketing <- cbind(bank_marketing[,1],dummy.data.frame(bank_marketing[,-1])) | |
colnames(bank_marketing)[1] <- "prospectID" | |
bank_marketing$response <- as.factor(ifelse(bank_marketing$response == 1, "yes", "no")) | |
# splitting into train and test data | |
set.seed(1) | |
split_indices <- sample.split(bank_marketing$response, SplitRatio = 0.70) | |
train <- bank_marketing[split_indices, ] | |
test <- bank_marketing[!split_indices, ] | |
nrow(train)/nrow(bank_marketing) | |
nrow(test)/nrow(bank_marketing) | |
#As per the the requirement mentioned in the problems, lets remove the duration variable | |
test <- test[,-46] | |
train <- train[,-46] | |
#Next, Lets start building the Model 1 | |
Model_1 <- glm(response ~ ., family = "binomial", data = train[,-1]) | |
summary(Model_1) | |
# Using stepwise to remove insignificant variables | |
Model_2 <- stepAIC(Model_1, direction = "both") | |
summary(Model_2) | |
# stepAIC has removed some variables and only the following ones remain | |
Model_3 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + educationTertiary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + monthnov + monthoct + day_of_weekfri + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + emp.var.rate + cons.price.idx + cons.conf.idx + | |
nr.employed + `previousMore than_3_times`, | |
family = "binomial", data = train[,-1]) | |
# Lets find out VIF now for Model_3 | |
vif(Model_3) | |
#emp.var.rate variable has very high VIF value at 139.73 | |
summary(Model_3) | |
#lets remove the emp.var.rate variable and build the next model. | |
#Building model 3 | |
Model_3 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + educationTertiary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + monthnov + monthoct + day_of_weekfri + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.price.idx + cons.conf.idx + | |
nr.employed + `previousMore than_3_times`, | |
family = "binomial", data = train[,-1]) | |
vif(Model_3) | |
summary(Model_3) | |
#monthmay has VIF 2.96, monthjul has VIF 2.04, monthjun has Vif 2.01. | |
# But all are significant variables. SO we remove monthnov which has VIF 1.499 but is not significant. | |
#Lets remove the monthnov variable and build the next model. | |
#Model 4 | |
Model_4 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + educationTertiary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + monthoct + day_of_weekfri + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.price.idx + cons.conf.idx + | |
nr.employed + `previousMore than_3_times`, | |
family = "binomial", data = train[,-1]) | |
vif(Model_4) | |
summary(Model_4) | |
# monthoct has VIF 1.23 and is not significant. So lets remove it. | |
#Building Model 5 by removing monthoct | |
Model_5 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + educationTertiary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekfri + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.price.idx + cons.conf.idx + | |
nr.employed + `previousMore than_3_times`, | |
family = "binomial", data = train[,-1]) | |
vif(Model_5) | |
summary(Model_5) | |
#educationTertiary_Education has high VIF and low significance. Lets remove it. | |
#Model 6 | |
Model_6 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekfri + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.price.idx + cons.conf.idx + | |
nr.employed + `previousMore than_3_times`, | |
family = "binomial", data = train[,-1]) | |
vif(Model_6) | |
summary(Model_6) | |
#day_of_weekfri has high VIF and no significance. Lets remove it. | |
#Model 7 | |
Model_7 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.price.idx + cons.conf.idx + | |
nr.employed + `previousMore than_3_times`, | |
family = "binomial", data = train[,-1]) | |
vif(Model_7) | |
summary(Model_7) | |
#cons.price.idx has high VIF and no significance. Lets remove it. | |
#Model 8 | |
Model_8 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.conf.idx + | |
nr.employed + `previousMore than_3_times`, | |
family = "binomial", data = train[,-1]) | |
summary(Model_8) | |
#previousMore than_3_times has no significance. Lets remove it. | |
#Model 9 | |
Model_9 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
maritaldivorced + educationPrimary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.conf.idx + | |
nr.employed, | |
family = "binomial", data = train[,-1]) | |
summary(Model_9) | |
#martialdivorced has no significance. Lets remove it. | |
#Model 10 | |
Model_10 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician + | |
educationPrimary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.conf.idx + | |
nr.employed, | |
family = "binomial", data = train[,-1]) | |
summary(Model_10) | |
#jobadmin. has no significance. Lets remove it. | |
#Model 11 | |
Model_11 <- glm(formula = response ~ jobretired + jobstudent + jobtechnician + | |
educationPrimary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.conf.idx + | |
nr.employed, | |
family = "binomial", data = train[,-1]) | |
summary(Model_11) | |
#jobtechnician has no significance. Lets remove it. | |
#Model 12 | |
Model_12 <- glm(formula = response ~ jobretired + jobstudent + | |
educationPrimary_Education + | |
contactcellular + monthapr + monthjul + monthjun + monthmar + | |
monthmay + day_of_weekmon + | |
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days + | |
poutcomefailure + cons.conf.idx + | |
nr.employed, | |
family = "binomial", data = train[,-1]) | |
summary(Model_12) | |
# At the model 12, we get all the significant variables. So I would like to have Model_12 as my final model. | |
########################################### IV. MODEL EVALUATION #################################################### | |
# Predicting test data | |
predictions_logit <- predict(Model_12, newdata = test[, -c(1,61)], type = "response") | |
summary(predictions_logit) | |
# At first we try with the probability cutoff of 50%. | |
predicted_response <- factor(ifelse(predictions_logit >= 0.50, "yes", "no")) | |
# Creating confusion matrix for identifying the model evaluation. | |
conf <- confusionMatrix(predicted_response, test$response, positive = "yes") | |
# Accuracy - 89.96% | |
# Sensitivity - 23.42% | |
# Specificity - 98.41% | |
# We would like to improve the specificity percentage. So, lets try to get the optimal probability cut-off. | |
# Writing a function for optimal probalility cutoff | |
probability_cutoff_func <- function(cutoff) | |
{ | |
predicted_response <- factor(ifelse(predictions_logit >= cutoff, "yes", "no")) | |
conf <- confusionMatrix(predicted_response, test$response, positive = "yes") | |
acc <- conf$overall[1] | |
sens <- conf$byClass[1] | |
spec <- conf$byClass[2] | |
out <- t(as.matrix(c(sens, spec, acc))) | |
colnames(out) <- c("sensitivity", "specificity", "accuracy") | |
return(out) | |
} | |
# Creating cutoff values from 0.01 to 0.99 for plotting and initiallizing a matrix of 1000 X 4. | |
s = seq(.01,.93,length=100) | |
OUT = matrix(0,100,3) | |
for(i in 1:100) | |
{ | |
OUT[i,] = probability_cutoff_func(s[i]) | |
} | |
# plotting cutoffs | |
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2) | |
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5) | |
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5) | |
lines(s,OUT[,2],col="darkgreen",lwd=2) | |
lines(s,OUT[,3],col=4,lwd=2) | |
box() | |
legend(0,.50,col=c(2,"darkgreen",4,"darkred"),lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy")) | |
cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.03)] | |
# From the above plot, we found the cutoff value of 11% for our final model | |
predicted_response <- factor(ifelse(predictions_logit >= 0.078, "yes", "no")) | |
conf_final <- confusionMatrix(predicted_response, test$response, positive = "yes") | |
accuracy1 <- conf_final$overall[1] | |
sensitivity1 <- conf_final$byClass[1] | |
specificity1 <- conf_final$byClass[2] | |
#accuracy | |
#73.74% | |
#sensitivity | |
#70.11% | |
#specificity | |
#74.20% | |
################### Lift and Gain Charts | |
prediction_1 <- predict(Model_12, newdata = bank_marketing[, -c(1,46,62)], type = "response") | |
summary(prediction_1) | |
prediction_response <- factor(ifelse(prediction_1 >= 0.078, "yes", "no")) | |
bank_marketing$Predicted_response <- prediction_response | |
bank_marketing$Predicted_probability <- prediction_1 | |
# A new test dataframe as per given problem. | |
test_bank_marketing <- bank_marketing[, c("prospectID","duration", "response", "Predicted_response", "Predicted_probability")] | |
#Given in the problem statement that: Cost per call (INR) = 0.033*(duration_in_seconds) + 0.8 | |
# lets now do that | |
test_bank_marketing$call_cost <- 0.033 * bank_marketing$duration + 0.8 | |
test_bank_marketing <- test_bank_marketing[order(test_bank_marketing$Predicted_probability, decreasing = T), ] | |
#Lift Function | |
Func_Lift <- function(labels, Predicted_probability, call_cost, groups=10) { | |
if(is.factor(labels)) labels <- as.integer(as.character(labels )) | |
if(is.factor(Predicted_probability)) Predicted_probability <- as.integer(as.character(Predicted_probability)) | |
helper = data.frame(cbind(labels , Predicted_probability)) | |
helper[,"bucket"] = ntile(-helper[,"Predicted_probability"], groups) | |
gaintable = helper %>% group_by(bucket) %>% | |
summarise_at(vars(labels ), funs(total = n(), | |
totalresp=sum(., na.rm = TRUE))) %>% | |
mutate(Cumresp = cumsum(totalresp), | |
Gain=Cumresp/sum(totalresp)*100, | |
Cumlift=Gain/(bucket*(100/groups))) | |
return(gaintable) | |
} | |
# Getting cumulative gain and lift | |
test_bank_marketing$response <- as.factor(ifelse(test_bank_marketing$response=="yes",1,0)) | |
liftandgain = Func_Lift(test_bank_marketing$response, test_bank_marketing$Predicted_probability, groups = 10) | |
test_bank_marketing$number <- 1:nrow(test_bank_marketing) | |
test_bank_marketing <- test_bank_marketing%>% mutate(bucket = ntile(number, 10)) | |
Call_agg <- aggregate(call_cost~bucket,test_bank_marketing,sum) | |
liftandgain <- merge(liftandgain,Call_agg,by="bucket") | |
liftandgain$cum_cost <- cumsum(liftandgain$call_cost) | |
liftandgain$cost_percent <- (liftandgain$call_cost/sum(liftandgain$call_cost))*100 | |
liftandgain <- merge(liftandgain,aggregate(duration~bucket,test_bank_marketing,mean),by="bucket") | |
names(liftandgain)[10] <- "Avgerage_duration" | |
## Then we get the Average cost per call | |
liftandgain$Avg_call_cost <- liftandgain$call_cost/liftandgain$total | |
# Now, we can plot the gain Chart to see the gain obtained | |
# Tells us the number of responders captured (y-axis) as a function of the number of prospects targeted (x-axis) | |
plot(liftandgain$bucket,liftandgain$Gain,col="blue",type="l",main="Gain Chart",xlab="Target Prospect (in percentage)",ylab = "Positive Response (in percentage)") | |
# Next, we plot the lift Chart | |
# Compares the response rate with and without using the model. | |
plot(liftandgain$bucket,liftandgain$Cumlift,col="blue",type="l",main="Gain Chart",xlab="Target Prospect (in percentage)",ylab = "Lift") | |
# We can conclude that our target is the 80.79% or 81% of the customers from the gain chart. | |
# The Average cost per call for the targeted 81% customers is 9.7745728 | |
# The Avearage duration of xall for the targeted 81% customers is 271.95674 | |
# The total cost for all the 81% targeted customers will come up to be approximately 2 lakhs which is better than the random model. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment