########################################### I. BUSINESS UNDERSTANDING ####################################################
# Goal - To predict the probability of response of each prospect
# and target the ones most likely to respond to the next telemarketing campaign.
# Loading all the necessary libraries
library(caret)
library(caTools)
library(ggplot2)
library(MASS)
library(car)
library(dplyr)
library(dummies) # note: 'dummies' may need to be installed from the CRAN archive
# Loading the data into a data frame called bank_marketing
bank_marketing<- read.csv("bank_marketing.csv")
########################################### II. DATA UNDERSTANDING ####################################################
# Checking the structure of the given data
str(bank_marketing)
# Summary
summary(bank_marketing)
# Calculating the response rate
response_rate <- 4640/(36548+4640)
# The response rate is found to be approximately 11.27%
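# As a sanity check, the same rate can be computed directly from the data
# (a sketch, assuming response is still the original yes/no variable at this point):
prop.table(table(bank_marketing$response))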
# Checking missing values
sum(is.na(bank_marketing))
# There are no NA values; missing values have instead been recorded as "unknown"
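# Since missing values are coded as "unknown", counting them per column gives a better picture:
sapply(bank_marketing, function(x) sum(x == "unknown"))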
##### For Age Variable
# Plotting of age histogram
ggplot(bank_marketing,aes(age)) + geom_histogram()
# Age is spread from 17 to 98 years. This clearly needs outlier treatment.
# Checking the outlier in age
quantile(bank_marketing$age, seq(0,1,0.01))
# We see a jump at the 99th percentile, where age is 71. A box plot will give more insight into the outliers.
# Using box plot to check outliers in age
boxplot(bank_marketing$age)
# Clearly, values above 71 can be treated as outliers.
# So, let's cap the upper age limit at 71
bank_marketing[(which(bank_marketing$age>71)),]$age <- 71
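# Verifying the cap took effect (the maximum age should now be 71):
max(bank_marketing$age)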
#Binning the age variable will help ease the analysis and give better insights
bank_marketing$age_binned <- as.factor(cut(bank_marketing$age, breaks = c(16, 20, 30, 40, 50, 60, 70, 80)))
#Changing the response value to numbers
bank_marketing$response <- ifelse(bank_marketing$response == "yes", 1, 0)
# Check the numeric value of response rate in each bucket
aggregate_age <- merge(aggregate(response ~ age_binned, bank_marketing, mean),aggregate(response~age_binned, bank_marketing, sum),by = "age_binned")
# Adding No_of_prospect
count <- data.frame(table(bank_marketing$age_binned))
count <- count[,-1]
aggregate_age <- cbind(aggregate_age,count)
# Renaming the columns of the aggregate_age data frame
colnames(aggregate_age) <- c("age", "response_rate", "count_prospects","No_of_prospect")
# Round Off the values
aggregate_age$response_rate <- format(round(aggregate_age$response_rate, 2))
aggregate_age
# Plotting graph for response rate of each age bucket
ggplot(aggregate_age, aes(age, No_of_prospect,label = response_rate)) +
geom_bar(stat = 'identity') + theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
geom_text(size = 3, vjust = -0.5)
# We see that most of the data set lies in the age range 20 to 60.
# Checking dataset of age less than 20 years.
bank_marketing_age20 <- subset(bank_marketing,age <20)
summary(bank_marketing_age20)
#### Writing a function to do the same task for each variable
plot_create <- function(category_variable, variable_name){
  a <- aggregate(response ~ category_variable, bank_marketing, mean)
  count <- data.frame(table(category_variable))
  count <- count[, -1]
  aggregate_response <- cbind(a, count)
  colnames(aggregate_response) <- c(variable_name, "response_rate", "No_of_Prospect")
  aggregate_response[, 2] <- format(round(aggregate_response[, 2], 2))
  # Plot against the renamed No_of_Prospect column (the original referenced the
  # local 'count' variable, which only worked by accident)
  ggplot(aggregate_response, aes(aggregate_response[, 1], No_of_Prospect, label = response_rate)) +
    geom_bar(stat = 'identity') +
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    geom_text(size = 3, vjust = -0.5) +
    xlab(variable_name)
}
##### For Job Variable
# Levels of job
levels(bank_marketing$job)
#Summary job
summary(bank_marketing$job)
# Plotting for job variable.
plot_create(bank_marketing$job, "job")
# Admin and blue-collar jobs are the most numerous. However, the response rate is highest
# among retired people and students
##### For Marital Status Variable
# Levels of Marital Status
levels(bank_marketing$marital)
# Summary of Marital Status
summary(bank_marketing$marital)
# Recoding the "unknown" level as "married"
levels(bank_marketing$marital)[4] <- "married"
# Plotting marital status
plot_create(bank_marketing$marital,"Marital Status")
#Clearly, no impact on the response rate.
##### For Education Variable
# Levels of education
levels(bank_marketing$education)
# Summary of Education
summary(bank_marketing$education)
# Reducing the levels of education variable
levels(bank_marketing$education)[c(1:3,5)] <- "Primary_Education"
levels(bank_marketing$education)[2] <- "Secondary_Education"
levels(bank_marketing$education)[4]<- "Tertiary_Education"
# Plotting Education
plot_create(bank_marketing$education,"Education")
#Clearly, no impact on the response rate.
##### For default variable
# Levels of default
levels(bank_marketing$default)
# Summary of default
summary(bank_marketing$default)
# Plotting default
plot_create(bank_marketing$default, "Default")
# Clearly, no impact on the response rate, so dropping the default variable (column 5)
bank_marketing <- bank_marketing[,-5]
##### For housing variable
# Levels of housing
levels(bank_marketing$housing)
# Summary of housing
summary(bank_marketing$housing)
#Plotting housing
plot_create(bank_marketing$housing, "Housing")
# Clearly, not much of an impact on the response.
##### For loan variable
# Levels of loan
levels(bank_marketing$loan)
# Summary of loan
summary(bank_marketing$loan)
#Plotting loan
plot_create(bank_marketing$loan, "Loan Status")
# Clearly, not much of an impact on the response.
##### For contact variable
# Levels of contact
levels(bank_marketing$contact)
# Summary of contact
summary(bank_marketing$contact)
#Plotting contact
plot_create(bank_marketing$contact, "Contact Mode")
# Cellular contact has more impact on the response than telephone does.
##### For month variable
# Levels of month
levels(bank_marketing$month)
# Summary of month
summary(bank_marketing$month)
#Plotting month
plot_create(bank_marketing$month, "Contact Month")
# The contact months seem to have an impact on the response.
##### For day_of_week variable
# Levels of day_of_week
levels(bank_marketing$day_of_week)
# Summary of day_of_week
summary(bank_marketing$day_of_week)
#Plotting day_of_week
plot_create(bank_marketing$day_of_week, "Day Of Week")
# The day of the week has no impact on response.
##### For duration variable
# Duration is a continuous (numeric) variable.
# Summary of duration
summary(bank_marketing$duration)
# Plotting histogram
ggplot(bank_marketing,aes(duration))+geom_histogram()
# Average duration
bank_marketing$response_1 <- as.factor(bank_marketing$response)
Avg_duration <- aggregate(duration~response_1,bank_marketing,mean)
# Dropping the helper columns (age_binned and response_1)
bank_marketing <- bank_marketing[,-(21:22)]
##### For campaign variable
#summary
summary(bank_marketing$campaign)
# Box plot to check outliers
boxplot(bank_marketing$campaign)
#quantile
quantile(bank_marketing$campaign,seq(0,1,0.01))
# We see a huge jump at the 99th percentile, whose value is 14.
# So, let's cap campaign at 14
bank_marketing[which(bank_marketing$campaign>14),]$campaign <- 14
# Plotting campaign
ggplot(bank_marketing,aes(campaign))+geom_histogram()
##### For pdays variable
#convert pdays to factor type
bank_marketing$pdays<- as.factor(bank_marketing$pdays)
# check levels
levels(bank_marketing$pdays)
# summary
summary(bank_marketing$pdays)
# Reducing the levels (the remaining levels re-index after each collapse).
levels(bank_marketing$pdays)[1:10] <- "Contacted_in_first_10days"
levels(bank_marketing$pdays)[2:17] <-"Contacted_after_10days"
levels(bank_marketing$pdays)[3] <- "First_time_contacted"
# Plotting pdays
plot_create(bank_marketing$pdays, "Pdays")
# Contacted_in_first_10days and Contacted_after_10days have a greater response rate
# Summary of each category
summary(bank_marketing$pdays)
##### For previous variable
#summary of previous
summary(bank_marketing$previous)
# The maximum is 7, so it is best to convert this variable to a factor
#to factor
bank_marketing$previous <- as.factor(bank_marketing$previous)
#levels of previous
levels(bank_marketing$previous)
#Reducing levels
levels(bank_marketing$previous)[1]<-"Never contacted"
levels(bank_marketing$previous)[2:4] <- "Less_than_3_times"
levels(bank_marketing$previous)[3:6] <- "More than_3_times"
#Summary
summary(bank_marketing$previous)
#Plotting Previous
plot_create(bank_marketing$previous,"Previous Contacts")
# More than_3_times seems to have a greater impact on response rates.
##### For Poutcome variable
# Levels of poutcome
levels(bank_marketing$poutcome)
# Summary of poutcome
summary(bank_marketing$poutcome)
#Plotting poutcome
plot_create(bank_marketing$poutcome, "Outcome_of_Previous_contacts")
# This seems to have an impact on response.
##### For emp.var.rate variable
#summary
summary(bank_marketing$emp.var.rate)
# Plotting histogram
ggplot(bank_marketing,aes(emp.var.rate))+geom_histogram()
##### For cons.price.idx variable
#summary
summary(bank_marketing$cons.price.idx)
# Plotting histogram
ggplot(bank_marketing,aes(cons.price.idx))+geom_histogram()
##### For cons.conf.idx variable
#summary
summary(bank_marketing$cons.conf.idx)
##### For euribor3m variable
#summary
summary(bank_marketing$euribor3m)
##### For nr.employed
#summary
summary(bank_marketing$nr.employed)
# Adding a prospect ID based on the row names
cust_num <- rownames(bank_marketing)
bank_marketing <- cbind(cust_num=cust_num, bank_marketing)
colnames(bank_marketing)[1] <- "prospectID"
#Creating two new data frames
bank_marketing_1 <- bank_marketing
bank_marketing_2 <- bank_marketing
########################################### III. MODEL BUILDING ####################################################
# We will be using Logistic Regression to build the model.
# Let's start with the creation of dummy variables
bank_marketing$response <- as.integer(bank_marketing$response)
bank_marketing <- cbind(bank_marketing[,1],dummy.data.frame(bank_marketing[,-1]))
colnames(bank_marketing)[1] <- "prospectID"
bank_marketing$response <- as.factor(ifelse(bank_marketing$response == 1, "yes", "no"))
# splitting into train and test data
set.seed(1)
split_indices <- sample.split(bank_marketing$response, SplitRatio = 0.70)
train <- bank_marketing[split_indices, ]
test <- bank_marketing[!split_indices, ]
nrow(train)/nrow(bank_marketing)
nrow(test)/nrow(bank_marketing)
# As per the requirement mentioned in the problem statement, let's remove the duration variable (column 46)
test <- test[,-46]
train <- train[,-46]
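# A more robust alternative (a sketch): drop the duration column by name
# rather than by its position, in case the column order changes
# train <- train[, colnames(train) != "duration"]
# test <- test[, colnames(test) != "duration"]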
# Next, let's build Model 1 with all variables
Model_1 <- glm(response ~ ., family = "binomial", data = train[,-1])
summary(Model_1)
# Using stepwise selection (stepAIC) to remove insignificant variables
Model_2 <- stepAIC(Model_1, direction = "both")
summary(Model_2)
# stepAIC has removed some variables and only the following ones remain
Model_3 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + monthnov + monthoct + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + emp.var.rate + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
# Let's now check the VIFs for Model_3
vif(Model_3)
# The emp.var.rate variable has a very high VIF of 139.73
summary(Model_3)
# Let's remove the emp.var.rate variable and rebuild the model.
# Rebuilding Model_3 without emp.var.rate
Model_3 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + monthnov + monthoct + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_3)
summary(Model_3)
# monthmay has VIF 2.96, monthjul has VIF 2.04, and monthjun has VIF 2.01, but all three are significant.
# So let's instead remove monthnov, which has VIF 1.50 but is not significant, and build the next model.
#Model 4
Model_4 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + monthoct + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_4)
summary(Model_4)
# monthoct has VIF 1.23 and is not significant. So let's remove it.
#Building Model 5 by removing monthoct
Model_5 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education + educationTertiary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_5)
summary(Model_5)
# educationTertiary_Education has high VIF and low significance. Let's remove it.
#Model 6
Model_6 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekfri + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_6)
summary(Model_6)
# day_of_weekfri has high VIF and no significance. Let's remove it.
#Model 7
Model_7 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.price.idx + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
vif(Model_7)
summary(Model_7)
# cons.price.idx has high VIF and no significance. Let's remove it.
#Model 8
Model_8 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed + `previousMore than_3_times`,
family = "binomial", data = train[,-1])
summary(Model_8)
# previousMore than_3_times has no significance. Let's remove it.
#Model 9
Model_9 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
maritaldivorced + educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_9)
# maritaldivorced has no significance. Let's remove it.
#Model 10
Model_10 <- glm(formula = response ~ jobadmin. + jobretired + jobstudent + jobtechnician +
educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_10)
# jobadmin. has no significance. Let's remove it.
#Model 11
Model_11 <- glm(formula = response ~ jobretired + jobstudent + jobtechnician +
educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_11)
# jobtechnician has no significance. Let's remove it.
#Model 12
Model_12 <- glm(formula = response ~ jobretired + jobstudent +
educationPrimary_Education +
contactcellular + monthapr + monthjul + monthjun + monthmar +
monthmay + day_of_weekmon +
campaign + pdaysContacted_in_first_10days + pdaysContacted_after_10days +
poutcomefailure + cons.conf.idx +
nr.employed,
family = "binomial", data = train[,-1])
summary(Model_12)
# In Model_12, all the remaining variables are significant, so Model_12 is the final model.
########################################### IV. MODEL EVALUATION ####################################################
# Predicting on the test data
predictions_logit <- predict(Model_12, newdata = test[, -c(1,61)], type = "response")
summary(predictions_logit)
# First, we try a probability cutoff of 50%.
predicted_response <- factor(ifelse(predictions_logit >= 0.50, "yes", "no"))
# Creating a confusion matrix to evaluate the model.
conf <- confusionMatrix(predicted_response, test$response, positive = "yes")
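# Pulling the reported metrics out of the confusionMatrix object:
conf$overall["Accuracy"]
conf$byClass[c("Sensitivity", "Specificity")]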
# Accuracy - 89.96%
# Sensitivity - 23.42%
# Specificity - 98.41%
# Sensitivity is very low; we would like to improve it. So, let's find the optimal probability cutoff.
# Writing a function for the optimal probability cutoff
probability_cutoff_func <- function(cutoff)
{
  # Fixing both factor levels so confusionMatrix works even if one class is absent at an extreme cutoff
  predicted_response <- factor(ifelse(predictions_logit >= cutoff, "yes", "no"), levels = c("no", "yes"))
  conf <- confusionMatrix(predicted_response, test$response, positive = "yes")
  acc <- conf$overall[1]
  sens <- conf$byClass[1]
  spec <- conf$byClass[2]
  out <- t(as.matrix(c(sens, spec, acc)))
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}
# Creating 100 cutoff values from 0.01 to 0.93 for plotting and initializing a 100 x 3 matrix.
s <- seq(.01, .93, length = 100)
OUT <- matrix(0, 100, 3)
for(i in 1:100)
{
  OUT[i,] <- probability_cutoff_func(s[i])
}
# plotting cutoffs
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",cex.lab=1.5,cex.axis=1.5,ylim=c(0,1),type="l",lwd=2,axes=FALSE,col=2)
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col="darkgreen",lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend(0,.50,col=c(2,"darkgreen",4),lwd=c(2,2,2),c("Sensitivity","Specificity","Accuracy"))
cutoff <- s[which(abs(OUT[,1]-OUT[,2])<0.03)]
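# Inspecting the candidate cutoff(s) where sensitivity and specificity differ by less than 0.03:
cutoff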
# From the above plot, we found a cutoff value of approximately 0.078 (7.8%) for our final model
predicted_response <- factor(ifelse(predictions_logit >= 0.078, "yes", "no"))
conf_final <- confusionMatrix(predicted_response, test$response, positive = "yes")
accuracy1 <- conf_final$overall[1]
sensitivity1 <- conf_final$byClass[1]
specificity1 <- conf_final$byClass[2]
# Accuracy - 73.74%
# Sensitivity - 70.11%
# Specificity - 74.20%
################### Lift and Gain Charts
prediction_1 <- predict(Model_12, newdata = bank_marketing[, -c(1,46,62)], type = "response")
summary(prediction_1)
prediction_response <- factor(ifelse(prediction_1 >= 0.078, "yes", "no"))
bank_marketing$Predicted_response <- prediction_response
bank_marketing$Predicted_probability <- prediction_1
# Creating a new data frame as per the given problem.
test_bank_marketing <- bank_marketing[, c("prospectID","duration", "response", "Predicted_response", "Predicted_probability")]
# Given in the problem statement: cost per call (INR) = 0.033*(duration_in_seconds) + 0.8
# Let's compute that now
test_bank_marketing$call_cost <- 0.033 * bank_marketing$duration + 0.8
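# A quick look at the distribution of per-call costs:
summary(test_bank_marketing$call_cost)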
test_bank_marketing <- test_bank_marketing[order(test_bank_marketing$Predicted_probability, decreasing = T), ]
# Lift function (the unused call_cost argument has been dropped, and the
# deprecated summarise_at/funs idiom replaced with a plain summarise)
Func_Lift <- function(labels, Predicted_probability, groups = 10) {
  if(is.factor(labels)) labels <- as.integer(as.character(labels))
  if(is.factor(Predicted_probability)) Predicted_probability <- as.integer(as.character(Predicted_probability))
  helper <- data.frame(cbind(labels, Predicted_probability))
  helper[,"bucket"] <- ntile(-helper[,"Predicted_probability"], groups)
  gaintable <- helper %>% group_by(bucket) %>%
    summarise(total = n(),
              totalresp = sum(labels, na.rm = TRUE)) %>%
    mutate(Cumresp = cumsum(totalresp),
           Gain = Cumresp/sum(totalresp)*100,
           Cumlift = Gain/(bucket*(100/groups)))
  return(gaintable)
}
# Getting cumulative gain and lift
test_bank_marketing$response <- as.factor(ifelse(test_bank_marketing$response=="yes",1,0))
liftandgain = Func_Lift(test_bank_marketing$response, test_bank_marketing$Predicted_probability, groups = 10)
test_bank_marketing$number <- 1:nrow(test_bank_marketing)
test_bank_marketing <- test_bank_marketing%>% mutate(bucket = ntile(number, 10))
Call_agg <- aggregate(call_cost~bucket,test_bank_marketing,sum)
liftandgain <- merge(liftandgain,Call_agg,by="bucket")
liftandgain$cum_cost <- cumsum(liftandgain$call_cost)
liftandgain$cost_percent <- (liftandgain$call_cost/sum(liftandgain$call_cost))*100
liftandgain <- merge(liftandgain,aggregate(duration~bucket,test_bank_marketing,mean),by="bucket")
names(liftandgain)[10] <- "Average_duration"
## Then we get the Average cost per call
liftandgain$Avg_call_cost <- liftandgain$call_cost/liftandgain$total
# Now, we can plot the gain chart
# It shows the percentage of responders captured (y-axis) against the prospect deciles targeted (x-axis)
plot(liftandgain$bucket,liftandgain$Gain,col="blue",type="l",main="Gain Chart",xlab="Prospects Targeted (decile)",ylab = "Positive Responses Captured (%)")
# Next, we plot the lift chart
# It compares the response rate with the model against the response rate without it
plot(liftandgain$bucket,liftandgain$Cumlift,col="blue",type="l",main="Lift Chart",xlab="Prospects Targeted (decile)",ylab = "Lift")
# From the gain chart, we conclude that our targeting captures about 80.79% (~81%) of the responders.
# The average cost per call for the targeted customers is about INR 9.77.
# The average duration of a call for the targeted customers is about 271.96 seconds.
# The total cost for all the targeted customers comes to approximately INR 2 lakh, which is better than a random model.