Skip to content

Instantly share code, notes, and snippets.

View janebunr's full-sized avatar

Jane Bunrerngsanoh janebunr

View GitHub Profile
# assigning weight on service percentage.
score <- c()
for (j in rep(1:141, each=1)){
for (i in rep(1:20)) {
weight <- (i/20)*SERVICE[j,i] #Higher weight on recent years and lower weight on older years
score <- rbind(score,weight)
}}
head(score)
# [,1]
# weight 0.02000
GBMfit.predictions <- predict(GBMfit,newdata=SUBHOLDOUT,type="prob")
GBMfit.predictions$CustomerID <- SUBHOLDOUT$CustomerID
GBMfit.predictions$Churn <- GBMfit.predictions$Yes
GBMfit.predictions <- GBMfit.predictions[-c(1:2)]
results <- data.frame(pred=ifelse(GBMfit.predictions$Yes > .5, "Yes", "No"),actual=SUBHOLDOUT$Churn)
confusionMatrix(results$pred, results$actual)
# Confusion Matrix and Statistics
#
# Reference
varImp(GBMfit)
# only 20 most important variables shown (out of 112)
#
# Overall
# CurrentEquipmentDays 100.000
# MonthsInService 69.802
# MonthlyMinutes 61.336
# PercChangeMinutes 54.557
# PercChangeRevenues 24.755
# TotalRecurringCharge 22.134
MASTERRESULTS <- rbind(RPARTfit$results[,2:7],
FORESTfit.new$results[,2:7],
GBMfit$results[,5:10])
MASTERRESULTS$method <- c(rep("rpart",nrow(RPARTfit$results)),
rep("randomforest",nrow(FORESTfit.new$results)),
rep("gbm",nrow(GBMfit$results)))
MASTERRESULTS$param <- c(RPARTfit$results$cp,
FORESTfit.new$results$mtry,
gbmGrid <- expand.grid(n.trees=c(500,1000,2000),
interaction.depth=2:4,
shrinkage=c(.005,.01),
n.minobsinnode=c(5,10))
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)
fitControl <- trainControl(method="cv",number=5, verboseIter = TRUE,
summaryFunction = twoClassSummary, classProbs = TRUE, allowParallel = TRUE)
set.seed(474); GBMfit <- train(Churn~.,data=SUBTRAIN,method="gbm",metric="ROC",trControl=fitControl,tuneGrid=gbmGrid,verbose=FALSE)
stopCluster(cluster)
forestGrid <- expand.grid(mtry=c(1,2,3,7))
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)
fitControl <- trainControl(method="cv",number=5, verboseIter = TRUE,
summaryFunction = twoClassSummary, classProbs = TRUE, allowParallel = TRUE)
set.seed(seed); FORESTfit.new <- train(Churn~.,data=SUBTRAIN,method="rf",metric="ROC",trControl=fitControl,tuneGrid=forestGrid)
stopCluster(cluster)
registerDoSEQ()
FORESTfit.new$results[rownames(FORESTfit.new$bestTune),]
BEST <- list()
seed <- 111
set.seed(seed)
####################################
# Partition
####################################
rpartGrid <- expand.grid(cp=10^seq(from=-4,to=-1,length=30))
#train function
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
#######################################
# Handling rare levels#################
#######################################
service <- as.character(CELL$ServiceArea)
head(service) # [1] "SEAPOR503" "PITHOM412" "MILMIL414" "PITHOM412" "OKCTUL918" "OKCTUL918"
## Extracting area code
area.code <- substr(service,start=7,stop=9)
area.code <- combine_infrequent_levels(area.code,threshold=20)$values
area.code <- factor( area.code )
# Process data together
CELL <- rbind(TRAIN,HOLDOUT)
CELL$MissingMonthlyRev <- factor( ifelse(is.na(CELL$MonthlyRevenue),"Yes","No") )
CELL$MissingPercentMin <- factor( ifelse(is.na(CELL$PercChangeMinutes),"Yes","No") )
CELL$MissingMonthlyMin <- factor( ifelse(is.na(CELL$MonthlyMinutes),"Yes","No") )
CELL$MissingTotalRecur <- factor( ifelse(is.na(CELL$TotalRecurringCharge),"Yes","No") )
CELL$MissingDirecAssis <- factor( ifelse(is.na(CELL$DirectorAssistedCalls),"Yes","No") )
CELL$MissingOverageMin <- factor( ifelse(is.na(CELL$OverageMinutes),"Yes","No") )
CELL$MissingRoamingCalls <- factor( ifelse(is.na(CELL$RoamingCalls),"Yes","No") )
CELL$MissingPercChangeRev <- factor( ifelse(is.na(CELL$PercChangeRevenues),"Yes","No") )
# importing data
TRAIN <- read.csv("cell2celltrain.csv")
HOLDOUT <- read.csv("cell2cellholdout.csv")
SS <- read.csv("cell2cellsamplesubmission.csv")
###################################################
#Check extent of missingness if there's any pattern
###################################################
MISSING <- data.frame( Column=names(TRAIN),
MissingInTRAIN=as.numeric(unlist(lapply(TRAIN,function(x)sum(is.na(x))))),