Skip to content

Instantly share code, notes, and snippets.

@dobaduc
Last active October 6, 2015 16:40
Show Gist options
  • Save dobaduc/e4945f82b37856a40954 to your computer and use it in GitHub Desktop.
Logitboost
# Setup ----
# NOTE(review): setwd() in scripts is fragile and the file paths used below
# are absolute anyway — consider removing; kept here to preserve behavior.
setwd("/root") #Set working directory
# Load packages
library(caret)    # train(), trainControl(), confusionMatrix()
library(jsonlite) # toJSON() for emitting the final prediction
## Functions ----
# Split a data frame into random training and test subsets.
#
# dataframe:  the data.frame to split.
# seed:       optional integer; when supplied, set.seed(seed) makes the
#             split reproducible.
# train_frac: fraction of rows assigned to the training set (default 0.7,
#             matching the original hard-coded behavior).
#
# Returns a list with elements `trainset` and `testset`.
splitdf <- function(dataframe, seed = NULL, train_frac = 0.7) {
  if (!is.null(seed)) set.seed(seed)
  # seq_len() is safe for zero-row input, unlike 1:nrow() which yields c(1, 0)
  index <- seq_len(nrow(dataframe))
  trainindex <- sample(index, trunc(length(index) * train_frac))
  trainset <- dataframe[trainindex, ]
  testset <- dataframe[-trainindex, ]
  list(trainset = trainset, testset = testset)
}
# Load the manually pre-cleaned CSV file
data.raw <- read.csv("/root/cleaned_data3.csv")
## Data cleaning ----
# Count missing values per column; expected to be all zeros since the
# data was cleaned by hand beforehand
apply(data.raw, 2, function(x) sum(is.na(x))) #Should have no errors since I manually cleaned the data
# Recode the binary target 0/1 into "Yes"/"No" labels for classification.
# Order matters: the first assignment coerces the column to character,
# so the second comparison matches the remaining zeros as the string "0".
data.raw$Chance[data.raw$Chance == 1] <- "Yes"
data.raw$Chance[data.raw$Chance == "0"] <- "No"
## Convert binary features into factors ----
# Extract column names from the cleaned data
data.names <- names(data.raw)
# Columns 5 and 10-19 hold binary indicator variables to treat as factors
data.cat <- data.names[c(5, 10:19)]
# Convert the selected columns in one vectorized pass; lapply() over the
# column subset replaces the fragile `for (i in 1:length(data.cat))` loop,
# which misbehaves when data.cat is empty (1:0 yields c(1, 0)).
data.raw[data.cat] <- lapply(data.raw[data.cat], as.factor)
# Remove duplicate rows
data.unique <- unique(data.raw)
# Drop the unused "fit" variable by keeping only the first 18 columns
data.unique <- data.unique[, 1:18]
# Modeling ----
# Create training and test sets with the RNG seeded at 1985 for reproducibility
data.split <- splitdf(data.unique, seed = 1985)
# Separate the two partitions
data.train <- data.split$trainset #used to tune the algorithm coefficients
data.test <- data.split$testset #used to test algorithm accuracy
# Resampling scheme: 10-fold cross-validation repeated 10 times
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
# Fit a LogitBoost classifier via caret on the training partition
data.glm <- train(Chance ~ ., data = data.train, method = "LogitBoost", trControl = fitControl)
# Optional: hard class-label ("raw") predictions on the test set
#data.predict <- predict(data.glm, newdata = data.split$testset, type = "raw")
# Optional: accuracy check via confusion matrix
# NOTE(review): assumes column 18 of data.test is the Chance target — confirm
#confusionMatrix(data.test[, 18], data.predict)
### 85.7% accuracy reported from a previous run
# Predict class probabilities for the test set
data.predict <- predict(data.glm, newdata = data.split$testset, type = "prob")
# Optional: print only the "Yes" probabilities (second probability column)
#print(round(data.predict[, 2], 2))
# Take the most recent observation (bottom row of the test set)
data.last <- data.split$testset[nrow(data.split$testset), ]
# Emit the rounded "Yes" probability for that observation as JSON
jsonString <- toJSON(round(predict(data.glm, newdata = data.last, type = "prob")[, 2], 2))
print(jsonString)
# Optional: persist the fitted model for reuse
#save(data.glm, file = "data_glm.rda")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment