Last active
October 6, 2015 16:40
-
-
Save dobaduc/e4945f82b37856a40954 to your computer and use it in GitHub Desktop.
Logitboost
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Set working directory
# NOTE(review): the data file below is read via an absolute path, so this
# only matters for relative paths (e.g. the commented-out save() at the end).
setwd("/root")

# Load packages
library(caret)     # train(), trainControl(), predict() for the model
library(jsonlite)  # toJSON() for the final output
##Functions | |
#' Split a data frame into random training (70%) and test (30%) subsets
#'
#' @param dataframe A data frame whose rows are to be partitioned.
#' @param seed Optional RNG seed. When not `NULL`, `set.seed(seed)` is called
#'   first so the split is reproducible.
#' @return A list with two data frames: `trainset` holds
#'   `trunc(0.7 * nrow(dataframe))` randomly chosen rows (in sampled order)
#'   and `testset` holds all remaining rows (in their original order).
splitdf <- function(dataframe, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)

  n_rows <- nrow(dataframe)

  # For any non-empty input sample.int(n, size) draws exactly the indices
  # that sample(1:n, size) would, and it also copes with n == 0, where
  # 1:n would wrongly produce c(1, 0).
  train_rows <- sample.int(n_rows, trunc(n_rows * 0.7))

  # Pick the test rows with a logical mask instead of negative indexing:
  # dataframe[-integer(0), ] selects *no* rows, so an empty training sample
  # (e.g. a one-row input) would otherwise silently lose every row.
  is_test <- !(seq_len(n_rows) %in% train_rows)

  # drop = FALSE keeps single-column inputs as data frames.
  list(
    trainset = dataframe[train_rows, , drop = FALSE],
    testset = dataframe[is_test, , drop = FALSE]
  )
}
# Load CSV file
data.raw <- read.csv("/root/cleaned_data3.csv")

## Data cleaning
# Report the number of missing values per column. The data was cleaned
# manually beforehand, so every count is expected to be zero.
colSums(is.na(data.raw))

# Recode the binary target variable into "Yes"/"No" labels.
# The first assignment turns a numeric column into character, which is why
# the second comparison is made against the string "0".
data.raw$Chance[data.raw$Chance == 1] <- "Yes"
data.raw$Chance[data.raw$Chance == "0"] <- "No"

## Convert binary features into factors
# Extract variable names
data.names <- names(data.raw)

# Select variables to convert into factors (column 5 and columns 10 to 19)
data.cat <- data.names[c(5, 10:19)]

# Transform all selected variables into factors in one vectorised assignment
data.raw[data.cat] <- lapply(data.raw[data.cat], as.factor)

# Remove duplicate rows
# NOTE(review): duplicates are removed before column 19 is dropped below, so
# rows that differ only in that column are both kept - confirm this is
# intended.
data.unique <- unique(data.raw)

# Remove unused "fit" variable by keeping only the first 18 columns
data.unique <- data.unique[, 1:18]
# Create training and test sets; the fixed seed (1985) makes the random
# 70/30 split reproducible
data.split <- splitdf(data.unique, seed = 1985)

# Create separate train and test datasets
data.train <- data.split$trainset  # used to tune the algorithm coefficients
data.test <- data.split$testset    # used to test algorithm accuracy

# Set "Control Settings": 10-fold cross-validation, repeated 10 times
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)

# Train the LogitBoost classification algorithm on the training dataset,
# using every other column as a predictor of Chance
data.glm <- train(
  Chance ~ .,
  data = data.train,
  method = "LogitBoost",
  trControl = fitControl
)
# Run classification algorithm with "raw" response
#data.predict <- predict(data.glm, newdata = data.split$testset, type = "raw")
# Confirm "Accuracy" via Confusion Matrix
#confusionMatrix(data.test[, 18], data.predict)
### 85.7%

# Generate "Chance" predictions for the test dataset with class
# probabilities as output
data.predict <- predict(data.glm, newdata = data.test, type = "prob")

# Print results for only "Yes" probabilities
#print(round(data.predict[, 2], 2))

# Extract the bottom row of the test dataset.
# NOTE(review): the test set is a random subset, so this is the latest
# datapoint of the full data only if that row was not sampled into the
# training set - confirm this is the intended observation.
data.last <- data.test[nrow(data.test), ]

# Predict the last observation and emit the "Yes" probability, rounded to
# two decimals, as JSON. The column is selected by class label rather than
# by position so the output does not depend on the factor level order.
last.prob <- predict(data.glm, newdata = data.last, type = "prob")
jsonString <- toJSON(round(last.prob[, "Yes"], 2))
print(jsonString)

# Create package
#save(data.glm, file = "data_glm.rda")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment