Created
February 4, 2022 18:18
-
-
Save ivopbernardo/a1e3a04676c6d91dcd7f46d6661bbe54 to your computer and use it in GitHub Desktop.
Random Forests vs. Decision Trees
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Titanic survival demo: Random Forests vs. Decision Trees.
# Requires ./train.csv, downloadable from:
# https://www.kaggle.com/c/titanic/data?select=train.csv
# The rpart and ROCR packages must also be installed.

# Read the Kaggle Titanic training data.
titanic <- read.csv("./train.csv")

# Hold out 30% of rows for testing; train on the remaining 70%.
set.seed(999)
n_rows <- nrow(titanic)
train_size <- ceiling(0.7 * n_rows)
train_index <- sample(seq_len(n_rows), size = train_size)

train_df <- titanic[train_index, ]
test_df <- titanic[-train_index, ]

# rpart provides the recursive-partitioning decision trees used below.
library(rpart)
# Decision tree 1 ("oak"): shallow tree (maxdepth 2, conservative splits)
# fit on a random subsample of 600 training rows.
# FIX: the original called dplyr::sample_n() without loading dplyr, which
# fails with "could not find function sample_n"; base-R row sampling is
# used instead so the script depends only on rpart.
set.seed(9990)
oak_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = train_df[sample(seq_len(nrow(train_df)), 600), ],
  method = "class",
  control = list(maxdepth = 2, minsplit = 30)
)
# Decision tree 2 ("pine"): deeper tree (maxdepth 3, aggressive splits)
# fit on a random subsample of 600 training rows.
# FIX: the original called dplyr::sample_n() without loading dplyr, which
# fails with "could not find function sample_n"; base-R row sampling is
# used instead so the script depends only on rpart.
set.seed(9991)
pine_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = train_df[sample(seq_len(nrow(train_df)), 600), ],
  method = "class",
  control = list(maxdepth = 3, minsplit = 3, minbucket = 4, cp = 0.01)
)
# Decision tree 3 ("elm"): shallow tree (maxdepth 2) with permissive split
# settings, fit on a random subsample of 600 training rows.
# FIX: the original called dplyr::sample_n() without loading dplyr, which
# fails with "could not find function sample_n"; base-R row sampling is
# used instead so the script depends only on rpart.
set.seed(9992)
elm_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = train_df[sample(seq_len(nrow(train_df)), 600), ],
  method = "class",
  control = list(maxdepth = 2, minsplit = 2, minbucket = 4, cp = 0.01)
)
# ROCR provides prediction()/performance() for ROC/AUC analysis.
library(ROCR)

# Compute the test-set AUC of a fitted rpart classification model.
#
# model:   a fitted rpart model trained with method = "class".
# newdata: data frame holding the predictors plus a Survived column;
#          defaults to the global test_df for backward compatibility,
#          but can now be any evaluation set.
# Returns the area under the ROC curve as a single numeric value.
obtainauc <- function(model, newdata = test_df) {
  # Column 2 of the class-probability matrix is P(Survived = 1).
  survival_prob <- predict(model, newdata)[, 2]
  pred_obj <- prediction(survival_prob, newdata$Survived)
  auc_perf <- performance(pred_obj, measure = "auc")
  auc_perf@y.values[[1]]
}
# Simple ensemble: average the three trees' predicted P(Survived = 1)
# on the held-out test set.
ensemble <- (
  predict(oak_tree, test_df)[, 2] +
  predict(pine_tree, test_df)[, 2] +
  predict(elm_tree, test_df)[, 2]
) / 3

# Ensemble AUC on the test set.
# FIX: the original assigned the ROCR prediction object to a variable
# named `prediction`, masking ROCR::prediction(); a distinct name avoids
# shadowing the function.
ensemble_pred <- prediction(ensemble, test_df$Survived)
perf <- performance(ensemble_pred, measure = "auc")
performance_ensemble <- perf@y.values[[1]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment