Skip to content

Instantly share code, notes, and snippets.

@ivopbernardo
Created February 4, 2022 18:18
Show Gist options
  • Save ivopbernardo/a1e3a04676c6d91dcd7f46d6661bbe54 to your computer and use it in GitHub Desktop.
Save ivopbernardo/a1e3a04676c6d91dcd7f46d6661bbe54 to your computer and use it in GitHub Desktop.
Random Forests vs. Decision Trees
# Don't forget to download the train.csv file
# to make this gist work.
# Download it at: https://www.kaggle.com/c/titanic/data?select=train.csv
# You also need to install ROCR and rpart libraries
# Load the Titanic training file from the current working directory
titanic <- read.csv(file = './train.csv')
# Row count of the training partition: 70% of all rows, rounded up
size <- ceiling(nrow(titanic) * 0.7)
# Fix the RNG state so the split is reproducible, then draw the row
# positions that go to training; every remaining row becomes test data
set.seed(999)
train_index <- sample.int(nrow(titanic), size = size)
train_df <- titanic[train_index, ]
test_df <- titanic[-train_index, ]
# Loading rpart
library(rpart)

# Each tree below is fitted on its own random 600-row subsample of train_df.
# The subsample is drawn with base R indexing (sample.int) instead of
# sample_n(): sample_n() belongs to dplyr, which this script never loads, so
# the original calls failed with "could not find function".
# The seed set before each fit makes that tree's subsample reproducible.
# `control` is given as a plain list so that only the settings named here
# override rpart's defaults.

# Training decision tree 1: at most 2 levels, needs 30 rows to try a split
set.seed(9990)
oak_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = train_df[sample.int(nrow(train_df), 600), ],
  method = 'class',
  control = list(maxdepth = 2,
                 minsplit = 30)
)

# Training Decision Tree 2: at most 3 levels, much looser split thresholds
set.seed(9991)
pine_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = train_df[sample.int(nrow(train_df), 600), ],
  method = 'class',
  control = list(maxdepth = 3,
                 minsplit = 3,
                 minbucket = 4,
                 cp = 0.01)
)

# Training Decision Tree 3: at most 2 levels, loosest split threshold
set.seed(9992)
elm_tree <- rpart(
  Survived ~ Fare + Age + Sex + Pclass,
  data = train_df[sample.int(nrow(train_df), 600), ],
  method = 'class',
  control = list(maxdepth = 2,
                 minsplit = 2,
                 minbucket = 4,
                 cp = 0.01)
)
# Building function for AUC
library(ROCR)

# Compute the area under the ROC curve of a fitted model on held-out data.
#
# Arguments:
#   model   - a fitted model whose predict() output is a matrix; its second
#             column is used as the score (for the rpart class trees in this
#             script that is presumably the probability of Survived == 1).
#   newdata - data frame to score; must contain a `Survived` column with the
#             true labels. Defaults to the script-level `test_df`, so existing
#             calls of the form obtainauc(model) behave as before.
#
# Returns: the first element of the `y.values` slot of the ROCR 'auc'
#          performance object, i.e. the AUC value.
obtainauc <- function(model, newdata = test_df) {
  scores <- predict(model, newdata)[, 2]
  pred <- prediction(scores, newdata$Survived)
  perf <- performance(pred, measure = 'auc')
  perf@y.values[[1]]
}
# Building ensemble: for every test row, average the second column of the
# three trees' predict() output (summed left to right, then divided by 3)
ensemble <- Reduce(
  `+`,
  lapply(
    list(oak_tree, pine_tree, elm_tree),
    function(tree) predict(tree, test_df)[, 2]
  )
) / 3
# Ensemble Performance
# The result is stored under the name `prediction`, the same name as the ROCR
# function; calls still resolve to the function because R skips non-function
# bindings when looking up a name in call position.
prediction <- prediction(predictions = ensemble, labels = test_df$Survived)
perf <- performance(prediction.obj = prediction, measure = 'auc')
performance_ensemble <- perf@y.values[[1]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment