# ivopbernardo/rf_demo.R

## rf_demo.R
# Don't forget to download the train.csv file
# to make this gist work.

# Download it at: https://www.kaggle.com/c/titanic/data?select=train.csv

# You also need to install ROCR and rpart libraries

# Load the Titanic training data from the working directory
titanic <- read.csv('./train.csv')

# Number of rows assigned to the training partition (70%, rounded up)
size <- ceiling(nrow(titanic) * 0.7)

# Fix the RNG state so the split below is reproducible
set.seed(999)

# Draw the training row positions without replacement; every row that
# is not drawn ends up in the test partition
train_index <- sample.int(nrow(titanic), size = size)
train_df <- titanic[train_index, ]
test_df <- titanic[-train_index, ]

# Loading rpart
library(rpart)

# Training the three decision trees
#
# Every tree is fit on its own draw of 600 rows of train_df (taken without
# replacement, after setting a tree-specific seed) and with its own
# rpart control settings.
#
# The rows are drawn with base R indexing: this script only loads rpart
# and ROCR, so no sample_n() function is available here.

# Training decision tree 1
set.seed(9990)
oak_tree <- rpart(Survived ~ Fare + Age + Sex + Pclass,
                  data = train_df[sample.int(nrow(train_df), 600), ],
                  method = 'class',
                  control = list(maxdepth = 2,
                                 minsplit = 30))

# Training Decision Tree 2
set.seed(9991)
pine_tree <- rpart(Survived ~ Fare + Age + Sex + Pclass,
                   data = train_df[sample.int(nrow(train_df), 600), ],
                   method = 'class',
                   control = list(maxdepth = 3,
                                  minsplit = 3,
                                  minbucket = 4,
                                  cp = 0.01))

# Training Decision Tree 3
set.seed(9992)
elm_tree <- rpart(Survived ~ Fare + Age + Sex + Pclass,
                  data = train_df[sample.int(nrow(train_df), 600), ],
                  method = 'class',
                  control = list(maxdepth = 2,
                                 minsplit = 2,
                                 minbucket = 4,
                                 cp = 0.01))


# Building function for AUC
library(ROCR)
# Compute the area under the ROC curve (AUC) of a fitted classifier.
#
# Arguments:
#   model   - fitted model; predict(model, newdata) must return a matrix
#             whose second column holds the score to evaluate.
#   newdata - data frame to score; its `Survived` column supplies the true
#             labels. Defaults to the global `test_df`, so existing
#             one-argument calls keep evaluating on the test partition.
#
# Returns the first element of the `y.values` slot of the ROCR
# performance object computed with measure = 'auc'.
obtainauc <- function(model, newdata = test_df) {
  # Column 2 of the prediction matrix is used as the score
  scores <- predict(model, newdata)[, 2]
  pred <- prediction(scores, newdata$Survived)
  perf <- performance(pred, measure = 'auc')
  perf@y.values[[1]]
}

# Building ensemble
# Score the test set with each tree, keep the second column of every
# prediction matrix, add the three score vectors in order and divide by 3
ensemble <- Reduce(
  `+`,
  lapply(
    list(oak_tree, pine_tree, elm_tree),
    function(tree) predict(tree, test_df)[, 2]
  )
) / 3

# Ensemble Performance
# NOTE: the result is stored in a variable that is also called `prediction`;
# calls such as prediction(...) still resolve to the function, because R
# skips non-function objects when looking up a function name.
prediction <- prediction(ensemble, test_df$Survived)
perf <- performance(prediction, measure = 'auc')
performance_ensemble <- slot(perf, 'y.values')[[1]]
	# Don't forget to download the train.csv file
	# to make this gist work.

	# Download it at: https://www.kaggle.com/c/titanic/data?select=train.csv

	# You also need to install ROCR and rpart libraries

	# Load the Titanic training data from the working directory
	titanic <- read.csv('./train.csv')

	# Number of rows assigned to the training partition (70%, rounded up)
	size <- ceiling(nrow(titanic) * 0.7)

	# Fix the RNG state so the split below is reproducible
	set.seed(999)

	# Draw the training row positions without replacement; every row that
	# is not drawn ends up in the test partition
	train_index <- sample.int(nrow(titanic), size = size)
	train_df <- titanic[train_index, ]
	test_df <- titanic[-train_index, ]

	# Loading rpart
	library(rpart)

	# Training the three decision trees
	#
	# Every tree is fit on its own draw of 600 rows of train_df (taken
	# without replacement, after setting a tree-specific seed) and with its
	# own rpart control settings.
	#
	# The rows are drawn with base R indexing: this script only loads rpart
	# and ROCR, so no sample_n() function is available here.

	# Training decision tree 1
	set.seed(9990)
	oak_tree <- rpart(Survived ~ Fare + Age + Sex + Pclass,
	                  data = train_df[sample.int(nrow(train_df), 600), ],
	                  method = 'class',
	                  control = list(maxdepth = 2,
	                                 minsplit = 30))

	# Training Decision Tree 2
	set.seed(9991)
	pine_tree <- rpart(Survived ~ Fare + Age + Sex + Pclass,
	                   data = train_df[sample.int(nrow(train_df), 600), ],
	                   method = 'class',
	                   control = list(maxdepth = 3,
	                                  minsplit = 3,
	                                  minbucket = 4,
	                                  cp = 0.01))

	# Training Decision Tree 3
	set.seed(9992)
	elm_tree <- rpart(Survived ~ Fare + Age + Sex + Pclass,
	                  data = train_df[sample.int(nrow(train_df), 600), ],
	                  method = 'class',
	                  control = list(maxdepth = 2,
	                                 minsplit = 2,
	                                 minbucket = 4,
	                                 cp = 0.01))


	# Building function for AUC
	library(ROCR)
	# Compute the area under the ROC curve (AUC) of a fitted classifier.
	#
	# Arguments:
	#   model   - fitted model; predict(model, newdata) must return a matrix
	#             whose second column holds the score to evaluate.
	#   newdata - data frame to score; its `Survived` column supplies the
	#             true labels. Defaults to the global `test_df`, so existing
	#             one-argument calls keep evaluating on the test partition.
	#
	# Returns the first element of the `y.values` slot of the ROCR
	# performance object computed with measure = 'auc'.
	obtainauc <- function(model, newdata = test_df) {
	  # Column 2 of the prediction matrix is used as the score
	  scores <- predict(model, newdata)[, 2]
	  pred <- prediction(scores, newdata$Survived)
	  perf <- performance(pred, measure = 'auc')
	  perf@y.values[[1]]
	}

	# Building ensemble
	# Score the test set with each tree, keep the second column of every
	# prediction matrix, add the three score vectors in order and divide by 3
	ensemble <- Reduce(
	  `+`,
	  lapply(
	    list(oak_tree, pine_tree, elm_tree),
	    function(tree) predict(tree, test_df)[, 2]
	  )
	) / 3

	# Ensemble Performance
	# NOTE: the result is stored in a variable that is also called
	# `prediction`; calls such as prediction(...) still resolve to the
	# function, because R skips non-function objects when looking up a
	# function name.
	prediction <- prediction(ensemble, test_df$Survived)
	perf <- performance(prediction, measure = 'auc')
	performance_ensemble <- slot(perf, 'y.values')[[1]]