Skip to content

Instantly share code, notes, and snippets.

@rudeboybert
Last active March 23, 2020 00:00
Show Gist options
  • Save rudeboybert/6468b2e7b929b6241bfb8da7b4927a85 to your computer and use it in GitHub Desktop.
Using cross-validation to find optimal complexity parameter of a classification & regression tree
# Based on data from "House Prices: Advanced Regression Techniques" Kaggle Competition
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques
# A YouTube demo accompanies this script (link not included in this gist)
library(tidyverse)
library(rpart)
library(Metrics)
# Reload house prices data (train has SalePrice; test does not)
train <- read_csv("https://rudeboybert.github.io/SDS293/static/train.csv")
test <- read_csv("https://rudeboybert.github.io/SDS293/static/test.csv")

# Set number of cross-validation folds
k <- 5

# Randomly assign each training row to one of the k folds:
# sample_frac(size = 1) shuffles all rows, then fold IDs 1..k are
# recycled down the shuffled rows.
train <- train %>%
  sample_frac(size = 1) %>%
  # Spell out length.out: the original `length = n()` relied on
  # partial argument matching of rep()'s length.out argument.
  mutate(fold = rep(1:k, length.out = n())) %>%
  arrange(fold)
# Candidate complexity-parameter values to evaluate, plus preallocated
# vectors for the CV error estimates (one per cp value, one per fold).
# Spell out length.out: `len = 101` relied on partial argument matching.
cp_values_grid <- seq(from = 0, to = 0.0015, length.out = 101)
error_estimates <- rep(0, times = length(cp_values_grid))
error_estimate_per_fold <- rep(0, times = k)
# Estimate out-of-sample RMSLE for every candidate cp value via k-fold CV.
# seq_along()/seq_len() replace 1:length(...) and 1:k, which misbehave
# when their argument is 0.
for (j in seq_along(cp_values_grid)) {
  current_cp_value <- cp_values_grid[j]
  for (i in seq_len(k)) {
    # Fold i is held out for validation; the other k-1 folds train the model
    train_cv <- train %>%
      filter(fold != i)
    test_cv <- train %>%
      filter(fold == i)
    # Fit regression tree with the current complexity parameter
    trained_model <- rpart(SalePrice ~ GrLivArea + HalfBath + YearBuilt,
                           data = train_cv,
                           control = rpart.control(cp = current_cp_value))
    # Predict sale prices on the held-out fold
    y_hat <- predict(trained_model, type = "vector", newdata = test_cv)
    # Root mean squared logarithmic error on the held-out fold
    error_estimate_per_fold[i] <- rmsle(actual = test_cv$SalePrice, predicted = y_hat)
  }
  # Average the k per-fold errors into one estimate for this cp value
  error_estimates[j] <- mean(error_estimate_per_fold)
}
# Collect the CV results into a tibble for plotting and cp selection
# (renamed from the non-descriptive `blah`; only used in this section).
cv_results <- tibble(
  cp_value = cp_values_grid,
  error_estimate = error_estimates
)

# Visualize estimated RMSLE as a function of the complexity parameter
ggplot(cv_results, aes(x = cp_value, y = error_estimate)) +
  geom_point() +
  labs(x = "Complexity parameter", y = "Estimate of RMSLE")

# Bonus: Use optimal complexity parameter value to make submissions on Kaggle
# Since there are multiple cp values that yield the lowest estimated RMSLE, use
# the smallest value since it yields the least complex tree.
cp_star <- cv_results %>%
  arrange(error_estimate, cp_value) %>%
  slice(1) %>%
  pull(cp_value)
# Refit the tree on the full training set using the CV-selected cp value
trained_model_all <- rpart(
  SalePrice ~ GrLivArea + HalfBath + YearBuilt,
  data = train,
  control = rpart.control(cp = cp_star)
)

# Draw the fitted tree: node labels with observation counts, plus a
# title and surrounding box
plot(trained_model_all, margin = 0.25)
text(trained_model_all, use.n = TRUE)
title("Classification & Regression Tree")
box()
# Attach predicted sale prices for the test set as a new column
test <- test %>%
  mutate(SalePriceHat = predict(trained_model_all, type = "vector", newdata = test))

# Write predictions to csv following exact format required by Kaggle here
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/submit
submission <- test %>%
  select(Id, SalePrice = SalePriceHat)
write_csv(submission, "submission.csv")
# This yields a RMSLE of 0.22065!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment