Skip to content

Instantly share code, notes, and snippets.

@juliasilge
Created October 1, 2021 00:38
Show Gist options
  • Save juliasilge/66df257973d917fcdfbc724982c6f571 to your computer and use it in GitHub Desktop.
Save juliasilge/66df257973d917fcdfbc724982c6f571 to your computer and use it in GitHub Desktop.
NYC Airbnb prices with xgboost and racing for R-Ladies Miami
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(finetune)
library(vip)
## Training data for the Kaggle competition linked below:
## https://www.kaggle.com/c/sliced-s01e05-WXx7h8/data
train_raw <- read_csv(file = "train.csv")
#--- explore data -----------------------------------------------------------#
# Bar chart of the 15 highest median prices by neighbourhood, restricted to
# neighbourhoods that appear in more than 10 rows of the training data.
train_raw %>%
  group_by(neighbourhood) %>%
  summarise(
    n = n(),
    price = median(price)
  ) %>%
  filter(n > 10) %>%
  slice_max(order_by = price, n = 15) %>%
  ggplot(aes(x = price, y = fct_reorder(neighbourhood, price))) +
  geom_col() +
  scale_x_continuous(
    labels = scales::dollar_format(),
    expand = c(0, 0)
  ) +
  labs(
    title = "Airbnb prices in NYC by neighborhood",
    subtitle = "Top 15 most expensive neighborhoods",
    x = "Median price per night",
    y = NULL
  )
# Hex-binned map of the median log price by longitude / latitude.
# `log(price + 1)` matches the transform applied to the modelling target
# below and stays finite if any row has a price of 0, whereas plain `log()`
# would yield -Inf and drop that bin out of the fill scale.
train_raw %>%
  ggplot(aes(longitude, latitude, z = log(price + 1))) +
  stat_summary_hex(fun = median, alpha = 0.8, bins = 70) +
  scale_fill_viridis_b() +
  labs(fill = "Median price (log)")
# Overlaid, semi-transparent histograms of price on a log10 x-axis,
# one fill colour per `neighbourhood_group`.
train_raw %>%
  ggplot(aes(x = price, fill = neighbourhood_group)) +
  geom_histogram(bins = 20, alpha = 0.5, position = "identity") +
  scale_x_log10(labels = scales::dollar_format()) +
  labs(x = "price per night", fill = NULL)
#--- build model ------------------------------------------------------------#
# The outcome is modelled as log(price + 1). Split into training and testing
# sets, stratifying on that transformed outcome.
set.seed(123)
nyc_split <- initial_split(
  mutate(train_raw, price = log(price + 1)),
  strata = price
)
nyc_train <- training(nyc_split)
nyc_test <- testing(nyc_split)

# Five cross-validation folds from the training set, also stratified on price.
set.seed(234)
nyc_folds <- vfold_cv(data = nyc_train, v = 5, strata = price)
# Preprocessing recipe: location, listing attributes and the free-text `name`.
nyc_rec <-
  recipe(
    price ~ latitude + longitude + neighbourhood + room_type +
      minimum_nights + number_of_reviews + availability_365 + name,
    data = nyc_train
  ) %>%
  # Pool neighbourhoods below the 2% frequency threshold.
  step_other(neighbourhood, threshold = 0.02) %>%
  # Turn `name` into tokens, drop stop words, keep at most 30 tokens,
  # and weight them with tf-idf.
  step_tokenize(name) %>%
  step_stopwords(name) %>%
  step_tokenfilter(name, max_tokens = 30) %>%
  step_tfidf(name) %>%
  # Dummy-encode whatever nominal predictors remain.
  step_dummy(all_nominal_predictors())
# Boosted-tree regression on the xgboost engine. The learning rate is fixed;
# `mtry`, `trees` and `min_n` are left as tuning placeholders.
xgb_spec <-
  boost_tree(
    mtry = tune(),
    trees = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Bundle the recipe and the model specification into one workflow.
xgb_wf <- workflow(preprocessor = nyc_rec, spec = xgb_spec)
#--- tune & evaluate model -------------------------------------------------#
# Register a parallel backend before tuning.
doParallel::registerDoParallel()

# Race 15 candidate parameter sets over the resampling folds, reporting
# eliminations as they happen.
set.seed(345)
xgb_rs <- xgb_wf %>%
  tune_race_anova(
    resamples = nyc_folds,
    grid = 15,
    control = control_race(verbose_elim = TRUE)
  )

# Inspect the race and the top-performing candidates.
plot_race(xgb_rs)
show_best(xgb_rs)
# Finalize the workflow with the candidate that has the best RMSE, then fit
# on the training set and evaluate on the test set via the original split.
# `metric` is passed by name: recent tune releases place `...` before
# `metric` in `select_best()` and reject a positional metric string, while
# the named form works on older and newer releases alike.
xgb_last <-
  xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "rmse")) %>%
  last_fit(nyc_split)

# Test-set performance metrics.
collect_metrics(xgb_last)
# Variable-importance plot (points) for the 15 top features of the fitted model.
xgb_last %>%
  extract_workflow() %>%
  extract_fit_parsnip() %>%
  vip(num_features = 15, geom = "point")
# Map of test-set residuals (observed minus predicted, on the log scale used
# for modelling), with the colour scale limited to [-1, 1].
augment(extract_workflow(xgb_last), nyc_test) %>%
  mutate(.resid = price - .pred) %>%
  ggplot(aes(x = longitude, y = latitude, color = .resid)) +
  geom_point(alpha = 0.2) +
  scale_color_viridis_c(limits = c(-1, 1)) +
  labs(color = "Residuals")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment