Skip to content

Instantly share code, notes, and snippets.

@simonpcouch
Created June 22, 2023 15:58
Show Gist options
  • Save simonpcouch/79f48a5c06bd009cc4c8b63fafd7d7a4 to your computer and use it in GitHub Desktop.
Save simonpcouch/79f48a5c06bd009cc4c8b63fafd7d7a4 to your computer and use it in GitHub Desktop.
Chicago Suburbs Housing Data
library(tidymodels)
library(tidyverse)
library(stringr)
library(janitor)
library(doMC)
# Register the doMC backend on all but one of the available cores, never
# dropping below a single worker.
registerDoMC(
  cores = max(parallelly::availableCores() - 1, 1)
)
# data cleaning --------
# we'd likely just do all this cleaning under the hood and supply
# the `chiburbs` result as the "initial" dataset

# Turn formatted amounts such as "$1,234" or "$1,200/month" into numbers by
# stripping the dollar sign, thousands separators and the "/month" suffix.
# Whatever is still not a number afterwards becomes NA (with a coercion
# warning from `as.numeric()`).
parse_amount <- function(x) {
  as.numeric(gsub("[$,]|/month", "", x))
}

chiburbs <-
  # use read.csv rather than read_csv as every other line is totally blank
  # and read_csv doesn't know how to handle it
  bind_rows(
    read.csv("https://raw.githubusercontent.com/GeometricBison/HousePriceML/main/csv/naperville_2021-2022_2.csv") %>%
      mutate(city = "Naperville"),
    read.csv("https://raw.githubusercontent.com/GeometricBison/HousePriceML/main/csv/bolingbrook_2021-2022_2.csv") %>%
      mutate(city = "Bolingbrook")
  ) %>%
  clean_names() %>%
  as_tibble() %>%
  # count columns come through `clean_names()` as "x_of_*"; call them "n_*"
  rename_with(~ gsub("x_of", "n", .x, fixed = TRUE)) %>%
  # rows without a price (including the blank filler rows) are unusable
  filter(!is.na(housingprice) & housingprice != "") %>%
  # formatted text -> numeric; commas are now stripped from `hoa_dues` too,
  # so a fee like "$1,200/month" is no longer coerced to NA and then
  # recorded as 0
  mutate(
    across(
      c(hoa_dues, housingprice, sqft, basement_sq_ft, tax_annual_amount),
      parse_amount
    )
  ) %>%
  # for these columns a missing value is recorded as zero
  mutate(
    across(
      c(hoa_dues, basement_sq_ft, n_baths_1_2, n_cars, beds),
      ~ coalesce(.x, 0)
    )
  ) %>%
  mutate(
    # the outcome is modelled on the log scale
    housingprice = log(housingprice),
    # zip = the last five characters of the address
    zip = str_sub(address, -5, -1)
  ) %>%
  # empty strings are missing values; the remaining text columns are
  # categorical
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  mutate(across(where(is.character), as.factor)) %>%
  rename(log_price = housingprice) %>%
  # `is.finite()` drops unparseable prices (NA) and also the -Inf produced
  # by taking the log of a price of 0
  filter(is.finite(log_price)) %>%
  select(-c(list_price, est_mo_payment, basement, address))

chiburbs
# Scatter plot of log price against sqft (square-root x scale), one colour
# per city.
ggplot(chiburbs, aes(x = sqft, y = log_price, colour = city)) +
  geom_point() +
  scale_x_sqrt()
# data splitting ---------------
# Seed once so that both the train/test split and the fold assignment
# below are reproducible.
set.seed(1)

# Hold out a test set, then resample the training portion with V-fold
# cross-validation (both with rsample's default settings).
chiburbs_split <- chiburbs %>% initial_split()
chiburbs_train <- chiburbs_split %>% training()
chiburbs_test <- chiburbs_split %>% testing()
chiburbs_folds <- chiburbs_train %>% vfold_cv()
# baseline model ---------------
# Linear regression of log price on every other column, evaluated on the
# cross-validation folds and summarised by R-squared.
collect_metrics(
  fit_resamples(
    object = linear_reg(),
    preprocessor = log_price ~ .,
    resamples = chiburbs_folds,
    metrics = metric_set(rsq)
  )
)
# more complex model ------------
# Preprocessing, applied in this order: drop zero-variance predictors,
# normalize the numeric ones, drop predictors above the 0.2 missingness
# threshold, pool rare factor levels, impute what is still missing
# (mean for numeric, mode for nominal), then dummy-encode the factors.
recipe_basic <-
  recipe(log_price ~ ., data = chiburbs_train) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_filter_missing(all_predictors(), threshold = 0.2) %>%
  step_other(all_nominal_predictors()) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Random forest regression with parsnip's default engine and parameters.
spec_rf <- rand_forest(mode = "regression")

# Evaluate recipe + model on the same folds and metric as the baseline.
chiburbs_res <-
  spec_rf %>%
  fit_resamples(
    preprocessor = recipe_basic,
    resamples = chiburbs_folds,
    metrics = metric_set(rsq)
  )

collect_metrics(chiburbs_res)
# Fit the recipe + random forest workflow on the full training set.
chiburbs_fit <-
  workflow(recipe_basic, spec_rf) %>%
  fit(data = chiburbs_train)

# Attach predictions to the held-out rows (this overwrites `chiburbs_test`
# with its augmented version).
chiburbs_test <- chiburbs_fit %>% augment(new_data = chiburbs_test)

# Observed vs. predicted log price on the test set.
ggplot(chiburbs_test, aes(x = log_price, y = .pred)) +
  geom_point() +
  coord_obs_pred()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment