Last active
September 12, 2018 20:11
-
-
Save clayford/3ddaca6b4d4f3d2daf8d9a49c8fbc5a2 to your computer and use it in GitHub Desktop.
R script to scrape housing information from University of Virginia off-Grounds Housing Service web site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# web scrape off-grounds housing
# for Jeff Boichuk
# 2018-09-12
# https://offgroundshousing.student.virginia.edu/
library(tidyverse)
library(rvest)
library(stringr)
library(pbapply)

# Plan --------------------------------------------------------------------
# 0. Establish session using credentials
# 1. Get all URLs of browsing pages with houses (eg, page 1 of 11)
# 2. Visit each browsing URL and get the URL for each house
# 3. Visit each house URL and get the information
# 4. Create final data frame

# 0. Establish session using credentials ----------------------------------
# Enter your username and password below where it says "your_username" and
# "your_password"
URL <- "https://offgroundshousing.student.virginia.edu/property/search"

# Establish session and log in.
# FIX: submit_form() returns the post-login session; capture it so that
# subsequent jump_to() calls are made on the authenticated session instead
# of relying on cookies persisting on the original handle.
pgsession <- html_session(URL)
pgform <- html_form(pgsession)[[1]]
filled_form <- set_values(pgform,
                          "username" = "your_username",
                          "password" = "your_password")
pgsession <- submit_form(pgsession, filled_form)

# go to the housing search page and read its html
housing <- jump_to(pgsession, "https://offgroundshousing.student.virginia.edu/property/search")
page <- read_html(housing)
# 1. Get all URLs of browsing pages with houses ---------------------------
# get the number of the last page of results (e.g. 11 from "page 1 of 11")
lastpage <- html_nodes(x = page, css = ".last-page") %>% html_text() %>%
  readr::parse_number()

# create URLs of all pages.
# FIX: seq_len() instead of 1:lastpage, which would yield c(1, 0) if no
# ".last-page" element were found / parsed as 0.
p <- seq_len(lastpage)
page_urls <- paste0('https://offgroundshousing.student.virginia.edu/property/search/?&page=',
                    p)
# 2. Visit each browsing URL and get the URL for each house ---------------

# Collect the absolute URLs of all houses listed on one browsing page.
#
# url      - URL of a browsing (search results) page
# root_url - site root prepended to the relative house links; defaults to
#            the UVA off-grounds housing site (parameterized so the
#            function can be reused for other prefixes)
#
# Returns a character vector of absolute house URLs. Uses the logged-in
# `pgsession` from the enclosing script.
getHouseURLs <- function(url,
                         root_url = "https://offgroundshousing.student.virginia.edu"){
  browse_page <- jump_to(pgsession, url)
  bp <- read_html(browse_page)
  # relative links to each house on the current page
  house_urls <- html_nodes(x = bp, css = "div .name a") %>% html_attr(name = "href")
  paste0(root_url, house_urls)
}
# Visit every browsing page and flatten the per-page lists of house links
# into a single character vector (pblapply shows a progress bar).
house_urls <- unlist(pblapply(page_urls, getHouseURLs))
# 3. Visit each house URL and get the information -------------------------
# Fields scraped from each house page:
# Address, Total monthly rent, Number of beds, Number of baths,
# Square footage, Max occupants, Neighborhood, Type, Security deposit,
# Move-in date, Lease terms, Management company

# Return the first element of x, or NA if the extraction came up empty.
# FIX: guards data.frame() below against character(0) results (which would
# otherwise throw an error on pages missing a field) and against selectors
# that match more than one node (which would silently fan out into
# multiple rows via recycling).
first_or_na <- function(x) {
  if (length(x) == 0) NA_character_ else x[[1]]
}

# Scrape one house page and return its details as a one-row data frame.
#
# url - absolute URL of a house page
#
# Returns a data.frame with one row and columns address, rent, beds, baths,
# sqfeet, maxocc, neighborhood, type, deposit, moveindate, lease, company,
# url. Missing fields come back as NA rather than erroring. Uses the
# logged-in `pgsession` from the enclosing script.
getHouseInfo <- function(url){
  print(url)  # progress: show which house is being scraped
  house <- jump_to(pgsession, url)
  page <- read_html(house)
  address <- page %>% html_nodes(".location") %>% html_text() %>%
    first_or_na()
  rent <- page %>% html_nodes(".numbers div:nth-of-type(1)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Rent:).+") %>% first_or_na()
  beds <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_c(collapse = "") %>% str_squish() %>% str_extract(".+(?= bed)") %>%
    first_or_na()
  baths <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_c(collapse = "") %>% str_squish() %>%
    str_extract("[0-9\\. to]+(?= bath)") %>% str_trim() %>% first_or_na()
  sqfeet <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_squish() %>% str_extract("[0-9]{3,4}") %>% first_or_na()
  maxocc <- page %>% html_nodes(".numbers") %>% html_text() %>%
    str_squish() %>% str_extract("[0-9]+(?= occupants)") %>% first_or_na()
  neighborhood <- page %>% html_nodes(".numbers") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Neighborhood: ).+") %>% first_or_na()
  type <- page %>% html_nodes(".other-info div:nth-of-type(1)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Type: ).+") %>% first_or_na()
  deposit <- page %>% html_nodes(".other-info div:nth-of-type(2)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Deposit: ).+") %>% first_or_na()
  moveindate <- page %>% html_nodes(".other-info div:nth-of-type(4)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Move-In: ).+") %>% first_or_na()
  lease <- page %>% html_nodes("#amenities-5 + ul") %>% html_text() %>%
    str_squish() %>% first_or_na()
  # prefer the linked company name; fall back to plain text, then NA
  company <- page %>% html_nodes(".property-contact strong a") %>% html_text()
  if (purrr::is_empty(company)) {
    company <- page %>% html_nodes(".property-contact p strong") %>% html_text()
  }
  company <- first_or_na(company)
  data.frame(address, rent, beds, baths, sqfeet, maxocc,
             neighborhood, type, deposit, moveindate,
             lease, company, url,
             stringsAsFactors = FALSE)
}
# Scrape every house page, collecting one-row data frames (progress bar)
house_list <- pblapply(house_urls, getHouseInfo)

# 4. Create final data frame ----------------------------------------------
# stack the per-house rows into a single data frame
housing_df <- bind_rows(house_list)

# write to csv
# NOTE(review): this is a user-specific absolute path — adjust before running
write.csv(housing_df, file = "/Users/jcf2d/Box Sync/__Consults/Jeff_Boichuk/housing.csv",
          row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment