Skip to content

Instantly share code, notes, and snippets.

@clayford
Last active September 12, 2018 20:11
Show Gist options
  • Save clayford/3ddaca6b4d4f3d2daf8d9a49c8fbc5a2 to your computer and use it in GitHub Desktop.
R script to scrape housing information from University of Virginia off-Grounds Housing Service web site
# web scrape off-grounds housing
# for Jeff Boichuk
# 2018-09-12
# https://offgroundshousing.student.virginia.edu/
library(tidyverse)
library(rvest)
library(stringr)
library(pbapply)
# Plan --------------------------------------------------------------------
# 0. Establish session using credentials
# 1. Get all URLs of browsing pages with houses (eg, page 1 of 11)
# 2. Visit each browsing URL and get the URL for each house
# 3. Visit each house URL and get the information
# 4. Create final data frame
# 0. Establish session using credentials ----------------------------------
# Enter your username and password below where it says "your_username" and
# "your_password"
# NOTE(review): html_session()/set_values()/submit_form()/jump_to() were
# renamed in rvest 1.0 (session(), html_form_set(), session_submit(),
# session_jump_to()); this script targets pre-1.0 rvest — confirm the
# installed version before running.
URL <- "https://offgroundshousing.student.virginia.edu/property/search"
# establish session and log in
# pgsession is a global reused by getHouseURLs()/getHouseInfo() below, so
# the logged-in state must persist in it.
pgsession <- html_session(URL)
# assumes the login form is the first form on the page — TODO confirm
pgform <- html_form(pgsession)[[1]]
filled_form <- set_values(pgform,
"username" = "your_username",
"password" = "your_password")
# NOTE(review): the return value (the post-login session) is discarded
# here; subsequent requests appear to rely on cookies retained inside
# pgsession's handle — verify that the login actually persists.
submit_form(pgsession, filled_form)
# go to housing page and read html
housing <- jump_to(pgsession, "https://offgroundshousing.student.virginia.edu/property/search")
page <- read_html(housing)
# 1. Get all URLs of browsing pages with houses ---------------------------
# Read the total number of result pages from the ".last-page" pager
# element on the search page, then build one search URL per page.
lastpage <- html_nodes(x = page, css = ".last-page") %>% html_text() %>%
  readr::parse_number()
# Fail loudly if the pager could not be found or parsed (e.g., a site
# layout change) rather than generating bogus URLs downstream.
stopifnot(length(lastpage) == 1, is.finite(lastpage), lastpage >= 1)
# seq_len() is the safe sequence: 1:lastpage would yield c(1, 0) if the
# page count were ever 0.
page_urls <- paste0('https://offgroundshousing.student.virginia.edu/property/search/?&page=',
                    seq_len(lastpage))
# 2. Visit each browsing URL and get the URL for each house ---------------
# Given the URL of one search-results page, return a character vector of
# absolute URLs, one per property listed on that page. Relies on the
# logged-in global pgsession established above.
getHouseURLs <- function(url){
  # fetch and parse the browse page within the authenticated session
  results_page <- jump_to(pgsession, url) %>% read_html()
  # relative hrefs for each listing card on this page
  hrefs <- results_page %>%
    html_nodes("div .name a") %>%
    html_attr("href")
  # prepend the site root to turn relative hrefs into absolute URLs
  paste0("https://offgroundshousing.student.virginia.edu", hrefs)
}
# Collect the listing URLs from every search page and flatten the
# per-page lists into a single character vector.
house_urls <- unlist(pblapply(page_urls, getHouseURLs))
# 3. Visit each house URL and get the information -------------------------
# For one property page, scrape:
#   Address, total monthly rent, number of beds, number of baths, square
#   footage, max occupants, neighborhood, type, security deposit,
#   move-in date, lease details, and management company.
# Returns a data frame (normally one row) that also records the source
# URL. Relies on the logged-in global pgsession established above.
getHouseInfo <- function(url){
  # Coerce an empty scrape result (selector matched nothing) to NA so the
  # data.frame() call below cannot fail on a zero-length column.
  or_na <- function(x) if (purrr::is_empty(x)) NA_character_ else x
  # message() instead of print(): progress text goes to stderr and does
  # not interleave with the pbapply progress bar on stdout.
  message(url)
  house <- jump_to(pgsession, url)
  page <- read_html(house)
  address <- page %>% html_nodes(".location") %>% html_text() %>% or_na()
  # rent appears as "Rent: $..." in the first .numbers div
  rent <- page %>% html_nodes(".numbers div:nth-of-type(1)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Rent:).+") %>% or_na()
  # beds/baths/sqft share the second .numbers div; collapse then extract
  beds <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_c(collapse = "") %>% str_squish() %>% str_extract(".+(?= bed)") %>% or_na()
  # baths may be a range, e.g. "1 to 2" — hence the permissive class
  baths <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_c(collapse = "") %>% str_squish() %>% str_extract("[0-9\\. to]+(?= bath)") %>%
    str_trim() %>% or_na()
  # square footage assumed to be the only 3-4 digit number in this div
  sqfeet <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_squish() %>% str_extract("[0-9]{3,4}") %>% or_na()
  maxocc <- page %>% html_nodes(".numbers") %>% html_text() %>%
    str_squish() %>% str_extract("[0-9]+(?= occupants)") %>% or_na()
  neighborhood <- page %>% html_nodes(".numbers") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Neighborhood: ).+") %>% or_na()
  type <- page %>% html_nodes(".other-info div:nth-of-type(1)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Type: ).+") %>% or_na()
  deposit <- page %>% html_nodes(".other-info div:nth-of-type(2)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Deposit: ).+") %>% or_na()
  moveindate <- page %>% html_nodes(".other-info div:nth-of-type(4)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Move-In: ).+") %>% or_na()
  lease <- page %>% html_nodes("#amenities-5 + ul") %>% html_text() %>%
    str_squish() %>% or_na()
  # company is usually a link; fall back to plain bold text, then NA
  company <- page %>% html_nodes(".property-contact strong a") %>% html_text()
  if (purrr::is_empty(company)) {
    company <- page %>% html_nodes(".property-contact p strong") %>% html_text() %>%
      or_na()
  }
  d <- data.frame(address, rent, beds, baths, sqfeet, maxocc,
                  neighborhood, type, deposit, moveindate,
                  lease, company, url,
                  stringsAsFactors = FALSE)
  d
}
# 4. Create final data frame ----------------------------------------------
# Scrape every listing (with a progress bar), stack the per-house data
# frames into one table, and export it as CSV.
housing_df <- house_urls %>%
  pblapply(getHouseInfo) %>%
  bind_rows()
# NOTE(review): output path is machine-specific; adjust before running on
# another machine.
write.csv(housing_df, file = "/Users/jcf2d/Box Sync/__Consults/Jeff_Boichuk/housing.csv",
          row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment