Last active
September 12, 2018 20:11
-
-
Save clayford/3ddaca6b4d4f3d2daf8d9a49c8fbc5a2 to your computer and use it in GitHub Desktop.
R script to scrape housing information from University of Virginia off-Grounds Housing Service web site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# web scrape off-grounds housing
# for Jeff Boichuk
# 2018-09-12
# https://offgroundshousing.student.virginia.edu/
library(tidyverse)
library(rvest)
library(stringr)
library(pbapply)

# Plan --------------------------------------------------------------------
# 0. Establish session using credentials
# 1. Get all URLs of browsing pages with houses (eg, page 1 of 11)
# 2. Visit each browsing URL and get the URL for each house
# 3. Visit each house URL and get the information
# 4. Create final data frame

# 0. Establish session using credentials ----------------------------------
# Enter your username and password below where it says "your_username" and
# "your_password"
URL <- "https://offgroundshousing.student.virginia.edu/property/search"

# Establish session and log in.
# FIX: submit_form() returns the post-login session; capture it so that
# subsequent jump_to() calls are made on the authenticated session instead
# of relying on cookies persisting on the original handle.
pgsession <- html_session(URL)
pgform <- html_form(pgsession)[[1]]
filled_form <- set_values(pgform,
                          "username" = "your_username",
                          "password" = "your_password")
pgsession <- submit_form(pgsession, filled_form)

# go to the housing search page and read its html
housing <- jump_to(pgsession, "https://offgroundshousing.student.virginia.edu/property/search")
page <- read_html(housing)
# 1. Get all URLs of browsing pages with houses ---------------------------
# get the number of the last page of results (e.g. 11 from "page 1 of 11")
lastpage <- html_nodes(x = page, css = ".last-page") %>% html_text() %>%
  readr::parse_number()

# create URLs of all pages.
# FIX: seq_len() instead of 1:lastpage, which would yield c(1, 0) if no
# ".last-page" element were found / parsed as 0.
p <- seq_len(lastpage)
page_urls <- paste0('https://offgroundshousing.student.virginia.edu/property/search/?&page=',
                    p)
# 2. Visit each browsing URL and get the URL for each house ---------------

# Collect the absolute URLs of all houses listed on one browsing page.
#
# url      - URL of a browsing (search results) page
# root_url - site root prepended to the relative house links; defaults to
#            the UVA off-grounds housing site (parameterized so the
#            function can be reused for other prefixes)
#
# Returns a character vector of absolute house URLs. Uses the logged-in
# `pgsession` from the enclosing script.
getHouseURLs <- function(url,
                         root_url = "https://offgroundshousing.student.virginia.edu"){
  browse_page <- jump_to(pgsession, url)
  bp <- read_html(browse_page)
  # relative links to each house on the current page
  house_urls <- html_nodes(x = bp, css = "div .name a") %>% html_attr(name = "href")
  paste0(root_url, house_urls)
}
# Visit every browsing page and flatten the per-page lists of house links
# into a single character vector (pblapply shows a progress bar).
house_urls <- unlist(pblapply(page_urls, getHouseURLs))
# 3. Visit each house URL and get the information -------------------------
# Fields scraped from each house page:
# Address, Total monthly rent, Number of beds, Number of baths,
# Square footage, Max occupants, Neighborhood, Type, Security deposit,
# Move-in date, Lease terms, Management company

# Return the first element of x, or NA if the extraction came up empty.
# FIX: guards data.frame() below against character(0) results (which would
# otherwise throw an error on pages missing a field) and against selectors
# that match more than one node (which would silently fan out into
# multiple rows via recycling).
first_or_na <- function(x) {
  if (length(x) == 0) NA_character_ else x[[1]]
}

# Scrape one house page and return its details as a one-row data frame.
#
# url - absolute URL of a house page
#
# Returns a data.frame with one row and columns address, rent, beds, baths,
# sqfeet, maxocc, neighborhood, type, deposit, moveindate, lease, company,
# url. Missing fields come back as NA rather than erroring. Uses the
# logged-in `pgsession` from the enclosing script.
getHouseInfo <- function(url){
  print(url)  # progress: show which house is being scraped
  house <- jump_to(pgsession, url)
  page <- read_html(house)
  address <- page %>% html_nodes(".location") %>% html_text() %>%
    first_or_na()
  rent <- page %>% html_nodes(".numbers div:nth-of-type(1)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Rent:).+") %>% first_or_na()
  beds <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_c(collapse = "") %>% str_squish() %>% str_extract(".+(?= bed)") %>%
    first_or_na()
  baths <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_c(collapse = "") %>% str_squish() %>%
    str_extract("[0-9\\. to]+(?= bath)") %>% str_trim() %>% first_or_na()
  sqfeet <- page %>% html_nodes(".numbers div:nth-of-type(2)") %>% html_text() %>%
    str_squish() %>% str_extract("[0-9]{3,4}") %>% first_or_na()
  maxocc <- page %>% html_nodes(".numbers") %>% html_text() %>%
    str_squish() %>% str_extract("[0-9]+(?= occupants)") %>% first_or_na()
  neighborhood <- page %>% html_nodes(".numbers") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Neighborhood: ).+") %>% first_or_na()
  type <- page %>% html_nodes(".other-info div:nth-of-type(1)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Type: ).+") %>% first_or_na()
  deposit <- page %>% html_nodes(".other-info div:nth-of-type(2)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Deposit: ).+") %>% first_or_na()
  moveindate <- page %>% html_nodes(".other-info div:nth-of-type(4)") %>% html_text() %>%
    str_squish() %>% str_extract("(?<=Move-In: ).+") %>% first_or_na()
  lease <- page %>% html_nodes("#amenities-5 + ul") %>% html_text() %>%
    str_squish() %>% first_or_na()
  # prefer the linked company name; fall back to plain text, then NA
  company <- page %>% html_nodes(".property-contact strong a") %>% html_text()
  if (purrr::is_empty(company)) {
    company <- page %>% html_nodes(".property-contact p strong") %>% html_text()
  }
  company <- first_or_na(company)
  data.frame(address, rent, beds, baths, sqfeet, maxocc,
             neighborhood, type, deposit, moveindate,
             lease, company, url,
             stringsAsFactors = FALSE)
}
# Scrape every house page, collecting one-row data frames (progress bar)
house_list <- pblapply(house_urls, getHouseInfo)

# 4. Create final data frame ----------------------------------------------
# stack the per-house rows into a single data frame
housing_df <- bind_rows(house_list)

# write to csv
# NOTE(review): this is a user-specific absolute path — adjust before running
write.csv(housing_df, file = "/Users/jcf2d/Box Sync/__Consults/Jeff_Boichuk/housing.csv",
          row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment