neerajt/web_scraping_script.R

## web_scraping_script.R
#                                      Houston R Users Group - Web Scraping Script
# =========================================================================================================================================

# Loading Necessary Libraries
library(rvest)
library(tidyverse)
library(reshape2)
# Other useful libraries
#library(XML) # Hides xml function from rvest
#library(RSelenium) # Helpful in dealing with dynamic webpages

# I always use Chrome (might work in other browsers, but I'm not familiar with them)
# Article PDF on basics that I used is here: http://stanford.edu/~wpmarble/webscraping_tutorial/webscraping_tutorial.pdf

# Disclaimer: Always review the website terms of service agreement


# ============================================= NBA Data from Sports Reference ========================================================

# NBA Player-Season Data
# https://www.basketball-reference.com/

url <- paste0(
  "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36",
  "&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99",
  "&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=ws")

# Including XPath
nba <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="stats"]') %>%
  html_table() %>%
  .[[1]]

# But XPath is not actually needed here
nba <- url %>%
  read_html() %>%
  html_table() %>%
  .[[1]]

# Cleaning up the scraped table
names(nba) <- nba[1,]
nba <- nba[-1,]
nba <- nba[nba$Player != "Player",]

# Checking out the data
table(nba$Player) %>% data.frame
ggplot(nba, aes(x = Age, y = WS, group = Player, color = Player)) +
  geom_line()


#============================================== Opiates - CDC Data Scrape Script =======================================================

# Data on Opiate prescriptions at the County-Year level
# https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html

# Scraping a single page
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html")
cdc16 <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>% # Try commenting out this line
  html_table()
cdc16 <- cdc16[[1]]
head(cdc16)
names(cdc16) <- make.names(names(cdc16))
#rm(cdc16)

# Scrapping 2010 to 2016 to List and Making into a DF
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty20", 10:16, ".html")
dfList <- lapply(url, function(i) {
  webpage <- read_html(i)
  draft_table <- html_nodes(webpage, xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table')
  draft <- html_table(draft_table)[[1]]
})
str(dfList)
dfList[[1]] %>% head
cdc <- dfList %>% reduce(left_join, by = c("County", "State", "FIPS County Code"))
rm(dfList)

# Fixing Format and Names
names(cdc) <- make.names(names(cdc))
cdc <- melt(cdc, paste0("X20", 10:16, ".Prescribing.Rate"), id.vars = c("County", "State", "FIPS.County.Code"))
cdc$variable <- gsub("X", "", cdc$variable)
names(cdc)[4:5] <- c("Year", "Prescribing.Rate")
cdc$Year <- gsub(".Prescribing.Rate", "", cdc$Year)
cdc$Prescribing.Rate <- as.numeric(cdc$Prescribing.Rate)
# Keeping TX Only
cdc <- cdc[cdc$State %in% "TX",]

# Checking out the data
str(cdc)
table(cdc$Year)
require:gplots::plotmeans(cdc$Prescribing.Rate ~ cdc$Year)
ggplot(cdc, aes(x = Year, y = Prescribing.Rate, group = County, color = County)) +
  geom_line() +
  theme(legend.position="none")


# =============================================== Scraping State of the Union =========================================================

# Transcribed State of the Union Speeches
# https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses

# Scraping State of the Union Speech
url <- "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-2"

# XPath to the speech
speech <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="block-system-main"]/div/div/div[1]/div[3]') %>%
  html_text()
speech

# Another way to do it that gives a slightly different format
speechbypara <- url %>%
  read_html() %>%
  html_nodes("p") %>%
  html_text()
speechbypara

# <h1>, <h2>,.,<h6>: Largest heading, second largest heading, etc.
# <p>: Paragraph elements
# <ul>: Unordered bulleted list
# <ol>: Ordered list
# <li>: Individual List item
# <div>: Division or section
# <table>: Table
	# Houston R Users Group - Web Scraping Script
	# =========================================================================================================================================

	# Loading Necessary Libraries
	library(rvest)
	library(tidyverse)
	library(reshape2)
	# Other useful libraries
	#library(XML) # Hides xml function from rvest
	#library(RSelenium) # Helpful in dealing with dynamic webpages

	# I always use Chrome (might work in other browsers, but I'm not familiar with them)
	# Article PDF on basics that I used is here: http://stanford.edu/~wpmarble/webscraping_tutorial/webscraping_tutorial.pdf

	# Disclaimer: Always review the website terms of service agreement


	# ============================================= NBA Data from Sports Reference ========================================================

	# NBA Player-Season Data
	# https://www.basketball-reference.com/

	url <- paste0(
	"https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36",
	"&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99",
	"&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=ws")

	# Including XPath
	nba <- url %>%
	read_html() %>%
	html_nodes(xpath = '//*[@id="stats"]') %>%
	html_table() %>%
	.[[1]]

	# But XPath is not actually needed here
	nba <- url %>%
	read_html() %>%
	html_table() %>%
	.[[1]]

	# Cleaning up the scraped table
	names(nba) <- nba[1,]
	nba <- nba[-1,]
	nba <- nba[nba$Player != "Player",]

	# Checking out the data
	table(nba$Player) %>% data.frame
	ggplot(nba, aes(x = Age, y = WS, group = Player, color = Player)) +
	geom_line()


	#============================================== Opiates - CDC Data Scrape Script =======================================================

	# Data on Opiate prescriptions at the County-Year level
	# https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html

	# Scraping a single page
	url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html")
	cdc16 <- url %>%
	read_html() %>%
	html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>% # Try commenting out this line
	html_table()
	cdc16 <- cdc16[[1]]
	head(cdc16)
	names(cdc16) <- make.names(names(cdc16))
	#rm(cdc16)

	# Scrapping 2010 to 2016 to List and Making into a DF
	url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty20", 10:16, ".html")
	dfList <- lapply(url, function(i) {
	webpage <- read_html(i)
	draft_table <- html_nodes(webpage, xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table')
	draft <- html_table(draft_table)[[1]]
	})
	str(dfList)
	dfList[[1]] %>% head
	cdc <- dfList %>% reduce(left_join, by = c("County", "State", "FIPS County Code"))
	rm(dfList)

	# Fixing Format and Names
	names(cdc) <- make.names(names(cdc))
	cdc <- melt(cdc, paste0("X20", 10:16, ".Prescribing.Rate"), id.vars = c("County", "State", "FIPS.County.Code"))
	cdc$variable <- gsub("X", "", cdc$variable)
	names(cdc)[4:5] <- c("Year", "Prescribing.Rate")
	cdc$Year <- gsub(".Prescribing.Rate", "", cdc$Year)
	cdc$Prescribing.Rate <- as.numeric(cdc$Prescribing.Rate)
	# Keeping TX Only
	cdc <- cdc[cdc$State %in% "TX",]

	# Checking out the data
	str(cdc)
	table(cdc$Year)
	require:gplots::plotmeans(cdc$Prescribing.Rate ~ cdc$Year)
	ggplot(cdc, aes(x = Year, y = Prescribing.Rate, group = County, color = County)) +
	geom_line() +
	theme(legend.position="none")



	# =============================================== Scraping State of the Union =========================================================

	# Transcribed State of the Union Speeches
	# https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses

	# Scraping State of the Union Speech
	url <- "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-2"

	# XPath to the speech
	speech <- url %>%
	read_html() %>%
	html_nodes(xpath = '//*[@id="block-system-main"]/div/div/div[1]/div[3]') %>%
	html_text()
	speech

	# Another way to do it that gives a slightly different format
	speechbypara <- url %>%
	read_html() %>%
	html_nodes("p") %>%
	html_text()
	speechbypara

	# <h1>, <h2>,.,<h6>: Largest heading, second largest heading, etc.
	# <p>: Paragraph elements
	# <ul>: Unordered bulleted list
	# <ol>: Ordered list
	# <li>: Individual List item
	# <div>: Division or section
	# <table>: Table