# useR2018
##########################################################
#
# HANJO ODENDAAL
# hanjo.oden@gmail.com
# www.daeconomist.com
# @UbuntR314
# https://github.com/HanjoStudy
#
#
# ██████╗ ███████╗███████╗██╗ ███████╗███╗ ██╗██╗██╗ ██╗███╗ ███╗
# ██╔══██╗██╔════╝██╔════╝██║ ██╔════╝████╗ ██║██║██║ ██║████╗ ████║
# ██████╔╝███████╗█████╗ ██║ █████╗ ██╔██╗ ██║██║██║ ██║██╔████╔██║
# ██╔══██╗╚════██║██╔══╝ ██║ ██╔══╝ ██║╚██╗██║██║██║ ██║██║╚██╔╝██║
# ██║ ██║███████║███████╗███████╗███████╗██║ ╚████║██║╚██████╔╝██║ ╚═╝ ██║
# ╚═╝ ╚═╝╚══════╝╚══════╝╚══════╝╚══════╝╚═╝ ╚═══╝╚═╝ ╚═════╝ ╚═╝ ╚═╝
#
# Last update: July 2018
#
##########################################################
# By the end of the session I want you to be comfortable with
#
# * Connecting to RSelenium
# - Understand basic docker commands
# * Be able to construct a scraper that
# - navigates
# - scrolls
# - interacts with DOM
# - build a scraper framework snippet
# * Use screenshots
# -------------------------------------
# Why we use RSelenium
# -------------------------------------
# RSelenium allows you to carry out unit testing and regression testing on your webapps and webpages across a range of browser/OS combinations
# > Selenium makes our task easy as it can scrape complicated webpages with dynamic content
# > "Human-like" behaviour such as clicking and scrolling
# > FINALLY a stable server instance through docker!
# > The joy when you finally get it working!
# Getting the old boy started
# CRAN recently removed `RSelenium` from the repo, thus it is even more difficult to get your `Selenium` instance up and running in `R`
# We will be using `devtools` to install the necessary dependencies from `github`
devtools::install_github("johndharrison/binman")
devtools::install_github("johndharrison/wdman")
devtools::install_github("ropensci/RSelenium")
# Once you have installed all the packages, remember to load `RSelenium` into your workspace
library(RSelenium)
library(rvest)
library(tidyverse)
# -------------------------------------
# Turning the ignition (docker style)
# -------------------------------------
# RSelenium is notorious for instability and compatibility issues. It is thus amazing that they now have a docker image for headless webdrivers. Running a docker container standardises the build across OSs and removes many of the issues users may have relating to Java/browser version/Selenium version
# > Offers improved stability
# > Greater ease in setting up the Selenium server
# > Quick up and down
# Get your environment setup
# sudo groupadd docker
# sudo usermod -aG docker $USER
# sudo docker pull selenium/standalone-chrome-debug
# Starting your Selenium Server in debug mode
# docker run --name chrome -v /dev/shm:/dev/shm -d -p 4445:4444 -p 5901:5900 selenium/standalone-chrome-debug:latest
# add swap if needed:
# sudo fallocate -l 3G /swapfile
# sudo chmod 600 /swapfile
# sudo mkswap /swapfile
# sudo swapon /swapfile
# sudo cp /etc/fstab /etc/fstab.bak
# sudo docker ps
# * `--name` name your container, otherwise docker will ;-)
# * `-v` mount volume
# * `-d` detached mode
# * `-p` port mapping (external:internal)
# * if on external server: `127.0.0.1:port:port`
# Attach your viewport (TightVNC & Vinagre)
# We can use Virtual Network Computing (VNC) viewers to view what is happening
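# A minimal sketch of attaching the viewer, assuming the default VNC password
# ("secret") of the selenium debug images and the 5901 port mapping used above:
# vncviewer 127.0.0.1:5901
# or, with Vinagre: vinagre 127.0.0.1:5901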
# Finally - RSelenium is operational
# * Quick overview of the tools you will be using
# * Useful functions written in JavaScript
# * Obscure and fun functions
# * Combine it all into a case study
# -------------------------------------
# Open and navigate
# -------------------------------------
library(RSelenium)
# This command sets up a list of the parameters we are going to send to selenium to kick off
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100",
                      port = 4445L,
                      browser = "chrome")
# Notice the strange notation? That's because of the Java-style object$method convention
remDr$open()
# Use method navigate to drive your browser around
remDr$navigate("http://www.google.com")
remDr$navigate("http://www.bing.com")
# Use methods back and forward to jump between pages
remDr$goBack()
remDr$goForward()
# -------------------------------------
# Using keys and Scrolling
# -------------------------------------
# We can send various keys to the Selenium server
RSelenium:::selKeys %>% names()
# Note the notation of the command: object$method(list = "command")
remDr$sendKeysToActiveElement(list(key = "page_down"))
remDr$sendKeysToActiveElement(list(key = "page_up"))
# We can also send JavaScript to the page - this becomes important if you want to know how far down you have scrolled...
remDr$executeScript("return window.scrollY", args = list(1))
remDr$executeScript("return document.body.scrollHeight", args = list(1))
remDr$executeScript("return window.innerHeight", args = list(1))
remDr$executeScript("return window.innerWidth", args = list(1))
remDr$sendKeysToActiveElement(list(key = "home"))
remDr$sendKeysToActiveElement(list(key = "end"))
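# A small sketch combining the calls above: page down and report how far we have
# scrolled relative to the full document height (scroll_down is my own helper name)
scroll_down <- function(remDr){
  remDr$sendKeysToActiveElement(list(key = "page_down"))
  pos    <- remDr$executeScript("return window.scrollY", args = list(1))[[1]]
  height <- remDr$executeScript("return document.body.scrollHeight", args = list(1))[[1]]
  cat("Scrolled to", pos, "of", height, "\n")
  invisible(remDr)
}
scroll_down(remDr)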
# -------------------------------------
# Interacting with the DOM
# -------------------------------------
# The DOM stands for the Document Object Model. It is a cross-platform and language-independent convention for representing and interacting with objects in HTML, XHTML and XML documents. To get the whole DOM:
remDr$getPageSource() %>% .[[1]] %>% read_html()
# To interact with the DOM, we will use the `findElement` method:
# > Search by id, class, selector, xpath
remDr$navigate("http://www.google.com/")
# This is equivalent to html_nodes
webElem <- remDr$findElement(using = 'class', "gsfi")
webElem$highlightElement()
# Having identified the element we want to interact with, we have a couple of methods that we can apply to the object:
webElem$clickElement()
webElem$click(2)
# Cannot interact with objects not on screen
remDr$mouseMoveToLocation(webElement = webElem)
webElem$sendKeysToActiveElement(list(key = 'down_arrow', key = 'down_arrow', key = 'enter'))
webElem$sendKeysToActiveElement(list("Hallo World", key = 'enter'))
# -------------------------------------
# Nice to have functions
# -------------------------------------
remDr$maxWindowSize()
remDr$getTitle()
remDr$screenshot(display = TRUE)
b64out <- remDr$screenshot()
writeBin(RCurl::base64Decode(b64out, "raw"), 'screenshot.png')
# Scroll into view
remDr$executeScript("arguments[0].scrollIntoView(true);", args = list(webElem))
# Building a RSelenium pipe function
# RSelenium has 2 types of commands:
#
# * Those with side-effects (action)
# * Those that returns information we want to push into `rvest`
#
# For the first case, we want to return the driver object, as its state has changed
navi <- function(remDr, site = "www.google.com"){
  remDr$navigate(site)
  return(remDr)
}
remDr %>% navi(., "www.google.com")
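# For the 2nd case (commands that return information), a sketch of a pipe-friendly
# helper that hands the rendered page straight to rvest (read_source is my own name)
read_source <- function(remDr){
  remDr$getPageSource() %>% .[[1]] %>% read_html()
}
remDr %>% navi(., "www.google.com") %>% read_source() %>% html_nodes("a")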
# -------------------------------------
# Case Study: A Tour of the winelands!
# -------------------------------------
## Extending your wine knowledge
# Australia is famous for its wines! Let's find out a little bit more about the wine regions
#
# > * Go to vivino.com
# > * Collect 2 pages worth of information
# > - Name of wine farm, name of wine, star rating, count of ratings
# Display all the wine
library(RSelenium)
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100",
                      port = 4445L,
                      browser = "chrome")
remDr$open()
remDr$navigate("https://www.vivino.com/")
# This piece isolates the button we need to click on to explore wines
webElem <- remDr$findElement("css", '.explore-widget__main__submit__button')
webElem$highlightElement()
webElem$clickElement()
scrollTo <- function(remDr, webElem){
  remDr$executeScript("arguments[0].scrollIntoView(true);", args = list(webElem))
  webElem$highlightElement()
}
# I use xpath here, just because I want to illustrate the handy XPath function starts-with()
# I am trying to isolate where I can fill in the name of the region I am looking to search
webElem <- remDr$findElements("xpath", '//input[starts-with(@class, "filterPills")]')
scrollTo(remDr, webElem[[2]])
webElem[[2]]$clickElement()
webElem[[2]]$sendKeysToActiveElement(list("Australia"))
webElem <- remDr$findElements("css", '.pill__inner--7gfKn')
# How I identify the correct webelem to click on
country_elem <- webElem %>%
  sapply(., function(x) x$getElementText()) %>%
  reduce(c) %>%
  grepl("Australia", .) %>%
  which
scrollTo(remDr, webElem[[country_elem]])
webElem[[country_elem]]$clickElement()
# Some pages need you to scroll to the bottom in order for more content to load. Vivino is one of them
remDr$executeScript("return window.scrollY", args = list(1))
remDr$executeScript("return document.body.scrollHeight", args = list(1))
remDr$sendKeysToActiveElement(list(key = "end"))
remDr$executeScript("return window.scrollY", args = list(1))
# Now we are done with RSelenium, on to rvest!
pg <- remDr$getPageSource() %>% .[[1]] %>%
  read_html()
collect_info <- function(pg){
  farm <- pg %>% html_nodes(".vintageTitle__winery--2YoIr") %>%
    html_text()
  wine <- pg %>% html_nodes(".vintageTitle__wine--U7t9G") %>%
    html_text()
  rating <- pg %>% html_nodes("span.vivinoRating__rating--4Oti3") %>%
    html_text() %>%
    as.numeric
  rating_count <- pg %>% html_nodes("span.vivinoRating__ratingCount--NmiVg") %>%
    html_text() %>%
    gsub("[^0-9]", "", .) %>%
    as.numeric
  data.frame(farm, wine, rating, rating_count)
}
collect_info(pg)
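# The brief asked for 2 pages worth of information - Vivino loads more wines as you
# scroll rather than paginating, so a sketch: scroll further, re-read the source,
# re-run collect_info(), then drop the rows we already had
remDr$sendKeysToActiveElement(list(key = "end"))
Sys.sleep(3) # let the extra wines load
pg2 <- remDr$getPageSource() %>% .[[1]] %>%
  read_html()
bind_rows(collect_info(pg), collect_info(pg2)) %>%
  distinct(farm, wine, .keep_all = TRUE)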
# ------------------------------#
# ███████╗███╗ ██╗██████╗ #
# ██╔════╝████╗ ██║██╔══██╗ #
# █████╗ ██╔██╗ ██║██║ ██║ #
# ██╔══╝ ██║╚██╗██║██║ ██║ #
# ███████╗██║ ╚████║██████╔╝ #
# ╚══════╝╚═╝ ╚═══╝╚═════╝ #
# ------------------------------#
##########################################################
#
# HANJO ODENDAAL
# hanjo.oden@gmail.com
# www.daeconomist.com
# @UbuntR314
# https://github.com/HanjoStudy
#
#
# ██████╗ ██╗ ██╗███████╗███████╗████████╗
# ██╔══██╗██║ ██║██╔════╝██╔════╝╚══██╔══╝
# ██████╔╝██║ ██║█████╗ ███████╗ ██║
# ██╔══██╗╚██╗ ██╔╝██╔══╝ ╚════██║ ██║
# ██║ ██║ ╚████╔╝ ███████╗███████║ ██║
# ╚═╝ ╚═╝ ╚═══╝ ╚══════╝╚══════╝ ╚═╝
#
# Last update: July 2018
#
##########################################################
# -------------------------------------
# Crawl-delay
# -------------------------------------
# Remember, as part of the commandments we must ensure our scraper behaves well and does not try to access all files at once. How do we ensure our scrapers behave well? The `crawl_delay` feature comes into play here:
library(robotstxt)
rtxt <- robotstxt(domain = "www.fbi.gov")
rtxt$comments %>% tbl_df
rtxt$crawl_delay
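# The same package can also tell us whether a given path may be scraped at all -
# a quick sketch using paths_allowed() (the path below is just an example):
robotstxt::paths_allowed(paths = "/wanted/topten", domain = "www.fbi.gov")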
# But what if the robots.txt file we downloaded did not contain any information?
# Then sleep for around 5 - 10 seconds between calls
browseURL("https://i1.wp.com/rud.is/b/wp-content/uploads/2017/07/Cursor_and_RStudio.png?ssl=1")
# What do I mean by sleeping between calls?
# We need to tell `R` not to go absolutely berserk and try to get all the pages we want to investigate at once. We do this using the `Sys.sleep` command. I am going to build a nice function which takes care of this for us
nytnyt <- function(periods = c(1, 1.5)){
  # draw from a uniform distribution a single number between the two bounds
  tictoc <- runif(1, periods[1], periods[2])
  # Use a nice verbose output to communicate your intent
  cat(paste0(Sys.time()), "- Sleeping for", round(tictoc, 2), "seconds\n")
  # Implement the sleeper
  Sys.sleep(tictoc)
}
# Always remember to test!
nytnyt()
# -------------------------------------
# Rvest
# -------------------------------------
# Installing rvest
# Easy har-*vest*-ing of static websites. Welcome to rvest
if(!require(rvest)) install.packages("rvest")
library(rvest)
# If you have used `XML` before, `rvest` is a dish of the same flavour
# Here I read in the result from the website ipify.org
read_html("https://api.ipify.org?format=json")
# Our first rvest function
# Running this command gives us an idea of some of the basic functions of `rvest`. We might always want to check our IP before we start scraping
get_ip <- function(){
  # read in from the website
  read_html("https://api.ipify.org?format=json") %>%
    # convert to text
    html_text() %>%
    # convert from json
    jsonlite::fromJSON()
}
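# Quick test - assuming ipify is reachable, this returns a small object with an ip field
get_ip()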
# Well done, you hacker you!
# -------------------------------------
# Rvest functions: html_table()
# -------------------------------------
getAnywhere("html_table")
methods("html_table")
rvest:::html_table.xml_document
#rvest:::html_table.xml_node
# Lucky for us, we don't need to know what is happening in the background! I am going to explore the Rugby World Cup information from Wikipedia
rugby <- read_html("https://en.wikipedia.org/wiki/Rugby_World_Cup")
# Use html_table to read in the information from the Wikipedia site and be sure to fill ;-)
rugby_tables <- rugby %>% html_table(., fill = T)
# html_table will always return a list object - thus, use View() to have a quick check which list element you need
# In this case I need table 3, I also convert the names to lower case and replace all spaces with '_'
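# If you don't want to eyeball it in the viewer, a quick sketch to check the
# dimensions of each table and spot the one you need
rugby_tables %>% purrr::map(dim)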
correct_names <- function(df){
  df %>% purrr::set_names(., gsub(" ", "_", tolower(names(.))))
}
library(scales)
rugby_tables %>%
  .[[3]] %>%
  correct_names() %>%
  mutate(total_attendance = as.numeric(gsub("[^0-9.-]+", "", total_attendance))) %>%
  ggplot(., aes(year, total_attendance, fill = total_attendance)) +
  geom_bar(stat = "identity") +
  labs(title = "World Cup Rugby Attendance",
       subtitle = "1987 - 2015") +
  scale_y_continuous(label = comma) +
  theme_light()
# -------------------------------------
# Rvest functions: html_nodes()
# -------------------------------------
# Understanding the structure of the DOM and its tree like structure
browseURL("https://bit.ly/2JJcTdv")
# Ok, lets see how we can use the nodes to extract data
# Using the selector gadget, we can identify nodes within the DOM, that we would like to focus on
# using xpath
rugby %>%
  html_nodes(., xpath = '//*[(@id = "toc")]') %>%
  html_text %>%
  cat
# using css
rugby %>%
  html_nodes(., css = 'div#toc.toc') %>%
  html_text %>%
  cat
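# The same node selection pipes into html_attr() if we want the TOC anchors rather
# than the text - a small sketch:
rugby %>%
  html_nodes(., css = '#toc a') %>%
  html_attr("href") %>%
  head()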
# more on xpath:
browseURL("https://bit.ly/2ycoNvd")
browseURL("https://bit.ly/2JBQz9Q")
# -------------------------------------
# Rvest functions: html_session()
# -------------------------------------
# Once you have basic static website scraping down, you need to start learning about sessions. What does this mean?
# cookies
# header requests
# status codes
# In essence you will be simulating browser activity. Do note, it's different from a real browser in that it cannot render JavaScript, but it can simulate moving through static webpages
# So what does a session object contain?
(rugby <- read_html("https://en.wikipedia.org/wiki/Rugby_World_Cup"))
(rugby <- html_session("https://en.wikipedia.org/wiki/Rugby_World_Cup"))
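# A sketch of poking at those pieces, assuming (as in older rvest versions) that the
# session keeps the underlying httr response in rugby$response
rugby$response %>% httr::status_code()
rugby$response %>% httr::headers() %>% head()
rugby$response %>% httr::cookies()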
# There are also some useful linking functions if you know the matching character text
rugby <- rugby %>%
  follow_link("Australia")
# We can also step back through the session history and jump straight to a URL
rugby %>%
  back() %>%
  jump_to("https://en.wikipedia.org/wiki/South_Africa_national_rugby_union_team")
# This becomes useful when you are interacting with websites; lets take a look at forms
# -------------------------------------
# Rvest functions: html_form()
# -------------------------------------
# So to interact with forms, we are going to use `html_session` and `html_form`
rugby <- html_session("https://en.wikipedia.org/wiki/Rugby_World_Cup")
(rugby_form <- rugby %>% html_form())
# You can see that the form is in a list object, so remember to extract the form object from the list
(rugby_form <- rugby %>% html_form() %>% .[[1]])
# Next, we can actually fill in the form using `set_values`
# This can either be done through a very manual process
(rugby_form$fields$search$value <- "cricket")
# Or by using the set_values function
(rugby_form <- set_values(rugby_form, search = "cricket"))
# lastly remember to submit the form!
cricket <- submit_form(rugby, rugby_form)
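# The returned object is again a session, so the rvest node functions apply directly -
# a sketch pulling the result headings (.mw-search-result-heading is my guess at
# Wikipedia's search result markup)
cricket %>%
  html_nodes(".mw-search-result-heading a") %>%
  html_text() %>%
  head()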
# -------------------------------------
# Concluding Rvest
# -------------------------------------
# Rvest is an amazing package for static website scraping and session control. For 90% of the websites out there, rvest will enable you to collect information in a well organised manner. For the other 10% you will need Selenium. In the next sessions we will see how to combine these 2 forces
# -------------------------------------
# Case Study
# -------------------------------------
# Putting it all into practice
# Go onto IMDb and find your favourite film
# Collect all of the 'People who liked this also liked...' movie links
# Collect and plot the gross USA amount of each of those movies using ggplot
# My favourite movie is Amelie
movies <- read_html("https://www.imdb.com/title/tt0211915/?ref_=fn_al_tt_1")
# My plan of action is to build 2 functions to achieve the right results:
# get_recom: Should retrieve the links to all the related movies
# movie_gross: Get the gross income per recommended movie
# Plot using ggplot
get_recom <- function(movies) {
  # First the names of the related movies
  names <- movies %>%
    html_nodes("div.rec_view") %>%
    html_nodes("img") %>%
    html_attr("title")
  # Next the links to the movies
  links <- movies %>%
    html_nodes("div.rec_view") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    .[!is.na(.)] %>%
    paste0("https://www.imdb.com", .) # Don't forget to add the root url
  # lastly combine in a neat df; keep strings as character so the links
  # can be passed to read_html() later
  data.frame(names, links, stringsAsFactors = FALSE)
}
movie_gross <- function(movies) {
  movies %>%
    # I have to use the big txt-block as the text is in the div, not the h4 block
    html_nodes("div.txt-block") %>%
    html_text() %>%
    # I look for the single node that contains the Gross figure using grepl
    .[grepl("Gross USA", .)] %>%
    # Now the regex starts: I isolate the numbers that come after the dollar sign
    gsub(".* \\$(.*), .*", "\\1", .) %>%
    # Then I use regex to keep only the digits
    gsub("[^0-9]", "", .) %>%
    as.numeric
}
# Test my functions
recommendations <- get_recom(movies)
gross <- movie_gross(movies)
# I prefer using lists to store objects, especially because you don't always know what is coming back
# Always run the loop through with i = 1 first
# You don't want to find out you forgot to dynamically assign the index (e.g. hard-coding recommendations[1, 'names']) and end up with the same result every iteration
all_movies <- list()
for (i in 1:nrow(recommendations)) {
  cat("Now collecting gross income for:", recommendations[i, 'names'], "\n")
  all_movies[[i]] <- read_html(recommendations[i, 'links']) %>%
    movie_gross()
  # Remember to be nice and sleep
  nytnyt(c(1, 2))
}
# I will now bind the list using rbind
all_movies %>%
  do.call(rbind, .) %>%
  # braces stop magrittr from also inserting the rbind result as cbind's first argument
  {cbind(recommendations, gross = . / 1e6)} %>%
  ggplot(., aes(reorder(names, gross), gross, fill = gross)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_minimal()
# ------------------------------#
# ███████╗███╗ ██╗██████╗ #
# ██╔════╝████╗ ██║██╔══██╗ #
# █████╗ ██╔██╗ ██║██║ ██║ #
# ██╔══╝ ██║╚██╗██║██║ ██║ #
# ███████╗██║ ╚████║██████╔╝ #
# ╚══════╝╚═╝ ╚═══╝╚═════╝ #
# ------------------------------#