Skip to content

Instantly share code, notes, and snippets.

@hrbrmstr

hrbrmstr/scrape.R

Last active Oct 19, 2018
Embed
What would you like to do?
#' Scrape one page of the Proudly SA member directory
#'
#' Waits `delay` seconds, requests `members.php` with `page = page_num` as
#' the query string, and extracts one row per listed member using CSS
#' selectors.
#'
#' @param page_num Page number sent as the `page` query parameter. Must be
#'   a single non-missing value.
#' @param delay Seconds to sleep before issuing the request. The default
#'   of 5 is a polite wait between requests, since the site publishes no
#'   robots.txt to state its own preference.
#' @return A data.frame of character columns `CompanyName`,
#'   `CompanyReference`, `ContactPerson`, `EmailAddress`, `PhoneNumber`
#'   and `Province`.
#' @details Stops with an error when an argument is invalid, when httr or
#'   rvest is not installed, when the server reports an HTTP error, or when
#'   the selectors return differing numbers of matches (which would
#'   otherwise misalign the columns).
get_page <- function(page_num = 1, delay = 5) {
  # validate first so bad input fails before any waiting or network traffic
  if (length(page_num) != 1L || is.na(page_num)) {
    stop("`page_num` must be a single non-missing value", call. = FALSE)
  }
  if (!is.numeric(delay) || length(delay) != 1L || is.na(delay) ||
      delay < 0) {
    stop("`delay` must be a single non-negative number", call. = FALSE)
  }

  # require() merely returns FALSE when a package is missing, which defers
  # the failure to a confusing "could not find function" error; check the
  # namespaces explicitly and call everything as pkg::fun() instead. This
  # also keeps the function self-contained without attaching packages to
  # the caller's search path.
  for (pkg in c("httr", "rvest")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("package '", pkg, "' is required but not installed",
           call. = FALSE)
    }
  }

  # be kind to the web site: pause before every request so the scraper
  # does not abuse the owner's resources
  Sys.sleep(delay)

  # small local helper to reduce verbosity: trimmed text of every node
  # matching `selector`
  node_txt <- function(pg, selector) {
    rvest::html_text(rvest::html_nodes(pg, selector), trim = TRUE)
  }

  # fetch the requested page (plain GET; no browser headers are set)
  res <- httr::GET(
    url = "https://www.proudlysa.co.za/members.php",
    query = list(page = page_num)
  )

  # stop & report network or web server errors if any
  httr::stop_for_status(res)

  pg <- httr::content(res, as = "parsed", encoding = "UTF-8")

  contact <- node_txt(pg, ".pricing-list li:nth-child(3)")

  fields <- list(
    CompanyName = node_txt(pg, ".view_data strong"),
    CompanyReference = node_txt(pg, ".pricing-list li:nth-child(2)"),
    ContactPerson = gsub("Contact: ", "", contact, fixed = TRUE),
    EmailAddress = node_txt(pg, ".fa-user+ a"),
    PhoneNumber = node_txt(pg, ".pricing-list li:nth-child(5)"),
    Province = node_txt(pg, ".pricing-list li:nth-child(6)")
  )

  # data.frame() silently recycles shorter columns when the lengths divide
  # evenly, which would pair values from different members; refuse to
  # build a misaligned table
  counts <- lengths(fields)
  if (length(unique(counts)) > 1L) {
    stop(
      "selectors matched differing numbers of nodes on page ", page_num,
      ": ", paste0(names(counts), "=", counts, collapse = ", "),
      call. = FALSE
    )
  }

  # ideally, you'd add some other code to create a cache directory,
  # store the original page HTML files in that cache and use the
  # cache in the event you do hit errors during scraping or failed
  # to grab some data or lost the data and want to scrape again.
  # that's not a requirement, but it's showing respect for the site
  # and saving you time later.
  data.frame(fields, stringsAsFactors = FALSE)
}
# One primer call to get the total number of pages.
# The rvest functions are namespace-qualified because nothing has attached
# rvest at this point: get_page() has only been defined, not yet called.
doc <- rvest::read_html("https://www.proudlysa.co.za/members.php")
page_links <- rvest::html_attr(
  rvest::html_nodes(doc, "a[href^='?page']"),
  "href"
)
# Keep only the digits of the last "?page..." link; presumably that link
# points at the final page (verify against the site's pagination markup).
total_pages <- as.numeric(gsub("[^[:digit:]]", "", tail(page_links, 1)))

# Only the first four pages are fetched here because the author was not
# about to wait for the full run; change seq_len(4L) to
# seq_len(total_pages) to scrape every page.
xdf <- do.call(
  rbind.data.frame,
  lapply(seq_len(4L), get_page)
)
str(xdf)
## 'data.frame': 48 obs. of 6 variables:
## $ CompanyName : chr "1000 Beautiful Bracelets" "1Bigdoor Entertainment Group" "2Cana Solutions" "3M South Africa (PTY) Ltd" ...
## $ CompanyReference: chr "C02879" "C02868" "C00395" "C02791" ...
## $ ContactPerson : chr "Valerie Pole" "Walton Patrick" "Veena Rugbar" "Dawn Isdale" ...
## $ EmailAddress : chr "valerie.pole@gmail.com" "info@1bigdoor.com" "veena.rugbar@2cana.co.za" "disdale@mmm.com" ...
## $ PhoneNumber : chr "715 283 083" "082-233 4436" "031-583 3200" "118 062 000" ...
## $ Province : chr "Gauteng" "Western Cape" "Kwazulu-Natal" "Gauteng" ...
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment