get_page <- function(page_num = 1) {
  # be kind to the web site: it has no robots.txt, so treat this as the
  # default wait time between requests. the desires of the scraper are
  # not greater than those of the site owner, and you'd be abusing
  # their resources if you did not put a delay in between requests.
  Sys.sleep(5)
  # this makes the function requirements self-contained
  suppressPackageStartupMessages({
    require("httr", quietly = TRUE, warn.conflicts = FALSE)
    require("rvest", quietly = TRUE, warn.conflicts = FALSE)
  })
  # this helper just reduces verbosity; it doesn't need to live inside
  # this function, but keeping it here makes the function self-contained
  node_txt <- function(pg, selector) {
    out <- html_nodes(pg, selector)
    html_text(out, trim = TRUE)
  }
  # fetch the page the way a browser would, passing the page number as
  # a query parameter
  httr::GET(
    url = "https://www.proudlysa.co.za/members.php",
    query = list(
      page = page_num
    )
  ) -> res
  # stop & report network or web server errors, if any
  httr::stop_for_status(res)
  pg <- httr::content(res, as = "parsed", encoding = "UTF-8")
  data.frame(
    CompanyName      = node_txt(pg, '.view_data strong'),
    CompanyReference = node_txt(pg, '.pricing-list li:nth-child(2)'),
    ContactPerson    = gsub("Contact: ", "", node_txt(pg, '.pricing-list li:nth-child(3)')),
    EmailAddress     = node_txt(pg, '.fa-user+ a'),
    PhoneNumber      = node_txt(pg, '.pricing-list li:nth-child(5)'),
    Province         = node_txt(pg, '.pricing-list li:nth-child(6)'),
    stringsAsFactors = FALSE
  )
  # ideally, you'd add some other code to create a cache directory,
  # store the original page HTML files in that cache, and fall back to
  # the cache if you hit errors during scraping, failed to grab some
  # data, or lost the data and want to scrape again (a minimal sketch
  # of that idea follows this function). that's not a requirement, but
  # it shows respect for the site and saves you time later.
}
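# below is a minimal sketch of the caching idea mentioned in the comments
# above; the cache directory name, the file-naming scheme, and the
# cached_get() helper are illustrative assumptions rather than part of
# the original code. it downloads a page only if its HTML is not already
# on disk, then parses whichever copy it has.
cached_get <- function(page_num = 1, cache_dir = "members-cache") {
  if (!dir.exists(cache_dir)) dir.create(cache_dir)
  cache_file <- file.path(cache_dir, sprintf("page-%03d.html", page_num))
  if (!file.exists(cache_file)) {
    # same politeness delay as get_page()
    Sys.sleep(5)
    httr::GET(
      url = "https://www.proudlysa.co.za/members.php",
      query = list(page = page_num)
    ) -> res
    httr::stop_for_status(res)
    writeLines(httr::content(res, as = "text", encoding = "UTF-8"), cache_file)
  }
  xml2::read_html(cache_file)
}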
# one primer call to get the total # of pages
# (attach rvest here, since get_page() may not have been called yet)
library(rvest)
doc <- read_html("https://www.proudlysa.co.za/members.php")
total_pages <- as.numeric(
  gsub("[^[:digit:]]", "", tail(html_attr(html_nodes(doc, "a[href^='?page']"), "href"), 1))
)
do.call(
  rbind.data.frame,
  lapply(1:4, get_page) # change this to 1:total_pages; I was not about to wait
) -> xdf
str(xdf)
## 'data.frame':    48 obs. of  6 variables:
##  $ CompanyName     : chr  "1000 Beautiful Bracelets" "1Bigdoor Entertainment Group" "2Cana Solutions" "3M South Africa (PTY) Ltd" ...
##  $ CompanyReference: chr  "C02879" "C02868" "C00395" "C02791" ...
##  $ ContactPerson   : chr  "Valerie Pole" "Walton Patrick" "Veena Rugbar" "Dawn Isdale" ...
##  $ EmailAddress    : chr  "valerie.pole@gmail.com" "info@1bigdoor.com" "veena.rugbar@2cana.co.za" "disdale@mmm.com" ...
##  $ PhoneNumber     : chr  "715 283 083" "082-233 4436" "031-583 3200" "118 062 000" ...
##  $ Province        : chr  "Gauteng" "Western Cape" "Kwazulu-Natal" "Gauteng" ...
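# not part of the original: persisting the combined data frame so a lost
# session doesn't force a re-scrape; the file name here is an assumption
saveRDS(xdf, "proudlysa-members.rds")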