Last active
October 19, 2018 13:48
-
-
Save hrbrmstr/e5f1e7ff2a347c6876d118f55b521aa6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
get_page <- function(page_num = 1) {
  # Scrape one page of the Proudly SA members directory.
  #
  # @param page_num integer page number of the members listing (default 1).
  # @return data.frame (stringsAsFactors = FALSE) with one row per member
  #   and columns CompanyName, CompanyReference, ContactPerson,
  #   EmailAddress, PhoneNumber, Province.

  # Be kind to the web site: it has no robots.txt, so a conservative
  # delay between requests is the respectful default. The desires of
  # the scraper are not greater than those of the site owner, and
  # hammering the server without a delay would abuse their resources.
  Sys.sleep(5)

  # Make the function self-contained. library() (unlike require())
  # errors immediately if a package is missing, instead of returning
  # FALSE and letting the code fail later with a confusing message.
  suppressPackageStartupMessages({
    library(httr, quietly = TRUE, warn.conflicts = FALSE)
    library(rvest, quietly = TRUE, warn.conflicts = FALSE)
  })

  # Small helper: trimmed text content of every node matching a CSS
  # selector. Defined here (rather than at file level) to keep the
  # function self-contained.
  node_txt <- function(pg, selector) {
    html_text(html_nodes(pg, selector), trim = TRUE)
  }

  # Fetch the requested page, acting like a normal web client.
  res <- httr::GET(
    url = "https://www.proudlysa.co.za/members.php",
    query = list(
      page = page_num
    )
  )

  # Stop & report network or web-server errors, if any.
  httr::stop_for_status(res)

  pg <- httr::content(res, as = "parsed", encoding = "UTF-8")

  # One column per field of the member listing; the nth-child
  # selectors follow the page's .pricing-list markup.
  data.frame(
    CompanyName      = node_txt(pg, '.view_data strong'),
    CompanyReference = node_txt(pg, '.pricing-list li:nth-child(2)'),
    ContactPerson    = gsub("Contact: ", "", node_txt(pg, '.pricing-list li:nth-child(3)')),
    EmailAddress     = node_txt(pg, '.fa-user+ a'),
    PhoneNumber      = node_txt(pg, '.pricing-list li:nth-child(5)'),
    Province         = node_txt(pg, '.pricing-list li:nth-child(6)'),
    stringsAsFactors = FALSE
  )

  # Ideally, you'd add code to create a cache directory, store the
  # original page HTML files there, and use the cache if you hit
  # errors during scraping, failed to grab some data, or lost the
  # data and want to scrape again. Not a requirement, but it shows
  # respect for the site and saves you time later.
}
# One primer call to get the total number of pages from the pagination
# links on the first page. rvest must be attached explicitly here:
# get_page() only loads it when called, and read_html() runs first.
suppressPackageStartupMessages(library(rvest))

doc <- read_html("https://www.proudlysa.co.za/members.php")

# The last "?page=N" link carries the highest page number; strip every
# non-digit character before converting. (Note: the class is
# "[^[:digit:]]" — a stray extra "[" here would also exclude literal
# "[" characters from removal.)
last_page_href <- tail(html_attr(html_nodes(doc, "a[href^='?page']"), "href"), 1)
total_pages <- as.numeric(gsub("[^[:digit:]]", "", last_page_href))

# Scrape the first few pages and row-bind them into one data.frame.
# Change seq_len(4) to seq_len(total_pages) for a full scrape — kept
# small here as the author was not able to wait for all pages.
xdf <- do.call(
  rbind.data.frame,
  lapply(seq_len(4), get_page)
)

str(xdf)
## 'data.frame': 48 obs. of 6 variables:
## $ CompanyName : chr "1000 Beautiful Bracelets" "1Bigdoor Entertainment Group" "2Cana Solutions" "3M South Africa (PTY) Ltd" ...
## $ CompanyReference: chr "C02879" "C02868" "C00395" "C02791" ...
## $ ContactPerson : chr "Valerie Pole" "Walton Patrick" "Veena Rugbar" "Dawn Isdale" ...
## $ EmailAddress : chr "valerie.pole@gmail.com" "info@1bigdoor.com" "veena.rugbar@2cana.co.za" "disdale@mmm.com" ...
## $ PhoneNumber : chr "715 283 083" "082-233 4436" "031-583 3200" "118 062 000" ...
## $ Province : chr "Gauteng" "Western Cape" "Kwazulu-Natal" "Gauteng" ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment