get_page <- function(page_num = 1) {
  # be kind to the web site: it has no robots.txt, so treat this as the
  # default wait time between requests. the desires of the scraper are
  # not greater than those of the site owner, and you'd be abusing
  # their resources if you did not put a delay in between requests.
  Sys.sleep(5)
  # this makes the function requirements self-contained
  suppressPackageStartupMessages({
    require("httr", quietly = TRUE, warn.conflicts = FALSE)
    require("rvest", quietly = TRUE, warn.conflicts = FALSE)
  })
  # this helper just reduces verbosity; it doesn't need to live inside
  # this function, but keeping it here makes the function self-contained
  node_txt <- function(pg, selector) {
    out <- html_nodes(pg, selector)
    html_text(out, trim = TRUE)
  }
  # fetch the page the way a browser would, passing the page number as
  # a query parameter
  httr::GET(
    url = "https://www.proudlysa.co.za/members.php",
    query = list(
      page = page_num
    )
  ) -> res
  # stop & report network or web server errors, if any
  httr::stop_for_status(res)
  pg <- httr::content(res, as = "parsed", encoding = "UTF-8")
  data.frame(
    CompanyName      = node_txt(pg, '.view_data strong'),
    CompanyReference = node_txt(pg, '.pricing-list li:nth-child(2)'),
    ContactPerson    = gsub("Contact: ", "", node_txt(pg, '.pricing-list li:nth-child(3)')),
    EmailAddress     = node_txt(pg, '.fa-user+ a'),
    PhoneNumber      = node_txt(pg, '.pricing-list li:nth-child(5)'),
    Province         = node_txt(pg, '.pricing-list li:nth-child(6)'),
    stringsAsFactors = FALSE
  )
  # ideally, you'd add some other code to create a cache directory,
  # store the original page HTML files in that cache, and fall back to
  # the cache if you hit errors during scraping, failed to grab some
  # data, or lost the data and want to scrape again (a minimal sketch
  # of that idea follows this function). that's not a requirement, but
  # it shows respect for the site and saves you time later.
}
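# below is a minimal sketch of the caching idea mentioned in the comments
# above; the cache directory name, the file-naming scheme, and the
# cached_get() helper are illustrative assumptions rather than part of
# the original code. it downloads a page only if its HTML is not already
# on disk, then parses whichever copy it has.
cached_get <- function(page_num = 1, cache_dir = "members-cache") {
  if (!dir.exists(cache_dir)) dir.create(cache_dir)
  cache_file <- file.path(cache_dir, sprintf("page-%03d.html", page_num))
  if (!file.exists(cache_file)) {
    # same politeness delay as get_page()
    Sys.sleep(5)
    httr::GET(
      url = "https://www.proudlysa.co.za/members.php",
      query = list(page = page_num)
    ) -> res
    httr::stop_for_status(res)
    writeLines(httr::content(res, as = "text", encoding = "UTF-8"), cache_file)
  }
  xml2::read_html(cache_file)
}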
# one primer call to get the total # of pages
# (attach rvest here, since get_page() may not have been called yet)
library(rvest)
doc <- read_html("https://www.proudlysa.co.za/members.php")
total_pages <- as.numeric(
  gsub("[^[:digit:]]", "", tail(html_attr(html_nodes(doc, "a[href^='?page']"), "href"), 1))
)
do.call(
  rbind.data.frame,
  lapply(1:4, get_page) # change this to 1:total_pages; I was not about to wait
) -> xdf
str(xdf)
## 'data.frame':    48 obs. of  6 variables:
##  $ CompanyName     : chr  "1000 Beautiful Bracelets" "1Bigdoor Entertainment Group" "2Cana Solutions" "3M South Africa (PTY) Ltd" ...
##  $ CompanyReference: chr  "C02879" "C02868" "C00395" "C02791" ...
##  $ ContactPerson   : chr  "Valerie Pole" "Walton Patrick" "Veena Rugbar" "Dawn Isdale" ...
##  $ EmailAddress    : chr  "valerie.pole@gmail.com" "info@1bigdoor.com" "veena.rugbar@2cana.co.za" "disdale@mmm.com" ...
##  $ PhoneNumber     : chr  "715 283 083" "082-233 4436" "031-583 3200" "118 062 000" ...
##  $ Province        : chr  "Gauteng" "Western Cape" "Kwazulu-Natal" "Gauteng" ...
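# not part of the original: persisting the combined data frame so a lost
# session doesn't force a re-scrape; the file name here is an assumption
saveRDS(xdf, "proudlysa-members.rds")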