# arvi1000/barug_scrape.R

## barug_scrape.R
# Batch download BARUG docs, so as to move from Meetup to Github

library(rvest)

# 1) build dataframe of download targets, with url and date ----

# Scrape one page of the BARUG file listing on Meetup.
#
# Args:
#   offset: value pasted into the `offset` query parameter of the listing url
#           (web shows 5 pages @ offsets 0, 25, 50, 75, 100)
#
# Returns:
#   data.frame with columns `links` (href of each titular anchor) and
#   `dates` (text of the 5th table column), one row per scraped anchor.
#   Stops with an error if the number of links and dates differ.
scrapeBarug <- function(offset) {
  # url for a page containing table of BARUG docs, sorted by date descending
  url <- paste0('https://www.meetup.com/R-Users/files/?offset=', offset,
                '&sortBy=date&sortOrder=desc')

  this_page <- read_html(url)

  links <- this_page %>%
    html_nodes('.leading-bottom a') %>% # get 'titular anchors'
    html_attr('href') # and extract href value
  dates <- this_page %>%
    html_nodes('td:nth-child(5)') %>% # get cells for 5th column
    html_text() # extract text (i.e. date string)

  # data.frame() silently recycles the shorter vector when the lengths divide
  # evenly, which would pair links with the wrong dates; fail loudly instead
  if (length(links) != length(dates)) {
    stop('Scraped ', length(links), ' links but ', length(dates),
         ' dates at offset ', offset, call. = FALSE)
  }

  # spell out FALSE: `F` is an ordinary variable that can be reassigned
  data.frame(links = links, dates = dates, stringsAsFactors = FALSE)
}

# scrape each of the 5 listing pages and stack the per-page results
doc_targets <- data.table::rbindlist(
  lapply(seq(0, 100, 25), scrapeBarug)
)

# parse the scraped date strings as month-day-year, then keep them as text
doc_targets$std_date <- as.character(lubridate::mdy(doc_targets$dates))

# 2) download all files ----

# Download every scraped document into a directory named after its
# standardized date, creating that directory when needed.
# (purrr::walk would also work here, but a plain for loop is fine)
for (i in seq_len(nrow(doc_targets))) { # seq_len: no iterations when 0 rows
  src_link <- doc_targets$links[i]
  targ_dir <- doc_targets$std_date[i]

  # a missing href or an unparsed date cannot be turned into a valid path
  if (is.na(src_link) || is.na(targ_dir)) {
    warning('Skipping row ', i, ': missing link or date', call. = FALSE)
    next
  }

  # the directory may already exist (e.g. when several docs share a date)
  dir.create(targ_dir, showWarnings = FALSE)

  # a dead link should not abort the remaining downloads: report and move on
  tryCatch(
    # mode = 'wb' keeps binary files intact on Windows
    download.file(src_link, file.path(targ_dir, basename(src_link)),
                  mode = 'wb'),
    error = function(e) {
      warning('Failed to download ', src_link, ': ', conditionMessage(e),
              call. = FALSE)
    }
  )
}

# Note: one file from Apr 15, 2015 is missing because its link returns a 404.
	# Batch download BARUG docs, so as to move from Meetup to Github

	library(rvest)

	# 1) build dataframe of download targets, with url and date ----

	# Scrape one page of the BARUG file listing on Meetup.
	# NOTE(review): this repeats the scrapeBarug definition that appears earlier
	# in this file -- confirm whether the duplicate is intended.
	#
	# Args:
	#   offset: value pasted into the `offset` query parameter of the listing url
	#           (web shows 5 pages @ offsets 0, 25, 50, 75, 100)
	#
	# Returns:
	#   data.frame with columns `links` (href of each titular anchor) and
	#   `dates` (text of the 5th table column), one row per scraped anchor.
	#   Stops with an error if the number of links and dates differ.
	scrapeBarug <- function(offset) {
	  # url for a page containing table of BARUG docs, sorted by date descending
	  url <- paste0('https://www.meetup.com/R-Users/files/?offset=', offset,
	                '&sortBy=date&sortOrder=desc')

	  this_page <- read_html(url)

	  links <- this_page %>%
	    html_nodes('.leading-bottom a') %>% # get 'titular anchors'
	    html_attr('href') # and extract href value
	  dates <- this_page %>%
	    html_nodes('td:nth-child(5)') %>% # get cells for 5th column
	    html_text() # extract text (i.e. date string)

	  # data.frame() silently recycles the shorter vector when the lengths divide
	  # evenly, which would pair links with the wrong dates; fail loudly instead
	  if (length(links) != length(dates)) {
	    stop('Scraped ', length(links), ' links but ', length(dates),
	         ' dates at offset ', offset, call. = FALSE)
	  }

	  # spell out FALSE: `F` is an ordinary variable that can be reassigned
	  data.frame(links = links, dates = dates, stringsAsFactors = FALSE)
	}

	# scrape each of the 5 listing pages and stack the per-page results
	# NOTE(review): this repeats the doc_targets construction that appears
	# earlier in this file -- confirm whether the duplicate is intended.
	doc_targets <- data.table::rbindlist(
	  lapply(seq(0, 100, 25), scrapeBarug)
	)

	# parse the scraped date strings as month-day-year, then keep them as text
	doc_targets$std_date <- as.character(lubridate::mdy(doc_targets$dates))

	# 2) download all files ----

	# Download every scraped document into a directory named after its
	# standardized date, creating that directory when needed.
	# (purrr::walk would also work here, but a plain for loop is fine)
	# NOTE(review): this repeats the download loop that appears earlier in this
	# file -- confirm whether the duplicate is intended.
	for (i in seq_len(nrow(doc_targets))) { # seq_len: no iterations when 0 rows
	  src_link <- doc_targets$links[i]
	  targ_dir <- doc_targets$std_date[i]

	  # a missing href or an unparsed date cannot be turned into a valid path
	  if (is.na(src_link) || is.na(targ_dir)) {
	    warning('Skipping row ', i, ': missing link or date', call. = FALSE)
	    next
	  }

	  # the directory may already exist (e.g. when several docs share a date)
	  dir.create(targ_dir, showWarnings = FALSE)

	  # a dead link should not abort the remaining downloads: report and move on
	  tryCatch(
	    # mode = 'wb' keeps binary files intact on Windows
	    download.file(src_link, file.path(targ_dir, basename(src_link)),
	                  mode = 'wb'),
	    error = function(e) {
	      warning('Failed to download ', src_link, ': ', conditionMessage(e),
	              call. = FALSE)
	    }
	  )
	}

	# Note: one file from Apr 15, 2015 is missing because its link returns a 404.