@yihui · Created July 3, 2017 19:07
The R script I used to scrape the old RViews site (WordPress)
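In outline: page_post_links() reads one archive page and returns the URLs of the posts on it, post_links() pages through the whole archive until a page yields no links, and page_md() downloads one post, converts its HTML body to Markdown with Pandoc, and writes a .md file with YAML front matter. The short if (FALSE) blocks below are usage sketches added for illustration; they are not part of the original script.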
library(xml2)

# read one page of the blog archive and extract the links to all posts on it
page_post_links = function(
  page = 1, baseurl = 'https://www.rstudio.com/rviews',
  xpath = '//h2[@class="entry-title"]/a'
) {
  html = read_html(sprintf('%s/page/%d/', baseurl, page))
  xml_attr(xml_find_all(html, xpath), 'href')
}
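
# Hypothetical sanity check (not part of the original script): fetch the post
# links on the first archive page; the XPath above assumes the old WordPress
# theme wraps each post title in <h2 class="entry-title"><a href=...>.
if (FALSE) {
  links1 = page_post_links(1)
  head(links1)
}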
# walk through the archive pages until a page contains no more post links
post_links = function(...) {
  all_links = character()
  i = 1
  repeat {
    message('Reading page ', i)
    links = page_post_links(i, ...)
    if (length(links) == 0) break
    all_links = c(all_links, links)
    i = i + 1
  }
  all_links
}
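
# Hedged usage sketch (added for illustration): collect the URLs of all posts;
# post_links() stops at the first archive page that yields no post links.
if (FALSE) {
  all_posts = post_links()
  length(all_posts)
}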
# download a single post and convert it to a Markdown file with YAML metadata
page_md = function(link, dir = 'post') {
  html = read_html(link)
  # extract the text (or an attribute) of the first node matching an XPath
  get_content = function(xpath, extract = xml_text, ..., quote = TRUE) {
    res = trimws(extract(xml_find_first(html, xpath), ...))
    if (quote) sprintf('"%s"', res) else res
  }
  # extract the substring captured by `reg` from the nodes matching an XPath
  get_sub = function(xpath, reg) {
    x = grep(reg, xml_text(xml_find_all(html, xpath)), value = TRUE)
    if (length(x) > 0) gsub(reg, '\\1', x[1]) else character()
  }
  # convert the post body from HTML to Markdown via Pandoc
  get_body = function() {
    content = xml_find_first(html, '//div[@class="post-content"]')
    content_children = xml_children(content)
    # drop a leading empty node and a "by AUTHOR" byline, if present
    if (length(content_children) > 2) {
      if (xml_text(content_children[1]) == '') xml_remove(content_children[1])
      if (grepl('^by ', xml_text(content_children[2]))) xml_remove(content_children[2])
    }
    tmp = tempfile('tmp', '.', '.html')
    tmp_md = blogdown:::with_ext(tmp, 'md')
    on.exit(unlink(c(tmp, tmp_md)), add = TRUE)
    # strip the outer <div> wrapper and write the inner HTML to a temp file
    blogdown:::writeUTF8(
      gsub('^<div [^>]+>\\s*|\\s*</div>\\s*$', '', as.character(content)), tmp
    )
    system2('pandoc', c(tmp, '-o', tmp_md, '--wrap=none'))
    # remove leftover <div>/<span> tags that Pandoc did not translate
    res = gsub(
      '\\s*<(div|span)[^>]*?>\\s*|\\s*</(div|span)>\\s*', '',
      blogdown:::readUTF8(tmp_md)
    )
    # normalize Pandoc's attribute fences to plain ```r fences
    res = gsub('``` {.sourceCode .r}', '```r', res, fixed = TRUE)
    # remove consecutive empty lines
    gsub('\\s*\n\n+\\s*', '\n\n', paste(trimws(res), collapse = '\n'))
  }
  meta = unlist(list(
    title = get_content('//h2[@class="entry-title"]'),
    author = get_content('//a[@rel="author"]'),
    date = get_content('//span[@class="updated"]', quote = FALSE),
    slug = basename(link),
    categories = sprintf(
      '[%s]', get_sub('//script[text()]', '.*?"category":"([^"]+)".*')
    ),
    tags = NULL#,
    # description = gsub(
    #   '(^by .+?\r\n)|\n+|\r+', '',
    #   get_content('//meta[@property="og:description"]', xml_attr, 'content')
    # )
  ))
  if (!dir.exists(dir)) dir.create(dir)
  # write the YAML front matter followed by the Markdown body
  blogdown:::writeUTF8(
    c('---', sprintf('%s: %s', names(meta), meta), '---', '', get_body()),
    file.path(dir, sprintf('%s-%s.md', as.Date(meta['date']), meta['slug']))
  )
}
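
# Hypothetical smoke test (URL shape assumed, not from the source): convert a
# single post before running the full loop; on success this writes a file
# like post/2017-01-01-some-post.md.
if (FALSE) {
  page_md('https://www.rstudio.com/rviews/2017/01/01/some-post/')
}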
for (link in post_links()) {
  message('Converting ', link)
  page_md(link)
}
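
Note: running this requires pandoc on the PATH plus the xml2 and blogdown packages; writeUTF8(), readUTF8(), and with_ext() are internal blogdown helpers accessed via :::, and the XPath expressions are specific to the markup of the old WordPress theme.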