# mbarnkob/2016-Nature-Journal-Scrape-Capti.R

## 2016-Nature-Journal-Scrape-Capti.R
# Script to convert Nature articles into Capti text files

# A script that scrapes Nature Immunology "News and Views", "Research Highlights" and
# "Perspective" articles and converts them to a text format that's easier to listen to
# in Capti (https://www.captivoice.com/capti-site/). It does the following:

# 1. Adds "Title -" in front of the title, "By -" in front of the author, "Abstract -" in front of the abstract and "Text -" in front of the text
# 2. Removes references and website links
# 3. Removes figure texts.

# Version 1
# Mike Barnkob, 2016-02-01.
# License: CC BY-NC 4.0 - please share, remix, and credit.
# Info: http://mikebarnkob.dk/2016/an-r-script-for-scraping-news-from-nature-journals-to-capti-narrator

# References
# 1.  http://www.regular-expressions.info/examples.html
# 2. http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/

# SETUP

# Working directory
# NOTE: this path is specific to the original author's machine; the output
# text file is written relative to it. Adjust before running elsewhere.
setwd("~/Dropbox/Projekter/2016 - Scrape Nature Journals for Capti")

# Load libraries
# requireNamespace() only checks whether a package is installed (without
# attaching it and without require()'s warning); missing packages are
# installed first, then every package is attached with library(), which
# fails loudly if the installation did not succeed.
for (pkg in c("XML", "rvest", "stringr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# SCRAPE

# Find all articles in "News and Views", "Research Highlights" and "Perspective"
# on the index page of one journal issue and build the list of URLs to scrape.
proxy_link <- "http://ezproxy-prd.bodleian.ox.ac.uk:2076"   # Use your university's proxy - remember to log in first!
current_issue <- "/ni/journal/v17/n2/index.html"            # Link to Nature Immunology - might work with other Nature journals
html_link <- paste0(proxy_link, current_issue)

# Download and parse the index page once; all three sections are read from it.
index_page <- read_html(html_link)

# Turn relative hrefs into absolute proxy URLs.
# Missing hrefs (NA) are dropped so they cannot become ".../NA"-style URLs, and
# an empty input yields an empty result (a plain paste0() would recycle the
# empty vector and return the bare proxy URL instead).
absolute_links <- function(hrefs) {
  hrefs <- hrefs[!is.na(hrefs)]
  if (length(hrefs) == 0) {
    return(character(0))
  }
  paste0(proxy_link, hrefs)
}

news_and_views <- index_page %>%
  html_node("#nv") %>%
  html_nodes(".fulltext") %>%
  html_attr("href") %>%
  absolute_links()

research_highlights <- index_page %>%
  html_node("#rhighlts") %>%
  html_nodes(".atl") %>%
  html_nodes("a") %>%
  html_attr("href") %>%
  absolute_links()

# NOTE(review): the comma makes this a selector group, i.e. the first node that
# matches "#pe" OR ".fulltext". A descendant selector ("#pe .fulltext") may have
# been intended - confirm against the page before changing it.
perspective <- index_page %>%
  html_node("#pe , .fulltext") %>%
  html_attr("href") %>%
  absolute_links()

# List of all articles to scrape; unique() avoids scraping the same article twice
article_list <- unique(c(news_and_views, research_highlights, perspective))

if (length(article_list) == 0) {
  stop("No article links found on ", html_link,
       " - are you logged in to the proxy?", call. = FALSE)
}

# LOOP THROUGH ARTICLE LIST
# For every article URL: download the page, pull out title, author, optional
# abstract and main text, prefix each part with a spoken label, and append the
# result to all_together_now.

article_complete <- c()
all_together_now <- c()

for (i in seq_along(article_list)) {      # seq_along() does not iterate when the list is empty, unlike 1:length()
  article <- read_html(article_list[i])   # download html page

  title <- article %>%                    # scrape title of article
    html_node(".article-heading") %>%
    html_text()
  title <- paste("\n\nTitle -", title)

  author <- article %>%                   # scrape author of article (first ".fn" match only)
    html_node(".fn") %>%
    html_text()
  author <- paste("\n\nBy -", author)

  abstract <- article %>%                 # scrape abstract, if available
    html_nodes(".standfirst") %>%
    html_text()
  if (length(abstract) > 0) {
    abstract <- paste("\n\nAbstract -", abstract)
  }

  # Main text: raw HTML of the second ".content" node, cleaned step by step.
  # The order matters: paragraph markers are inserted before the tags are stripped.
  content_nodes <- html_nodes(article, ".content")
  txt <- as.character(content_nodes[2])   # explicit node set -> HTML string conversion

  # Remove all <sup> elements (ie references) - http://stackoverflow.com/questions/33970549/remove-all-specific-html-tags-using-gsub-r
  txt <- gsub("<sup\\b[^<]*>[^<]*(?:<(?!/sup>)[^<]*)*</sup>", "", txt, perl = TRUE)
  txt <- str_replace_all(txt, "\t", "")       # Remove all \t
  txt <- gsub('[\n]', '', txt)                # Remove all \n
  # Remove parenthesised text (this is how inline urls are dropped; note that
  # any other "(...)" content is removed as well)
  txt <- gsub("\\s*\\([^\\)]+\\)", "", txt)
  txt <- str_replace_all(txt, "</p><p>", "\n\n New paragraph \n")   # Announce paragraph breaks
  txt <- gsub("<.*?>", "", txt)               # Strip all html tags - http://stackoverflow.com/questions/17227294/removing-html-tags-from-a-string-in-r
  txt <- gsub("\\s*\\([^\\)]+\\)", "", txt)   # Second pass: parentheses that only became contiguous after tag stripping
  txt <- paste("\n\nText -", txt)

  # The abstract is only included when one was found
  article_complete <- paste(title, author, if (length(abstract) > 0) { abstract }, txt)
  all_together_now <- paste(all_together_now, "\n\n Next article", article_complete)
}

# SAVE TEXT FILE
# Write the collected text to disk (elements separated by newlines) and open
# the resulting file for inspection.
output_file <- "Nature Immunology News.txt"
cat(all_together_now, file = output_file, sep = "\n")
file.show(output_file)
	# Script to convert Nature articles into Capti text files

	# A script that scrapes Nature Immunology "News and Views", "Research Highlights" and
	# "Perspective" articles and converts them to a text format that's easier to listen to
	# in Capti (https://www.captivoice.com/capti-site/). It does the following:

	# 1. Adds "Title -" in front of the title, "By -" in front of the author, "Abstract -" in front of the abstract and "Text -" in front of the text
	# 2. Removes references and website links
	# 3. Removes figure texts.

	# Version 1
	# Mike Barnkob, 2016-02-01.
	# License: CC BY-NC 4.0 - please share, remix, and credit.
	# Info: http://mikebarnkob.dk/2016/an-r-script-for-scraping-news-from-nature-journals-to-capti-narrator

	# References
	# 1. http://www.regular-expressions.info/examples.html
	# 2. http://www.cheatography.com/davechild/cheat-sheets/regular-expressions/

	# SETUP

	# Working directory
	# NOTE: this path is specific to the original author's machine; the output
	# text file is written relative to it. Adjust before running elsewhere.
	setwd("~/Dropbox/Projekter/2016 - Scrape Nature Journals for Capti")

	# Load libraries
	# requireNamespace() only checks whether a package is installed (without
	# attaching it and without require()'s warning); missing packages are
	# installed first, then every package is attached with library(), which
	# fails loudly if the installation did not succeed.
	for (pkg in c("XML", "rvest", "stringr")) {
	  if (!requireNamespace(pkg, quietly = TRUE)) {
	    install.packages(pkg)
	  }
	  library(pkg, character.only = TRUE)
	}

	# SCRAPE

	# Find all articles in "News and Views", "Research Highlights" and "Perspective"
	# on the index page of one journal issue and build the list of URLs to scrape.
	proxy_link <- "http://ezproxy-prd.bodleian.ox.ac.uk:2076" # Use your university's proxy - remember to log in first!
	current_issue <- "/ni/journal/v17/n2/index.html" # Link to Nature Immunology - might work with other Nature journals
	html_link <- paste0(proxy_link, current_issue)

	# Download and parse the index page once; all three sections are read from it.
	index_page <- read_html(html_link)

	# Turn relative hrefs into absolute proxy URLs.
	# Missing hrefs (NA) are dropped so they cannot become ".../NA"-style URLs, and
	# an empty input yields an empty result (a plain paste0() would recycle the
	# empty vector and return the bare proxy URL instead).
	absolute_links <- function(hrefs) {
	  hrefs <- hrefs[!is.na(hrefs)]
	  if (length(hrefs) == 0) {
	    return(character(0))
	  }
	  paste0(proxy_link, hrefs)
	}

	news_and_views <- index_page %>%
	  html_node("#nv") %>%
	  html_nodes(".fulltext") %>%
	  html_attr("href") %>%
	  absolute_links()

	research_highlights <- index_page %>%
	  html_node("#rhighlts") %>%
	  html_nodes(".atl") %>%
	  html_nodes("a") %>%
	  html_attr("href") %>%
	  absolute_links()

	# NOTE(review): the comma makes this a selector group, i.e. the first node that
	# matches "#pe" OR ".fulltext". A descendant selector ("#pe .fulltext") may have
	# been intended - confirm against the page before changing it.
	perspective <- index_page %>%
	  html_node("#pe , .fulltext") %>%
	  html_attr("href") %>%
	  absolute_links()

	# List of all articles to scrape; unique() avoids scraping the same article twice
	article_list <- unique(c(news_and_views, research_highlights, perspective))

	if (length(article_list) == 0) {
	  stop("No article links found on ", html_link,
	       " - are you logged in to the proxy?", call. = FALSE)
	}

	# LOOP THROUGH ARTICLE LIST
	# For every article URL: download the page, pull out title, author, optional
	# abstract and main text, prefix each part with a spoken label, and append the
	# result to all_together_now.

	article_complete <- c()
	all_together_now <- c()

	for (i in seq_along(article_list)) { # seq_along() does not iterate when the list is empty, unlike 1:length()
	  article <- read_html(article_list[i]) # download html page

	  title <- article %>% # scrape title of article
	    html_node(".article-heading") %>%
	    html_text()
	  title <- paste("\n\nTitle -", title)

	  author <- article %>% # scrape author of article (first ".fn" match only)
	    html_node(".fn") %>%
	    html_text()
	  author <- paste("\n\nBy -", author)

	  abstract <- article %>% # scrape abstract, if available
	    html_nodes(".standfirst") %>%
	    html_text()
	  if (length(abstract) > 0) {
	    abstract <- paste("\n\nAbstract -", abstract)
	  }

	  # Main text: raw HTML of the second ".content" node, cleaned step by step.
	  # The order matters: paragraph markers are inserted before the tags are stripped.
	  content_nodes <- html_nodes(article, ".content")
	  txt <- as.character(content_nodes[2]) # explicit node set -> HTML string conversion

	  # Remove all <sup> elements (ie references) - http://stackoverflow.com/questions/33970549/remove-all-specific-html-tags-using-gsub-r
	  # The "*" quantifiers are required: without them the pattern only matches a
	  # <sup> tag with exactly one attribute character and never a plain <sup>...</sup>.
	  txt <- gsub("<sup\\b[^<]*>[^<]*(?:<(?!/sup>)[^<]*)*</sup>", "", txt, perl = TRUE)
	  txt <- str_replace_all(txt, "\t", "") # Remove all \t
	  txt <- gsub('[\n]', '', txt) # Remove all \n
	  # Remove parenthesised text (this is how inline urls are dropped; note that
	  # any other "(...)" content is removed as well)
	  txt <- gsub("\\s*\\([^\\)]+\\)", "", txt)
	  txt <- str_replace_all(txt, "</p><p>", "\n\n New paragraph \n") # Announce paragraph breaks
	  txt <- gsub("<.*?>", "", txt) # Strip all html tags - http://stackoverflow.com/questions/17227294/removing-html-tags-from-a-string-in-r
	  txt <- gsub("\\s*\\([^\\)]+\\)", "", txt) # Second pass: parentheses that only became contiguous after tag stripping
	  txt <- paste("\n\nText -", txt)

	  # The abstract is only included when one was found
	  article_complete <- paste(title, author, if (length(abstract) > 0) { abstract }, txt)
	  all_together_now <- paste(all_together_now, "\n\n Next article", article_complete)
	}

	# SAVE TEXT FILE
	# Write the collected text to disk (elements separated by newlines) and open
	# the resulting file for inspection.
	output_file <- "Nature Immunology News.txt"
	cat(all_together_now, file = output_file, sep = "\n")
	file.show(output_file)