#-----------------------------------------------------------------------------------------------------------------#
# load function: html to text
# Author: Tony Breyal
# Date: 2011-11-18
# Modified: 2011-11-18
# Description: Extracts all text from a webpage (aims to extract only the text you would see in a web browser)
# Packages Used: RCurl, XML
# Blog Reference: Not published
# Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
# For more information see: https://creativecommons.org/licenses/by-nc/3.0/
# All rights reserved.
htmlToText <- function(input, ...) {
  ###--- PACKAGES ---###
  require(RCurl)
  require(XML)

  ###--- LOCAL FUNCTIONS ---###
  # Determine how to grab html for a single input element
  evaluate_input <- function(input) {
    # if input is a .html file
    if(file.exists(input)) {
      char.vec <- readLines(input, warn = FALSE)
      return(paste(char.vec, collapse = ""))
    }
    # if input is html text
    if(grepl("</html>", input, fixed = TRUE)) return(input)
    # if input contains no spaces, treat it as a URL (a regex check would be more robust)
    if(!grepl(" ", input)) {
      # download the CA certificate bundle in case of https problems
      if(!file.exists("cacert.pem")) download.file(url = "http://curl.haxx.se/ca/cacert.pem", destfile = "cacert.pem")
      return(getURL(input, followlocation = TRUE, cainfo = "cacert.pem"))
    }
    # return NULL if none of the conditions above apply
    return(NULL)
  }

  # convert HTML to plain text, skipping script/style/noscript/form nodes
  convert_html_to_text <- function(html) {
    doc <- htmlParse(html, asText = TRUE)
    text <- xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]", xmlValue)
    return(text)
  }

  # format text vector into one character string
  collapse_text <- function(txt) {
    return(paste(txt, collapse = " "))
  }

  ###--- MAIN ---###
  # STEP 1: Evaluate input
  html.list <- lapply(input, evaluate_input)
  # STEP 2: Extract text from HTML
  text.list <- lapply(html.list, convert_html_to_text)
  # STEP 3: Return text
  text.vector <- sapply(text.list, collapse_text)
  return(text.vector)
}
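###
##
# Example usage of htmlToText() (illustrative, not part of the original gist):
# it accepts a URL, a path to a local .html file, or a raw HTML string.
# htmlToText("https://www.r-project.org/")
# htmlToText("<html><body><p>hello world</p></body></html>")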
#-----------------------------------------------------------------------------------------------------------------#

#-----------------------------------------------------------------------------------------------------------------#
# x-y-plot part
###
##
# load libraries
library(rvest)
library(dplyr)
library(stringr)
library(tm)
library(viridis)
library(RColorBrewer)
library(data.table)
library(ggplot2)
library(wordcloud2)
# clean a tm corpus: normalise encoding and case, then strip whitespace,
# punctuation, numbers, and domain-specific stop words from the PaperBLAST page
clean_corpus <- function(corpus){
  # note: 'UTF-8-MAC' is a macOS-specific encoding name; on other platforms plain 'UTF-8' may be needed
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "identity", "coverage", "ncbi", "protein", "pubmed", "article", "pmc",
                     "ptt", "refseq", "file", "text", "paperblast", "family", "atcc", "papers", "paper", "open", "locus", "gene",
                     "europepmc", "asncfamily", "proteins", "access", "articles", "links", "uniprot"))
  return(corpus)
}
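###
##
# Quick sanity check for clean_corpus() (illustrative, not in the original gist):
# it expects a tm Corpus and returns it lowercased, de-punctuated, and stripped
# of the stop words listed above.
# toy <- Corpus(VectorSource("The Cas9 protein, 2 PubMed articles!"))
# content(clean_corpus(toy)[[1]])   # -> roughly " cas " after cleaning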
###
##
# What protein are you interested in?
searched_protein <- "Cas9"
###
##
# Get the PaperBLAST results page for the query protein
scraping_blast <- read_html(paste0("http://papers.genomics.lbl.gov/cgi-bin/litSearch.cgi?query=",
                                   searched_protein, "%0D%0A&Search=Search"))
###
##
# Homologs in which organisms? (scraped from the italic text on the results page)
organisms <- scraping_blast %>%
  html_nodes(xpath = '/html/body/p/i[1]') %>%
  html_text()
organisms <- head(organisms, -1)   # drop the last scraped element
###
##
# identity and coverage of homologs?
scraping_blast_p_a <- scraping_blast %>%
  html_nodes("body") %>%
  html_nodes(xpath = '/html/body/p/a[2]') %>%
  html_text() %>%                      # get the text, not the HTML tags
  gsub("^\\s+|\\s+$", "", .)           # strip whitespace from the beginning and end of each string
# entries look like e.g. "97% identity, 100% coverage"; split out the two numbers
identity <- as.numeric(str_split(str_split(scraping_blast_p_a[str_detect(scraping_blast_p_a, "identity")], ", ", simplify = TRUE)[, 1], "% ", simplify = TRUE)[, 1])
coverage <- as.numeric(str_split(str_split(scraping_blast_p_a[str_detect(scraping_blast_p_a, "identity")], ", ", simplify = TRUE)[, 2], "% ", simplify = TRUE)[, 1])
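###
##
# Sanity check (not in the original gist): identity, coverage and organisms are
# scraped independently, so confirm the vectors line up before binding them.
stopifnot(length(identity) == length(coverage),
          length(identity) == length(organisms))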
###
##
# assemble a data.table for plotting with ggplot2
new_table <- data.table(identity = identity,
                        coverage = coverage,
                        organism = organisms)
###
##
# plotting with ggplot2: 2D density of identity vs. coverage
# (axis limits extend past 100% so the density polygons are not clipped)
ggplot(data = new_table, aes(x = identity, y = coverage, size = coverage)) +
  stat_density_2d(aes(fill = ..level.., alpha = ..level..), geom = "polygon", contour = TRUE,
                  n = 100, h = 10, show.legend = FALSE, na.rm = TRUE) +
  scale_fill_viridis_c() +
  scale_alpha(range = c(0.4, 1)) +
  theme_minimal() +
  scale_x_continuous(limits = c(0, 119)) +
  scale_y_continuous(limits = c(0, 119)) +
  coord_fixed(ratio = 1) +
  theme(text = element_text(size = 14),
        axis.title.x = element_text(face = "bold", vjust = 0),
        axis.title.y = element_text(face = "bold", vjust = 0))
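###
##
# Optional (not in the original gist): save the last plot to disk;
# the file name and dimensions are just examples.
# ggsave("paperblast_identity_coverage.png", width = 6, height = 6, dpi = 300)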
#-----------------------------------------------------------------------------------------------------------------#

#-----------------------------------------------------------------------------------------------------------------#
# wordcloud part
###
##
# How can we access full-text PubMed articles? --> collect the href attributes of all links
scraping_blast_test <- scraping_blast %>%
  html_nodes("body") %>%
  html_nodes("ul") %>%
  html_nodes("a") %>%
  html_attr("href")
# keep only links to PubMed Central full-text articles
html <- scraping_blast_test[!is.na(scraping_blast_test)]
html <- html[grepl("www.ncbi.nlm.nih.gov/pmc/articles", html)]
# convert HTML to text with the predefined function from the upper part (might take a while)
html2txt <- lapply(html, htmlToText)
# clean out non-ASCII characters
html2txtclean <- sapply(html2txt, function(x) iconv(x, "latin1", "ASCII", sub = ""))
# make a Corpus and clean it
text_corpus <- Corpus(VectorSource(html2txtclean))
text_clean <- clean_corpus(text_corpus)
# run a second cleaning pass in one go with tm_reduce
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
a <- tm_map(text_clean, FUN = tm_reduce, tmFuns = funcs)
a.dtm <- TermDocumentMatrix(a)
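# Optional check (not in the original gist): peek at frequent terms before
# building the matrix; the frequency cut-off of 50 is arbitrary.
# findFreqTerms(a.dtm, lowfreq = 50)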
# transpose the term-document matrix to documents x terms
m <- as.matrix(t(a.dtm))
# get word counts in decreasing order
word_freqs <- sort(colSums(m), decreasing = TRUE)
# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
# plot with the wordcloud2 function; figPath masks the cloud with a local image
# (the path below is user-specific; replace it with your own mask image)
wordcloud2(dm, fontFamily = "Avenir", figPath = "/Users/f/Documents/R/paperblast_cloud/model_white.png",
           color = viridis_pal()(12), size = 0.4, fontWeight = 600)
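# If no mask image is at hand, wordcloud2 also works without figPath;
# this fallback is illustrative, not part of the original gist.
# wordcloud2(dm, fontFamily = "Avenir", color = viridis_pal()(12), size = 0.4)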