#-----------------------------------------------------------------------------------------------------------------#
# PaperBLAST text mining: identity/coverage plot and word cloud
# Gist by @felixgrunberger, created 2018-01-08
#-----------------------------------------------------------------------------------------------------------------#
#-----------------------------------------------------------------------------------------------------------------#
# load function html to text
# Author: Tony Breyal
# Date: 2011-11-18
# Modified: 2011-11-18
# Description: Extracts all text from a webpage (aims to extract only the text you would see in a web browser)
# Packages Used: RCurl, XML
# Blog Reference: Not published
# Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
# For more information see: https://creativecommons.org/licenses/by-nc/3.0/
# All rights reserved.
htmlToText <- function(input, ...) {
  ###--- PACKAGES ---###
  require(RCurl)
  require(XML)
  ###--- LOCAL FUNCTIONS ---###
  # determine how to grab the HTML for a single input element
  evaluate_input <- function(input) {
    # if input is a .html file
    if(file.exists(input)) {
      char.vec <- readLines(input, warn = FALSE)
      return(paste(char.vec, collapse = ""))
    }
    # if input is HTML text
    if(grepl("</html>", input, fixed = TRUE)) return(input)
    # if input is a URL (probably should use a regex here instead?)
    if(!grepl(" ", input)) {
      # download the SSL certificate bundle in case of https problems
      if(!file.exists("cacert.pem")) download.file(url = "http://curl.haxx.se/ca/cacert.pem", destfile = "cacert.pem")
      return(getURL(input, followlocation = TRUE, cainfo = "cacert.pem"))
    }
    # return NULL if none of the conditions above apply
    return(NULL)
  }
  # convert HTML to plain text, skipping script/style/noscript/form nodes
  convert_html_to_text <- function(html) {
    doc <- htmlParse(html, asText = TRUE)
    text <- xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]", xmlValue)
    return(text)
  }
  # format text vector into one character string
  collapse_text <- function(txt) {
    return(paste(txt, collapse = " "))
  }
  ###--- MAIN ---###
  # STEP 1: evaluate input
  html.list <- lapply(input, evaluate_input)
  # STEP 2: extract text from HTML
  text.list <- lapply(html.list, convert_html_to_text)
  # STEP 3: return text
  text.vector <- sapply(text.list, collapse_text)
  return(text.vector)
}
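# quick sanity check of htmlToText() (a minimal sketch: the URL is only an example and assumes internet access)
check_text <- htmlToText("https://www.r-project.org/")
substr(check_text, 1, 80)   # first 80 characters of the visible page text, as one string per input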
#-----------------------------------------------------------------------------------------------------------------#
#-----------------------------------------------------------------------------------------------------------------#
# x-y-plot part
###
##
# load libraries
library(rvest)
library(dplyr)
library(stringr)
library(tm)
library(viridis)
library(RColorBrewer)
library(data.table)
library(ggplot2)
library(wordcloud2)
clean_corpus <- function(corpus){
  # normalise the encoding (UTF-8-MAC is macOS-specific; use "UTF-8" on other platforms)
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  # drop English stopwords plus PaperBLAST/NCBI boilerplate terms
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "identity", "coverage", "ncbi", "protein", "pubmed", "article", "pmc",
                     "ptt", "refseq", "file", "text", "paperblast", "family", "atcc", "papers", "paper", "open", "locus", "gene",
                     "europepmc", "asncfamily", "proteins", "access", "articles", "links", "uniprot"))
  return(corpus)
}
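###
##
# toy check of clean_corpus() (illustration only; both input strings are made up)
toy_corpus <- Corpus(VectorSource(c("The Protein NCBI entry 123!", "Open access ARTICLE about Cas9")))
as.character(clean_corpus(toy_corpus)[[1]])   # stopwords, numbers, punctuation and custom words removed -> essentially just "entry"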
###
##
# What protein are you interested in?
searched_protein <- "Cas9"
###
##
# Get text from paperBLAST website
scraping_blast <- read_html(paste("http://papers.genomics.lbl.gov/cgi-bin/litSearch.cgi?query=",
                                  searched_protein, "%0D%0A&Search=Search", sep = ""))
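###
##
# sanity check: for searched_protein <- "Cas9" the assembled query URL reads
# "http://papers.genomics.lbl.gov/cgi-bin/litSearch.cgi?query=Cas9%0D%0A&Search=Search"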
###
##
# In which organisms were homologs found?
organisms <- scraping_blast %>%
  html_nodes(xpath = '/html/body/p/i[1]') %>%
  html_text()
# drop the last <i> element (presumably not an organism name)
organisms <- head(organisms, -1)
###
##
# identity and coverage of the homologs
scraping_blast_p_a <- scraping_blast %>%
  html_nodes("body") %>%
  html_nodes(xpath = '/html/body/p/a[2]') %>%
  html_text() %>%                    # get the text, not the HTML tags
  gsub("^\\s+|\\s+$", "", .)         # strip whitespace from the beginning and end of each string
# keep only entries mentioning "identity", then split off the numeric parts before "% "
identity <- as.numeric(str_split(str_split(scraping_blast_p_a[str_detect(scraping_blast_p_a, "identity")], ", ", simplify = TRUE)[, 1], "% ", simplify = TRUE)[, 1])
coverage <- as.numeric(str_split(str_split(scraping_blast_p_a[str_detect(scraping_blast_p_a, "identity")], ", ", simplify = TRUE)[, 2], "% ", simplify = TRUE)[, 1])
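###
##
# worked example of the split logic above (demo_hit is hypothetical but matches the expected "x% identity, y% coverage" format)
demo_hit <- "45% identity, 90% coverage"
str_split(str_split(demo_hit, ", ", simplify = TRUE)[, 1], "% ", simplify = TRUE)[, 1]   # "45" -> identity
str_split(str_split(demo_hit, ", ", simplify = TRUE)[, 2], "% ", simplify = TRUE)[, 1]   # "90" -> coverage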
###
##
# assemble a data.table for plotting with ggplot2
new_table <- data.table(identity = identity,
                        coverage = coverage,
                        organism = organisms)
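###
##
# quick look at the assembled table (identity and coverage in percent, organism as a character column)
head(new_table)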
###
##
# plotting with ggplot2
ggplot(data = new_table, aes(x = identity, y = coverage, size = coverage)) +
  stat_density_2d(aes(fill = ..level.., alpha = ..level..), geom = "polygon", contour = TRUE,
                  n = 100, h = 10, show.legend = FALSE, na.rm = TRUE) +
  scale_fill_viridis_c() +
  scale_alpha(range = c(0.4, 1)) +
  theme_minimal() +
  scale_x_continuous(limits = c(0, 119)) +
  scale_y_continuous(limits = c(0, 119)) +
  coord_fixed(ratio = 1) +
  theme(text = element_text(size = 14),
        axis.title.x = element_text(face = "bold", vjust = 0),
        axis.title.y = element_text(face = "bold", vjust = 0))
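###
##
# optionally save the plot to disk (a sketch: file name and dimensions are examples; assumes the plot above was rendered)
ggsave("paperblast_identity_coverage.png", width = 7, height = 7, dpi = 300)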
#-----------------------------------------------------------------------------------------------------------------#
#-----------------------------------------------------------------------------------------------------------------#
# wordcloud part
###
##
# How can we access full-text PubMed articles? --> collect the href attributes of all links
scraping_blast_test <- scraping_blast %>%
  html_nodes("body") %>%
  html_nodes("ul") %>%
  html_nodes("a") %>%
  html_attr("href")
# keep only links that point to PMC full-text articles
html <- scraping_blast_test[!is.na(scraping_blast_test)]
html <- html[grepl(html, pattern = "www.ncbi.nlm.nih.gov/pmc/articles")]
# convert HTML to text with the htmlToText() function defined above | might take a while
html2txt <- lapply(html, htmlToText)
# clean out non-ASCII characters
html2txtclean <- sapply(html2txt, function(x) iconv(x, "latin1", "ASCII", sub=""))
# make a Corpus
text_corpus <- Corpus(VectorSource(html2txtclean))
text_clean <- clean_corpus(text_corpus)
# reduce the corpus with a combined pipeline of transformations
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
a <- tm_map(text_clean, FUN = tm_reduce, tmFuns = funcs)
a.dtm <- TermDocumentMatrix(a)
m <- as.matrix(t(a.dtm))
# get word counts in decreasing order
word_freqs <- sort(colSums(m), decreasing = TRUE)
# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
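###
##
# sanity check: the ten most frequent terms after cleaning
head(dm, 10)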
# plot with the wordcloud2 function; figPath masks the cloud with an image (the path below is machine-specific)
wordcloud2(dm, fontFamily = "Avenir", figPath = "/Users/f/Documents/R/paperblast_cloud/model_white.png",
           color = viridis_pal()(12), size = 0.4, fontWeight = 600)
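###
##
# a plain cloud without the mask image (a sketch with otherwise identical parameters):
# wordcloud2(dm, fontFamily = "Avenir", color = viridis_pal()(12), size = 0.4)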