#-----------------------------------------------------------------------------------------------------------------#
# load function: html to text
# Author: Tony Breyal
# Date: 2011-11-18
# Modified: 2011-11-18
# Description: Extracts all text from a webpage (aims to extract only the text you would see in a web browser)
# Packages Used: RCurl, XML
# Blog Reference: Not published
# Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
# For more information see: https://creativecommons.org/licenses/by-nc/3.0/
# All rights reserved.
htmlToText <- function(input, ...) {
  ###--- PACKAGES ---###
  require(RCurl)
  require(XML)

  ###--- LOCAL FUNCTIONS ---###
  # Determine how to grab html for a single input element
  evaluate_input <- function(input) {
    # if input is a .html file
    if(file.exists(input)) {
      char.vec <- readLines(input, warn = FALSE)
      return(paste(char.vec, collapse = ""))
    }
    # if input is html text
    if(grepl("</html>", input, fixed = TRUE)) return(input)
    # if input contains no spaces, treat it as a URL (a regex check would be more robust)
    if(!grepl(" ", input)) {
      # download the CA certificate bundle in case of https problems
      if(!file.exists("cacert.pem")) download.file(url = "http://curl.haxx.se/ca/cacert.pem", destfile = "cacert.pem")
      return(getURL(input, followlocation = TRUE, cainfo = "cacert.pem"))
    }
    # return NULL if none of the conditions above apply
    return(NULL)
  }

  # convert HTML to plain text, skipping script/style/noscript/form nodes
  convert_html_to_text <- function(html) {
    doc <- htmlParse(html, asText = TRUE)
    text <- xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]", xmlValue)
    return(text)
  }

  # format text vector into one character string
  collapse_text <- function(txt) {
    return(paste(txt, collapse = " "))
  }

  ###--- MAIN ---###
  # STEP 1: Evaluate input
  html.list <- lapply(input, evaluate_input)
  # STEP 2: Extract text from HTML
  text.list <- lapply(html.list, convert_html_to_text)
  # STEP 3: Return text
  text.vector <- sapply(text.list, collapse_text)
  return(text.vector)
}
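###
##
# Example usage of htmlToText() (illustrative, not part of the original gist):
# it accepts a URL, a path to a local .html file, or a raw HTML string.
# htmlToText("https://www.r-project.org/")
# htmlToText("<html><body><p>hello world</p></body></html>")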
#-----------------------------------------------------------------------------------------------------------------#

#-----------------------------------------------------------------------------------------------------------------#
# x-y-plot part
###
##
# load libraries
library(rvest)
library(dplyr)
library(stringr)
library(tm)
library(viridis)
library(RColorBrewer)
library(data.table)
library(ggplot2)
library(wordcloud2)
# clean a tm corpus: normalise encoding and case, then strip whitespace,
# punctuation, numbers, and domain-specific stop words from the PaperBLAST page
clean_corpus <- function(corpus){
  # note: 'UTF-8-MAC' is a macOS-specific encoding name; on other platforms plain 'UTF-8' may be needed
  corpus <- tm_map(corpus, content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords,
                   c(stopwords("en"), "identity", "coverage", "ncbi", "protein", "pubmed", "article", "pmc",
                     "ptt", "refseq", "file", "text", "paperblast", "family", "atcc", "papers", "paper", "open", "locus", "gene",
                     "europepmc", "asncfamily", "proteins", "access", "articles", "links", "uniprot"))
  return(corpus)
}
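###
##
# Quick sanity check for clean_corpus() (illustrative, not in the original gist):
# it expects a tm Corpus and returns it lowercased, de-punctuated, and stripped
# of the stop words listed above.
# toy <- Corpus(VectorSource("The Cas9 protein, 2 PubMed articles!"))
# content(clean_corpus(toy)[[1]])   # -> roughly " cas " after cleaning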
###
##
# What protein are you interested in?
searched_protein <- "Cas9"
###
##
# Get the PaperBLAST results page for the query protein
scraping_blast <- read_html(paste0("http://papers.genomics.lbl.gov/cgi-bin/litSearch.cgi?query=",
                                   searched_protein, "%0D%0A&Search=Search"))
###
##
# Homologs in which organisms? (scraped from the italic text on the results page)
organisms <- scraping_blast %>%
  html_nodes(xpath = '/html/body/p/i[1]') %>%
  html_text()
organisms <- head(organisms, -1)   # drop the last scraped element
###
##
# identity and coverage of homologs?
scraping_blast_p_a <- scraping_blast %>%
  html_nodes("body") %>%
  html_nodes(xpath = '/html/body/p/a[2]') %>%
  html_text() %>%                      # get the text, not the HTML tags
  gsub("^\\s+|\\s+$", "", .)           # strip whitespace from the beginning and end of each string
# entries look like e.g. "97% identity, 100% coverage"; split out the two numbers
identity <- as.numeric(str_split(str_split(scraping_blast_p_a[str_detect(scraping_blast_p_a, "identity")], ", ", simplify = TRUE)[, 1], "% ", simplify = TRUE)[, 1])
coverage <- as.numeric(str_split(str_split(scraping_blast_p_a[str_detect(scraping_blast_p_a, "identity")], ", ", simplify = TRUE)[, 2], "% ", simplify = TRUE)[, 1])
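###
##
# Sanity check (not in the original gist): identity, coverage and organisms are
# scraped independently, so confirm the vectors line up before binding them.
stopifnot(length(identity) == length(coverage),
          length(identity) == length(organisms))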
###
##
# assemble a data.table for plotting with ggplot2
new_table <- data.table(identity = identity,
                        coverage = coverage,
                        organism = organisms)
###
##
# plotting with ggplot2: 2D density of identity vs. coverage
# (axis limits extend past 100% so the density polygons are not clipped)
ggplot(data = new_table, aes(x = identity, y = coverage, size = coverage)) +
  stat_density_2d(aes(fill = ..level.., alpha = ..level..), geom = "polygon", contour = TRUE,
                  n = 100, h = 10, show.legend = FALSE, na.rm = TRUE) +
  scale_fill_viridis_c() +
  scale_alpha(range = c(0.4, 1)) +
  theme_minimal() +
  scale_x_continuous(limits = c(0, 119)) +
  scale_y_continuous(limits = c(0, 119)) +
  coord_fixed(ratio = 1) +
  theme(text = element_text(size = 14),
        axis.title.x = element_text(face = "bold", vjust = 0),
        axis.title.y = element_text(face = "bold", vjust = 0))
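###
##
# Optional (not in the original gist): save the last plot to disk;
# the file name and dimensions are just examples.
# ggsave("paperblast_identity_coverage.png", width = 6, height = 6, dpi = 300)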
#-----------------------------------------------------------------------------------------------------------------#

#-----------------------------------------------------------------------------------------------------------------#
# wordcloud part
###
##
# How can we access full-text PubMed articles? --> collect the href attributes of all links
scraping_blast_test <- scraping_blast %>%
  html_nodes("body") %>%
  html_nodes("ul") %>%
  html_nodes("a") %>%
  html_attr("href")
# keep only links to PubMed Central full-text articles
html <- scraping_blast_test[!is.na(scraping_blast_test)]
html <- html[grepl("www.ncbi.nlm.nih.gov/pmc/articles", html)]
# convert HTML to text with the predefined function from the upper part (might take a while)
html2txt <- lapply(html, htmlToText)
# clean out non-ASCII characters
html2txtclean <- sapply(html2txt, function(x) iconv(x, "latin1", "ASCII", sub = ""))
# make a Corpus and clean it
text_corpus <- Corpus(VectorSource(html2txtclean))
text_clean <- clean_corpus(text_corpus)
# run a second cleaning pass in one go with tm_reduce
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
a <- tm_map(text_clean, FUN = tm_reduce, tmFuns = funcs)
a.dtm <- TermDocumentMatrix(a)
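# Optional check (not in the original gist): peek at frequent terms before
# building the matrix; the frequency cut-off of 50 is arbitrary.
# findFreqTerms(a.dtm, lowfreq = 50)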
# transpose the term-document matrix to documents x terms
m <- as.matrix(t(a.dtm))
# get word counts in decreasing order
word_freqs <- sort(colSums(m), decreasing = TRUE)
# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
# plot with the wordcloud2 function; figPath masks the cloud with a local image
# (the path below is user-specific; replace it with your own mask image)
wordcloud2(dm, fontFamily = "Avenir", figPath = "/Users/f/Documents/R/paperblast_cloud/model_white.png",
           color = viridis_pal()(12), size = 0.4, fontWeight = 600)
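# If no mask image is at hand, wordcloud2 also works without figPath;
# this fallback is illustrative, not part of the original gist.
# wordcloud2(dm, fontFamily = "Avenir", color = viridis_pal()(12), size = 0.4)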