Skip to content

Instantly share code, notes, and snippets.

@harveyl888
Last active March 25, 2016 00:59
Show Gist options
  • Save harveyl888/4ed090a4a845477eb1ef to your computer and use it in GitHub Desktop.
Save harveyl888/4ed090a4a845477eb1ef to your computer and use it in GitHub Desktop.
Scraping HML
##
## Scraping HML
## use follow_link to read subsequent pages
## continue reading and scraping tables until 'Next' link does not respond
## table scraped by column so that max information can be teased out
##
library(rvest)
library(data.table)
maxPages <- 50 # safety net in case of runaway
page.count <- 1 # variable to keep count of pages read
keep_scraping <- TRUE
url <- 'http://www.hmdb.ca/hml/metabolites' # starting url
sess <- html_session(url)
l.hmdb <- list()
while (keep_scraping & page.count <= maxPages) {
cat(sprintf('Scraping page %02i', page.count))
t1 <- sess %>% html() %>% html_nodes(xpath='//*[@id="metabolites"]') # scrape the full table
t1.1 <- t1 %>% html_nodes(xpath = '//td[1]') # first column
l.id <- t1.1 %>% html_nodes('.btn-card') %>% html_text()
l.cas <- t1.1 %>% html_nodes('.cas') %>% html_text()
l.name <- t1 %>% html_nodes(xpath = '//td[2]') %>% html_text()
t1.4c <- t1 %>% html_nodes(xpath = '//td[4]') %>% xml_children() # fourth column
t1.5c <- t1 %>% html_nodes(xpath = '//td[5]') %>% xml_children() # fifth column
l.details <- list()
for (i in 1:length(t1.4c)) {
l.sub <- t1.4c[[i]] %>% html_text()
m.mw <- as.numeric(l.sub[length(l.sub) - 2])
m.acmass <- as.numeric(l.sub[length(l.sub)])
m.formula <- paste0(l.sub[1:(length(l.sub) - 5)], collapse = '')
m.fluids <- paste0(t1.5c[[i]]$ul %>% html_nodes('li') %>% html_text(), collapse = '; ')
l.details[[i]] <- list(avMass = m.mw, monoMass = m.acmass, formula = m.formula, fluids = m.fluids)
}
df.page <- data.frame(id = l.id, cas = l.cas, name = l.name)
df.page <- cbind(df.page, rbindlist(l.details))
l.hmdb[[length(l.hmdb) + 1]] <- df.page
cat(sprintf(' - %2i rows extacted\n', nrow(df.page)))
page.count <- page.count + 1
Sys.sleep(0.2) # be nice - wait a while
suppressMessages(
sess <- tryCatch(
{
sess %>% follow_link('Next') # try to follow link
},
error = function(e) {
return(NA) # no link to follow
}
)
)
if (length(sess) == 1) keep_scraping <- FALSE # test if link followed
}
df.hml <- rbindlist(l.hmdb) # consolidate table
write.csv(df.hml, 'hml_out.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment