Last active
March 25, 2016 00:59
-
-
Save harveyl888/4ed090a4a845477eb1ef to your computer and use it in GitHub Desktop.
Scraping HML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## | |
## Scraping HML | |
## use follow_link to read subsequent pages | |
## continue reading and scraping tables until 'Next' link does not respond | |
## table scraped by column so that max information can be teased out | |
## | |
library(rvest) | |
library(data.table) | |
maxPages <- 50 # safety net in case of runaway | |
page.count <- 1 # variable to keep count of pages read | |
keep_scraping <- TRUE | |
url <- 'http://www.hmdb.ca/hml/metabolites' # starting url | |
sess <- html_session(url) | |
l.hmdb <- list() | |
while (keep_scraping & page.count <= maxPages) { | |
cat(sprintf('Scraping page %02i', page.count)) | |
t1 <- sess %>% html() %>% html_nodes(xpath='//*[@id="metabolites"]') # scrape the full table | |
t1.1 <- t1 %>% html_nodes(xpath = '//td[1]') # first column | |
l.id <- t1.1 %>% html_nodes('.btn-card') %>% html_text() | |
l.cas <- t1.1 %>% html_nodes('.cas') %>% html_text() | |
l.name <- t1 %>% html_nodes(xpath = '//td[2]') %>% html_text() | |
t1.4c <- t1 %>% html_nodes(xpath = '//td[4]') %>% xml_children() # fourth column | |
t1.5c <- t1 %>% html_nodes(xpath = '//td[5]') %>% xml_children() # fifth column | |
l.details <- list() | |
for (i in 1:length(t1.4c)) { | |
l.sub <- t1.4c[[i]] %>% html_text() | |
m.mw <- as.numeric(l.sub[length(l.sub) - 2]) | |
m.acmass <- as.numeric(l.sub[length(l.sub)]) | |
m.formula <- paste0(l.sub[1:(length(l.sub) - 5)], collapse = '') | |
m.fluids <- paste0(t1.5c[[i]]$ul %>% html_nodes('li') %>% html_text(), collapse = '; ') | |
l.details[[i]] <- list(avMass = m.mw, monoMass = m.acmass, formula = m.formula, fluids = m.fluids) | |
} | |
df.page <- data.frame(id = l.id, cas = l.cas, name = l.name) | |
df.page <- cbind(df.page, rbindlist(l.details)) | |
l.hmdb[[length(l.hmdb) + 1]] <- df.page | |
cat(sprintf(' - %2i rows extacted\n', nrow(df.page))) | |
page.count <- page.count + 1 | |
Sys.sleep(0.2) # be nice - wait a while | |
suppressMessages( | |
sess <- tryCatch( | |
{ | |
sess %>% follow_link('Next') # try to follow link | |
}, | |
error = function(e) { | |
return(NA) # no link to follow | |
} | |
) | |
) | |
if (length(sess) == 1) keep_scraping <- FALSE # test if link followed | |
} | |
df.hml <- rbindlist(l.hmdb) # consolidate table | |
write.csv(df.hml, 'hml_out.csv') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment