Skip to content

Instantly share code, notes, and snippets.

@DomBennett
Last active April 9, 2018 08:34
Show Gist options
  • Save DomBennett/23438f09e378090610da6190f72a0d4e to your computer and use it in GitHub Desktop.
Save DomBennett/23438f09e378090610da6190f72a0d4e to your computer and use it in GitHub Desktop.
Download from GenBank with cache using R
# LIBS
library(rentrez)
# FUNCTIONS
connect_safely <- function(fnctn, args, wt_tms) {
res <- NULL
for (wt_tm in wt_tms) {
query <- try(R.utils::withTimeout(do.call(fnctn, args),
timeout = 3600),
silent = TRUE)
if (!inherits(query, 'try-error')) {
res <- query
break
} else {
# ctrl+c
if (grepl('Operation was aborted by an application callback',
query[[1]])) {
stop('Killed by user.')
}
cat("Retrying in [", wt_tm, "s]", sep = '')
Sys.sleep(wt_tm)
}
}
res
}
get_deja_vues <- function(cchdr) {
fls <- list.files(cchdr, '.RData')
sub('\\.RData', '', fls)
}
cache_result <- function(id, res, cchdr) {
saveRDS(res, file = file.path(cchdr, paste0(id, '.RData')))
}
# VARS
wd <- '~/Desktop/camila_download/'
outfl <- file.path(wd, 'output.txt')
cchdr <- file.path(wd, 'sequences')
wt_tms <- c(3, 9, 18, 36, 60, 300, 600, 3600)
srch_trm <- '140[SLEN] : 3000[SLEN])) AND (((ITS1[titl] OR ITS2[titl]) OR 5.8S[titl]) OR "internal transcribed spacer"[titl] OR "internal transcribed spacers"[titl] OR "ITS 1" [titl] OR "ITS 2"[titl])'
# SETUP
if (!file.exists(cchdr)) {
dir.create(cchdr)
}
# SEARCH GIS
# SEARCH
cat('Searching ...\n')
if (!file.exists(file.path(wd, 'gis.RData'))) {
args <- list(db = 'nucleotide',
term = srch_trm,
retmax = 0)
init_srch <- connect_safely(fnctn = entrez_search,
args = args, wt_tms = wt_tms)
cnt <- init_srch[['count']]
if (file.exists('part_gis.RData')) {
gis <- NULL
load('part_gis.RData')
strt <- which(is.na(gis))[1] - 1
} else {
gis <- rep(NA, cnt)
strt <- 0
}
btchsz <- 10000
for (i in seq(strt, cnt - 1, btchsz)) {
lower <- i + 1
upper <- ifelse(i + btchsz < cnt, i + btchsz, cnt)
cat("... [", lower, "-", upper, '/',
cnt,"]\n", sep = '')
args <- list(db = 'nucleotide', term = srch_trm,
retstart = lower, retmax = cnt)
srch_rs <- connect_safely(fnctn = entrez_search,
args = args, wt_tms = wt_tms)
gis[lower:upper] <- srch_rs[['ids']]
save(gis, file = 'part_gis.RData')
}
save(gis, file = 'gis.RData')
}
cat('Done.\n')
# DOWNLOAD
cat('Downloading ...\n')
load(file.path(wd, 'gis.RData'))
deja_vues <- get_deja_vues(cchdr = cchdr)
gis <- gis[!gis %in% deja_vues]
btchsz <- 100
for (i in seq(0, length(gis) - 1, btchsz)) {
lower <- i + 1
upper <- ifelse(i + btchsz < length(gis),
i + btchsz, length(gis))
crrnt_ids <- gis[lower:upper]
cat("... [", lower, "-", upper, '/',
length(gis),"]\n", sep = '')
args <- list(db = 'nucleotide',
id = crrnt_ids, rettype = 'gb',
retmode = 'text')
fetched <- connect_safely(fnctn = entrez_fetch,
args = args, wt_tms = wt_tms)
res <- strsplit(x = fetched, split = '\\n\\n')[[1]]
if (length(res) == length(crrnt_ids)) {
for (j in seq_along(crrnt_ids)) {
cache_result(id = crrnt_ids[[j]],
res = res[[j]],
cchdr = cchdr)
}
} else {
cat('Warning: skipped some IDs!')
}
}
# WRITE OUT
fls <- list.files(cchdr, '.RData')
if (file.exists(outfl)) {
file.remove(outfl)
}
for (fl in fls) {
rcrd <- readRDS(file.path(cchdr, fl))
rcrd <- paste0(rcrd, '\n\n')
write(x = rcrd, file = outfl, append = TRUE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment