DomBennett/genbank_download.R

## genbank_download.R
# LIBS
library(rentrez)

# FUNCTIONS
connect_safely <- function(fnctn, args, wt_tms) {
  res <- NULL
  for (wt_tm in wt_tms) {
    query <- try(R.utils::withTimeout(do.call(fnctn, args),
                                      timeout = 3600),
                 silent = TRUE)
    if (!inherits(query, 'try-error')) {
      res <- query
      break
    } else {
      # ctrl+c
      if (grepl('Operation was aborted by an application callback',
               query[[1]])) {
        stop('Killed by user.')
      }
      cat("Retrying in [", wt_tm, "s]", sep = '')
      Sys.sleep(wt_tm)
    }
  }
  res
}
get_deja_vues <- function(cchdr) {
  fls <- list.files(cchdr, '.RData')
  sub('\\.RData', '', fls)
}
cache_result <- function(id, res, cchdr) {
  saveRDS(res, file = file.path(cchdr, paste0(id, '.RData')))
}

# VARS
wd <- '~/Desktop/camila_download/'
outfl <- file.path(wd, 'output.txt')
cchdr <- file.path(wd, 'sequences')
wt_tms <- c(3, 9, 18, 36, 60, 300, 600, 3600)
srch_trm <- '140[SLEN] : 3000[SLEN])) AND (((ITS1[titl] OR ITS2[titl]) OR 5.8S[titl]) OR "internal transcribed spacer"[titl] OR "internal transcribed spacers"[titl] OR "ITS 1" [titl] OR "ITS 2"[titl])'

# SETUP
if (!file.exists(cchdr)) {
  dir.create(cchdr)
}

# SEARCH GIS
# SEARCH
cat('Searching ...\n')
if (!file.exists(file.path(wd, 'gis.RData'))) {
  args <- list(db = 'nucleotide',
               term = srch_trm,
               retmax = 0)
  init_srch <- connect_safely(fnctn = entrez_search,
                              args = args, wt_tms = wt_tms)
  cnt <- init_srch[['count']]
  if (file.exists('part_gis.RData')) {
    gis <- NULL
    load('part_gis.RData')
    strt <- which(is.na(gis))[1] - 1
  } else {
    gis <- rep(NA, cnt)
    strt <- 0
  }
  btchsz <- 10000
  for (i in seq(strt, cnt - 1, btchsz)) {
    lower <- i + 1
    upper <- ifelse(i + btchsz < cnt, i + btchsz, cnt)
    cat("... [", lower, "-", upper, '/',
        cnt,"]\n", sep = '')
    args <- list(db = 'nucleotide', term = srch_trm,
                 retstart = lower, retmax = cnt)
    srch_rs <- connect_safely(fnctn = entrez_search,
                              args = args, wt_tms = wt_tms)
    gis[lower:upper] <-  srch_rs[['ids']]
    save(gis, file = 'part_gis.RData')
  }
  save(gis, file = 'gis.RData')
}
cat('Done.\n')

# DOWNLOAD
cat('Downloading ...\n')
load(file.path(wd, 'gis.RData'))
deja_vues <- get_deja_vues(cchdr = cchdr)
gis <- gis[!gis %in% deja_vues]
btchsz <- 100
for (i in seq(0, length(gis) - 1, btchsz)) {
  lower <- i + 1
  upper <- ifelse(i + btchsz < length(gis),
                  i + btchsz, length(gis))
  crrnt_ids <- gis[lower:upper]
  cat("... [", lower, "-", upper, '/',
      length(gis),"]\n", sep = '')
  args <- list(db = 'nucleotide',
               id = crrnt_ids, rettype = 'gb',
               retmode = 'text')
  fetched <- connect_safely(fnctn = entrez_fetch,
                            args = args, wt_tms = wt_tms)
  res <- strsplit(x = fetched, split = '\\n\\n')[[1]]
  if (length(res) == length(crrnt_ids)) {
    for (j in seq_along(crrnt_ids)) {
      cache_result(id = crrnt_ids[[j]],
                   res = res[[j]],
                   cchdr = cchdr)
    }
  } else {
    cat('Warning: skipped some IDs!')
  }
}

# WRITE OUT
fls <- list.files(cchdr, '.RData')
if (file.exists(outfl)) {
  file.remove(outfl)
}
for (fl in fls) {
  rcrd <- readRDS(file.path(cchdr, fl))
  rcrd <- paste0(rcrd, '\n\n')
  write(x = rcrd, file = outfl, append = TRUE)
}
	# LIBS
	library(rentrez)

	# FUNCTIONS
	connect_safely <- function(fnctn, args, wt_tms) {
	res <- NULL
	for (wt_tm in wt_tms) {
	query <- try(R.utils::withTimeout(do.call(fnctn, args),
	timeout = 3600),
	silent = TRUE)
	if (!inherits(query, 'try-error')) {
	res <- query
	break
	} else {
	# ctrl+c
	if (grepl('Operation was aborted by an application callback',
	query[[1]])) {
	stop('Killed by user.')
	}
	cat("Retrying in [", wt_tm, "s]", sep = '')
	Sys.sleep(wt_tm)
	}
	}
	res
	}
	get_deja_vues <- function(cchdr) {
	fls <- list.files(cchdr, '.RData')
	sub('\\.RData', '', fls)
	}
	cache_result <- function(id, res, cchdr) {
	saveRDS(res, file = file.path(cchdr, paste0(id, '.RData')))
	}

	# VARS
	wd <- '~/Desktop/camila_download/'
	outfl <- file.path(wd, 'output.txt')
	cchdr <- file.path(wd, 'sequences')
	wt_tms <- c(3, 9, 18, 36, 60, 300, 600, 3600)
	srch_trm <- '140[SLEN] : 3000[SLEN])) AND (((ITS1[titl] OR ITS2[titl]) OR 5.8S[titl]) OR "internal transcribed spacer"[titl] OR "internal transcribed spacers"[titl] OR "ITS 1" [titl] OR "ITS 2"[titl])'

	# SETUP
	if (!file.exists(cchdr)) {
	dir.create(cchdr)
	}

	# SEARCH GIS
	# SEARCH
	cat('Searching ...\n')
	if (!file.exists(file.path(wd, 'gis.RData'))) {
	args <- list(db = 'nucleotide',
	term = srch_trm,
	retmax = 0)
	init_srch <- connect_safely(fnctn = entrez_search,
	args = args, wt_tms = wt_tms)
	cnt <- init_srch[['count']]
	if (file.exists('part_gis.RData')) {
	gis <- NULL
	load('part_gis.RData')
	strt <- which(is.na(gis))[1] - 1
	} else {
	gis <- rep(NA, cnt)
	strt <- 0
	}
	btchsz <- 10000
	for (i in seq(strt, cnt - 1, btchsz)) {
	lower <- i + 1
	upper <- ifelse(i + btchsz < cnt, i + btchsz, cnt)
	cat("... [", lower, "-", upper, '/',
	cnt,"]\n", sep = '')
	args <- list(db = 'nucleotide', term = srch_trm,
	retstart = lower, retmax = cnt)
	srch_rs <- connect_safely(fnctn = entrez_search,
	args = args, wt_tms = wt_tms)
	gis[lower:upper] <- srch_rs[['ids']]
	save(gis, file = 'part_gis.RData')
	}
	save(gis, file = 'gis.RData')
	}
	cat('Done.\n')

	# DOWNLOAD
	cat('Downloading ...\n')
	load(file.path(wd, 'gis.RData'))
	deja_vues <- get_deja_vues(cchdr = cchdr)
	gis <- gis[!gis %in% deja_vues]
	btchsz <- 100
	for (i in seq(0, length(gis) - 1, btchsz)) {
	lower <- i + 1
	upper <- ifelse(i + btchsz < length(gis),
	i + btchsz, length(gis))
	crrnt_ids <- gis[lower:upper]
	cat("... [", lower, "-", upper, '/',
	length(gis),"]\n", sep = '')
	args <- list(db = 'nucleotide',
	id = crrnt_ids, rettype = 'gb',
	retmode = 'text')
	fetched <- connect_safely(fnctn = entrez_fetch,
	args = args, wt_tms = wt_tms)
	res <- strsplit(x = fetched, split = '\\n\\n')[[1]]
	if (length(res) == length(crrnt_ids)) {
	for (j in seq_along(crrnt_ids)) {
	cache_result(id = crrnt_ids[[j]],
	res = res[[j]],
	cchdr = cchdr)
	}
	} else {
	cat('Warning: skipped some IDs!')
	}
	}

	# WRITE OUT
	fls <- list.files(cchdr, '.RData')
	if (file.exists(outfl)) {
	file.remove(outfl)
	}
	for (fl in fls) {
	rcrd <- readRDS(file.path(cchdr, fl))
	rcrd <- paste0(rcrd, '\n\n')
	write(x = rcrd, file = outfl, append = TRUE)
	}