Skip to content

Instantly share code, notes, and snippets.

@ramongallego
Last active January 17, 2020 21:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ramongallego/1bb1301081d79a9519b9a06f55b3c758 to your computer and use it in GitHub Desktop.
Save ramongallego/1bb1301081d79a9519b9a06f55b3c758 to your computer and use it in GitHub Desktop.
An R script to query genbank using org name and locus.
library(tidyverse)
library(insect)
library(rentrez)
set_entrez_key (YOUR_KEY_HERE)
library(here)
worlds.taxonomy <- read_rds(here("data", "all.taxonomy.rds"))
# make your own taxonomy db using the command
# worlds.taxonomy <- taxonomy(), then save it for later use
#############
#############
# The easiest way is to specify everything in the query, so you don't have to wrangle with the results too much
## Example 1: 16S from Chordata mitochondrial genomes
query <- "Chordata[ORGN]+AND+16S[TITL]"
Full.search <- searchGB(query,taxIDs = T, sequences = T, prompt=F )
Fwd_primer <- "GYAATCACTTGTCTTTTAAATGAAGACC"
Rev_primer <- "GGATTGCGCTGTTATCCCTA"
Trimmed_fragments <- virtualPCR(Full.search, up = Fwd_primer, down = Rev_primer,
trimprimers = TRUE,minamplen = 250, maxamplen = 400, cores = "autodetect")
## Now let's assume that you found new sequences that you want to add - maybe full mt genomes that
## might have been missed in the first query
new.query <- "Chordata[ORGN]+AND+13000:18000[SLEN]+mitochondrion[TITL]"
search_mits <- searchGB(new.query,taxIDs = T, sequences = T, prompt=F)
Trimmed_mtos <- virtualPCR(search_mits, up = Fwd_primer, down = Rev_primer,
trimprimers = TRUE,minamplen = 250, maxamplen = 400, cores = "autodetect")
## How to add both lists?
#### Case 1: Everything went fine (Nothing is repeated in both lists?)
All.seqs <- append(Trimmed_fragments,Trimmed_mtos)
### Case 2: we have to subset, pluck and so on.
### Remove sequences without taxID
Seq.names.16S <- tibble(seq.name = names(Trimmed_fragments),
Search = "16S" )
Seq.names.16S %>%
separate(seq.name, into = c("Accession", "taxID"), sep = "\\|", remove = F) %>%
left_join(worlds.taxonomy %>%
mutate(taxID = as.character(taxID))) %>%
filter(is.na(name)) %>%
pull (seq.name) -> to.throw # Keep the names of the sequences to remove
Seq.names.16S %>%
filter (! seq.name %in% to.throw ) %>%
pull (seq.name) -> to.keep
## Now select only the seqs we want
Trimmed_fragments [to.keep] -> seqs.to.keep
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment