Created
October 17, 2023 13:43
-
-
Save acvill/82453b9a67b0b50cb6b85155b74c247b to your computer and use it in GitHub Desktop.
Given a list of nucleotide IDs, fetch the sequences and return them as a named DNAStringSet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Fetch nucleotide sequences from Entrez and return them as a DNAStringSet
#'
#' Requests the FASTA records for `id_list` from the Entrez "nucleotide"
#' database in a single `entrez_fetch()` call, parses the returned text into
#' individual records, and builds a named `DNAStringSet` from them.
#'
#' @param id_list Vector of nucleotide identifiers (for example accession
#'   strings such as "NC_045512"). Must contain at least one element.
#' @return A `DNAStringSet` with one sequence per FASTA record found in the
#'   response. Each name is the record's header line without the leading ">".
#' @note Attaches the rentrez and Biostrings packages as a side effect and
#'   stops with an error if either is not installed, if `id_list` is empty,
#'   or if the response contains no FASTA header line.
entrez2dss <-
  function(id_list) {
    # library() stops with an error when a package is missing, whereas
    # require() only returns FALSE and lets the failure surface later.
    library(rentrez)    # v1.2.3
    library(Biostrings) # v2.66.0

    if (length(id_list) == 0L) {
      stop("`id_list` must contain at least one nucleotide ID.",
           call. = FALSE)
    }

    # fetch all records as a single FASTA-formatted string
    fasta_txt <-
      rentrez::entrez_fetch(db = "nucleotide",
                            rettype = "fasta",
                            id = id_list)

    # Work line by line instead of splitting on "\n\n": this does not depend
    # on the exact number of blank lines between (or after) records, so no
    # record is dropped when the trailing blank line is absent.
    lines_ <- strsplit(fasta_txt, split = "\r?\n")[[1]]
    lines_ <- lines_[nzchar(lines_)]

    is_header <- startsWith(lines_, ">")
    if (!any(is_header)) {
      stop("No FASTA records were returned for the requested IDs.",
           call. = FALSE)
    }

    # get sequence names (header lines without the leading ">")
    names_ <-
      sub(pattern = "^>",
          replacement = "",
          x = lines_[is_header])

    # get sequences: every non-header line belongs to the most recent header;
    # the explicit factor levels keep records in response order and keep
    # records that have a header but no sequence lines (as empty strings)
    record_idx <- cumsum(is_header)
    seq_lines <-
      split(lines_[!is_header],
            factor(record_idx[!is_header],
                   levels = seq_len(sum(is_header))))
    seqs_ <-
      vapply(seq_lines,
             paste0,
             character(1),
             collapse = "",
             USE.NAMES = FALSE)

    # create named DNAStringSet object
    dss <- Biostrings::DNAStringSet(x = seqs_)
    names(dss) <- names_
    dss
  }
# test (needs network access): fetch two example nucleotide accessions and
# keep the resulting DNAStringSet in `cov2`
seqlist <- c("NC_045512", "OR558592")
cov2 <- entrez2dss(seqlist)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://bioinformatics.stackexchange.com/a/21716/3967