Created
October 23, 2018 03:25
-
-
Save philippbayer/df50354d4917523bf0c6db35dddbe33f to your computer and use it in GitHub Desktop.
For a file of SRA run IDs (e.g. ERR457868), query the Sequence Read Archive for the associated BioSample accessions (e.g. SAMEA2399445).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rentrez)
library(assertthat)
library(readr)
#' Look up the SRA record ID for a single run accession.
#'
#' Searches the "sra" Entrez database for `term` and returns the matching
#' record ID, e.g. searching ERR457868 returns "1011219".
#'
#' @param term A run accession to search for (e.g. "ERR457868").
#' @return The `ids` element of the search result. An assertion error is
#'   raised unless it has exactly one element.
search_ind <- function(term) {
  results <- entrez_search(db = "sra", term = term)$ids
  # Exactly one hit is expected per run ID; name the offending term so a
  # failure in the middle of a long batch can be traced.
  assert_that(
    length(results) == 1,
    msg = paste0(
      "Expected exactly 1 SRA record for '", term,
      "', found ", length(results)
    )
  )
  results
}
#' Find the BioSample record linked to an SRA record.
#'
#' Asks Entrez for links from the "sra" database to the "biosample" database
#' for the given SRA record ID and keeps only the `sra_biosample` link set.
#'
#' @param term An SRA record ID as returned by `search_ind()`
#'   (e.g. "1011219").
#' @return The linked BioSample record ID. An assertion error is raised
#'   unless there is exactly one.
search_links <- function(term) {
  link_result <- entrez_link(dbfrom = "sra", id = term, db = "biosample")
  results <- link_result$links$sra_biosample
  # There should be only one BioSample per SRA record; report which ID broke
  # that expectation instead of failing with a generic message.
  assert_that(
    length(results) == 1,
    msg = paste0(
      "Expected exactly 1 BioSample link for SRA ID '", term,
      "', found ", length(results)
    )
  )
  results
}
#' Fetch the accession of a BioSample record.
#'
#' Requests the Entrez summary for a record in the "biosample" database and
#' returns only its `accession` field.
#'
#' @param term A BioSample record ID as returned by `search_links()`
#'   (e.g. "3087115").
#' @return The `accession` element of the summary (e.g. "SAMEA2399445").
search_summary <- function(term) {
  # Named `accession` rather than `summary` to avoid shadowing base::summary.
  accession <- entrez_summary(db = "biosample", id = term)$accession
  accession
}
# Read the run IDs (no header row) into column V1. stringsAsFactors is set
# explicitly so the IDs are plain character strings on R < 4.0 as well.
# ERR457868 ERR475358 ERR475359 ERR475360 ERR475361 ERR479604 ....
names <- read.table(
  "./Names_only.txt",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Step 1: run ID -> SRA record ID.
# vapply() pins each lookup to a single string, unlike sapply(), whose result
# type silently changes on empty or ragged results.
# [1] "1011219" "1533032" "984971" "984969" "984970" "1533033" ....
ids <- vapply(names$V1, search_ind, character(1), USE.NAMES = FALSE)

# Step 2: SRA record ID -> linked BioSample record ID.
# "3087115" "3769628" "3031276" "3031277" "3031278" "3769630"
links <- vapply(ids, search_links, character(1), USE.NAMES = FALSE)

# Step 3: BioSample record ID -> BioSample accession.
# [1] "SAMEA2399445" "SAMEA2445339" "SAMEA2729910" "SAMEA2445340" ....
accession_ids <- vapply(links, search_summary, character(1), USE.NAMES = FALSE)

# Attach the accessions next to their run IDs and write the mapping out.
names$V2 <- accession_ids
write_csv(names, "IDs_to_Samples.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment