Skip to content

Instantly share code, notes, and snippets.

@aofarrel
Created May 10, 2024 19:27
Show Gist options
  • Save aofarrel/ac8d7bb0c24bbdd25d61aca73b94a62a to your computer and use it in GitHub Desktop.
Save aofarrel/ac8d7bb0c24bbdd25d61aca73b94a62a to your computer and use it in GitHub Desktop.
# Notes:
# * Works for SAMN, SAME, and SAMD BioSamples (should also work for SRS/ERS format)
# * Grabs date and location of sample isolation, host, sample source, and strain
# * The EDirect tools (esearch, efetch, xtract) need to be on $PATH; alternatively, use my Docker image, which has them pre-installed: https://hub.docker.com/r/ashedpotatoes/sranwrp/tags
# * elink is known to randomly fail so this code doesn't use it -- however, without elink, you can't get the SRA reads (SRR/ERR/DRR) that make up a BioSample
import subprocess
# BioSample accessions to look up, one esearch query per accession.
biosamples = ["SAMEA6451356","SAMEA6451357","SAMEA6451358","SAMEA6451360","SAMEA6451361","SAMEA6451362","SAMEA6451363","SAMEA6451366","SAMEA6451367","SAMEA6451368","SAMEA6451369","SAMEA6451370","SAMEA6451371","SAMEA6451372","SAMEA6451373","SAMEA6451374","SAMEA6451375","SAMEA6451377"]

# File that receives the concatenated xtract output for every accession.
OUTPUT_PATH = 'metadata.tsv'


def attribute_args(variable, harmonized_name):
    """Return the xtract arguments that extract one harmonized attribute.

    Args:
        variable: xtract variable name without the leading dash (e.g. "DATE").
        harmonized_name: value of ``Attribute@harmonized_name`` to match.

    Returns:
        A list of argv tokens: the variable is first set to the literal
        "(NA)" token, then overwritten from a matching ``Attribute`` block,
        then emitted via ``-element "&VARIABLE"``.
    """
    return [
        f"-{variable}", "(NA)",
        "-block", "Attribute",
        "-if", "Attribute@harmonized_name", "-equals", harmonized_name,
        f"-{variable}", "Attribute",
        "-block", "Attributes", "-element", f"&{variable}",
    ]


# Full xtract invocation as an argv list, so no shell is needed to parse it.
# The tokens are exactly what the shell produced from the former command
# string (quotes stripped, nothing else expanded).
XTRACT_COMMAND = [
    "xtract", "-pattern", "BioSample", "-element",
    "-SRA", "(NA)",
    "-block", "Id", "-if", "Id@db", "-equals", "SRA", "-SRA", "Id",
    "-block", "Ids", "-first", "Id", "-element", "&SRA",
    *attribute_args("DATE", "collection_date"),
    *attribute_args("LOC", "geo_loc_name"),
    *attribute_args("HOST", "host"),
    *attribute_args("SOURCE", "isolation_source"),
    *attribute_args("STRAIN", "strain"),
]


def run_pipeline(commands):
    """Run *commands* as a pipeline without a shell and return its output.

    Each command's stdout is connected to the next command's stdin, like
    ``a | b | c`` in a shell.

    Args:
        commands: non-empty sequence of argv lists.

    Returns:
        The decoded stdout of the last command.

    Raises:
        ValueError: if *commands* is empty.
        subprocess.CalledProcessError: if the LAST command exits non-zero.
            Exit codes of upstream commands are not checked, matching the
            behaviour of a plain shell pipeline.
    """
    *upstream, last = commands
    started = []
    previous_stdout = None
    try:
        for argv in upstream:
            proc = subprocess.Popen(argv, stdin=previous_stdout, stdout=subprocess.PIPE)
            if previous_stdout is not None:
                # The child now owns its copy of the pipe; dropping ours lets
                # the producer receive SIGPIPE if the consumer exits early.
                previous_stdout.close()
            started.append(proc)
            previous_stdout = proc.stdout
        return subprocess.check_output(last, stdin=previous_stdout, text=True)
    finally:
        # Always release pipe handles and reap children so that looping over
        # many accessions does not accumulate open descriptors or zombies.
        for proc in started:
            if proc.stdout is not None:
                proc.stdout.close()
            proc.wait()


def fetch_biosample_metadata(accession):
    """Return the xtract output for one BioSample accession.

    Runs ``esearch | efetch -format docsum | xtract ...`` and returns the
    text that xtract prints (possibly empty if nothing was found).
    """
    return run_pipeline([
        ["esearch", "-db", "biosample", "-query", accession],
        ["efetch", "-format", "docsum"],
        XTRACT_COMMAND,
    ])


def main():
    """Query every accession, echo each result, and write them all to disk."""
    outs = []
    for accession in biosamples:
        row = fetch_biosample_metadata(accession)
        outs.append(row)
        print(row)
    # Written only after every lookup succeeded, as before; xtract output is
    # written verbatim, so no separators are added here.
    with open(OUTPUT_PATH, 'w') as f:
        f.writelines(outs)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment