Skip to content

Instantly share code, notes, and snippets.

@pgsin
Last active September 18, 2019 15:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pgsin/e360ca75880ad5f8ad1afe319e734989 to your computer and use it in GitHub Desktop.
Save pgsin/e360ca75880ad5f8ad1afe319e734989 to your computer and use it in GitHub Desktop.
Get the GenBank records of all strains of a specific bacterial species
from Bio import Entrez
import os
import gzip
"""
To avoid hitting the NCBI request-rate limit (e.g. "HTTP Error 429: Too Many Requests"),
set an API key below. How to get an API key:
https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
"""
# Module-wide Bio.Entrez settings. The quoted values are placeholders:
# replace them with your own API key and e-mail address before running.
Entrez.api_key = '<your api key here>'
Entrez.email = '<your email here>'
# Name under which this script identifies itself.
Entrez.tool = 'Demoscript'
def _read_and_close(handle):
    """Parse an Entrez XML handle with ``Entrez.read`` and always close it."""
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def _search_all_ids(db, term):
    """Return every id matching ``term`` in ``db``, not just the first page."""
    # First ask only for the number of hits, then request exactly that many
    # ids; without an explicit retmax ESearch returns only its default page.
    count_info = _read_and_close(
        Entrez.esearch(db=db, term=term, rettype='count'))
    count = int(count_info['Count'])
    if count == 0:
        return []
    found = _read_and_close(
        Entrez.esearch(db=db, term=term, retmax=count, rettype='uilist'))
    return list(found['IdList'])


def _download_genbank(identity, gb_file):
    """Fetch one nucleotide record as GenBank text and store it gzipped."""
    # Write to a temporary name and rename only after a complete write, so an
    # interrupted download is never mistaken for a finished file on rerun.
    tmp_file = gb_file + ".part"
    gb_entry = Entrez.efetch(
        db="nucleotide",
        id=identity,
        rettype="gbwithparts",
        retmode="text")
    try:
        data = gb_entry.read()
        if isinstance(data, str):
            data = data.encode()
        with gzip.open(tmp_file, 'wb') as fs:
            fs.write(data)
        os.replace(tmp_file, gb_file)
    finally:
        gb_entry.close()
        if os.path.exists(tmp_file):
            os.remove(tmp_file)


def getStrainSequences(generic_name, specific_name, genome_dir):
    """
    Download the GenBank records of all complete RefSeq genomes of a species.

    For every matching assembly, each linked nucleotide record is saved as
    "<id>.gb.gz" inside genome_dir. Files that already exist are not
    downloaded again.

    :param generic_name: the first part of species name
    :param specific_name: the second part of species name
    :param genome_dir: existing directory that receives the "<id>.gb.gz" files
    :return: return a dictionary of strains (assembly entry) for this species. AssemblyAccession -> [id0, id1, ...]
    """
    assembly_accessions = {}
    species_name = generic_name + " " + specific_name
    assembly_term = (
        '{}[Organism] AND ( "latest refseq"[filter] AND "complete genome"[filter] AND '
        '( all[filter] NOT anomalous[filter] ) )'.format(species_name))

    for assembly_id in _search_all_ids("assembly", assembly_term):
        summary = _read_and_close(
            Entrez.esummary(db="assembly", id=assembly_id))
        assembly_accession = \
            summary['DocumentSummarySet']['DocumentSummary'][0]['AssemblyAccession']
        assembly_accessions[assembly_accession] = []

        # An assembly can consist of several sequences (chromosome, plasmids),
        # so all linked nucleotide ids are requested.
        nucleotide_ids = _search_all_ids(
            "nucleotide", "{}[Assembly]".format(assembly_accession))
        for identity in nucleotide_ids:
            assembly_accessions[assembly_accession].append(identity)
            gb_file = os.path.join(genome_dir, identity + ".gb.gz")
            if not os.path.exists(gb_file):
                _download_genbank(identity, gb_file)
        print("Downloading is done: " + assembly_accession)
    return assembly_accessions
# Driver: read one species name per line ("Genus species"), download the
# genomes of each species and store a summary of what was found.
genome_dir = "streptococcus_genomes"
# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs(genome_dir, exist_ok=True)
result = {}
with open('streprococcus.txt') as fs:
    for line in fs:
        species = line.rstrip()
        name_parts = species.split()
        # Skip blank or incomplete lines instead of failing with IndexError.
        if len(name_parts) < 2:
            continue
        result[species] = getStrainSequences(name_parts[0], name_parts[1], genome_dir)
# The summary maps each input line to {AssemblyAccession: [nucleotide ids]}.
with open(os.path.join(genome_dir, "result.txt"), "w") as fs:
    fs.write(str(result))
Streptococcus acidominimus
Streptococcus agalactiae
Streptococcus alactolyticus
Streptococcus anginosus
Streptococcus australis
Streptococcus bovis
Streptococcus caballi
Streptococcus cameli
Streptococcus canis
Streptococcus caprae
Streptococcus castoreus
Streptococcus criceti
Streptococcus constellatus
Streptococcus cuniculi
Streptococcus danieliae
Streptococcus dentasini
Streptococcus dentiloxodontae
Streptococcus dentirousetti
Streptococcus devriesei
Streptococcus didelphis
Streptococcus downei
Streptococcus dysgalactiae
Streptococcus entericus
Streptococcus equi
Streptococcus equinus
Streptococcus ferus
Streptococcus gallinaceus
Streptococcus gallolyticus
Streptococcus gordonii
Streptococcus halichoeri
Streptococcus halotolerans
Streptococcus henryi
Streptococcus himalayensis
Streptococcus hongkongensis
Streptococcus hyointestinalis
Streptococcus hyovaginalis
Streptococcus ictaluri
Streptococcus infantarius
Streptococcus infantis
Streptococcus iniae
Streptococcus intermedius
Streptococcus lactarius
Streptococcus loxodontisalivarius
Streptococcus lutetiensis
Streptococcus macacae
Streptococcus marimammalium
Streptococcus marmotae
Streptococcus massiliensis
Streptococcus merionis
Streptococcus minor
Streptococcus mitis
Streptococcus moroccensis
Streptococcus mutans
Streptococcus oligofermentans
Streptococcus oralis
Streptococcus oricebi
Streptococcus oriloxodontae
Streptococcus orisasini
Streptococcus orisratti
Streptococcus orisuis
Streptococcus ovis
Streptococcus panodentis
Streptococcus pantholopis
Streptococcus parasanguinis
Streptococcus parasuis
Streptococcus parauberis
Streptococcus peroris
Streptococcus pharyngis
Streptococcus phocae
Streptococcus pluranimalium
Streptococcus plurextorum
Streptococcus pneumoniae
Streptococcus porci
Streptococcus porcinus
Streptococcus porcorum
Streptococcus pseudopneumoniae
Streptococcus pseudoporcinus
Streptococcus pyogenes
Streptococcus ratti
Streptococcus rifensis
Streptococcus rubneri
Streptococcus rupicaprae
Streptococcus salivarius
Streptococcus saliviloxodontae
Streptococcus sanguinis
Streptococcus sinensis
Streptococcus sobrinus
Streptococcus suis
Streptococcus tangierensis
Streptococcus thoraltensis
Streptococcus troglodytae
Streptococcus troglodytidis
Streptococcus tigurinus
Streptococcus thermophilus
Streptococcus uberis
Streptococcus urinalis
Streptococcus ursoris
Streptococcus vestibularis
Streptococcus zooepidemicus
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment