Get GenBank records of all strains of a specific bacterial species
from Bio import Entrez | |
import os | |
import gzip | |
""" | |
To avoid access-rate problems (e.g. "HTTP Error 429: Too Many Requests"),
set an NCBI API key. Instructions for obtaining one:
https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/ | |
""" | |
# Bio.Entrez module-level settings used by every request below.
# The angle-bracket values are placeholders: replace them before running.
Entrez.tool = 'Demoscript'
Entrez.email = '<your email here>'
Entrez.api_key = '<your api key here>'
def _entrez_read(handle):
    """Parse an Entrez XML response and close the handle even if parsing fails."""
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def _search_all_ids(db, term):
    """Return the complete IdList for ``term`` in ``db``.

    The first request uses the server's default page size; if the reported
    ``Count`` is larger than the number of ids returned, the search is
    repeated with ``retmax`` set to that count so nothing is dropped.
    """
    found = _entrez_read(Entrez.esearch(db=db, term=term, rettype='uilist'))
    total = int(found['Count'])
    if total > len(found['IdList']):
        found = _entrez_read(
            Entrez.esearch(db=db, term=term, retmax=total, rettype='uilist'))
    return list(found['IdList'])


def _download_genbank(identity, gb_file):
    """Fetch one nucleotide record as GenBank text and store it gzip-compressed.

    The data is written to ``gb_file + ".part"`` first and only renamed to
    ``gb_file`` once complete, so an interrupted run never leaves a truncated
    file that a later run would mistake for a finished download.
    """
    gb_entry = Entrez.efetch(
        db="nucleotide",
        id=identity,
        rettype="gbwithparts",
        retmode="text")
    try:
        payload = gb_entry.read().encode()
    finally:
        gb_entry.close()

    partial = gb_file + ".part"
    try:
        with gzip.open(partial, 'wb') as fs:
            fs.write(payload)
        os.replace(partial, gb_file)
    finally:
        # After a successful rename the temp file is gone; this only fires
        # when writing or renaming failed part-way.
        if os.path.exists(partial):
            os.remove(partial)


def getStrainSequences(generic_name, specific_name, genome_dir):
    """
    Download the GenBank records of every complete RefSeq assembly of a species.

    :param generic_name: the first part of species name
    :param specific_name: the second part of species name
    :param genome_dir: existing directory that receives one ``<id>.gb.gz`` file
        per nucleotide record; files already present are not downloaded again
    :return: return a dictionary of strains (assembly entry) for this species.
        AssemblyAccession -> [id0, id1, ...]
    """
    assembly_accessions = {}
    species_name = generic_name + " " + specific_name
    assembly_term = (
        '{}[Organism] AND ( "latest refseq"[filter] AND "complete genome"[filter] AND '
        '( all[filter] NOT anomalous[filter] ) )'.format(species_name))

    for assembly_id in _search_all_ids("assembly", assembly_term):
        summary = _entrez_read(Entrez.esummary(db="assembly", id=assembly_id))
        assembly_accession = \
            summary['DocumentSummarySet']['DocumentSummary'][0]['AssemblyAccession']
        assembly_accessions[assembly_accession] = []

        nucleotide_ids = _search_all_ids(
            "nucleotide", "{}[Assembly]".format(assembly_accession))
        for identity in nucleotide_ids:
            assembly_accessions[assembly_accession].append(identity)
            gb_file = os.path.join(genome_dir, identity + ".gb.gz")
            if not os.path.exists(gb_file):
                _download_genbank(identity, gb_file)
        print("Downloading is done: " + assembly_accession)
    return assembly_accessions
# Driver: read one "Genus species" name per line from the species list,
# download every strain of each species into genome_dir, and store the
# resulting {species line -> {AssemblyAccession -> [ids]}} mapping as text.
genome_dir = "streptococcus_genomes"
# exist_ok avoids the check-then-create race of exists() + mkdir().
os.makedirs(genome_dir, exist_ok=True)

result = {}
with open('streprococcus.txt') as fs:
    for line in fs:
        species = line.rstrip()
        name_parts = species.split()
        # Blank or single-word lines cannot be split into genus + species;
        # skip them instead of failing with IndexError.
        if len(name_parts) < 2:
            continue
        result[species] = getStrainSequences(name_parts[0], name_parts[1], genome_dir)

with open(os.path.join(genome_dir, "result.txt"), "w") as fs:
    fs.write(str(result))
Streptococcus acidominimus | |
Streptococcus agalactiae | |
Streptococcus alactolyticus | |
Streptococcus anginosus | |
Streptococcus australis | |
Streptococcus bovis | |
Streptococcus caballi | |
Streptococcus cameli | |
Streptococcus canis | |
Streptococcus caprae | |
Streptococcus castoreus | |
Streptococcus criceti | |
Streptococcus constellatus | |
Streptococcus cuniculi | |
Streptococcus danieliae | |
Streptococcus dentasini | |
Streptococcus dentiloxodontae | |
Streptococcus dentirousetti | |
Streptococcus devriesei | |
Streptococcus didelphis | |
Streptococcus downei | |
Streptococcus dysgalactiae | |
Streptococcus entericus | |
Streptococcus equi | |
Streptococcus equinus | |
Streptococcus ferus | |
Streptococcus gallinaceus | |
Streptococcus gallolyticus | |
Streptococcus gordonii | |
Streptococcus halichoeri | |
Streptococcus halotolerans | |
Streptococcus henryi | |
Streptococcus himalayensis | |
Streptococcus hongkongensis | |
Streptococcus hyointestinalis | |
Streptococcus hyovaginalis | |
Streptococcus ictaluri | |
Streptococcus infantarius | |
Streptococcus infantis | |
Streptococcus iniae | |
Streptococcus intermedius | |
Streptococcus lactarius | |
Streptococcus loxodontisalivarius | |
Streptococcus lutetiensis | |
Streptococcus macacae | |
Streptococcus marimammalium | |
Streptococcus marmotae | |
Streptococcus massiliensis | |
Streptococcus merionis | |
Streptococcus minor | |
Streptococcus mitis | |
Streptococcus moroccensis | |
Streptococcus mutans | |
Streptococcus oligofermentans | |
Streptococcus oralis | |
Streptococcus oricebi | |
Streptococcus oriloxodontae | |
Streptococcus orisasini | |
Streptococcus orisratti | |
Streptococcus orisuis | |
Streptococcus ovis | |
Streptococcus panodentis | |
Streptococcus pantholopis | |
Streptococcus parasanguinis | |
Streptococcus parasuis | |
Streptococcus parauberis | |
Streptococcus peroris | |
Streptococcus pharyngis | |
Streptococcus phocae | |
Streptococcus pluranimalium | |
Streptococcus plurextorum | |
Streptococcus pneumoniae | |
Streptococcus porci | |
Streptococcus porcinus | |
Streptococcus porcorum | |
Streptococcus pseudopneumoniae | |
Streptococcus pseudoporcinus | |
Streptococcus pyogenes | |
Streptococcus ratti | |
Streptococcus rifensis | |
Streptococcus rubneri | |
Streptococcus rupicaprae | |
Streptococcus salivarius | |
Streptococcus saliviloxodontae | |
Streptococcus sanguinis | |
Streptococcus sinensis | |
Streptococcus sobrinus | |
Streptococcus suis | |
Streptococcus tangierensis | |
Streptococcus thoraltensis | |
Streptococcus troglodytae | |
Streptococcus troglodytidis | |
Streptococcus tigurinus | |
Streptococcus thermophilus | |
Streptococcus uberis | |
Streptococcus urinalis | |
Streptococcus ursoris | |
Streptococcus vestibularis | |
Streptococcus zooepidemicus |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment