Last active
March 22, 2019 14:09
-
-
Save ar0ch/8f1752df88a877caac17dcddf4f2cdea to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import os | |
import re | |
import shutil | |
import xml.etree.ElementTree as ET | |
try: | |
from urllib.request import urlopen, urlretrieve | |
except ImportError: | |
from urllib import urlopen, urlretrieve | |
import argparse | |
############################################################# | |
# Function : get_links | |
# Input : speciesName and schemes dict | |
# Output : Dict containing links to alleles and profile | |
# Description: Gets the URLs from pubMLST for the required | |
# files (alleles, profile) | |
############################################################# | |
def get_links(xmlData, savePath, speciesName): | |
lociList = {} | |
profileURL = None | |
for species in xmlData: | |
if re.search(re.escape(speciesName), species.text, re.IGNORECASE, ): | |
for mlst in species: | |
for database in mlst: | |
for child in database: | |
if child.tag == "profiles": | |
profileURL = child[1].text | |
if child.tag == "loci": | |
for locus in child: | |
lociList[locus.text.rstrip()] = locus[0].text | |
if profileURL is None: | |
profileError = "Parsing failed: could not find profiles file" | |
print(profileError) | |
print(f"This usually means the provided species, '{speciesName}', does not exist on PubMLST") | |
print(f"Use `{sys.argv[0]} list` to list available species") | |
print("Or visit PubMLST for more information:\nhttps://pubmlst.org/data/") | |
sys.exit(1) | |
elif lociList == {}: | |
lociError = "Parsing failed: could not find allele sequences" | |
print(lociError) | |
sys.exit(1) | |
else: | |
return profileURL, lociList | |
############################################################# | |
# Function : get_files | |
# Input : URLs from get_links | |
# Output : Downloads files and builds database | |
############################################################# | |
def get_files(filePrefix, loci, profileURL, speciesName): | |
with open(config, "w") as configFile: | |
configFile.write("[loci]\n") | |
for file in loci: | |
localFile = f"{filePrefix}_{file}.fasta" | |
try: | |
localFile, headers = urlretrieve(loci[file], localFile) | |
except: | |
print('\033[91m' + "There was an error downloading " + file + '\033[0m') | |
pass | |
configFile.write(f"{file}\t{localFile}\n") | |
localFile = filePrefix + "_profile.txt" | |
localFile, headers = urlretrieve(profileURL, localFile) | |
configFile.write("[profile]\n") | |
configFile.write("profile\t" + filePrefix + "_profile.txt\n") | |
configFile.close() | |
retVal = os.system(f"indexer -c {config} -p {filePrefix}") | |
if retVal != 0: | |
print('\033[91m' + "Failed to create database " + speciesName + '\033[0m') | |
else: | |
print('\033[92m' + "Database ready for " + speciesName + '\033[0m') | |
print(filePrefix) | |
dbURL = "http://pubmlst.org/data/dbases.xml" | |
databaseXML = urlopen(dbURL) | |
dbTree = ET.parse(databaseXML) | |
dbRoot = dbTree.getroot() | |
if len(sys.argv) < 1: | |
print(f"{sys.argv[0]} <species> <DB prefix>") | |
print() | |
print("For example:") | |
print(f"{sys.argv[0]} \"Staphylococcus aureus\" 31 ./database/") | |
print(f"Use '{sys.argv[0]} list' to list available schemes") | |
exit(0) | |
elif len(sys.argv) == 2: | |
species = sys.argv[1] | |
else: | |
species, dbPrefix = sys.argv[1:] | |
if species == "show" or species == "list": | |
for species in dbRoot: | |
print(species.text.rstrip()) | |
elif species == "all": | |
for species in dbRoot: | |
speciesName = species.text.rstrip() | |
print('\033[1m' + "Preparing: " + speciesName + '\033[0m') | |
if re.search('[/#. ()]', speciesName): | |
normSpeciesName = re.sub('[/# ]', "_", speciesName) | |
normSpeciesName = re.sub('[.()]', "", normSpeciesName) | |
print('\t\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m') | |
else: | |
normSpeciesName = speciesName | |
filePrefix = str(dbPrefix.rsplit("/", 1)[0]) + "/" + normSpeciesName | |
try: | |
os.makedirs(filePrefix) | |
except OSError: | |
pass | |
filePrefix = filePrefix + "/" + normSpeciesName | |
config = filePrefix + "_config.txt" | |
profileURL, loci = get_links(dbRoot, filePrefix, speciesName) | |
get_files(filePrefix, loci, profileURL, speciesName) | |
else: | |
print('\033[1m' + "Preparing: " + species + '\033[0m') | |
if re.search('[/#. ()]', species): | |
normSpeciesName = re.sub('[/# ]', "_", species) | |
normSpeciesName = re.sub('[.()]', "", normSpeciesName) | |
print('\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m') | |
else: | |
normSpeciesName = species | |
try: | |
os.makedirs(dbPrefix.rsplit("/", 1)[0]) | |
except OSError: | |
pass | |
if len(re.findall("/", dbPrefix)) == 0: | |
filePrefix = dbPrefix + "/" + normSpeciesName | |
elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) > 0: | |
filePrefix = dbPrefix | |
elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) == 0: | |
filePrefix = dbPrefix + normSpeciesName | |
elif len(re.findall("/", dbPrefix)) > 1: | |
if dbPrefix.endswith('/'): | |
filePrefix = dbPrefix + normSpeciesName | |
else: | |
filePrefix = dbPrefix | |
config = filePrefix + "_config.txt" | |
profileURL, loci = get_links(dbRoot, filePrefix, species) | |
get_files(filePrefix, loci, profileURL, species) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment