Skip to content

Instantly share code, notes, and snippets.

@ar0ch
Last active March 22, 2019 14:09
Show Gist options
  • Save ar0ch/8f1752df88a877caac17dcddf4f2cdea to your computer and use it in GitHub Desktop.
Save ar0ch/8f1752df88a877caac17dcddf4f2cdea to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
import os
import re
import shutil
import xml.etree.ElementTree as ET
try:
from urllib.request import urlopen, urlretrieve
except ImportError:
from urllib import urlopen, urlretrieve
import argparse
#############################################################
# Function : get_links
# Input : speciesName and schemes dict
# Output : Dict containing links to alleles and profile
# Description: Gets the URLs from pubMLST for the required
# files (alleles, profile)
#############################################################
def get_links(xmlData, savePath, speciesName):
lociList = {}
profileURL = None
for species in xmlData:
if re.search(re.escape(speciesName), species.text, re.IGNORECASE, ):
for mlst in species:
for database in mlst:
for child in database:
if child.tag == "profiles":
profileURL = child[1].text
if child.tag == "loci":
for locus in child:
lociList[locus.text.rstrip()] = locus[0].text
if profileURL is None:
profileError = "Parsing failed: could not find profiles file"
print(profileError)
print(f"This usually means the provided species, '{speciesName}', does not exist on PubMLST")
print(f"Use `{sys.argv[0]} list` to list available species")
print("Or visit PubMLST for more information:\nhttps://pubmlst.org/data/")
sys.exit(1)
elif lociList == {}:
lociError = "Parsing failed: could not find allele sequences"
print(lociError)
sys.exit(1)
else:
return profileURL, lociList
#############################################################
# Function : get_files
# Input : URLs from get_links
# Output : Downloads files and builds database
#############################################################
def get_files(filePrefix, loci, profileURL, speciesName):
with open(config, "w") as configFile:
configFile.write("[loci]\n")
for file in loci:
localFile = f"{filePrefix}_{file}.fasta"
try:
localFile, headers = urlretrieve(loci[file], localFile)
except:
print('\033[91m' + "There was an error downloading " + file + '\033[0m')
pass
configFile.write(f"{file}\t{localFile}\n")
localFile = filePrefix + "_profile.txt"
localFile, headers = urlretrieve(profileURL, localFile)
configFile.write("[profile]\n")
configFile.write("profile\t" + filePrefix + "_profile.txt\n")
configFile.close()
retVal = os.system(f"indexer -c {config} -p {filePrefix}")
if retVal != 0:
print('\033[91m' + "Failed to create database " + speciesName + '\033[0m')
else:
print('\033[92m' + "Database ready for " + speciesName + '\033[0m')
print(filePrefix)
dbURL = "http://pubmlst.org/data/dbases.xml"
databaseXML = urlopen(dbURL)
dbTree = ET.parse(databaseXML)
dbRoot = dbTree.getroot()
if len(sys.argv) < 1:
print(f"{sys.argv[0]} <species> <DB prefix>")
print()
print("For example:")
print(f"{sys.argv[0]} \"Staphylococcus aureus\" 31 ./database/")
print(f"Use '{sys.argv[0]} list' to list available schemes")
exit(0)
elif len(sys.argv) == 2:
species = sys.argv[1]
else:
species, dbPrefix = sys.argv[1:]
if species == "show" or species == "list":
for species in dbRoot:
print(species.text.rstrip())
elif species == "all":
for species in dbRoot:
speciesName = species.text.rstrip()
print('\033[1m' + "Preparing: " + speciesName + '\033[0m')
if re.search('[/#. ()]', speciesName):
normSpeciesName = re.sub('[/# ]', "_", speciesName)
normSpeciesName = re.sub('[.()]', "", normSpeciesName)
print('\t\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m')
else:
normSpeciesName = speciesName
filePrefix = str(dbPrefix.rsplit("/", 1)[0]) + "/" + normSpeciesName
try:
os.makedirs(filePrefix)
except OSError:
pass
filePrefix = filePrefix + "/" + normSpeciesName
config = filePrefix + "_config.txt"
profileURL, loci = get_links(dbRoot, filePrefix, speciesName)
get_files(filePrefix, loci, profileURL, speciesName)
else:
print('\033[1m' + "Preparing: " + species + '\033[0m')
if re.search('[/#. ()]', species):
normSpeciesName = re.sub('[/# ]', "_", species)
normSpeciesName = re.sub('[.()]', "", normSpeciesName)
print('\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m')
else:
normSpeciesName = species
try:
os.makedirs(dbPrefix.rsplit("/", 1)[0])
except OSError:
pass
if len(re.findall("/", dbPrefix)) == 0:
filePrefix = dbPrefix + "/" + normSpeciesName
elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) > 0:
filePrefix = dbPrefix
elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) == 0:
filePrefix = dbPrefix + normSpeciesName
elif len(re.findall("/", dbPrefix)) > 1:
if dbPrefix.endswith('/'):
filePrefix = dbPrefix + normSpeciesName
else:
filePrefix = dbPrefix
config = filePrefix + "_config.txt"
profileURL, loci = get_links(dbRoot, filePrefix, species)
get_files(filePrefix, loci, profileURL, species)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment