ar0ch/download_pubmlst_schemes.py

## download_pubmlst_schemes.py
#!/usr/bin/env python
import sys
import os
import re
import shutil
import xml.etree.ElementTree as ET
try:
    from urllib.request import urlopen, urlretrieve
except ImportError:
    from urllib import urlopen, urlretrieve
import argparse

#############################################################
# Function   : get_links
# Input      : speciesName and schemes dict
# Output     : Dict containing links to alleles and profile
# Description: Gets the URLs from pubMLST for the required
#              files (alleles, profile)
#############################################################
def get_links(xmlData, savePath, speciesName):
    lociList = {}
    profileURL = None
    for species in xmlData:
        if re.search(re.escape(speciesName), species.text, re.IGNORECASE, ):
            for mlst in species:
                for database in mlst:
                    for child in database:
                        if child.tag == "profiles":
                            profileURL = child[1].text
                        if child.tag == "loci":
                            for locus in child:
                                lociList[locus.text.rstrip()] = locus[0].text
    if profileURL is None:
        profileError = "Parsing failed: could not find profiles file"
        print(profileError)
        print(f"This usually means the provided species, '{speciesName}', does not exist on PubMLST")
        print(f"Use `{sys.argv[0]} list` to list available species")
        print("Or visit PubMLST for more information:\nhttps://pubmlst.org/data/")
        sys.exit(1)
    elif lociList == {}:
        lociError = "Parsing failed: could not find allele sequences"
        print(lociError)
        sys.exit(1)
    else:
        return profileURL, lociList
#############################################################
# Function   : get_files
# Input      : URLs from get_links
# Output     : Downloads files and builds database
#############################################################
def get_files(filePrefix, loci, profileURL, speciesName):
    with open(config, "w") as configFile:
        configFile.write("[loci]\n")
        for file in loci:
            localFile = f"{filePrefix}_{file}.fasta"
            try:
                localFile, headers = urlretrieve(loci[file], localFile)
            except:
                print('\033[91m' + "There was an error downloading " + file + '\033[0m')
                pass
            configFile.write(f"{file}\t{localFile}\n")
        localFile = filePrefix + "_profile.txt"
        localFile, headers = urlretrieve(profileURL, localFile)
        configFile.write("[profile]\n")
        configFile.write("profile\t" + filePrefix + "_profile.txt\n")
        configFile.close()
        retVal = os.system(f"indexer -c {config} -p {filePrefix}")
        if retVal != 0:
            print('\033[91m' + "Failed to create database " + speciesName + '\033[0m')
        else:
            print('\033[92m' + "Database ready for " + speciesName + '\033[0m')
            print(filePrefix)


dbURL = "http://pubmlst.org/data/dbases.xml"
databaseXML = urlopen(dbURL)
dbTree = ET.parse(databaseXML)
dbRoot = dbTree.getroot()

if len(sys.argv) < 1:
    print(f"{sys.argv[0]} <species> <DB prefix>")
    print()
    print("For example:")
    print(f"{sys.argv[0]} \"Staphylococcus aureus\" 31 ./database/")
    print(f"Use '{sys.argv[0]} list' to list available schemes")
    exit(0)
elif len(sys.argv) == 2:
    species = sys.argv[1]
else:
    species, dbPrefix = sys.argv[1:]

if species == "show" or species == "list":
    for species in dbRoot:
        print(species.text.rstrip())
elif species == "all":
    for species in dbRoot:
        speciesName = species.text.rstrip()
        print('\033[1m' + "Preparing: " + speciesName + '\033[0m')
        if re.search('[/#. ()]', speciesName):
            normSpeciesName = re.sub('[/# ]', "_", speciesName)
            normSpeciesName = re.sub('[.()]', "", normSpeciesName)
            print('\t\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m')
        else:
            normSpeciesName = speciesName
        filePrefix = str(dbPrefix.rsplit("/", 1)[0]) + "/" + normSpeciesName
        try:
            os.makedirs(filePrefix)
        except OSError:
           pass
        filePrefix = filePrefix + "/" + normSpeciesName
        config = filePrefix + "_config.txt"
        profileURL, loci = get_links(dbRoot, filePrefix, speciesName)
        get_files(filePrefix, loci, profileURL, speciesName)
else:
    print('\033[1m' + "Preparing: " + species + '\033[0m')
    if re.search('[/#. ()]', species):
        normSpeciesName = re.sub('[/# ]', "_", species)
        normSpeciesName = re.sub('[.()]', "", normSpeciesName)
        print('\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m')
    else:
        normSpeciesName = species
    try:
        os.makedirs(dbPrefix.rsplit("/", 1)[0])
    except OSError:
        pass
    if len(re.findall("/", dbPrefix)) == 0:
        filePrefix = dbPrefix + "/" + normSpeciesName
    elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) > 0:
        filePrefix = dbPrefix
    elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) == 0:
        filePrefix = dbPrefix + normSpeciesName
    elif len(re.findall("/", dbPrefix)) > 1:
        if dbPrefix.endswith('/'):
            filePrefix = dbPrefix + normSpeciesName
        else:
            filePrefix = dbPrefix
    config = filePrefix + "_config.txt"
    profileURL, loci = get_links(dbRoot, filePrefix, species)
    get_files(filePrefix, loci, profileURL, species)
	#!/usr/bin/env python
	import sys
	import os
	import re
	import shutil
	import xml.etree.ElementTree as ET
	try:
	from urllib.request import urlopen, urlretrieve
	except ImportError:
	from urllib import urlopen, urlretrieve
	import argparse

	#############################################################
	# Function : get_links
	# Input : speciesName and schemes dict
	# Output : Dict containing links to alleles and profile
	# Description: Gets the URLs from pubMLST for the required
	# files (alleles, profile)
	#############################################################
	def get_links(xmlData, savePath, speciesName):
	lociList = {}
	profileURL = None
	for species in xmlData:
	if re.search(re.escape(speciesName), species.text, re.IGNORECASE, ):
	for mlst in species:
	for database in mlst:
	for child in database:
	if child.tag == "profiles":
	profileURL = child[1].text
	if child.tag == "loci":
	for locus in child:
	lociList[locus.text.rstrip()] = locus[0].text
	if profileURL is None:
	profileError = "Parsing failed: could not find profiles file"
	print(profileError)
	print(f"This usually means the provided species, '{speciesName}', does not exist on PubMLST")
	print(f"Use `{sys.argv[0]} list` to list available species")
	print("Or visit PubMLST for more information:\nhttps://pubmlst.org/data/")
	sys.exit(1)
	elif lociList == {}:
	lociError = "Parsing failed: could not find allele sequences"
	print(lociError)
	sys.exit(1)
	else:
	return profileURL, lociList
	#############################################################
	# Function : get_files
	# Input : URLs from get_links
	# Output : Downloads files and builds database
	#############################################################
	def get_files(filePrefix, loci, profileURL, speciesName):
	with open(config, "w") as configFile:
	configFile.write("[loci]\n")
	for file in loci:
	localFile = f"{filePrefix}_{file}.fasta"
	try:
	localFile, headers = urlretrieve(loci[file], localFile)
	except:
	print('\033[91m' + "There was an error downloading " + file + '\033[0m')
	pass
	configFile.write(f"{file}\t{localFile}\n")
	localFile = filePrefix + "_profile.txt"
	localFile, headers = urlretrieve(profileURL, localFile)
	configFile.write("[profile]\n")
	configFile.write("profile\t" + filePrefix + "_profile.txt\n")
	configFile.close()
	retVal = os.system(f"indexer -c {config} -p {filePrefix}")
	if retVal != 0:
	print('\033[91m' + "Failed to create database " + speciesName + '\033[0m')
	else:
	print('\033[92m' + "Database ready for " + speciesName + '\033[0m')
	print(filePrefix)


	dbURL = "http://pubmlst.org/data/dbases.xml"
	databaseXML = urlopen(dbURL)
	dbTree = ET.parse(databaseXML)
	dbRoot = dbTree.getroot()

	if len(sys.argv) < 1:
	print(f"{sys.argv[0]} <species> <DB prefix>")
	print()
	print("For example:")
	print(f"{sys.argv[0]} \"Staphylococcus aureus\" 31 ./database/")
	print(f"Use '{sys.argv[0]} list' to list available schemes")
	exit(0)
	elif len(sys.argv) == 2:
	species = sys.argv[1]
	else:
	species, dbPrefix = sys.argv[1:]

	if species == "show" or species == "list":
	for species in dbRoot:
	print(species.text.rstrip())
	elif species == "all":
	for species in dbRoot:
	speciesName = species.text.rstrip()
	print('\033[1m' + "Preparing: " + speciesName + '\033[0m')
	if re.search('[/#. ()]', speciesName):
	normSpeciesName = re.sub('[/# ]', "_", speciesName)
	normSpeciesName = re.sub('[.()]', "", normSpeciesName)
	print('\t\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m')
	else:
	normSpeciesName = speciesName
	filePrefix = str(dbPrefix.rsplit("/", 1)[0]) + "/" + normSpeciesName
	try:
	os.makedirs(filePrefix)
	except OSError:
	pass
	filePrefix = filePrefix + "/" + normSpeciesName
	config = filePrefix + "_config.txt"
	profileURL, loci = get_links(dbRoot, filePrefix, speciesName)
	get_files(filePrefix, loci, profileURL, speciesName)
	else:
	print('\033[1m' + "Preparing: " + species + '\033[0m')
	if re.search('[/#. ()]', species):
	normSpeciesName = re.sub('[/# ]', "_", species)
	normSpeciesName = re.sub('[.()]', "", normSpeciesName)
	print('\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m')
	else:
	normSpeciesName = species
	try:
	os.makedirs(dbPrefix.rsplit("/", 1)[0])
	except OSError:
	pass
	if len(re.findall("/", dbPrefix)) == 0:
	filePrefix = dbPrefix + "/" + normSpeciesName
	elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) > 0:
	filePrefix = dbPrefix
	elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) == 0:
	filePrefix = dbPrefix + normSpeciesName
	elif len(re.findall("/", dbPrefix)) > 1:
	if dbPrefix.endswith('/'):
	filePrefix = dbPrefix + normSpeciesName
	else:
	filePrefix = dbPrefix
	config = filePrefix + "_config.txt"
	profileURL, loci = get_links(dbRoot, filePrefix, species)
	get_files(filePrefix, loci, profileURL, species)