nickloman/MLST_blast.py

## MLST_blast.py
# a little script to run BLASTN (BLAST+) and generate an MLST profile from a database defined in 'bugs'
# The alleles and profile files should be located in the directory BLAST_DIRECTORY/<bugname>/
# requires Biopython
# use like:
#  process('Acinetobacter_baumannii', open('contigs.fa'))

from Bio import SeqIO
from Bio.Blast import NCBIXML
from StringIO import StringIO
import sys
import subprocess
import re

BLAST_PATH = '/path/to/blast+/directory'
BLAST_DIRECTORY = '/path/to/allele_files/'

bugs = {
    'Acinetobacter_baumannii' : {'profile_file' : 'abaumannii.txt', 'list' : ['gltA.tfa', 'gyrB.tfa', 'gdhB.tfa', 'recA.tfa', 'cpn60.tfa', 'gpi.tfa', 'rpoD.tfa'], 'separator' : '_'},
    'Acinetobacter_baumannii_Pasteur' : {'profile_file' : 'profiles.txt', 'list' : ['cpn60.fa',  'fusA.fa',  'gltA.fa',  'pyrG.fa',  'recA.fa',  'rplB.fa',  'rpoB.fa'], 'separator' : '-'},
    'Streptococcus_pneumoniae' : {'profile_file' : 'spneumoniae.txt', 'list' : ['aroe.tfa', 'gdh_.tfa', 'gki_.tfa', 'recP.tfa', 'spi_.tfa', 'xpt_.tfa', 'ddl_.tfa'], 'separator' : '-'},
    'Pseudomonas_aeruginosa' : {'profile_file' : 'paeruginosa.txt', 'list' : ['acs.tfa', 'aro.tfa', 'gua.tfa', 'mut.tfa', 'nuo.tfa', 'pps.tfa', 'trp.tfa'], 'separator' : '_'},
    'Escherichia_coli' : {'profile_file' : 'publicSTs.txt', 'list' : ['ADK.fas', 'FUMC.fas', 'GYRB.fas', 'ICD.fas', 'MDH.fas', 'PURA.fas', 'RECA.fas'], 'separator' : re.compile('([a-zA-Z]+)(\d+)'), 'colnum' : 2},
}

class MLSTResult:
    def __init__(self, locus, allele, score):
        self.locus = locus
        self.allele = allele
        self.score = score

class MLSTProfile:
    def __init__(self, colnum = 1):
        self.profile = 'unknown'
        self.alleles = []
        self.colnum = colnum
        self.line = ''

    def compute_profile(self, fn):
        # skip header line
        itr = open(fn)
        itr.next()
        for ln in itr:
            cols = ln.split()

            if [str(x) for x in cols[self.colnum:self.colnum + len(self.alleles)]] \
               == \
               [str(a.allele) for a in self.alleles]:
                print "match"
                self.profile = cols[0]
                self.line = ln


def by_identity(a, b):
    return cmp(
        float(a.hsps[0].identities) / float(a.length),
        float(b.hsps[0].identities) / float(b.length)
    )

def process(bug, file_handle):
    concat = ">seq\n"
    for rec in SeqIO.parse(file_handle, "fasta"):
        concat += rec.seq.tostring()
        concat += 'NNNNNN'

    result = MLSTProfile(bugs[bug].get('colnum', 1))
    for db in bugs[bug]['list']:
        p = subprocess.Popen(
            [BLAST_PATH + '/' + 'blastn', '-db', BLAST_DIRECTORY + '/' + bug + '/' + db, '-outfmt', '5'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )

        out, err = p.communicate(concat)

        for rec in NCBIXML.parse(StringIO(out)):
            rec.alignments.sort(by_identity, reverse=True)
            hit = rec.alignments[0]
            hsp = hit.hsps[0]

            identical = float(hsp.identities) / float(hit.length)

            tag = hit.title.split(" ")[1]

            print bugs[bug]['separator']
            try:
                name, allele = tag.split(bugs[bug]['separator'])
            except TypeError:
                m = re.match(bugs[bug]['separator'], tag)
                name = m.group(1)
                allele = m.group(2)


          #  re.split("[_-]", tag)
            print tag, name, allele

            if identical == 1.0:
                result.alleles.append(MLSTResult(name, allele, str(identical)))
            else:
                result.alleles.append(MLSTResult(name, 'no perfect match- best is ' + allele, str(identical)))

    if not result.alleles:
        raise ValueError('something went wrong!')

    result.compute_profile(BLAST_DIRECTORY + '/' + bug + '/' + bugs[bug]['profile_file'])

    return result
	# a little script to run BLASTN (BLAST+) and generate an MLST profile from a database defined in 'bugs'
	# The alleles and profile files should be located in the directory BLAST_DIRECTORY/<bugname>/
	# requires Biopython
	# use like:
	# process('Acinetobacter_baumannii', open('contigs.fa'))

	from Bio import SeqIO
	from Bio.Blast import NCBIXML
	from StringIO import StringIO
	import sys
	import subprocess
	import re

	BLAST_PATH = '/path/to/blast+/directory'
	BLAST_DIRECTORY = '/path/to/allele_files/'

	bugs = {
	'Acinetobacter_baumannii' : {'profile_file' : 'abaumannii.txt', 'list' : ['gltA.tfa', 'gyrB.tfa', 'gdhB.tfa', 'recA.tfa', 'cpn60.tfa', 'gpi.tfa', 'rpoD.tfa'], 'separator' : '_'},
	'Acinetobacter_baumannii_Pasteur' : {'profile_file' : 'profiles.txt', 'list' : ['cpn60.fa', 'fusA.fa', 'gltA.fa', 'pyrG.fa', 'recA.fa', 'rplB.fa', 'rpoB.fa'], 'separator' : '-'},
	'Streptococcus_pneumoniae' : {'profile_file' : 'spneumoniae.txt', 'list' : ['aroe.tfa', 'gdh_.tfa', 'gki_.tfa', 'recP.tfa', 'spi_.tfa', 'xpt_.tfa', 'ddl_.tfa'], 'separator' : '-'},
	'Pseudomonas_aeruginosa' : {'profile_file' : 'paeruginosa.txt', 'list' : ['acs.tfa', 'aro.tfa', 'gua.tfa', 'mut.tfa', 'nuo.tfa', 'pps.tfa', 'trp.tfa'], 'separator' : '_'},
	'Escherichia_coli' : {'profile_file' : 'publicSTs.txt', 'list' : ['ADK.fas', 'FUMC.fas', 'GYRB.fas', 'ICD.fas', 'MDH.fas', 'PURA.fas', 'RECA.fas'], 'separator' : re.compile('([a-zA-Z]+)(\d+)'), 'colnum' : 2},
	}

	class MLSTResult:
	def __init__(self, locus, allele, score):
	self.locus = locus
	self.allele = allele
	self.score = score

	class MLSTProfile:
	def __init__(self, colnum = 1):
	self.profile = 'unknown'
	self.alleles = []
	self.colnum = colnum
	self.line = ''

	def compute_profile(self, fn):
	# skip header line
	itr = open(fn)
	itr.next()
	for ln in itr:
	cols = ln.split()

	if [str(x) for x in cols[self.colnum:self.colnum + len(self.alleles)]] \
	== \
	[str(a.allele) for a in self.alleles]:
	print "match"
	self.profile = cols[0]
	self.line = ln


	def by_identity(a, b):
	return cmp(
	float(a.hsps[0].identities) / float(a.length),
	float(b.hsps[0].identities) / float(b.length)
	)

	def process(bug, file_handle):
	concat = ">seq\n"
	for rec in SeqIO.parse(file_handle, "fasta"):
	concat += rec.seq.tostring()
	concat += 'NNNNNN'

	result = MLSTProfile(bugs[bug].get('colnum', 1))
	for db in bugs[bug]['list']:
	p = subprocess.Popen(
	[BLAST_PATH + '/' + 'blastn', '-db', BLAST_DIRECTORY + '/' + bug + '/' + db, '-outfmt', '5'],
	stdin=subprocess.PIPE,
	stdout=subprocess.PIPE,
	)

	out, err = p.communicate(concat)

	for rec in NCBIXML.parse(StringIO(out)):
	rec.alignments.sort(by_identity, reverse=True)
	hit = rec.alignments[0]
	hsp = hit.hsps[0]

	identical = float(hsp.identities) / float(hit.length)

	tag = hit.title.split(" ")[1]

	print bugs[bug]['separator']
	try:
	name, allele = tag.split(bugs[bug]['separator'])
	except TypeError:
	m = re.match(bugs[bug]['separator'], tag)
	name = m.group(1)
	allele = m.group(2)


	# re.split("[_-]", tag)
	print tag, name, allele

	if identical == 1.0:
	result.alleles.append(MLSTResult(name, allele, str(identical)))
	else:
	result.alleles.append(MLSTResult(name, 'no perfect match- best is ' + allele, str(identical)))

	if not result.alleles:
	raise ValueError('something went wrong!')

	result.compute_profile(BLAST_DIRECTORY + '/' + bug + '/' + bugs[bug]['profile_file'])

	return result