Skip to content

Instantly share code, notes, and snippets.

@nickloman
Created October 10, 2012 09:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nickloman/3864281 to your computer and use it in GitHub Desktop.
Save nickloman/3864281 to your computer and use it in GitHub Desktop.
MLST_blast.py
# a little script to run BLASTN (BLAST+) and generate an MLST profile from a database defined in 'bugs'
# The alleles and profile files should be located in the directory BLAST_DIRECTORY/<bugname>/
# requires Biopython
# use like:
# process('Acinetobacter_baumannii', open('contigs.fa'))
from Bio import SeqIO
from Bio.Blast import NCBIXML
from StringIO import StringIO
import sys
import subprocess
import re
BLAST_PATH = '/path/to/blast+/directory'
BLAST_DIRECTORY = '/path/to/allele_files/'
bugs = {
'Acinetobacter_baumannii' : {'profile_file' : 'abaumannii.txt', 'list' : ['gltA.tfa', 'gyrB.tfa', 'gdhB.tfa', 'recA.tfa', 'cpn60.tfa', 'gpi.tfa', 'rpoD.tfa'], 'separator' : '_'},
'Acinetobacter_baumannii_Pasteur' : {'profile_file' : 'profiles.txt', 'list' : ['cpn60.fa', 'fusA.fa', 'gltA.fa', 'pyrG.fa', 'recA.fa', 'rplB.fa', 'rpoB.fa'], 'separator' : '-'},
'Streptococcus_pneumoniae' : {'profile_file' : 'spneumoniae.txt', 'list' : ['aroe.tfa', 'gdh_.tfa', 'gki_.tfa', 'recP.tfa', 'spi_.tfa', 'xpt_.tfa', 'ddl_.tfa'], 'separator' : '-'},
'Pseudomonas_aeruginosa' : {'profile_file' : 'paeruginosa.txt', 'list' : ['acs.tfa', 'aro.tfa', 'gua.tfa', 'mut.tfa', 'nuo.tfa', 'pps.tfa', 'trp.tfa'], 'separator' : '_'},
'Escherichia_coli' : {'profile_file' : 'publicSTs.txt', 'list' : ['ADK.fas', 'FUMC.fas', 'GYRB.fas', 'ICD.fas', 'MDH.fas', 'PURA.fas', 'RECA.fas'], 'separator' : re.compile('([a-zA-Z]+)(\d+)'), 'colnum' : 2},
}
class MLSTResult:
def __init__(self, locus, allele, score):
self.locus = locus
self.allele = allele
self.score = score
class MLSTProfile:
def __init__(self, colnum = 1):
self.profile = 'unknown'
self.alleles = []
self.colnum = colnum
self.line = ''
def compute_profile(self, fn):
# skip header line
itr = open(fn)
itr.next()
for ln in itr:
cols = ln.split()
if [str(x) for x in cols[self.colnum:self.colnum + len(self.alleles)]] \
== \
[str(a.allele) for a in self.alleles]:
print "match"
self.profile = cols[0]
self.line = ln
def by_identity(a, b):
return cmp(
float(a.hsps[0].identities) / float(a.length),
float(b.hsps[0].identities) / float(b.length)
)
def process(bug, file_handle):
concat = ">seq\n"
for rec in SeqIO.parse(file_handle, "fasta"):
concat += rec.seq.tostring()
concat += 'NNNNNN'
result = MLSTProfile(bugs[bug].get('colnum', 1))
for db in bugs[bug]['list']:
p = subprocess.Popen(
[BLAST_PATH + '/' + 'blastn', '-db', BLAST_DIRECTORY + '/' + bug + '/' + db, '-outfmt', '5'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
)
out, err = p.communicate(concat)
for rec in NCBIXML.parse(StringIO(out)):
rec.alignments.sort(by_identity, reverse=True)
hit = rec.alignments[0]
hsp = hit.hsps[0]
identical = float(hsp.identities) / float(hit.length)
tag = hit.title.split(" ")[1]
print bugs[bug]['separator']
try:
name, allele = tag.split(bugs[bug]['separator'])
except TypeError:
m = re.match(bugs[bug]['separator'], tag)
name = m.group(1)
allele = m.group(2)
# re.split("[_-]", tag)
print tag, name, allele
if identical == 1.0:
result.alleles.append(MLSTResult(name, allele, str(identical)))
else:
result.alleles.append(MLSTResult(name, 'no perfect match- best is ' + allele, str(identical)))
if not result.alleles:
raise ValueError('something went wrong!')
result.compute_profile(BLAST_DIRECTORY + '/' + bug + '/' + bugs[bug]['profile_file'])
return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment