Created
October 10, 2012 09:13
-
-
Save nickloman/3864281 to your computer and use it in GitHub Desktop.
MLST_blast.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# a little script to run BLASTN (BLAST+) and generate an MLST profile from a database defined in 'bugs' | |
# The alleles and profile files should be located in the directory BLAST_DIRECTORY/<bugname>/ | |
# requires Biopython | |
# use like: | |
# process('Acinetobacter_baumannii', open('contigs.fa')) | |
from Bio import SeqIO | |
from Bio.Blast import NCBIXML | |
from StringIO import StringIO | |
import sys | |
import subprocess | |
import re | |
BLAST_PATH = '/path/to/blast+/directory' | |
BLAST_DIRECTORY = '/path/to/allele_files/' | |
bugs = { | |
'Acinetobacter_baumannii' : {'profile_file' : 'abaumannii.txt', 'list' : ['gltA.tfa', 'gyrB.tfa', 'gdhB.tfa', 'recA.tfa', 'cpn60.tfa', 'gpi.tfa', 'rpoD.tfa'], 'separator' : '_'}, | |
'Acinetobacter_baumannii_Pasteur' : {'profile_file' : 'profiles.txt', 'list' : ['cpn60.fa', 'fusA.fa', 'gltA.fa', 'pyrG.fa', 'recA.fa', 'rplB.fa', 'rpoB.fa'], 'separator' : '-'}, | |
'Streptococcus_pneumoniae' : {'profile_file' : 'spneumoniae.txt', 'list' : ['aroe.tfa', 'gdh_.tfa', 'gki_.tfa', 'recP.tfa', 'spi_.tfa', 'xpt_.tfa', 'ddl_.tfa'], 'separator' : '-'}, | |
'Pseudomonas_aeruginosa' : {'profile_file' : 'paeruginosa.txt', 'list' : ['acs.tfa', 'aro.tfa', 'gua.tfa', 'mut.tfa', 'nuo.tfa', 'pps.tfa', 'trp.tfa'], 'separator' : '_'}, | |
'Escherichia_coli' : {'profile_file' : 'publicSTs.txt', 'list' : ['ADK.fas', 'FUMC.fas', 'GYRB.fas', 'ICD.fas', 'MDH.fas', 'PURA.fas', 'RECA.fas'], 'separator' : re.compile('([a-zA-Z]+)(\d+)'), 'colnum' : 2}, | |
} | |
class MLSTResult: | |
def __init__(self, locus, allele, score): | |
self.locus = locus | |
self.allele = allele | |
self.score = score | |
class MLSTProfile: | |
def __init__(self, colnum = 1): | |
self.profile = 'unknown' | |
self.alleles = [] | |
self.colnum = colnum | |
self.line = '' | |
def compute_profile(self, fn): | |
# skip header line | |
itr = open(fn) | |
itr.next() | |
for ln in itr: | |
cols = ln.split() | |
if [str(x) for x in cols[self.colnum:self.colnum + len(self.alleles)]] \ | |
== \ | |
[str(a.allele) for a in self.alleles]: | |
print "match" | |
self.profile = cols[0] | |
self.line = ln | |
def by_identity(a, b): | |
return cmp( | |
float(a.hsps[0].identities) / float(a.length), | |
float(b.hsps[0].identities) / float(b.length) | |
) | |
def process(bug, file_handle): | |
concat = ">seq\n" | |
for rec in SeqIO.parse(file_handle, "fasta"): | |
concat += rec.seq.tostring() | |
concat += 'NNNNNN' | |
result = MLSTProfile(bugs[bug].get('colnum', 1)) | |
for db in bugs[bug]['list']: | |
p = subprocess.Popen( | |
[BLAST_PATH + '/' + 'blastn', '-db', BLAST_DIRECTORY + '/' + bug + '/' + db, '-outfmt', '5'], | |
stdin=subprocess.PIPE, | |
stdout=subprocess.PIPE, | |
) | |
out, err = p.communicate(concat) | |
for rec in NCBIXML.parse(StringIO(out)): | |
rec.alignments.sort(by_identity, reverse=True) | |
hit = rec.alignments[0] | |
hsp = hit.hsps[0] | |
identical = float(hsp.identities) / float(hit.length) | |
tag = hit.title.split(" ")[1] | |
print bugs[bug]['separator'] | |
try: | |
name, allele = tag.split(bugs[bug]['separator']) | |
except TypeError: | |
m = re.match(bugs[bug]['separator'], tag) | |
name = m.group(1) | |
allele = m.group(2) | |
# re.split("[_-]", tag) | |
print tag, name, allele | |
if identical == 1.0: | |
result.alleles.append(MLSTResult(name, allele, str(identical))) | |
else: | |
result.alleles.append(MLSTResult(name, 'no perfect match- best is ' + allele, str(identical))) | |
if not result.alleles: | |
raise ValueError('something went wrong!') | |
result.compute_profile(BLAST_DIRECTORY + '/' + bug + '/' + bugs[bug]['profile_file']) | |
return result |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment