Created
February 19, 2014 17:49
-
-
Save jwillis0720/9097426 to your computer and use it in GitHub Desktop.
superimposer_align.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env/python2.7 | |
import sys | |
from Bio.Align.Applications import ClustalwCommandline as cw | |
import warnings | |
from multiprocessing import Pool | |
import argparse | |
import os | |
from Bio import AlignIO | |
from Bio.PDB import PDBExceptions | |
from Bio.PDB import PDBParser | |
from Bio.PDB import Superimposer | |
import glob | |
from itertools import izip | |
clustalw_exe = r"/usr/bin/apps/clustalw/Linux2/i686/v1.83/clustalw" | |
warnings.simplefilter('ignore', PDBExceptions.PDBConstructionWarning) | |
if sys.version_info < (2, 7): | |
raise Exception("You must use python2.7 to run this") | |
# usage | |
usage = "%prog [options] -n native.pdb *pdbs" | |
parser = argparse.ArgumentParser(prog="rmsd_full.py", formatter_class=argparse.RawTextHelpFormatter, | |
description="\n\nA rosetta score vs rmsd file encompassing a full analysis.\n\n\ | |
-This recapitulates the most accurate representation of score vs rmsd by doing the following:\n\ | |
1. First does an alignment of sequences to find out amino acid correspondance\n\ | |
2. Iterates through atoms attempting to match them up. If they are not identical types, \n\ | |
\tthen they will be rejected in the calculation\n\ | |
3. An RMSD will be calculated all after all the atoms are aligned that were considered from steps 1 and 2.") | |
parser.add_argument("pdbs", metavar="*pdb", nargs=argparse.REMAINDER, help="The pdb files that you want superimposed") | |
parser.add_argument( | |
"--multi", "-m", dest="multi", help="run using multiple processors", action='store_true', default=False) | |
parser.add_argument("--native", "-n", dest="native", help="native structure") | |
parser.add_argument("--out", "-o", dest="table", help="the name of the prefix of the output file", default="score_vs_rmsd") | |
parser.add_argument("--debug", help="will output a file called debug_atoms.txt to tell you exatctly what was calculated in the RMSD") | |
args = parser.parse_args() | |
if len(args.pdbs) < 1: | |
parser.error("specify at least 1 protein to compare to native") | |
longer_names = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', | |
'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G', | |
'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K', | |
'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S', | |
'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V', 'TYS': 'Y'} | |
# Functions | |
def calculate_all_superpositions(native, decoy, debug, pairwise): | |
"""calculates c alpha , backbone and all atom rmsds""" | |
native_atoms_ca = [] | |
decoy_atoms_ca = [] | |
native_atoms_bb = [] | |
decoy_atoms_bb = [] | |
native_atoms_all = [] | |
decoy_atoms_all = [] | |
# check to see if we are calculating rmsd on just residues | |
counter = 1 | |
what_to_skip_native = [] | |
what_to_skip_decoy = [] | |
native_residues = [i for i in native.get_residues()] | |
decoy_residues = [i for i in decoy.get_residues()] | |
# first figure out what to skip | |
for fasta1, fasta2 in zip(pairwise['native'], pairwise['target']): | |
if fasta1 == "-": | |
what_to_skip_native.append(counter) | |
elif fasta2 == "-": | |
what_to_skip_decoy.append(counter) | |
counter += 1 | |
for counter, residue in enumerate(native.get_residues(), start=1): | |
if counter in what_to_skip_decoy: | |
native_residues.remove(residue) | |
for counter, residue in enumerate(decoy.get_residues(), start=1): | |
if counter in what_to_skip_native: | |
decoy_residues.remove(residue) | |
zipped_file = izip(native_residues, decoy_residues) | |
for(native_residue, decoy_residue) in zipped_file: | |
res_native, res_decoy = str(native_residue.id[1]), str(decoy_residue.id[1]) | |
chain_native, chain_decoy = native_residue.get_parent().id, decoy_residue.get_parent().id | |
native_list = native_residue.get_list() | |
decoy_list = decoy_residue.get_list() | |
try: | |
native_atoms_ca.append(native_residue['CA']) | |
decoy_atoms_ca.append(decoy_residue['CA']) | |
except KeyError: | |
print "warning: residue", str(native_residue.get_id()[1]), "has no ca atom in either the native or the decoy structure. either this residue is a hetatm or part of your backbone is missing. the residue will not be included in the rmsd calculations." | |
try: | |
native_atoms_bb.append(native_residue['CA']) | |
native_atoms_bb.append(native_residue['C']) | |
native_atoms_bb.append(native_residue['O']) | |
native_atoms_bb.append(native_residue['N']) | |
decoy_atoms_bb.append(decoy_residue['CA']) | |
decoy_atoms_bb.append(decoy_residue['C']) | |
decoy_atoms_bb.append(decoy_residue['O']) | |
decoy_atoms_bb.append(decoy_residue['N']) | |
except KeyError: | |
print "warning: residue", str(native_residue.get_id()[1]), "has no backbone atoms" | |
zipped_atoms = izip(native_list, decoy_list) | |
for (native_atom, decoy_atom) in zipped_atoms: | |
if native_atom.id != decoy_atom.id: | |
if debug: | |
print "atom {0} from chain {1}, residue {2} and atom {3} from chain {4}, residue {5} don't match up, skipping".format( | |
native_atom.id, chain_native, res_native, decoy_atom.id, chain_decoy, res_decoy) | |
continue | |
native_atoms_all.append(native_atom) | |
decoy_atoms_all.append(decoy_atom) | |
superpose_ca = Superimposer() | |
superpose_bb = Superimposer() | |
superpose_all = Superimposer() | |
superpose_ca.set_atoms(native_atoms_ca, decoy_atoms_ca) | |
superpose_ca.apply(decoy.get_atoms()) | |
ca_rms = superpose_ca.rms | |
superpose_bb.set_atoms(native_atoms_bb, decoy_atoms_bb) | |
superpose_bb.apply(decoy.get_atoms()) | |
bb_rms = superpose_bb.rms | |
superpose_all.set_atoms(native_atoms_all, decoy_atoms_all) | |
superpose_all.apply(decoy.get_atoms()) | |
all_rms = superpose_all.rms | |
return {"ca_rmsd": ca_rms, "bb_rmsd": bb_rms, "all_rmsd": all_rms} | |
def load_pdb(path): | |
"""return a biopython structure object given a pdb file path""" | |
parser = PDBParser(PERMISSIVE=1) | |
pdb_file = open(path, 'rU') | |
structure = parser.get_structure(path[0:4], pdb_file) | |
pdb_file.close() | |
return structure | |
def get_fasta_from_pdb(pdb): | |
holder = [] | |
for i in pdb.get_residues(): | |
try: | |
holder.append((i, longer_names[i.get_resname()])) | |
except KeyError: | |
if args.debug: | |
print "Cant find short name for {0}, putting \"X\" instead".format(i) | |
holder.append((i, "X")) | |
return holder | |
def get_pairwise_alignment(native, target, target_name): | |
alignment_dict = {} | |
fasta1 = get_fasta_from_pdb(native) | |
fasta2 = get_fasta_from_pdb(target) | |
file = "clustal_tmp_in_{0}.fasta".format(target_name) | |
with open(file, 'w') as f: | |
f.write(">native\n") | |
for letter1 in fasta1: | |
f.write(letter1[1]) | |
f.write("\n>target\n") | |
for letter2 in fasta2: | |
f.write(letter2[1]) | |
cline = cw(clustalw_exe, infile=file) | |
assert os.path.isfile(clustalw_exe), "Clustal W binary is missing" | |
stdout, stderr = cline() | |
alignment = AlignIO.read(file.split('.fasta')[0] + '.aln', 'clustal') | |
alignment_dict['native'] = str(alignment[0, :].seq) | |
alignment_dict['target'] = str(alignment[1, :].seq) | |
return alignment_dict | |
def find_rmsd(model): | |
if args.debug: | |
print "processing %s" % (model) | |
native = args.native | |
rms_all = [] | |
pwa = get_pairwise_alignment(load_pdb(args.native), load_pdb(model), os.path.basename(model)) | |
# will return three rmsds aligned by everything in decoy | |
rms_all = calculate_all_superpositions( | |
load_pdb(native), load_pdb(model), [], args.debug, pwa) | |
print "Model:{0} All-Atom RMSD {1}, Backbone RMSD {2}, C-Alpha RMSD{3}".format( | |
rms_all['all_rmsd'], rms_all['bb_rmsd'], rms_all['ca_rmsd']) | |
def main(args): | |
if args.multi: | |
try: | |
p = Pool() | |
p.map(find_rmsd, args.pdbs) | |
except KeyboardInterrupt: | |
p.terminate() | |
quit() | |
else: | |
for i in args.pdbs: | |
find_rmsd(i) | |
# remove alignment files | |
temp_files = glob.glob('clustal_tmp*') | |
for i in temp_files: | |
os.remove(i) | |
if __name__ == "__main__": | |
main(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment