Skip to content

Instantly share code, notes, and snippets.

@jwillis0720
Created February 19, 2014 17:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwillis0720/9097426 to your computer and use it in GitHub Desktop.
Save jwillis0720/9097426 to your computer and use it in GitHub Desktop.
superimposer_align.py
#!/usr/bin/env/python2.7
import sys
from Bio.Align.Applications import ClustalwCommandline as cw
import warnings
from multiprocessing import Pool
import argparse
import os
from Bio import AlignIO
from Bio.PDB import PDBExceptions
from Bio.PDB import PDBParser
from Bio.PDB import Superimposer
import glob
from itertools import izip
clustalw_exe = r"/usr/bin/apps/clustalw/Linux2/i686/v1.83/clustalw"
warnings.simplefilter('ignore', PDBExceptions.PDBConstructionWarning)
if sys.version_info < (2, 7):
raise Exception("You must use python2.7 to run this")
# usage
usage = "%prog [options] -n native.pdb *pdbs"
parser = argparse.ArgumentParser(prog="rmsd_full.py", formatter_class=argparse.RawTextHelpFormatter,
description="\n\nA rosetta score vs rmsd file encompassing a full analysis.\n\n\
-This recapitulates the most accurate representation of score vs rmsd by doing the following:\n\
1. First does an alignment of sequences to find out amino acid correspondance\n\
2. Iterates through atoms attempting to match them up. If they are not identical types, \n\
\tthen they will be rejected in the calculation\n\
3. An RMSD will be calculated all after all the atoms are aligned that were considered from steps 1 and 2.")
parser.add_argument("pdbs", metavar="*pdb", nargs=argparse.REMAINDER, help="The pdb files that you want superimposed")
parser.add_argument(
"--multi", "-m", dest="multi", help="run using multiple processors", action='store_true', default=False)
parser.add_argument("--native", "-n", dest="native", help="native structure")
parser.add_argument("--out", "-o", dest="table", help="the name of the prefix of the output file", default="score_vs_rmsd")
parser.add_argument("--debug", help="will output a file called debug_atoms.txt to tell you exatctly what was calculated in the RMSD")
args = parser.parse_args()
if len(args.pdbs) < 1:
parser.error("specify at least 1 protein to compare to native")
longer_names = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V', 'TYS': 'Y'}
# Functions
def calculate_all_superpositions(native, decoy, debug, pairwise):
"""calculates c alpha , backbone and all atom rmsds"""
native_atoms_ca = []
decoy_atoms_ca = []
native_atoms_bb = []
decoy_atoms_bb = []
native_atoms_all = []
decoy_atoms_all = []
# check to see if we are calculating rmsd on just residues
counter = 1
what_to_skip_native = []
what_to_skip_decoy = []
native_residues = [i for i in native.get_residues()]
decoy_residues = [i for i in decoy.get_residues()]
# first figure out what to skip
for fasta1, fasta2 in zip(pairwise['native'], pairwise['target']):
if fasta1 == "-":
what_to_skip_native.append(counter)
elif fasta2 == "-":
what_to_skip_decoy.append(counter)
counter += 1
for counter, residue in enumerate(native.get_residues(), start=1):
if counter in what_to_skip_decoy:
native_residues.remove(residue)
for counter, residue in enumerate(decoy.get_residues(), start=1):
if counter in what_to_skip_native:
decoy_residues.remove(residue)
zipped_file = izip(native_residues, decoy_residues)
for(native_residue, decoy_residue) in zipped_file:
res_native, res_decoy = str(native_residue.id[1]), str(decoy_residue.id[1])
chain_native, chain_decoy = native_residue.get_parent().id, decoy_residue.get_parent().id
native_list = native_residue.get_list()
decoy_list = decoy_residue.get_list()
try:
native_atoms_ca.append(native_residue['CA'])
decoy_atoms_ca.append(decoy_residue['CA'])
except KeyError:
print "warning: residue", str(native_residue.get_id()[1]), "has no ca atom in either the native or the decoy structure. either this residue is a hetatm or part of your backbone is missing. the residue will not be included in the rmsd calculations."
try:
native_atoms_bb.append(native_residue['CA'])
native_atoms_bb.append(native_residue['C'])
native_atoms_bb.append(native_residue['O'])
native_atoms_bb.append(native_residue['N'])
decoy_atoms_bb.append(decoy_residue['CA'])
decoy_atoms_bb.append(decoy_residue['C'])
decoy_atoms_bb.append(decoy_residue['O'])
decoy_atoms_bb.append(decoy_residue['N'])
except KeyError:
print "warning: residue", str(native_residue.get_id()[1]), "has no backbone atoms"
zipped_atoms = izip(native_list, decoy_list)
for (native_atom, decoy_atom) in zipped_atoms:
if native_atom.id != decoy_atom.id:
if debug:
print "atom {0} from chain {1}, residue {2} and atom {3} from chain {4}, residue {5} don't match up, skipping".format(
native_atom.id, chain_native, res_native, decoy_atom.id, chain_decoy, res_decoy)
continue
native_atoms_all.append(native_atom)
decoy_atoms_all.append(decoy_atom)
superpose_ca = Superimposer()
superpose_bb = Superimposer()
superpose_all = Superimposer()
superpose_ca.set_atoms(native_atoms_ca, decoy_atoms_ca)
superpose_ca.apply(decoy.get_atoms())
ca_rms = superpose_ca.rms
superpose_bb.set_atoms(native_atoms_bb, decoy_atoms_bb)
superpose_bb.apply(decoy.get_atoms())
bb_rms = superpose_bb.rms
superpose_all.set_atoms(native_atoms_all, decoy_atoms_all)
superpose_all.apply(decoy.get_atoms())
all_rms = superpose_all.rms
return {"ca_rmsd": ca_rms, "bb_rmsd": bb_rms, "all_rmsd": all_rms}
def load_pdb(path):
"""return a biopython structure object given a pdb file path"""
parser = PDBParser(PERMISSIVE=1)
pdb_file = open(path, 'rU')
structure = parser.get_structure(path[0:4], pdb_file)
pdb_file.close()
return structure
def get_fasta_from_pdb(pdb):
holder = []
for i in pdb.get_residues():
try:
holder.append((i, longer_names[i.get_resname()]))
except KeyError:
if args.debug:
print "Cant find short name for {0}, putting \"X\" instead".format(i)
holder.append((i, "X"))
return holder
def get_pairwise_alignment(native, target, target_name):
alignment_dict = {}
fasta1 = get_fasta_from_pdb(native)
fasta2 = get_fasta_from_pdb(target)
file = "clustal_tmp_in_{0}.fasta".format(target_name)
with open(file, 'w') as f:
f.write(">native\n")
for letter1 in fasta1:
f.write(letter1[1])
f.write("\n>target\n")
for letter2 in fasta2:
f.write(letter2[1])
cline = cw(clustalw_exe, infile=file)
assert os.path.isfile(clustalw_exe), "Clustal W binary is missing"
stdout, stderr = cline()
alignment = AlignIO.read(file.split('.fasta')[0] + '.aln', 'clustal')
alignment_dict['native'] = str(alignment[0, :].seq)
alignment_dict['target'] = str(alignment[1, :].seq)
return alignment_dict
def find_rmsd(model):
if args.debug:
print "processing %s" % (model)
native = args.native
rms_all = []
pwa = get_pairwise_alignment(load_pdb(args.native), load_pdb(model), os.path.basename(model))
# will return three rmsds aligned by everything in decoy
rms_all = calculate_all_superpositions(
load_pdb(native), load_pdb(model), [], args.debug, pwa)
print "Model:{0} All-Atom RMSD {1}, Backbone RMSD {2}, C-Alpha RMSD{3}".format(
rms_all['all_rmsd'], rms_all['bb_rmsd'], rms_all['ca_rmsd'])
def main(args):
if args.multi:
try:
p = Pool()
p.map(find_rmsd, args.pdbs)
except KeyboardInterrupt:
p.terminate()
quit()
else:
for i in args.pdbs:
find_rmsd(i)
# remove alignment files
temp_files = glob.glob('clustal_tmp*')
for i in temp_files:
os.remove(i)
if __name__ == "__main__":
main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment