jwillis0720/superimposer_align.py

## superimposer_align.py
#!/usr/bin/env/python2.7
import sys
from Bio.Align.Applications import ClustalwCommandline as cw
import warnings
from multiprocessing import Pool
import argparse
import os
from Bio import AlignIO
from Bio.PDB import PDBExceptions
from Bio.PDB import PDBParser
from Bio.PDB import Superimposer
import glob
from itertools import izip


clustalw_exe = r"/usr/bin/apps/clustalw/Linux2/i686/v1.83/clustalw"
warnings.simplefilter('ignore', PDBExceptions.PDBConstructionWarning)

if sys.version_info < (2, 7):
    raise Exception("You must use python2.7 to run this")


# usage
usage = "%prog [options] -n native.pdb *pdbs"
parser = argparse.ArgumentParser(prog="rmsd_full.py", formatter_class=argparse.RawTextHelpFormatter,
                                 description="\n\nA rosetta score vs rmsd file encompassing a full analysis.\n\n\
                                 -This recapitulates the most accurate representation of score vs rmsd by doing the following:\n\
                                 1. First does an alignment of sequences to find out amino acid correspondance\n\
                                 2. Iterates through atoms attempting to match them up. If they are not identical types, \n\
                                 \tthen they will be rejected in the calculation\n\
                                 3. An RMSD will be calculated all after all the atoms are aligned that were considered from steps 1 and 2.")

parser.add_argument("pdbs", metavar="*pdb", nargs=argparse.REMAINDER, help="The pdb files that you want superimposed")
parser.add_argument(
    "--multi", "-m", dest="multi", help="run using multiple processors", action='store_true', default=False)
parser.add_argument("--native", "-n", dest="native", help="native structure")
parser.add_argument("--out", "-o", dest="table", help="the name of the prefix of the  output file", default="score_vs_rmsd")
parser.add_argument("--debug", help="will output a file called debug_atoms.txt to tell you exatctly what was calculated in the RMSD")

args = parser.parse_args()

if len(args.pdbs) < 1:
    parser.error("specify at least 1 protein to compare to native")

longer_names = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
                'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
                'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
                'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
                'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V', 'TYS': 'Y'}


# Functions
def calculate_all_superpositions(native, decoy, debug, pairwise):
    """calculates c alpha , backbone and all atom rmsds"""
    native_atoms_ca = []
    decoy_atoms_ca = []
    native_atoms_bb = []
    decoy_atoms_bb = []
    native_atoms_all = []
    decoy_atoms_all = []

    # check to see if we are calculating rmsd on just residues
    counter = 1
    what_to_skip_native = []
    what_to_skip_decoy = []
    native_residues = [i for i in native.get_residues()]
    decoy_residues = [i for i in decoy.get_residues()]

    # first figure out what to skip
    for fasta1, fasta2 in zip(pairwise['native'], pairwise['target']):
            if fasta1 == "-":
                    what_to_skip_native.append(counter)
            elif fasta2 == "-":
                    what_to_skip_decoy.append(counter)
            counter += 1

    for counter, residue in enumerate(native.get_residues(), start=1):
        if counter in what_to_skip_decoy:
            native_residues.remove(residue)

    for counter, residue in enumerate(decoy.get_residues(), start=1):
        if counter in what_to_skip_native:
            decoy_residues.remove(residue)

    zipped_file = izip(native_residues, decoy_residues)

    for(native_residue, decoy_residue) in zipped_file:
        res_native, res_decoy = str(native_residue.id[1]), str(decoy_residue.id[1])
        chain_native, chain_decoy = native_residue.get_parent().id, decoy_residue.get_parent().id
        native_list = native_residue.get_list()
        decoy_list = decoy_residue.get_list()
        try:
            native_atoms_ca.append(native_residue['CA'])
            decoy_atoms_ca.append(decoy_residue['CA'])
        except KeyError:
            print "warning: residue", str(native_residue.get_id()[1]), "has no ca atom in either the native or the decoy structure.  either this residue is a hetatm or part of your backbone is missing. the residue will not be included in the rmsd calculations."
        try:
            native_atoms_bb.append(native_residue['CA'])
            native_atoms_bb.append(native_residue['C'])
            native_atoms_bb.append(native_residue['O'])
            native_atoms_bb.append(native_residue['N'])
            decoy_atoms_bb.append(decoy_residue['CA'])
            decoy_atoms_bb.append(decoy_residue['C'])
            decoy_atoms_bb.append(decoy_residue['O'])
            decoy_atoms_bb.append(decoy_residue['N'])
        except KeyError:
            print "warning: residue", str(native_residue.get_id()[1]), "has no backbone atoms"

        zipped_atoms = izip(native_list, decoy_list)
        for (native_atom, decoy_atom) in zipped_atoms:
            if native_atom.id != decoy_atom.id:
                if debug:
                    print "atom {0} from chain {1}, residue {2} and atom {3} from chain {4}, residue {5} don't match up, skipping".format(
                        native_atom.id, chain_native, res_native, decoy_atom.id, chain_decoy, res_decoy)
                continue
            native_atoms_all.append(native_atom)
            decoy_atoms_all.append(decoy_atom)

    superpose_ca = Superimposer()
    superpose_bb = Superimposer()
    superpose_all = Superimposer()

    superpose_ca.set_atoms(native_atoms_ca, decoy_atoms_ca)
    superpose_ca.apply(decoy.get_atoms())
    ca_rms = superpose_ca.rms
    superpose_bb.set_atoms(native_atoms_bb, decoy_atoms_bb)
    superpose_bb.apply(decoy.get_atoms())
    bb_rms = superpose_bb.rms
    superpose_all.set_atoms(native_atoms_all, decoy_atoms_all)
    superpose_all.apply(decoy.get_atoms())
    all_rms = superpose_all.rms

    return {"ca_rmsd": ca_rms, "bb_rmsd": bb_rms, "all_rmsd": all_rms}


def load_pdb(path):
    """return a biopython structure object given a pdb file path"""
    parser = PDBParser(PERMISSIVE=1)
    pdb_file = open(path, 'rU')
    structure = parser.get_structure(path[0:4], pdb_file)
    pdb_file.close()
    return structure


def get_fasta_from_pdb(pdb):
    holder = []
    for i in pdb.get_residues():
        try:
            holder.append((i, longer_names[i.get_resname()]))
        except KeyError:
            if args.debug:
                print "Cant find short name for {0}, putting \"X\" instead".format(i)
                holder.append((i, "X"))
    return holder


def get_pairwise_alignment(native, target, target_name):
    alignment_dict = {}
    fasta1 = get_fasta_from_pdb(native)
    fasta2 = get_fasta_from_pdb(target)
    file = "clustal_tmp_in_{0}.fasta".format(target_name)
    with open(file, 'w') as f:
        f.write(">native\n")
        for letter1 in fasta1:
            f.write(letter1[1])
        f.write("\n>target\n")
        for letter2 in fasta2:
            f.write(letter2[1])
    cline = cw(clustalw_exe, infile=file)
    assert os.path.isfile(clustalw_exe), "Clustal W binary is missing"
    stdout, stderr = cline()
    alignment = AlignIO.read(file.split('.fasta')[0] + '.aln', 'clustal')
    alignment_dict['native'] = str(alignment[0, :].seq)
    alignment_dict['target'] = str(alignment[1, :].seq)
    return alignment_dict


def find_rmsd(model):
    if args.debug:
        print "processing %s" % (model)
    native = args.native
    rms_all = []
    pwa = get_pairwise_alignment(load_pdb(args.native), load_pdb(model), os.path.basename(model))
    # will return three rmsds aligned by everything in decoy
    rms_all = calculate_all_superpositions(
        load_pdb(native), load_pdb(model), [], args.debug, pwa)
    print "Model:{0} All-Atom RMSD {1}, Backbone RMSD {2}, C-Alpha RMSD{3}".format(
        rms_all['all_rmsd'], rms_all['bb_rmsd'], rms_all['ca_rmsd'])

def main(args):
    if args.multi:
        try:
            p = Pool()
            p.map(find_rmsd, args.pdbs)
        except KeyboardInterrupt:
            p.terminate()
            quit()
    else:
        for i in args.pdbs:
            find_rmsd(i)
    # remove alignment files
    temp_files = glob.glob('clustal_tmp*')
    for i in temp_files:
        os.remove(i)

if __name__ == "__main__":
    main(args)
	#!/usr/bin/env/python2.7
	import sys
	from Bio.Align.Applications import ClustalwCommandline as cw
	import warnings
	from multiprocessing import Pool
	import argparse
	import os
	from Bio import AlignIO
	from Bio.PDB import PDBExceptions
	from Bio.PDB import PDBParser
	from Bio.PDB import Superimposer
	import glob
	from itertools import izip


	clustalw_exe = r"/usr/bin/apps/clustalw/Linux2/i686/v1.83/clustalw"
	warnings.simplefilter('ignore', PDBExceptions.PDBConstructionWarning)

	if sys.version_info < (2, 7):
	raise Exception("You must use python2.7 to run this")


	# usage
	usage = "%prog [options] -n native.pdb *pdbs"
	parser = argparse.ArgumentParser(prog="rmsd_full.py", formatter_class=argparse.RawTextHelpFormatter,
	description="\n\nA rosetta score vs rmsd file encompassing a full analysis.\n\n\
	-This recapitulates the most accurate representation of score vs rmsd by doing the following:\n\
	1. First does an alignment of sequences to find out amino acid correspondance\n\
	2. Iterates through atoms attempting to match them up. If they are not identical types, \n\
	\tthen they will be rejected in the calculation\n\
	3. An RMSD will be calculated all after all the atoms are aligned that were considered from steps 1 and 2.")

	parser.add_argument("pdbs", metavar="*pdb", nargs=argparse.REMAINDER, help="The pdb files that you want superimposed")
	parser.add_argument(
	"--multi", "-m", dest="multi", help="run using multiple processors", action='store_true', default=False)
	parser.add_argument("--native", "-n", dest="native", help="native structure")
	parser.add_argument("--out", "-o", dest="table", help="the name of the prefix of the output file", default="score_vs_rmsd")
	parser.add_argument("--debug", help="will output a file called debug_atoms.txt to tell you exatctly what was calculated in the RMSD")

	args = parser.parse_args()

	if len(args.pdbs) < 1:
	parser.error("specify at least 1 protein to compare to native")

	longer_names = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
	'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
	'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
	'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
	'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V', 'TYS': 'Y'}


	# Functions
	def calculate_all_superpositions(native, decoy, debug, pairwise):
	"""calculates c alpha , backbone and all atom rmsds"""
	native_atoms_ca = []
	decoy_atoms_ca = []
	native_atoms_bb = []
	decoy_atoms_bb = []
	native_atoms_all = []
	decoy_atoms_all = []

	# check to see if we are calculating rmsd on just residues
	counter = 1
	what_to_skip_native = []
	what_to_skip_decoy = []
	native_residues = [i for i in native.get_residues()]
	decoy_residues = [i for i in decoy.get_residues()]

	# first figure out what to skip
	for fasta1, fasta2 in zip(pairwise['native'], pairwise['target']):
	if fasta1 == "-":
	what_to_skip_native.append(counter)
	elif fasta2 == "-":
	what_to_skip_decoy.append(counter)
	counter += 1

	for counter, residue in enumerate(native.get_residues(), start=1):
	if counter in what_to_skip_decoy:
	native_residues.remove(residue)

	for counter, residue in enumerate(decoy.get_residues(), start=1):
	if counter in what_to_skip_native:
	decoy_residues.remove(residue)

	zipped_file = izip(native_residues, decoy_residues)

	for(native_residue, decoy_residue) in zipped_file:
	res_native, res_decoy = str(native_residue.id[1]), str(decoy_residue.id[1])
	chain_native, chain_decoy = native_residue.get_parent().id, decoy_residue.get_parent().id
	native_list = native_residue.get_list()
	decoy_list = decoy_residue.get_list()
	try:
	native_atoms_ca.append(native_residue['CA'])
	decoy_atoms_ca.append(decoy_residue['CA'])
	except KeyError:
	print "warning: residue", str(native_residue.get_id()[1]), "has no ca atom in either the native or the decoy structure. either this residue is a hetatm or part of your backbone is missing. the residue will not be included in the rmsd calculations."
	try:
	native_atoms_bb.append(native_residue['CA'])
	native_atoms_bb.append(native_residue['C'])
	native_atoms_bb.append(native_residue['O'])
	native_atoms_bb.append(native_residue['N'])
	decoy_atoms_bb.append(decoy_residue['CA'])
	decoy_atoms_bb.append(decoy_residue['C'])
	decoy_atoms_bb.append(decoy_residue['O'])
	decoy_atoms_bb.append(decoy_residue['N'])
	except KeyError:
	print "warning: residue", str(native_residue.get_id()[1]), "has no backbone atoms"

	zipped_atoms = izip(native_list, decoy_list)
	for (native_atom, decoy_atom) in zipped_atoms:
	if native_atom.id != decoy_atom.id:
	if debug:
	print "atom {0} from chain {1}, residue {2} and atom {3} from chain {4}, residue {5} don't match up, skipping".format(
	native_atom.id, chain_native, res_native, decoy_atom.id, chain_decoy, res_decoy)
	continue
	native_atoms_all.append(native_atom)
	decoy_atoms_all.append(decoy_atom)

	superpose_ca = Superimposer()
	superpose_bb = Superimposer()
	superpose_all = Superimposer()

	superpose_ca.set_atoms(native_atoms_ca, decoy_atoms_ca)
	superpose_ca.apply(decoy.get_atoms())
	ca_rms = superpose_ca.rms
	superpose_bb.set_atoms(native_atoms_bb, decoy_atoms_bb)
	superpose_bb.apply(decoy.get_atoms())
	bb_rms = superpose_bb.rms
	superpose_all.set_atoms(native_atoms_all, decoy_atoms_all)
	superpose_all.apply(decoy.get_atoms())
	all_rms = superpose_all.rms

	return {"ca_rmsd": ca_rms, "bb_rmsd": bb_rms, "all_rmsd": all_rms}


	def load_pdb(path):
	"""return a biopython structure object given a pdb file path"""
	parser = PDBParser(PERMISSIVE=1)
	pdb_file = open(path, 'rU')
	structure = parser.get_structure(path[0:4], pdb_file)
	pdb_file.close()
	return structure


	def get_fasta_from_pdb(pdb):
	holder = []
	for i in pdb.get_residues():
	try:
	holder.append((i, longer_names[i.get_resname()]))
	except KeyError:
	if args.debug:
	print "Cant find short name for {0}, putting \"X\" instead".format(i)
	holder.append((i, "X"))
	return holder


	def get_pairwise_alignment(native, target, target_name):
	alignment_dict = {}
	fasta1 = get_fasta_from_pdb(native)
	fasta2 = get_fasta_from_pdb(target)
	file = "clustal_tmp_in_{0}.fasta".format(target_name)
	with open(file, 'w') as f:
	f.write(">native\n")
	for letter1 in fasta1:
	f.write(letter1[1])
	f.write("\n>target\n")
	for letter2 in fasta2:
	f.write(letter2[1])
	cline = cw(clustalw_exe, infile=file)
	assert os.path.isfile(clustalw_exe), "Clustal W binary is missing"
	stdout, stderr = cline()
	alignment = AlignIO.read(file.split('.fasta')[0] + '.aln', 'clustal')
	alignment_dict['native'] = str(alignment[0, :].seq)
	alignment_dict['target'] = str(alignment[1, :].seq)
	return alignment_dict


	def find_rmsd(model):
	if args.debug:
	print "processing %s" % (model)
	native = args.native
	rms_all = []
	pwa = get_pairwise_alignment(load_pdb(args.native), load_pdb(model), os.path.basename(model))
	# will return three rmsds aligned by everything in decoy
	rms_all = calculate_all_superpositions(
	load_pdb(native), load_pdb(model), [], args.debug, pwa)
	print "Model:{0} All-Atom RMSD {1}, Backbone RMSD {2}, C-Alpha RMSD{3}".format(
	rms_all['all_rmsd'], rms_all['bb_rmsd'], rms_all['ca_rmsd'])

	def main(args):
	if args.multi:
	try:
	p = Pool()
	p.map(find_rmsd, args.pdbs)
	except KeyboardInterrupt:
	p.terminate()
	quit()
	else:
	for i in args.pdbs:
	find_rmsd(i)
	# remove alignment files
	temp_files = glob.glob('clustal_tmp*')
	for i in temp_files:
	os.remove(i)

	if __name__ == "__main__":
	main(args)