fabian-paul/mutate.py

## mutate.py
import numpy as np
import mdtraj
import os
import os.path
import argparse

def prepare_maps(mut_pdb='mutant.pdb', wt_pdb=None):
    if wt_pdb is None:
        wt_pdb = os.path.expanduser('~/system/crystal.pdb')

    wt = mdtraj.load(wt_pdb)
    mut = mdtraj.load(mut_pdb)

    wt = wt.atom_slice(wt.top.select('protein'))
    mut = mut.atom_slice(mut.top.select('protein'))

    dest = []
    source = []
    # go throught list of unmodified residues and map atoms
    for rw, rm in zip(wt.top.residues, mut.top.residues):
        if rw.name == rm.name and rw.index == rm.index:
            for aw,am in zip(rw.atoms, rm.atoms):
                assert aw.name == am.name
                i = aw.index
                source.append(i)
                j = am.index
                dest.append(j)

    # add mutated backbone to the map
    away = None
    for rw, rm in zip(wt.top.residues, mut.top.residues):
        if rw.name != rm.name and rw.index == rm.index:
            if away is None:
                away = rm.index
            else:
                assert rm.index == away # allow only one mutation at the same time
            for aw in rw.atoms:
                i = aw.index
                if aw.name in ['C','O','N','H','CA','HA','H1','H2','H3','OXT']:
                    j = next(am.index for am in rm.atoms if am.name == aw.name)
                    source.append(i)
                    dest.append(j)

    assert away is not None # should have least one mutation

    source = np.array(source)
    dest = np.array(dest)

    # find the indices of ALA-HB1 to ALA-HB3 in mutant topology
    r_mut_ala = next(rm for rm in mut.top.residues if rm.index == away)
    h_and_cb_indices = []
    for n in ['HB1', 'HB2', 'HB3', 'CB']:
        h_and_cb_indices.append(next(a.index for a in r_mut_ala.atoms if a.name == n))
    h_and_cb_indices = np.array(h_and_cb_indices)

    # find the indices of mutated-away backbone in wt topology
    r_wt_away = next(rw for rw in wt.top.residues if rw.index == away)
    bb_frame = []
    for n in ['C', 'CA', 'N', 'O', 'CB']:
        bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == n))
    # CG is kind of tough...
    for n in ['CG', 'CG2', 'OG']:
        if any(a.name == n for a in r_wt_away.atoms):
            bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == n))
    # find HB2 and HB3 positions
    if r_wt_away.name=='THR': # THR is an exception
        bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'OG1'))
        bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB'))
    if r_wt_away.name=='ILE' or r_wt_away.name=='VAL':
        bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'CG1'))
        bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB'))
    else:
        bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB2'))
        bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB3'))
    bb_frame = np.array(bb_frame)

    # if we are mutating a proline, we have to place the bb hydrogen too ...
    if wt.top.residue(away).name == 'PRO':
        proline = True
        PRO_H_index = next(am.index for am in mut.top.residue(away).atoms if am.name=='H')
        PRO_N_CD = np.array([
            next(aw.index for aw in wt.top.residue(away).atoms if aw.name=='N'),
            next(aw.index for aw in wt.top.residue(away).atoms if aw.name=='CD')
            ])
    else:
        proline = False
        PRO_H_index = None
        PRO_N_CD = None

    return source, dest, h_and_cb_indices, bb_frame, proline, PRO_H_index, PRO_N_CD, wt, mut

# positions of hydrogens in the coordiante system attached to the CA of alanine
# * z-axis is oriented along the CA-CB bond
# * x-axis is oriented along the direction from CA to GG orthogonalized w.r.t the z-axis (with Gautam method)
# * x-axis is oriented along along the bisectrix of the angle (N-CA-C) orthogonalized w.r.t the z-axis (with my method)
t = 1.91113553093 # the diamond angle 2*arctan(sqrt(2)) +/- Amber stupidity
c = 0.1526 # bond length CA-CB (Amber)
b = 0.109 # bond length CB-H (Amber)
sqrt_3 = np.sqrt(3.0)
H1 = np.array([b*np.sin(t),                          0.0, -b*np.cos(t)])
H2 = np.array([-0.5*b*np.sin(t),  0.5*sqrt_3*b*np.sin(t), -b*np.cos(t)])
H3 = np.array([-0.5*b*np.sin(t), -0.5*sqrt_3*b*np.sin(t), -b*np.cos(t)])
ceZ = np.array([0.0, 0.0, c])

def normalized(v):
    return v/np.linalg.norm(v)

def orthogonalized(v, w):
    return v - v.dot(w)/w.dot(w)* w

def orthonormalized(v, w):
    return normalized(orthogonalized(v, w))

def compute(frame, massova=False, constant_angles=False, fixCB=False):
    C, CA, N, O, CB, CG, HB2, HB3 = frame
    O = np.zeros((3,3))
    vZ = normalized(CB-CA)

    if constant_angles:
        assert not fixCB, 'not implemented'
        H1p = b*normalized(CG-CB) + CB
        H2p = b*normalized(HB2-CB) + CB
        H3p = b*normalized(HB3-CB) + CB
        CBp = CB
        return np.vstack((H1p, H2p, H3p, CBp))

    if massova:
        vX = orthonormalized(CG-CB, vZ)
    else:
        cc = orthonormalized(C-CA, vZ)
        n = orthonormalized(N-CA, vZ)
        vX = normalized(cc+n)

    vY = np.cross(vZ, vX)
    assert np.cross(vX, vY).dot(vZ) > 0
    # O = map from the coordiante system attached to ALA-CA to the laboratory frame
    O[:,0] = vX # remember: columns of a matrix are the images of the canonical unit vectors
    O[:,1] = vY
    O[:,2] = vZ

    if fixCB: # everything relative to CA
        H1p = O.dot(H1 + ceZ) + CA
        H2p = O.dot(H2 + ceZ) + CA
        H3p = O.dot(H3 + ceZ) + CA
        CBp = O.dot(ceZ) + CA
    else: # everything relative to CB
        H1p = O.dot(H1) + CB
        H2p = O.dot(H2) + CB
        H3p = O.dot(H3) + CB
        CBp = CB

    return np.vstack((H1p, H2p, H3p, CBp))

def place_PRO_hydrogen(frame):
    k = 0.101 # bond length N-H in Amber
    N,CD = frame
    v = normalized(CD-N)
    return N + k*v

if __name__=='__main__':
    parser = argparse.ArgumentParser(description='mutate trajectory')

    parser.add_argument('-i', '--trajectory', required=True, help='(input) trajectory', metavar='file')
    parser.add_argument('-m', '--mutant_topology', required=True, help='(input) topology file for mutant', metavar='file')
    parser.add_argument('-w', '--wild_type_topology', required=True, help='(input) topology file for wild type', metavar='file')
    parser.add_argument('-o', '--outdir', default='.', help='directory where mutant trajectories are written to, default=.', metavar='path')
    parser.add_argument('-g', '--massova', default=False, help='orient ALA methyl group like CG atom, default=false', action='store_true')
    parser.add_argument('-b', '--fixcb', default=False, help='reset C-alpha-C-beta bond length to rest length, default=false', action='store_true')
    parser.add_argument('-c', '--cangles', default=False, help='keep all angles and dihedrals, default=false', action='store_true')
    parser.add_argument('-s', '--stride', default=1, help='apply a stride of n steps to the mutant trajectory default=1', metavar='steps')

    # fixcb:   place all hydrogens and CB to rest positions
    # default: place all hydrogens to rest positions
    # massova: like default but keep the rotation of the dihedral, reset angles and bond lengths
    # cangles: keep all angles and dihedrals, only reset hydrogen bond lengths

    args = parser.parse_args()
    stride = int(args.stride)

    source, dest, h_and_cb_indices, bb_frame, proline, PRO_H_index, PRO_N_CD, wt, mut = prepare_maps(mut_pdb=args.mutant_topology, wt_pdb=args.wild_type_topology)

    # load trajectory
    wt_traj = mdtraj.load(args.trajectory, top=wt.top).atom_slice(wt.top.select('protein'))

    n_frames = len(wt_traj)
    n_frames_out = ((n_frames - 1) // stride) + 1
    mut_traj = mdtraj.Trajectory(np.zeros((n_frames_out, mut.top.n_atoms, 3))*np.nan, mut.top, time=wt_traj.time[::stride],
                             unitcell_lengths=wt_traj.unitcell_lengths[::stride], unitcell_angles=wt_traj.unitcell_angles[::stride])

    if proline:
        assert len(source)==len(dest)
        assert len(source)+len(h_and_cb_indices)+1==mut.n_atoms
    else:
        assert len(source)==len(dest)==mut.n_atoms-len(h_and_cb_indices)

    for t in range(n_frames_out):
        mut_traj.xyz[t, dest] =  wt_traj.xyz[t*stride, source]
        mut_traj.xyz[t, h_and_cb_indices] = compute(wt_traj.xyz[t*stride, bb_frame], constant_angles=args.cangles, massova=args.massova, fixCB=args.fixcb)
        if proline:
            mut_traj.xyz[t, PRO_H_index] = place_PRO_hydrogen(wt_traj.xyz[t*stride, PRO_N_CD])

    assert not np.any(np.isnan(mut_traj.xyz))

    #mut_traj.save_xtc(args.outdir + os.sep + os.path.basename(args.trajectory))
    #mut_traj.save_netcdf(args.outdir + os.sep + os.path.splitext(os.path.basename(args.trajectory))[0] + '.nc')
    mut_traj.save_dcd(args.outdir + os.sep + os.path.splitext(os.path.basename(args.trajectory))[0] + '.dcd')
	import numpy as np
	import mdtraj
	import os
	import os.path
	import argparse

	def prepare_maps(mut_pdb='mutant.pdb', wt_pdb=None):
	if wt_pdb is None:
	wt_pdb = os.path.expanduser('~/system/crystal.pdb')

	wt = mdtraj.load(wt_pdb)
	mut = mdtraj.load(mut_pdb)

	wt = wt.atom_slice(wt.top.select('protein'))
	mut = mut.atom_slice(mut.top.select('protein'))

	dest = []
	source = []
	# go throught list of unmodified residues and map atoms
	for rw, rm in zip(wt.top.residues, mut.top.residues):
	if rw.name == rm.name and rw.index == rm.index:
	for aw,am in zip(rw.atoms, rm.atoms):
	assert aw.name == am.name
	i = aw.index
	source.append(i)
	j = am.index
	dest.append(j)

	# add mutated backbone to the map
	away = None
	for rw, rm in zip(wt.top.residues, mut.top.residues):
	if rw.name != rm.name and rw.index == rm.index:
	if away is None:
	away = rm.index
	else:
	assert rm.index == away # allow only one mutation at the same time
	for aw in rw.atoms:
	i = aw.index
	if aw.name in ['C','O','N','H','CA','HA','H1','H2','H3','OXT']:
	j = next(am.index for am in rm.atoms if am.name == aw.name)
	source.append(i)
	dest.append(j)

	assert away is not None # should have least one mutation

	source = np.array(source)
	dest = np.array(dest)

	# find the indices of ALA-HB1 to ALA-HB3 in mutant topology
	r_mut_ala = next(rm for rm in mut.top.residues if rm.index == away)
	h_and_cb_indices = []
	for n in ['HB1', 'HB2', 'HB3', 'CB']:
	h_and_cb_indices.append(next(a.index for a in r_mut_ala.atoms if a.name == n))
	h_and_cb_indices = np.array(h_and_cb_indices)

	# find the indices of mutated-away backbone in wt topology
	r_wt_away = next(rw for rw in wt.top.residues if rw.index == away)
	bb_frame = []
	for n in ['C', 'CA', 'N', 'O', 'CB']:
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == n))
	# CG is kind of tough...
	for n in ['CG', 'CG2', 'OG']:
	if any(a.name == n for a in r_wt_away.atoms):
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == n))
	# find HB2 and HB3 positions
	if r_wt_away.name=='THR': # THR is an exception
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'OG1'))
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB'))
	if r_wt_away.name=='ILE' or r_wt_away.name=='VAL':
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'CG1'))
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB'))
	else:
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB2'))
	bb_frame.append(next(a.index for a in r_wt_away.atoms if a.name == 'HB3'))
	bb_frame = np.array(bb_frame)

	# if we are mutating a proline, we have to place the bb hydrogen too ...
	if wt.top.residue(away).name == 'PRO':
	proline = True
	PRO_H_index = next(am.index for am in mut.top.residue(away).atoms if am.name=='H')
	PRO_N_CD = np.array([
	next(aw.index for aw in wt.top.residue(away).atoms if aw.name=='N'),
	next(aw.index for aw in wt.top.residue(away).atoms if aw.name=='CD')
	])
	else:
	proline = False
	PRO_H_index = None
	PRO_N_CD = None

	return source, dest, h_and_cb_indices, bb_frame, proline, PRO_H_index, PRO_N_CD, wt, mut

	# positions of hydrogens in the coordiante system attached to the CA of alanine
	# * z-axis is oriented along the CA-CB bond
	# * x-axis is oriented along the direction from CA to GG orthogonalized w.r.t the z-axis (with Gautam method)
	# * x-axis is oriented along along the bisectrix of the angle (N-CA-C) orthogonalized w.r.t the z-axis (with my method)
	t = 1.91113553093 # the diamond angle 2*arctan(sqrt(2)) +/- Amber stupidity
	c = 0.1526 # bond length CA-CB (Amber)
	b = 0.109 # bond length CB-H (Amber)
	sqrt_3 = np.sqrt(3.0)
	H1 = np.array([bnp.sin(t), 0.0, -bnp.cos(t)])
	H2 = np.array([-0.5bnp.sin(t), 0.5sqrt_3bnp.sin(t), -bnp.cos(t)])
	H3 = np.array([-0.5bnp.sin(t), -0.5sqrt_3bnp.sin(t), -bnp.cos(t)])
	ceZ = np.array([0.0, 0.0, c])

	def normalized(v):
	return v/np.linalg.norm(v)

	def orthogonalized(v, w):
	return v - v.dot(w)/w.dot(w)* w

	def orthonormalized(v, w):
	return normalized(orthogonalized(v, w))

	def compute(frame, massova=False, constant_angles=False, fixCB=False):
	C, CA, N, O, CB, CG, HB2, HB3 = frame
	O = np.zeros((3,3))
	vZ = normalized(CB-CA)

	if constant_angles:
	assert not fixCB, 'not implemented'
	H1p = b*normalized(CG-CB) + CB
	H2p = b*normalized(HB2-CB) + CB
	H3p = b*normalized(HB3-CB) + CB
	CBp = CB
	return np.vstack((H1p, H2p, H3p, CBp))

	if massova:
	vX = orthonormalized(CG-CB, vZ)
	else:
	cc = orthonormalized(C-CA, vZ)
	n = orthonormalized(N-CA, vZ)
	vX = normalized(cc+n)

	vY = np.cross(vZ, vX)
	assert np.cross(vX, vY).dot(vZ) > 0
	# O = map from the coordiante system attached to ALA-CA to the laboratory frame
	O[:,0] = vX # remember: columns of a matrix are the images of the canonical unit vectors
	O[:,1] = vY
	O[:,2] = vZ

	if fixCB: # everything relative to CA
	H1p = O.dot(H1 + ceZ) + CA
	H2p = O.dot(H2 + ceZ) + CA
	H3p = O.dot(H3 + ceZ) + CA
	CBp = O.dot(ceZ) + CA
	else: # everything relative to CB
	H1p = O.dot(H1) + CB
	H2p = O.dot(H2) + CB
	H3p = O.dot(H3) + CB
	CBp = CB

	return np.vstack((H1p, H2p, H3p, CBp))

	def place_PRO_hydrogen(frame):
	k = 0.101 # bond length N-H in Amber
	N,CD = frame
	v = normalized(CD-N)
	return N + k*v

	if __name__=='__main__':
	parser = argparse.ArgumentParser(description='mutate trajectory')

	parser.add_argument('-i', '--trajectory', required=True, help='(input) trajectory', metavar='file')
	parser.add_argument('-m', '--mutant_topology', required=True, help='(input) topology file for mutant', metavar='file')
	parser.add_argument('-w', '--wild_type_topology', required=True, help='(input) topology file for wild type', metavar='file')
	parser.add_argument('-o', '--outdir', default='.', help='directory where mutant trajectories are written to, default=.', metavar='path')
	parser.add_argument('-g', '--massova', default=False, help='orient ALA methyl group like CG atom, default=false', action='store_true')
	parser.add_argument('-b', '--fixcb', default=False, help='reset C-alpha-C-beta bond length to rest length, default=false', action='store_true')
	parser.add_argument('-c', '--cangles', default=False, help='keep all angles and dihedrals, default=false', action='store_true')
	parser.add_argument('-s', '--stride', default=1, help='apply a stride of n steps to the mutant trajectory default=1', metavar='steps')

	# fixcb: place all hydrogens and CB to rest positions
	# default: place all hydrogens to rest positions
	# massova: like default but keep the rotation of the dihedral, reset angles and bond lengths
	# cangles: keep all angles and dihedrals, only reset hydrogen bond lengths

	args = parser.parse_args()
	stride = int(args.stride)

	source, dest, h_and_cb_indices, bb_frame, proline, PRO_H_index, PRO_N_CD, wt, mut = prepare_maps(mut_pdb=args.mutant_topology, wt_pdb=args.wild_type_topology)

	# load trajectory
	wt_traj = mdtraj.load(args.trajectory, top=wt.top).atom_slice(wt.top.select('protein'))

	n_frames = len(wt_traj)
	n_frames_out = ((n_frames - 1) // stride) + 1
	mut_traj = mdtraj.Trajectory(np.zeros((n_frames_out, mut.top.n_atoms, 3))*np.nan, mut.top, time=wt_traj.time[::stride],
	unitcell_lengths=wt_traj.unitcell_lengths[::stride], unitcell_angles=wt_traj.unitcell_angles[::stride])

	if proline:
	assert len(source)==len(dest)
	assert len(source)+len(h_and_cb_indices)+1==mut.n_atoms
	else:
	assert len(source)==len(dest)==mut.n_atoms-len(h_and_cb_indices)

	for t in range(n_frames_out):
	mut_traj.xyz[t, dest] = wt_traj.xyz[t*stride, source]
	mut_traj.xyz[t, h_and_cb_indices] = compute(wt_traj.xyz[t*stride, bb_frame], constant_angles=args.cangles, massova=args.massova, fixCB=args.fixcb)
	if proline:
	mut_traj.xyz[t, PRO_H_index] = place_PRO_hydrogen(wt_traj.xyz[t*stride, PRO_N_CD])

	assert not np.any(np.isnan(mut_traj.xyz))

	#mut_traj.save_xtc(args.outdir + os.sep + os.path.basename(args.trajectory))
	#mut_traj.save_netcdf(args.outdir + os.sep + os.path.splitext(os.path.basename(args.trajectory))[0] + '.nc')
	mut_traj.save_dcd(args.outdir + os.sep + os.path.splitext(os.path.basename(args.trajectory))[0] + '.dcd')