yamule/pdp.py

## pdp.py
#!/usr/bin/env python
# coding: utf-8

# In[1]:
"""
   Copyright 2022 yamule

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
"""

from Bio.PDB import *;
import numpy as np;
from Bio.PDB import PDBIO ;
from Bio.PDB.PDBIO import Select;
import copy;
pdb_parser = MMCIFParser()
## pdb_parser = PDBParser()

out_prefix = "xtestout";
structure = pdb_parser.get_structure('X', '7t8o.cif');
target_chain = set(["A","B"]);
class CA_Residue:
    def __init__(self,idx,residue,co):
        self.index = idx;
        self.residue = residue;
        self.coordinates = co;


# In[2]:


def perform_pdp(cas,candidate_range):
    #get_contacting_residues
    contact_with = [];
    contact_with_rev = [];
    for ii in range(len(cas)):
        pcon = [];
        for jj in range(ii,len(cas)): # biojava で 5 残基以上離れているのしか考えていないようだったので
            if cas[jj].index - cas[ii].index >= 5:
                diff = (cas[ii].coordinates-cas[jj].coordinates);
                dsum = (diff*diff).sum();
                if dsum <= 0.0:
                    dist = 0;
                else:
                    dist = np.sqrt(dsum);
                if dist < 8.0:
                    pcon.append(jj);
        contact_with.append(pcon);
        contact_with_rev.append([]);
    for gg in range(len(contact_with)):
        for dd in list(contact_with[gg]):
            contact_with_rev[dd].append(gg);
    cutpos_value = [];
    vsum = 0;
    xcou = 0;
    vave = None;
    endshift_0 = 12 if len(candidate_range) == 1 else 9;
    endshift = 9;# ignore this many boundary of AAs from terminals
    domains = [];
    dryrun = copy.deepcopy(candidate_range);
    while len(candidate_range) > 0:
        minvalue = 1000000000;
        minpos = None;
        zcan = candidate_range.pop();
        st = 0;
        en = len(zcan);
        tail_region = set(zcan);
        st_ = st+endshift;
        en_ = en-endshift;
        if zcan[st] == 0:
            st_ = st+endshift_0;
        if en_-1 - st_ < 1:
            domains.append(zcan);
            continue;
        if zcan[en-1] == len(cas)-1:
            en_ = en-endshift_0;

        #check_single_cut
        for ii in range(st,st_):
            tail_region.remove(zcan[ii]);
        for ii in range(st_,en_-1):
            tail_region.remove(zcan[ii]);
            contacting = set();
            for jj in range(st,ii+1):#ii の後ろで切る
                for cc in list(contact_with[zcan[jj]]):
                    if cc in tail_region:
                        contacting.add(cc);

            num_contacts = len(contacting);
            #print(num_contacts)
            bbase = ((len(zcan)-len(tail_region))**0.43)*((len(tail_region))**0.43);
            bval = num_contacts/bbase;
            #print(st,ii,bval)
            if minpos is None or minvalue > bval:
                minpos = [zcan[0:(ii+1)],zcan[(ii+1):en]];
                minvalue = bval;
            vsum += bval;
            xcou += 1;

        if True:
            #check_double_cut
            head = [];
            tail2 = list(zcan);
            for ii in range(st,st_):
                head.append(zcan[ii]);
                chk = tail2.pop(0);
                assert chk == zcan[ii];

            for ii in range(st_,en_):
                head.append(zcan[ii]);
                chk = tail2.pop(0);
                assert chk == zcan[ii];
                tail = list(tail2);
                middle = set();
                for jj in range(ii+1,en-1):
                    contacting = set();
                    middle.add(zcan[jj]);
                    chk = tail.pop(0);
                    assert chk == zcan[jj];
                    #if (not (tail[0] in contact_with[head[-1]])) or abs(cas[tail[0]][0]-cas[head[-1]][0]) < 35 :
                    if (not (tail[0] in contact_with[head[-1]])) or len(middle) < 34 :
                        continue;
                    for hh in list(head):
                        for cc in list(contact_with[hh]):
                            if cc in middle:
                                contacting.add(cc);
                    for tt in list(tail):
                        for cc in list(contact_with_rev[tt]):
                            if cc in middle:
                                contacting.add(cc);

                    num_contacts = len(contacting);
                    bbase = ((len(head)+len(tail))**0.43)*((len(middle))**0.43);
                    bval = num_contacts/bbase;
                    chh =  list(sorted(list(middle)));
                    if minpos is None or minvalue > bval:
                        minpos = [[*head,*tail],list(sorted(list(middle)))];
                        minvalue = bval;
                    vsum += bval;
                    xcou += 1;

        if xcou == 0:
            continue;
        if dryrun is None:
            if minvalue < vave/2:
                candidate_range.extend(minpos);
            else:
                domains.append(zcan);
        else:
            if len(candidate_range) == 0:
                vave = vsum/xcou;
                candidate_range = dryrun;
                dryrun = None;

    print(str(len(domains))+" domains were found.");
    # merge domains which have plenty of contacts
    while True:
        candidate_pair = None;
        max_ncc = 2.0;
        num_domains = len(domains);
        domain_set = [];
        for dd in list(domains):
            domain_set.append(set(dd));
        for ii in range(0,num_domains-1):
            for jj in range(ii+1,num_domains):
                contacting = set();
                for dd in list(domains[ii]):
                    for cc in list(contact_with[dd]):
                        if cc in domain_set[jj]:
                            contacting.add(cc);
                    for cc in list(contact_with_rev[dd]):
                        if cc in domain_set[jj]:
                            contacting.add(cc);
                ncc = len(contacting)/(((len(domains[ii]))**0.43)*((len(domains[jj]))**0.43));
                if ncc > max_ncc:
                    max_ncc = ncc;
                    candidate_pair = [ii,jj];
        if candidate_pair is None:
            break;
        else:
            print(candidate_pair);
            domains_new = [];
            domains_new.append([*domains[candidate_pair[0]],*domains[candidate_pair[1]]]);
            for dd in range(len(domains)):
                if dd != candidate_pair[0] and dd != candidate_pair[1]:
                    domains_new.append(domains[dd]);
            assert len(domains)-1 == len(domains_new);
            domains = domains_new;
    print("Merged into "+str(len(domains))+" domains.");
    # filter out small segments
    domains_final = [];
    for (ii,dd) in enumerate(list(domains)):
        print("Domain {} has {} AAs.".format(ii,len(dd)));
        if len(dd) < 30:
            print("Filter out Domain {}.".format(ii));
            continue;
        domains_final.append(dd);
    return domains_final;


# In[3]:


class ResidueSelector(Select):
    def __init__(self,rset):
        self.residue_set = rset;
    def accept_residue(self,residue):
        if residue in self.residue_set:
            return 1;
        return 0;


# In[4]:


cas = [];
ppos = 0;
prevpos = None;
prevppos = None;
candidate_range = [];
for model in structure.get_list():
    for chain in model.get_list():
        pcan = [];
        if not chain.get_id() in target_chain:
            continue;
        ppos += 100;
        for residue in chain.get_list():
            if prevpos is None:
                prevpos = residue.get_id()[1]-1;
                prevppos = ppos;
            current_sid = residue.get_id()[1];
            if current_sid - prevpos > 1 or current_sid - prevpos < 0:
                ppos = abs(current_sid - prevpos)+prevppos;
            else:
                ppos = prevppos+1;
            prevppos = ppos;
            prevpos = current_sid;
            for atom in residue.get_list():
                if atom.get_name() == 'CA':
                    pcan.append(len(cas));
                    cas.append(CA_Residue(ppos,residue,atom.get_coord()))
        #print(len(cas))
        candidate_range.append(pcan);
        #break;
domains_final = perform_pdp(cas,candidate_range);
for dd in enumerate(list(domains_final)):
    rset = set();
    for ddd in list(dd[1]):
        rset.add(cas[ddd].residue);
    pdbio = PDBIO();
    pdbio.set_structure(structure);
    pdbio.save(out_prefix+"."+str(dd[0])+".pdb",ResidueSelector(rset))


# In[ ]:


# In[ ]:
	#!/usr/bin/env python
	# coding: utf-8

	# In[1]:
	"""
	Copyright 2022 yamule

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	"""

	from Bio.PDB import *;
	import numpy as np;
	from Bio.PDB import PDBIO ;
	from Bio.PDB.PDBIO import Select;
	import copy;
	pdb_parser = MMCIFParser()
	## pdb_parser = PDBParser()

	out_prefix = "xtestout";
	structure = pdb_parser.get_structure('X', '7t8o.cif');
	target_chain = set(["A","B"]);
	class CA_Residue:
	def __init__(self,idx,residue,co):
	self.index = idx;
	self.residue = residue;
	self.coordinates = co;


	# In[2]:


	def perform_pdp(cas,candidate_range):
	#get_contacting_residues
	contact_with = [];
	contact_with_rev = [];
	for ii in range(len(cas)):
	pcon = [];
	for jj in range(ii,len(cas)): # biojava で 5 残基以上離れているのしか考えていないようだったので
	if cas[jj].index - cas[ii].index >= 5:
	diff = (cas[ii].coordinates-cas[jj].coordinates);
	dsum = (diff*diff).sum();
	if dsum <= 0.0:
	dist = 0;
	else:
	dist = np.sqrt(dsum);
	if dist < 8.0:
	pcon.append(jj);
	contact_with.append(pcon);
	contact_with_rev.append([]);
	for gg in range(len(contact_with)):
	for dd in list(contact_with[gg]):
	contact_with_rev[dd].append(gg);
	cutpos_value = [];
	vsum = 0;
	xcou = 0;
	vave = None;
	endshift_0 = 12 if len(candidate_range) == 1 else 9;
	endshift = 9;# ignore this many boundary of AAs from terminals
	domains = [];
	dryrun = copy.deepcopy(candidate_range);
	while len(candidate_range) > 0:
	minvalue = 1000000000;
	minpos = None;
	zcan = candidate_range.pop();
	st = 0;
	en = len(zcan);
	tail_region = set(zcan);
	st_ = st+endshift;
	en_ = en-endshift;
	if zcan[st] == 0:
	st_ = st+endshift_0;
	if en_-1 - st_ < 1:
	domains.append(zcan);
	continue;
	if zcan[en-1] == len(cas)-1:
	en_ = en-endshift_0;

	#check_single_cut
	for ii in range(st,st_):
	tail_region.remove(zcan[ii]);
	for ii in range(st_,en_-1):
	tail_region.remove(zcan[ii]);
	contacting = set();
	for jj in range(st,ii+1):#ii の後ろで切る
	for cc in list(contact_with[zcan[jj]]):
	if cc in tail_region:
	contacting.add(cc);

	num_contacts = len(contacting);
	#print(num_contacts)
	bbase = ((len(zcan)-len(tail_region))*0.43)((len(tail_region))**0.43);
	bval = num_contacts/bbase;
	#print(st,ii,bval)
	if minpos is None or minvalue > bval:
	minpos = [zcan[0:(ii+1)],zcan[(ii+1):en]];
	minvalue = bval;
	vsum += bval;
	xcou += 1;

	if True:
	#check_double_cut
	head = [];
	tail2 = list(zcan);
	for ii in range(st,st_):
	head.append(zcan[ii]);
	chk = tail2.pop(0);
	assert chk == zcan[ii];

	for ii in range(st_,en_):
	head.append(zcan[ii]);
	chk = tail2.pop(0);
	assert chk == zcan[ii];
	tail = list(tail2);
	middle = set();
	for jj in range(ii+1,en-1):
	contacting = set();
	middle.add(zcan[jj]);
	chk = tail.pop(0);
	assert chk == zcan[jj];
	#if (not (tail[0] in contact_with[head[-1]])) or abs(cas[tail[0]][0]-cas[head[-1]][0]) < 35 :
	if (not (tail[0] in contact_with[head[-1]])) or len(middle) < 34 :
	continue;
	for hh in list(head):
	for cc in list(contact_with[hh]):
	if cc in middle:
	contacting.add(cc);
	for tt in list(tail):
	for cc in list(contact_with_rev[tt]):
	if cc in middle:
	contacting.add(cc);

	num_contacts = len(contacting);
	bbase = ((len(head)+len(tail))*0.43)((len(middle))**0.43);
	bval = num_contacts/bbase;
	chh = list(sorted(list(middle)));
	if minpos is None or minvalue > bval:
	minpos = [[head,tail],list(sorted(list(middle)))];
	minvalue = bval;
	vsum += bval;
	xcou += 1;

	if xcou == 0:
	continue;
	if dryrun is None:
	if minvalue < vave/2:
	candidate_range.extend(minpos);
	else:
	domains.append(zcan);
	else:
	if len(candidate_range) == 0:
	vave = vsum/xcou;
	candidate_range = dryrun;
	dryrun = None;

	print(str(len(domains))+" domains were found.");
	# merge domains which have plenty of contacts
	while True:
	candidate_pair = None;
	max_ncc = 2.0;
	num_domains = len(domains);
	domain_set = [];
	for dd in list(domains):
	domain_set.append(set(dd));
	for ii in range(0,num_domains-1):
	for jj in range(ii+1,num_domains):
	contacting = set();
	for dd in list(domains[ii]):
	for cc in list(contact_with[dd]):
	if cc in domain_set[jj]:
	contacting.add(cc);
	for cc in list(contact_with_rev[dd]):
	if cc in domain_set[jj]:
	contacting.add(cc);
	ncc = len(contacting)/(((len(domains[ii]))*0.43)((len(domains[jj]))**0.43));
	if ncc > max_ncc:
	max_ncc = ncc;
	candidate_pair = [ii,jj];
	if candidate_pair is None:
	break;
	else:
	print(candidate_pair);
	domains_new = [];
	domains_new.append([domains[candidate_pair[0]],domains[candidate_pair[1]]]);
	for dd in range(len(domains)):
	if dd != candidate_pair[0] and dd != candidate_pair[1]:
	domains_new.append(domains[dd]);
	assert len(domains)-1 == len(domains_new);
	domains = domains_new;
	print("Merged into "+str(len(domains))+" domains.");
	# filter out small segments
	domains_final = [];
	for (ii,dd) in enumerate(list(domains)):
	print("Domain {} has {} AAs.".format(ii,len(dd)));
	if len(dd) < 30:
	print("Filter out Domain {}.".format(ii));
	continue;
	domains_final.append(dd);
	return domains_final;


	# In[3]:


	class ResidueSelector(Select):
	def __init__(self,rset):
	self.residue_set = rset;
	def accept_residue(self,residue):
	if residue in self.residue_set:
	return 1;
	return 0;


	# In[4]:


	cas = [];
	ppos = 0;
	prevpos = None;
	prevppos = None;
	candidate_range = [];
	for model in structure.get_list():
	for chain in model.get_list():
	pcan = [];
	if not chain.get_id() in target_chain:
	continue;
	ppos += 100;
	for residue in chain.get_list():
	if prevpos is None:
	prevpos = residue.get_id()[1]-1;
	prevppos = ppos;
	current_sid = residue.get_id()[1];
	if current_sid - prevpos > 1 or current_sid - prevpos < 0:
	ppos = abs(current_sid - prevpos)+prevppos;
	else:
	ppos = prevppos+1;
	prevppos = ppos;
	prevpos = current_sid;
	for atom in residue.get_list():
	if atom.get_name() == 'CA':
	pcan.append(len(cas));
	cas.append(CA_Residue(ppos,residue,atom.get_coord()))
	#print(len(cas))
	candidate_range.append(pcan);
	#break;
	domains_final = perform_pdp(cas,candidate_range);
	for dd in enumerate(list(domains_final)):
	rset = set();
	for ddd in list(dd[1]):
	rset.add(cas[ddd].residue);
	pdbio = PDBIO();
	pdbio.set_structure(structure);
	pdbio.save(out_prefix+"."+str(dd[0])+".pdb",ResidueSelector(rset))


	# In[ ]:





	# In[ ]: