Skip to content

Instantly share code, notes, and snippets.

@yamule
Last active January 1, 2022 14:52
Show Gist options
  • Save yamule/9cf87b06d7f2bc5e8823f977efb8ee78 to your computer and use it in GitHub Desktop.
Save yamule/9cf87b06d7f2bc5e8823f977efb8ee78 to your computer and use it in GitHub Desktop.
Protein Domain Parser
#!/usr/bin/env python
# coding: utf-8
# In[1]:
"""
Copyright 2022 yamule
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from Bio.PDB import *;
import numpy as np;
from Bio.PDB import PDBIO ;
from Bio.PDB.PDBIO import Select;
import copy;
pdb_parser = MMCIFParser()
## pdb_parser = PDBParser()
out_prefix = "xtestout";
structure = pdb_parser.get_structure('X', '7t8o.cif');
target_chain = set(["A","B"]);
class CA_Residue:
def __init__(self,idx,residue,co):
self.index = idx;
self.residue = residue;
self.coordinates = co;
# In[2]:
def perform_pdp(cas,candidate_range):
#get_contacting_residues
contact_with = [];
contact_with_rev = [];
for ii in range(len(cas)):
pcon = [];
for jj in range(ii,len(cas)): # biojava で 5 残基以上離れているのしか考えていないようだったので
if cas[jj].index - cas[ii].index >= 5:
diff = (cas[ii].coordinates-cas[jj].coordinates);
dsum = (diff*diff).sum();
if dsum <= 0.0:
dist = 0;
else:
dist = np.sqrt(dsum);
if dist < 8.0:
pcon.append(jj);
contact_with.append(pcon);
contact_with_rev.append([]);
for gg in range(len(contact_with)):
for dd in list(contact_with[gg]):
contact_with_rev[dd].append(gg);
cutpos_value = [];
vsum = 0;
xcou = 0;
vave = None;
endshift_0 = 12 if len(candidate_range) == 1 else 9;
endshift = 9;# ignore this many boundary of AAs from terminals
domains = [];
dryrun = copy.deepcopy(candidate_range);
while len(candidate_range) > 0:
minvalue = 1000000000;
minpos = None;
zcan = candidate_range.pop();
st = 0;
en = len(zcan);
tail_region = set(zcan);
st_ = st+endshift;
en_ = en-endshift;
if zcan[st] == 0:
st_ = st+endshift_0;
if en_-1 - st_ < 1:
domains.append(zcan);
continue;
if zcan[en-1] == len(cas)-1:
en_ = en-endshift_0;
#check_single_cut
for ii in range(st,st_):
tail_region.remove(zcan[ii]);
for ii in range(st_,en_-1):
tail_region.remove(zcan[ii]);
contacting = set();
for jj in range(st,ii+1):#ii の後ろで切る
for cc in list(contact_with[zcan[jj]]):
if cc in tail_region:
contacting.add(cc);
num_contacts = len(contacting);
#print(num_contacts)
bbase = ((len(zcan)-len(tail_region))**0.43)*((len(tail_region))**0.43);
bval = num_contacts/bbase;
#print(st,ii,bval)
if minpos is None or minvalue > bval:
minpos = [zcan[0:(ii+1)],zcan[(ii+1):en]];
minvalue = bval;
vsum += bval;
xcou += 1;
if True:
#check_double_cut
head = [];
tail2 = list(zcan);
for ii in range(st,st_):
head.append(zcan[ii]);
chk = tail2.pop(0);
assert chk == zcan[ii];
for ii in range(st_,en_):
head.append(zcan[ii]);
chk = tail2.pop(0);
assert chk == zcan[ii];
tail = list(tail2);
middle = set();
for jj in range(ii+1,en-1):
contacting = set();
middle.add(zcan[jj]);
chk = tail.pop(0);
assert chk == zcan[jj];
#if (not (tail[0] in contact_with[head[-1]])) or abs(cas[tail[0]][0]-cas[head[-1]][0]) < 35 :
if (not (tail[0] in contact_with[head[-1]])) or len(middle) < 34 :
continue;
for hh in list(head):
for cc in list(contact_with[hh]):
if cc in middle:
contacting.add(cc);
for tt in list(tail):
for cc in list(contact_with_rev[tt]):
if cc in middle:
contacting.add(cc);
num_contacts = len(contacting);
bbase = ((len(head)+len(tail))**0.43)*((len(middle))**0.43);
bval = num_contacts/bbase;
chh = list(sorted(list(middle)));
if minpos is None or minvalue > bval:
minpos = [[*head,*tail],list(sorted(list(middle)))];
minvalue = bval;
vsum += bval;
xcou += 1;
if xcou == 0:
continue;
if dryrun is None:
if minvalue < vave/2:
candidate_range.extend(minpos);
else:
domains.append(zcan);
else:
if len(candidate_range) == 0:
vave = vsum/xcou;
candidate_range = dryrun;
dryrun = None;
print(str(len(domains))+" domains were found.");
# merge domains which have plenty of contacts
while True:
candidate_pair = None;
max_ncc = 2.0;
num_domains = len(domains);
domain_set = [];
for dd in list(domains):
domain_set.append(set(dd));
for ii in range(0,num_domains-1):
for jj in range(ii+1,num_domains):
contacting = set();
for dd in list(domains[ii]):
for cc in list(contact_with[dd]):
if cc in domain_set[jj]:
contacting.add(cc);
for cc in list(contact_with_rev[dd]):
if cc in domain_set[jj]:
contacting.add(cc);
ncc = len(contacting)/(((len(domains[ii]))**0.43)*((len(domains[jj]))**0.43));
if ncc > max_ncc:
max_ncc = ncc;
candidate_pair = [ii,jj];
if candidate_pair is None:
break;
else:
print(candidate_pair);
domains_new = [];
domains_new.append([*domains[candidate_pair[0]],*domains[candidate_pair[1]]]);
for dd in range(len(domains)):
if dd != candidate_pair[0] and dd != candidate_pair[1]:
domains_new.append(domains[dd]);
assert len(domains)-1 == len(domains_new);
domains = domains_new;
print("Merged into "+str(len(domains))+" domains.");
# filter out small segments
domains_final = [];
for (ii,dd) in enumerate(list(domains)):
print("Domain {} has {} AAs.".format(ii,len(dd)));
if len(dd) < 30:
print("Filter out Domain {}.".format(ii));
continue;
domains_final.append(dd);
return domains_final;
# In[3]:
class ResidueSelector(Select):
def __init__(self,rset):
self.residue_set = rset;
def accept_residue(self,residue):
if residue in self.residue_set:
return 1;
return 0;
# In[4]:
cas = [];
ppos = 0;
prevpos = None;
prevppos = None;
candidate_range = [];
for model in structure.get_list():
for chain in model.get_list():
pcan = [];
if not chain.get_id() in target_chain:
continue;
ppos += 100;
for residue in chain.get_list():
if prevpos is None:
prevpos = residue.get_id()[1]-1;
prevppos = ppos;
current_sid = residue.get_id()[1];
if current_sid - prevpos > 1 or current_sid - prevpos < 0:
ppos = abs(current_sid - prevpos)+prevppos;
else:
ppos = prevppos+1;
prevppos = ppos;
prevpos = current_sid;
for atom in residue.get_list():
if atom.get_name() == 'CA':
pcan.append(len(cas));
cas.append(CA_Residue(ppos,residue,atom.get_coord()))
#print(len(cas))
candidate_range.append(pcan);
#break;
domains_final = perform_pdp(cas,candidate_range);
for dd in enumerate(list(domains_final)):
rset = set();
for ddd in list(dd[1]):
rset.add(cas[ddd].residue);
pdbio = PDBIO();
pdbio.set_structure(structure);
pdbio.save(out_prefix+"."+str(dd[0])+".pdb",ResidueSelector(rset))
# In[ ]:
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment