Last active
January 1, 2022 14:52
-
-
Save yamule/9cf87b06d7f2bc5e8823f977efb8ee78 to your computer and use it in GitHub Desktop.
Protein Domain Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[1]: | |
""" | |
Copyright 2022 yamule | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
""" | |
from Bio.PDB import *; | |
import numpy as np; | |
from Bio.PDB import PDBIO ; | |
from Bio.PDB.PDBIO import Select; | |
import copy; | |
pdb_parser = MMCIFParser() | |
## pdb_parser = PDBParser() | |
out_prefix = "xtestout"; | |
structure = pdb_parser.get_structure('X', '7t8o.cif'); | |
target_chain = set(["A","B"]); | |
class CA_Residue: | |
def __init__(self,idx,residue,co): | |
self.index = idx; | |
self.residue = residue; | |
self.coordinates = co; | |
# In[2]: | |
def perform_pdp(cas,candidate_range): | |
#get_contacting_residues | |
contact_with = []; | |
contact_with_rev = []; | |
for ii in range(len(cas)): | |
pcon = []; | |
for jj in range(ii,len(cas)): # biojava で 5 残基以上離れているのしか考えていないようだったので | |
if cas[jj].index - cas[ii].index >= 5: | |
diff = (cas[ii].coordinates-cas[jj].coordinates); | |
dsum = (diff*diff).sum(); | |
if dsum <= 0.0: | |
dist = 0; | |
else: | |
dist = np.sqrt(dsum); | |
if dist < 8.0: | |
pcon.append(jj); | |
contact_with.append(pcon); | |
contact_with_rev.append([]); | |
for gg in range(len(contact_with)): | |
for dd in list(contact_with[gg]): | |
contact_with_rev[dd].append(gg); | |
cutpos_value = []; | |
vsum = 0; | |
xcou = 0; | |
vave = None; | |
endshift_0 = 12 if len(candidate_range) == 1 else 9; | |
endshift = 9;# ignore this many boundary of AAs from terminals | |
domains = []; | |
dryrun = copy.deepcopy(candidate_range); | |
while len(candidate_range) > 0: | |
minvalue = 1000000000; | |
minpos = None; | |
zcan = candidate_range.pop(); | |
st = 0; | |
en = len(zcan); | |
tail_region = set(zcan); | |
st_ = st+endshift; | |
en_ = en-endshift; | |
if zcan[st] == 0: | |
st_ = st+endshift_0; | |
if en_-1 - st_ < 1: | |
domains.append(zcan); | |
continue; | |
if zcan[en-1] == len(cas)-1: | |
en_ = en-endshift_0; | |
#check_single_cut | |
for ii in range(st,st_): | |
tail_region.remove(zcan[ii]); | |
for ii in range(st_,en_-1): | |
tail_region.remove(zcan[ii]); | |
contacting = set(); | |
for jj in range(st,ii+1):#ii の後ろで切る | |
for cc in list(contact_with[zcan[jj]]): | |
if cc in tail_region: | |
contacting.add(cc); | |
num_contacts = len(contacting); | |
#print(num_contacts) | |
bbase = ((len(zcan)-len(tail_region))**0.43)*((len(tail_region))**0.43); | |
bval = num_contacts/bbase; | |
#print(st,ii,bval) | |
if minpos is None or minvalue > bval: | |
minpos = [zcan[0:(ii+1)],zcan[(ii+1):en]]; | |
minvalue = bval; | |
vsum += bval; | |
xcou += 1; | |
if True: | |
#check_double_cut | |
head = []; | |
tail2 = list(zcan); | |
for ii in range(st,st_): | |
head.append(zcan[ii]); | |
chk = tail2.pop(0); | |
assert chk == zcan[ii]; | |
for ii in range(st_,en_): | |
head.append(zcan[ii]); | |
chk = tail2.pop(0); | |
assert chk == zcan[ii]; | |
tail = list(tail2); | |
middle = set(); | |
for jj in range(ii+1,en-1): | |
contacting = set(); | |
middle.add(zcan[jj]); | |
chk = tail.pop(0); | |
assert chk == zcan[jj]; | |
#if (not (tail[0] in contact_with[head[-1]])) or abs(cas[tail[0]][0]-cas[head[-1]][0]) < 35 : | |
if (not (tail[0] in contact_with[head[-1]])) or len(middle) < 34 : | |
continue; | |
for hh in list(head): | |
for cc in list(contact_with[hh]): | |
if cc in middle: | |
contacting.add(cc); | |
for tt in list(tail): | |
for cc in list(contact_with_rev[tt]): | |
if cc in middle: | |
contacting.add(cc); | |
num_contacts = len(contacting); | |
bbase = ((len(head)+len(tail))**0.43)*((len(middle))**0.43); | |
bval = num_contacts/bbase; | |
chh = list(sorted(list(middle))); | |
if minpos is None or minvalue > bval: | |
minpos = [[*head,*tail],list(sorted(list(middle)))]; | |
minvalue = bval; | |
vsum += bval; | |
xcou += 1; | |
if xcou == 0: | |
continue; | |
if dryrun is None: | |
if minvalue < vave/2: | |
candidate_range.extend(minpos); | |
else: | |
domains.append(zcan); | |
else: | |
if len(candidate_range) == 0: | |
vave = vsum/xcou; | |
candidate_range = dryrun; | |
dryrun = None; | |
print(str(len(domains))+" domains were found."); | |
# merge domains which have plenty of contacts | |
while True: | |
candidate_pair = None; | |
max_ncc = 2.0; | |
num_domains = len(domains); | |
domain_set = []; | |
for dd in list(domains): | |
domain_set.append(set(dd)); | |
for ii in range(0,num_domains-1): | |
for jj in range(ii+1,num_domains): | |
contacting = set(); | |
for dd in list(domains[ii]): | |
for cc in list(contact_with[dd]): | |
if cc in domain_set[jj]: | |
contacting.add(cc); | |
for cc in list(contact_with_rev[dd]): | |
if cc in domain_set[jj]: | |
contacting.add(cc); | |
ncc = len(contacting)/(((len(domains[ii]))**0.43)*((len(domains[jj]))**0.43)); | |
if ncc > max_ncc: | |
max_ncc = ncc; | |
candidate_pair = [ii,jj]; | |
if candidate_pair is None: | |
break; | |
else: | |
print(candidate_pair); | |
domains_new = []; | |
domains_new.append([*domains[candidate_pair[0]],*domains[candidate_pair[1]]]); | |
for dd in range(len(domains)): | |
if dd != candidate_pair[0] and dd != candidate_pair[1]: | |
domains_new.append(domains[dd]); | |
assert len(domains)-1 == len(domains_new); | |
domains = domains_new; | |
print("Merged into "+str(len(domains))+" domains."); | |
# filter out small segments | |
domains_final = []; | |
for (ii,dd) in enumerate(list(domains)): | |
print("Domain {} has {} AAs.".format(ii,len(dd))); | |
if len(dd) < 30: | |
print("Filter out Domain {}.".format(ii)); | |
continue; | |
domains_final.append(dd); | |
return domains_final; | |
# In[3]: | |
class ResidueSelector(Select): | |
def __init__(self,rset): | |
self.residue_set = rset; | |
def accept_residue(self,residue): | |
if residue in self.residue_set: | |
return 1; | |
return 0; | |
# In[4]: | |
cas = []; | |
ppos = 0; | |
prevpos = None; | |
prevppos = None; | |
candidate_range = []; | |
for model in structure.get_list(): | |
for chain in model.get_list(): | |
pcan = []; | |
if not chain.get_id() in target_chain: | |
continue; | |
ppos += 100; | |
for residue in chain.get_list(): | |
if prevpos is None: | |
prevpos = residue.get_id()[1]-1; | |
prevppos = ppos; | |
current_sid = residue.get_id()[1]; | |
if current_sid - prevpos > 1 or current_sid - prevpos < 0: | |
ppos = abs(current_sid - prevpos)+prevppos; | |
else: | |
ppos = prevppos+1; | |
prevppos = ppos; | |
prevpos = current_sid; | |
for atom in residue.get_list(): | |
if atom.get_name() == 'CA': | |
pcan.append(len(cas)); | |
cas.append(CA_Residue(ppos,residue,atom.get_coord())) | |
#print(len(cas)) | |
candidate_range.append(pcan); | |
#break; | |
domains_final = perform_pdp(cas,candidate_range); | |
for dd in enumerate(list(domains_final)): | |
rset = set(); | |
for ddd in list(dd[1]): | |
rset.add(cas[ddd].residue); | |
pdbio = PDBIO(); | |
pdbio.set_structure(structure); | |
pdbio.save(out_prefix+"."+str(dd[0])+".pdb",ResidueSelector(rset)) | |
# In[ ]: | |
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment