Last active
May 24, 2019 09:06
-
-
Save maxentile/422f9d2caed60ef6ebe95eba4daf8617 to your computer and use it in GitHub Desktop.
find a small subset of drugbank that exercises as many smirnoff parameters as the whole set does
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# of molecules in DrugBank: 7133 | |
# of molecules selected by greedy set cover: 56 | |
exercises 289 of 322 parameter types in smirnoff99frosst (89.752%) | |
unused bond types | |
b22: [#6X3:1]:[#8X2+1:2] | |
b31: [#6X2:1]=[#16:2] | |
b45: [#16:1]-[#17:2] | |
b46: [#16:1]-[#35:2] | |
b47: [#16:1]-[#53:2] | |
b61: [#15:1]=[#7:2] | |
b64: [#15:1]~[#9:2] | |
b75: [#7:1]-[#9:2] | |
b76: [#7:1]-[#17:2] | |
b78: [#7:1]-[#53:2] | |
b81: [#15:1]-[#35:2] | |
b82: [#15:1]-[#53:2] | |
unused angle types | |
a34: [*:1]=[#16X2:2]=[*:3] | |
unused improper torsion types | |
unused proper torsion types | |
t8: [#35:1]-[#6X4:2]-[#6X4:3]-[#35:4] | |
t18: [#1:1]-[#6X4:2]-[#6X3:3]=[#8X1:4] | |
t19: [#1:1]-[#6X4:2]-[#6X3:3]=[#6X3:4] | |
t33: [#7X3:1]-[#6X4;r3:2]-[#6X3:3]~[#6X3:4] | |
t53: [*:1]-[#7X4,#7X3:2]-[#6X4;r3:3]-[*:4] | |
t54: [#1:1]-[#7X4,#7X3:2]-[#6X4;r3:3]-[*:4] | |
t55: [#1:1]-[#7X4,#7X3:2]-[#6X4;r3:3]-[#6X4;r3:4] | |
t80: [#16X4,#16X3+0:1]-[#7X2:2]=[#6X3:3]-[#16X2,#16X3+1:4] | |
t81: [#7X1:1]~[#7X2:2]-[#6X3:3]~[#6X3:4] | |
t82: [#7X1:1]~[#7X2:2]-[#6X4:3]-[#6X3:4] | |
t83: [#7X1:1]~[#7X2:2]-[#6X4:3]~[#1:4] | |
t93: [#1:1]-[#8X2:2]-[#6X4;r3:3]-[#1:4] | |
t94: [#1:1]-[#8X2:2]-[#6X4;r3:3]-[#6X4:4] | |
t102: [#1:1]-[#8X2H1:2]-@[#6X3:3]~[*:4] | |
t121: [*:1]-[#7X4,#7X3:2]-[#7X4,#7X3:3]-[*:4] | |
t132: [#6:1]-[#7X2:2]~[#7X2:3]~[#7X1:4] | |
t145: [*:1]~[#16X4,#16X3+0:2]=,:[#7X2:3]-,:[*:4] | |
t153: [*:1]~[#7X4:2]-[#15:3]~[*:4] | |
unused nonbonded types | |
n30: [#37+1:1] | |
n31: [#55+1:1] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CC1([C@H]([C@]1(C(=O)NCCC(=O)NCCSC(=O)/C=C/c2cc(c(c(c2)OC)O)OC)O)O[P@](=O)(O)O[P@@](=O)(O)OC[C@@H]3[C@H]([C@@H]([C@@H](O3)n4cnc5c4ncnc5N)O)OP(=O)(O)O)C | |
C[C@@H](c1nc(cs1)c2ccc(cc2)C#N)[C@](Cn3c[n+](cn3)C(C)OC(=O)N(C)c4c(cccn4)COC(=O)CNC)(c5cc(ccc5F)F)O | |
c1ccc(cc1)[C@H](C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)CSc4nnnn4CS(=O)(=O)O)C(=O)O)O | |
CC1(CCC(=C(C1)c2ccc(cc2)Cl)CN3CCN(CC3)c4ccc(c(c4)Oc5cc6cc[nH]c6nc5)C(=O)NS(=O)(=O)c7ccc(c(c7)[N+](=O)[O-])NCC8CCOCC8)C | |
CCO/N=C(/c1nc(sn1)NP(=O)(O)O)\C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)Sc4nc(cs4)c5cc[n+](cc5)C)C(=O)[O-] | |
CC(C)(C)[C@H]/1C(=O)N2C[C@@H](C[C@H]2/C(=N/[C@@]3(C[C@H]3C=C)/C(=N/S(=O)(=O)C4CC4)/O)/O)Oc5c(nc6ccc(cc6n5)OC)CCCCC[C@@H]7C[C@H]7O/C(=N1)/O | |
CNC(=O)N(N(CCCl)S(=O)(=O)C)S(=O)(=O)C | |
C#CCN1CCN(CC1)c2c(cccc2Cl)NC(=O)c3ccc(o3)Br | |
C[C@]12CC[C@@H]3c4ccc(cc4C[C@H](C3[C@@H]1CC[C@@H]2O)CCCCCCCCCS(=O)CCCC(C(F)(F)F)(F)F)O | |
c1cc(ccc1CC(=O)NC[P@H](=O)O)I | |
CCN(CC)C(=S)S | |
c1c(cc(cc1C(=O)NCCCO[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)CO)O)O)O)C(=O)NCCCO[C@@H]3[C@H]([C@@H]([C@H]([C@@H](O3)CO)O)O)O)CN=[N+]=[N-] | |
Cc1c2/cc/3\nc(/[o+]c\4/c(c(/c(/[n-]4)c/c(n\cc/c(c1CCC(=O)O)[n-]2)/C(=C)C)C)C=C)C(=C3C=C)C.[Fe+4] | |
CCN(CC)C(=O)[C@@]1(C[C@@H]1CN)c2ccccc2 | |
c1ccc(cc1)C[C@H]2C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CSSCCC(=O)N[C@H](C(=O)N2)Cc3ccc(cc3)O)C(=O)N4CCC[C@H]4C(=O)N[C@@H](CCCNC(=N)N)C(=O)NCC(=O)N)CC(=O)N)CCC(=O)N | |
CC(C)NC(=O)c1ccc(cc1)CNNC | |
c1ccc(cc1)/C=C/c2nn([n+](n2)c3ccc4c(c3)c(=O)[nH][nH]c4=O)c5nc6ccccc6s5 | |
CCCCC(=O)OC[C@H](CO[P@@](=S)(OCC[N+](C)(C)C)[S-])OC(=O)CCCC | |
CCCCC[C@@H](/C=C/[C@H]1[C@H]2C[C@@H]([C@@H]1C/C=C\CCCC(=O)O)OO2)OO | |
COc1ccnc(c1OC)CS(=O)c2[nH]c3cc(ccc3n2)OC(F)F | |
C1CN(CCN1C(=O)CCBr)C(=O)CCBr | |
CC1=C(C(C(=C(N1)C)C(=O)OC(C)C)c2cccc3c2non3)C(=O)OC | |
[Cl-].[K+] | |
CCc1nc(c(n1Cc2ccc3c(c2)c(c(o3)c4ccccc4NS(=O)(=O)C(F)(F)F)Br)C(=O)N)C5CC5 | |
CCCC[C@@H](C(CNC1CC1)(O)O)N | |
c1cc(sc1)CNS(=O)(=O)c2ccc(s2)S(=O)(=O)N | |
C/C(=C\C=C\C=C\C=C(/C)\C=C/1\C=C(C(=O)O1)/C=C/[C@@]23[C@@](O2)(C[C@@H](CC3(C)C)O)C)/C=C=C4[C@@](C[C@@H](CC4(C)C)OC(=O)C)(C)O | |
Cc1ccccc1c2c3ccc(cc3cnn2)c4cc(ccc4C)C(=O)NC5CC5 | |
[F-].[Na+] | |
Cc1cc2c(cc1C)N3C=N2[Co]456(N7=C8[C@H](C(C7=CC9=N4C(=C(C1=N5[C@@]([C@@H]2N6C(=C8C)[C@@]([C@H]2CC(=O)N)(CCC(=O)NC[C@H](OP(=O)(O[C@@H]2[C@H](O[C@H]3[C@@H]2O)CO)O)C)C)([C@@]([C@@H]1CCC(=O)N)(C)CC(=O)N)C)C)[C@@]([C@@H]9CCC(=O)N)(C)CC(=O)N)(C)C)CCC(=O)N)C#N | |
C1[C@H](C(=O)NO1)N | |
CC(C)[N@+]1([C@@H]2CC[C@H]1C[C@H](C2)OC(=O)C(CO)c3ccccc3)C.[Br-] | |
C[N+]1(CCc2cc(c3cc2[C@@H]1Cc4ccc(cc4)Oc5c6c(cc(c5OC)OC)CC[N+]([C@@H]6Cc7ccc(c(c7)O3)OC)(C)C)OC)C.[I-].[I-] | |
[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)Cl)Cl)Cl)Cl)Cl | |
CNC1=Nc2ccc(cc2C(=N(=O)C1)c3ccccc3)Cl | |
Cc1c(nc[nH]1)CSCCN/C(=N/C)/NC#N | |
C(=O)(O)P(=O)(O)O | |
c1ccc(cc1)CSCC2=NS(=O)(=O)c3cc(c(cc3N2)Cl)S(=O)(=O)N | |
CC(C)OP(=O)(OC(C)C)F | |
CC(c1cc2ccccc2s1)N(C(=O)N)O | |
CC=CCC=CCCC(=O)[C@@H]1[C@@H](O1)C(=O)N | |
[Li+] | |
C([C@H](CSO)O)O | |
Cc1cc2c(cc1C3(CC3)c4ccc(cn4)C(=O)O)C(CCC2(C)C)(C)C | |
CCCCCCCCCCCCCCCCS(=O)(=O)F | |
B(c1cccc(c1)NC(=O)CI)(O)O | |
CC(=O)N(CCOP(=O)(O)O)Br | |
CCN/[S]=C/N=C/1\C=CC(=C2c3ccc(cc3Oc4c2ccc(c4)O)O)C(=C1)C(=O)O | |
c1ccc(cc1)C(=O)Nc2cc(n[nH]2)C3CC3 | |
c1ccc(cc1)C[C@@H](C(=O)C[N+]#N)NC(=O)OCc2ccccc2 | |
COc1ccc2ccc(cc2c1c3cnn(c3)S(=O)(=O)C)C(=N)N | |
C([C@@H](C(=O)O)N)[S]=O | |
c1cc(c(c(c1)Cl)Cl)c2cc(n3c(n2)c(cn3)SC#N)NCc4ccncc4 | |
CC(C)c1c(c2cc(ccc2[nH]c1=O)F)OC#CC3CC3 | |
CC[C@H](COc1ccccc1)O[P@](=O)(C)Cl | |
C(=[N-])=[N-].[Ca+2] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from tqdm import tqdm | |
from openeye import oechem | |
import xmltodict | |
from openforcefield.typing.engines.smirnoff import get_molecule_parameterIDs | |
path_to_smirnoff = 'smirnoff99Frosst.offxml' | |
path_to_drugbank = 'DrugBank_atyped.oeb' | |
# load drugbank | |
ifs = oechem.oemolistream(path_to_drugbank) | |
mols = [] | |
mol = oechem.OEGraphMol() | |
while oechem.OEReadMolecule(ifs, mol): | |
mols.append(mol) | |
mol = oechem.OEGraphMol() | |
# assign parameters | |
smirnoff_types_drugbank = [] | |
for i in tqdm(range(len(mols))): | |
parameterIDs = get_molecule_parameterIDs([mols[i]], path_to_smirnoff) | |
smirnoff_types_drugbank.append(set(list(parameterIDs[0].values())[0])) | |
# pick a small subset that exercises as many parameters as the whole set | |
def greedy_set_cover(list_of_subsets): | |
"""Pick the molecule that would add the most types""" | |
S = set() | |
included_indices = [] | |
remaining_indices = list(range(len(list_of_subsets))) | |
improvable = True | |
while (len(remaining_indices) > 0) and (improvable): | |
weights = [len(S.union(list_of_subsets[i])) for i in remaining_indices] | |
if max(weights) == len(S): | |
improvable = False | |
else: | |
i = remaining_indices[np.argmax(weights)] | |
S.update(list_of_subsets[i]) | |
included_indices.append(i) | |
remaining_indices.remove(i) | |
return included_indices | |
drugbank_covering_set = greedy_set_cover(smirnoff_types_drugbank) | |
S_drugbank = set() | |
for i in drugbank_covering_set: | |
S_drugbank.update(smirnoff_types_drugbank[i]) | |
# report | |
print('# of molecules in DrugBank: {}'.format(len(mols))) | |
print('# of molecules selected by greedy set cover: {}'.format(len(drugbank_covering_set))) | |
with open(path_to_smirnoff, 'r') as f: | |
smirnoff_file = f.read() | |
smirnoff = xmltodict.parse(smirnoff_file) | |
types_to_smirks_dict = {} | |
for d in smirnoff['SMIRNOFF']['HarmonicBondForce']['Bond']: | |
types_to_smirks_dict[d['@id']] = d['@smirks'] | |
for d in smirnoff['SMIRNOFF']['HarmonicAngleForce']['Angle']: | |
types_to_smirks_dict[d['@id']] = d['@smirks'] | |
for d in smirnoff['SMIRNOFF']['PeriodicTorsionForce']['Proper']: | |
types_to_smirks_dict[d['@id']] = d['@smirks'] | |
for d in smirnoff['SMIRNOFF']['PeriodicTorsionForce']['Improper']: | |
types_to_smirks_dict[d['@id']] = d['@smirks'] | |
for d in smirnoff['SMIRNOFF']['NonbondedForce']['Atom']: | |
types_to_smirks_dict[d['@id']] = d['@smirks'] | |
unexercised_types = [] | |
for key in types_to_smirks_dict.keys(): | |
if not (key in S_drugbank): | |
unexercised_types.append(key) | |
num_types = len(types_to_smirks_dict) | |
num_not_exercised = len(unexercised_types) | |
num_exercised = num_types - num_not_exercised | |
print('exercises {} of {} parameter types in smirnoff99frosst ({:.3f}%)'.format(num_exercised, num_types, 100 * num_exercised / num_types)) | |
print('\nunused bond types') | |
for key in unexercised_types: | |
if key[0] == 'b': | |
print('\t{}: {}'.format(key, types_to_smirks_dict[key])) | |
print('\nunused angle types') | |
for key in unexercised_types: | |
if key[0] == 'a': | |
print('\t{}: {}'.format(key, types_to_smirks_dict[key])) | |
print('\nunused improper torsion types') | |
for key in unexercised_types: | |
if key[0] == 'i': | |
print('\t{}: {}'.format(key, types_to_smirks_dict[key])) | |
print('\nunused proper torsion types') | |
for key in unexercised_types: | |
if key[0] == 't': | |
print('\t{}: {}'.format(key, types_to_smirks_dict[key])) | |
print('\nunused nonbonded types') | |
for key in unexercised_types: | |
if key[0] == 'n': | |
print('\t{}: {}'.format(key, types_to_smirks_dict[key])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I'm an undergraduate researcher in the Mobley Lab working on finding a set of molcules in eMolecules that can cover all parameters in the SMIRNOFF force field. I notice you used the
get_molecule_parameterIDs
function in your script, but I cannot seem to find it anywhere, even when I try to import it. Do you know how I can find it?