Skip to content

Instantly share code, notes, and snippets.

@maxentile
Last active May 24, 2019 09:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save maxentile/422f9d2caed60ef6ebe95eba4daf8617 to your computer and use it in GitHub Desktop.
Save maxentile/422f9d2caed60ef6ebe95eba4daf8617 to your computer and use it in GitHub Desktop.
find a small subset of drugbank that exercises as many smirnoff parameters as the whole set does
# of molecules in DrugBank: 7133
# of molecules selected by greedy set cover: 56
exercises 289 of 322 parameter types in smirnoff99frosst (89.752%)
unused bond types
b22: [#6X3:1]:[#8X2+1:2]
b31: [#6X2:1]=[#16:2]
b45: [#16:1]-[#17:2]
b46: [#16:1]-[#35:2]
b47: [#16:1]-[#53:2]
b61: [#15:1]=[#7:2]
b64: [#15:1]~[#9:2]
b75: [#7:1]-[#9:2]
b76: [#7:1]-[#17:2]
b78: [#7:1]-[#53:2]
b81: [#15:1]-[#35:2]
b82: [#15:1]-[#53:2]
unused angle types
a34: [*:1]=[#16X2:2]=[*:3]
unused improper torsion types
unused proper torsion types
t8: [#35:1]-[#6X4:2]-[#6X4:3]-[#35:4]
t18: [#1:1]-[#6X4:2]-[#6X3:3]=[#8X1:4]
t19: [#1:1]-[#6X4:2]-[#6X3:3]=[#6X3:4]
t33: [#7X3:1]-[#6X4;r3:2]-[#6X3:3]~[#6X3:4]
t53: [*:1]-[#7X4,#7X3:2]-[#6X4;r3:3]-[*:4]
t54: [#1:1]-[#7X4,#7X3:2]-[#6X4;r3:3]-[*:4]
t55: [#1:1]-[#7X4,#7X3:2]-[#6X4;r3:3]-[#6X4;r3:4]
t80: [#16X4,#16X3+0:1]-[#7X2:2]=[#6X3:3]-[#16X2,#16X3+1:4]
t81: [#7X1:1]~[#7X2:2]-[#6X3:3]~[#6X3:4]
t82: [#7X1:1]~[#7X2:2]-[#6X4:3]-[#6X3:4]
t83: [#7X1:1]~[#7X2:2]-[#6X4:3]~[#1:4]
t93: [#1:1]-[#8X2:2]-[#6X4;r3:3]-[#1:4]
t94: [#1:1]-[#8X2:2]-[#6X4;r3:3]-[#6X4:4]
t102: [#1:1]-[#8X2H1:2]-@[#6X3:3]~[*:4]
t121: [*:1]-[#7X4,#7X3:2]-[#7X4,#7X3:3]-[*:4]
t132: [#6:1]-[#7X2:2]~[#7X2:3]~[#7X1:4]
t145: [*:1]~[#16X4,#16X3+0:2]=,:[#7X2:3]-,:[*:4]
t153: [*:1]~[#7X4:2]-[#15:3]~[*:4]
unused nonbonded types
n30: [#37+1:1]
n31: [#55+1:1]
CC1([C@H]([C@]1(C(=O)NCCC(=O)NCCSC(=O)/C=C/c2cc(c(c(c2)OC)O)OC)O)O[P@](=O)(O)O[P@@](=O)(O)OC[C@@H]3[C@H]([C@@H]([C@@H](O3)n4cnc5c4ncnc5N)O)OP(=O)(O)O)C
C[C@@H](c1nc(cs1)c2ccc(cc2)C#N)[C@](Cn3c[n+](cn3)C(C)OC(=O)N(C)c4c(cccn4)COC(=O)CNC)(c5cc(ccc5F)F)O
c1ccc(cc1)[C@H](C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)CSc4nnnn4CS(=O)(=O)O)C(=O)O)O
CC1(CCC(=C(C1)c2ccc(cc2)Cl)CN3CCN(CC3)c4ccc(c(c4)Oc5cc6cc[nH]c6nc5)C(=O)NS(=O)(=O)c7ccc(c(c7)[N+](=O)[O-])NCC8CCOCC8)C
CCO/N=C(/c1nc(sn1)NP(=O)(O)O)\C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)Sc4nc(cs4)c5cc[n+](cc5)C)C(=O)[O-]
CC(C)(C)[C@H]/1C(=O)N2C[C@@H](C[C@H]2/C(=N/[C@@]3(C[C@H]3C=C)/C(=N/S(=O)(=O)C4CC4)/O)/O)Oc5c(nc6ccc(cc6n5)OC)CCCCC[C@@H]7C[C@H]7O/C(=N1)/O
CNC(=O)N(N(CCCl)S(=O)(=O)C)S(=O)(=O)C
C#CCN1CCN(CC1)c2c(cccc2Cl)NC(=O)c3ccc(o3)Br
C[C@]12CC[C@@H]3c4ccc(cc4C[C@H](C3[C@@H]1CC[C@@H]2O)CCCCCCCCCS(=O)CCCC(C(F)(F)F)(F)F)O
c1cc(ccc1CC(=O)NC[P@H](=O)O)I
CCN(CC)C(=S)S
c1c(cc(cc1C(=O)NCCCO[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)CO)O)O)O)C(=O)NCCCO[C@@H]3[C@H]([C@@H]([C@H]([C@@H](O3)CO)O)O)O)CN=[N+]=[N-]
Cc1c2/cc/3\nc(/[o+]c\4/c(c(/c(/[n-]4)c/c(n\cc/c(c1CCC(=O)O)[n-]2)/C(=C)C)C)C=C)C(=C3C=C)C.[Fe+4]
CCN(CC)C(=O)[C@@]1(C[C@@H]1CN)c2ccccc2
c1ccc(cc1)C[C@H]2C(=O)N[C@H](C(=O)N[C@H](C(=O)N[C@@H](CSSCCC(=O)N[C@H](C(=O)N2)Cc3ccc(cc3)O)C(=O)N4CCC[C@H]4C(=O)N[C@@H](CCCNC(=N)N)C(=O)NCC(=O)N)CC(=O)N)CCC(=O)N
CC(C)NC(=O)c1ccc(cc1)CNNC
c1ccc(cc1)/C=C/c2nn([n+](n2)c3ccc4c(c3)c(=O)[nH][nH]c4=O)c5nc6ccccc6s5
CCCCC(=O)OC[C@H](CO[P@@](=S)(OCC[N+](C)(C)C)[S-])OC(=O)CCCC
CCCCC[C@@H](/C=C/[C@H]1[C@H]2C[C@@H]([C@@H]1C/C=C\CCCC(=O)O)OO2)OO
COc1ccnc(c1OC)CS(=O)c2[nH]c3cc(ccc3n2)OC(F)F
C1CN(CCN1C(=O)CCBr)C(=O)CCBr
CC1=C(C(C(=C(N1)C)C(=O)OC(C)C)c2cccc3c2non3)C(=O)OC
[Cl-].[K+]
CCc1nc(c(n1Cc2ccc3c(c2)c(c(o3)c4ccccc4NS(=O)(=O)C(F)(F)F)Br)C(=O)N)C5CC5
CCCC[C@@H](C(CNC1CC1)(O)O)N
c1cc(sc1)CNS(=O)(=O)c2ccc(s2)S(=O)(=O)N
C/C(=C\C=C\C=C\C=C(/C)\C=C/1\C=C(C(=O)O1)/C=C/[C@@]23[C@@](O2)(C[C@@H](CC3(C)C)O)C)/C=C=C4[C@@](C[C@@H](CC4(C)C)OC(=O)C)(C)O
Cc1ccccc1c2c3ccc(cc3cnn2)c4cc(ccc4C)C(=O)NC5CC5
[F-].[Na+]
Cc1cc2c(cc1C)N3C=N2[Co]456(N7=C8[C@H](C(C7=CC9=N4C(=C(C1=N5[C@@]([C@@H]2N6C(=C8C)[C@@]([C@H]2CC(=O)N)(CCC(=O)NC[C@H](OP(=O)(O[C@@H]2[C@H](O[C@H]3[C@@H]2O)CO)O)C)C)([C@@]([C@@H]1CCC(=O)N)(C)CC(=O)N)C)C)[C@@]([C@@H]9CCC(=O)N)(C)CC(=O)N)(C)C)CCC(=O)N)C#N
C1[C@H](C(=O)NO1)N
CC(C)[N@+]1([C@@H]2CC[C@H]1C[C@H](C2)OC(=O)C(CO)c3ccccc3)C.[Br-]
C[N+]1(CCc2cc(c3cc2[C@@H]1Cc4ccc(cc4)Oc5c6c(cc(c5OC)OC)CC[N+]([C@@H]6Cc7ccc(c(c7)O3)OC)(C)C)OC)C.[I-].[I-]
[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)Cl)Cl)Cl)Cl)Cl
CNC1=Nc2ccc(cc2C(=N(=O)C1)c3ccccc3)Cl
Cc1c(nc[nH]1)CSCCN/C(=N/C)/NC#N
C(=O)(O)P(=O)(O)O
c1ccc(cc1)CSCC2=NS(=O)(=O)c3cc(c(cc3N2)Cl)S(=O)(=O)N
CC(C)OP(=O)(OC(C)C)F
CC(c1cc2ccccc2s1)N(C(=O)N)O
CC=CCC=CCCC(=O)[C@@H]1[C@@H](O1)C(=O)N
[Li+]
C([C@H](CSO)O)O
Cc1cc2c(cc1C3(CC3)c4ccc(cn4)C(=O)O)C(CCC2(C)C)(C)C
CCCCCCCCCCCCCCCCS(=O)(=O)F
B(c1cccc(c1)NC(=O)CI)(O)O
CC(=O)N(CCOP(=O)(O)O)Br
CCN/[S]=C/N=C/1\C=CC(=C2c3ccc(cc3Oc4c2ccc(c4)O)O)C(=C1)C(=O)O
c1ccc(cc1)C(=O)Nc2cc(n[nH]2)C3CC3
c1ccc(cc1)C[C@@H](C(=O)C[N+]#N)NC(=O)OCc2ccccc2
COc1ccc2ccc(cc2c1c3cnn(c3)S(=O)(=O)C)C(=N)N
C([C@@H](C(=O)O)N)[S]=O
c1cc(c(c(c1)Cl)Cl)c2cc(n3c(n2)c(cn3)SC#N)NCc4ccncc4
CC(C)c1c(c2cc(ccc2[nH]c1=O)F)OC#CC3CC3
CC[C@H](COc1ccccc1)O[P@](=O)(C)Cl
C(=[N-])=[N-].[Ca+2]
import numpy as np
from tqdm import tqdm
from openeye import oechem
import xmltodict
from openforcefield.typing.engines.smirnoff import get_molecule_parameterIDs
path_to_smirnoff = 'smirnoff99Frosst.offxml'
path_to_drugbank = 'DrugBank_atyped.oeb'
# load drugbank
ifs = oechem.oemolistream(path_to_drugbank)
mols = []
mol = oechem.OEGraphMol()
while oechem.OEReadMolecule(ifs, mol):
mols.append(mol)
mol = oechem.OEGraphMol()
# assign parameters
smirnoff_types_drugbank = []
for i in tqdm(range(len(mols))):
parameterIDs = get_molecule_parameterIDs([mols[i]], path_to_smirnoff)
smirnoff_types_drugbank.append(set(list(parameterIDs[0].values())[0]))
# pick a small subset that exercises as many parameters as the whole set
def greedy_set_cover(list_of_subsets):
"""Pick the molecule that would add the most types"""
S = set()
included_indices = []
remaining_indices = list(range(len(list_of_subsets)))
improvable = True
while (len(remaining_indices) > 0) and (improvable):
weights = [len(S.union(list_of_subsets[i])) for i in remaining_indices]
if max(weights) == len(S):
improvable = False
else:
i = remaining_indices[np.argmax(weights)]
S.update(list_of_subsets[i])
included_indices.append(i)
remaining_indices.remove(i)
return included_indices
drugbank_covering_set = greedy_set_cover(smirnoff_types_drugbank)
S_drugbank = set()
for i in drugbank_covering_set:
S_drugbank.update(smirnoff_types_drugbank[i])
# report
print('# of molecules in DrugBank: {}'.format(len(mols)))
print('# of molecules selected by greedy set cover: {}'.format(len(drugbank_covering_set)))
with open(path_to_smirnoff, 'r') as f:
smirnoff_file = f.read()
smirnoff = xmltodict.parse(smirnoff_file)
types_to_smirks_dict = {}
for d in smirnoff['SMIRNOFF']['HarmonicBondForce']['Bond']:
types_to_smirks_dict[d['@id']] = d['@smirks']
for d in smirnoff['SMIRNOFF']['HarmonicAngleForce']['Angle']:
types_to_smirks_dict[d['@id']] = d['@smirks']
for d in smirnoff['SMIRNOFF']['PeriodicTorsionForce']['Proper']:
types_to_smirks_dict[d['@id']] = d['@smirks']
for d in smirnoff['SMIRNOFF']['PeriodicTorsionForce']['Improper']:
types_to_smirks_dict[d['@id']] = d['@smirks']
for d in smirnoff['SMIRNOFF']['NonbondedForce']['Atom']:
types_to_smirks_dict[d['@id']] = d['@smirks']
unexercised_types = []
for key in types_to_smirks_dict.keys():
if not (key in S_drugbank):
unexercised_types.append(key)
num_types = len(types_to_smirks_dict)
num_not_exercised = len(unexercised_types)
num_exercised = num_types - num_not_exercised
print('exercises {} of {} parameter types in smirnoff99frosst ({:.3f}%)'.format(num_exercised, num_types, 100 * num_exercised / num_types))
print('\nunused bond types')
for key in unexercised_types:
if key[0] == 'b':
print('\t{}: {}'.format(key, types_to_smirks_dict[key]))
print('\nunused angle types')
for key in unexercised_types:
if key[0] == 'a':
print('\t{}: {}'.format(key, types_to_smirks_dict[key]))
print('\nunused improper torsion types')
for key in unexercised_types:
if key[0] == 'i':
print('\t{}: {}'.format(key, types_to_smirks_dict[key]))
print('\nunused proper torsion types')
for key in unexercised_types:
if key[0] == 't':
print('\t{}: {}'.format(key, types_to_smirks_dict[key]))
print('\nunused nonbonded types')
for key in unexercised_types:
if key[0] == 'n':
print('\t{}: {}'.format(key, types_to_smirks_dict[key]))
@btjanaka
Copy link

Hi, I'm an undergraduate researcher in the Mobley Lab working on finding a set of molcules in eMolecules that can cover all parameters in the SMIRNOFF force field. I notice you used the get_molecule_parameterIDs function in your script, but I cannot seem to find it anywhere, even when I try to import it. Do you know how I can find it?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment