Created
May 24, 2022 17:23
-
-
Save richardjgowers/c40186e1fe18ebafb22958e18827dd2f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "bd66ab90", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import itertools\n", | |
"import json\n", | |
"\n", | |
"from openmm import app\n", | |
"\n", | |
"from rdkit import Chem" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "6ebe7573", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def openmm_to_rdkit(pdbfile):\n", | |
" # convert openmm PDBFile to rdkit Molecule\n", | |
" # all bonds initially SINGLE, all charge initially neutral (to be later rectified)\n", | |
" rwmol = Chem.RWMol()\n", | |
" for i, atom in enumerate(pdbfile.topology.atoms()):\n", | |
" rwmol.AddAtom(Chem.Atom(atom.element.atomic_number))\n", | |
" rwmol.GetAtomWithIdx(i).SetProp('Name', atom.name)\n", | |
" # we're fully explicit\n", | |
" for atom in rwmol.GetAtoms():\n", | |
" atom.SetNoImplicit(True)\n", | |
" for bond in pdbfile.topology.bonds():\n", | |
" rwmol.AddBond(bond[0].index, bond[1].index, Chem.BondType.SINGLE)\n", | |
"\n", | |
" #conf = Chem.Conformer()\n", | |
" #for i, pos in enumerate(pdbfile.getPositions()):\n", | |
" # conf.SetAtomPosition(i, list(pos.value_in_unit(pos.unit)))\n", | |
" #rwmol.AddConformer(conf)\n", | |
" \n", | |
" return rwmol\n", | |
"\n", | |
"\n", | |
"def fuzzy_query(query):\n", | |
" \"\"\"return a copy of Query which is less specific:\n", | |
" - ignore aromaticity and hybridization of atoms (i.e. [#6] not C)\n", | |
" - ignore bond orders\n", | |
" - ignore charges\n", | |
" \"\"\"\n", | |
" generic = Chem.MolFromSmarts('**')\n", | |
" generic_bond = generic.GetBondWithIdx(0)\n", | |
" generic_mol = Chem.MolFromSmarts(''.join('[#{}]'.format(i+1) for i in range(112)))\n", | |
"\n", | |
" fuzzy = Chem.Mol(query)\n", | |
" for a in fuzzy.GetAtoms():\n", | |
" a.SetFormalCharge(0)\n", | |
" a.SetQuery(generic_mol.GetAtomWithIdx(a.GetAtomicNum() - 1)) # i.e. H looks up atom 0 in our generic mol\n", | |
" a.SetNoImplicit(True)\n", | |
" for b in fuzzy.GetBonds():\n", | |
" b.SetIsAromatic(False)\n", | |
" b.SetBondType(Chem.rdchem.BondType.SINGLE)\n", | |
" b.SetQuery(generic_bond)\n", | |
"\n", | |
" return fuzzy\n", | |
"\n", | |
"\n", | |
"def apply_pattern(rwmol, ref, match):\n", | |
" \"\"\"\n", | |
" rwmol -> thing to get modified\n", | |
" ref -> structure to copy from\n", | |
" match -> the indices mapping ref to match\n", | |
" \"\"\"\n", | |
" for atom_i, j in zip(ref.GetAtoms(), match):\n", | |
" atom_j = rwmol.GetAtomWithIdx(j)\n", | |
" # copy over chirality\n", | |
" if atom_i.GetChiralTag():\n", | |
" rwmol.GetAtomWithIdx(j).SetChiralTag(atom_i.GetChiralTag())\n", | |
" # copy over bond order & charge, only fancy atoms worth thinking about\n", | |
" if atom_i.GetAtomicNum() not in (7, 8, 16): # N, O, S\n", | |
" continue\n", | |
" atom_j.SetFormalCharge(atom_i.GetFormalCharge())\n", | |
"\n", | |
" # map double bonds onto our substructure\n", | |
" double_bonds = (b for b in ref.GetBonds()\n", | |
" if b.GetBondType() != Chem.rdchem.BondType.SINGLE)\n", | |
" for b in double_bonds:\n", | |
" x, y = match[b.GetBeginAtomIdx()], match[b.GetEndAtomIdx()]\n", | |
" b2 = rwmol.GetBondBetweenAtoms(x, y)\n", | |
" b2.SetBondType(b.GetBondType())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "bb44666d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dataloc = '/home/richard/code/openff-toolkit/openff/toolkit/data/proteins/aa_residues_substructures_explicit_bond_orders_with_caps.json'\n", | |
"\n", | |
"with open(dataloc, 'r') as f:\n", | |
" data = json.load(f)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "4aabe105", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pdbfile = app.PDBFile('./T4-protein.pdb')\n", | |
"rwmol = openmm_to_rdkit(pdbfile)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "3f8e5582", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"seen = set()\n", | |
"\n", | |
"for atom in rwmol.GetAtoms():\n", | |
" atom.SetProp('seen', 'False')\n", | |
" atom.SetProp('resname', 'None')\n", | |
"\n", | |
"for resname, patterns in data.items():\n", | |
" if resname in {'PEPTIDE_BOND', 'DISULFIDE'}:\n", | |
" continue\n", | |
" for pattern in patterns:\n", | |
" #pattern = clean_pattern(pattern)\n", | |
" q = fuzzy_query(Chem.MolFromSmarts(pattern))\n", | |
" for match in rwmol.GetSubstructMatches(q):\n", | |
" seen.update(match)\n", | |
" for m in match:\n", | |
" rwmol.GetAtomWithIdx(m).SetProp('seen', 'True')\n", | |
" rwmol.GetAtomWithIdx(m).SetProp('resname', resname)\n", | |
" #print(match)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "dd8f22a2", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2465 2634\n", | |
"[1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1434, 1435, 1436, 1437, 1438, 1439, 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092, 2093, 2094, 2095, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, 2323, 2324, 2325, 2326, 2327, 2328, 2329, 2330, 2331, 2332, 2333, 2334, 2335, 2336, 2337, 2338, 2343, 2344, 2578, 2579, 2580, 2581, 2582, 2583, 2584, 2585, 2586, 2587, 2588, 2589, 2590, 2591, 2592, 2593, 2598, 2599]\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(seen), rwmol.GetNumAtoms())\n", | |
"print(sorted(set(range(rwmol.GetNumAtoms())) - seen))" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment