Skip to content

Instantly share code, notes, and snippets.

@john-bradshaw
Last active March 11, 2022 20:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save john-bradshaw/c2e8ae1bec2c50537d6cbb22e9c4f937 to your computer and use it in GitHub Desktop.
Save john-bradshaw/c2e8ae1bec2c50537d6cbb22e9c4f937 to your computer and use it in GitHub Desktop.
Code to split up reactants and reagents (currently untested, WIP)
"""
Module for identifying for RDKit functions working with reaction SMILES strings.
"""
import collections
import itertools
import functools
import typing
from rdkit import Chem
class InconsistentSMILES(RuntimeError):
pass
def get_atom_map_nums(mol_str) -> typing.Iterator[int]:
"""
:return: iterator of the atom mapping numbers of the atoms in the reaction string
"""
mol = Chem.MolFromSmiles(mol_str)
return (int(a.GetProp('molAtomMapNumber')) for a in mol.GetAtoms() if a.HasProp('molAtomMapNumber'))
def _not_unique_elements(itr_in) -> bool:
lst_ = list(itr_in)
return True if len(lst_) != len(set(lst_)) else False
def get_changed_bonds(rxn_smi, strict_mode=False) -> typing.List[typing.Tuple[int, int, float]]:
"""
Note unless `strict_mode` is `True`, this method does not check that the reagents have been seperated correctly
(i.e., reagent atoms are not in the products) or that atom map numbers have been repeated in the reactants or
products, which would break the rest of the implementation in a silent manner. If `strict_mode` is `True` and these
conditions are violated then an `InconsistentSMILES` is raised.
API should match the alternative:
https://github.com/connorcoley/rexgen_direct/blob/master/rexgen_direct/scripts/prep_data.py
:return: list of tuples of atom map numbers for each bond and new bond order
"""
# 1. Split into reactants and products.
reactants_smi, reagents_smi, products_smi = rxn_smi.split('>')
# 2. If necessary check for repeated atom map numbers in the SMILES and products.
if strict_mode:
if _not_unique_elements(get_atom_map_nums(reactants_smi)):
raise InconsistentSMILES("Repeated atom map numbers in reactants")
if _not_unique_elements(get_atom_map_nums(products_smi)):
raise InconsistentSMILES("Repeated atom maps numbers in products")
# 3. Get the bonds and their types in reactants and products
bonds_prev = {}
bonds_new = {}
for bond_dict, bonds in [(bonds_prev, Chem.MolFromSmiles(reactants_smi).GetBonds()),
(bonds_new, Chem.MolFromSmiles(products_smi).GetBonds())]:
for bond in bonds:
try:
bond_atmmap = frozenset((int(bond.GetBeginAtom().GetProp('molAtomMapNumber')),
int(bond.GetEndAtom().GetProp('molAtomMapNumber'))))
except KeyError:
continue
bond_dict[bond_atmmap] = float(bond.GetBondTypeAsDouble())
# 4. Go through the bonds before and after...
bond_changes: typing.List[typing.Tuple[int, int, float]] = []
product_atmmap_nums = set(get_atom_map_nums(products_smi))
if strict_mode and (len(set(get_atom_map_nums(reagents_smi)) & product_atmmap_nums) > 0):
raise InconsistentSMILES("Reagent atoms end up in products.")
for bnd in {*bonds_prev, *bonds_new}:
bnd_different_btwn_reacts_and_products = not (bonds_prev.get(bnd, None) == bonds_new.get(bnd, None))
bnd_missing_in_products = len(bnd & product_atmmap_nums) == 0
# ... and if a bond has (a) changed or (b) is half missing in the products then it must have changed!
if bnd_different_btwn_reacts_and_products and (not bnd_missing_in_products):
bond_changes.append((*sorted(list(bnd)), bonds_new.get(bnd, 0.0)))
# ^ note if no longer in products then new order is 0.
return bond_changes
def canonicalize(smiles, remove_atm_mapping=True, **otherargs) -> str:
"""
:param smiles: SMILES string to be canonicalized
:param remove_atm_mapping: whether to remove the atom map number (if applicable from molecule), note that this
affects canonicalization.
:param otherargs: arguments to pass to Chem.MolToSmiles
:return: canonicalized SMILES string
"""
mol = Chem.MolFromSmiles(smiles)
return canonicalize_from_molecule(mol, remove_atm_mapping, **otherargs)
def canonicalize_from_molecule(mol, remove_atm_mapping=True, **otherargs) -> str:
"""
:param mol: RDKit molecule to canonicalize.
:param remove_atm_mapping: whether to remove the atom map number (if applicable from molecule), note that this
affects canonicalization.
:param otherargs: arguments to pass to Chem.MolToSmiles
:return: canonicalized SMILES string
"""
mol_copy = Chem.RWMol(mol)
if remove_atm_mapping:
for atom in mol_copy.GetAtoms():
atom.ClearProp('molAtomMapNumber')
smiles = Chem.MolToSmiles(mol_copy, canonical=True, **otherargs)
return smiles
def split_reagents_out_from_reactants_and_products(rxn_smi, strict_mode=False) -> typing.Tuple[str, str, str]:
"""
Splits reaction into reactants, reagents, and products. Can deal with reagents in reactants part of SMILES string.
Note that this method expects relatively well done atom mapping.
Reagent defined as either:
1. in the middle part of reaction SMILES string, i.e. inbetween the `>` tokens.
2. in the reactants part of the SMILES string and all of these are true:
a. no atoms in the product(s).
b. not involved in the reaction center (atoms for which bonds change before and after) -- depending on the
center identification code this will be covered by a, but is also checked to allow for cases where
center can include information about changes in say a reactant that results in two undocumented minor
products.
c. reaction has been atom mapped (i.e., can we accurately check conditions a and b) -- currently judged by
a center being able to be identified.
3. in the reactants and products part of the SMILES string and both:
a. not involved in reaction center
b. unchanged before and after the reaction (comparing with canonicalized, atom-map removed strings)
:param rxn_smi: the reaction SMILES string
:param strict_mode: whether to run `get_changed_bonds` in strict mode when determining atom map numbers involved in
center.
:return: tuple of reactants, reagents, and products
"""
# 1. Split up reaction and get involved atom counts.
reactant_all_str, reagents_all_str, product_all_str = rxn_smi.split('>')
atoms_involved_in_reaction = set(
itertools.chain(*[(int(el[0]), int(el[1])) for el in get_changed_bonds(rxn_smi, strict_mode)]))
reactants_str = reactant_all_str.split('.')
products_str = product_all_str.split('.')
products_to_keep = collections.Counter(products_str)
product_atom_map_nums = functools.reduce(lambda x, y: x | y, (set(get_atom_map_nums(prod)) for prod in products_str))
reaction_been_atom_mapped = len(atoms_involved_in_reaction) > 0
# 2. Store map from canonical products to multiset of their SMILES in the products --> we will class
canon_products_to_orig_prods = collections.defaultdict(collections.Counter)
for smi in products_str:
canon_products_to_orig_prods[canonicalize(smi)].update([smi])
# 3. Go through the remaining reactants and check for conditions 2 or 3.
reactants = []
reagents = reagents_all_str.split('.')
for candidate_reactant in reactants_str:
atom_map_nums_in_candidate_reactant = set(get_atom_map_nums(candidate_reactant))
# compute some flags useful for checks 2 and 3
# 2a any atoms in products
not_in_product = len(list(product_atom_map_nums & atom_map_nums_in_candidate_reactant)) == 0
# 2b/3a any atoms in reaction center
not_in_center = len(list(set(atoms_involved_in_reaction & atom_map_nums_in_candidate_reactant))) == 0
# Check 2.
if (reaction_been_atom_mapped and not_in_product and not_in_center):
reagents.append(candidate_reactant)
continue
# Check 3.
canonical_reactant = canonicalize(candidate_reactant)
reactant_possibly_unchanged_in_products = canonical_reactant in canon_products_to_orig_prods
# ^ possibly as it could be different when we include atom maps -- we will check for this later.
if not_in_center and reactant_possibly_unchanged_in_products:
# We also need to match this reactant up with the appropriate product SMILES string and remove this from
# the product. To do this we shall go through the possible product SMILES strings.
possible_prod = None
for prod in canon_products_to_orig_prods[canonical_reactant]:
# if the atom mapped numbers intersect then this must be the product we are after and can break!
if len(set(get_atom_map_nums(prod)) & set(get_atom_map_nums(candidate_reactant))) > 0:
break
# if the product in the reaction SMILES has no atom map numbers it _could_ match but check other
# possibilities first to see if we get an atom map match.
if len(list(get_atom_map_nums(prod))) == 0:
possible_prod = prod
else:
prod = possible_prod # <-- if we are here then we did not get an exact atom map match
if prod is not None:
# ^ if it is still None then a false alarm and not the same molecule due to atom map numbers.
# (we're going to defer to atom map numbers and assume they're right!)
reagents.append(candidate_reactant)
products_to_keep.subtract([prod]) # remove it from the products too
# we also need to do some book keeping on our datastructure mapping canonical SMILES to product strings
# to indicate that we have removed one.
canon_products_to_orig_prods[canonical_reactant].subtract([prod])
canon_products_to_orig_prods[canonical_reactant] += collections.Counter()
# ^ remove zero and negative values
if len(canon_products_to_orig_prods[canonical_reactant]) == 0:
del canon_products_to_orig_prods[canonical_reactant]
continue
# if passed check 2 and 3 then it is a reactant!
reactants.append(candidate_reactant)
product_all_str = '.'.join(products_to_keep.elements())
return '.'.join(reactants), '.'.join(reagents), product_all_str
if __name__ == '__main__':
examples = [
(
"[Br:6][c:7]1[c:8]([SH:13])[cH:9][cH:10][cH:11][cH:12]1.[C:14](=[O:15])([O-:16])[O-:17].[CH2:20]([CH3:21])[O:22]"
"[CH:23]([CH2:24][Br:25])[O:26][CH2:27][CH3:28].[CH3:29][CH2:30][O:31][C:32](=[O:33])[CH3:34].[K+:18].[K+:19].[O:1]"
"=[CH:2][N:3]([CH3:4])[CH3:5].[OH2:35]>>[Br:6][c:7]1[c:8]([S:13][CH2:24][CH:23]([O:22][CH2:20][CH3:21])[O:26]"
"[CH2:27][CH3:28])[cH:9][cH:10][cH:11][cH:12]1"),
# Like above but demonstrating that unchanged molecules in products can also be classed as reagents:
(
"[Br:6][c:7]1[c:8]([SH:13])[cH:9][cH:10][cH:11][cH:12]1.[C:14](=[O:15])([O-:16])[O-:17].[CH2:20]([CH3:21])[O:22]"
"[CH:23]([CH2:24][Br:25])[O:26][CH2:27][CH3:28].[CH3:29][CH2:30][O:31][C:32](=[O:33])[CH3:34].[K+:18].[K+:19].[O:1]"
"=[CH:2][N:3]([CH3:4])[CH3:5].[OH2:35]>>[Br:6][c:7]1[c:8]([S:13][CH2:24][CH:23]([O:22][CH2:20][CH3:21])[O:26]"
"[CH2:27][CH3:28])[cH:9][cH:10][cH:11][cH:12]1.[K+:19].[K+:18].[CH3:29][CH2:30][O:31][C:32](=[O:33])[CH3:34]"),
(
"[C:34](=[O:35])([OH:36])[O-:37].[CH2:1]([CH3:2])[N:3]([c:4]1[c:5](-[c:12]2[n:13]([CH3:23])[c:14]3[cH:15][c:16]"
"([O:21][CH3:22])[cH:17][cH:18][c:19]3[cH:20]2)[cH:6][cH:7][c:8]([O:10][CH3:11])[cH:9]1)[CH2:24][CH3:25]"
".[Cl:26][N:27]1[C:28](=[O:29])[CH2:30][CH2:31][C:32]1=[O:33].[Na+:38]>[O:39]1[CH2:40][CH2:41][CH2:42][CH2:43]1"
">[CH2:1]([CH3:2])[N:3]([c:4]1[c:5](-[c:12]2[n:13]([CH3:23])[c:14]3[cH:15][c:16]([O:21][CH3:22])[cH:17][cH:18]"
"[c:19]3[c:20]2[Cl:26])[cH:6][cH:7][c:8]([O:10][CH3:11])[cH:9]1)[CH2:24][CH3:25]")
]
print("# Identifying centers")
for eg in examples:
print(eg)
print(get_changed_bonds(eg))
print()
print("\n\n# Splitting reagents out")
for eg in examples:
print(eg)
print('>'.join(split_reagents_out_from_reactants_and_products(eg)))
print()
print("Strict mode should fail with repeated atom maps...")
smiles = "[CH:1][N:2]([CH3:3])[CH3:1].[OH2:5]>>[CH:1][N:2]([CH3:3])[CH3:1][OH2:5]"
try:
get_changed_bonds(smiles, strict_mode=True)
except Exception as ex:
print("Yes, exception thrown:", ex)
else:
print("No?!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment