Skip to content

Instantly share code, notes, and snippets.

@marcosfelt
Created February 22, 2023 12:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marcosfelt/e71c80db790bb425daeba2f339936fbc to your computer and use it in GitHub Desktop.
Save marcosfelt/e71c80db790bb425daeba2f339936fbc to your computer and use it in GitHub Desktop.
Group identification (from thermo)
from rdkit import Chem
"""The code below is directly copied from the thermo group_contribution code (MIT license)
https://github.com/CalebBell/thermo/blob/516dee4ceda8e100918c7645e393a42fdfdc4bef/thermo/group_contribution/
I made changes to the SMARTS strings to match the ones used in FeOs
"""
J_BIGGS_JOBACK_SMARTS = [
["Methyl", "CH3", "[CX4H3]"],
["Secondary acyclic", "CH2", "[!R;CX4H2]"],
["Tertiary acyclic", ">CH", "[!R;CX4H]"],
["Quaternary acyclic", ">C<", "[!R;CX4H0]"],
["Primary alkene", "=CH2", "[CX3H2]"],
["Secondary alkene acyclic", "=CH", "[!R;CX3H1;!$([CX3H1](=O))]"],
["Tertiary alkene acyclic", "=C<", "[$([!R;CX3H0]);!$([!R;CX3H0]=[#8])]"],
["Alkyne", "C≡CH", "[$([CX2H1]#[CX2])]"],
["Aliphatic cyclohexane CH2", "CH2_hex", "[r6;CX4H2]"],
["Aliphatic cyclohexane", "CH_hex", "[r6;CX4H1]"],
["Aliphatic cyclopentane CH2", "CH2_pent", "[r5;CX4H2]"],
["Aliphatic cyclopentane CH", "CH_pent", "[r5;CX4H1]"],
["Alcohol", "OH", "[#6X3;!$([#6X3H0]=O)][OX2H]"],
["Aromatic carbon", "C_arom", "[cX3H0]"],
["Aromatic carbon-hydrogen", "CH_arom", "[cX3H1]"],
[
"Carbonyl acyclic",
">C=O",
"[$([CX3H0]-[!OX2])]=O",
],
["Aliphatic Ether", "OCH3", "[CX4H3][OX2H0]"],
["Aliphatic Ether", "OCH2", "[CX4H2][OX2H0]"],
["Aldehyde", "CH=O", "[CX3H1](=O)"],
["Ester", "COO", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"],
["Terminal Ester", "HCOO", "[#6X3H1;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"],
["Primary amino", "NH2", "[NX3H2]"],
]
"""Metadata for the Joback groups. The first element is the group name; the
second is the group symbol; and the third is the SMARTS matching string.
"""
J_BIGGS_JOBACK_SMARTS_id_dict = {
i + 1: j[2] for i, j in enumerate(J_BIGGS_JOBACK_SMARTS)
}
J_BIGGS_JOBACK_SMARTS_str_dict = {i[1]: i[2] for i in J_BIGGS_JOBACK_SMARTS}
def smarts_fragment(catalog, mol=None, smi=None, deduplicate=True):
r"""Fragments a molecule into a set of unique groups and counts as
specified by the `catalog`. The molecule can either be an rdkit
molecule object, or a smiles string which will be parsed by rdkit.
Returns a dictionary of groups and their counts according to the
indexes of the catalog provided.
Parameters
----------
catalog : dict
Dictionary indexed by keys pointing to smarts strings, [-]
mol : mol, optional
RDKit Mol object, [-]
smi : str, optional
Smiles string representing a chemical, [-]
Returns
-------
counts : dict
Dictionaty of integer counts of the found groups only, indexed by
the same keys used by the catalog [-]
success : bool
Whether or not molecule was fully and uniquely fragmented, [-]
status : str
A string holding an explanation of why the molecule failed to be
fragmented, if it fails; 'OK' if it suceeds.
Notes
-----
Raises an exception if rdkit is not installed, or `smi` or `rdkitmol` is
not defined.
Examples
--------
Acetone:
>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CC(=O)C') # doctest:+SKIP
({1: 2, 24: 1}, True, 'OK')
Sodium sulfate, (Na2O4S):
>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='[O-]S(=O)(=O)[O-].[Na+].[Na+]') # doctest:+SKIP
({29: 4}, False, 'Did not match all atoms present')
Propionic anhydride (C6H10O3):
>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CCC(=O)OC(=O)CC') # doctest:+SKIP
({1: 2, 2: 2, 28: 2}, False, 'Matched some atoms repeatedly: [4]')
"""
if mol is None and smi is None:
raise Exception("Either an rdkit mol or a smiles string is required")
if smi is not None:
mol = Chem.MolFromSmiles(smi) # type: ignore
if mol is None:
status = "Failed to construct mol"
success = False
return {}, success, status
atom_count = len(mol.GetAtoms()) # type: ignore
status = "OK"
success = True
counts = {}
all_matches = {}
for key, smart in catalog.items():
if isinstance(smart, str):
patt = Chem.MolFromSmarts(smart) # type: ignore
else:
patt = smart
hits = list(mol.GetSubstructMatches(patt)) # type: ignore
if hits:
all_matches[key] = hits
counts[key] = len(hits)
# Duplicate group cleanup
matched_atoms = []
for i in all_matches.values():
for j in i:
matched_atoms.extend(j)
if deduplicate:
dups = [i for i, c in Counter(matched_atoms).items() if c > 1]
iteration = 0
while dups and iteration < 100:
dup = dups[0]
dup_smart_matches = []
for group, group_match_list in all_matches.items():
for i, group_match_i in enumerate(group_match_list):
if dup in group_match_i:
dup_smart_matches.append(
(group, i, group_match_i, len(group_match_i))
)
sizes = [i[3] for i in dup_smart_matches]
max_size = max(sizes)
# print(sizes, 'sizes', 'dup', dup, 'working_data', dup_smart_matches)
if sizes.count(max_size) > 1:
iteration += 1
# print('BAD')
# Two same size groups, continue, can't do anything
continue
else:
# Remove matches that are not the largest
for group, idx, positions, size in dup_smart_matches:
if size != max_size:
# Not handling the case of multiple duplicate matches right, indexes changing!!!
del all_matches[group][idx]
continue
matched_atoms = []
for i in all_matches.values():
for j in i:
matched_atoms.extend(j)
dups = [i for i, c in Counter(matched_atoms).items() if c > 1]
iteration += 1
matched_atoms = set()
for i in all_matches.values():
for j in i:
matched_atoms.update(j)
if len(matched_atoms) != atom_count:
status = "Did not match all atoms present"
success = False
# Check the atom aount again, this time looking for duplicate matches (only if have yet to fail)
if success:
matched_atoms = []
for i in all_matches.values():
for j in i:
matched_atoms.extend(j)
if len(matched_atoms) < atom_count:
status = "Matched %d of %d atoms only" % (len(matched_atoms), atom_count)
success = False
elif len(matched_atoms) > atom_count:
status = "Matched some atoms repeatedly: %s" % (
[i for i, c in Counter(matched_atoms).items() if c > 1]
)
success = False
return counts, success, status
@marcosfelt
Copy link
Author

marcosfelt commented Feb 22, 2023

pip install rdkit-pypi

or

conda install rdkit

How to use:

counts, success, status = smarts_fragment(
    catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi="CCO"
)

You might want to change back to the groups here: https://github.com/CalebBell/thermo/blob/516dee4ceda8e100918c7645e393a42fdfdc4bef/thermo/group_contribution/joback.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment