Created
February 22, 2023 12:39
-
-
Save marcosfelt/e71c80db790bb425daeba2f339936fbc to your computer and use it in GitHub Desktop.
Group identification (from thermo)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from rdkit import Chem | |
"""The code below is directly copied from the thermo group_contribution code (MIT license) | |
https://github.com/CalebBell/thermo/blob/516dee4ceda8e100918c7645e393a42fdfdc4bef/thermo/group_contribution/ | |
I made changes to the SMARTS strings to match the ones used in FeOs | |
""" | |
J_BIGGS_JOBACK_SMARTS = [ | |
["Methyl", "CH3", "[CX4H3]"], | |
["Secondary acyclic", "CH2", "[!R;CX4H2]"], | |
["Tertiary acyclic", ">CH", "[!R;CX4H]"], | |
["Quaternary acyclic", ">C<", "[!R;CX4H0]"], | |
["Primary alkene", "=CH2", "[CX3H2]"], | |
["Secondary alkene acyclic", "=CH", "[!R;CX3H1;!$([CX3H1](=O))]"], | |
["Tertiary alkene acyclic", "=C<", "[$([!R;CX3H0]);!$([!R;CX3H0]=[#8])]"], | |
["Alkyne", "C≡CH", "[$([CX2H1]#[CX2])]"], | |
["Aliphatic cyclohexane CH2", "CH2_hex", "[r6;CX4H2]"], | |
["Aliphatic cyclohexane", "CH_hex", "[r6;CX4H1]"], | |
["Aliphatic cyclopentane CH2", "CH2_pent", "[r5;CX4H2]"], | |
["Aliphatic cyclopentane CH", "CH_pent", "[r5;CX4H1]"], | |
["Alcohol", "OH", "[#6X3;!$([#6X3H0]=O)][OX2H]"], | |
["Aromatic carbon", "C_arom", "[cX3H0]"], | |
["Aromatic carbon-hydrogen", "CH_arom", "[cX3H1]"], | |
[ | |
"Carbonyl acyclic", | |
">C=O", | |
"[$([CX3H0]-[!OX2])]=O", | |
], | |
["Aliphatic Ether", "OCH3", "[CX4H3][OX2H0]"], | |
["Aliphatic Ether", "OCH2", "[CX4H2][OX2H0]"], | |
["Aldehyde", "CH=O", "[CX3H1](=O)"], | |
["Ester", "COO", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"], | |
["Terminal Ester", "HCOO", "[#6X3H1;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"], | |
["Primary amino", "NH2", "[NX3H2]"], | |
] | |
"""Metadata for the Joback groups. The first element is the group name; the | |
second is the group symbol; and the third is the SMARTS matching string. | |
""" | |
J_BIGGS_JOBACK_SMARTS_id_dict = { | |
i + 1: j[2] for i, j in enumerate(J_BIGGS_JOBACK_SMARTS) | |
} | |
J_BIGGS_JOBACK_SMARTS_str_dict = {i[1]: i[2] for i in J_BIGGS_JOBACK_SMARTS} | |
def smarts_fragment(catalog, mol=None, smi=None, deduplicate=True): | |
r"""Fragments a molecule into a set of unique groups and counts as | |
specified by the `catalog`. The molecule can either be an rdkit | |
molecule object, or a smiles string which will be parsed by rdkit. | |
Returns a dictionary of groups and their counts according to the | |
indexes of the catalog provided. | |
Parameters | |
---------- | |
catalog : dict | |
Dictionary indexed by keys pointing to smarts strings, [-] | |
mol : mol, optional | |
RDKit Mol object, [-] | |
smi : str, optional | |
Smiles string representing a chemical, [-] | |
Returns | |
------- | |
counts : dict | |
Dictionaty of integer counts of the found groups only, indexed by | |
the same keys used by the catalog [-] | |
success : bool | |
Whether or not molecule was fully and uniquely fragmented, [-] | |
status : str | |
A string holding an explanation of why the molecule failed to be | |
fragmented, if it fails; 'OK' if it suceeds. | |
Notes | |
----- | |
Raises an exception if rdkit is not installed, or `smi` or `rdkitmol` is | |
not defined. | |
Examples | |
-------- | |
Acetone: | |
>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CC(=O)C') # doctest:+SKIP | |
({1: 2, 24: 1}, True, 'OK') | |
Sodium sulfate, (Na2O4S): | |
>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='[O-]S(=O)(=O)[O-].[Na+].[Na+]') # doctest:+SKIP | |
({29: 4}, False, 'Did not match all atoms present') | |
Propionic anhydride (C6H10O3): | |
>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CCC(=O)OC(=O)CC') # doctest:+SKIP | |
({1: 2, 2: 2, 28: 2}, False, 'Matched some atoms repeatedly: [4]') | |
""" | |
if mol is None and smi is None: | |
raise Exception("Either an rdkit mol or a smiles string is required") | |
if smi is not None: | |
mol = Chem.MolFromSmiles(smi) # type: ignore | |
if mol is None: | |
status = "Failed to construct mol" | |
success = False | |
return {}, success, status | |
atom_count = len(mol.GetAtoms()) # type: ignore | |
status = "OK" | |
success = True | |
counts = {} | |
all_matches = {} | |
for key, smart in catalog.items(): | |
if isinstance(smart, str): | |
patt = Chem.MolFromSmarts(smart) # type: ignore | |
else: | |
patt = smart | |
hits = list(mol.GetSubstructMatches(patt)) # type: ignore | |
if hits: | |
all_matches[key] = hits | |
counts[key] = len(hits) | |
# Duplicate group cleanup | |
matched_atoms = [] | |
for i in all_matches.values(): | |
for j in i: | |
matched_atoms.extend(j) | |
if deduplicate: | |
dups = [i for i, c in Counter(matched_atoms).items() if c > 1] | |
iteration = 0 | |
while dups and iteration < 100: | |
dup = dups[0] | |
dup_smart_matches = [] | |
for group, group_match_list in all_matches.items(): | |
for i, group_match_i in enumerate(group_match_list): | |
if dup in group_match_i: | |
dup_smart_matches.append( | |
(group, i, group_match_i, len(group_match_i)) | |
) | |
sizes = [i[3] for i in dup_smart_matches] | |
max_size = max(sizes) | |
# print(sizes, 'sizes', 'dup', dup, 'working_data', dup_smart_matches) | |
if sizes.count(max_size) > 1: | |
iteration += 1 | |
# print('BAD') | |
# Two same size groups, continue, can't do anything | |
continue | |
else: | |
# Remove matches that are not the largest | |
for group, idx, positions, size in dup_smart_matches: | |
if size != max_size: | |
# Not handling the case of multiple duplicate matches right, indexes changing!!! | |
del all_matches[group][idx] | |
continue | |
matched_atoms = [] | |
for i in all_matches.values(): | |
for j in i: | |
matched_atoms.extend(j) | |
dups = [i for i, c in Counter(matched_atoms).items() if c > 1] | |
iteration += 1 | |
matched_atoms = set() | |
for i in all_matches.values(): | |
for j in i: | |
matched_atoms.update(j) | |
if len(matched_atoms) != atom_count: | |
status = "Did not match all atoms present" | |
success = False | |
# Check the atom aount again, this time looking for duplicate matches (only if have yet to fail) | |
if success: | |
matched_atoms = [] | |
for i in all_matches.values(): | |
for j in i: | |
matched_atoms.extend(j) | |
if len(matched_atoms) < atom_count: | |
status = "Matched %d of %d atoms only" % (len(matched_atoms), atom_count) | |
success = False | |
elif len(matched_atoms) > atom_count: | |
status = "Matched some atoms repeatedly: %s" % ( | |
[i for i, c in Counter(matched_atoms).items() if c > 1] | |
) | |
success = False | |
return counts, success, status |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
or
How to use:
You might want to change back to the groups here: https://github.com/CalebBell/thermo/blob/516dee4ceda8e100918c7645e393a42fdfdc4bef/thermo/group_contribution/joback.py