marcosfelt/groups.py

## groups.py
from rdkit import Chem

"""The code below is directly copied from the thermo group_contribution code (MIT license)
https://github.com/CalebBell/thermo/blob/516dee4ceda8e100918c7645e393a42fdfdc4bef/thermo/group_contribution/

I made changes to the SMARTS strings to match the ones used in FeOs
"""
J_BIGGS_JOBACK_SMARTS = [
    ["Methyl", "CH3", "[CX4H3]"],
    ["Secondary acyclic", "CH2", "[!R;CX4H2]"],
    ["Tertiary acyclic", ">CH", "[!R;CX4H]"],
    ["Quaternary acyclic", ">C<", "[!R;CX4H0]"],
    ["Primary alkene", "=CH2", "[CX3H2]"],
    ["Secondary alkene acyclic", "=CH", "[!R;CX3H1;!$([CX3H1](=O))]"],
    ["Tertiary alkene acyclic", "=C<", "[$([!R;CX3H0]);!$([!R;CX3H0]=[#8])]"],
    ["Alkyne", "C≡CH", "[$([CX2H1]#[CX2])]"],
    ["Aliphatic cyclohexane CH2", "CH2_hex", "[r6;CX4H2]"],
    ["Aliphatic cyclohexane", "CH_hex", "[r6;CX4H1]"],
    ["Aliphatic cyclopentane CH2", "CH2_pent", "[r5;CX4H2]"],
    ["Aliphatic cyclopentane CH", "CH_pent", "[r5;CX4H1]"],
    ["Alcohol", "OH", "[#6X3;!$([#6X3H0]=O)][OX2H]"],
    ["Aromatic carbon", "C_arom", "[cX3H0]"],
    ["Aromatic carbon-hydrogen", "CH_arom", "[cX3H1]"],
    [
        "Carbonyl acyclic",
        ">C=O",
        "[$([CX3H0]-[!OX2])]=O",
    ],
    ["Aliphatic Ether", "OCH3", "[CX4H3][OX2H0]"],
    ["Aliphatic Ether", "OCH2", "[CX4H2][OX2H0]"],
    ["Aldehyde", "CH=O", "[CX3H1](=O)"],
    ["Ester", "COO", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"],
    ["Terminal Ester", "HCOO", "[#6X3H1;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"],
    ["Primary amino", "NH2", "[NX3H2]"],
]
"""Metadata for the Joback groups. The first element is the group name; the
second is the group symbol; and the third is the SMARTS matching string.
"""

J_BIGGS_JOBACK_SMARTS_id_dict = {
    i + 1: j[2] for i, j in enumerate(J_BIGGS_JOBACK_SMARTS)
}
J_BIGGS_JOBACK_SMARTS_str_dict = {i[1]: i[2] for i in J_BIGGS_JOBACK_SMARTS}


def smarts_fragment(catalog, mol=None, smi=None, deduplicate=True):
    r"""Fragments a molecule into a set of unique groups and counts as
    specified by the `catalog`. The molecule can either be an rdkit
    molecule object, or a smiles string which will be parsed by rdkit.
    Returns a dictionary of groups and their counts according to the
    indexes of the catalog provided.

    Parameters
    ----------
    catalog : dict
        Dictionary indexed by keys pointing to smarts strings, [-]
    mol : mol, optional
        RDKit Mol object, [-]
    smi : str, optional
        Smiles string representing a chemical, [-]

    Returns
    -------
    counts : dict
        Dictionaty of integer counts of the found groups only, indexed by
        the same keys used by the catalog [-]
    success : bool
        Whether or not molecule was fully and uniquely fragmented, [-]
    status : str
        A string holding an explanation of why the molecule failed to be
        fragmented, if it fails; 'OK' if it suceeds.

    Notes
    -----
    Raises an exception if rdkit is not installed, or `smi` or `rdkitmol` is
    not defined.
    Examples
    --------
    Acetone:
    >>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CC(=O)C') # doctest:+SKIP
    ({1: 2, 24: 1}, True, 'OK')
    Sodium sulfate, (Na2O4S):
    >>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='[O-]S(=O)(=O)[O-].[Na+].[Na+]') # doctest:+SKIP
    ({29: 4}, False, 'Did not match all atoms present')
    Propionic anhydride (C6H10O3):
    >>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CCC(=O)OC(=O)CC') # doctest:+SKIP
    ({1: 2, 2: 2, 28: 2}, False, 'Matched some atoms repeatedly: [4]')
    """

    if mol is None and smi is None:
        raise Exception("Either an rdkit mol or a smiles string is required")
    if smi is not None:
        mol = Chem.MolFromSmiles(smi)  # type: ignore
        if mol is None:
            status = "Failed to construct mol"
            success = False
            return {}, success, status

    atom_count = len(mol.GetAtoms())  # type: ignore
    status = "OK"
    success = True

    counts = {}
    all_matches = {}
    for key, smart in catalog.items():
        if isinstance(smart, str):
            patt = Chem.MolFromSmarts(smart)  # type: ignore
        else:
            patt = smart
        hits = list(mol.GetSubstructMatches(patt))  # type: ignore
        if hits:
            all_matches[key] = hits
            counts[key] = len(hits)

    # Duplicate group cleanup
    matched_atoms = []
    for i in all_matches.values():
        for j in i:
            matched_atoms.extend(j)

    if deduplicate:
        dups = [i for i, c in Counter(matched_atoms).items() if c > 1]
        iteration = 0
        while dups and iteration < 100:
            dup = dups[0]

            dup_smart_matches = []
            for group, group_match_list in all_matches.items():
                for i, group_match_i in enumerate(group_match_list):
                    if dup in group_match_i:
                        dup_smart_matches.append(
                            (group, i, group_match_i, len(group_match_i))
                        )

            sizes = [i[3] for i in dup_smart_matches]
            max_size = max(sizes)
            #            print(sizes, 'sizes', 'dup', dup, 'working_data', dup_smart_matches)
            if sizes.count(max_size) > 1:
                iteration += 1
                #                print('BAD')
                # Two same size groups, continue, can't do anything
                continue
            else:
                # Remove matches that are not the largest
                for group, idx, positions, size in dup_smart_matches:
                    if size != max_size:
                        # Not handling the case of multiple duplicate matches right, indexes changing!!!
                        del all_matches[group][idx]
                        continue

            matched_atoms = []
            for i in all_matches.values():
                for j in i:
                    matched_atoms.extend(j)

            dups = [i for i, c in Counter(matched_atoms).items() if c > 1]
            iteration += 1

    matched_atoms = set()
    for i in all_matches.values():
        for j in i:
            matched_atoms.update(j)
    if len(matched_atoms) != atom_count:
        status = "Did not match all atoms present"
        success = False

    # Check the atom aount again, this time looking for duplicate matches (only if have yet to fail)
    if success:
        matched_atoms = []
        for i in all_matches.values():
            for j in i:
                matched_atoms.extend(j)
        if len(matched_atoms) < atom_count:
            status = "Matched %d of %d atoms only" % (len(matched_atoms), atom_count)
            success = False
        elif len(matched_atoms) > atom_count:
            status = "Matched some atoms repeatedly: %s" % (
                [i for i, c in Counter(matched_atoms).items() if c > 1]
            )
            success = False

    return counts, success, status
	from rdkit import Chem

	"""The code below is directly copied from the thermo group_contribution code (MIT license)
	https://github.com/CalebBell/thermo/blob/516dee4ceda8e100918c7645e393a42fdfdc4bef/thermo/group_contribution/

	I made changes to the SMARTS strings to match the ones used in FeOs
	"""
	J_BIGGS_JOBACK_SMARTS = [
	["Methyl", "CH3", "[CX4H3]"],
	["Secondary acyclic", "CH2", "[!R;CX4H2]"],
	["Tertiary acyclic", ">CH", "[!R;CX4H]"],
	["Quaternary acyclic", ">C<", "[!R;CX4H0]"],
	["Primary alkene", "=CH2", "[CX3H2]"],
	["Secondary alkene acyclic", "=CH", "[!R;CX3H1;!$([CX3H1](=O))]"],
	["Tertiary alkene acyclic", "=C<", "[$([!R;CX3H0]);!$([!R;CX3H0]=[#8])]"],
	["Alkyne", "C≡CH", "[$([CX2H1]#[CX2])]"],
	["Aliphatic cyclohexane CH2", "CH2_hex", "[r6;CX4H2]"],
	["Aliphatic cyclohexane", "CH_hex", "[r6;CX4H1]"],
	["Aliphatic cyclopentane CH2", "CH2_pent", "[r5;CX4H2]"],
	["Aliphatic cyclopentane CH", "CH_pent", "[r5;CX4H1]"],
	["Alcohol", "OH", "[#6X3;!$([#6X3H0]=O)][OX2H]"],
	["Aromatic carbon", "C_arom", "[cX3H0]"],
	["Aromatic carbon-hydrogen", "CH_arom", "[cX3H1]"],
	[
	"Carbonyl acyclic",
	">C=O",
	"[$([CX3H0]-[!OX2])]=O",
	],
	["Aliphatic Ether", "OCH3", "[CX4H3][OX2H0]"],
	["Aliphatic Ether", "OCH2", "[CX4H2][OX2H0]"],
	["Aldehyde", "CH=O", "[CX3H1](=O)"],
	["Ester", "COO", "[#6X3H0;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"],
	["Terminal Ester", "HCOO", "[#6X3H1;!$([#6X3H0](~O)(~O)(~O))](=[#8X1])[#8X2H0]"],
	["Primary amino", "NH2", "[NX3H2]"],
	]
	"""Metadata for the Joback groups. The first element is the group name; the
	second is the group symbol; and the third is the SMARTS matching string.
	"""

	J_BIGGS_JOBACK_SMARTS_id_dict = {
	i + 1: j[2] for i, j in enumerate(J_BIGGS_JOBACK_SMARTS)
	}
	J_BIGGS_JOBACK_SMARTS_str_dict = {i[1]: i[2] for i in J_BIGGS_JOBACK_SMARTS}


	def smarts_fragment(catalog, mol=None, smi=None, deduplicate=True):
	r"""Fragments a molecule into a set of unique groups and counts as
	specified by the `catalog`. The molecule can either be an rdkit
	molecule object, or a smiles string which will be parsed by rdkit.
	Returns a dictionary of groups and their counts according to the
	indexes of the catalog provided.

	Parameters
	----------
	catalog : dict
	Dictionary indexed by keys pointing to smarts strings, [-]
	mol : mol, optional
	RDKit Mol object, [-]
	smi : str, optional
	Smiles string representing a chemical, [-]

	Returns
	-------
	counts : dict
	Dictionaty of integer counts of the found groups only, indexed by
	the same keys used by the catalog [-]
	success : bool
	Whether or not molecule was fully and uniquely fragmented, [-]
	status : str
	A string holding an explanation of why the molecule failed to be
	fragmented, if it fails; 'OK' if it suceeds.

	Notes
	-----
	Raises an exception if rdkit is not installed, or `smi` or `rdkitmol` is
	not defined.
	Examples
	--------
	Acetone:
	>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CC(=O)C') # doctest:+SKIP
	({1: 2, 24: 1}, True, 'OK')
	Sodium sulfate, (Na2O4S):
	>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='[O-]S(=O)(=O)[O-].[Na+].[Na+]') # doctest:+SKIP
	({29: 4}, False, 'Did not match all atoms present')
	Propionic anhydride (C6H10O3):
	>>> smarts_fragment(catalog=J_BIGGS_JOBACK_SMARTS_id_dict, smi='CCC(=O)OC(=O)CC') # doctest:+SKIP
	({1: 2, 2: 2, 28: 2}, False, 'Matched some atoms repeatedly: [4]')
	"""

	if mol is None and smi is None:
	raise Exception("Either an rdkit mol or a smiles string is required")
	if smi is not None:
	mol = Chem.MolFromSmiles(smi) # type: ignore
	if mol is None:
	status = "Failed to construct mol"
	success = False
	return {}, success, status

	atom_count = len(mol.GetAtoms()) # type: ignore
	status = "OK"
	success = True

	counts = {}
	all_matches = {}
	for key, smart in catalog.items():
	if isinstance(smart, str):
	patt = Chem.MolFromSmarts(smart) # type: ignore
	else:
	patt = smart
	hits = list(mol.GetSubstructMatches(patt)) # type: ignore
	if hits:
	all_matches[key] = hits
	counts[key] = len(hits)

	# Duplicate group cleanup
	matched_atoms = []
	for i in all_matches.values():
	for j in i:
	matched_atoms.extend(j)

	if deduplicate:
	dups = [i for i, c in Counter(matched_atoms).items() if c > 1]
	iteration = 0
	while dups and iteration < 100:
	dup = dups[0]

	dup_smart_matches = []
	for group, group_match_list in all_matches.items():
	for i, group_match_i in enumerate(group_match_list):
	if dup in group_match_i:
	dup_smart_matches.append(
	(group, i, group_match_i, len(group_match_i))
	)

	sizes = [i[3] for i in dup_smart_matches]
	max_size = max(sizes)
	# print(sizes, 'sizes', 'dup', dup, 'working_data', dup_smart_matches)
	if sizes.count(max_size) > 1:
	iteration += 1
	# print('BAD')
	# Two same size groups, continue, can't do anything
	continue
	else:
	# Remove matches that are not the largest
	for group, idx, positions, size in dup_smart_matches:
	if size != max_size:
	# Not handling the case of multiple duplicate matches right, indexes changing!!!
	del all_matches[group][idx]
	continue

	matched_atoms = []
	for i in all_matches.values():
	for j in i:
	matched_atoms.extend(j)

	dups = [i for i, c in Counter(matched_atoms).items() if c > 1]
	iteration += 1

	matched_atoms = set()
	for i in all_matches.values():
	for j in i:
	matched_atoms.update(j)
	if len(matched_atoms) != atom_count:
	status = "Did not match all atoms present"
	success = False

	# Check the atom aount again, this time looking for duplicate matches (only if have yet to fail)
	if success:
	matched_atoms = []
	for i in all_matches.values():
	for j in i:
	matched_atoms.extend(j)
	if len(matched_atoms) < atom_count:
	status = "Matched %d of %d atoms only" % (len(matched_atoms), atom_count)
	success = False
	elif len(matched_atoms) > atom_count:
	status = "Matched some atoms repeatedly: %s" % (
	[i for i, c in Counter(matched_atoms).items() if c > 1]
	)
	success = False

	return counts, success, status