Skip to content

Instantly share code, notes, and snippets.

@PatWalters
Last active January 14, 2023 01:12
Show Gist options
  • Save PatWalters/f257c6b73cf008c0e94932ee96178717 to your computer and use it in GitHub Desktop.
Save PatWalters/f257c6b73cf008c0e94932ee96178717 to your computer and use it in GitHub Desktop.
Compare two SMILES or SD files to identify duplicate structures. Duplicates are identified by comparing InChI keys.
#!/usr/bin/env python
import sys
import os
from rdkit import Chem
def molecule_supplier_from_name(input_file_name):
    """Return an RDKit molecule supplier chosen from the file's extension.

    Args:
        input_file_name: Path to a SMILES (``.smi``) or SD (``.sdf``) file.
            The extension is matched case-insensitively.

    Returns:
        ``Chem.SmilesMolSupplier(input_file_name)`` for ``.smi`` files or
        ``Chem.SDMolSupplier(input_file_name)`` for ``.sdf`` files.

    Raises:
        SystemExit: With status 1 when the extension is not recognized.
    """
    ext = os.path.splitext(input_file_name)[-1]
    normalized_ext = ext.lower()
    if normalized_ext == ".smi":
        return Chem.SmilesMolSupplier(input_file_name)
    if normalized_ext == ".sdf":
        return Chem.SDMolSupplier(input_file_name)
    # An unusable input is a failure: report it on stderr and exit non-zero
    # so that shell pipelines and callers can detect the problem.
    print("%s is not a valid molecule extension" % ext, file=sys.stderr)
    sys.exit(1)
def build_inchi_dict(infile_name):
    """Map the InChI key of each readable molecule in a file to its name.

    Args:
        infile_name: Path to the reference ``.smi`` or ``.sdf`` file.

    Returns:
        A dict of ``{inchi_key: molecule_name}``. When several molecules
        share an InChI key, the name of the last one read is kept.

    Raises:
        SystemExit: With status 1 if no supplier could be created.
    """
    suppl = molecule_supplier_from_name(infile_name)
    if suppl is None:
        # Defensive guard in case the supplier factory ever returns None.
        print("Error reading %s, extension not recognized" % infile_name,
              file=sys.stderr)
        sys.exit(1)
    inchi_dict = {}
    for mol in suppl:
        # Falsy entries (presumably records RDKit could not parse) are skipped.
        if not mol:
            continue
        inchi_key = Chem.MolToInchiKey(mol)
        # NOTE(review): an empty key presumably means InChI generation
        # failed - confirm against RDKit docs. Storing it would let every
        # failed molecule collide on the same entry and produce false
        # duplicate matches.
        if not inchi_key:
            continue
        inchi_dict[inchi_key] = mol.GetProp("_Name")
    return inchi_dict
def check_infile(infile_name, inchi_dict):
    """Print, for each readable query molecule, its matching reference name.

    One line is written to stdout per molecule: the isomeric SMILES, the
    molecule's own name, and the reference name found in ``inchi_dict`` for
    the same InChI key, or ``NO_MATCH`` when there is none.

    Args:
        infile_name: Path to the query ``.smi`` or ``.sdf`` file.
        inchi_dict: Mapping of InChI key to reference molecule name.
    """
    suppl = molecule_supplier_from_name(infile_name)
    for mol in suppl:
        # Falsy entries (presumably records RDKit could not parse) are skipped.
        if not mol:
            continue
        name = mol.GetProp("_Name")
        smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
        inchi = Chem.MolToInchiKey(mol)
        # An empty key cannot identify a structure, so never look it up;
        # otherwise it could match a reference stored under the empty key.
        found = inchi_dict.get(inchi) if inchi else None
        # A reference whose name is empty is also reported as NO_MATCH.
        match = found or "NO_MATCH"
        print(smiles, name, match)
def main(argv):
    """Run the duplicate check from a command-line argument list.

    Args:
        argv: Argument list in ``sys.argv`` form: program name, reference
            file, query file. Must contain at least the program name.

    Raises:
        SystemExit: With status 1 when the argument count is wrong.
    """
    usage = """%s reference.[sdf|smi] query.[sdf|smi]
Program returns the matching molecule name from reference or NO_MATCH
for each molecule in query
""" % argv[0]
    if len(argv) != 3:
        print(usage, file=sys.stderr)
        # Incorrect invocation is an error, so exit with a non-zero status.
        sys.exit(1)
    ref_dict = build_inchi_dict(argv[1])
    check_infile(argv[2], ref_dict)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment