Skip to content

Instantly share code, notes, and snippets.

@egonw
Created May 1, 2024 19:06
Show Gist options
  • Save egonw/7b471e489c120e4485a86e10846253ae to your computer and use it in GitHub Desktop.
Save egonw/7b471e489c120e4485a86e10846253ae to your computer and use it in GitHub Desktop.
Groovy script that uses SPARQL to retrieve polymers with CXSMILES from Wikidata (CCZero), generates 2D coordinates for them, and then writes an SD file.
// CC-BY 4.0 International. (c) 2024 Egon Willighagen
@Grab(group='io.github.egonw.bacting', module='managers-ui', version='0.5.2')
@Grab(group='io.github.egonw.bacting', module='managers-rdf', version='0.5.2')
@Grab(group='org.openscience.cdk', module='cdk-smiles', version='2.9')
@Grab(group='org.openscience.cdk', module='cdk-silent', version='2.9')
@Grab(group='org.openscience.cdk', module='cdk-ctab', version='2.9')
@Grab(group='org.openscience.cdk', module='cdk-sdg', version='2.9')
import org.openscience.cdk.smiles.SmilesParser;
import org.openscience.cdk.interfaces.*;
import org.openscience.cdk.silent.SilentChemObjectBuilder;
import org.openscience.cdk.io.*;
import org.openscience.cdk.layout.StructureDiagramGenerator;
import javax.vecmath.Vector2d
// CDK helpers shared by the rest of the script: the builder creates chemistry
// objects, and the SMILES parser is constructed on top of that same builder.
builder = SilentChemObjectBuilder.getInstance()
sp = new SmilesParser(builder)

// Bacting managers; both are constructed with the same workspace root.
workspaceRoot = ".."
bioclipse = new net.bioclipse.managers.BioclipseManager(workspaceRoot)
rdf = new net.bioclipse.managers.RDFManager(workspaceRoot)
// SPARQL query: every item that is an instance of (a subclass of) wd:Q81163 and
// carries a wdt:P10718 value, returned together with its English label.
mappingQuery = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT DISTINCT ?polymer ?polymerLabel ?cxsmiles WHERE {
?polymer wdt:P31/wdt:P279* wd:Q81163 ;
wdt:P10718 ?cxsmiles ;
rdfs:label ?polymerLabel . FILTER (LANG(?polymerLabel) = "en") .
}
"""

// Endpoint the query is sent to. An alternative endpoint that was previously
// tried here: https://qlever.cs.uni-freiburg.de/api/wikidata
sparqlEP = 'https://query.wikidata.org/sparql'

// Run the query remotely and hand the raw response straight to the RDF manager,
// which turns it into the row/column result set used below.
results = rdf.processSPARQLXML(
    bioclipse.sparqlRemote(sparqlEP, mappingQuery),
    mappingQuery
)
// Convert every SPARQL result row into a CDK molecule with generated coordinates
// and PubChem-style SD properties, collecting them in molList.
// Rows are addressed 1..rowCount.
// A row whose CXSMILES cannot be parsed or laid out is reported on stderr and
// skipped, so a single bad Wikidata statement no longer aborts the whole export.
molList = builder.newInstance(IAtomContainerSet.class)
int skipped = 0
for (int row = 1; row <= results.rowCount; row++) {
    def wdItemIRI = results.get(row, "polymer")
    println wdItemIRI
    // Reduce the IRI (prefixed or full form) to the bare item identifier.
    def wdItem = wdItemIRI.replace("wd:", "").replace("http://www.wikidata.org/entity/", "")
    def label = results.get(row, "polymerLabel")
    def cxSMILES = results.get(row, "cxsmiles")

    def mol
    try {
        def sdg = new StructureDiagramGenerator()
        sdg.setMolecule(sp.parseSmiles(cxSMILES))
        sdg.generateCoordinates(new Vector2d(0, 1))
        // Use the generator's molecule, which carries the generated coordinates.
        mol = sdg.getMolecule()
    } catch (org.openscience.cdk.exception.CDKException e) {
        // Fully qualified to avoid needing a new import in the script header.
        System.err.println "Skipping ${wdItem} (${label}): ${e.message}"
        skipped++
        continue
    }

    mol.setTitle(label)
    mol.setProperty("PUBCHEM_SUBSTANCE_SYNONYM", label)
    mol.setProperty("PUBCHEM_SUBSTANCE_COMMENT", cxSMILES)
    mol.setProperty("PUBCHEM_EXT_DATASOURCE_REGID", wdItem)
    mol.setProperty("PUBCHEM_EXT_SUBSTANCE_URL", "https://scholia.toolforge.org/" + wdItem)
    molList.addAtomContainer(mol)
}
if (skipped > 0) {
    System.err.println "Skipped ${skipped} of ${results.rowCount} rows"
}
// Write all collected molecules to wikidata_polymers.sdf.
// withWriter closes the underlying file writer even when writing fails, and the
// explicit UTF-8 charset keeps non-ASCII labels intact regardless of the
// platform default encoding.
new File("wikidata_polymers.sdf").withWriter("UTF-8") { out ->
    SDFWriter sdfWriter = new SDFWriter(out)
    try {
        sdfWriter.write(molList)
    } finally {
        sdfWriter.close()
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment