Skip to content

Instantly share code, notes, and snippets.

@DavidJRobertson
Created November 30, 2020 00:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DavidJRobertson/7948a8d907ff27cf37c2a46e271d5192 to your computer and use it in GitHub Desktop.
Save DavidJRobertson/7948a8d907ff27cf37c2a46e271d5192 to your computer and use it in GitHub Desktop.
PubChem LCSS data mangling
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs" version="2.0">
<xsl:output method="xml" encoding="utf-8" indent="yes"/>
<!-- Entry point: everything produced for the input document is wrapped in one Chemicals root element. -->
<xsl:template match="/">
  <xsl:element name="Chemicals">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!--
  Rebuilds one Chemical: its properties after de-duplication, followed by the
  References whose number is still cited by a surviving property.
-->
<xsl:template match="Chemical">
  <Chemical>
    <!-- Temporary tree holding the output of dedup-properties. -->
    <xsl:variable name="merged">
      <xsl:call-template name="dedup-properties">
        <xsl:with-param name="chemprops" select="Properties/*"/>
      </xsl:call-template>
    </xsl:variable>
    <Properties>
      <xsl:copy-of select="$merged"/>
    </Properties>
    <!-- Every reference token that appears on any merged property. -->
    <xsl:variable name="ref-tokens"
                  select="tokenize(string-join($merged/*/@refs, ' '), ' ')"/>
    <xsl:variable name="cited" select="sort(distinct-values($ref-tokens))"/>
    <!-- The References wrapper is omitted altogether when nothing is cited. -->
    <xsl:if test="exists($cited)">
      <References>
        <xsl:for-each select="References/*[@number = $cited]">
          <xsl:sort select="@number" data-type="number"/>
          <xsl:sequence select="."/>
        </xsl:for-each>
      </References>
    </xsl:if>
  </Chemical>
</xsl:template>
<!--
Merges duplicate property elements.
Param chemprops (required): the property elements to merge.
The elements are grouped by element name and then by whitespace-normalised text.
Each inner group is emitted once, as an element of the same name whose content is
the normalised text; all refs attributes of the group are handed to merged-refs.
-->
<xsl:template name="dedup-properties">
<xsl:param name="chemprops" required="yes"/>
<!-- Outer grouping: one group per element name, processed in name order. -->
<xsl:for-each-group select="$chemprops" group-by="name()">
<xsl:sort select="current-grouping-key()"/>
<!-- Capture the element name now: the inner grouping has its own current-grouping-key(). -->
<xsl:variable name="prop" select="current-grouping-key()"/>
<!-- Inner grouping: values that are equal after normalize-space() fall into one group. -->
<xsl:for-each-group select="current-group()" group-by="normalize-space(text())">
<xsl:sort select="text()"/>
<xsl:element name="{$prop}">
<!-- Attributes other than refs are copied from the context element only. -->
<xsl:copy-of select="@*[name() != 'refs']"/>
<!-- refs is rebuilt from every member of the group, not just the context element. -->
<xsl:call-template name="merged-refs">
<xsl:with-param name="allrefs" select="current-group()/@refs"/>
</xsl:call-template>
<!-- The grouping key is the normalised text, so that is what gets written out. -->
<xsl:value-of select="current-grouping-key()"/>
</xsl:element>
</xsl:for-each-group>
</xsl:for-each-group>
</xsl:template>
<!--
  Writes one "refs" attribute onto the element currently being constructed.
  Param allrefs (required): the refs attributes (or strings) to merge; may be empty.
  The attribute value is the distinct, sorted, space-separated list of tokens
  found across $allrefs. No attribute is written when no token remains.
-->
<xsl:template name="merged-refs">
  <xsl:param name="allrefs" required="yes"/>
  <!-- normalize-space() first, so empty values or runs of spaces cannot produce empty tokens. -->
  <xsl:variable name="split"
                select="tokenize(normalize-space(string-join($allrefs, ' ')), ' ')"/>
  <!-- NOTE: the tokens are strings, so sort() orders them as text ("10" before "2"). -->
  <xsl:variable name="sorted" select="sort(distinct-values($split))"/>
  <!-- Join $sorted rather than $split, otherwise duplicates and input order leak into the output. -->
  <xsl:variable name="joined" select="string-join($sorted, ' ')"/>
  <xsl:if test="$joined != ''">
    <xsl:attribute name="refs" select="$joined"/>
  </xsl:if>
</xsl:template>
</xsl:stylesheet>
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:pv="http://pubchem.ncbi.nlm.nih.gov/pug_view"
exclude-result-prefixes="xs pv" version="2.0" input-type-annotations="strip"
xpath-default-namespace="http://pubchem.ncbi.nlm.nih.gov/pug_view">
<!-- When true, GHSHazardStatement elements are emitted for 'GHS Hazard Statements' values. -->
<xsl:param name="includeGhsHazardStatements" as="xs:boolean" select="true()"/>
<!-- When true, GHSPrecautionaryStatement elements are emitted for 'Precautionary Statement Codes' values. -->
<xsl:param name="includeGhsPrecautionaryStatements" as="xs:boolean" select="true()"/>
<!-- When true, GHSPictogram and GHSHazardStatement elements also get a description attribute. -->
<xsl:param name="includeGhsDescriptions" as="xs:boolean" select="false()"/>
<!-- When true, the ref template writes refs attributes and the References element is emitted. -->
<xsl:param name="includeReferences" as="xs:boolean" select="true()"/>
<xsl:output method="xml" encoding="utf-8" indent="yes"/>
<!-- Fallback for every mode: an element without a dedicated template emits nothing itself and passes its children on in the same mode. -->
<xsl:template match="*" mode="#all">
  <xsl:apply-templates select="node()" mode="#current"/>
</xsl:template>
<!-- Text nodes reached by apply-templates produce no output, in any mode. -->
<xsl:template match="text()" mode="#all"/>
<!-- Entry point: wraps the output for the whole document in a Chemicals root element. -->
<xsl:template match="/">
  <xsl:element name="Chemicals">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!--
  Turns one Record into a Chemical. Properties starts with Name (from @title)
  and PubchemCID ("CID-" followed by @number), then holds whatever the other
  templates produce for the children other than References. References is
  processed afterwards, outside Properties.
-->
<xsl:template match="Record">
  <Chemical>
    <Properties>
      <Name><xsl:value-of select="@title"/></Name>
      <PubchemCID><xsl:value-of select="concat('CID-', @number)"/></PubchemCID>
      <xsl:apply-templates select="*[name() != 'References']"/>
    </Properties>
    <xsl:apply-templates select="References"/>
  </Chemical>
</xsl:template>
<!-- CHEMICAL PROPERTIES -->
<!-- Molecular Formula value: Formula element with the refs attribute and the String child as content. -->
<xsl:template match="Section[@heading = 'Molecular Formula']/InfoVal">
  <Formula>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="String"/>
  </Formula>
</xsl:template>
<!-- Molecular Weight value: MolarMass element holding Number, plus a space and Unit when a Unit child exists. -->
<xsl:template match="Section[@heading = 'Molecular Weight']/InfoVal">
  <MolarMass>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="Number"/>
    <xsl:if test="exists(Unit)">
      <xsl:text> </xsl:text>
      <xsl:value-of select="Unit"/>
    </xsl:if>
  </MolarMass>
</xsl:template>
<!-- GHS DATA -->
<!-- Switches to the "ghs" mode for everything inside the GHS Classification section. -->
<xsl:template match="Section[@heading = 'GHS Classification']">
  <xsl:apply-templates select="node()" mode="ghs"/>
</xsl:template>
<!--
  One GHSPictogram per pictogram inside a 'Pictogram(s)' value. The refs come
  from the parent InfoVal, the content is @code, and @type becomes the
  description attribute when descriptions are enabled.
-->
<xsl:template match="InfoVal[@name = 'Pictogram(s)']/GHSPictogram" mode="ghs">
  <GHSPictogram>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="parent::node()"/>
    </xsl:call-template>
    <xsl:if test="$includeGhsDescriptions">
      <xsl:attribute name="description">
        <xsl:value-of select="@type"/>
      </xsl:attribute>
    </xsl:if>
    <xsl:value-of select="@code"/>
  </GHSPictogram>
</xsl:template>
<!-- 'Signal' value: GHSSignalWord element with the refs attribute and the String child as content. -->
<xsl:template match="InfoVal[@name = 'Signal']" mode="ghs">
  <GHSSignalWord>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="String"/>
  </GHSSignalWord>
</xsl:template>
<!--
  Extracts hazard codes from a 'GHS Hazard Statements' value. Each regex match
  yields one GHSHazardStatement whose content is the H-code (group 1); the
  text after the colon (group 3) becomes the description attribute when
  descriptions are enabled. Text that does not match the regex is ignored.
-->
<xsl:template match="InfoVal[@name = 'GHS Hazard Statements']" mode="ghs">
  <xsl:if test="$includeGhsHazardStatements">
    <!-- Inside matching-substring the context item is the matched string, so keep the InfoVal for the ref call. -->
    <xsl:variable name="origin" select="."/>
    <xsl:analyze-string select="normalize-space(String)"
                        regex="(H\d+)( \([\d\.]+%\))?: (.*)">
      <xsl:matching-substring>
        <GHSHazardStatement>
          <xsl:call-template name="ref">
            <xsl:with-param name="subject" select="$origin"/>
          </xsl:call-template>
          <xsl:if test="$includeGhsDescriptions">
            <xsl:attribute name="description">
              <xsl:value-of select="regex-group(3)"/>
            </xsl:attribute>
          </xsl:if>
          <xsl:value-of select="regex-group(1)"/>
        </GHSHazardStatement>
      </xsl:matching-substring>
    </xsl:analyze-string>
  </xsl:if>
</xsl:template>
<!--
  Splits a 'Precautionary Statement Codes' value into individual codes. Every
  match of the regex (a P-code, optionally chained with +P-codes) becomes one
  GHSPrecautionaryStatement; anything between matches is ignored.
-->
<xsl:template match="InfoVal[@name = 'Precautionary Statement Codes']" mode="ghs">
  <xsl:if test="$includeGhsPrecautionaryStatements">
    <!-- Keep the InfoVal: inside matching-substring "." is the matched text. -->
    <xsl:variable name="origin" select="."/>
    <xsl:analyze-string select="String" regex="(P\d+)(\+P\d+)*">
      <xsl:matching-substring>
        <GHSPrecautionaryStatement>
          <xsl:call-template name="ref">
            <xsl:with-param name="subject" select="$origin"/>
          </xsl:call-template>
          <xsl:value-of select="."/>
        </GHSPrecautionaryStatement>
      </xsl:matching-substring>
    </xsl:analyze-string>
  </xsl:if>
</xsl:template>
<!-- IDENTIFIERS -->
<!-- Switches to the "ident" mode for everything inside the Identifiers section. -->
<xsl:template match="Section[@heading = 'Identifiers']">
  <xsl:apply-templates select="node()" mode="ident"/>
</xsl:template>
<!-- CAS value: CASNumber element with the refs attribute and the String child as content. -->
<xsl:template match="Section[@heading = 'CAS']/InfoVal" mode="ident">
  <CASNumber>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="String"/>
  </CASNumber>
</xsl:template>
<!-- InChI value: InChI element with the refs attribute and the String child as content. -->
<xsl:template match="Section[@heading = 'InChI']/InfoVal" mode="ident">
  <InChI>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="String"/>
  </InChI>
</xsl:template>
<!-- InChI Key value: InChIKey element with the refs attribute and the String child as content. -->
<xsl:template match="Section[@heading = 'InChI Key']/InfoVal" mode="ident">
  <InChIKey>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="String"/>
  </InChIKey>
</xsl:template>
<!-- PHYSICAL PROPERTIES -->
<!-- Switches to the "phys" mode for everything inside the Physical Properties section. -->
<xsl:template match="Section[@heading = 'Physical Properties']">
  <xsl:apply-templates select="node()" mode="phys"/>
</xsl:template>
<!-- Boiling Point value: BoilingPoint element with refs; content is the String child run through pv:fixDegrees. -->
<xsl:template match="Section[@heading = 'Boiling Point']/InfoVal" mode="phys">
  <BoilingPoint>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </BoilingPoint>
</xsl:template>
<!-- Melting Point value: MeltingPoint element with refs; content is the String child run through pv:fixDegrees. -->
<xsl:template match="Section[@heading = 'Melting Point']/InfoVal" mode="phys">
  <MeltingPoint>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </MeltingPoint>
</xsl:template>
<!-- Flash Point value: FlashPoint element with refs; content is the String child run through pv:fixDegrees. -->
<xsl:template match="Section[@heading = 'Flash Point']/InfoVal" mode="phys">
  <FlashPoint>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </FlashPoint>
</xsl:template>
<!-- Autoignition Temperature value: AutoignitionTemperature element with refs; content is the String child run through pv:fixDegrees. -->
<xsl:template match="Section[@heading = 'Autoignition Temperature']/InfoVal" mode="phys">
  <AutoignitionTemperature>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </AutoignitionTemperature>
</xsl:template>
<!-- Density value: Density element with refs; content is the String child run through pv:fixDegrees. -->
<xsl:template match="Section[@heading = 'Density']/InfoVal" mode="phys">
  <Density>
    <xsl:call-template name="ref">
      <xsl:with-param name="subject" select="."/>
    </xsl:call-template>
    <xsl:value-of select="pv:fixDegrees(String)"/>
  </Density>
</xsl:template>
<!--
  Tidies the spacing around degree signs: a space is inserted before every
  '°' and the result is whitespace-normalised, so "100°C" and "100  °C" both
  become "100 °C".
  Param input: the text to clean. An empty sequence is accepted and yields ''.
  Returns: the cleaned string.
-->
<xsl:function name="pv:fixDegrees" as="xs:string">
  <!--
    xs:string? rather than xs:string: callers pass the String child of an
    InfoVal, and an InfoVal without a String child (for example one holding
    only Number/Unit) would otherwise abort the transform with a type error.
  -->
  <xsl:param name="input" as="xs:string?"/>
  <!-- NOTE(review): as written the inner replace maps '°' to itself; presumably it once targeted a look-alike character. Confirm against the original file. -->
  <xsl:sequence select="normalize-space(replace(replace($input, '°', '°'), '°', ' °'))"/>
</xsl:function>
<!-- REFERENCES -->
<!--
  Writes a refs attribute onto the element under construction when references
  are enabled.
  Param subject: node(s) to read from; defaults to the caller's context item.
  The value lists the refno attributes of the subject's Ref children followed
  by the subject's own refno, space-separated and whitespace-normalised.
-->
<xsl:template name="ref">
  <xsl:param name="subject" select="."/>
  <xsl:if test="$includeReferences">
    <xsl:attribute name="refs">
      <xsl:value-of select="normalize-space(string-join(($subject/Ref/@refno, $subject/@refno), ' '))"/>
    </xsl:attribute>
  </xsl:if>
</xsl:template>
<!-- Emits the References wrapper, filled by the "refs" mode templates, only when references are enabled. -->
<xsl:template match="References">
  <xsl:if test="$includeReferences">
    <References>
      <xsl:apply-templates select="node()" mode="refs"/>
    </References>
  </xsl:if>
</xsl:template>
<!--
  Flattens one Reference. ReferenceNumber always becomes the number attribute;
  ANID, SourceID, URL, IsToxnet and Name each become an attribute only when
  the child exists. SourceName is the element content.
-->
<xsl:template match="Reference" mode="refs">
  <Reference number="{ReferenceNumber}">
    <xsl:if test="exists(ANID)">
      <xsl:attribute name="anid"><xsl:value-of select="ANID"/></xsl:attribute>
    </xsl:if>
    <xsl:if test="exists(SourceID)">
      <xsl:attribute name="sourceid"><xsl:value-of select="SourceID"/></xsl:attribute>
    </xsl:if>
    <xsl:if test="exists(URL)">
      <xsl:attribute name="url"><xsl:value-of select="URL"/></xsl:attribute>
    </xsl:if>
    <xsl:if test="exists(IsToxnet)">
      <xsl:attribute name="istoxnet"><xsl:value-of select="IsToxnet"/></xsl:attribute>
    </xsl:if>
    <xsl:if test="exists(Name)">
      <xsl:attribute name="name"><xsl:value-of select="Name"/></xsl:attribute>
    </xsl:if>
    <xsl:value-of select="SourceName"/>
  </Reference>
</xsl:template>
</xsl:stylesheet>
<?xml version="1.0" encoding="UTF-8"?>
<!--
Pipeline with one "source" input and one "result" output that declares three
XSLT steps, in this order: simplify, chemicalize, chemical-dedup.
NOTE(review): no explicit connections are declared between the steps; presumably
each step reads the previous step's result through XProc's default bindings. Confirm.
-->
<p:declare-step xmlns:p="http://www.w3.org/ns/xproc" xmlns:c="http://www.w3.org/ns/xproc-step"
version="1.0">
<p:input port="source"/>
<p:output port="result"/>
<!-- Step "simplify": applies lcss-simplify.xsl with an empty parameters input. -->
<p:xslt name="simplify">
<p:input port="stylesheet">
<p:document href="lcss-simplify.xsl"/>
</p:input>
<p:input port="parameters">
<p:empty/>
</p:input>
</p:xslt>
<!-- Step "chemicalize": applies lcss-chemicalize.xsl with an empty parameters input. -->
<p:xslt name="chemicalize">
<p:input port="stylesheet">
<p:document href="lcss-chemicalize.xsl"/>
</p:input>
<p:input port="parameters">
<p:empty/>
</p:input>
</p:xslt>
<!-- Step "chemical-dedup": applies chemical-property-dedup.xsl with an empty parameters input. -->
<p:xslt name="chemical-dedup">
<p:input port="stylesheet">
<p:document href="chemical-property-dedup.xsl"/>
</p:input>
<p:input port="parameters">
<p:empty/>
</p:input>
</p:xslt>
</p:declare-step>
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns="http://pubchem.ncbi.nlm.nih.gov/pug_view"
xmlns:pv="http://pubchem.ncbi.nlm.nih.gov/pug_view" exclude-result-prefixes="xs pv"
version="2.0" input-type-annotations="strip"
xpath-default-namespace="http://pubchem.ncbi.nlm.nih.gov/pug_view">
<xsl:output method="xml" encoding="utf-8" indent="yes"/>
<!-- Fallback for every mode: an element without a more specific template produces no output and its content is not processed. -->
<xsl:template match="*" mode="#all"/>
<!-- Entry point: processes only the element children of the document node. -->
<xsl:template match="/">
  <xsl:apply-templates select="child::*"/>
</xsl:template>
<!--
  Rewrites a Record: RecordType, RecordNumber and RecordTitle become the type,
  number and title attributes, the Section children are processed, and the
  Reference children are copied unchanged into a References wrapper.
-->
<xsl:template match="Record">
  <Record type="{RecordType}" number="{RecordNumber}" title="{RecordTitle}">
    <xsl:apply-templates select="Section"/>
    <References>
      <xsl:sequence select="Reference"/>
    </References>
  </Record>
</xsl:template>
<!--
  Rewrites a Section: TOCHeading becomes the heading attribute, URL (when
  present) becomes the url attribute, and only nested Section and Information
  children are processed further.
-->
<xsl:template match="Section">
  <Section heading="{TOCHeading}">
    <xsl:if test="exists(URL)">
      <xsl:attribute name="url">
        <xsl:value-of select="URL"/>
      </xsl:attribute>
    </xsl:if>
    <xsl:apply-templates select="Section | Information"/>
  </Section>
</xsl:template>
<!-- Information and Value are transparent wrappers: they emit nothing themselves and only their element children are processed. -->
<xsl:template match="Information | Value">
  <xsl:apply-templates select="child::*"/>
</xsl:template>
<!--
  Each StringWithMarkup becomes one InfoVal. The reference attributes are
  taken from the enclosing Information element (two levels up). A String
  child is written only when its normalised text is non-empty, and Markup
  children are then processed.
-->
<xsl:template match="Information/Value/StringWithMarkup">
  <InfoVal>
    <xsl:call-template name="referenced">
      <xsl:with-param name="subject" select="parent::node()/parent::node()"/>
    </xsl:call-template>
    <xsl:variable name="text" select="normalize-space(String)"/>
    <xsl:if test="not($text = '')">
      <String><xsl:value-of select="$text"/></String>
    </xsl:if>
    <xsl:apply-templates select="Markup"/>
  </InfoVal>
</xsl:template>
<!-- Default for Markup: the element is copied through unchanged. -->
<xsl:template match="Markup">
  <xsl:sequence select="."/>
</xsl:template>
<!-- Markup whose Type is 'PubChem Internal Link' is dropped from the output. -->
<xsl:template match="Markup[Type[text() = 'PubChem Internal Link']]"/>
<!-- Markup whose Type is 'Color' is dropped from the output. -->
<xsl:template match="Markup[Type[text() = 'Color']]"/>
<!--
  Icon markup that points at a PubChem GHS image becomes a GHSPictogram:
  type is the Extra text, and code is the part of the URL between the
  image directory prefix and '.svg'.
-->
<xsl:template
match="Markup[Type[text() = 'Icon'] and Extra and URL[starts-with(text(), 'https://pubchem.ncbi.nlm.nih.gov/images/ghs/GHS')]]">
  <GHSPictogram type="{Extra}"
                code="{substring-before(substring-after(URL, 'https://pubchem.ncbi.nlm.nih.gov/images/ghs/'), '.svg')}"/>
</xsl:template>
<!-- A Value with no StringWithMarkup child becomes one InfoVal: reference attributes come from the parent Information, and the element children are copied as they are. -->
<xsl:template match="Information/Value[not(StringWithMarkup)]">
  <InfoVal>
    <xsl:call-template name="referenced">
      <xsl:with-param name="subject" select="parent::node()"/>
    </xsl:call-template>
    <xsl:copy-of select="child::*"/>
  </InfoVal>
</xsl:template>
<!--
  Writes up to three attributes onto the element under construction.
  Param subject (required): the node to read from.
  ReferenceNumber becomes refno, Reference (whitespace-normalised) becomes ref,
  and Name becomes name. Each is written only when that child exists.
-->
<xsl:template name="referenced">
  <xsl:param name="subject" required="yes"/>
  <xsl:if test="exists($subject/ReferenceNumber)">
    <xsl:attribute name="refno">
      <xsl:value-of select="$subject/ReferenceNumber"/>
    </xsl:attribute>
  </xsl:if>
  <xsl:if test="exists($subject/Reference)">
    <xsl:attribute name="ref">
      <xsl:value-of select="normalize-space($subject/Reference)"/>
    </xsl:attribute>
  </xsl:if>
  <xsl:if test="exists($subject/Name)">
    <xsl:attribute name="name">
      <xsl:value-of select="$subject/Name"/>
    </xsl:attribute>
  </xsl:if>
</xsl:template>
</xsl:stylesheet>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment