Skip to content

Instantly share code, notes, and snippets.

@grtjn
Created June 27, 2012 08:32
Show Gist options
  • Save grtjn/3002466 to your computer and use it in GitHub Desktop.
Save grtjn/3002466 to your computer and use it in GitHub Desktop.
Convert a single line of a turtle .nx or .nq file to xml
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:local="local" xmlns:xdmp="http://marklogic.com/xdmp">
<!-- A single double-quote character, kept in a variable so it can be used inside
     XPath expressions without clashing with attribute and string delimiters. -->
<xsl:variable name="quot">&quot;</xsl:variable>
<!-- Regular expression for a quoted literal at the start of a string.
     Group 1: the still-encoded literal body, made of one or more of: runs of characters
              other than backslash and double quote; the escapes \\ \" \n \r \t;
              \u followed by 4 hex digits; \U followed by 8 hex digits.
     Group 2: the last repetition of that alternation (not used by callers in this file).
     Group 3: everything after the closing quote (up to the end of the line).
     The hex digits are spelled out one class at a time instead of using a counted quantifier.
     Because the body needs at least one character, an empty literal does not match. -->
<xsl:variable name="encoded-string-pattern">^"(([^\\"]+|\\[\\"nrt]|\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]|\\U[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])+)"(.*)</xsl:variable>
<xsl:template match="/">
  <!--
    Converts one statement line (taken from the string value of the document node)
    into a <t> element with <s>, <p>, <o> children and an optional <c> child.

    Data source: http://dbpedia.org/Downloads37
    Example input:
    <http://dbpedia.org/resource/Alabama> <http://dbpedia.org/ontology/PopulatedPlace/areaTotal> "135765.0"^^<http://dbpedia.org/datatype/squareKilometre> <http://en.wikipedia.org/wiki/Alabama#absolute-line=33> .
  -->
  <xsl:variable name="parts" select="local:tokenize(.)"/>

  <!-- A fourth token starting with '@' is a language tag; keep what follows the '@'. -->
  <xsl:variable name="language"
                select="for $tag in $parts[4][starts-with(., '@')] return substring-after($tag, '@')"/>

  <!-- The context is whatever sits in slot 4 or 5 that is neither the terminating '.'
       nor a language tag. -->
  <xsl:variable name="context"
                select="subsequence($parts, 4, 2)[not(. = '.') and not(starts-with(., '@'))]"/>

  <t>
    <s>
      <xsl:value-of select="$parts[1]"/>
    </s>
    <p>
      <xsl:value-of select="$parts[2]"/>
    </p>
    <o>
      <xsl:if test="not(empty($language))">
        <xsl:attribute name="xml:lang" select="$language"/>
      </xsl:if>
      <xsl:value-of select="$parts[3]"/>
    </o>
    <xsl:if test="not(empty($context))">
      <c>
        <xsl:value-of select="$context"/>
      </c>
    </xsl:if>
  </t>
</xsl:template>
<!--
  Recursively splits one statement line into its tokens.

  $str    the not-yet-consumed part of the line.
  returns the tokens in document order:
          * <...> terms are returned without the angle brackets;
          * quoted literals are returned decoded (see local:decode-string);
          * a language tag following a literal is returned as its own token, including the '@';
          * a ^^<datatype> suffix following a literal is skipped;
          * anything else is cut at the next space and returned as is.
          Recursion stops when the remaining input is '' or '.'.
-->
<xsl:function name="local:tokenize" as="xs:string*">
  <xsl:param name="str" as="xs:string"/>
  <xsl:choose>
    <!-- End of statement: nothing left, or only the terminating dot. -->
    <xsl:when test="$str = ('', '.')"/>

    <!-- <...> term: emit the text between the brackets, continue after '> '. -->
    <xsl:when test="starts-with($str, '&lt;')">
      <xsl:value-of select="substring-after(substring-before($str, '>'), '&lt;')"/>
      <xsl:sequence select="local:tokenize(substring-after($str, '> '))"/>
    </xsl:when>

    <!-- Quoted literal. -->
    <xsl:when test="starts-with($str, $quot)">
      <!-- $encoded-string-pattern requires at least one character between the quotes,
           so an empty literal ("") never matches it and both replace() calls would hand
           back the whole input. Detect that case up front and consume just the two quotes. -->
      <xsl:variable name="is-empty-literal" as="xs:boolean"
                    select="starts-with($str, concat($quot, $quot))"/>
      <xsl:choose>
        <xsl:when test="$is-empty-literal">
          <!-- xsl:sequence guarantees that an (empty) string item is emitted, so the
               positions of the following tokens stay correct. -->
          <xsl:sequence select="''"/>
        </xsl:when>
        <xsl:otherwise>
          <xsl:variable name="encoded-string" select="replace($str, $encoded-string-pattern, '$1')"/>
          <xsl:value-of select="local:decode-string($encoded-string)"/>
        </xsl:otherwise>
      </xsl:choose>
      <!-- Whatever follows the closing quote. -->
      <xsl:variable name="remainder"
                    select="if ($is-empty-literal)
                            then substring($str, 3)
                            else replace($str, $encoded-string-pattern, '$3')"/>
      <xsl:choose>
        <!-- Datatype suffix: dropped, continue after its closing '> '. -->
        <xsl:when test="starts-with($remainder, '^^')">
          <xsl:sequence select="local:tokenize(substring-after($remainder, '> '))"/>
        </xsl:when>
        <!-- Language tag: emitted as its own token (with the '@'). -->
        <xsl:when test="starts-with($remainder, '@')">
          <xsl:value-of select="substring-before($remainder, ' ')"/>
          <xsl:sequence select="local:tokenize(substring-after($remainder, ' '))"/>
        </xsl:when>
        <!-- Plain literal: just move on to the next token. -->
        <xsl:when test="starts-with($remainder, ' ')">
          <xsl:sequence select="local:tokenize(substring-after($remainder, ' '))"/>
        </xsl:when>
        <!-- Malformed literal (pattern did not match or unexpected trailer). -->
        <xsl:otherwise>##Should not be reached!##</xsl:otherwise>
      </xsl:choose>
    </xsl:when>

    <!-- Any other token: cut at the next space. -->
    <xsl:otherwise>
      <xsl:value-of select="substring-before($str, ' ')"/>
      <xsl:sequence select="local:tokenize(substring-after($str, ' '))"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:function>
<!--
  Decodes the backslash escapes of a literal body.

  $str    the encoded text between the quotes of a literal (without the quotes).
  returns exactly one string in which
          * \\ becomes a backslash and \" becomes a double quote;
          * \t becomes a tab;
          * every run of \n and/or \r escapes becomes ONE line feed (runs are collapsed);
          * \uXXXX and \UXXXXXXXX become the character with that hexadecimal code point.
          Anything else, including a backslash that does not start one of the escapes
          above, is copied through unchanged.

  The input is decoded in a single left-to-right pass. Decoding with successive replace()
  calls is not safe: once \\ has been turned into a single backslash, that backslash can
  pair up with a following n, r, t, u or quote and be decoded a second time.
-->
<xsl:function name="local:decode-string" as="xs:string">
  <xsl:param name="str" as="xs:string"/>
  <!-- xsl:analyze-string yields one item per matching / non-matching segment. Collect them
       and join them below, otherwise any input containing an escape plus other text would
       return several items and violate the declared xs:string result type. -->
  <xsl:variable name="pieces" as="xs:string*">
    <!-- Groups: 1 = last \n or \r of a run, 2 = the character after the backslash for
         \\ \" \t, 3 = the four hex digits of \u, 4 = the eight hex digits of \U.
         The hex digits are spelled out class by class because curly braces would be
         read as attribute value template delimiters in the regex attribute. -->
    <xsl:analyze-string select="$str" regex="(\\n|\\r)+|\\([\\&quot;t])|\\u([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])|\\U([0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F])">
      <xsl:matching-substring>
        <xsl:choose>
          <xsl:when test="regex-group(2) = 't'">
            <xsl:sequence select="'&#9;'"/>
          </xsl:when>
          <!-- Escaped backslash or escaped quote: emit the character itself. -->
          <xsl:when test="regex-group(2) != ''">
            <xsl:sequence select="regex-group(2)"/>
          </xsl:when>
          <!-- Only one of groups 3 and 4 can be non-empty, so concat() picks whichever matched. -->
          <xsl:when test="concat(regex-group(3), regex-group(4)) != ''">
            <xsl:sequence select="codepoints-to-string(xdmp:hex-to-integer(concat(regex-group(3), regex-group(4))))"/>
          </xsl:when>
          <!-- Remaining alternative: a run of \n / \r escapes. -->
          <xsl:otherwise>
            <xsl:sequence select="'&#10;'"/>
          </xsl:otherwise>
        </xsl:choose>
      </xsl:matching-substring>
      <xsl:non-matching-substring>
        <xsl:sequence select="."/>
      </xsl:non-matching-substring>
    </xsl:analyze-string>
  </xsl:variable>
  <xsl:sequence select="string-join($pieces, '')"/>
</xsl:function>
</xsl:stylesheet>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment