Created
May 19, 2020 08:45
-
-
Save pastokes/d75d6050b9a5775bab38052603ca89c5 to your computer and use it in GitHub Desktop.
Script to extract content from Unicode 13.0 Character Code Charts - Scripts (https://www.unicode.org/charts/#scripts)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?> | |
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs" | |
version="2.0"> | |
<!--
Script to extract content from Unicode 13.0 Character Code Charts - Scripts (https://www.unicode.org/charts/#scripts)
The ultimate goal is to get a more or less machine-readable table of code points for all scripts in ISO 15924.
This information must surely be online already, but I haven't yet managed to find it.
The XSLT is interesting for its use of tail recursion with the so-called 'Kaysian' method for structuring flat sequences of elements
by their attributes.
Peter A. Stokes, École Pratique des Hautes Études – Université PSL, May 2020.
-->
<xsl:output method="html" encoding="utf-8" indent="yes" version="5"/> | |
<!-- Entry point. Emits the HTML page skeleton with a three-column table
     (Script / Start / End); the table body is filled by applying templates
     to every td element of the input document, in document order. -->
<xsl:template match="/">
    <html>
        <head>
            <title>Unicode Points by Script</title>
        </head>
        <body>
            <table>
                <thead>
                    <tr>
                        <th>Script</th>
                        <th>Start</th>
                        <th>End</th>
                    </tr>
                </thead>
                <tbody>
                    <!-- The context node here is the document node, so the
                         descendant axis reaches every td in the input. -->
                    <xsl:apply-templates select="descendant::td"/>
                </tbody>
            </table>
        </body>
    </html>
</xsl:template>
<!-- The td elements are (surprisingly) the main structural divisions of the source table;
the 'rows' of the table are in fact a flat sequence of HTML p elements,
so we use the Kaysian method, which means keeping track of our position.
I do this using tail recursion and a counter parameter, but I'm sure there
are other ways using position() or similar.
-->
<!-- For each td, start the recursive walk over its p[@class = 'mb'] headings
     at the first one (n = 1), passing the td itself as the search root. -->
<xsl:template match="td">
    <xsl:call-template name="p_mb">
        <xsl:with-param name="root" select="current()"/>
        <xsl:with-param name="n" select="1"/>
    </xsl:call-template>
</xsl:template>
<!-- Recursive named template: emits the table rows for the n-th
     p[@class = 'mb'] heading under $root, followed by the rows of the
     'pb'/'sb' paragraphs that belong to it, then calls itself for n + 1.

     Parameters:
       n    : 1-based index of the heading to process (the caller starts at 1).
       root : node under which the p elements are searched.

     Note on the path $root//p[@class = 'mb'][$n]: '//' abbreviates
     /descendant-or-self::node()/, so the positional predicate applies to the
     p children of each parent separately. That matches the sibling-based
     count() test used below to attach sub-rows to their heading. -->
<xsl:template name="p_mb">
    <xsl:param name="n"/>
    <xsl:param name="root"/>
    <xsl:variable name="currnode" select="$root//p[@class = 'mb'][$n]"/>
    <!-- Main row: only links that carry a code-point range in @title. -->
    <xsl:apply-templates select="$currnode/a[@title]"/>
    <!-- Sub-rows: following 'pb'/'sb' siblings that have exactly n 'mb'
         paragraphs before them, i.e. those that sit between this heading
         and the next one. -->
    <xsl:apply-templates mode="subrow"
        select="$currnode/following-sibling::p[(@class = 'pb' or @class = 'sb') and count(preceding-sibling::p[@class = 'mb']) = $n]/a[@title]"/>
    <!-- Recurse only while another heading exists; this ends the walk. -->
    <xsl:if test="$root//p[@class = 'mb'][$n + 1]">
        <xsl:call-template name="p_mb">
            <xsl:with-param name="n" select="$n + 1"/>
            <xsl:with-param name="root" select="$root"/>
        </xsl:call-template>
    </xsl:if>
</xsl:template>
<!-- Main row for a script link: the first cell holds the link text, the
     remaining cells are produced by the template that handles @title. -->
<xsl:template match="a">
    <tr>
        <td>
            <xsl:value-of select="string(.)"/>
        </td>
        <xsl:apply-templates select="@title"/>
    </tr>
</xsl:template>
<!-- Sub-row for a link under a main heading: the link text is prefixed with
     two en-dashes and a space so that it reads as a child of the row above. -->
<xsl:template match="a" mode="subrow">
    <tr>
        <td>
            <xsl:text>–– </xsl:text>
            <xsl:value-of select="."/>
        </td>
        <!-- Produces the range cells; selects nothing when @title is absent. -->
        <xsl:apply-templates select="@title"/>
        <!-- Fallback placeholders. The callers visible in this stylesheet only
             select a[@title], so this should not normally be reached. -->
        <xsl:if test="not(@title)">
            <td>––</td>
            <td>––</td>
        </xsl:if>
    </tr>
</xsl:template>
<!-- Doesn't really need to be in a named template, but structure is a bit clearer this way I think, since then | |
each p has its own template --> | |
<!-- Emits one sub-row per ranged link directly under $root.
     Parameters:
       script : accepted but not used in this template.
       root   : element whose a children are turned into sub-rows.
     NOTE(review): no call to this template appears in the visible
     stylesheet; confirm whether it is still needed. -->
<xsl:template name="sub_p">
    <xsl:param name="script"/>
    <xsl:param name="root"/>
    <!-- Careful: a p may hold several a nodes, and some are cross-references
         rather than ranges (those have no title), so filter on @title. -->
    <xsl:variable name="ranged-links" select="$root/a[@title]"/>
    <xsl:apply-templates select="$ranged-links" mode="subrow"/>
</xsl:template>
<!-- Turns the @title of a link into the Start and End cells of a row.

     Careful:
       - Sometimes the title uses a hyphen and sometimes an en-dash.
       - Sometimes there is only a single code point (End is then shown as
         the two-en-dash placeholder).
     Cell values are whitespace-normalized so that a title such as
     "0000 - 007F" does not leave stray spaces in the output.

     TODO: The following cases are not properly covered:
       - Sometimes there are multiple ranges (ranges after the first are
         simply included in the second column).
       - Sometimes there is no @title but there is a start point in the URL
         (currently simply omitted).
       - Sometimes the link is a cross-reference, not a range (currently
         simply omitted). -->
<xsl:template match="a/@title">
    <xsl:variable name="range" select="normalize-space(.)"/>
    <!-- A hyphen takes precedence over an en-dash when both occur; the
         empty string means that no separator was found. -->
    <xsl:variable name="separator"
        select="if (contains($range, '-')) then '-' else if (contains($range, '–')) then '–' else ''"/>
    <xsl:choose>
        <xsl:when test="$separator != ''">
            <!-- Split at the first occurrence of the separator. -->
            <td>
                <xsl:value-of select="normalize-space(substring-before($range, $separator))"/>
            </td>
            <td>
                <xsl:value-of select="normalize-space(substring-after($range, $separator))"/>
            </td>
        </xsl:when>
        <xsl:otherwise>
            <!-- Single code point: no end of range to report. -->
            <td>
                <xsl:value-of select="$range"/>
            </td>
            <td>––</td>
        </xsl:otherwise>
    </xsl:choose>
</xsl:template>
</xsl:stylesheet> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment