Skip to content

Instantly share code, notes, and snippets.

@zeux
Last active September 20, 2016 08:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zeux/0c12a521b1c10637a2179126f1688782 to your computer and use it in GitHub Desktop.
Save zeux/0c12a521b1c10637a2179126f1688782 to your computer and use it in GitHub Desktop.
HTML5 character reference parsing draft

This is a draft for a space-efficient HTML5 character reference parser.

The parser contains a table that maps from a character reference name to a UTF-8 encoded codepoint list. The table is encoded as a trie.

Each node contains a list of children in the 'keys' array, and a list of offsets in the 'offsets' array.

'keys' encode either the next character of the node (for a child node), or 0 for end-of-list. 'offsets' encode the offset into the 'keys'/'offsets' array where the node data is located.

For leaf nodes, the list stored in 'keys' is empty (keys[offset] == 0), and the offsets table contains the offset into codepoint array.

convert.cpp contains a sample implementation; it may not be entirely correct (it's possible it can read out-of-bounds) - this is just a proof of concept.

Note that the implementation in convert.cpp may need to write more bytes than there are available for nGt and nLt keys.

To build this, run the following sequence of commands:

wget https://www.w3.org/TR/html5/entities.json
python3 entities.py >table.h
c++ -Os convert.cpp
#include "table.h"
#include <stdio.h>
int translate(const char* s, const char** outs)
{
int16_t offset = root;
while (*s)
{
if (keys[offset] == 0)
break;
int16_t next = offset;
for (;;)
{
if (keys[next] == 0)
return -1;
if (keys[next] == *s)
break;
next++;
}
offset = offsets[next];
s++;
}
if (keys[offset] == 0)
{
*outs = s;
return offsets[offset];
}
return -1;
}
void translate(char* s)
{
char* w = s;
while (*s)
{
if (*s == '&')
{
const char* ns;
int off = translate(s, &ns);
if (off >= 0)
{
s = (char*)ns;
for (; codepoints[off]; off++)
*w++ = codepoints[off];
}
else
{
*w++ = *s++;
}
}
else
{
*w++ = *s++;
}
}
*w = 0;
}
int main(int argc, char** argv)
{
for (int i = 1; i < argc; ++i)
{
translate(argv[i]);
printf("%s\n", argv[i]);
}
}
#!/usr/bin/python3
import json
f = open('entities.json', 'r')
j = json.loads(f.read())
print("#include <stddef.h>")
print("#include <stdint.h>")
print("static const uint8_t codepoints[] = {")
table = {}
offset = 0
for e in j:
table[e] = offset
for c in j[e][u'codepoints']:
for b in chr(c).encode('utf-8'):
print("%d," % (b), end="")
offset += 1
print("0, // %s offset %d" % (e, table[e]))
offset += 1
print("}; // size %d" % offset)
class Node:
def __init__(self, prefix = ""):
self.children = {}
self.prefix = prefix
self.offset = 0
self.result = -1
trie = Node()
for e in j:
node = trie
for i,v in enumerate(e):
node = node.children.setdefault(v, Node(e[:i+1]))
node.result = table[e]
offset = 0
def fill_keys(node):
global offset
for k in node.children:
fill_keys(node.children[k])
node.offset = offset
for k in node.children:
print("%d, " % (ord(k)), end="")
print("0, // %s" % node.prefix)
offset += len(node.children) + 1
print("static const uint8_t keys[] = {")
fill_keys(trie)
print("}; // size %d" % offset)
def fill_offsets(node):
for k in node.children:
fill_offsets(node.children[k])
for k in node.children:
print("%d, " % (node.children[k].offset), end="")
print("%d, // %s" % (node.result, node.prefix))
print("static const int16_t offsets[] = {")
fill_offsets(trie)
print("}; // size %d" % offset)
print("static const int16_t root = %d;" % trie.offset)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment