zeux/README.md

## README.md

      
    Raw
  

              README.md
            
          
    This is a draft for a space-efficient HTML5 character reference parser.
The parser contains a table that maps from a character reference name to a UTF-8 encoded codepoint list.
The table is encoded as a trie.
Each node contains a list of children in the 'keys' array, and a list of offsets in the 'offsets' array.
'keys' encode either the next character of the node (for a child node), or 0 for end-of-list.
'offsets' encode the offset into the 'keys'/'offsets' array where the node data is located.
For leaf nodes, the list stored in 'keys' is empty (keys[offset] == 0), and the offsets table contains the offset into
codepoint array.
convert.cpp contains a sample implementation; it may not be entirely correct (it's possible it can read out-of-bounds) -
this is just a proof of concept.
Note that the implementation in convert.cpp may need to write more bytes than there are available for nGt and nLt keys.
To build this, run the following sequence of commands:
wget https://www.w3.org/TR/html5/entities.json
python3 entities.py >table.h
c++ -Os convert.cpp


## convert.cpp
#include "table.h"
#include <stdio.h>

int translate(const char* s, const char** outs)
{
	int16_t offset = root;

	while (*s)
	{
		if (keys[offset] == 0)
			break;

		int16_t next = offset;

		for (;;)
		{
			if (keys[next] == 0)
				return -1;

			if (keys[next] == *s)
				break;

			next++;
		}

		offset = offsets[next];
		s++;
	}

	if (keys[offset] == 0)
	{
		*outs = s;
		return offsets[offset];
	}

	return -1;
}

void translate(char* s)
{
	char* w = s;

	while (*s)
	{
		if (*s == '&')
		{
			const char* ns;
			int off = translate(s, &ns);

			if (off >= 0)
			{
				s = (char*)ns;

				for (; codepoints[off]; off++)
					*w++ = codepoints[off];
			}
			else
			{
				*w++ = *s++;
			}
		}
		else
		{
			*w++ = *s++;
		}
	}

	*w = 0;
}

int main(int argc, char** argv)
{
	for (int i = 1; i < argc; ++i)
	{
		translate(argv[i]);
		printf("%s\n", argv[i]);
	}
}

## entities.py
#!/usr/bin/python3

import json

f = open('entities.json', 'r')
j = json.loads(f.read())

print("#include <stddef.h>")
print("#include <stdint.h>")
print("static const uint8_t codepoints[] = {")

table = {}
offset = 0

for e in j:
	table[e] = offset

	for c in j[e][u'codepoints']:
		for b in chr(c).encode('utf-8'):
			print("%d," % (b), end="")
			offset += 1

	print("0, // %s offset %d" % (e, table[e]))
	offset += 1

print("}; // size %d" % offset)

class Node:
	def __init__(self, prefix = ""):
		self.children = {}
		self.prefix = prefix
		self.offset = 0
		self.result = -1

trie = Node()

for e in j:
	node = trie
	for i,v in enumerate(e):
		node = node.children.setdefault(v, Node(e[:i+1]))

	node.result = table[e]

offset = 0

def fill_keys(node):
	global offset
	for k in node.children:
		fill_keys(node.children[k])

	node.offset = offset

	for k in node.children:
		print("%d, " % (ord(k)), end="")

	print("0, // %s" % node.prefix)

	offset += len(node.children) + 1

print("static const uint8_t keys[] = {")

fill_keys(trie)

print("}; // size %d" % offset)

def fill_offsets(node):
	for k in node.children:
		fill_offsets(node.children[k])

	for k in node.children:
		print("%d, " % (node.children[k].offset), end="")

	print("%d, // %s" % (node.result, node.prefix))

print("static const int16_t offsets[] = {")

fill_offsets(trie)

print("}; // size %d" % offset)

print("static const int16_t root = %d;" % trie.offset)
	#include "table.h"
	#include <stdio.h>

	int translate(const char* s, const char** outs)
	{
	int16_t offset = root;

	while (*s)
	{
	if (keys[offset] == 0)
	break;

	int16_t next = offset;

	for (;;)
	{
	if (keys[next] == 0)
	return -1;

	if (keys[next] == *s)
	break;

	next++;
	}

	offset = offsets[next];
	s++;
	}

	if (keys[offset] == 0)
	{
	*outs = s;
	return offsets[offset];
	}

	return -1;
	}

	void translate(char* s)
	{
	char* w = s;

	while (*s)
	{
	if (*s == '&')
	{
	const char* ns;
	int off = translate(s, &ns);

	if (off >= 0)
	{
	s = (char*)ns;

	for (; codepoints[off]; off++)
	*w++ = codepoints[off];
	}
	else
	{
	w++ = s++;
	}
	}
	else
	{
	w++ = s++;
	}
	}

	*w = 0;
	}

	int main(int argc, char** argv)
	{
	for (int i = 1; i < argc; ++i)
	{
	translate(argv[i]);
	printf("%s\n", argv[i]);
	}
	}
	#!/usr/bin/python3

	import json

	f = open('entities.json', 'r')
	j = json.loads(f.read())

	print("#include <stddef.h>")
	print("#include <stdint.h>")
	print("static const uint8_t codepoints[] = {")

	table = {}
	offset = 0

	for e in j:
	table[e] = offset

	for c in j[e][u'codepoints']:
	for b in chr(c).encode('utf-8'):
	print("%d," % (b), end="")
	offset += 1

	print("0, // %s offset %d" % (e, table[e]))
	offset += 1

	print("}; // size %d" % offset)

	class Node:
	def __init__(self, prefix = ""):
	self.children = {}
	self.prefix = prefix
	self.offset = 0
	self.result = -1

	trie = Node()

	for e in j:
	node = trie
	for i,v in enumerate(e):
	node = node.children.setdefault(v, Node(e[:i+1]))

	node.result = table[e]

	offset = 0

	def fill_keys(node):
	global offset
	for k in node.children:
	fill_keys(node.children[k])

	node.offset = offset

	for k in node.children:
	print("%d, " % (ord(k)), end="")

	print("0, // %s" % node.prefix)

	offset += len(node.children) + 1

	print("static const uint8_t keys[] = {")

	fill_keys(trie)

	print("}; // size %d" % offset)

	def fill_offsets(node):
	for k in node.children:
	fill_offsets(node.children[k])

	for k in node.children:
	print("%d, " % (node.children[k].offset), end="")

	print("%d, // %s" % (node.result, node.prefix))

	print("static const int16_t offsets[] = {")

	fill_offsets(trie)

	print("}; // size %d" % offset)

	print("static const int16_t root = %d;" % trie.offset)