Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Given a TrueType (*.ttf) or Embedded OpenType (*.eot) font, make a regexp of the Unicode characters it maps glyphs for

Requirements

You need the ttx binary from FontTools to dump the cmap table of the fonts into an xml font.ttx file:

> sudo apt-get install fonttools

HOWTO

> ttx -t cmap sorren.eot
Dumping "sorren.eot" to "sorren.ttx"...
Dumping 'cmap' table...
> ln -s sorren.ttx sorren.xml

Open sorren.xml in Chrome. Open devtools, paste the ttx_to_regexp.js code into the Console tab and hit return.

Next, type fontRange() to get the regexp covering all codepoints known to this font (platformID 0 is Unicode). If you're going to paste it elsewhere, you might as well type copy(fontRange()) and avoid any copy-paste errors:

> fontRange()
[\x00\x0d -~\xa0\xad\u2000-\u200a\u2010-\u2014\u202f\u205f\ue000]

Repeat above for all fonts you are interested in.

TODO

To become an international superhero, fork this gist and make a shell-runnable node.js application, font-to-regexp.js, that just takes your font file(s) on the command line, invokes ttx on each of them for you, loads the result with jsdom, runs fontRange on it and prints the regexp to stdout, instead of doing the above steps manually. Oh, and brag about it in the comments here, of course, so other people find it too!

/**
 * Build a regexp character class covering every code point that the
 * currently loaded ttx XML document lists under a platformID 0 cmap table.
 *
 * @returns {string} the character class produced by regexpify()
 */
function fontRange() {
  // Select the `code` attribute of every mapping below a platformID="0" table.
  var codeAttributes = $x('//cmap/*[@platformID="0"]/*/@code');
  var codepoints = codeAttributes.map(function (attribute) {
    // Number() converts the attribute text (e.g. "0x41") to a numeric code point.
    return Number(attribute.nodeValue);
  });
  return regexpify(codepoints);
}
/**
 * Turn a list of code points into the source text of a regexp character
 * class, e.g. [65, 66, 67, 0x2010] -> "[A-C\u2010]".
 *
 * Runs of three or more consecutive code points are collapsed into "a-b"
 * ranges and runs of exactly two are written as two separate characters.
 * Duplicate code points are ignored. The input array is not modified.
 *
 * Code points above U+FFFF are written as `\u{...}` escapes, which are only
 * understood by regexps compiled with the `u` flag; all other output is valid
 * with or without that flag.
 *
 * @param {number[]} codepoints code points in any order
 * @returns {string} a bracketed character class; "[]" for an empty list
 */
function regexpify(codepoints) {
  // Render one code point so that it is safe inside a character class.
  function character(code) {
    switch (code) {
      case 45: return '\\-';
      case 92: return '\\\\';
      case 93: return '\\]';
      case 94: return '\\^';
      case 173: return '\\xad'; // soft hyphen looks like dash; avoid confusion
    }
    // Printable ASCII and Latin-1 characters are emitted literally.
    if ((code >= 0x20 && code <= 0x7e) ||
        (code >= 0xa1 && code <= 0xff)) {
      return String.fromCharCode(code);
    }
    var hex = code.toString(16);
    if (code < 0x100) return '\\x' + ('0' + hex).slice(-2);
    if (code < 0x10000) return '\\u' + ('000' + hex).slice(-4);
    // A four-digit \uXXXX escape cannot express astral code points.
    return '\\u{' + hex + '}';
  }

  // Render one inclusive [first, last] run of consecutive code points.
  function rangeify(range) {
    var first = range[0];
    var last = range[1];
    if (last === first) return character(first);
    // A dash would not make a two-element run any shorter.
    if (last === first + 1) return character(first) + character(last);
    return character(first) + '-' + character(last);
  }

  // Sort a copy so the caller's array is left untouched.
  var sorted = codepoints.concat().sort(function asc(a, b) { return a - b; });
  var ranges = [];
  for (var i = 0; i < sorted.length; i++) {
    var code = sorted[i];
    var current = ranges.length ? ranges[ranges.length - 1] : null;
    if (current && code === current[1]) {
      continue; // duplicate, e.g. the same code in several cmap subtables
    }
    if (current && code === current[1] + 1) {
      current[1] = code; // extend the current run
    } else {
      ranges.push([code, code]); // start a new run
    }
  }
  return '[' + ranges.map(rangeify).join('') + ']';
}
/**
 * Evaluate an XPath expression and return the result as a plain JS value.
 *
 * @param {string} xpath the XPath expression to evaluate
 * @param {Object} [root] context node; when omitted the global `document`
 *     is used. A root that has its own `evaluate` method is used as the
 *     evaluating document, otherwise `root.ownerDocument` is.
 * @returns {string|number|boolean|Array} the string, number or boolean value
 *     for results of those types, otherwise an array of the iterated nodes
 */
function $x(xpath, root) {
  var evaluator;
  if (!root) {
    evaluator = document;
  } else if (root.evaluate) {
    evaluator = root;
  } else {
    evaluator = root.ownerDocument;
  }

  // Result type 0 lets the evaluator pick whatever type fits the expression.
  var outcome = evaluator.evaluate(xpath, root || evaluator, null, 0, null);
  var kind = outcome.resultType;

  if (kind === outcome.STRING_TYPE) {
    return outcome.stringValue;
  }
  if (kind === outcome.NUMBER_TYPE) {
    return outcome.numberValue;
  }
  if (kind === outcome.BOOLEAN_TYPE) {
    return outcome.booleanValue;
  }

  // Anything else is drained through the iterator into an array.
  var nodes = [];
  for (var node = outcome.iterateNext(); node; node = outcome.iterateNext()) {
    nodes.push(node);
  }
  return nodes;
}
@ysangkok
Copy link

ysangkok commented Oct 17, 2013

No need to parse ttx's output; just use its API (it's in Python). Here's a script that takes a font and a codepoint number and prints the name of the character and whether the font contains it.

#!/usr/bin/env python
"""Report whether a font maps a glyph for a given code point.

Usage: <script> FONT CODEPOINT

Prints the Unicode name of CODEPOINT, then True or False depending on
whether any of the font's cmap subtables contains that code point.
"""
import sys

from fontTools.ttLib import TTFont
from fontTools.unicode import Unicode

if len(sys.argv) != 3:
    sys.exit("usage: %s FONT CODEPOINT" % sys.argv[0])

# Base 0 lets int() accept decimal as well as 0x/0o/0b-prefixed literals.
char = int(sys.argv[2], 0)

# NOTE(review): verbose/allowVID look like legacy keyword arguments; confirm
# they are still accepted by the installed fontTools version.
ttf = TTFont(sys.argv[1], 0, verbose=0, allowVID=0,
                ignoreDecompileErrors=True,
                fontNumber=-1)
try:
    print(Unicode[char])
    # Each subtable's cmap is keyed by code point, so a membership test is
    # enough; there is no need to look up a name for every mapped character.
    print(any(char in table.cmap for table in ttf["cmap"].tables))
finally:
    # Release the font file even if one of the lookups above raises.
    ttf.close()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment