johan/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Requirements

You need the ttx binary from FontTools to dump a font's cmap table into an XML .ttx file:
> sudo apt-get install fonttools

HOWTO

> ttx -t cmap sorren.eot
Dumping "sorren.eot" to "sorren.ttx"...
Dumping 'cmap' table...
> ln -s sorren.ttx sorren.xml

Open sorren.xml in Chrome. Open devtools, paste the ttx_to_regexp.js code into the Console tab and hit return.
Next, type fontRange() to get the regexp covering all codepoints known to this font (platformID 0 is Unicode). If you're going to paste it elsewhere, you might as well type copy(fontRange()) and avoid any copy-paste errors:
> fontRange()
[\x00\x0d -~\xa0\xad\u2000-\u200a\u2010-\u2014\u202f\u205f\ue000]

Repeat the steps above for each font you are interested in.
TODO

To become an international superhero, fork this gist and make a shell-runnable node.js application, font-to-regexp.js, that takes one or more font files on the command line, invokes ttx on each of them, loads the result with jsdom, runs fontRange on it and prints the regexp to stdout, instead of doing the above steps manually. Oh, and brag about it in the comments here, of course, so other people find it too!

  
## ttx_to_regexp.js
/**
 * Builds the character-class regexp source for the font dump that is
 * currently loaded as the document.
 *
 * Selects every `code` attribute below the `cmap` subtables whose
 * platformID is "0", converts each attribute value to a number and hands
 * the list to regexpify().
 *
 * @returns {string} whatever regexpify() produces for those code points.
 */
function fontRange() {
  const codeAttributes = $x('//cmap/*[@platformID="0"]/*/@code');
  const codepoints = codeAttributes.map((attribute) => Number(attribute.nodeValue));
  return regexpify(codepoints);
}

/**
 * Turns a list of code points into the source of a regexp character class
 * (e.g. `[ -~\xa0]`) that matches exactly those code points.
 *
 * Consecutive code points are collapsed into `a-b` ranges; a run of just two
 * is written as the two characters side by side. Duplicates in the input are
 * ignored and the input array is not modified.
 *
 * Code points above 0xFFFF are written as `\u{...}` escapes, so a class that
 * contains any of them must be compiled with the `u` flag.
 *
 * @param {number[]} codepoints - code points in any order, duplicates allowed.
 * @returns {string} the bracketed character class; `[]` (a class that matches
 *   nothing) when the input is empty.
 */
function regexpify(codepoints) {
  // Renders one code point the way it has to appear inside a character class.
  function character(code) {
    switch (code) {
      case 45: return '\\-';
      case 92: return '\\\\';
      case 93: return '\\]';
      case 94: return '\\^';
      case 173: return '\\xad'; // soft hyphen looks like dash; avoid confusion
      default: {
        if ((code >= 0x20 && code <= 0x7e) ||
            (code >= 0xa1 && code <= 0xff))
          return String.fromCharCode(code);
        const hex = code.toString(16);
        if (code < 0x100)    return '\\x' + hex.padStart(2, '0');
        if (code <= 0xffff)  return '\\u' + hex.padStart(4, '0');
        // \uXXXX takes exactly four hex digits, so anything longer needs the
        // braced form or the trailing digit would be read as a literal.
        return '\\u{' + hex + '}';
      }
    }
  }

  // Renders an inclusive [first, last] pair as a single char, a pair, or a range.
  function rangeify(range) {
    const first = range[0];
    const last = range[1];
    if (last === first)     return character(first);
    if (last === first + 1) return character(first) + character(last);
    return character(first) + '-' + character(last);
  }

  // The Set drops duplicates (several cmap subtables may list the same code);
  // Array.from copies, so the caller's array is never sorted in place.
  const sorted = Array.from(new Set(codepoints)).sort(function asc(a, b) { return a - b; });
  if (sorted.length === 0) return '[]';

  const ranges = [];
  let start = sorted[0];
  let prev = start;
  for (let i = 1; i < sorted.length; i++) {
    const next = sorted[i];
    if (next !== prev + 1) {
      // Gap found: close the current run and open a new one.
      ranges.push([start, prev]);
      start = next;
    }
    prev = next;
  }
  ranges.push([start, prev]);

  return '[' + ranges.map(rangeify).join('') + ']';
}

/**
 * Evaluates an XPath expression and unwraps the result into a plain value.
 *
 * @param {string} xpath - the expression to evaluate.
 * @param {Object} [root] - context node. When omitted, the global `document`
 *   is both the evaluator and the context. When given, the node itself is the
 *   evaluator if it has an `evaluate` member, otherwise its `ownerDocument`.
 * @returns {string|number|boolean|Array} the string, number or boolean value
 *   for results of those types; otherwise an array of everything
 *   `iterateNext()` yields, in iteration order.
 */
function $x(xpath, root) {
  let evaluator;
  if (!root) {
    evaluator = document;
  } else if (root.evaluate) {
    evaluator = root;
  } else {
    evaluator = root.ownerDocument;
  }

  // Result type 0 lets the evaluator pick the result type from the expression.
  const got = evaluator.evaluate(xpath, root || evaluator, null, 0, null);
  const type = got.resultType;

  if (type === got.STRING_TYPE) return got.stringValue;
  if (type === got.NUMBER_TYPE) return got.numberValue;
  if (type === got.BOOLEAN_TYPE) return got.booleanValue;

  const nodes = [];
  for (let node = got.iterateNext(); node; node = got.iterateNext()) {
    nodes.push(node);
  }
  return nodes;
}
	/**
	 * Builds the character-class regexp source for the font dump that is
	 * currently loaded as the document.
	 *
	 * Selects every `code` attribute below the `cmap` subtables whose
	 * platformID is "0", converts each attribute value to a number and passes
	 * the list to regexpify().
	 *
	 * @returns {string} whatever regexpify() produces for those code points.
	 */
	function fontRange() {
	  function codepoint(node) { return Number(node.nodeValue); }
	  // Both `*` wildcards are required: a location step consisting only of a
	  // predicate (`/[@platformID="0"]`) is not a valid XPath expression.
	  return regexpify($x('//cmap/*[@platformID="0"]/*/@code').map(codepoint));
	}

	/**
	 * Turns a list of code points into the source of a regexp character class
	 * (e.g. `[ -~\xa0]`) that matches exactly those code points.
	 *
	 * Consecutive code points are collapsed into `a-b` ranges; a run of just two
	 * is written as the two characters side by side. Duplicates in the input are
	 * ignored and the input array is not modified.
	 *
	 * Code points above 0xFFFF are written as `\u{...}` escapes, so a class that
	 * contains any of them must be compiled with the `u` flag.
	 *
	 * @param {number[]} codepoints - code points in any order, duplicates allowed.
	 * @returns {string} the bracketed character class; `[]` (a class that matches
	 *   nothing) when the input is empty.
	 */
	function regexpify(codepoints) {
	  // Renders one code point the way it has to appear inside a character class.
	  function character(code) {
	    switch (code) {
	      case 45: return '\\-';
	      case 92: return '\\\\';
	      case 93: return '\\]';
	      case 94: return '\\^';
	      case 173: return '\\xad'; // soft hyphen looks like dash; avoid confusion
	      default: {
	        if ((code >= 0x20 && code <= 0x7e) ||
	            (code >= 0xa1 && code <= 0xff))
	          return String.fromCharCode(code);
	        const hex = code.toString(16);
	        if (code < 0x100)    return '\\x' + hex.padStart(2, '0');
	        if (code <= 0xffff)  return '\\u' + hex.padStart(4, '0');
	        // \uXXXX takes exactly four hex digits, so anything longer needs the
	        // braced form or the trailing digit would be read as a literal.
	        return '\\u{' + hex + '}';
	      }
	    }
	  }
	  // Renders an inclusive [first, last] pair as a single char, a pair, or a range.
	  function rangeify(range) {
	    const first = range[0];
	    const last = range[1];
	    if (last === first)     return character(first);
	    if (last === first + 1) return character(first) + character(last);
	    return character(first) + '-' + character(last);
	  }
	  // The Set drops duplicates (several cmap subtables may list the same code);
	  // Array.from copies, so the caller's array is never sorted in place.
	  const sorted = Array.from(new Set(codepoints)).sort(function asc(a, b) { return a - b; });
	  if (sorted.length === 0) return '[]';
	  const ranges = [];
	  let start = sorted[0];
	  let prev = start;
	  for (let i = 1; i < sorted.length; i++) {
	    const next = sorted[i];
	    if (next !== prev + 1) {
	      // Gap found: close the current run and open a new one.
	      ranges.push([start, prev]);
	      start = next;
	    }
	    prev = next;
	  }
	  ranges.push([start, prev]);
	  return '[' + ranges.map(rangeify).join('') + ']';
	}

	/**
	 * Evaluates an XPath expression and unwraps the result into a plain value.
	 *
	 * @param {string} xpath - the expression to evaluate.
	 * @param {Object} [root] - context node. When omitted, the global `document`
	 *   is both the evaluator and the context. When given, the node itself is the
	 *   evaluator if it has an `evaluate` member, otherwise its `ownerDocument`.
	 * @returns {string|number|boolean|Array} the string, number or boolean value
	 *   for results of those types; otherwise an array of everything
	 *   `iterateNext()` yields, in iteration order.
	 */
	function $x(xpath, root) {
	  var doc = root ? root.evaluate ? root : root.ownerDocument : document, next;
	  // Result type 0 lets the evaluator pick the result type from the expression.
	  var got = doc.evaluate( xpath, root || doc, null, 0, null ), result = [];
	  switch (got.resultType) {
	    case got.STRING_TYPE:
	      return got.stringValue;
	    case got.NUMBER_TYPE:
	      return got.numberValue;
	    case got.BOOLEAN_TYPE:
	      return got.booleanValue;
	    default:
	      // Anything else is drained through iterateNext() into an array.
	      while ((next = got.iterateNext()))
	        result.push( next );
	      return result;
	  }
	}