Skip to content

Instantly share code, notes, and snippets.

@pekkaklarck
Last active May 23, 2016 01:05
Show Gist options
  • Save pekkaklarck/6712255 to your computer and use it in GitHub Desktop.
Save pekkaklarck/6712255 to your computer and use it in GitHub Desktop.
JavaScript implementation of the UTF-8 decoding algorithm by Björn Höhrmann, explained at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/. To support code points outside the Unicode BMP, this implementation uses the custom String.fromCodePoint method explained at https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromChar…
// State value in which the decoder starts and to which it returns every time
// a complete code point has been read.
var UTF8_ACCEPT = 0;
// Lookup table driving the decoder's state machine. It has two parts:
//
//   * Entries 0..255 map a byte value to its character class ("type"),
//     a number in the range 0..11.
//   * Entries 256..363 are the transition table, read as
//     UTF8D[256 + state + type]. It consists of 9 rows of 12 entries, one
//     row per state; state values are the row offsets (0, 12, 24, ..., 96),
//     so no multiplication is needed during lookup.
//
// Every transition out of state 12 leads back to state 12, so once the
// decoder enters it (on a byte that is not valid at that position) it never
// leaves; it acts as the reject state.
var UTF8D = [
// Bytes 0x00..0x7F: class 0.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Bytes 0x80..0x8F: class 1; 0x90..0x9F: class 9.
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
// Bytes 0xA0..0xBF: class 7.
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
// Bytes 0xC0..0xC1: class 8; 0xC2..0xDF: class 2.
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// Bytes 0xE0..0xEF: classes 10, 3 and 4; bytes 0xF0..0xFF: classes 11, 6, 5 and 8.
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// Transition rows for states 0 and 12.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
// Transition rows for states 24 and 36.
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
// Transition rows for states 48 and 60.
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
// Transition rows for states 72 and 84.
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
// Transition row for state 96.
12,36,12,12,12,12,12,12,12,12,12,12,
];
/**
 * Decodes a string of UTF-8 bytes into a regular JavaScript string.
 *
 * Each character of the input is treated as one byte (its char code is used
 * as the byte value), and the bytes are run through the state machine
 * described by the UTF8D table. A code point is emitted every time the
 * machine returns to UTF8_ACCEPT.
 *
 * Invalid input is not reported: once a byte that is not valid at its
 * position is seen, the machine enters a state it never leaves, so that byte
 * and everything after it are silently dropped. An incomplete sequence at the
 * end of the input is dropped as well. Char codes above 255 are not
 * validated; callers are expected to pass byte values only.
 *
 * @param {string} utf8text - UTF-8 encoded data, one byte per character.
 * @returns {string} The decoded text ('' for empty input).
 */
function decode(utf8text) {
  // Upper bound on the number of arguments handed to a single
  // String.fromCodePoint call; engines limit how many arguments
  // Function.prototype.apply may spread, so long inputs are converted
  // piecewise instead of all at once.
  var CHUNK_SIZE = 0x2000;
  var state = UTF8_ACCEPT;
  var codep = 0;
  var codepoints = [];
  // Declared locally so they neither leak as implicit globals nor throw a
  // ReferenceError in strict mode / ES modules.
  var byte;
  var type;
  var result = '';
  var i;
  var start;

  for (i = 0; i < utf8text.length; i++) {
    byte = utf8text.charCodeAt(i);
    type = UTF8D[byte];
    // Inside a multi-byte sequence: append the low six bits of the byte.
    // At the start of a sequence: keep only the bits selected by the
    // type-dependent mask.
    codep = (state !== UTF8_ACCEPT) ?
      (byte & 0x3f) | (codep << 6) :
      (0xff >> type) & byte;
    state = UTF8D[256 + state + type];
    if (state === UTF8_ACCEPT) {
      codepoints.push(codep);
    }
  }

  for (start = 0; start < codepoints.length; start += CHUNK_SIZE) {
    result += String.fromCodePoint.apply(
      null, codepoints.slice(start, start + CHUNK_SIZE));
  }
  return result;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment