Created
September 26, 2013 09:58
-
-
Save migerh/6712140 to your computer and use it in GitHub Desktop.
Benchmark of three different implementations of a UTF-8 decode() function. See https://github.com/jsxgraph/jsxgraph/issues/50 for details. newerDecode() is a JavaScript port of the C99 decoder published at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html>
<head>
<title></title>
<script type="text/javascript" src="http://jsxgraph.uni-bayreuth.de/~michael/jsxgui/Examples/libs/benchmark.js"></script>
</head>
<body>
<div id="output"></div>
<script type="text/javascript">
// Not referenced by any code in this script.
var counter = 0;

// constants
// States and lookup table of the DFA-based UTF-8 decoder that newerDecode()
// is ported from (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
var UTF8_ACCEPT = 0,    // a complete, valid code point has just been decoded
    UTF8_REJECT = 12,   // the input is not valid UTF-8
    UTF8D = [
        // The first part of the table (indices 0..255) maps bytes to character
        // classes, to reduce the size of the transition table and create bitmasks.
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

        // The second part (indices 256..363) is a transition table that maps a
        // combination of a state of the automaton and a character class to a
        // state. It is indexed as UTF8D[256 + state + class].
        0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
        12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
    ];
// Fallback for engines without a native String.prototype.repeat (ES2015).
// The native implementation is never overwritten, so other code on the page
// keeps the standard behaviour.
if (typeof String.prototype.repeat !== 'function') {
    /**
     * Return this string concatenated with itself `num` times.
     *
     * @param {number} num Number of repetitions; NaN/undefined count as 0 and
     *     fractional values are truncated towards zero.
     * @returns {string} The repeated string.
     * @throws {RangeError} If the truncated count is negative or not finite.
     */
    String.prototype.repeat = function (num) {
        var count = isNaN(num) ? 0 : (num < 0 ? Math.ceil(num) : Math.floor(num));

        if (count < 0 || count === Infinity) {
            throw new RangeError('Invalid count value: ' + num);
        }

        // Joining an array of count + 1 empty slots yields `count` copies of
        // the separator, i.e. of this string.
        return new Array(count + 1).join(this);
    };
}
// Element that receives all textual output of the benchmark.
var output = document.getElementById('output');

// The benchmark input is a "byte string": every character holds one UTF-8
// byte (char code 0x00..0xff).
// Alternative input consisting of 3-byte sequences only:
//var shortStr = '\xe8\x87\xaa\xe8\x8d\x8a\xe6\xb9\x96\xe4\xbb\xa5\xe5\x8c\x97';
// A single 4-byte sequence (U+1F64A); decoding it requires a surrogate pair.
var shortStr = '\xf0\x9f\x99\x8a';
var str = shortStr.repeat(100);

var suiteDecode = new Benchmark.Suite();

output.innerHTML = 'Input length: ' + str.length + '<br />';
/**
 * Decode a UTF-8 "byte string" (one character per byte, char codes
 * 0x00..0xff) into a regular JavaScript (UTF-16) string, using the
 * table-driven automaton described by UTF8D.
 *
 * Malformed input does not abort decoding: every invalid or truncated
 * sequence is replaced by U+FFFD and decoding resumes with the next byte.
 *
 * @param {string} utftext Byte string holding UTF-8 encoded text.
 * @returns {string} The decoded text.
 */
var newerDecode = function (utftext) {
    var i, unit, type, prev,
        codep = 0,
        state = UTF8_ACCEPT,
        string = [],
        len = utftext.length;

    for (i = 0; i < len; i++) {
        unit = utftext.charCodeAt(i);
        prev = state;

        if (unit > 0xff) {
            // Not a byte: it has no character class in the table.
            state = UTF8_REJECT;
        } else {
            type = UTF8D[unit];

            if (state !== UTF8_ACCEPT) {
                // Continuation byte: append its six payload bits.
                codep = (unit & 0x3f) | (codep << 6);
            } else {
                // First byte of a sequence: the class doubles as the shift
                // that masks off the length prefix bits.
                codep = (0xff >> type) & unit;
            }

            state = UTF8D[256 + state + type];
        }

        if (state === UTF8_ACCEPT) {
            if (codep > 0xffff) {
                // Outside the BMP: emit a surrogate pair.
                // 0xD7C0 === 0xD800 - (0x10000 >> 10)
                string.push(String.fromCharCode(0xD7C0 + (codep >> 10)));
                string.push(String.fromCharCode(0xDC00 + (codep & 0x3FF)));
            } else {
                string.push(String.fromCharCode(codep));
            }
        } else if (state === UTF8_REJECT) {
            string.push('\ufffd');
            state = UTF8_ACCEPT;

            // The unit that broke an unfinished sequence may itself start a
            // valid one, so look at it again from the ACCEPT state.
            if (prev !== UTF8_ACCEPT) {
                i--;
            }
        }
    }

    // The input ended in the middle of a multi-byte sequence.
    if (state !== UTF8_ACCEPT) {
        string.push('\ufffd');
    }

    return string.join('');
};
/**
 * Decode a UTF-8 "byte string" (one character per byte) by letting the
 * engine do the work: escape() percent-encodes every byte and
 * decodeURIComponent() interprets the percent sequences as UTF-8.
 *
 * @param {string} utftext Byte string holding UTF-8 encoded text.
 * @returns {string} The decoded text.
 * @throws {URIError} If the input is not well-formed UTF-8.
 */
var newDecode = function (utftext) {
    var percentEncoded = escape(utftext);

    return decodeURIComponent(percentEncoded);
};
/**
 * Legacy UTF-8 decoder, kept unchanged in behaviour as the baseline of the
 * benchmark.
 *
 * Known limitation: only 1-, 2- and 3-byte sequences are understood. Every
 * byte >= 224, and every stray continuation byte (128..191), is treated as
 * the start of a 3-byte sequence, so 4-byte sequences and malformed input
 * decode to wrong characters.
 *
 * @param {string} utftext Byte string (one character per byte) holding
 *     UTF-8 encoded text.
 * @returns {string} The decoded text.
 */
var oldDecode = function (utftext) {
    var pieces = [],
        total = utftext.length,
        pos = 0,
        lead, second, third;

    while (pos < total) {
        lead = utftext.charCodeAt(pos);

        // Single byte (ASCII).
        if (lead < 128) {
            pieces.push(String.fromCharCode(lead));
            pos += 1;
            continue;
        }

        second = utftext.charCodeAt(pos + 1);

        // Two-byte sequence: 5 payload bits in the lead byte, 6 in the next.
        if (lead > 191 && lead < 224) {
            pieces.push(String.fromCharCode(((lead & 31) << 6) | (second & 63)));
            pos += 2;
            continue;
        }

        // Everything else is read as a three-byte sequence (4 + 6 + 6 bits).
        // Reads past the end yield NaN, which the bit operations turn into 0.
        third = utftext.charCodeAt(pos + 2);
        pieces.push(String.fromCharCode(((lead & 15) << 12) | ((second & 63) << 6) | (third & 63)));
        pos += 3;
    }

    return pieces.join('');
};
output.innerHTML += 'Run benchmark suites...<br />';

// Register one benchmark per decoder; all of them decode the same input.
suiteDecode
    .add('Decode#Old', function () {
        oldDecode(str);
    })
    .add('Decode#New', function () {
        newDecode(str);
    })
    .add('Decode#Newer', function () {
        newerDecode(str);
    })
    // add listeners
    .on('cycle', function (event) {
        // Print the result line of the benchmark that just finished.
        output.innerHTML += String(event.target) + '<br />';
    })
    .on('complete', function () {
        var decoded;

        output.innerHTML += 'Fastest is ' + this.filter('fastest').pluck('name') + '<br />';

        // Show each decoder's result and its length so that differences in
        // the decoded output are visible next to the timings.
        decoded = oldDecode(str);
        output.innerHTML += 'Old: ' + decoded + ' / Length old: ' + decoded.length + '<br />';

        decoded = newDecode(str);
        output.innerHTML += 'New: ' + decoded + ' / Length new: ' + decoded.length + '<br />';

        decoded = newerDecode(str);
        output.innerHTML += 'Newer: ' + decoded + ' / Length newer: ' + decoded.length;
    })
    // run async
    .run({ 'async': true });
</script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment