Skip to content

Instantly share code, notes, and snippets.

@paulbaumgart
Created April 3, 2011 01:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save paulbaumgart/900101 to your computer and use it in GitHub Desktop.
Save paulbaumgart/900101 to your computer and use it in GitHub Desktop.
var SYSTEM = require('system'),
HTTP = require('http-client'),
OS = require('os');
// The page to process is the first command-line argument. Without it there is
// nothing to do, so print the expected invocation and exit with status 1.
var url = SYSTEM.args[1];
if (!url) {
    print('usage:');
    print('js ' + SYSTEM.args[0] + ' <fileformat.info URL>');
    OS.exit(1);
}

// Read the page at `url` and decode it as UTF-8 text.
var fileContents = HTTP.read(url).decodeToString('utf-8'),
    // Matches one centered table cell whose link points at /info/unicode/char/...
    // and whose text is "U+<hex digits>"; capture group 1 is the hex digits only.
    // The plus sign is escaped so that it matches a literal "+" instead of acting
    // as a "one or more" quantifier on the preceding "U".
    charRegex = /\s*<td align="center"><a href="\/info\/unicode\/char\/.+?">U\+([0-9A-Fa-f]+)<\/a><\/td>\s*/g,
    // Result of the most recent charRegex.exec() call.
    matches = null,
    // Code point handled by the previous loop iteration.
    lastCharCode = null,
    // First code point of the range currently being accumulated, or null when
    // no bracket class is open.
    classStartCharCode = null,
    // Output pieces; joined with ' / ' when printed.
    classes = [];
/**
 * Renders a code point as `\uXXXX` escape text (a literal backslash, "u" and
 * uppercase hex digits).
 *
 * Values up to 0xFFFF produce a single escape, left-padded with zeros to four
 * digits. Larger values are split into a UTF-16 surrogate pair and produce
 * two escapes back to back.
 *
 * @param {number} charCode Code point to render.
 * @returns {string} One escape for values <= 0xFFFF, otherwise two.
 */
var convertToUnicodeHex = function(charCode) {
    // Formats a single 16-bit unit, zero-padding short values to four digits.
    var escapeUnit = function(unit) {
        var digits = unit.toString(16).toUpperCase();
        var missing = 4 - digits.length;
        if (missing > 0) {
            digits = new Array(missing + 1).join('0') + digits;
        }
        return '\\u' + digits;
    };

    if (charCode > 0xFFFF) {
        // Offset into the supplementary planes: the top ten bits select the
        // high surrogate, the bottom ten bits select the low surrogate.
        var offset = charCode - 0x10000;
        var highSurrogate = 0xD800 + ((offset >> 10) & 0x3FF);
        var lowSurrogate = 0xDC00 + (offset & 0x3FF);
        return escapeUnit(highSurrogate) + escapeUnit(lowSurrogate);
    }

    return escapeUnit(charCode);
};
/**
 * Builds the escape text for one run of consecutive code points.
 *
 * - A run of several code points becomes `<start>-<end>`.
 * - A run of exactly one code point that fits in a single UTF-16 unit
 *   (<= 0xFFFF) becomes that one escape.
 * - A run of exactly one code point above 0xFFFF becomes its surrogate-pair
 *   escapes wrapped in double quotes.
 *   NOTE(review): presumably a quoted string literal is used because a
 *   surrogate pair cannot be a single member of a bracket class in the
 *   target grammar; confirm against the consumer of this output.
 *
 * @param {number} startCharCode First code point of the run.
 * @param {number} endCharCode Last code point of the run (inclusive).
 * @returns {string} Escape text as described above.
 */
var charClassFromCharCodes = function(startCharCode, endCharCode) {
    if (startCharCode !== endCharCode) {
        return convertToUnicodeHex(startCharCode) + '-' + convertToUnicodeHex(endCharCode);
    }

    // The boundary is inclusive: 0xFFFF is still a single UTF-16 unit, so it
    // must be emitted bare like every other BMP code point, not quoted.
    if (startCharCode <= 0xFFFF) {
        return convertToUnicodeHex(startCharCode);
    }

    return '"' + convertToUnicodeHex(startCharCode) + '"';
};
// Appends the pending range (classStartCharCode..lastCharCode) plus `suffix`
// to the bracket class under construction, which is always the last entry of
// `classes` while a class is open.
var appendPendingRange = function(suffix) {
    classes[classes.length - 1] += charClassFromCharCodes(classStartCharCode, lastCharCode) + suffix;
};

// Walk every code point matched in the page, in document order, folding runs
// of consecutive BMP code points into ranges inside a single "[...]" class.
// Each code point above 0xFFFF becomes its own separate entry.
for (matches = charRegex.exec(fileContents); matches !== null; matches = charRegex.exec(fileContents)) {
    var charCode = parseInt(matches[1], 16);
    var needsSurrogates = charCode > 0xFFFF;
    var classIsOpen = classStartCharCode !== null;

    if (needsSurrogates) {
        // Finish any open bracket class before emitting the standalone entry.
        if (classIsOpen) {
            appendPendingRange(']');
            classStartCharCode = null;
        }
        classes.push(charClassFromCharCodes(charCode, charCode));
    } else if (!classIsOpen) {
        // First BMP code point after the start or after a standalone entry:
        // open a fresh bracket class and begin a range here.
        classes.push('[');
        classStartCharCode = charCode;
    } else if (lastCharCode + 1 !== charCode) {
        // Gap in the sequence: write out the finished range, start a new one.
        appendPendingRange('');
        classStartCharCode = charCode;
    }

    lastCharCode = charCode;
}

// A bracket class may still be open after the last match; close it.
if (classStartCharCode !== null && lastCharCode <= 0xFFFF) {
    appendPendingRange(']');
}

print(classes.join(' / '));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment