Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tolmasky/901944 to your computer and use it in GitHub Desktop.
Save tolmasky/901944 to your computer and use it in GitHub Desktop.
var SYSTEM = require('system'),
HTTP = require('http-client');
// SYSTEM.args[0] is the script itself, so fewer than two entries means no URL
// was supplied: explain the expected invocation and exit with status 1.
if (SYSTEM.args.length < 2)
{
    var scriptName = SYSTEM.args[0];

    print("usage:");
    print("js " + scriptName + " <fileformat.info URL>+");

    require("os").exit(1);
}
// Code points collected from every page given on the command line:
//  - unicodeCharacters: values up to 0xFFFF, stored as plain numbers in the
//    order they are encountered;
//  - surrogateMappings: sparse array indexed by high surrogate; each entry is
//    the list of low surrogates seen together with that high surrogate.
var unicodeCharacters = [],
    surrogateMappings = [],
    // Matches one <td> cell linking to a character page and captures the hex
    // digits that follow "U+". The "+" is escaped so it matches a literal plus
    // sign (unescaped it would quantify the "U"), and the capture is limited
    // to hex digits so parseInt below can never yield NaN.
    charRegex = /\s*<td align="center"><a href="\/info\/unicode\/char\/.+?">U\+([0-9A-Fa-f]+)<\/a><\/td>\s*/g;

SYSTEM.args.slice(1).forEach(function(aURL)
{
    // Read the page at aURL and decode it as UTF-8 text.
    var fileContents = HTTP.read(aURL).decodeToString('utf-8'),
        match;

    // A global regex resumes from lastIndex on every exec() and resets it to 0
    // once it returns null, so the same regex is safe to reuse for the next URL.
    while ((match = charRegex.exec(fileContents)) !== null)
    {
        var unicodeCharacter = parseInt(match[1], 16);

        if (unicodeCharacter > 0xFFFF)
        {
            // Split into a UTF-16 surrogate pair: after subtracting 0x10000,
            // the upper 10 bits select the high surrogate and the lower 10
            // bits the low surrogate.
            unicodeCharacter -= 0x10000;

            var highSurrogate = ((unicodeCharacter & 0xFFC00) >> 10) + 0xD800;

            if (!surrogateMappings[highSurrogate])
                surrogateMappings[highSurrogate] = [];

            surrogateMappings[highSurrogate].push((unicodeCharacter & 0x3FF) + 0xDC00);
        }
        else
            unicodeCharacters.push(unicodeCharacter);
    }
});
// Renders this array of numbers as the body of a character class: each run of
// consecutive values (every element exactly one greater than the element
// before it) is collapsed into a single entry. A run of one value is written
// once, a run of two values as both values back to back, and a longer run as
// "first-last". Values are formatted with Number.prototype.toUnicodeHex.
// Returns the concatenated entries, or "" for an empty array.
Array.prototype.toCharacterClass = function()
{
    var pieces = [],
        runStart = null,
        runEnd = null;

    // Appends the text for the run currently being tracked, if there is one.
    function flushRun()
    {
        if (runStart === null)
            return;

        var text = runStart.toUnicodeHex();

        if (runEnd !== runStart)
        {
            // Two adjacent values need no dash; only longer runs get one.
            if (runEnd - runStart > 1)
                text += "-";

            text += runEnd.toUnicodeHex();
        }

        pieces.push(text);
    }

    this.forEach(function(aValue)
    {
        if (runStart !== null && aValue !== runEnd + 1)
        {
            // The run is broken: emit it and start a new one at this value.
            flushRun();
            runStart = aValue;
        }
        else if (runStart === null)
            runStart = aValue;

        runEnd = aValue;
    });

    flushRun();

    return pieces.join("");
};
// Formats this number as a backslash-u escape: the value in upper-case
// hexadecimal, left-padded with zeros to at least four characters and prefixed
// with "\u". Values needing more than four hex digits are not truncated.
Number.prototype.toUnicodeHex = function()
{
    var hexDigits = this.toString(16).toUpperCase(),
        // substring() clamps its start index, so this is "" once hexDigits
        // already has four or more characters.
        zeroPadding = "0000".substring(hexDigits.length);

    return '\\u' + zeroPadding + hexDigits;
};
// Assemble the output: first one bracketed class for every character stored in
// unicodeCharacters, then one " / "-separated alternative per high surrogate,
// made of the quoted high surrogate followed by a bracketed class of its low
// surrogates. (The " / " separators presumably target a PEG-style grammar;
// confirm against whatever consumes this output.)
var classes = "[" + unicodeCharacters.toCharacterClass() + "]";

// surrogateMappings is sparse. forEach skips the unassigned indices, passes the
// index as a number and visits entries in ascending order, so no loop variable
// leaks into the global scope and no hasOwnProperty filtering or parseInt on
// string keys is needed.
surrogateMappings.forEach(function(lowSurrogates, highSurrogate)
{
    classes += " / \"" + highSurrogate.toUnicodeHex() +
               "\" [" + lowSurrogates.toCharacterClass() + "]";
});

print(classes);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment