Skip to content

Instantly share code, notes, and snippets.

@avdg
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avdg/60ac8f3739d70bf68f9b to your computer and use it in GitHub Desktop.
Save avdg/60ac8f3739d70bf68f9b to your computer and use it in GitHub Desktop.
Experimental script to fetch unicode chars for ecmascript 5
var fs = require('fs');
var http = require('http');
var path = require('path');
// When true, the previously saved copies of the data files are read from the
// local "unicode" directory instead of being downloaded again.
var useLocal = false; // Just to stop this script from fetching unicodeData all the time ;-)
// Selects the escape syntax of the generated output: \uXXXX only (es5) or
// additionally \u{XXXXX} for code points above 0xFFFF (es6).
var es6mode = false; // If false, we use es5 syntax and omit some characters for now
// Location of UnicodeData.txt (Unicode 7.0.0).
//
// Format (one record per line, fields separated by ';'):
// 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
//
// (0) Code value;
// (1) Character name;
// (2) General Category;
// (3) Canonical Combining Classes;
// (4) Bidirectional Category;
// (5) Character Decomposition Mapping
// (6) Decimal digit value;
// (7) Digit value;
// (8) Numeric value;
// (9) Mirrored;
// (10) Unicode 1.0 name;
// (11) 10646 comment field;
// (12) Uppercase Mapping;
// (13) Lowercase Mapping;
// (14) Titlecase Mapping;
// \n
var unicodeDataUrl = "http://www.unicode.org/Public/7.0.0/ucd/UnicodeData.txt";
// Location of Scripts.txt (Unicode 7.0.0).
//
// Format:
// 0000..001F ; Common # Cc [32] <control-0000>..<control-001F>
//
// Values are separated by spaces
// (0) A character or 2 characters (representing a character range),
// separated by '..' followed by ;
// (1) Script type followed by # (comment)
// (2) Unicode category
// (3) Number of characters between [square brackets]
// Only occurs on lines with character ranges
// (4) name of character for each given code point
// \n
//
// Note: there are empty lines, and lines starting with # are comments
var unicodeScriptUrl = "http://www.unicode.org/Public/7.0.0/ucd/Scripts.txt";
// Note: ecmascript 5.1 requires unicode 3.0, but may use successors
// Note: ecmascript 6 requires unicode 5.1, but may use successors
//
// Maps the name of each generated character class to the Unicode general
// category abbreviations whose code points it is built from.
var UNICODE = {
space_separator: ["Zs"], // See section ecmascript 5.1 - section 7.2
letter: ["L&", "Lm", "Lo", "Nl"],
space_combining_mark: ["Mn", "Mc"],
digit: ["Nd"],
connector_punctuation: ["Pc"]
};
// Composite categories: each key names a category that, when it is absent from
// the parsed data, is meant to be assembled from the listed sub-categories.
var UNICODE_merge = {
"L&": ["Lu", "Ll", "Lt"],
};
/**
 * Download `url` over HTTP and pass the complete response body to `done`.
 *
 * @param {string} url - Address to fetch with http.get.
 * @param {function(string)} done - Called once with the whole body as text.
 * @param {function(Error)} [err] - Called on a request error or a non-2xx
 *     status. When omitted, the error is thrown.
 */
var getFromUrl = function(url, done, err) {
    "use strict";
    var onError = err || function(e) {
        throw e;
    };
    http.get(url, function(res) {
        if (res.statusCode < 200 || res.statusCode >= 300) {
            // Discard the body so the socket is released, then report the
            // failure instead of handing an error page to the parser.
            res.resume();
            onError(new Error("Request for " + url + " failed with status " + res.statusCode));
            return;
        }
        var data = "";
        // Decode as UTF-8 at the stream level so a multi-byte character that
        // is split across two chunks is not corrupted by the concatenation.
        res.setEncoding("utf8");
        res.on("data", function(chunk) {
            data += chunk;
        });
        res.on("end", function() {
            done(data);
        });
    }).on("error", onError);
};
/**
 * Append a code point or an inclusive [start, end] range to the end of a
 * group list, merging it with the last entry when the two are adjacent.
 *
 * The list holds plain numbers (single code points) and two-element arrays
 * (ranges). Entries are expected to be appended in ascending order; only the
 * last entry is ever inspected.
 *
 * @param {Array<number|number[]>} input - Group list, modified in place.
 * @param {number|number[]} addition - Code point or [start, end] range.
 *     Never modified and never stored by reference.
 */
var appendGroup = function(input, addition) {
    "use strict";
    var start, end;
    if (typeof addition === "number") {
        start = addition;
        end = addition;
    } else {
        start = addition[0];
        end = addition[1];
    }

    var lastIndex = input.length - 1;
    var last = input[lastIndex];

    if (typeof last === "number" && last === start - 1) {
        // Single code point directly before the addition: grow it into a range.
        input[lastIndex] = [last, end];
    } else if (Array.isArray(last) && last[1] === start - 1) {
        // Range ending directly before the addition: extend it.
        last[1] = end;
    } else {
        if (Array.isArray(last) && last[0] === last[1] - 1) {
            // A finished range of exactly two code points is stored as two
            // separate numbers instead.
            input[lastIndex] = last[0];
            input.push(last[0] + 1);
        }
        // Store a fresh array so later extensions of this entry cannot leak
        // back into the array the caller passed in.
        input.push(start === end ? start : [start, end]);
    }
};
/**
 * Sort a list of code points / [start, end] ranges by their first code point
 * and rebuild it through appendGroup so adjacent entries are merged.
 *
 * @param {Array<number|number[]>} a - Entries to regroup; sorted in place.
 * @returns {Array<number|number[]>} Newly built, merged group list.
 */
var regroup = function(a) {
    "use strict";
    // A range is ordered by its start, a single code point by itself.
    var lowerBound = function(entry) {
        return Array.isArray(entry) ? entry[0] : entry;
    };

    a.sort(function(left, right) {
        return lowerBound(left) - lowerBound(right);
    });

    var result = [];
    var index = 0;
    while (index < a.length) {
        appendGroup(result, a[index]);
        index += 1;
    }
    return result;
};
/**
 * Format a code point as the text of a unicode escape, prefixed with two
 * backslashes so it survives being embedded in a generated string literal.
 *
 * Values of up to four hex digits are zero-padded to four. Longer values are
 * wrapped in braces, which is only allowed when es6mode is on.
 *
 * @param {number} input - Code point to format.
 * @returns {string} Escape text such as `\\u0041`.
 * @throws {Error} When more than four hex digits are needed and es6mode is off.
 */
var intToUnicode = function(input) {
    "use strict";
    var hex = input.toString(16).toUpperCase();

    if (hex.length < 4) {
        // Left-pad with zeros up to four digits.
        hex = "0000".slice(hex.length) + hex;
    } else if (hex.length > 4) {
        if (!es6mode) {
            throw new Error("Unsupported syntax");
        }
        hex = "{" + hex + "}";
    }

    return "\\\\u" + hex;
};
/**
 * Fill in composite categories that are missing from `sets` by concatenating
 * the entries of their sub-categories.
 *
 * A composite category that is already present in `sets` is left untouched.
 * Sub-categories that are absent from `sets` contribute nothing.
 *
 * @param {Object<string, Array>} sets - Category name to entry list; modified
 *     in place.
 * @param {Object<string, string[]>} [mergeMap] - Composite category name to
 *     the sub-categories it is built from. Defaults to UNICODE_merge.
 * @returns {Object<string, Array>} The same `sets` object.
 */
var unicodeMerge = function(sets, mergeMap) {
    "use strict";
    var rules = mergeMap || UNICODE_merge;
    var target, parts, merged, j;

    for (target in rules) {
        if (!Object.prototype.hasOwnProperty.call(rules, target)) continue;
        if (sets[target] !== undefined) continue;

        parts = rules[target];
        merged = [];
        // Walk every listed sub-category (the previous bound of 0 never ran).
        for (j = 0; j < parts.length; j++) {
            if (sets[parts[j]] !== undefined) {
                merged = merged.concat(sets[parts[j]]);
            }
        }
        sets[target] = merged;
    }
    return sets;
};
// TODO apparently there are some codes in (5) Character Decomposition Mapping
// TODO maybe there are more things like that
/**
 * Group the code points listed in UnicodeData.txt by general category.
 *
 * Only field (0) code value, field (1) character name and field (2) general
 * category of each ';'-separated record are used. Lines with fewer than three
 * fields or a non-hexadecimal code value are skipped, so blank lines, CRLF
 * line endings and a missing final newline are all tolerated.
 *
 * A record named "<..., First>" immediately followed by one named
 * "<..., Last>" in the same category is treated as the inclusive range
 * between the two code values.
 *
 * @param {string} data - Full text of UnicodeData.txt.
 * @returns {Object<string, Array<number|number[]>>} Category abbreviation to
 *     a list of code points and [start, end] ranges, after unicodeMerge.
 */
var categorizeFromUnicodeData = function(data) {
    "use strict";
    var categories = {};
    var lines = data.split("\n");
    // Code value and category of the directly preceding "<..., First>" record.
    var pendingFirst = null;
    var fields, code, name, cat, entry;

    for (var i = 0; i < lines.length; i++) {
        fields = lines[i].split(";");
        if (fields.length < 3) continue;

        code = parseInt(fields[0], 16);
        if (isNaN(code)) continue;

        name = fields[1];
        cat = fields[2];
        entry = code;

        if (pendingFirst !== null &&
            /, Last>$/.test(name) &&
            pendingFirst.cat === cat &&
            code > pendingFirst.code
        ) {
            // The First record was already appended as a single code point;
            // appending the remainder lets appendGroup fuse both into a range.
            entry = [pendingFirst.code + 1, code];
        }
        pendingFirst = /, First>$/.test(name) ? {code: code, cat: cat} : null;

        if (!categories[cat]) {
            categories[cat] = [];
        }
        appendGroup(categories[cat], entry);
    }

    return unicodeMerge(categories);
};
/**
 * Group the code points listed in Scripts.txt by the general category that
 * is given at the start of each line's trailing comment.
 *
 * A data line looks like
 *   0000..001F ; Common # Cc [32] <control-0000>..<control-001F>
 * and contributes either a single code point or a [start, end] range.
 * Blank lines, comment lines and lines that do not match this shape are
 * skipped, so CRLF line endings and a missing final newline are tolerated.
 *
 * @param {string} data - Full text of Scripts.txt.
 * @returns {Object<string, Array<number|number[]>>} Category abbreviation to
 *     a regrouped list of code points and ranges, after unicodeMerge.
 */
var categorizeFromUnicodeScript = function(data) {
    "use strict";
    var categories = {};
    // 1: first code point, 2: optional last code point, 3: category.
    var linePattern = /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;[^#]*#\s*(\S+)/;
    var lines = data.split("\n");
    var match, first, entry, cat, key;

    // Only need character range and general category after comment
    for (var i = 0; i < lines.length; i++) {
        match = linePattern.exec(lines[i]);
        if (match === null) continue;

        first = parseInt(match[1], 16);
        entry = match[2] === undefined ? first : [first, parseInt(match[2], 16)];
        cat = match[3];

        if (categories[cat]) {
            categories[cat].push(entry);
        } else {
            categories[cat] = [entry];
        }
    }

    // The file is ordered by script, not by code point, so every category
    // has to be sorted and merged afterwards.
    for (key in categories) {
        categories[key] = regroup(categories[key]);
    }

    return unicodeMerge(categories);
};
/**
 * Build the source text of a `var UNICODE = {...};` declaration holding one
 * character-class RegExp per entry of the UNICODE table.
 *
 * For each entry, the code points of all its listed categories are collected
 * from `result`, regrouped, and written as escapes. Ranges are written as
 * `start-end`. When es6mode is off, entries reaching above 0xFFFF are skipped
 * and counted; when it is on, the RegExp is created with the "u" flag so the
 * `\u{...}` escapes are understood.
 *
 * @param {Object<string, Array<number|number[]>>} result - Category
 *     abbreviation to list of code points and [start, end] ranges.
 * @returns {string} Generated JavaScript source.
 */
var generate = function(result) {
    "use strict";
    var output = {}, code = "", skipped = 0;
    var name, wanted, entry, j;

    // Create sets
    for (name in UNICODE) {
        wanted = UNICODE[name];
        output[name] = [];
        for (j = 0; j < wanted.length; j++) {
            if (!(wanted[j] in result)) {
                console.log("Can not find category " + wanted[j] + " from " + name + " in categories " + Object.keys(result));
                continue;
            }
            output[name] = output[name].concat(result[wanted[j]]);
        }
        output[name] = regroup(output[name]);
    }

    // Generate code
    code += "var UNICODE = {\n";
    for (name in output) {
        code += " " + name + ': new RegExp("[';
        for (j = 0; j < output[name].length; j++) {
            entry = output[name][j];
            if (typeof entry === "number") {
                // Note that es 6 has support till \u{10FFFF}, es 5 can only reach \uFFFF
                if (!es6mode && entry > 0xffff) {
                    skipped++;
                    continue;
                }
                code += intToUnicode(entry);
            } else {
                if (!es6mode && (entry[0] > 0xffff || entry[1] > 0xffff)) {
                    skipped++;
                    continue;
                }
                // The "-" makes this a range inside the character class
                // rather than just its two end points.
                code += intToUnicode(entry[0]) + "-" + intToUnicode(entry[1]);
            }
        }
        // \u{...} escapes are only recognised by a RegExp with the "u" flag.
        code += es6mode ? ']", "u"),\n' : ']"),\n';
    }
    code += "};";

    if (!es6mode) {
        console.log("Had to skip " + skipped + " character groups because they exceeded 0xffff - no way to put them in a regex in es5");
    }
    return code;
};
/**
 * Entry point: obtain UnicodeData.txt and Scripts.txt (from disk when
 * useLocal is set, otherwise by download), store them in the "unicode"
 * directory next to this script, and write the category JSON and the
 * generated regex source derived from the script data.
 */
var exec = function() {
    "use strict";
    var appDir = path.join(__dirname, "./unicode");
    var unicodeData;
    var unicodeScript;

    // Resolve a file name inside the output directory.
    var inAppDir = function(fileName) {
        return path.join(appDir, fileName);
    };

    // UnicodeData.txt is only stored; generating categories from it is
    // currently disabled in favour of the Scripts.txt based path below.
    var onUnicodeData = function(data) {
        fs.writeFileSync(inAppDir("./UnicodeData.txt"), data);
        unicodeData = data;
    };

    var onUnicodeScript = function(data) {
        fs.writeFileSync(inAppDir("./Script.txt"), data);
        unicodeScript = data;

        var categories = categorizeFromUnicodeScript(data);
        fs.writeFileSync(inAppDir("./UnicodeCategories.json"), JSON.stringify(categories));
        fs.writeFileSync(inAppDir("./unicode-ecmascript.js"), generate(categories));
    };

    if (!fs.existsSync(appDir)) {
        fs.mkdirSync(appDir);
    }

    if (!useLocal) {
        getFromUrl(unicodeDataUrl, onUnicodeData);
        getFromUrl(unicodeScriptUrl, onUnicodeScript);
        return;
    }

    unicodeData = fs.readFileSync(inAppDir("./UnicodeData.txt"), {encoding: 'utf8'});
    onUnicodeData(unicodeData);
    unicodeScript = fs.readFileSync(inAppDir("./Script.txt"), {encoding: 'utf8'});
    onUnicodeScript(unicodeScript);
};
// Run the script as soon as this file is loaded.
exec();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment