Last active
August 29, 2015 14:13
-
-
Save avdg/60ac8f3739d70bf68f9b to your computer and use it in GitHub Desktop.
Experimental script to fetch Unicode character data for ECMAScript 5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs');
var http = require('http');
var https = require('https');
var path = require('path');
// When true, the cached copies inside ./unicode are read instead of fetching
// the Unicode data files again on every run ;-)
var useLocal = false;
// When false, ES5 syntax is generated and code points above 0xFFFF are omitted for now
var es6mode = false;
// Format of UnicodeData.txt, one character per line, fields separated by ';':
//   0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
//
//  (0) Code value;
//  (1) Character name;
//  (2) General Category;
//  (3) Canonical Combining Classes;
//  (4) Bidirectional Category;
//  (5) Character Decomposition Mapping
//  (6) Decimal digit value;
//  (7) Digit value;
//  (8) Numeric value;
//  (9) Mirrored;
// (10) Unicode 1.0 name;
// (11) 10646 comment field;
// (12) Uppercase Mapping;
// (13) Lowercase Mapping;
// (14) Titlecase Mapping;
// \n
var unicodeDataUrl = "http://www.unicode.org/Public/7.0.0/ucd/UnicodeData.txt";
// Format of Scripts.txt:
//   0000..001F ; Common # Cc [32] <control-0000>..<control-001F>
//
// Values are separated by spaces
// (0) A character or 2 characters (representing a character range),
//     separated by '..' followed by ;
// (1) Script type followed by # (comment)
// (2) Unicode category
// (3) Number of characters between [square brackets]
//     Only occurs on lines with character ranges
// (4) Name of character for each given code point
// \n
//
// Note: there are empty lines, and lines starting with # are comments
var unicodeScriptUrl = "http://www.unicode.org/Public/7.0.0/ucd/Scripts.txt";
// Note: ECMAScript 5.1 requires Unicode 3.0, but may use successors
// Note: ECMAScript 6 requires Unicode 5.1, but may use successors
//
// Maps the name of each generated character class to the Unicode general
// categories whose characters it is built from.
var UNICODE = {
    space_separator: ["Zs"], // See ECMAScript 5.1 - section 7.2
    letter: ["L&", "Lm", "Lo", "Nl"],
    space_combining_mark: ["Mn", "Mc"],
    digit: ["Nd"],
    connector_punctuation: ["Pc"]
};
// Composite categories: each key is assembled from the listed general
// categories when the parsed data does not provide the key itself.
var UNICODE_merge = {
    "L&": ["Lu", "Ll", "Lt"]
};
// Downloads a text resource and hands the complete body to a callback.
//
// url  - http:// or https:// address to fetch
// done - called once with the whole body decoded as UTF-8
// err  - optional; called with an Error on connection failures and on
//        non-2xx responses. When omitted, the error is thrown.
var getFromUrl = function(url, done, err) {
    "use strict";
    var fail = err || function(e) {
        throw e;
    };
    // Pick the client matching the URL scheme so https:// addresses work too
    var client = /^https:/i.test(url) ? https : http;

    client.get(url, function(res) {
        if (res.statusCode < 200 || res.statusCode >= 300) {
            // Drain the body so the socket is released, then report the
            // failure instead of passing an error page on as data
            res.resume();
            fail(new Error("Request for " + url + " failed with status " + res.statusCode));
            return;
        }

        var data = "";
        // Let the stream do the decoding: converting each Buffer chunk on its
        // own corrupts a multi-byte character that is split across two chunks
        res.setEncoding("utf8");
        res.on("data", function(chunk) {
            data += chunk;
        });
        res.on("end", function() {
            done(data);
        });
    }).on("error", fail);
};
// Appends a code point or a [first, last] range to the end of `input`,
// merging it with the last entry when the two are directly adjacent.
//
// input    - array of numbers and [first, last] pairs; modified in place.
//            Callers are expected to add entries in ascending order, only
//            the last entry is ever looked at.
// addition - a single code point (number) or a [first, last] pair. It is
//            never stored by reference, so later merges cannot alter it.
//
// When a non-adjacent entry arrives and the last entry is a range of exactly
// two code points, that range is split back into two single numbers.
var appendGroup = function(input, addition) {
    "use strict";
    var isSingle = typeof addition === "number";
    var lo = isSingle ? addition : addition[0];
    var hi = isSingle ? addition : addition[1];
    var lastIndex = input.length - 1;
    var tail = input[lastIndex]; // undefined when input is empty

    // Single code point directly before the addition: grow it into a range
    if (typeof tail === "number" && tail === lo - 1) {
        input[lastIndex] = [tail, hi];
        return;
    }

    // Range ending directly before the addition: extend it
    if (Array.isArray(tail) && tail[1] === lo - 1) {
        tail[1] = hi;
        return;
    }

    // Not adjacent. A finished range of only two code points is stored as
    // two separate numbers instead.
    if (Array.isArray(tail) && tail[0] === tail[1] - 1) {
        input[lastIndex] = tail[0];
        input.push(tail[0] + 1);
    }

    // Store a fresh pair: extending it later must not modify the caller's array
    input.push(lo === hi ? lo : [lo, hi]);
};
// Builds a sorted, compacted list out of an unordered mix of code points and
// [first, last] ranges.
//
// a - array of numbers and [first, last] pairs; left untouched (a copy is
//     sorted, not the array itself)
//
// Returns a new array in which the entries are ordered by their first code
// point and adjacent entries have been merged by appendGroup.
var regroup = function(a) {
    "use strict";
    var startOf = function(entry) {
        return Array.isArray(entry) ? entry[0] : entry;
    };
    // Array.prototype.sort works in place, so sort a copy
    var ordered = a.slice().sort(function(left, right) {
        return startOf(left) - startOf(right);
    });

    var result = [];
    for (var i = 0; i < ordered.length; i++) {
        appendGroup(result, ordered[i]);
    }
    return result;
};
// Formats a code point as a unicode escape for the generated source text.
//
// input - code point as a number
//
// Returns two backslashes followed by "u" and the upper case hex value,
// padded to four digits. The backslash is doubled because the escape ends up
// inside a string literal of the generated code. Values that need more than
// four hex digits use the braced form and are only allowed when es6mode is
// set; otherwise an Error is thrown.
var intToUnicode = function(input) {
    "use strict";
    var hex = input.toString(16).toUpperCase();

    if (hex.length > 4) {
        if (!es6mode) {
            throw new Error("Unsupported syntax");
        }
        return "\\\\u{" + hex + "}";
    }

    while (hex.length < 4) {
        hex = "0" + hex;
    }
    return "\\\\u" + hex;
};
// Adds the composite categories (such as "L&") to a set of parsed categories.
//
// sets       - object mapping category name to an array of code points and
//              [first, last] ranges; modified in place
// mergeTable - optional; maps a composite name to the categories it consists
//              of. Defaults to UNICODE_merge.
//
// A composite that is already present in `sets` is left alone. Source
// categories missing from `sets` are skipped. The parts are concatenated in
// table order and are not re-sorted here.
//
// Returns `sets`.
var unicodeMerge = function(sets, mergeTable) {
    "use strict";
    var table = mergeTable || UNICODE_merge;

    for (var name in table) {
        if (!Object.prototype.hasOwnProperty.call(table, name)) continue;
        if (sets[name] !== undefined) continue;

        var parts = table[name];
        var merged = [];
        for (var j = 0; j < parts.length; j++) {
            if (sets[parts[j]] !== undefined) {
                merged = merged.concat(sets[parts[j]]);
            }
        }
        sets[name] = merged;
    }
    return sets;
};
// TODO apparently there are some codes in (5) Character Decomposition Mapping
// TODO maybe there are more things like that
//
// Groups the code points of a UnicodeData.txt file by general category.
//
// data - complete text of UnicodeData.txt; only field (0) code value,
//        (1) character name and (2) general category of each line are used
//
// Lines with fewer than three fields or without a hexadecimal code value are
// ignored, so blank lines and a missing final newline are harmless. A pair of
// lines named "<..., First>" and "<..., Last>" stands for every code point in
// between and is stored as one [first, last] range.
//
// Returns an object mapping category name to an array of code points and
// [first, last] ranges, including the composites added by unicodeMerge.
var categorizeFromUnicodeData = function(data) {
    "use strict";
    var categories = {};
    var pendingFirst = null; // { code, cat } of a "<..., First>" line awaiting its "Last"

    var add = function(cat, entry) {
        if (!categories[cat]) {
            categories[cat] = [];
        }
        appendGroup(categories[cat], entry);
    };

    var lines = data.split(/\r?\n/);
    for (var i = 0; i < lines.length; i++) {
        var fields = lines[i].split(";");
        if (fields.length < 3) continue;

        var code = parseInt(fields[0], 16);
        if (isNaN(code)) continue;

        var name = fields[1];
        var cat = fields[2];

        if (pendingFirst !== null) {
            var first = pendingFirst;
            pendingFirst = null;
            if (/, Last>$/.test(name) && cat === first.cat) {
                add(cat, [first.code, code]);
                continue;
            }
            // No matching "Last" line followed: keep the start as a single character
            add(first.cat, first.code);
        }

        if (/, First>$/.test(name)) {
            pendingFirst = { code: code, cat: cat };
            continue;
        }

        add(cat, code);
    }

    if (pendingFirst !== null) {
        add(pendingFirst.cat, pendingFirst.code);
    }

    return unicodeMerge(categories);
};
// Groups the code points of a Scripts.txt file by general category.
//
// data - complete text of Scripts.txt. Data lines look like
//          0000..001F ; Common # Cc [32] <control-0000>..<control-001F>
//        Only the code point (or range) at the start of the line and the
//        category following the '#' are used.
//
// Lines that do not match this layout (blank lines, comment lines) are
// skipped, so a missing final newline or CRLF line endings are harmless.
//
// Returns an object mapping category name to a sorted array of code points
// and [first, last] ranges, including the composites added by unicodeMerge.
var categorizeFromUnicodeScript = function(data) {
    "use strict";
    var categories = {};
    // <code>[..<code>] ; <script> # <category> ...
    var entryPattern = /^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;[^#]*#\s*(\S+)/;

    var lines = data.split(/\r?\n/);
    for (var i = 0; i < lines.length; i++) {
        var match = entryPattern.exec(lines[i]);
        if (match === null) continue;

        var first = parseInt(match[1], 16);
        var entry = match[2] === undefined ? first : [first, parseInt(match[2], 16)];
        var cat = match[3];

        if (categories[cat]) {
            categories[cat].push(entry);
        } else {
            categories[cat] = [entry];
        }
    }

    // The file is ordered by script, not by code point: sort and compact
    for (var name in categories) {
        categories[name] = regroup(categories[name]);
    }

    return unicodeMerge(categories);
};
// Generates the source text of a `var UNICODE = {...};` declaration holding
// one RegExp character class per entry of the UNICODE table.
//
// result - object mapping category name to an array of code points and
//          [first, last] ranges (as returned by the categorize functions)
//
// Categories listed in UNICODE but absent from `result` are reported with
// console.log and left out. Without es6mode, every entry reaching above
// 0xFFFF is skipped and the number of skipped entries is logged.
//
// Returns the generated code as a string.
var generate = function(result) {
    "use strict";
    var output = {};
    var skipped = 0;
    var name, j, entry;

    // Create sets
    for (name in UNICODE) {
        output[name] = [];
        for (j = 0; j < UNICODE[name].length; j++) {
            if (!(UNICODE[name][j] in result)) {
                console.log("Can not find category " + UNICODE[name][j] + " from " + name + " in categories " + Object.keys(result));
                continue;
            }
            output[name] = output[name].concat(result[UNICODE[name][j]]);
        }
        output[name] = regroup(output[name]);
    }

    // Generate code
    var code = "var UNICODE = {\n";
    for (name in output) {
        code += " " + name + ': new RegExp("[';
        for (j = 0; j < output[name].length; j++) {
            entry = output[name][j];
            if (typeof entry === "number") {
                // Note that es 6 has support till \u{10FFFF}, es 5 can only reach \uFFFF
                if (!es6mode && entry > 0xffff) {
                    skipped++;
                    continue;
                }
                code += intToUnicode(entry);
            } else {
                if (!es6mode && (entry[0] > 0xffff || entry[1] > 0xffff)) {
                    skipped++;
                    continue;
                }
                // Inside a character class a range needs '-' between its
                // endpoints, otherwise only the two endpoints would match
                code += intToUnicode(entry[0]) + "-" + intToUnicode(entry[1]);
            }
        }
        // The braced \u{...} escapes are only understood with the "u" flag
        code += es6mode ? ']", "u"),\n' : ']"),\n';
    }
    code += "};";

    if (!es6mode) {
        console.log("Had to skip " + skipped + " character groups because they exceeded 0xffff - no way to put them in a regex in es5");
    }
    return code;
};
// Entry point: obtains the Unicode data files, caches them in ./unicode next
// to this script and writes the generated output into the same directory.
//
// With useLocal set, the cached Script.txt is processed and nothing is
// downloaded; otherwise both files are fetched and cached first.
//
// Output files: UnicodeCategories.json (parsed categories) and
// unicode-ecmascript.js (generated code). Both are currently derived from
// the script file only; UnicodeData.txt is merely cached.
var exec = function() {
    "use strict";
    var appDir = path.join(__dirname, "./unicode");
    var unicodeDataFile = path.join(appDir, "./UnicodeData.txt");
    var unicodeScriptFile = path.join(appDir, "./Script.txt");

    var onUnicodeData = function(data) {
        fs.writeFileSync(unicodeDataFile, data);
    };

    var processUnicodeScript = function(data) {
        var result = categorizeFromUnicodeScript(data);
        fs.writeFileSync(path.join(appDir, "./UnicodeCategories.json"), JSON.stringify(result));

        var code = generate(result);
        fs.writeFileSync(path.join(appDir, "./unicode-ecmascript.js"), code);
    };

    var onUnicodeScript = function(data) {
        fs.writeFileSync(unicodeScriptFile, data);
        processUnicodeScript(data);
    };

    if (!fs.existsSync(appDir)) {
        fs.mkdirSync(appDir);
    }

    if (useLocal) {
        // The cache is the source here, so there is nothing to write back
        processUnicodeScript(fs.readFileSync(unicodeScriptFile, {encoding: 'utf8'}));
    } else {
        getFromUrl(unicodeDataUrl, onUnicodeData);
        getFromUrl(unicodeScriptUrl, onUnicodeScript);
    }
};

exec();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment