Skip to content

Instantly share code, notes, and snippets.

@mscalora
Last active January 13, 2022 13:09
Show Gist options
  • Save mscalora/71ef68b804dd523c867087fe2300dd76 to your computer and use it in GitHub Desktop.
Save mscalora/71ef68b804dd523c867087fe2300dd76 to your computer and use it in GitHub Desktop.
#!/usr/bin/env node
// Credit to Lenny Domnitser for the logic of this tool, see: https://domnit.org/blog/2007/07/fix-encoding.html
//
// This tool fixes encoding errors in text files. In my case a SQL backup file contained latin1 character
// sequences that appeared like: The “magic” of ’THings’
//
// Usage: fix_encodings.js <in-file> [<out-file>]
//
// Lookup table: Unicode character -> the single byte (as a one-char string)
// that Windows-1252 assigns to it in the 0x80-0x9F range. Bytes 0x81, 0x8D,
// 0x8F, 0x90 and 0x9D have no Windows-1252 character and are therefore absent.
const win2byte = {
  '\u20AC': '\x80', // euro sign
  '\u201A': '\x82', // single low-9 quotation mark
  '\u0192': '\x83', // latin small letter f with hook
  '\u201E': '\x84', // double low-9 quotation mark
  '\u2026': '\x85', // horizontal ellipsis
  '\u2020': '\x86', // dagger
  '\u2021': '\x87', // double dagger
  '\u02C6': '\x88', // modifier letter circumflex accent
  '\u2030': '\x89', // per mille sign
  '\u0160': '\x8A', // latin capital letter S with caron
  '\u2039': '\x8B', // single left-pointing angle quotation mark
  '\u0152': '\x8C', // latin capital ligature OE
  '\u017D': '\x8E', // latin capital letter Z with caron
  '\u2018': '\x91', // left single quotation mark
  '\u2019': '\x92', // right single quotation mark
  '\u201C': '\x93', // left double quotation mark
  '\u201D': '\x94', // right double quotation mark
  '\u2022': '\x95', // bullet
  '\u2013': '\x96', // en dash
  '\u2014': '\x97', // em dash
  '\u02DC': '\x98', // small tilde
  '\u2122': '\x99', // trade mark sign
  '\u0161': '\x9A', // latin small letter s with caron
  '\u203A': '\x9B', // single right-pointing angle quotation mark
  '\u0153': '\x9C', // latin small ligature oe
  '\u017E': '\x9E', // latin small letter z with caron
  '\u0178': '\x9F', // latin capital letter Y with diaeresis
};
/**
 * Translate one character to its byte equivalent using the win2byte table.
 *
 * @param {string} s - A single character.
 * @returns {string} The table entry for `s` when one exists, otherwise `s`
 *   itself unchanged.
 */
function getbyte(s) {
  const mapped = win2byte[s];
  if (mapped) {
    return mapped;
  }
  return s;
}
/**
 * Map `pred` over the keys of `list` and collect the results, in key order.
 *
 * For a plain object the keys are its property names; for a string they are
 * the index strings '0', '1', ... Only the value's OWN enumerable keys are
 * visited, so enumerable properties added to a prototype elsewhere in the
 * process cannot leak into the result (a bare `for...in` would include them).
 *
 * @param {Object|string|null|undefined} list - Value whose keys are enumerated.
 * @param {function(string): *} pred - Called with each key (and nothing else).
 * @returns {Array} One entry per key; empty when `list` is null or undefined.
 */
function comp_in(list, pred) {
  if (list == null) {
    // `for...in` over null/undefined is a no-op, whereas Object.keys would throw.
    return [];
  }
  // Wrap pred so it receives only the key, not map()'s index/array arguments.
  return Object.keys(list).map((key) => pred(key));
}
// `codes` is a regex fragment for one trailing position of a sequence: either
// a character in the range \x80-\xBF or any character that is a key of win2byte.
const codes = `(?:[\\x80-\\xBF]|${comp_in(win2byte, (code) => code).join('|')})`;

// `pat` matches a lead character followed by the number of trailing positions
// its range calls for: \xC2-\xDF + 1, \xE0-\xEF + 2, \xF0-\xF4 + 3.
const pat = new RegExp(
  [
    `[\\xC2-\\xDF]${codes}`,
    `[\\xE0-\\xEF]${codes}{2}`,
    `[\\xF0-\\xF4]${codes}{3}`,
  ].join('|'),
  'g'
);
/**
 * Replacement callback for `String.prototype.replace`: turn one mis-decoded
 * sequence back into the character it was meant to be.
 *
 * The first character is kept as-is and every following character is passed
 * through getbyte(). The result is then percent-encoded with escape() and
 * read back as UTF-8 by decodeURIComponent().
 *
 * @param {string} s - The matched text (extra arguments that replace() passes
 *   are ignored).
 * @returns {string} The decoded text, or `s` unchanged when the translated
 *   sequence is not valid UTF-8.
 */
function sub(s) {
  const bytes = s[0] + s.slice(1).split('').map((ch) => getbyte(ch)).join('');
  try {
    return decodeURIComponent(escape(bytes));
  } catch (err) {
    // A match can have the right shape and still not be valid UTF-8 (for
    // example an encoded surrogate). decodeURIComponent throws URIError for
    // those; leave the text untouched instead of aborting the whole run.
    if (err instanceof URIError) {
      return s;
    }
    throw err;
  }
}
function fix(s) {
s = s.replace(pat, sub);
return s;
}
const fs = require('fs');

// ANSI escape sequences used to colour the status messages written to stderr.
const Reset = '\x1b[0m';
const FgRed = '\x1b[31m';
const FgGreen = '\x1b[32m';
const FgCyan = '\x1b[36m';

/**
 * Print a red "Error: ..." line to stderr and terminate with exit status 1.
 *
 * @param {string} message - Text shown after the "Error: " prefix.
 */
function fail(message) {
  process.stderr.write(`${FgRed}Error: ${message}\n${Reset}`);
  process.exit(1);
}

/**
 * Command-line entry point.
 *
 * Reads the file named by the first argument as UTF-8, passes its content
 * through fix(), and writes the result to the file named by the second
 * argument, or to stdout when no second argument is given. Progress and
 * errors go to stderr; every error path exits with status 1.
 */
function main() {
  const infile = process.argv[2];
  const outfile = process.argv[3];

  if (!infile) {
    fail('expecting input file name');
  }
  if (!fs.existsSync(infile)) {
    fail(`input file "${infile}" does not exist`);
  }

  let content = '';
  try {
    content = fs.readFileSync(infile, 'utf8');
  } catch (err) {
    // The path exists but cannot be read as a file (directory, permissions, ...).
    fail(`unable to read input file ${infile}: ${err.message}`);
  }
  if (content.length === 0) {
    // Empty input is still rejected, but with a message that says why.
    fail(`input file "${infile}" is empty`);
  }
  process.stderr.write(`${FgCyan}Read ${content.length} characters\n${Reset}`);

  const fixed = fix(content);

  if (outfile) {
    try {
      fs.writeFileSync(outfile, fixed, 'utf8');
    } catch (err) {
      fail(`unable to write output file "${outfile}": ${err.message}`);
    }
  } else {
    process.stdout.write(fixed);
  }

  // End with a newline and a colour reset so the terminal is left clean.
  process.stderr.write(`${FgGreen}Success: ${fixed.length} characters written\n${Reset}`);
}

main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment