Skip to content

Instantly share code, notes, and snippets.

@ArtemAvramenko
Last active June 16, 2024 11:32
Show Gist options
  • Save ArtemAvramenko/6a1ccb2d555cbff1ab555af2a96fdef2 to your computer and use it in GitHub Desktop.
Save ArtemAvramenko/6a1ccb2d555cbff1ab555af2a96fdef2 to your computer and use it in GitHub Desktop.
JavaScript code to read Unicode text from a file in the browser
/**
 * Reads a file with FileReader and decodes its bytes as Unicode text.
 *
 * The encoding is selected from the leading byte order mark (BOM):
 * `FE FF` -> UTF-16BE, `FF FE` -> UTF-16LE, anything else -> UTF-8.
 * Trailing NUL characters are removed from the decoded text. The text is
 * rejected when it still contains a NUL, or, for input that does not start
 * with the UTF-8 BOM (`EF BB BF`) or a UTF-16 BOM, when it contains U+FFFD
 * (the replacement character the decoder emits for invalid byte sequences).
 * Input that starts with a BOM is not checked for U+FFFD.
 *
 * Size, read and decoding problems are reported through the `error` property
 * of the resolved value rather than by rejecting the promise.
 *
 * @param {File|Blob} file - File to read; `size` and `name` are used.
 * @param {number} [maxMegabytes=1] - Maximum accepted size in MiB (1 MiB = 0x100000 bytes).
 * @returns {Promise<{text: string, filename: string}|{error: string}>}
 *   The decoded text and the file name, or a human-readable error message.
 */
function readTextFromFile(file, maxMegabytes = 1) {
  const reader = new FileReader();
  return new Promise((resolve) => {
    // Reject missing or zero-length files before touching the reader.
    if (!file || !file.size) {
      resolve({ error: `The file cannot be empty` });
      return;
    }
    if (file.size > maxMegabytes * 0x100000) {
      resolve({ error: `The file size should not exceed ${maxMegabytes} MB` });
      return;
    }

    const filename = file.name;

    reader.onerror = () => {
      resolve({ error: 'Cannot read a file from a specified location' });
    };

    reader.onload = (e) => {
      try {
        const data = new Uint8Array(e.target.result);

        // https://en.wikipedia.org/wiki/Byte_order_mark
        let encoding = 'utf-8';
        let checkMojibakes = false;
        if (data[0] === 0xFE && data[1] === 0xFF) {
          encoding = 'utf-16be';
        } else if (data[0] === 0xFF && data[1] === 0xFE) {
          // 'utf-16' is the Encoding Standard label for UTF-16LE.
          encoding = 'utf-16';
        } else {
          // Without a UTF-8 BOM the encoding is only a guess, so the decoded
          // text is checked for replacement characters below.
          checkMojibakes = data[0] !== 0xEF || data[1] !== 0xBB || data[2] !== 0xBF;
        }

        const decoded = new TextDecoder(encoding).decode(data);

        // Strip trailing NUL terminators with a linear scan from the end.
        // A regex such as /\0+$/ backtracks quadratically on long runs of
        // NULs that are not at the end of the text (e.g. binary files).
        let end = decoded.length;
        while (end > 0 && decoded.charCodeAt(end - 1) === 0) {
          end -= 1;
        }
        const text = decoded.slice(0, end);

        // Any remaining NUL, or a replacement character in BOM-less input,
        // means the bytes were not text in a supported encoding.
        const hasNull = text.includes('\0');
        const hasMojibake = checkMojibakes && text.includes('\uFFFD');
        if (hasNull || hasMojibake) {
          resolve({ error: 'The file must use UTF-8 or UTF-16 encoding' });
        } else {
          resolve({ text, filename });
        }
      } catch (err) {
        resolve({ error: err.toString() });
      }
    };

    try {
      reader.readAsArrayBuffer(file);
    } catch (err) {
      resolve({ error: err.toString() });
    }
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment