Last active
April 22, 2025 16:44
-
-
Save cafread/ab38c11d05a01d8669dea17f23735f33 to your computer and use it in GitHub Desktop.
Basic text cleansing to get rid of en dashes, unusual whitespace and similar
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Fixture table: "i" is the raw input handed to deEvilize, "o" is the output it is expected to produce.
// Non-ASCII input is spelled as an escape so the exact code point is visible and survives copy/paste.
const tests = [
  {"i": "", "o": ""},
  {"i": "abc ", "o": "abc"},
  {"i": " c", "o": "c"},
  {"i": "'x'", "o": "x"},
  {"i": "A\u2013B", "o": "A-B"}, // U+2013 EN DASH
  {"i": "hmm;", "o": "hmm;"},
  {"i": "x x", "o": "x x"},
  {"i": "abc'` '", "o": "abc"},
];
/**
 * Look-alike characters mapped to the plain character they should become.
 * Keys are written as escapes where the code point is known, because a raw
 * U+037E is folded into an ordinary ";" by Unicode (NFC) normalisation and
 * would silently turn the entry into a no-op.
 */
const specialTrouble = {
  "\u037E": ";", // GREEK QUESTION MARK -> semicolon
  // NOTE(review): as displayed this maps an underscore to itself; the key is
  // presumably a look-alike of "_" - confirm the code point and spell it as an escape.
  "_": "_",
};

// One alternation over every key of specialTrouble. Keys are escaped so that a
// future entry such as "." or "|" is matched literally instead of as regex syntax.
const reg = new RegExp(
  Object.keys(specialTrouble)
    .map((key) => key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
    .join("|"),
  "g"
);

/**
 * Normalise "evil" text: collapse whitespace, trim junk from both ends, turn
 * every dash variant into "-" and swap known look-alike characters.
 *
 * @param {string} [str=""] Text to clean. Omitting it (or passing undefined) yields "".
 * @returns {string} The cleaned text.
 */
function deEvilize(str = "") {
  // Dash punctuation (Unicode category Pd, BMP members) plus the ASCII hyphen.
  // Written as escapes on purpose: between raw literals a stray "-" silently
  // forms a range and would swallow thousands of unrelated characters.
  // https://www.fileformat.info/info/unicode/category/Pd/list.htm
  const dashRuns = /[\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u2E5D\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D-]+/g;

  return str
    // Tabs, new lines, non-breaking / en / em spaces etc. become one regular space.
    // \s does not cover U+200B ZERO WIDTH SPACE, so it is listed explicitly.
    .replace(/[\s\u200B]+/g, " ")
    // Trim trailing, then leading, whitespace, commas, stops and quotes.
    .replace(/[\s`'",.]+$/, "")
    .replace(/^[\s`'",.]+/, "")
    // Every run of dashes, anywhere in the string, becomes a single hyphen.
    .replace(dashRuns, "-")
    // Special trouble characters, e.g. the Greek question mark.
    .replace(reg, (mat) => specialTrouble[mat]);
}
// Smoke test: for each fixture log the expected output, the actual output and whether they match.
for (const t of tests) {
  // Run the cleaner once and reuse the result for both the printout and the comparison.
  const actual = deEvilize(t.i);
  console.log(t.o, actual, t.o === actual);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment