Skip to content

Instantly share code, notes, and snippets.

@sfengyuan
Last active August 7, 2022 15:35
Show Gist options
  • Save sfengyuan/473e626dff10914ea274e4b6efa6e6f5 to your computer and use it in GitHub Desktop.
Save sfengyuan/473e626dff10914ea274e4b6efa6e6f5 to your computer and use it in GitHub Desktop.
clean bad Unicode
/*
date: 2022/8/7 23:34
*/
// Space-like code points that get normalised to a plain U+0020 SPACE.
// Kept as a commented list so every entry stays identifiable. None of these
// characters is special inside a regex character class, so the list can be
// joined verbatim into the pattern below.
const GARBAGE_SPACES = [
  '\u00A0', // no-break space
  '\u3000', // ideographic space
  '\u2000', // en quad
  '\u2001', // em quad
  '\u2002', // en space
  '\u2003', // em space
  '\u2004', // three-per-em space
  '\u2005', // four-per-em space
  '\u2006', // six-per-em space
  '\u2007', // figure space
  '\u2008', // punctuation space
  '\u2009', // thin space
  '\u200A', // hair space
  '\u200B', // zero width space
  '\u202F', // narrow no-break space
  '\u205F', // medium mathematical space
  '\u303F', // ideographic half fill space
  '\uFEFF', // zero width no-break space (BOM)
  '\u{E0020}' // tag space
]

// Single character class matching any entry of GARBAGE_SPACES. The `u` flag is
// required so the astral U+E0020 is matched as one code point instead of as
// two independent surrogate code units.
const GARBAGE_SPACE_PATTERN = new RegExp(`[${GARBAGE_SPACES.join('')}]`, 'gu')

// The fullwidth forms U+FF01..U+FF5E mirror printable ASCII U+0021..U+007E in
// the same order, so the ASCII twin is always the code unit minus this offset.
const FULLWIDTH_ASCII_PATTERN = /[\uFF01-\uFF5E]/g
const FULLWIDTH_TO_ASCII_OFFSET = 0xFEE0

/**
 * Clean "bad" Unicode out of a string.
 *
 * - Every exotic space listed in GARBAGE_SPACES (no-break, ideographic,
 *   en/em variants, zero-width, tag space, ...) becomes a regular space.
 * - Every fullwidth ASCII variant (U+FF01..U+FF5E) becomes its ordinary
 *   ASCII counterpart (U+0021..U+007E).
 *
 * All other characters are returned untouched. The lookup tables are built
 * once at module load and the input is scanned twice in total, instead of
 * once per table entry.
 *
 * @param {string} str - Text to normalise.
 * @returns {string} A new string with the replacements applied.
 * @throws {TypeError} If `str` is null/undefined or has no `replaceAll` method.
 */
export default function cleanBadUnicode (str) {
  return str
    .replaceAll(GARBAGE_SPACE_PATTERN, ' ')
    // A replacer function (not a string) is used so that characters such as
    // "$" can never be interpreted as replacement patterns.
    .replaceAll(FULLWIDTH_ASCII_PATTERN, (ch) =>
      String.fromCharCode(ch.charCodeAt(0) - FULLWIDTH_TO_ASCII_OFFSET))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment