Replace common non-typeable characters with typeable versions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Name: StringCleanse
Author: SweptThrone (sweptthr.one)
Description: Replaces non-typeable characters in a provided string with typeable versions.
Returns: The cleansed string, or the original string if no valid non-typeable characters were found.
Argument: The string to cleanse.
UPDATE January 17, 2022: Added some outliers in the 0x21## range. Manually. Also changed some decimals to hex.
*/
/**
 * Replaces stylised ("non-typeable") Unicode letters and digits in a string
 * with their plain typeable equivalents.
 *
 * Handled families: Mathematical Alphanumeric letters and digits, fullwidth
 * letters and digits, circled letters and digits, boxed / circled / black-boxed
 * capital letters, and a hand-maintained table of Letterlike Symbols.
 *
 * @param {string} str - The string to cleanse.
 * @returns {string} The cleansed string; equal to `str` when nothing matched.
 */
function StringCleanse( str ) {
    // Every code point we know how to translate. The u flag is required so that
    // astral code points (above 0xFFFF) are matched as single characters.
    const dirtyRegex = /[\u{1D400}-\u{1D6A3}\u{1D7CE}-\u{1D7FF}\u{FF21}-\u{FF5A}\u{FF10}-\u{FF19}\u{24B6}-\u{24E9}\u{2460}-\u{2468}\u{24EA}\u{2102}\u{210A}-\u{210E}\u{2110}-\u{2113}\u{2115}\u{2118}-\u{211D}\u{2124}\u{2128}\u{212A}\u{212C}-\u{2131}\u{2133}\u{2134}\u{2139}\u{2145}-\u{2149}\u{1F130}-\u{1F149}\u{1F150}-\u{1F169}\u{1F170}-\u{1F189}]/gu

    // Letterlike Symbols (0x21##) that do not sit in any regular range.
    const outliers = {
        0x2102: "C",
        0x210A: "g",
        0x210B: "H",
        0x210C: "H",
        0x210D: "H",
        0x210E: "h",
        0x2110: "I",
        0x2111: "I",
        0x2112: "L", // SCRIPT CAPITAL L
        0x2113: "l",
        0x2115: "N",
        0x2118: "P",
        0x2119: "P",
        0x211A: "Q",
        0x211B: "R",
        0x211C: "R",
        0x211D: "R",
        0x2124: "Z",
        0x2128: "Z",
        0x212A: "K",
        0x212C: "B",
        0x212D: "C",
        0x212E: "e",
        0x212F: "e",
        0x2130: "E",
        0x2131: "F",
        0x2133: "M",
        0x2134: "o",
        0x2139: "i",
        0x2145: "D",
        0x2146: "d",
        0x2147: "e",
        0x2148: "i",
        0x2149: "j"
    }

    // A single pass with a replacer callback: each matched character is
    // translated independently, so no loop counters or rescans are needed.
    return str.replace( dirtyRegex, ( ch ) => {
        const code = ch.codePointAt( 0 )

        if ( code >= 0x1D400 && code <= 0x1D6A3 ) { // mathematical letters
            // The range is a run of 52-wide sets: 26 capitals, then 26 lowercase.
            const pos = ( code - 0x1D400 ) % 52
            return String.fromCodePoint( pos < 26 ? 0x41 + pos : 0x61 + pos - 26 )
        }
        if ( code >= 0x1D7CE && code <= 0x1D7FF ) { // mathematical digits
            // The range is a run of 10-wide sets, each running 0 through 9.
            return String.fromCodePoint( 0x30 + ( code - 0x1D7CE ) % 10 )
        }
        if ( code >= 0xFF21 && code <= 0xFF5A ) { // fullwidth letters
            return String.fromCodePoint( code - 0xFEE0 )
        }
        if ( code >= 0xFF10 && code <= 0xFF19 ) { // fullwidth digits
            return String.fromCodePoint( code - 0xFEE0 )
        }
        if ( code >= 0x24B6 && code <= 0x24E9 ) { // circled letters
            // Capitals end at 0x24CF; lowercase follows immediately after.
            return String.fromCodePoint( code - ( code > 0x24CF ? 0x246F : 0x2475 ) )
        }
        if ( ( code >= 0x2460 && code <= 0x2468 ) || code === 0x24EA ) { // circled digits
            // Circled zero (0x24EA) lives outside the 1-9 run.
            return String.fromCodePoint( code - ( code === 0x24EA ? 0x24BA : 0x242F ) )
        }
        if ( code >= 0x1F130 && code <= 0x1F149 ) { // letters in boxes
            return String.fromCodePoint( code - 0x1F0EF )
        }
        if ( code >= 0x1F150 && code <= 0x1F169 ) { // letters in circles
            return String.fromCodePoint( code - 0x1F10F )
        }
        if ( code >= 0x1F170 && code <= 0x1F189 ) { // letters in black boxes
            return String.fromCodePoint( code - 0x1F12F )
        }

        // Miscellaneous outliers; leave the character alone if it is not listed.
        const plain = outliers[ code ]
        return plain !== undefined ? plain : ch
    } )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment