Last active
November 15, 2023 20:58
-
-
Save JamoCA/6f35220d47caa7fdbf75eb884ff1cec7 to your computer and use it in GitHub Desktop.
ColdFusion UDF to convert Unicode (UTF-8) punctuation and symbols to 7-bit ASCII punctuation for natural language processing (NLP).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<cfscript>
/* 20200604 Map Symbols & Punctuation to ASCII
Converting Unicode punctuation and symbols to ASCII punctuation and symbols is imperative in natural language processing (NLP) for preserving the original documents.
Based on mapping from Lexical Systems Group: https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lvg/2013/docs/designDoc/UDF/unicode/NormOperations/mapSymbolToAscii.html
Blog: https://dev.to/gamesover/convert-symbols-punctuation-to-ascii-using-coldfusion-java-3l6a
TryCF: https://trycf.com/gist/6f35220d47caa7fdbf75eb884ff1cec7 */

/**
 * Replaces Unicode punctuation, symbols and vulgar fractions with ASCII equivalents.
 *
 * Each statement hands a regex character class (written with \uXXXX escapes, which the
 * Java regex engine resolves) to java.lang.String.replaceAll(). The rules run in the
 * order listed. Characters that are not listed in any rule are returned unchanged.
 *
 * NOTE: the second argument of replaceAll() is a Java *replacement string*, in which a
 * backslash escapes the next character and a dollar sign introduces a group reference.
 * Any replacement containing either must therefore be escaped (see the backslash rule).
 *
 * @inputString The text to normalize.
 * @return      The text with every mapped character replaced by its ASCII form.
 */
string function symbolsToASCII(required string inputString){
	/* Cast so the .replaceAll() calls below are dispatched to java.lang.String. */
	var TempContent = javacast("string", arguments.inputString);

	/* --- quotes, dashes and single-character punctuation --- */
	TempContent = TempContent.replaceAll("[\u00B4\u02B9\u02BC\u02C8\u0301\u2018\u2019\u201B\u2032\u2034\u2037]", chr(39)); /* apostrophe (') */
	TempContent = TempContent.replaceAll("[\u00AB\u00BB\u02BA\u030B\u030E\u201C\u201D\u201E\u201F\u2033\u2036\u3003\u301D\u301E]", chr(34)); /* quotation mark (") */
	TempContent = TempContent.replaceAll("[\u00AD\u2010\u2011\u2012\u2013\u2014\u2212\u2015]", chr(45)); /* hyphen (-) */
	TempContent = TempContent.replaceAll("[\u01C3\u2762]", chr(33)); /* exclamation mark (!) */
	TempContent = TempContent.replaceAll("[\u266F]", chr(35)); /* music sharp sign (#) */
	TempContent = TempContent.replaceAll("[\u066A\u2052]", chr(37)); /* percent sign (%) */
	TempContent = TempContent.replaceAll("[\u066D\u204E\u2217\u2731\u00D7]", chr(42)); /* asterisk (*) */
	TempContent = TempContent.replaceAll("[\u201A\uFE51\uFF64\u3001]", chr(44)); /* comma (,) */
	TempContent = TempContent.replaceAll("[\u00F7\u0338\u2044\u2215]", chr(47)); /* slash (/) */
	TempContent = TempContent.replaceAll("[\u0589\u05C3\u2236]", chr(58)); /* colon (:) */
	TempContent = TempContent.replaceAll("[\u203D]", chr(63)); /* question mark (?) */
	TempContent = TempContent.replaceAll("[\u27E6]", chr(91)); /* opening square bracket ([) */
	/* The replacement must be an ESCAPED backslash (two characters): a lone backslash is an
	   incomplete escape in a Java replacement string and makes replaceAll() throw on a match. */
	TempContent = TempContent.replaceAll("[\u20E5\u2216]", "#chr(92)##chr(92)#"); /* backslash (\) */
	TempContent = TempContent.replaceAll("[\u301B]", chr(93)); /* closing square bracket (]) */
	TempContent = TempContent.replaceAll("[\u02C4\u02C6\u0302\u2038\u2303]", chr(94)); /* caret (^) */
	TempContent = TempContent.replaceAll("[\u02CD\u0331\u0332\u2017]", chr(95)); /* underscore (_) */
	TempContent = TempContent.replaceAll("[\u02CB\u0300\u2035]", chr(96)); /* grave accent (`) */
	TempContent = TempContent.replaceAll("[\u2983]", chr(123)); /* opening curly bracket ({) */
	TempContent = TempContent.replaceAll("[\u01C0\u05C0\u2223\u2758]", chr(124)); /* vertical bar / pipe (|) */
	TempContent = TempContent.replaceAll("[\u2016]", "#chr(124)##chr(124)#"); /* double vertical bar / double pipe (||) */
	TempContent = TempContent.replaceAll("[\u02DC\u0303\u2053\u223C\u301C]", chr(126)); /* tilde (~) */

	/* --- comparison signs and angle brackets --- */
	TempContent = TempContent.replaceAll("[\u2039\u2329\u27E8\u3008]", chr(60)); /* less-than sign (<) */
	TempContent = TempContent.replaceAll("[\u2264\u2266]", "#chr(60)##chr(61)#"); /* less-than equal-to sign (<=) */
	TempContent = TempContent.replaceAll("[\u203A\u232A\u27E9\u3009]", chr(62)); /* greater-than sign (>) */
	TempContent = TempContent.replaceAll("[\u2265\u2267]", "#chr(62)##chr(61)#"); /* greater-than equal-to sign (>=) */

	/* --- zero-width space, word joiner and BOM / zero-width no-break space --- */
	TempContent = TempContent.replaceAll("[\u200B\u2060\uFEFF]", chr(32)); /* space ( ) */

	/* --- vulgar fractions, expanded to "numerator/denominator" --- */
	TempContent = TempContent.replaceAll("\u2153", "1/3");
	TempContent = TempContent.replaceAll("\u2154", "2/3");
	TempContent = TempContent.replaceAll("\u2155", "1/5");
	TempContent = TempContent.replaceAll("\u2156", "2/5");
	TempContent = TempContent.replaceAll("\u2157", "3/5");
	TempContent = TempContent.replaceAll("\u2158", "4/5");
	TempContent = TempContent.replaceAll("\u2159", "1/6");
	TempContent = TempContent.replaceAll("\u215A", "5/6");
	TempContent = TempContent.replaceAll("\u215B", "1/8");
	TempContent = TempContent.replaceAll("\u215C", "3/8");
	TempContent = TempContent.replaceAll("\u215D", "5/8");
	TempContent = TempContent.replaceAll("\u215E", "7/8");

	/* --- horizontal ellipsis; dots need no escaping in a replacement string --- */
	TempContent = TempContent.replaceAll("\u2026", "...");

	return TempContent;
}
</cfscript>
<!--- Demo: build a sentence containing curly double quotes (chr 8220/8221) and curly single
      quotes (chr 8216/8217), then show it before and after symbolsToASCII(). --->
<cfset testString = '#CHR(8220)#I don#CHR(8217)#t know what you mean by #CHR(8216)#glory,#CHR(8217)# #CHR(8221)# Alice said.'>
<cfoutput>
<textarea style="width:95%; height:300px;">
Original: #testString#
symbolsToASCII: #symbolsToASCII(testString)#
</textarea>
</cfoutput>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment