Last active
March 27, 2024 19:39
-
-
Save JamoCA/68673cbac81d5924c80754c3fe4effcf to your computer and use it in GitHub Desktop.
ColdFusion/CFML function that Identifies and performs trim functions on white space-related characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
component displayname="whitespace" output="false" hint="Identifies and performs trim functions on white space-related characters" { | |
/* | |
author: James Moberg <james@ssmedia.com> | |
date: 2019-01-07 | |
Description: Removes all whitespace-related characters (e.g., Zero-Width Spaces (ZWSPs)) from a string... not just characters at or below U+0020.
.NET Trim() really trims a string - also trimming non-breaking-spaces. This is not the case in Java. | |
http://www.henrikbrinch.dk/Blog/2013/02/28/java-net-string-gotchas-of-the-day/ | |
https://stackoverflow.com/a/4731164/693068 | |
https://stackoverflow.com/a/4307261/693068 | |
The String.trim() method talks about "whitespace", but defines this in a very precise but rather crude and idiosyncratic way - it simply regards anything up to and including U+0020 (the usual space character) as whitespace, and anything above that as non-whitespace. | |
https://closingbraces.net/2008/11/11/javastringtrim/ | |
charcodes | |
https://www.fileformat.info/info/unicode/char/0001/index.htm (substitute 0001 w/Spacedata.u) | |
Reason: | |
https://thehackernews.com/2019/01/phishing-zero-width-spaces.html | |
Used: | |
"NUL" = {n="Null char", d=0, h="0000", tags="join,unsafe"}, | |
GIST: https://gist.github.com/JamoCA/68673cbac81d5924c80754c3fe4effcf | |
Blog: https://dev.to/gamesover/filtering-zero-width-spaces-zwsps-using-coldfusion-122c | |
Tweet: https://twitter.com/gamesover/status/1704168951960519058 | |
*/ | |
/* 20191018 https://gist.github.com/JamoCA/42c3be286185aff0476d5888f0a819ff */ | |
// Ordered lookup table of whitespace-like characters, keyed by a short code.
// Each entry is an ordered struct with:
//   n    = descriptive name
//   d    = decimal code point (passed to chr() by the functions in this component)
//   h    = four-digit hexadecimal code point
//   tags = comma-delimited filter tags: "space" or "join", plus "safe" or "unsafe".
//          sanitize() replaces "space,unsafe" matches with a blank and removes "join,unsafe" matches.
// Insertion order is significant: getRegex() builds its alternation by iterating this struct.
// NOTE(review): SMSP, BLANK, OPENBOX and FWDZ are visible glyphs rather than blank characters;
// confirm that treating them as whitespace is intentional.
variables.spaceData = [
	"SOH": ["n":"Start of Heading", "d":1, "h":"0001", "tags":"space,unsafe"]
	,"STX": ["n":"Start of Text", "d":2, "h":"0002", "tags":"space,unsafe"]
	,"ETX": ["n":"End of Text", "d":3, "h":"0003", "tags":"space,unsafe"]
	,"EOT": ["n":"End of Transmission", "d":4, "h":"0004", "tags":"space,unsafe"]
	,"ENQ": ["n":"Enquiry", "d":5, "h":"0005", "tags":"space,unsafe"]
	,"ACK": ["n":"Acknowledgment", "d":6, "h":"0006", "tags":"space,unsafe"]
	,"BEL": ["n":"Bell", "d":7, "h":"0007", "tags":"space,unsafe"]
	,"BS": ["n":"Back Space", "d":8, "h":"0008", "tags":"space,unsafe"]
	,"HT": ["n":"Horizontal Tab", "d":9, "h":"0009", "tags":"space,unsafe"]
	,"LF": ["n":"Line Feed", "d":10, "h":"000A", "tags":"space,safe"]
	,"VT": ["n":"Vertical Tab", "d":11, "h":"000B", "tags":"space,unsafe"]
	,"FF": ["n":"Form Feed", "d":12, "h":"000C", "tags":"space,unsafe"]
	,"CR": ["n":"Carriage Return", "d":13, "h":"000D", "tags":"space,safe"]
	,"SO": ["n":"Shift Out / X-On", "d":14, "h":"000E", "tags":"space,unsafe"]
	,"SI": ["n":"Shift In / X-Off", "d":15, "h":"000F", "tags":"space,unsafe"]
	,"DLE": ["n":"Data Line Escape", "d":16, "h":"0010", "tags":"space,unsafe"]
	,"DC1": ["n":"Device Control 1 (oft. XON)", "d":17, "h":"0011", "tags":"space,unsafe"]
	,"DC2": ["n":"Device Control 2", "d":18, "h":"0012", "tags":"space,unsafe"]
	,"DC3": ["n":"Device Control 3 (oft. XOFF)", "d":19, "h":"0013", "tags":"space,unsafe"]
	,"DC4": ["n":"Device Control 4", "d":20, "h":"0014", "tags":"space,unsafe"]
	,"NAK": ["n":"Negative Acknowledgement", "d":21, "h":"0015", "tags":"space,unsafe"]
	,"SYN": ["n":"Synchronous Idle", "d":22, "h":"0016", "tags":"space,unsafe"]
	,"ETB": ["n":"End of Transmit Block", "d":23, "h":"0017", "tags":"space,unsafe"]
	,"CAN": ["n":"Cancel", "d":24, "h":"0018", "tags":"space,unsafe"]
	,"EM": ["n":"End of Medium", "d":25, "h":"0019", "tags":"space,unsafe"]
	,"SUB": ["n":"Substitute", "d":26, "h":"001A", "tags":"space,unsafe"]
	,"ESC": ["n":"Escape", "d":27, "h":"001B", "tags":"space,unsafe"]
	,"FS": ["n":"File Separator", "d":28, "h":"001C", "tags":"space,unsafe"]
	,"GS": ["n":"Group Separator", "d":29, "h":"001D", "tags":"space,unsafe"]
	,"RS": ["n":"Record Separator", "d":30, "h":"001E", "tags":"space,unsafe"]
	,"US": ["n":"Unit Separator", "d":31, "h":"001F", "tags":"space,unsafe"]
	,"SP": ["n":"Space", "d":32, "h":"0020", "tags":"space,safe"]
	,"NEL": ["n":"next line", "d":133, "h":"0085", "tags":"space,unsafe"]
	,"NBSP": ["n":"no-breaking space", "d":160, "h":"00A0", "tags":"space,unsafe"]
	,"OGHAM": ["n":"OGHAM Space Mark", "d":5760, "h":"1680", "tags":"space,unsafe"]
	,"MONGOLIAN": ["n":"Mongolian Vowel Separator", "d":6158, "h":"180E", "tags":"space,unsafe"]
	,"ENQUAD": ["n":"EN Quad", "d":8192, "h":"2000", "tags":"space,unsafe"]
	,"EMQUAD": ["n":"EM Quad", "d":8193, "h":"2001", "tags":"space,unsafe"]
	,"ENSP": ["n":"EN Space", "d":8194, "h":"2002", "tags":"space,unsafe"]
	,"EMSP": ["n":"EM Space", "d":8195, "h":"2003", "tags":"space,unsafe"]
	,"THREE-PER:M SPACE": ["n":"Thick Space", "d":8196, "h":"2004", "tags":"space,unsafe"]
	,"FOUR-PER:M SPACE": ["n":"Mid Space", "d":8197, "h":"2005", "tags":"space,unsafe"]
	,"SIX-PER:M SPACE": ["n":"Six-per-EM Space", "d":8198, "h":"2006", "tags":"space,unsafe"]
	,"FGMSP": ["n":"Figure Space", "d":8199, "h":"2007", "tags":"space,unsafe"]
	,"PUNSP": ["n":"Punctuation Space", "d":8200, "h":"2008", "tags":"space,unsafe"]
	,"THINSPACE": ["n":"Thin Space", "d":8201, "h":"2009", "tags":"space,unsafe"]
	,"HAIRSPACE": ["n":"Hair Space", "d":8202, "h":"200A", "tags":"space,unsafe"]
	,"ZWSP": ["n":"zero-width space", "d":8203, "h":"200B", "tags":"space,unsafe"]
	,"ZWNJ": ["n":"zero-width non-joiner", "d":8204, "h":"200C", "tags":"join,unsafe"]
	,"ZWJ": ["n":"zero-width joiner", "d":8205, "h":"200D", "tags":"join,unsafe"]
	,"LRM": ["n":"left-to-right mark", "d":8206, "h":"200E", "tags":"space,unsafe"]
	,"RLM": ["n":"right-to-left mark", "d":8207, "h":"200F", "tags":"space,unsafe"]
	,"WJ": ["n":"Word Joiner", "d":8288, "h":"2060", "tags":"join,unsafe"]
	,"LINSEP": ["n":"Line Separator", "d":8232, "h":"2028", "tags":"space,unsafe"]
	,"PARSEP": ["n":"Paragraph Separator", "d":8233, "h":"2029", "tags":"space,unsafe"]
	,"NNBSP": ["n":"Narrow No-Break Space", "d":8239, "h":"202F", "tags":"space,unsafe"]
	,"MMASP": ["n":"Medium Mathematical Space", "d":8287, "h":"205F", "tags":"space,unsafe"]
	,"SMSP": ["n":"Symbol for Space", "d":9248, "h":"2420", "tags":"space,unsafe"]
	,"BLANK": ["n":"Blank Symbol", "d":9250, "h":"2422", "tags":"space,unsafe"]
	,"OPENBOX": ["n":"Open Box", "d":9251, "h":"2423", "tags":"space,unsafe"]
	,"BB": ["n":"Braille blank pattern", "d":10240, "h":"2800", "tags":"space,unsafe"]
	,"IDSP": ["n":"Ideographic Space", "d":12288, "h":"3000", "tags":"space,unsafe"]
	,"BOM": ["n":"Zero Width No-Break Space (AKA Byte Order Mark)", "d":65279, "h":"FEFF", "tags":"join,unsafe"]
	,"FWDZ": ["n":"Full-Width Digit Zero", "d":65296, "h":"FF10", "tags":"space,unsafe"]
];
// tags = list of filters (all,safe,unsafe,space,join) | |
// Debug helper: writes a dump of the pattern that getRegex() produces for the
// supplied tag filter. This is the only function here that emits output.
public void function dumpRegex(string tags="") output=true hint="performs a CFDump of regex rules" {
	local.pattern = getRegex(arguments.tags);
	writedump(var=local.pattern, label="SpacesRegex");
}
// tags = list of filters (all,safe,unsafe,space,join) | |
// Returns an ordered struct with two keys:
//   spaceData = the component's character table (the same struct, not a copy)
//   regex     = the pattern getRegex() builds for the supplied tag filter
public struct function getConfig(string tags="") output=false hint="Lists all rules used when a tag is specified" {
	local.config = [:];
	local.config["spaceData"] = variables.spaceData;
	local.config["regex"] = getRegex(arguments.tags);
	return local.config;
}
// True when inputString contains at least one character (or numeric HTML entity)
// from the full table, i.e. the 'all' filter. An empty string is always false.
public boolean function hasWhiteSpace(string inputString="") output=false hint="Checks if string contains any whitespace" {
	if (len(arguments.inputString)) {
		// refindnocase returns the 1-based position of the first hit, or 0 for none
		local.firstHit = refindnocase(getRegex('all'), arguments.inputString, 1, false);
		return javacast("boolean", local.firstHit);
	}
	return javacast("boolean", 0);
}
// True when inputString contains at least one table entry tagged 'unsafe'.
// An empty string short-circuits to false without building the pattern.
public any function hasUnsafeSpace(string inputString="") output=false hint="Checks if string contains unsafe-ish whitespace" {
	local.firstHit = 0;
	if (len(arguments.inputString)) {
		local.firstHit = refindnocase(getRegex('unsafe'), arguments.inputString, 1, false);
	}
	return javacast("boolean", local.firstHit);
}
// Reports which table characters occur in inputString.
//
// inputString = text to inspect
// tags        = comma-delimited tag filter; blank or containing "all" means every entry.
//               An entry qualifies when it shares AT LEAST ONE tag with the filter.
//
// Returns an ordered struct keyed by the table's short code. Each value holds
// d (int), h (string), name (string) and positions (array of 1-based offsets).
public struct function identifyUnsafeSpace(string inputString="", string tags="all") output=false hint="Provides a array of shortcodes, names, decimal or hex values of identified whitespace and their regex positions" {
	local.found = [:];
	local.wanted = ["all"];
	if (len(trim(arguments.tags))) {
		local.wanted = listtoarray(lcase(trim(arguments.tags)));
	}
	local.matchEverything = arrayfind(local.wanted, "all");
	for (local.code in variables.spaceData) {
		local.entry = variables.spaceData[local.code];
		if (!local.matchEverything) {
			// intersect the requested tags with this entry's tags (java.util.List.retainAll)
			local.overlap = duplicate(local.wanted);
			local.entryTags = listtoarray(duplicate(local.entry.tags));
			local.overlap.retainAll(local.entryTags);
			if (!arraylen(local.overlap)) {
				continue;
			}
		}
		local.glyph = chr(local.entry.d);
		if (!find(local.glyph, arguments.inputString)) {
			continue;
		}
		local.hits = reFindNoCaseAll(local.glyph, arguments.inputString);
		local.found["#local.code#"] = [
			"d": javacast("int", local.entry.d)
			,"h": javacast("string", local.entry.h)
			,"name": javacast("string", local.entry.n)
			,"positions": local.hits.pos
		];
	}
	return local.found;
}
// Strips every leading run of table characters (the 'all' filter) from inputString.
// Uses java.lang.String.replaceAll, so the pattern is evaluated by java.util.regex.
public string function leftTrim(string inputString="") output=false hint="Performs a left trim and strips all whitespace" {
	local.leadingRun = "^(" & getRegex('all') & ")+";
	local.subject = javacast("string", arguments.inputString);
	return local.subject.replaceAll(local.leadingRun, "");
}
// Strips every trailing run of table characters (the 'all' filter) from inputString.
// Uses java.lang.String.replaceAll, so the pattern is evaluated by java.util.regex.
public string function rightTrim(string inputString="") output=false hint="Performs a right trim and strips all whitespace" {
	local.trailingRun = "(" & getRegex('all') & ")+$";
	local.subject = javacast("string", arguments.inputString);
	return local.subject.replaceAll(local.trailingRun, "");
}
// Trims table characters (the 'all' filter) from both ends of inputString:
// the trailing run is removed first, then the leading run.
public string function fullTrim(string inputString="") output=false hint="Performs a left/right trim and strips all whitespace" {
	local.withoutTail = rightTrim(arguments.inputString);
	return leftTrim(local.withoutTail);
}
// Neutralises 'unsafe' characters anywhere in inputString:
//   entries tagged space+unsafe become a single regular space,
//   entries tagged join+unsafe are deleted outright.
public string function sanitize(string inputString="") output=false hint="Removes all unsafe whitespace" {
	local.spacers = "(" & getRegex('space,unsafe') & ")";
	local.joiners = "(" & getRegex('join,unsafe') & ")";
	local.cleaned = javacast("string", arguments.inputString).replaceAll(local.spacers, " ");
	return local.cleaned.replaceAll(local.joiners, "");
}
// Runs sanitize() over inputString and then trims both ends with fullTrim().
public string function sanitizeTrim(string inputString="") output=false hint="Performs a left/right trim and strips control characters" {
	local.sanitized = sanitize(arguments.inputString);
	return fullTrim(local.sanitized);
}
// Sanitises and trims inputString, then collapses each run of spaces into one
// space and each run of CR/LF characters into a single CR+LF pair.
public string function compressText(string inputString="") output=false hint="Santizes, reduces multiple space characters to a single character" {
	local.text = sanitizeTrim(arguments.inputString);
	local.text = local.text.replaceAll(" +", " ");
	return local.text.replaceAll("[\r\n]+", chr(13) & chr(10));
}
/// Level = levels 1, 2 (default) or 3 | |
// Shrinks whitespace in an HTML fragment.
//
// inputHtml = markup to compress
// level     = 1, 2 or 3; any other value is treated as 2
//   1: only trims both ends
//   2: compressText(), removes anything still matching the 'unsafe' filter, then
//      collapses "line break(s) followed by 2+ whitespace characters" into one CR
//   3: compressText(), removes the single space between adjacent tags ("> <")
//      and strips "<!-- ... >" comment spans
//
// Returns the compressed markup, trimmed at both ends.
public string function compressHtml(string inputHtml="", string level="2") output=false hint="Replaces a huge amount of unnecessary whitespace from your HTML code" {
	local.outputString = javacast("string", arguments.inputHtml);
	local.level = (listfind("1,2,3", arguments.level)) ? arguments.level : 2;
	if ( local.level eq 3 ) {
		local.outputString = compressText(local.outputString);
		// Java strings are immutable: replaceAll() returns the new value, so it must be reassigned
		local.outputString = local.outputString.replaceAll("> <", "><");
		local.outputString = local.outputString.replaceAll("<!--[^>]+>", "");
	} else if ( local.level eq 2 ) {
		local.outputString = compressText(local.outputString);
		local.outputString = local.outputString.replaceAll("(#getRegex('unsafe')#)", "");
		// [[:space:]] is a POSIX class understood by CFML's rereplace(); java.util.regex would
		// read it as the literal character set ":space", so this step cannot use replaceAll()
		local.outputString = rereplace(local.outputString, "(" & chr(10) & "|" & chr(13) & ")+[[:space:]]{2,}", chr(13), "all");
	}
	return fullTrim(local.outputString);
}
// Flattens text onto one line: tab, line feed, form feed, carriage return and
// no-break space are each replaced by a regular space, then both ends are trimmed.
public string function singleLine(string inputString="") output=false hint="Modifies content to output on a single line (for logging)" {
	// parallel lists for replacelist(): the Nth entry of the first is replaced by the Nth of the second
	local.breakers = "#chr(9)#,#chr(10)#,#chr(12)#,#chr(13)#,#chr(160)#";
	local.blanks = " , , , , ";
	local.flattened = replacelist(arguments.inputString, local.breakers, local.blanks);
	return fullTrim(local.flattened);
}
// tags = list of filters (all,safe|unsafe,space|join; unicode) | |
// Builds a pipe-delimited regex alternation from the character table.
//
// tags = comma-delimited filter. Blank or containing "all" selects every entry.
//        Otherwise an entry is selected only when it carries EVERY listed tag
//        (e.g. "space,unsafe"). The extra keyword "unicode" switches the output format.
//
// Default format: three alternatives per entry: the literal character, its decimal
// HTML entity (&#NN;) and its zero-padded lower-case hex HTML entity (&#xHHHH;).
// "unicode" format: one "\\uHHHH" token per entry.
// NOTE(review): CFML does not treat backslash as an escape, so that token contains two
// backslashes; confirm this is the form the consumers of "unicode" output expect.
//
// The result is memoised in the request scope, keyed by a hash of the arguments.
public string function getRegex(string tags="") output=false hint="Generates pipe-delimited REGEX list of whitespace/ZWSP characters. Ex. 'chr(32)|chr(160)'" {
	local.cachekey = "udf_getRegex_#arguments.toString().hashCode()#";
	// serve from the per-request cache before doing any other work
	if (structkeyexists(request, local.cachekey)){
		return request[local.cachekey];
	}
	local.initialFilter = lcase(trim(arguments.tags));
	local.useUnicode = listfind(local.initialFilter, "unicode");
	local.workingFilter = local.initialFilter;
	if (local.useUnicode){
		// "unicode" only selects the output format; it is not a tag to match against
		local.workingFilter = listdeleteat(local.workingFilter, local.useUnicode);
	}
	local.all = !len(trim(local.workingFilter)) || listfind(local.workingFilter, "all");
	local.requiredTags = listtoarray(local.workingFilter);
	local.regex = [];
	for (local.char in variables.spaceData){
		local.thisSpace = variables.spaceData[local.char];
		local.includeThis = local.all;
		if (!local.includeThis){
			local.matched = 0;
			for (local.thisTag in local.requiredTags){
				if (listfind(local.thisSpace.tags, local.thisTag)){
					local.matched = local.matched + 1;
				}
			}
			// every requested tag must be present; a filter with no usable tags selects nothing
			local.includeThis = (local.matched gt 0 && local.matched eq arraylen(local.requiredTags));
		}
		if (local.includeThis){
			if (local.useUnicode){
				arrayappend(local.regex, "\\u#local.thisSpace.h#");
			} else {
				arrayappend(local.regex, "#chr(local.thisSpace.d)#");
				arrayappend(local.regex, "&###local.thisSpace.d#;");
				// hex entity is "&#x" + digits; a stray "f" after the x would stop it matching
				arrayappend(local.regex, "&##x#lcase(local.thisSpace.h)#;");
			}
		}
	}
	local.pattern = arraytolist(local.regex, "|");
	request[local.cachekey] = local.pattern;
	return local.pattern;
}
/* 4/14/2009 http://www.cflib.org/udf/reFindNoCaseAll | |
@author Ben Forta (ben@forta.com) | |
@version 1, November 17, 2003 | |
@version 2, January 7, 2019 - James @ SunStar Media */ | |
// Collects every case-insensitive match of regex within text.
//
// Returns an ordered struct of two parallel arrays, "pos" (1-based start offsets)
// and "len" (match lengths). When nothing matches, each array holds a single 0,
// mirroring what refindnocase() reports for "no match".
private struct function reFindNoCaseAll(required string regex, required string text) output=false hint="Returns all the matches (case insensitive) of a regular expression within a string. This is simular to reGet(), but more closely matches the result set of reFind." {
	local.hits = [:];
	local.hits["pos"] = [];
	local.hits["len"] = [];
	local.cursor = 1;
	while (true) {
		local.match = refindnocase(arguments.regex, arguments.text, local.cursor, true);
		// a zero length means no (further) match, so stop scanning
		if (local.match.len[1] eq 0) {
			break;
		}
		arrayappend(local.hits.len, local.match.len[1]);
		arrayappend(local.hits.pos, local.match.pos[1]);
		// resume immediately after the match just recorded
		local.cursor = local.match.pos[1] + local.match.len[1];
	}
	if (!arraylen(local.hits.len)) {
		arrayappend(local.hits.len, 0);
		arrayappend(local.hits.pos, 0);
	}
	return local.hits;
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment