Skip to content

Instantly share code, notes, and snippets.

@JamoCA
Last active March 27, 2024 19:39
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save JamoCA/68673cbac81d5924c80754c3fe4effcf to your computer and use it in GitHub Desktop.
Save JamoCA/68673cbac81d5924c80754c3fe4effcf to your computer and use it in GitHub Desktop.
ColdFusion/CFML function that Identifies and performs trim functions on white space-related characters
component displayname="whitespace" output="false" hint="Identifies and performs trim functions on white space-related characters" {
/*
author: James Moberg <james@ssmedia.com>
date: 2019-01-07
Description: Removes all whitespace-related characters (e.g., Zero-Width Spaces (ZWSPs)) from a string... not just characters at or below U+0020.
.NET Trim() really trims a string - also trimming non-breaking-spaces. This is not the case in Java.
http://www.henrikbrinch.dk/Blog/2013/02/28/java-net-string-gotchas-of-the-day/
https://stackoverflow.com/a/4731164/693068
https://stackoverflow.com/a/4307261/693068
The String.trim() method talks about "whitespace", but defines this in a very precise but rather crude and idiosyncratic way - it simply regards anything up to and including U+0020 (the usual space character) as whitespace, and anything above that as non-whitespace.
https://closingbraces.net/2008/11/11/javastringtrim/
charcodes
https://www.fileformat.info/info/unicode/char/0001/index.htm (substitute 0001 with the entry's spaceData "h" hex value)
Reason:
https://thehackernews.com/2019/01/phishing-zero-width-spaces.html
Used:
"NUL" = {n="Null char", d=0, h="0000", tags="join,unsafe"},
GIST: https://gist.github.com/JamoCA/68673cbac81d5924c80754c3fe4effcf
Blog: https://dev.to/gamesover/filtering-zero-width-spaces-zwsps-using-coldfusion-122c
Tweet: https://twitter.com/gamesover/status/1704168951960519058
*/
/* 20191018 https://gist.github.com/JamoCA/42c3be286185aff0476d5888f0a819ff */
// Lookup table of whitespace-like code points, keyed by a short code.
// Each entry holds: n = display name, d = decimal code point, h = four-digit hex code point,
// tags = comma-delimited labels (space|join plus safe|unsafe) that getRegex() and
// identifyUnsafeSpace() filter on.
// getRegex() and identifyUnsafeSpace() iterate this table with for-in, so the declaration
// order determines the order of the regex alternatives and of the reported keys.
// NOTE(review): the three "...-PER:M SPACE" keys (declared with "=" instead of ":") look like
// garbled "-PER-EM SPACE" names; renaming them would change the keys returned by
// identifyUnsafeSpace(), so confirm with callers before fixing.
// NOTE(review): FWDZ (Full-Width Digit Zero) is tagged "space,unsafe", so sanitize() replaces
// that digit with a space - confirm this is intended.
variables.spaceData = [
"SOH": ["n":"Start of Heading", "d":1, "h":"0001", "tags":"space,unsafe"]
,"STX": ["n":"Start of Text", "d":2, "h":"0002", "tags":"space,unsafe"]
,"ETX": ["n":"End of Text", "d":3, "h":"0003", "tags":"space,unsafe"]
,"EOT": ["n":"End of Transmission", "d":4, "h":"0004", "tags":"space,unsafe"]
,"ENQ": ["n":"Enquiry", "d":5, "h":"0005", "tags":"space,unsafe"]
,"ACK": ["n":"Acknowledgment", "d":6, "h":"0006", "tags":"space,unsafe"]
,"BEL": ["n":"Bell", "d":7, "h":"0007", "tags":"space,unsafe"]
,"BS": ["n":"Back Space", "d":8, "h":"0008", "tags":"space,unsafe"]
,"HT": ["n":"Horizontal Tab", "d":9, "h":"0009", "tags":"space,unsafe"]
,"LF": ["n":"Line Feed", "d":10, "h":"000A", "tags":"space,safe"]
,"VT": ["n":"Vertical Tab", "d":11, "h":"000B", "tags":"space,unsafe"]
,"FF": ["n":"Form Feed", "d":12, "h":"000C", "tags":"space,unsafe"]
,"CR": ["n":"Carriage Return", "d":13, "h":"000D", "tags":"space,safe"]
,"SO": ["n":"Shift Out / X-On", "d":14, "h":"000E", "tags":"space,unsafe"]
,"SI": ["n":"Shift In / X-Off", "d":15, "h":"000F", "tags":"space,unsafe"]
,"DLE": ["n":"Data Line Escape", "d":16, "h":"0010", "tags":"space,unsafe"]
,"DC1": ["n":"Device Control 1 (oft. XON)", "d":17, "h":"0011", "tags":"space,unsafe"]
,"DC2": ["n":"Device Control 2", "d":18, "h":"0012", "tags":"space,unsafe"]
,"DC3": ["n":"Device Control 3 (oft. XOFF)", "d":19, "h":"0013", "tags":"space,unsafe"]
,"DC4": ["n":"Device Control 4", "d":20, "h":"0014", "tags":"space,unsafe"]
,"NAK": ["n":"Negative Acknowledgement", "d":21, "h":"0015", "tags":"space,unsafe"]
,"SYN": ["n":"Synchronous Idle", "d":22, "h":"0016", "tags":"space,unsafe"]
,"ETB": ["n":"End of Transmit Block", "d":23, "h":"0017", "tags":"space,unsafe"]
,"CAN": ["n":"Cancel", "d":24, "h":"0018", "tags":"space,unsafe"]
,"EM": ["n":"End of Medium", "d":25, "h":"0019", "tags":"space,unsafe"]
,"SUB": ["n":"Substitute", "d":26, "h":"001A", "tags":"space,unsafe"]
,"ESC": ["n":"Escape", "d":27, "h":"001B", "tags":"space,unsafe"]
,"FS": ["n":"File Separator", "d":28, "h":"001C", "tags":"space,unsafe"]
,"GS": ["n":"Group Separator", "d":29, "h":"001D", "tags":"space,unsafe"]
,"RS": ["n":"Record Separator", "d":30, "h":"001E", "tags":"space,unsafe"]
,"US": ["n":"Unit Separator", "d":31, "h":"001F", "tags":"space,unsafe"]
,"SP": ["n":"Space", "d":32, "h":"0020", "tags":"space,safe"]
,"NEL": ["n":"next line", "d":133, "h":"0085", "tags":"space,unsafe"]
,"NBSP": ["n":"no-breaking space", "d":160, "h":"00A0", "tags":"space,unsafe"]
,"OGHAM": ["n":"OGHAM Space Mark", "d":5760, "h":"1680", "tags":"space,unsafe"]
,"MONGOLIAN": ["n":"Mongolian Vowel Separator", "d":6158, "h":"180E", "tags":"space,unsafe"]
,"ENQUAD": ["n":"EN Quad", "d":8192, "h":"2000", "tags":"space,unsafe"]
,"EMQUAD": ["n":"EM Quad", "d":8193, "h":"2001", "tags":"space,unsafe"]
,"ENSP": ["n":"EN Space", "d":8194, "h":"2002", "tags":"space,unsafe"]
,"EMSP": ["n":"EM Space", "d":8195, "h":"2003", "tags":"space,unsafe"]
,"THREE-PER:M SPACE" = ["n":"Thick Space", "d":8196, "h":"2004", "tags":"space,unsafe"]
,"FOUR-PER:M SPACE" = ["n":"Mid Space", "d":8197, "h":"2005", "tags":"space,unsafe"]
,"SIX-PER:M SPACE" = ["n":"Six-per-EM Space", "d":8198, "h":"2006", "tags":"space,unsafe"]
,"FGMSP": ["n":"Figure Space", "d":8199, "h":"2007", "tags":"space,unsafe"]
,"PUNSP": ["n":"Punctuation Space", "d":8200, "h":"2008", "tags":"space,unsafe"]
,"THINSPACE": ["n":"Thin Space", "d":8201, "h":"2009", "tags":"space,unsafe"]
,"HAIRSPACE": ["n":"Hair Space", "d":8202, "h":"200A", "tags":"space,unsafe"]
,"ZWSP": ["n":"zero-width space", "d":8203, "h":"200B", "tags":"space,unsafe"]
,"ZWNJ": ["n":"zero-width non-joiner", "d":8204, "h":"200C", "tags":"join,unsafe"]
,"ZWJ": ["n":"zero-width joiner", "d":8205, "h":"200D", "tags":"join,unsafe"]
,"LRM": ["n":"left-to-right mark", "d":8206, "h":"200E", "tags":"space,unsafe"]
,"RLM": ["n":"right-to-left mark", "d":8207, "h":"200F", "tags":"space,unsafe"]
,"WJ": ["n":"Word Joiner", "d":8288, "h":"2060", "tags":"join,unsafe"]
,"LINSEP": ["n":"Line Separator", "d":8232, "h":"2028", "tags":"space,unsafe"]
,"PARSEP": ["n":"Paragraph Separator", "d":8233, "h":"2029", "tags":"space,unsafe"]
,"NNBSP": ["n":"Narrow No-Break Space", "d":8239, "h":"202F", "tags":"space,unsafe"]
,"MMASP": ["n":"Medium Mathematical Space", "d":8287, "h":"205F", "tags":"space,unsafe"]
,"SMSP": ["n":"Symbol for Space", "d":9248, "h":"2420", "tags":"space,unsafe"]
,"BLANK": ["n":"Blank Symbol", "d":9250, "h":"2422", "tags":"space,unsafe"]
,"OPENBOX": ["n":"Open Box", "d":9251, "h":"2423", "tags":"space,unsafe"]
,"BB": ["n":"Braille blank pattern", "d":10240, "h":"2800", "tags":"space,unsafe"]
,"IDSP": ["n":"Ideographic Space", "d":12288, "h":"3000", "tags":"space,unsafe"]
,"BOM": ["n":"Zero Width No-Break Space (AKA Byte Order Mark)", "d":65279, "h":"FEFF", "tags":"join,unsafe"]
,"FWDZ": ["n":"Full-Width Digit Zero", "d":65296, "h":"FF10", "tags":"space,unsafe"]
];
// tags = list of filters (all,safe,unsafe,space,join)
public void function dumpRegex(string tags="") output=true hint="performs a CFDump of regex rules" {
	// Debug helper: build the regex for the requested tag filter and dump it to the output.
	local.pattern = getRegex(arguments.tags);
	writeDump(var=local.pattern, label="SpacesRegex");
}
// tags = list of filters (all,safe,unsafe,space,join)
public struct function getConfig(string tags="") output=false hint="Lists all rules used when a tag is specified" {
	// Returns the full character table together with the regex generated for the given tag filter.
	// Built as an ordered struct so "spaceData" precedes "regex".
	local.config = [:];
	local.config["spaceData"] = variables.spaceData;
	local.config["regex"] = getRegex(arguments.tags);
	return local.config;
}
public boolean function hasWhiteSpace(string inputString="") output=false hint="Checks if string contains any whitespace" {
	// True when any character (or entity form) from the full table occurs in the input.
	if (len(arguments.inputString)) {
		// reFindNoCase returns the first match position (0 = none); cast it to a boolean.
		local.firstHit = reFindNoCase(getRegex("all"), arguments.inputString, 1, false);
		return javacast("boolean", local.firstHit);
	}
	// Empty input: nothing to search.
	return javacast("boolean", 0);
}
public any function hasUnsafeSpace(string inputString="") output=false hint="Checks if string contains unsafe-ish whitespace" {
	// True when any entry tagged "unsafe" occurs in the input; empty input is always false.
	return len(arguments.inputString)
		? javacast("boolean", reFindNoCase(getRegex("unsafe"), arguments.inputString, 1, false))
		: javacast("boolean", 0);
}
public struct function identifyUnsafeSpace(string inputString="", string tags="all") output=false hint="Provides a array of shortcodes, names, decimal or hex values of identified whitespace and their regex positions" {
	// Reports which table entries occur in inputString.
	// Result: ordered struct keyed by short code; each value holds d, h, name and positions.
	// tags: comma list; an entry is considered when it shares at least one tag with the
	// filter, or when the filter contains "all" (also the fallback for a blank filter).
	local.found = [:];
	local.wanted = len(trim(arguments.tags)) ? listToArray(lCase(trim(arguments.tags))) : ["all"];
	local.matchEverything = arrayFind(local.wanted, "all");
	for (local.code in variables.spaceData) {
		local.entry = variables.spaceData[local.code];
		if (!local.matchEverything) {
			// Intersect the requested tags with this entry's tags; skip when nothing overlaps.
			local.overlap = duplicate(local.wanted);
			local.overlap.retainAll(listToArray(duplicate(local.entry.tags)));
			if (!arrayLen(local.overlap)) {
				continue;
			}
		}
		local.needle = chr(local.entry.d);
		// Cheap literal check first; only matching characters get a full position scan.
		if (!find(local.needle, arguments.inputString)) {
			continue;
		}
		if (local.found.keyExists("#local.code#")) {
			continue;
		}
		local.hits = reFindNoCaseAll(local.needle, arguments.inputString);
		local.found["#local.code#"] = [
			"d": javacast("int", local.entry.d)
			,"h": javacast("string", local.entry.h)
			,"name": javacast("string", local.entry.n)
			,"positions": local.hits.pos
		];
	}
	return local.found;
}
public string function leftTrim(string inputString="") output=false hint="Performs a left trim and strips all whitespace" {
	// Removes every leading run of table characters (or their entity forms).
	local.leading = "^(" & getRegex("all") & ")+";
	local.subject = javacast("string", arguments.inputString);
	return local.subject.replaceAll(local.leading, "");
}
public string function rightTrim(string inputString="") output=false hint="Performs a right trim and strips all whitespace" {
	// Removes every trailing run of table characters (or their entity forms).
	local.trailing = "(" & getRegex("all") & ")+$";
	local.subject = javacast("string", arguments.inputString);
	return local.subject.replaceAll(local.trailing, "");
}
public string function fullTrim(string inputString="") output=false hint="Performs a left/right trim and strips all whitespace" {
	// Trim the right side first, then the left side.
	local.rightTrimmed = rightTrim(arguments.inputString);
	return leftTrim(local.rightTrimmed);
}
public string function sanitize(string inputString="") output=false hint="Removes all unsafe whitespace" {
	// Step 1: entries tagged space+unsafe become a plain space.
	// Step 2: entries tagged join+unsafe are removed outright.
	local.spacesToBlank = "(" & getRegex("space,unsafe") & ")";
	local.joinersToDrop = "(" & getRegex("join,unsafe") & ")";
	local.cleaned = javacast("string", arguments.inputString).replaceAll(local.spacesToBlank, " ");
	return local.cleaned.replaceAll(local.joinersToDrop, "");
}
public string function sanitizeTrim(string inputString="") output=false hint="Performs a left/right trim and strips control characters" {
	// Sanitize first so characters converted to spaces at the edges are trimmed as well.
	local.cleaned = sanitize(arguments.inputString);
	return fullTrim(local.cleaned);
}
public string function compressText(string inputString="") output=false hint="Santizes, reduces multiple space characters to a single character" {
	// Sanitize + trim, collapse runs of spaces to one space, and collapse runs of
	// CR/LF characters to a single CRLF pair.
	local.crlf = "#chr(13)##chr(10)#";
	local.text = sanitizeTrim(arguments.inputString);
	local.text = local.text.replaceAll(" +", " ");
	return local.text.replaceAll("[\r\n]+", local.crlf);
}
/// Level = levels 1, 2 (default) or 3
public string function compressHtml(string inputHtml="", string level="2") output=false hint="Replaces a huge amount of unnecessary whitespace from your HTML code" {
	// Compresses HTML by removing redundant whitespace.
	// inputHtml: markup to compress.
	// level: "1" = trim only; "2" (default, also used for invalid values) = compressText,
	//        strip unsafe characters, collapse line breaks followed by 2+ whitespace
	//        characters into a single CR; "3" = compressText, remove the space between
	//        adjacent tags ("> <") and strip "<!--...>" comment openers.
	// Returns the compressed, fully trimmed string.
	local.outputString = javacast("string", arguments.inputHtml);
	arguments.level = (listfind("1,2,3", arguments.level)) ? arguments.level : 2;
	if ( arguments.level eq 3 ) {
		local.outputString = compressText(local.outputString);
		// Java strings are immutable: replaceAll() returns a new string, so the result
		// has to be assigned back or the replacement is lost.
		local.outputString = local.outputString.replaceAll("> <", "><");
		local.outputString = local.outputString.replaceAll("<!--[^>]+>", "");
	} else if ( arguments.level eq 2 ) {
		local.outputString = compressText(local.outputString);
		local.outputString = local.outputString.replaceAll("(#getRegex('unsafe')#)", "");
		// java.util.regex has no [[:space:]] POSIX bracket class (it would be read as a set of
		// the literal characters ":space"), so use the equivalent \s here.
		local.outputString = local.outputString.replaceAll("(" & chr(10) & "|" & chr(13) & ")+\s{2,}", chr(13));
	}
	return fullTrim(local.outputString);
}
public string function singleLine(string inputString="") output=false hint="Modifies content to output on a single line (for logging)" {
	// Swap tab, LF, form feed, CR and NBSP for a plain space each, then trim the result.
	local.targets = arrayToList([chr(9), chr(10), chr(12), chr(13), chr(160)]);
	local.blanks = " , , , , ";
	local.flattened = replaceList(arguments.inputString, local.targets, local.blanks);
	return fullTrim(local.flattened);
}
// tags = list of filters (all,safe|unsafe,space|join; unicode)
public string function getRegex(string tags="") output=false hint="Generates pipe-delimited REGEX list of whitespace/ZWSP characters. Ex. 'chr(32)|chr(160)'" {
	// Builds a pipe-delimited alternation of the table entries selected by the tag filter.
	// tags: comma list. A blank filter or "all" selects every entry; otherwise an entry must
	//       carry EVERY listed tag (e.g. "space,unsafe"). Adding "unicode" switches the
	//       output to \\uXXXX tokens instead of literal characters and HTML entities.
	// Returns the alternation string (without surrounding parentheses). The result is
	// cached in the request scope, keyed by a hash of the arguments.
	local.cachekey = "udf_getRegex_#arguments.toString().hashCode()#";
	// Check the cache before doing any parsing work.
	if (structkeyexists(request, local.cachekey)){
		return request[local.cachekey];
	}
	// Split the filter into the output-mode switch, the "all" switch and the required tags.
	local.useUnicode = false;
	local.matchAll = false;
	local.required = [];
	for (local.rawTag in listtoarray(lcase(arguments.tags))){
		// Trim each tag so "space, unsafe" behaves like "space,unsafe" instead of matching nothing.
		local.tag = trim(local.rawTag);
		if (!len(local.tag)){
			continue;
		}
		if (local.tag eq "unicode"){
			local.useUnicode = true;
		} else if (local.tag eq "all"){
			local.matchAll = true;
		} else {
			arrayappend(local.required, local.tag);
		}
	}
	if (!arraylen(local.required)){
		local.matchAll = true;
	}
	local.regex = [];
	for (local.char in variables.spaceData){
		local.thisSpace = variables.spaceData[local.char];
		local.includeThis = local.matchAll;
		if (!local.includeThis){
			// The entry qualifies only when it carries every required tag.
			local.includeThis = true;
			for (local.tag in local.required){
				if (!listfind(local.thisSpace.tags, local.tag)){
					local.includeThis = false;
					break;
				}
			}
		}
		if (!local.includeThis){
			continue;
		}
		if (local.useUnicode){
			// NOTE(review): CFML does not treat backslash as an escape, so this emits two
			// backslashes followed by uXXXX - presumably meant for embedding elsewhere; confirm.
			arrayappend(local.regex, "\\u#local.thisSpace.h#");
		} else {
			// Literal character, decimal entity (&#160;) and hex entity (&#xa0; / &#x00a0;).
			local.hex = lcase(rereplace(local.thisSpace.h, "^0+", ""));
			arrayappend(local.regex, "#chr(local.thisSpace.d)#");
			arrayappend(local.regex, "&###local.thisSpace.d#;");
			// Previously emitted "&#xf....;" (stray "f"), which never matched a real hex entity.
			arrayappend(local.regex, "&##x0*#local.hex#;");
		}
	}
	local.result = arraytolist(local.regex, "|");
	request[local.cachekey] = local.result;
	return local.result;
}
/* 4/14/2009 http://www.cflib.org/udf/reFindNoCaseAll
@author Ben Forta (ben@forta.com)
@version 1, November 17, 2003
@version 2, January 7, 2019 - James @ SunStar Media */
private struct function reFindNoCaseAll(required string regex, required string text) output=false hint="Returns all the matches (case insensitive) of a regular expression within a string. This is simular to reGet(), but more closely matches the result set of reFind." {
	// Collects the position and length of every case-insensitive match of regex in text.
	// Returns a struct with parallel arrays "pos" and "len"; when nothing matches, each
	// array holds a single 0.
	local.hits = [
		"pos": []
		,"len": []
	];
	local.cursor = 1;
	while (true) {
		local.match = reFindNoCase(arguments.regex, arguments.text, local.cursor, true);
		// A zero-length first subexpression means no (further) match: stop scanning.
		if (local.match.len[1] eq 0) {
			break;
		}
		arrayAppend(local.hits.len, local.match.len[1]);
		arrayAppend(local.hits.pos, local.match.pos[1]);
		// Resume right after the match just recorded.
		local.cursor = local.match.pos[1] + local.match.len[1];
	}
	if (!arrayLen(local.hits.len)) {
		arrayAppend(local.hits.len, 0);
		arrayAppend(local.hits.pos, 0);
	}
	return local.hits;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment