JamoCA/whitespace.cfc

## whitespace.cfc
component displayname="whitespace" output="false" hint="Identifies and performs trim functions on white space-related characters" {
	/*
	author:		James Moberg <james@ssmedia.com>
	date:		2019-01-07
	Description:	Removes all whitespace-related characters (e.g., Zero-Width Spaces (ZWSPs)) from a string... not just characters below U+0020.
				.NET Trim() really trims a string - also trimming non-breaking-spaces.   This is not the case in Java.
				http://www.henrikbrinch.dk/Blog/2013/02/28/java-net-string-gotchas-of-the-day/
				https://stackoverflow.com/a/4731164/693068
				https://stackoverflow.com/a/4307261/693068


				The String.trim() method talks about "whitespace", but defines this in a very precise but rather crude and idiosyncratic way - it simply regards anything up to and including U+0020 (the usual space character) as whitespace, and anything above that as non-whitespace.
				https://closingbraces.net/2008/11/11/javastringtrim/
				charcodes
				https://www.fileformat.info/info/unicode/char/0001/index.htm  (substitute 0001 with the entry's hex value, spaceData[key].h)
	Reason:
				https://thehackernews.com/2019/01/phishing-zero-width-spaces.html

	Used:
				"NUL" = {n="Null char", d=0, h="0000", tags="join,unsafe"},
	GIST: https://gist.github.com/JamoCA/68673cbac81d5924c80754c3fe4effcf
	Blog: https://dev.to/gamesover/filtering-zero-width-spaces-zwsps-using-coldfusion-122c
	Tweet: https://twitter.com/gamesover/status/1704168951960519058

	*/

	/* 20191018 https://gist.github.com/JamoCA/42c3be286185aff0476d5888f0a819ff */
	// Ordered struct keyed by short code. Each entry holds:
	//   n    = display name
	//   d    = decimal code point
	//   h    = four-digit hex code point
	//   tags = comma list combining space|join with safe|unsafe (used by the tag filters)
	variables.spaceData = [
		"SOH": ["n":"Start of Heading", "d":1, "h":"0001", "tags":"space,unsafe"],
		"STX": ["n":"Start of Text", "d":2, "h":"0002", "tags":"space,unsafe"],
		"ETX": ["n":"End of Text", "d":3, "h":"0003", "tags":"space,unsafe"],
		"EOT": ["n":"End of Transmission", "d":4, "h":"0004", "tags":"space,unsafe"],
		"ENQ": ["n":"Enquiry", "d":5, "h":"0005", "tags":"space,unsafe"],
		"ACK": ["n":"Acknowledgment", "d":6, "h":"0006", "tags":"space,unsafe"],
		"BEL": ["n":"Bell", "d":7, "h":"0007", "tags":"space,unsafe"],
		"BS": ["n":"Back Space", "d":8, "h":"0008", "tags":"space,unsafe"],
		"HT": ["n":"Horizontal Tab", "d":9, "h":"0009", "tags":"space,unsafe"],
		"LF": ["n":"Line Feed", "d":10, "h":"000A", "tags":"space,safe"],
		"VT": ["n":"Vertical Tab", "d":11, "h":"000B", "tags":"space,unsafe"],
		"FF": ["n":"Form Feed", "d":12, "h":"000C", "tags":"space,unsafe"],
		"CR": ["n":"Carriage Return", "d":13, "h":"000D", "tags":"space,safe"],
		"SO": ["n":"Shift Out / X-On", "d":14, "h":"000E", "tags":"space,unsafe"],
		"SI": ["n":"Shift In / X-Off", "d":15, "h":"000F", "tags":"space,unsafe"],
		"DLE": ["n":"Data Line Escape", "d":16, "h":"0010", "tags":"space,unsafe"],
		"DC1": ["n":"Device Control 1 (oft. XON)", "d":17, "h":"0011", "tags":"space,unsafe"],
		"DC2": ["n":"Device Control 2", "d":18, "h":"0012", "tags":"space,unsafe"],
		"DC3": ["n":"Device Control 3 (oft. XOFF)", "d":19, "h":"0013", "tags":"space,unsafe"],
		"DC4": ["n":"Device Control 4", "d":20, "h":"0014", "tags":"space,unsafe"],
		"NAK": ["n":"Negative Acknowledgement", "d":21, "h":"0015", "tags":"space,unsafe"],
		"SYN": ["n":"Synchronous Idle", "d":22, "h":"0016", "tags":"space,unsafe"],
		"ETB": ["n":"End of Transmit Block", "d":23, "h":"0017", "tags":"space,unsafe"],
		"CAN": ["n":"Cancel", "d":24, "h":"0018", "tags":"space,unsafe"],
		"EM": ["n":"End of Medium", "d":25, "h":"0019", "tags":"space,unsafe"],
		"SUB": ["n":"Substitute", "d":26, "h":"001A", "tags":"space,unsafe"],
		"ESC": ["n":"Escape", "d":27, "h":"001B", "tags":"space,unsafe"],
		"FS": ["n":"File Separator", "d":28, "h":"001C", "tags":"space,unsafe"],
		"GS": ["n":"Group Separator", "d":29, "h":"001D", "tags":"space,unsafe"],
		"RS": ["n":"Record Separator", "d":30, "h":"001E", "tags":"space,unsafe"],
		"US": ["n":"Unit Separator", "d":31, "h":"001F", "tags":"space,unsafe"],
		"SP": ["n":"Space", "d":32, "h":"0020", "tags":"space,safe"],
		"NEL": ["n":"next line", "d":133, "h":"0085", "tags":"space,unsafe"],
		"NBSP": ["n":"no-breaking space", "d":160, "h":"00A0", "tags":"space,unsafe"],
		"OGHAM": ["n":"OGHAM Space Mark", "d":5760, "h":"1680", "tags":"space,unsafe"],
		"MONGOLIAN": ["n":"Mongolian Vowel Separator", "d":6158, "h":"180E", "tags":"space,unsafe"],
		"ENQUAD": ["n":"EN Quad", "d":8192, "h":"2000", "tags":"space,unsafe"],
		"EMQUAD": ["n":"EM Quad", "d":8193, "h":"2001", "tags":"space,unsafe"],
		"ENSP": ["n":"EN Space", "d":8194, "h":"2002", "tags":"space,unsafe"],
		"EMSP": ["n":"EM Space", "d":8195, "h":"2003", "tags":"space,unsafe"],
		"THREE-PER:M SPACE": ["n":"Thick Space", "d":8196, "h":"2004", "tags":"space,unsafe"],
		"FOUR-PER:M SPACE": ["n":"Mid Space", "d":8197, "h":"2005", "tags":"space,unsafe"],
		"SIX-PER:M SPACE": ["n":"Six-per-EM Space", "d":8198, "h":"2006", "tags":"space,unsafe"],
		"FGMSP": ["n":"Figure Space", "d":8199, "h":"2007", "tags":"space,unsafe"],
		"PUNSP": ["n":"Punctuation Space", "d":8200, "h":"2008", "tags":"space,unsafe"],
		"THINSPACE": ["n":"Thin Space", "d":8201, "h":"2009", "tags":"space,unsafe"],
		"HAIRSPACE": ["n":"Hair Space", "d":8202, "h":"200A", "tags":"space,unsafe"],
		"ZWSP": ["n":"zero-width space", "d":8203, "h":"200B", "tags":"space,unsafe"],
		"ZWNJ": ["n":"zero-width non-joiner", "d":8204, "h":"200C", "tags":"join,unsafe"],
		"ZWJ": ["n":"zero-width joiner", "d":8205, "h":"200D", "tags":"join,unsafe"],
		"LRM": ["n":"left-to-right mark", "d":8206, "h":"200E", "tags":"space,unsafe"],
		"RLM": ["n":"right-to-left mark", "d":8207, "h":"200F", "tags":"space,unsafe"],
		"WJ": ["n":"Word Joiner", "d":8288, "h":"2060", "tags":"join,unsafe"],
		"LINSEP": ["n":"Line Separator", "d":8232, "h":"2028", "tags":"space,unsafe"],
		"PARSEP": ["n":"Paragraph Separator", "d":8233, "h":"2029", "tags":"space,unsafe"],
		"NNBSP": ["n":"Narrow No-Break Space", "d":8239, "h":"202F", "tags":"space,unsafe"],
		"MMASP": ["n":"Medium Mathematical Space", "d":8287, "h":"205F", "tags":"space,unsafe"],
		"SMSP": ["n":"Symbol for Space", "d":9248, "h":"2420", "tags":"space,unsafe"],
		"BLANK": ["n":"Blank Symbol", "d":9250, "h":"2422", "tags":"space,unsafe"],
		"OPENBOX": ["n":"Open Box", "d":9251, "h":"2423", "tags":"space,unsafe"],
		"BB": ["n":"Braille blank pattern", "d":10240, "h":"2800", "tags":"space,unsafe"],
		"IDSP": ["n":"Ideographic Space", "d":12288, "h":"3000", "tags":"space,unsafe"],
		"BOM": ["n":"Zero Width No-Break Space (AKA Byte Order Mark)", "d":65279, "h":"FEFF", "tags":"join,unsafe"],
		"FWDZ": ["n":"Full-Width Digit Zero", "d":65296, "h":"FF10", "tags":"space,unsafe"]
	];

	// tags = list of filters (all,safe,unsafe,space,join)
	public void function dumpRegex(string tags="") output=true hint="performs a CFDump of regex rules" {
		// Build the expression for the requested tag filter, then dump it for inspection.
		local.pattern = getRegex(arguments.tags);
		writedump(var=local.pattern, label="SpacesRegex");
	}

	// tags = list of filters (all,safe,unsafe,space,join)
	public struct function getConfig(string tags="") output=false hint="Lists all rules used when a tag is specified" {
		// Ordered struct: the character table first, then the regex generated for the given tags.
		local.config = [:];
		local.config["spaceData"] = variables.spaceData;
		local.config["regex"] = getRegex(arguments.tags);
		return local.config;
	}

	public boolean function hasWhiteSpace(string inputString="") output=false hint="Checks if string contains any whitespace" {
		// Nothing to search in an empty string.
		if (len(arguments.inputString) eq 0){
			return javacast("boolean", 0);
		}
		// refindnocase yields the position of the first match (0 when there is none); cast that to a boolean.
		local.firstMatch = refindnocase(getRegex('all'), arguments.inputString, 1, false);
		return javacast("boolean", local.firstMatch);
	}

	public any function hasUnsafeSpace(string inputString="") output=false hint="Checks if string contains unsafe-ish whitespace" {
		// Nothing to search in an empty string.
		if (len(arguments.inputString) eq 0){
			return javacast("boolean", 0);
		}
		// Only characters tagged "unsafe" are considered; a non-zero match position becomes true.
		local.firstMatch = refindnocase(getRegex('unsafe'), arguments.inputString, 1, false);
		return javacast("boolean", local.firstMatch);
	}

	public struct function identifyUnsafeSpace(string inputString="", string tags="all") output=false hint="Provides a array of shortcodes, names, decimal or hex values of identified whitespace and their regex positions" {
		// Result: ordered struct keyed by short code, one entry per table character present in the input.
		local.found = [:];

		// Normalise the tag filter; a blank filter means "all".
		local.wanted = ["all"];
		if (len(trim(arguments.tags))) {
			local.wanted = listtoarray(lcase(trim(arguments.tags)));
		}
		local.matchEverything = arrayfind(local.wanted, "all");

		for ( local.code in variables.spaceData ) {
			local.entry = variables.spaceData[local.code];

			if (!local.matchEverything) {
				// Keep the entry when it shares at least one tag with the filter.
				local.common = duplicate(local.wanted);
				local.common.retainAll(listtoarray(duplicate(local.entry.tags)));
				if (!arraylen(local.common)) {
					continue;
				}
			}

			local.symbol = chr(local.entry.d);
			if ( !find(local.symbol, arguments.inputString) || local.found.keyExists("#local.code#") ) {
				continue;
			}

			// Record the character's details together with every position at which it occurs.
			local.hits = reFindNoCaseAll(local.symbol, arguments.inputString);
			local.found["#local.code#"] = [
				"d": javacast("int", local.entry.d),
				"h": javacast("string", local.entry.h),
				"name": javacast("string", local.entry.n),
				"positions": local.hits.pos
			];
		}
		return local.found;
	}

	public string function leftTrim(string inputString="") output=false hint="Performs a left trim and strips all whitespace" {
		// Anchor one or more whitespace alternatives to the start of the string and remove them.
		local.subject = javacast("string", arguments.inputString);
		local.leadingPattern = "^(" & getRegex('all') & ")+";
		return local.subject.replaceAll(local.leadingPattern, "");
	}

	public string function rightTrim(string inputString="") output=false hint="Performs a right trim and strips all whitespace" {
		// Anchor one or more whitespace alternatives to the end of the string and remove them.
		local.subject = javacast("string", arguments.inputString);
		local.trailingPattern = "(" & getRegex('all') & ")+$";
		return local.subject.replaceAll(local.trailingPattern, "");
	}

	public string function fullTrim(string inputString="") output=false hint="Performs a left/right trim and strips all whitespace" {
		local.whitespace = getRegex('all');
		local.subject = javacast("string", arguments.inputString);
		// Strip the trailing run first, then the leading run.
		local.subject = local.subject.replaceAll("(" & local.whitespace & ")+$", "");
		return local.subject.replaceAll("^(" & local.whitespace & ")+", "");
	}

	public string function sanitize(string inputString="") output=false hint="Removes all unsafe whitespace" {
		local.spacePattern = "(" & getRegex('space,unsafe') & ")";
		local.joinPattern = "(" & getRegex('join,unsafe') & ")";
		local.subject = javacast("string", arguments.inputString);
		// Unsafe "space" characters become a plain space; unsafe "join" characters are removed outright.
		local.subject = local.subject.replaceAll(local.spacePattern, " ");
		return local.subject.replaceAll(local.joinPattern, "");
	}

	public string function sanitizeTrim(string inputString="") output=false hint="Performs a left/right trim and strips control characters" {
		// Sanitize first so that characters converted to spaces at either end are trimmed as well.
		local.cleaned = sanitize(arguments.inputString);
		return fullTrim(local.cleaned);
	}

	public string function compressText(string inputString="") output=false hint="Santizes, reduces multiple space characters to a single character" {
		local.lineBreak = "#chr(13)##chr(10)#";
		local.subject = sanitizeTrim(arguments.inputString);
		// Collapse runs of spaces to one space, then runs of CR/LF to a single CRLF.
		local.subject = local.subject.replaceAll(" +", " ");
		return local.subject.replaceAll("[\r\n]+", local.lineBreak);
	}

	/// Level = levels 1, 2 (default) or 3
	public string function compressHtml(string inputHtml="", string level="2") output=false hint="Replaces a huge amount of unnecessary whitespace from your HTML code" {
		// Level 1: trim only. Level 2: compressText plus removal of line breaks followed by extra whitespace.
		// Level 3: compressText plus removal of inter-tag spaces and HTML comments.
		local.outputString = javacast("string", arguments.inputHtml);
		// Any unrecognised level falls back to the default (2).
		arguments.level = (listfind("1,2,3", arguments.level)) ? arguments.level : 2;
		if ( arguments.level eq 3 ) {
			local.outputString = compressText(local.outputString);
			// Java strings are immutable: replaceAll() returns a new string, so the result must be re-assigned.
			local.outputString = local.outputString.replaceAll("> <", "><");
			// (?s) lets "." span line breaks; the lazy match stops at the first "-->", so a ">" inside a comment is handled.
			local.outputString = local.outputString.replaceAll("(?s)<!--.*?-->", "");
		} else if ( arguments.level eq 2 ) {
			local.outputString = compressText(local.outputString);
			local.outputString = local.outputString.replaceAll("(#getRegex('unsafe')#)", "");
			// java.util.regex does not treat [[:space:]] as a POSIX class (it parses as a set of the literal letters), so use \s.
			local.outputString = local.outputString.replaceAll("[\r\n]+\s{2,}", chr(13));
		}
		return fullTrim(local.outputString);
	}

	public string function singleLine(string inputString="") output=false hint="Modifies content to output on a single line (for logging)" {
		// Tab, line feed, form feed, carriage return and no-break space are each mapped to a space element.
		local.targets = "#chr(9)#,#chr(10)#,#chr(12)#,#chr(13)#,#chr(160)#";
		local.substitutes = " , , , , ";
		local.flattened = replacelist(arguments.inputString, local.targets, local.substitutes);
		return fullTrim(local.flattened);
	}

	// tags = list of filters (all,safe|unsafe,space|join; unicode)
	public string function getRegex(string tags="") output=false hint="Generates pipe-delimited REGEX list of whitespace/ZWSP characters. Ex. 'chr(32)|chr(160)'" {
		// Builds a "|"-separated alternation of every table character matching the tag filter.
		// A character is included only when it carries ALL requested tags; a blank filter or "all" includes everything.
		// Adding "unicode" to the list switches the output to escaped \\uXXXX text instead of literal characters and entities.

		// Per-request cache keyed on the arguments, checked before doing any work.
		local.cachekey = "udf_getRegex_#arguments.toString().hashCode()#";
		if (structkeyexists(request, local.cachekey)){
			return request[local.cachekey];
		}

		local.workingFilter = lcase(trim(arguments.tags));
		// "unicode" is an output-format switch rather than a character tag, so drop it before matching tags.
		local.useUnicode = listfind(local.workingFilter, "unicode");
		if (local.useUnicode){
			local.workingFilter = listdeleteat(local.workingFilter, local.useUnicode);
		}
		local.filterTags = listtoarray(local.workingFilter);
		local.all = !len(trim(local.workingFilter)) || listfind(local.workingFilter, "all");

		local.regex = [];
		for (local.char in variables.spaceData){
			local.thisSpace = variables.spaceData[local.char];
			local.includeThis = local.all;
			if (!local.includeThis){
				// Count how many of the requested tags this character carries.
				local.matched = 0;
				for (local.thisTag in local.filterTags){
					if (listfind(local.thisSpace.tags, local.thisTag)){
						local.matched = local.matched + 1;
					}
				}
				local.includeThis = (local.matched gt 0 && local.matched eq arraylen(local.filterTags));
			}
			if (local.includeThis){
				if (local.useUnicode){
					// NOTE(review): CFML has no backslash escapes, so this emits two literal backslashes; as a Java regex
					// that matches the escaped text form (backslash + uXXXX), not the character itself - confirm intended.
					arrayappend(local.regex, "\\u#local.thisSpace.h#");
				} else {
					// The literal character, its decimal HTML entity and its hex HTML entity.
					arrayappend(local.regex, "#chr(local.thisSpace.d)#");
					arrayappend(local.regex, "&###local.thisSpace.d#;");
					// Hex entities are written &#x<hex>; - the previous "&#xf" prefix produced a different code point.
					arrayappend(local.regex, "&##x#lcase(local.thisSpace.h)#;");
				}
			}
		}
		request[local.cachekey] = arraytolist(local.regex, "|");
		return request[local.cachekey];
	}

	/*  4/14/2009   http://www.cflib.org/udf/reFindNoCaseAll
	@author Ben Forta (ben@forta.com)
	@version 1, November 17, 2003
	@version 2, January 7, 2019 - James @ SunStar Media */
	private struct function reFindNoCaseAll(required string regex, required string text) output=false hint="Returns all the matches (case insensitive) of a regular expression within a string. This is simular to reGet(), but more closely matches the result set of reFind." {
		// Parallel arrays: pos[i] is the start of the i-th match, len[i] its length.
		local.matches = [
			"pos": [],
			"len": []
		];
		local.cursor = 1;
		while (true) {
			local.hit = refindnocase(arguments.regex, arguments.text, local.cursor, true);
			// A zero length signals that no further match exists.
			if (local.hit.len[1] eq 0){
				break;
			}
			arrayappend(local.matches.len, local.hit.len[1]);
			arrayappend(local.matches.pos, local.hit.pos[1]);
			// Resume searching immediately after the match just recorded.
			local.cursor = local.hit.pos[1] + local.hit.len[1];
		}
		// Mirror reFind's "no match" shape: a single 0 in each array.
		if (arraylen(local.matches.len) eq 0){
			arrayappend(local.matches.len, 0);
			arrayappend(local.matches.pos, 0);
		}
		return local.matches;
	}

}
	component displayname="whitespace" output="false" hint="Identifies and performs trim functions on white space-related characters" {
	/*
	author: James Moberg <james@ssmedia.com>
	date: 2019-01-07
	Description: Removes all whitespace-related characters (e.g., Zero-Width Spaces (ZWSPs)) from a string... not just characters below U+0020.
	.NET Trim() really trims a string - also trimming non-breaking-spaces. This is not the case in Java.
	http://www.henrikbrinch.dk/Blog/2013/02/28/java-net-string-gotchas-of-the-day/
	https://stackoverflow.com/a/4731164/693068
	https://stackoverflow.com/a/4307261/693068


	The String.trim() method talks about "whitespace", but defines this in a very precise but rather crude and idiosyncratic way - it simply regards anything up to and including U+0020 (the usual space character) as whitespace, and anything above that as non-whitespace.
	https://closingbraces.net/2008/11/11/javastringtrim/
	charcodes
	https://www.fileformat.info/info/unicode/char/0001/index.htm (substitute 0001 with the entry's hex value, spaceData[key].h)
	Reason:
	https://thehackernews.com/2019/01/phishing-zero-width-spaces.html

	Used:
	"NUL" = {n="Null char", d=0, h="0000", tags="join,unsafe"},
	GIST: https://gist.github.com/JamoCA/68673cbac81d5924c80754c3fe4effcf
	Blog: https://dev.to/gamesover/filtering-zero-width-spaces-zwsps-using-coldfusion-122c
	Tweet: https://twitter.com/gamesover/status/1704168951960519058

	*/

	/* 20191018 https://gist.github.com/JamoCA/42c3be286185aff0476d5888f0a819ff */
	// Ordered struct keyed by short code. Each entry holds:
	//   n    = display name
	//   d    = decimal code point
	//   h    = four-digit hex code point
	//   tags = comma list combining space|join with safe|unsafe (used by the tag filters)
	variables.spaceData = [
		"SOH": ["n":"Start of Heading", "d":1, "h":"0001", "tags":"space,unsafe"],
		"STX": ["n":"Start of Text", "d":2, "h":"0002", "tags":"space,unsafe"],
		"ETX": ["n":"End of Text", "d":3, "h":"0003", "tags":"space,unsafe"],
		"EOT": ["n":"End of Transmission", "d":4, "h":"0004", "tags":"space,unsafe"],
		"ENQ": ["n":"Enquiry", "d":5, "h":"0005", "tags":"space,unsafe"],
		"ACK": ["n":"Acknowledgment", "d":6, "h":"0006", "tags":"space,unsafe"],
		"BEL": ["n":"Bell", "d":7, "h":"0007", "tags":"space,unsafe"],
		"BS": ["n":"Back Space", "d":8, "h":"0008", "tags":"space,unsafe"],
		"HT": ["n":"Horizontal Tab", "d":9, "h":"0009", "tags":"space,unsafe"],
		"LF": ["n":"Line Feed", "d":10, "h":"000A", "tags":"space,safe"],
		"VT": ["n":"Vertical Tab", "d":11, "h":"000B", "tags":"space,unsafe"],
		"FF": ["n":"Form Feed", "d":12, "h":"000C", "tags":"space,unsafe"],
		"CR": ["n":"Carriage Return", "d":13, "h":"000D", "tags":"space,safe"],
		"SO": ["n":"Shift Out / X-On", "d":14, "h":"000E", "tags":"space,unsafe"],
		"SI": ["n":"Shift In / X-Off", "d":15, "h":"000F", "tags":"space,unsafe"],
		"DLE": ["n":"Data Line Escape", "d":16, "h":"0010", "tags":"space,unsafe"],
		"DC1": ["n":"Device Control 1 (oft. XON)", "d":17, "h":"0011", "tags":"space,unsafe"],
		"DC2": ["n":"Device Control 2", "d":18, "h":"0012", "tags":"space,unsafe"],
		"DC3": ["n":"Device Control 3 (oft. XOFF)", "d":19, "h":"0013", "tags":"space,unsafe"],
		"DC4": ["n":"Device Control 4", "d":20, "h":"0014", "tags":"space,unsafe"],
		"NAK": ["n":"Negative Acknowledgement", "d":21, "h":"0015", "tags":"space,unsafe"],
		"SYN": ["n":"Synchronous Idle", "d":22, "h":"0016", "tags":"space,unsafe"],
		"ETB": ["n":"End of Transmit Block", "d":23, "h":"0017", "tags":"space,unsafe"],
		"CAN": ["n":"Cancel", "d":24, "h":"0018", "tags":"space,unsafe"],
		"EM": ["n":"End of Medium", "d":25, "h":"0019", "tags":"space,unsafe"],
		"SUB": ["n":"Substitute", "d":26, "h":"001A", "tags":"space,unsafe"],
		"ESC": ["n":"Escape", "d":27, "h":"001B", "tags":"space,unsafe"],
		"FS": ["n":"File Separator", "d":28, "h":"001C", "tags":"space,unsafe"],
		"GS": ["n":"Group Separator", "d":29, "h":"001D", "tags":"space,unsafe"],
		"RS": ["n":"Record Separator", "d":30, "h":"001E", "tags":"space,unsafe"],
		"US": ["n":"Unit Separator", "d":31, "h":"001F", "tags":"space,unsafe"],
		"SP": ["n":"Space", "d":32, "h":"0020", "tags":"space,safe"],
		"NEL": ["n":"next line", "d":133, "h":"0085", "tags":"space,unsafe"],
		"NBSP": ["n":"no-breaking space", "d":160, "h":"00A0", "tags":"space,unsafe"],
		"OGHAM": ["n":"OGHAM Space Mark", "d":5760, "h":"1680", "tags":"space,unsafe"],
		"MONGOLIAN": ["n":"Mongolian Vowel Separator", "d":6158, "h":"180E", "tags":"space,unsafe"],
		"ENQUAD": ["n":"EN Quad", "d":8192, "h":"2000", "tags":"space,unsafe"],
		"EMQUAD": ["n":"EM Quad", "d":8193, "h":"2001", "tags":"space,unsafe"],
		"ENSP": ["n":"EN Space", "d":8194, "h":"2002", "tags":"space,unsafe"],
		"EMSP": ["n":"EM Space", "d":8195, "h":"2003", "tags":"space,unsafe"],
		"THREE-PER:M SPACE": ["n":"Thick Space", "d":8196, "h":"2004", "tags":"space,unsafe"],
		"FOUR-PER:M SPACE": ["n":"Mid Space", "d":8197, "h":"2005", "tags":"space,unsafe"],
		"SIX-PER:M SPACE": ["n":"Six-per-EM Space", "d":8198, "h":"2006", "tags":"space,unsafe"],
		"FGMSP": ["n":"Figure Space", "d":8199, "h":"2007", "tags":"space,unsafe"],
		"PUNSP": ["n":"Punctuation Space", "d":8200, "h":"2008", "tags":"space,unsafe"],
		"THINSPACE": ["n":"Thin Space", "d":8201, "h":"2009", "tags":"space,unsafe"],
		"HAIRSPACE": ["n":"Hair Space", "d":8202, "h":"200A", "tags":"space,unsafe"],
		"ZWSP": ["n":"zero-width space", "d":8203, "h":"200B", "tags":"space,unsafe"],
		"ZWNJ": ["n":"zero-width non-joiner", "d":8204, "h":"200C", "tags":"join,unsafe"],
		"ZWJ": ["n":"zero-width joiner", "d":8205, "h":"200D", "tags":"join,unsafe"],
		"LRM": ["n":"left-to-right mark", "d":8206, "h":"200E", "tags":"space,unsafe"],
		"RLM": ["n":"right-to-left mark", "d":8207, "h":"200F", "tags":"space,unsafe"],
		"WJ": ["n":"Word Joiner", "d":8288, "h":"2060", "tags":"join,unsafe"],
		"LINSEP": ["n":"Line Separator", "d":8232, "h":"2028", "tags":"space,unsafe"],
		"PARSEP": ["n":"Paragraph Separator", "d":8233, "h":"2029", "tags":"space,unsafe"],
		"NNBSP": ["n":"Narrow No-Break Space", "d":8239, "h":"202F", "tags":"space,unsafe"],
		"MMASP": ["n":"Medium Mathematical Space", "d":8287, "h":"205F", "tags":"space,unsafe"],
		"SMSP": ["n":"Symbol for Space", "d":9248, "h":"2420", "tags":"space,unsafe"],
		"BLANK": ["n":"Blank Symbol", "d":9250, "h":"2422", "tags":"space,unsafe"],
		"OPENBOX": ["n":"Open Box", "d":9251, "h":"2423", "tags":"space,unsafe"],
		"BB": ["n":"Braille blank pattern", "d":10240, "h":"2800", "tags":"space,unsafe"],
		"IDSP": ["n":"Ideographic Space", "d":12288, "h":"3000", "tags":"space,unsafe"],
		"BOM": ["n":"Zero Width No-Break Space (AKA Byte Order Mark)", "d":65279, "h":"FEFF", "tags":"join,unsafe"],
		"FWDZ": ["n":"Full-Width Digit Zero", "d":65296, "h":"FF10", "tags":"space,unsafe"]
	];

	// tags = list of filters (all,safe,unsafe,space,join)
	public void function dumpRegex(string tags="") output=true hint="performs a CFDump of regex rules" {
		// Build the expression for the requested tag filter, then dump it for inspection.
		local.pattern = getRegex(arguments.tags);
		writedump(var=local.pattern, label="SpacesRegex");
	}

	// tags = list of filters (all,safe,unsafe,space,join)
	public struct function getConfig(string tags="") output=false hint="Lists all rules used when a tag is specified" {
		// Ordered struct: the character table first, then the regex generated for the given tags.
		local.config = [:];
		local.config["spaceData"] = variables.spaceData;
		local.config["regex"] = getRegex(arguments.tags);
		return local.config;
	}

	public boolean function hasWhiteSpace(string inputString="") output=false hint="Checks if string contains any whitespace" {
		// Nothing to search in an empty string.
		if (len(arguments.inputString) eq 0){
			return javacast("boolean", 0);
		}
		// refindnocase yields the position of the first match (0 when there is none); cast that to a boolean.
		local.firstMatch = refindnocase(getRegex('all'), arguments.inputString, 1, false);
		return javacast("boolean", local.firstMatch);
	}

	public any function hasUnsafeSpace(string inputString="") output=false hint="Checks if string contains unsafe-ish whitespace" {
		// Nothing to search in an empty string.
		if (len(arguments.inputString) eq 0){
			return javacast("boolean", 0);
		}
		// Only characters tagged "unsafe" are considered; a non-zero match position becomes true.
		local.firstMatch = refindnocase(getRegex('unsafe'), arguments.inputString, 1, false);
		return javacast("boolean", local.firstMatch);
	}

	public struct function identifyUnsafeSpace(string inputString="", string tags="all") output=false hint="Provides a array of shortcodes, names, decimal or hex values of identified whitespace and their regex positions" {
		// Result: ordered struct keyed by short code, one entry per table character present in the input.
		local.found = [:];

		// Normalise the tag filter; a blank filter means "all".
		local.wanted = ["all"];
		if (len(trim(arguments.tags))) {
			local.wanted = listtoarray(lcase(trim(arguments.tags)));
		}
		local.matchEverything = arrayfind(local.wanted, "all");

		for ( local.code in variables.spaceData ) {
			local.entry = variables.spaceData[local.code];

			if (!local.matchEverything) {
				// Keep the entry when it shares at least one tag with the filter.
				local.common = duplicate(local.wanted);
				local.common.retainAll(listtoarray(duplicate(local.entry.tags)));
				if (!arraylen(local.common)) {
					continue;
				}
			}

			local.symbol = chr(local.entry.d);
			// Plain "||" here: the backslash-escaped form is not valid CFML.
			if ( !find(local.symbol, arguments.inputString) || local.found.keyExists("#local.code#") ) {
				continue;
			}

			// Record the character's details together with every position at which it occurs.
			local.hits = reFindNoCaseAll(local.symbol, arguments.inputString);
			local.found["#local.code#"] = [
				"d": javacast("int", local.entry.d),
				"h": javacast("string", local.entry.h),
				"name": javacast("string", local.entry.n),
				"positions": local.hits.pos
			];
		}
		return local.found;
	}

	public string function leftTrim(string inputString="") output=false hint="Performs a left trim and strips all whitespace" {
		// Anchor one or more whitespace alternatives to the start of the string and remove them.
		local.subject = javacast("string", arguments.inputString);
		local.leadingPattern = "^(" & getRegex('all') & ")+";
		return local.subject.replaceAll(local.leadingPattern, "");
	}

	public string function rightTrim(string inputString="") output=false hint="Performs a right trim and strips all whitespace" {
		// Anchor one or more whitespace alternatives to the end of the string and remove them.
		local.subject = javacast("string", arguments.inputString);
		local.trailingPattern = "(" & getRegex('all') & ")+$";
		return local.subject.replaceAll(local.trailingPattern, "");
	}

	public string function fullTrim(string inputString="") output=false hint="Performs a left/right trim and strips all whitespace" {
		local.whitespace = getRegex('all');
		local.subject = javacast("string", arguments.inputString);
		// Strip the trailing run first, then the leading run.
		local.subject = local.subject.replaceAll("(" & local.whitespace & ")+$", "");
		return local.subject.replaceAll("^(" & local.whitespace & ")+", "");
	}

	public string function sanitize(string inputString="") output=false hint="Removes all unsafe whitespace" {
		local.spacePattern = "(" & getRegex('space,unsafe') & ")";
		local.joinPattern = "(" & getRegex('join,unsafe') & ")";
		local.subject = javacast("string", arguments.inputString);
		// Unsafe "space" characters become a plain space; unsafe "join" characters are removed outright.
		local.subject = local.subject.replaceAll(local.spacePattern, " ");
		return local.subject.replaceAll(local.joinPattern, "");
	}

	public string function sanitizeTrim(string inputString="") output=false hint="Performs a left/right trim and strips control characters" {
		// Sanitize first so that characters converted to spaces at either end are trimmed as well.
		local.cleaned = sanitize(arguments.inputString);
		return fullTrim(local.cleaned);
	}

	public string function compressText(string inputString="") output=false hint="Santizes, reduces multiple space characters to a single character" {
		local.lineBreak = "#chr(13)##chr(10)#";
		local.subject = sanitizeTrim(arguments.inputString);
		// Collapse runs of spaces to one space, then runs of CR/LF to a single CRLF.
		local.subject = local.subject.replaceAll(" +", " ");
		return local.subject.replaceAll("[\r\n]+", local.lineBreak);
	}

	/// Level = levels 1, 2 (default) or 3
	public string function compressHtml(string inputHtml="", string level="2") output=false hint="Replaces a huge amount of unnecessary whitespace from your HTML code" {
		// Level 1: trim only. Level 2: compressText plus removal of line breaks followed by extra whitespace.
		// Level 3: compressText plus removal of inter-tag spaces and HTML comments.
		local.outputString = javacast("string", arguments.inputHtml);
		// Any unrecognised level falls back to the default (2).
		arguments.level = (listfind("1,2,3", arguments.level)) ? arguments.level : 2;
		if ( arguments.level eq 3 ) {
			local.outputString = compressText(local.outputString);
			// Java strings are immutable: replaceAll() returns a new string, so the result must be re-assigned.
			local.outputString = local.outputString.replaceAll("> <", "><");
			// (?s) lets "." span line breaks; the lazy match stops at the first "-->", so a ">" inside a comment is handled.
			local.outputString = local.outputString.replaceAll("(?s)<!--.*?-->", "");
		} else if ( arguments.level eq 2 ) {
			local.outputString = compressText(local.outputString);
			local.outputString = local.outputString.replaceAll("(#getRegex('unsafe')#)", "");
			// The character class replaces the escaped-pipe alternation; \s replaces [[:space:]], which
			// java.util.regex does not treat as a POSIX class.
			local.outputString = local.outputString.replaceAll("[\r\n]+\s{2,}", chr(13));
		}
		return fullTrim(local.outputString);
	}

	public string function singleLine(string inputString="") output=false hint="Modifies content to output on a single line (for logging)" {
		// Tab, line feed, form feed, carriage return and no-break space are each mapped to a space element.
		local.targets = "#chr(9)#,#chr(10)#,#chr(12)#,#chr(13)#,#chr(160)#";
		local.substitutes = " , , , , ";
		local.flattened = replacelist(arguments.inputString, local.targets, local.substitutes);
		return fullTrim(local.flattened);
	}

	// tags = list of filters (all,safe|unsafe,space|join; unicode)
	public string function getRegex(string tags="") output=false hint="Generates pipe-delimited REGEX list of whitespace/ZWSP characters. Ex. 'chr(32)|chr(160)'" {
		// Builds a "|"-separated alternation of every table character matching the tag filter.
		// A character is included only when it carries ALL requested tags; a blank filter or "all" includes everything.
		// Adding "unicode" to the list switches the output to escaped \\uXXXX text instead of literal characters and entities.

		// Per-request cache keyed on the arguments, checked before doing any work.
		local.cachekey = "udf_getRegex_#arguments.toString().hashCode()#";
		if (structkeyexists(request, local.cachekey)){
			return request[local.cachekey];
		}

		local.workingFilter = lcase(trim(arguments.tags));
		// "unicode" is an output-format switch rather than a character tag, so drop it before matching tags.
		local.useUnicode = listfind(local.workingFilter, "unicode");
		if (local.useUnicode){
			local.workingFilter = listdeleteat(local.workingFilter, local.useUnicode);
		}
		local.filterTags = listtoarray(local.workingFilter);
		local.all = !len(trim(local.workingFilter)) || listfind(local.workingFilter, "all");

		local.regex = [];
		for (local.char in variables.spaceData){
			local.thisSpace = variables.spaceData[local.char];
			local.includeThis = local.all;
			if (!local.includeThis){
				// Count how many of the requested tags this character carries.
				local.matched = 0;
				for (local.thisTag in local.filterTags){
					if (listfind(local.thisSpace.tags, local.thisTag)){
						local.matched = local.matched + 1;
					}
				}
				local.includeThis = (local.matched gt 0 && local.matched eq arraylen(local.filterTags));
			}
			if (local.includeThis){
				if (local.useUnicode){
					// NOTE(review): CFML has no backslash escapes, so this emits two literal backslashes; as a Java regex
					// that matches the escaped text form (backslash + uXXXX), not the character itself - confirm intended.
					arrayappend(local.regex, "\\u#local.thisSpace.h#");
				} else {
					// The literal character, its decimal HTML entity and its hex HTML entity.
					arrayappend(local.regex, "#chr(local.thisSpace.d)#");
					arrayappend(local.regex, "&###local.thisSpace.d#;");
					// Hex entities are written &#x<hex>; - the previous "&#xf" prefix produced a different code point.
					arrayappend(local.regex, "&##x#lcase(local.thisSpace.h)#;");
				}
			}
		}
		// The delimiter must be a bare pipe so the joined list is a valid regex alternation.
		request[local.cachekey] = arraytolist(local.regex, "|");
		return request[local.cachekey];
	}

	/* 4/14/2009 http://www.cflib.org/udf/reFindNoCaseAll
	@author Ben Forta (ben@forta.com)
	@version 1, November 17, 2003
	@version 2, January 7, 2019 - James @ SunStar Media */
	private struct function reFindNoCaseAll(required string regex, required string text) output=false hint="Returns all the matches (case insensitive) of a regular expression within a string. This is simular to reGet(), but more closely matches the result set of reFind." {
		// Parallel arrays: pos[i] is the start of the i-th match, len[i] its length.
		local.matches = [
			"pos": [],
			"len": []
		];
		local.cursor = 1;
		while (true) {
			local.hit = refindnocase(arguments.regex, arguments.text, local.cursor, true);
			// A zero length signals that no further match exists.
			if (local.hit.len[1] eq 0){
				break;
			}
			arrayappend(local.matches.len, local.hit.len[1]);
			arrayappend(local.matches.pos, local.hit.pos[1]);
			// Resume searching immediately after the match just recorded.
			local.cursor = local.hit.pos[1] + local.hit.len[1];
		}
		// Mirror reFind's "no match" shape: a single 0 in each array.
		if (arraylen(local.matches.len) eq 0){
			arrayappend(local.matches.len, 0);
			arrayappend(local.matches.pos, 0);
		}
		return local.matches;
	}

	}