BrianLeishman/unhtml.c

## unhtml.c
/*
 * Copyright (c) 2018, brian
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * File:   unhtml.c
 * Author: brian
 *
 * Created on January 31, 2018, 1:02 PM
 */

/*
 * USAGE INSTRUCTIONS:
 *
 * make sure libmysqlclient-dev is installed:
 * apt-get install libmysqlclient-dev
 *
 * Replace "/usr/lib/mysql/plugin" with your MySQL plugins directory (can be found by running "select @@plugin_dir;")
 * gcc -I/usr/include/mysql -o unhtml.so -shared unhtml.c -fPIC && cp unhtml.so /usr/lib/mysql/plugin/unhtml.so
 *
 * Then, on the server:
 * create function`unhtml`returns string soname'unhtml.so';
 *
 * And use/test like:
 * select `unhtml`('<b>I&apos;ll say, it works!</b>');
 *
 * This function is a mixture of the PHP 7 striptags function, an htmlentity decode function
 * found at https://bitbucket.org/cggaertner/cstuff/src/master/entities.c (The PHP one looked far too complicated to port),
 * and the PHP trim function with the default trim options, since there's a lot of potential whitespace
 *
 *
 * Yeet!
 *
 */

#ifdef STANDARD
/* STANDARD is defined, don't use any mysql functions */
#include <string.h>
#ifdef __WIN__
typedef unsigned __int64 ulonglong; /* Microsofts 64 bit types */
typedef __int64 longlong;
#else
typedef unsigned long long ulonglong;
typedef long long longlong;
#endif /*__WIN__*/
#else
#include <my_global.h>
#include <my_sys.h>
#if defined(MYSQL_SERVER)
#include <m_string.h>
#else
/* when compiled as standalone */
#include <string.h>
#endif
#endif
#include <stdint.h>
#include <mysql.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>

#define UNICODE_MAX 0x10FFFFul

#ifdef HAVE_DLOPEN

//https://bitbucket.org/cggaertner/cstuff/src/master/entities.c

static const char *const NAMED_ENTITIES[][2] = {
	{ "AElig;", "Æ"},
	{ "Aacute;", "Á"},
	{ "Acirc;", "Â"},
	{ "Agrave;", "À"},
	{ "Alpha;", "Α"},
	{ "Aring;", "Å"},
	{ "Atilde;", "Ã"},
	{ "Auml;", "Ä"},
	{ "Beta;", "Β"},
	{ "Ccedil;", "Ç"},
	{ "Chi;", "Χ"},
	{ "Dagger;", "‡"},
	{ "Delta;", "Δ"},
	{ "ETH;", "Ð"},
	{ "Eacute;", "É"},
	{ "Ecirc;", "Ê"},
	{ "Egrave;", "È"},
	{ "Epsilon;", "Ε"},
	{ "Eta;", "Η"},
	{ "Euml;", "Ë"},
	{ "Gamma;", "Γ"},
	{ "Iacute;", "Í"},
	{ "Icirc;", "Î"},
	{ "Igrave;", "Ì"},
	{ "Iota;", "Ι"},
	{ "Iuml;", "Ï"},
	{ "Kappa;", "Κ"},
	{ "Lambda;", "Λ"},
	{ "Mu;", "Μ"},
	{ "Ntilde;", "Ñ"},
	{ "Nu;", "Ν"},
	{ "OElig;", "Œ"},
	{ "Oacute;", "Ó"},
	{ "Ocirc;", "Ô"},
	{ "Ograve;", "Ò"},
	{ "Omega;", "Ω"},
	{ "Omicron;", "Ο"},
	{ "Oslash;", "Ø"},
	{ "Otilde;", "Õ"},
	{ "Ouml;", "Ö"},
	{ "Phi;", "Φ"},
	{ "Pi;", "Π"},
	{ "Prime;", "″"},
	{ "Psi;", "Ψ"},
	{ "Rho;", "Ρ"},
	{ "Scaron;", "Š"},
	{ "Sigma;", "Σ"},
	{ "THORN;", "Þ"},
	{ "Tau;", "Τ"},
	{ "Theta;", "Θ"},
	{ "Uacute;", "Ú"},
	{ "Ucirc;", "Û"},
	{ "Ugrave;", "Ù"},
	{ "Upsilon;", "Υ"},
	{ "Uuml;", "Ü"},
	{ "Xi;", "Ξ"},
	{ "Yacute;", "Ý"},
	{ "Yuml;", "Ÿ"},
	{ "Zeta;", "Ζ"},
	{ "aacute;", "á"},
	{ "acirc;", "â"},
	{ "acute;", "´"},
	{ "aelig;", "æ"},
	{ "agrave;", "à"},
	{ "alefsym;", "ℵ"},
	{ "alpha;", "α"},
	{ "amp;", "&"},
	{ "and;", "∧"},
	{ "ang;", "∠"},
	{ "apos;", "'"},
	{ "aring;", "å"},
	{ "asymp;", "≈"},
	{ "atilde;", "ã"},
	{ "auml;", "ä"},
	{ "bdquo;", "„"},
	{ "beta;", "β"},
	{ "brvbar;", "¦"},
	{ "bull;", "•"},
	{ "cap;", "∩"},
	{ "ccedil;", "ç"},
	{ "cedil;", "¸"},
	{ "cent;", "¢"},
	{ "chi;", "χ"},
	{ "circ;", "ˆ"},
	{ "clubs;", "♣"},
	{ "cong;", "≅"},
	{ "copy;", "©"},
	{ "crarr;", "↵"},
	{ "cup;", "∪"},
	{ "curren;", "¤"},
	{ "dArr;", "⇓"},
	{ "dagger;", "†"},
	{ "darr;", "↓"},
	{ "deg;", "°"},
	{ "delta;", "δ"},
	{ "diams;", "♦"},
	{ "divide;", "÷"},
	{ "eacute;", "é"},
	{ "ecirc;", "ê"},
	{ "egrave;", "è"},
	{ "empty;", "∅"},
	{ "emsp;", "\xE2\x80\x83"},
	{ "ensp;", "\xE2\x80\x82"},
	{ "epsilon;", "ε"},
	{ "equiv;", "≡"},
	{ "eta;", "η"},
	{ "eth;", "ð"},
	{ "euml;", "ë"},
	{ "euro;", "€"},
	{ "exist;", "∃"},
	{ "fnof;", "ƒ"},
	{ "forall;", "∀"},
	{ "frac12;", "½"},
	{ "frac14;", "¼"},
	{ "frac34;", "¾"},
	{ "frasl;", "⁄"},
	{ "gamma;", "γ"},
	{ "ge;", "≥"},
	{ "gt;", ">"},
	{ "hArr;", "⇔"},
	{ "harr;", "↔"},
	{ "hearts;", "♥"},
	{ "hellip;", "…"},
	{ "iacute;", "í"},
	{ "icirc;", "î"},
	{ "iexcl;", "¡"},
	{ "igrave;", "ì"},
	{ "image;", "ℑ"},
	{ "infin;", "∞"},
	{ "int;", "∫"},
	{ "iota;", "ι"},
	{ "iquest;", "¿"},
	{ "isin;", "∈"},
	{ "iuml;", "ï"},
	{ "kappa;", "κ"},
	{ "lArr;", "⇐"},
	{ "lambda;", "λ"},
	{ "lang;", "〈"},
	{ "laquo;", "«"},
	{ "larr;", "←"},
	{ "lceil;", "⌈"},
	{ "ldquo;", "“"},
	{ "le;", "≤"},
	{ "lfloor;", "⌊"},
	{ "lowast;", "∗"},
	{ "loz;", "◊"},
	{ "lrm;", "\xE2\x80\x8E"},
	{ "lsaquo;", "‹"},
	{ "lsquo;", "‘"},
	{ "lt;", "<"},
	{ "macr;", "¯"},
	{ "mdash;", "—"},
	{ "micro;", "µ"},
	{ "middot;", "·"},
	{ "minus;", "−"},
	{ "mu;", "μ"},
	{ "nabla;", "∇"},
	{ "nbsp;", "\xC2\xA0"},
	{ "ndash;", "–"},
	{ "ne;", "≠"},
	{ "ni;", "∋"},
	{ "not;", "¬"},
	{ "notin;", "∉"},
	{ "nsub;", "⊄"},
	{ "ntilde;", "ñ"},
	{ "nu;", "ν"},
	{ "oacute;", "ó"},
	{ "ocirc;", "ô"},
	{ "oelig;", "œ"},
	{ "ograve;", "ò"},
	{ "oline;", "‾"},
	{ "omega;", "ω"},
	{ "omicron;", "ο"},
	{ "oplus;", "⊕"},
	{ "or;", "∨"},
	{ "ordf;", "ª"},
	{ "ordm;", "º"},
	{ "oslash;", "ø"},
	{ "otilde;", "õ"},
	{ "otimes;", "⊗"},
	{ "ouml;", "ö"},
	{ "para;", "¶"},
	{ "part;", "∂"},
	{ "permil;", "‰"},
	{ "perp;", "⊥"},
	{ "phi;", "φ"},
	{ "pi;", "π"},
	{ "piv;", "ϖ"},
	{ "plusmn;", "±"},
	{ "pound;", "£"},
	{ "prime;", "′"},
	{ "prod;", "∏"},
	{ "prop;", "∝"},
	{ "psi;", "ψ"},
	{ "quot;", "\""},
	{ "rArr;", "⇒"},
	{ "radic;", "√"},
	{ "rang;", "〉"},
	{ "raquo;", "»"},
	{ "rarr;", "→"},
	{ "rceil;", "⌉"},
	{ "rdquo;", "”"},
	{ "real;", "ℜ"},
	{ "reg;", "®"},
	{ "rfloor;", "⌋"},
	{ "rho;", "ρ"},
	{ "rlm;", "\xE2\x80\x8F"},
	{ "rsaquo;", "›"},
	{ "rsquo;", "’"},
	{ "sbquo;", "‚"},
	{ "scaron;", "š"},
	{ "sdot;", "⋅"},
	{ "sect;", "§"},
	{ "shy;", "\xC2\xAD"},
	{ "sigma;", "σ"},
	{ "sigmaf;", "ς"},
	{ "sim;", "∼"},
	{ "spades;", "♠"},
	{ "sub;", "⊂"},
	{ "sube;", "⊆"},
	{ "sum;", "∑"},
	{ "sup1;", "¹"},
	{ "sup2;", "²"},
	{ "sup3;", "³"},
	{ "sup;", "⊃"},
	{ "supe;", "⊇"},
	{ "szlig;", "ß"},
	{ "tau;", "τ"},
	{ "there4;", "∴"},
	{ "theta;", "θ"},
	{ "thetasym;", "ϑ"},
	{ "thinsp;", "\xE2\x80\x89"},
	{ "thorn;", "þ"},
	{ "tilde;", "˜"},
	{ "times;", "×"},
	{ "trade;", "™"},
	{ "uArr;", "⇑"},
	{ "uacute;", "ú"},
	{ "uarr;", "↑"},
	{ "ucirc;", "û"},
	{ "ugrave;", "ù"},
	{ "uml;", "¨"},
	{ "upsih;", "ϒ"},
	{ "upsilon;", "υ"},
	{ "uuml;", "ü"},
	{ "weierp;", "℘"},
	{ "xi;", "ξ"},
	{ "yacute;", "ý"},
	{ "yen;", "¥"},
	{ "yuml;", "ÿ"},
	{ "zeta;", "ζ"},
	{ "zwj;", "\xE2\x80\x8D"},
	{ "zwnj;", "\xE2\x80\x8C"}
};

static int cmp(const void *key, const void *value) {
	return strncmp((const char *) key, *(const char *const *) value,
			strlen(*(const char *const *) value));
}

static const char *get_named_entity(const char *name) {
	const char *const *entity = (const char *const *) bsearch(name,
			NAMED_ENTITIES, sizeof NAMED_ENTITIES / sizeof *NAMED_ENTITIES,
			sizeof *NAMED_ENTITIES, cmp);

	return entity ? entity[1] : NULL;
}

static size_t putc_utf8(unsigned long cp, char *buffer) {
	unsigned char *bytes = (unsigned char *) buffer;

	if (cp <= 0x007Ful) {
		bytes[0] = (unsigned char) cp;
		return 1;
	}

	if (cp <= 0x07FFul) {
		bytes[1] = (unsigned char) ((2 << 6) | (cp & 0x3F));
		bytes[0] = (unsigned char) ((6 << 5) | (cp >> 6));
		return 2;
	}

	if (cp <= 0xFFFFul) {
		bytes[2] = (unsigned char) ((2 << 6) | (cp & 0x3F));
		bytes[1] = (unsigned char) ((2 << 6) | ((cp >> 6) & 0x3F));
		bytes[0] = (unsigned char) ((14 << 4) | (cp >> 12));
		return 3;
	}

	if (cp <= 0x10FFFFul) {
		bytes[3] = (unsigned char) ((2 << 6) | (cp & 0x3F));
		bytes[2] = (unsigned char) ((2 << 6) | ((cp >> 6) & 0x3F));
		bytes[1] = (unsigned char) ((2 << 6) | ((cp >> 12) & 0x3F));
		bytes[0] = (unsigned char) ((30 << 3) | (cp >> 18));
		return 4;
	}

	return 0;
}

static bool parse_entity(
		const char *current, char **to, const char **from) {
	const char *end = strchr(current, ';');
	if (!end) return 0;

	if (current[1] == '#') {
		char *tail = NULL;
		int errno_save = errno;
		bool hex = current[2] == 'x' || current[2] == 'X';

		errno = 0;
		unsigned long cp = strtoul(
				current + (hex ? 3 : 2), &tail, hex ? 16 : 10);

		bool fail = errno || tail != end || cp > UNICODE_MAX;
		errno = errno_save;
		if (fail) return 0;

		*to += putc_utf8(cp, *to);
		*from = end + 1;

		return 1;
	} else {
		const char *entity = get_named_entity(&current[1]);
		if (!entity) return 0;

		size_t len = strlen(entity);
		memcpy(*to, entity, len);

		*to += len;
		*from = end + 1;

		return 1;
	}
}

size_t decode_html_entities_utf8(char *dest, const char *src) {
	if (!src) src = dest;

	char *to = dest;
	const char *from = src;
	const char *current;

	for (current; (current = strchr(from, '&'));) {
		memmove(to, from, (size_t) (current - from));
		to += current - from;

		if (parse_entity(current, &to, &from))
			continue;

		from = current;
		*to++ = *from++;
	}

	size_t remaining = strlen(from);

	memmove(to, from, remaining);
	to += remaining;
	*to = 0;

	return (size_t) (to - dest);
}

char *unhtml(UDF_INIT *initid, UDF_ARGS *args, char *result, unsigned long *length, char *is_null, char *error);
my_bool unhtml_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
void unhtml_deinit(UDF_INIT *initid);

my_bool unhtml_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
	if (args->arg_count != 1) {
		strcpy(message, "unhtml() requires one argument");
		return 1;
	}

	args->arg_type[0] = STRING_RESULT;

	initid->maybe_null = 1; //can return null

	return 0;
}

char *unhtml(UDF_INIT *initid, UDF_ARGS *args, char *result, unsigned long *length, char *is_null, char *error) {
	if (args->args[0] == NULL) {
		*is_null = 1;
		return 0;
	}

	char *tbuf, *buf, *p, *tp, *rp, c, lc;
	int br, depth = 0, in_q = 0;
	uint8_t state = 0;
	size_t pos, i = 0;
	char *allow_free = NULL;
	const char *allow_actual;
	char is_xml = 0;
	char *rbuf = args->args[0];

	buf = malloc(sizeof (char) * (args->lengths[0] + 1));
	memcpy(buf, args->args[0], args->lengths[0]);
	buf[args->lengths[0]] = '\0';

	c = *buf;
	lc = '\0';
	p = buf;
	rp = rbuf;
	br = 0;
	tbuf = tp = NULL;

	while (i < args->lengths[0]) {
		switch (c) {
			case '\0':
				break;
			case '<':
				if (in_q) {
					break;
				}
				if (isspace(*(p + 1))) {
					goto reg_char;
				}
				if (state == 0) {
					lc = '<';
					state = 1;
				} else if (state == 1) {
					depth++;
				}
				break;

			case '(':
				if (state == 2) {
					if (lc != '"' && lc != '\'') {
						lc = '(';
						br++;
					}
				} else if (state == 0) {
					*(rp++) = c;
				}
				break;

			case ')':
				if (state == 2) {
					if (lc != '"' && lc != '\'') {
						lc = ')';
						br--;
					}
				} else if (state == 0) {
					*(rp++) = c;
				}
				break;

			case '>':
				if (depth) {
					depth--;
					break;
				}

				if (in_q) {
					break;
				}

				switch (state) {
					case 1: /* HTML/XML */
						lc = '>';
						if (is_xml && *(p - 1) == '-') {
							break;
						}
						in_q = state = is_xml = 0;
						break;

					case 2: /* PHP */
						if (!br && lc != '\"' && *(p - 1) == '?') {
							in_q = state = 0;
							tp = tbuf;
						}
						break;

					case 3:
						in_q = state = 0;
						tp = tbuf;
						break;

					case 4: /* JavaScript/CSS/etc... */
						if (p >= buf + 2 && *(p - 1) == '-' && *(p - 2) == '-') {
							in_q = state = 0;
							tp = tbuf;
						}
						break;

					default:
						*(rp++) = c;
						break;
				}
				break;

			case '"':
			case '\'':
				if (state == 4) {
					/* Inside <!-- comment --> */
					break;
				} else if (state == 2 && *(p - 1) != '\\') {
					if (lc == c) {
						lc = '\0';
					} else if (lc != '\\') {
						lc = c;
					}
				} else if (state == 0) {
					*(rp++) = c;
				}
				if (state && p != buf && (state == 1 || *(p - 1) != '\\') && (!in_q || *p == in_q)) {
					if (in_q) {
						in_q = 0;
					} else {
						in_q = *p;
					}
				}
				break;

			case '!':
				/* JavaScript & Other HTML scripting languages */
				if (state == 1 && *(p - 1) == '<') {
					state = 3;
					lc = c;
				} else {
					if (state == 0) {
						*(rp++) = c;
					}
				}
				break;

			case '-':
				if (state == 3 && p >= buf + 2 && *(p - 1) == '-' && *(p - 2) == '!') {
					state = 4;
				} else {
					goto reg_char;
				}
				break;

			case '?':

				if (state == 1 && *(p - 1) == '<') {
					br = 0;
					state = 2;
					break;
				}

			case 'E':
			case 'e':
				/* !DOCTYPE exception */
				if (state == 3 && p > buf + 6
						&& tolower(*(p - 1)) == 'p'
						&& tolower(*(p - 2)) == 'y'
						&& tolower(*(p - 3)) == 't'
						&& tolower(*(p - 4)) == 'c'
						&& tolower(*(p - 5)) == 'o'
						&& tolower(*(p - 6)) == 'd') {
					state = 1;
					break;
				}
				/* fall-through */

			case 'l':
			case 'L':

				/* swm: If we encounter '<?xml' then we shouldn't be in
				 * state == 2 (PHP). Switch back to HTML.
				 */

				if (state == 2 && p > buf + 4 && strncasecmp(p - 4, "<?xm", 4) == 0) {
					state = 1;
					is_xml = 1;
					break;
				}

				/* fall-through */
			default:
				reg_char :
				if (state == 0) {
					*(rp++) = c;
				}
				break;
		}
		c = *(++p);
		i++;
	}
	if (rp < rbuf + args->lengths[0]) {
		*rp = '\0';
	}
	free(buf);

	size_t rbuf_length = decode_html_entities_utf8(rbuf, NULL);

	const char *start = rbuf;
	const char *end = start + strlen(rbuf);

	while (start != end) {
		unsigned char c = (unsigned char) *start;

		if (c <= ' ' &&
				(c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v' || c == '\0')) {
			start++;
		} else {
			break;
		}
	}
	while (start != end) {
		unsigned char c = (unsigned char) *(end - 1);

		if (c <= ' ' &&
				(c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v' || c == '\0')) {
			end--;
		} else {
			break;
		}
	}

	if (rbuf_length == end - start) {
		*length = rbuf_length;
		return rbuf;
	} else if (end - start == 0) {
		*length = 0;
		return "\0";
	} else {
		buf = malloc(sizeof (char) * (end - start + 1));
		memcpy(buf, start, end - start);
		buf[end - start] = '\0';

		*length = end - start;
		//free(rbuf);
		return buf;
	}

}

#endif /* HAVE_DLOPEN */
	/*
	* Copyright (c) 2018, brian
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions are met:
	*
	* * Redistributions of source code must retain the above copyright notice, this
	* list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright notice,
	* this list of conditions and the following disclaimer in the documentation
	* and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	* POSSIBILITY OF SUCH DAMAGE.
	*/

	/*
	* File: unhtml.c
	* Author: brian
	*
	* Created on January 31, 2018, 1:02 PM
	*/

	/*
	* USAGE INSTRUCTIONS:
	*
	* make sure libmysqlclient-dev is installed:
	* apt-get install libmysqlclient-dev
	*
	* Replace "/usr/lib/mysql/plugin" with your MySQL plugins directory (can be found by running "select @@plugin_dir;")
	* gcc -I/usr/include/mysql -o unhtml.so -shared unhtml.c -fPIC && cp unhtml.so /usr/lib/mysql/plugin/unhtml.so
	*
	* Then, on the server:
	* create function`unhtml`returns string soname'unhtml.so';
	*
	* And use/test like:
	* select `unhtml`('<b>I'll say, it works!</b>');
	*
	* This function is a mixture of the PHP 7 striptags function, an htmlentity decode function
	* found at https://bitbucket.org/cggaertner/cstuff/src/master/entities.c (The PHP one looked far too complicated to port),
	* and the PHP trim function with the default trim options, since there's a lot of potential whitespace
	*
	*
	* Yeet!
	*
	*/

	#ifdef STANDARD
	/* STANDARD is defined, don't use any mysql functions */
	#include <string.h>
	#ifdef __WIN__
	typedef unsigned __int64 ulonglong; /* Microsofts 64 bit types */
	typedef __int64 longlong;
	#else
	typedef unsigned long long ulonglong;
	typedef long long longlong;
	#endif /__WIN__/
	#else
	#include <my_global.h>
	#include <my_sys.h>
	#if defined(MYSQL_SERVER)
	#include <m_string.h>
	#else
	/* when compiled as standalone */
	#include <string.h>
	#endif
	#endif
	#include <stdint.h>
	#include <mysql.h>
	#include <stdlib.h>
	#include <stdbool.h>
	#include <errno.h>
	#include <ctype.h>

	#define UNICODE_MAX 0x10FFFFul

	#ifdef HAVE_DLOPEN

	//https://bitbucket.org/cggaertner/cstuff/src/master/entities.c

	static const char *const NAMED_ENTITIES[][2] = {
	{ "AElig;", "Æ"},
	{ "Aacute;", "Á"},
	{ "Acirc;", "Â"},
	{ "Agrave;", "À"},
	{ "Alpha;", "Α"},
	{ "Aring;", "Å"},
	{ "Atilde;", "Ã"},
	{ "Auml;", "Ä"},
	{ "Beta;", "Β"},
	{ "Ccedil;", "Ç"},
	{ "Chi;", "Χ"},
	{ "Dagger;", "‡"},
	{ "Delta;", "Δ"},
	{ "ETH;", "Ð"},
	{ "Eacute;", "É"},
	{ "Ecirc;", "Ê"},
	{ "Egrave;", "È"},
	{ "Epsilon;", "Ε"},
	{ "Eta;", "Η"},
	{ "Euml;", "Ë"},
	{ "Gamma;", "Γ"},
	{ "Iacute;", "Í"},
	{ "Icirc;", "Î"},
	{ "Igrave;", "Ì"},
	{ "Iota;", "Ι"},
	{ "Iuml;", "Ï"},
	{ "Kappa;", "Κ"},
	{ "Lambda;", "Λ"},
	{ "Mu;", "Μ"},
	{ "Ntilde;", "Ñ"},
	{ "Nu;", "Ν"},
	{ "OElig;", "Œ"},
	{ "Oacute;", "Ó"},
	{ "Ocirc;", "Ô"},
	{ "Ograve;", "Ò"},
	{ "Omega;", "Ω"},
	{ "Omicron;", "Ο"},
	{ "Oslash;", "Ø"},
	{ "Otilde;", "Õ"},
	{ "Ouml;", "Ö"},
	{ "Phi;", "Φ"},
	{ "Pi;", "Π"},
	{ "Prime;", "″"},
	{ "Psi;", "Ψ"},
	{ "Rho;", "Ρ"},
	{ "Scaron;", "Š"},
	{ "Sigma;", "Σ"},
	{ "THORN;", "Þ"},
	{ "Tau;", "Τ"},
	{ "Theta;", "Θ"},
	{ "Uacute;", "Ú"},
	{ "Ucirc;", "Û"},
	{ "Ugrave;", "Ù"},
	{ "Upsilon;", "Υ"},
	{ "Uuml;", "Ü"},
	{ "Xi;", "Ξ"},
	{ "Yacute;", "Ý"},
	{ "Yuml;", "Ÿ"},
	{ "Zeta;", "Ζ"},
	{ "aacute;", "á"},
	{ "acirc;", "â"},
	{ "acute;", "´"},
	{ "aelig;", "æ"},
	{ "agrave;", "à"},
	{ "alefsym;", "ℵ"},
	{ "alpha;", "α"},
	{ "amp;", "&"},
	{ "and;", "∧"},
	{ "ang;", "∠"},
	{ "apos;", "'"},
	{ "aring;", "å"},
	{ "asymp;", "≈"},
	{ "atilde;", "ã"},
	{ "auml;", "ä"},
	{ "bdquo;", "„"},
	{ "beta;", "β"},
	{ "brvbar;", "¦"},
	{ "bull;", "•"},
	{ "cap;", "∩"},
	{ "ccedil;", "ç"},
	{ "cedil;", "¸"},
	{ "cent;", "¢"},
	{ "chi;", "χ"},
	{ "circ;", "ˆ"},
	{ "clubs;", "♣"},
	{ "cong;", "≅"},
	{ "copy;", "©"},
	{ "crarr;", "↵"},
	{ "cup;", "∪"},
	{ "curren;", "¤"},
	{ "dArr;", "⇓"},
	{ "dagger;", "†"},
	{ "darr;", "↓"},
	{ "deg;", "°"},
	{ "delta;", "δ"},
	{ "diams;", "♦"},
	{ "divide;", "÷"},
	{ "eacute;", "é"},
	{ "ecirc;", "ê"},
	{ "egrave;", "è"},
	{ "empty;", "∅"},
	{ "emsp;", "\xE2\x80\x83"},
	{ "ensp;", "\xE2\x80\x82"},
	{ "epsilon;", "ε"},
	{ "equiv;", "≡"},
	{ "eta;", "η"},
	{ "eth;", "ð"},
	{ "euml;", "ë"},
	{ "euro;", "€"},
	{ "exist;", "∃"},
	{ "fnof;", "ƒ"},
	{ "forall;", "∀"},
	{ "frac12;", "½"},
	{ "frac14;", "¼"},
	{ "frac34;", "¾"},
	{ "frasl;", "⁄"},
	{ "gamma;", "γ"},
	{ "ge;", "≥"},
	{ "gt;", ">"},
	{ "hArr;", "⇔"},
	{ "harr;", "↔"},
	{ "hearts;", "♥"},
	{ "hellip;", "…"},
	{ "iacute;", "í"},
	{ "icirc;", "î"},
	{ "iexcl;", "¡"},
	{ "igrave;", "ì"},
	{ "image;", "ℑ"},
	{ "infin;", "∞"},
	{ "int;", "∫"},
	{ "iota;", "ι"},
	{ "iquest;", "¿"},
	{ "isin;", "∈"},
	{ "iuml;", "ï"},
	{ "kappa;", "κ"},
	{ "lArr;", "⇐"},
	{ "lambda;", "λ"},
	{ "lang;", "〈"},
	{ "laquo;", "«"},
	{ "larr;", "←"},
	{ "lceil;", "⌈"},
	{ "ldquo;", "“"},
	{ "le;", "≤"},
	{ "lfloor;", "⌊"},
	{ "lowast;", "∗"},
	{ "loz;", "◊"},
	{ "lrm;", "\xE2\x80\x8E"},
	{ "lsaquo;", "‹"},
	{ "lsquo;", "‘"},
	{ "lt;", "<"},
	{ "macr;", "¯"},
	{ "mdash;", "—"},
	{ "micro;", "µ"},
	{ "middot;", "·"},
	{ "minus;", "−"},
	{ "mu;", "μ"},
	{ "nabla;", "∇"},
	{ "nbsp;", "\xC2\xA0"},
	{ "ndash;", "–"},
	{ "ne;", "≠"},
	{ "ni;", "∋"},
	{ "not;", "¬"},
	{ "notin;", "∉"},
	{ "nsub;", "⊄"},
	{ "ntilde;", "ñ"},
	{ "nu;", "ν"},
	{ "oacute;", "ó"},
	{ "ocirc;", "ô"},
	{ "oelig;", "œ"},
	{ "ograve;", "ò"},
	{ "oline;", "‾"},
	{ "omega;", "ω"},
	{ "omicron;", "ο"},
	{ "oplus;", "⊕"},
	{ "or;", "∨"},
	{ "ordf;", "ª"},
	{ "ordm;", "º"},
	{ "oslash;", "ø"},
	{ "otilde;", "õ"},
	{ "otimes;", "⊗"},
	{ "ouml;", "ö"},
	{ "para;", "¶"},
	{ "part;", "∂"},
	{ "permil;", "‰"},
	{ "perp;", "⊥"},
	{ "phi;", "φ"},
	{ "pi;", "π"},
	{ "piv;", "ϖ"},
	{ "plusmn;", "±"},
	{ "pound;", "£"},
	{ "prime;", "′"},
	{ "prod;", "∏"},
	{ "prop;", "∝"},
	{ "psi;", "ψ"},
	{ "quot;", "\""},
	{ "rArr;", "⇒"},
	{ "radic;", "√"},
	{ "rang;", "〉"},
	{ "raquo;", "»"},
	{ "rarr;", "→"},
	{ "rceil;", "⌉"},
	{ "rdquo;", "”"},
	{ "real;", "ℜ"},
	{ "reg;", "®"},
	{ "rfloor;", "⌋"},
	{ "rho;", "ρ"},
	{ "rlm;", "\xE2\x80\x8F"},
	{ "rsaquo;", "›"},
	{ "rsquo;", "’"},
	{ "sbquo;", "‚"},
	{ "scaron;", "š"},
	{ "sdot;", "⋅"},
	{ "sect;", "§"},
	{ "shy;", "\xC2\xAD"},
	{ "sigma;", "σ"},
	{ "sigmaf;", "ς"},
	{ "sim;", "∼"},
	{ "spades;", "♠"},
	{ "sub;", "⊂"},
	{ "sube;", "⊆"},
	{ "sum;", "∑"},
	{ "sup1;", "¹"},
	{ "sup2;", "²"},
	{ "sup3;", "³"},
	{ "sup;", "⊃"},
	{ "supe;", "⊇"},
	{ "szlig;", "ß"},
	{ "tau;", "τ"},
	{ "there4;", "∴"},
	{ "theta;", "θ"},
	{ "thetasym;", "ϑ"},
	{ "thinsp;", "\xE2\x80\x89"},
	{ "thorn;", "þ"},
	{ "tilde;", "˜"},
	{ "times;", "×"},
	{ "trade;", "™"},
	{ "uArr;", "⇑"},
	{ "uacute;", "ú"},
	{ "uarr;", "↑"},
	{ "ucirc;", "û"},
	{ "ugrave;", "ù"},
	{ "uml;", "¨"},
	{ "upsih;", "ϒ"},
	{ "upsilon;", "υ"},
	{ "uuml;", "ü"},
	{ "weierp;", "℘"},
	{ "xi;", "ξ"},
	{ "yacute;", "ý"},
	{ "yen;", "¥"},
	{ "yuml;", "ÿ"},
	{ "zeta;", "ζ"},
	{ "zwj;", "\xE2\x80\x8D"},
	{ "zwnj;", "\xE2\x80\x8C"}
	};

	static int cmp(const void key, const void value) {
	return strncmp((const char ) key, (const char const ) value,
	strlen((const char const *) value));
	}

	static const char get_named_entity(const char name) {
	const char const entity = (const char const ) bsearch(name,
	NAMED_ENTITIES, sizeof NAMED_ENTITIES / sizeof *NAMED_ENTITIES,
	sizeof *NAMED_ENTITIES, cmp);

	return entity ? entity[1] : NULL;
	}

	static size_t putc_utf8(unsigned long cp, char *buffer) {
	unsigned char bytes = (unsigned char ) buffer;

	if (cp <= 0x007Ful) {
	bytes[0] = (unsigned char) cp;
	return 1;
	}

	if (cp <= 0x07FFul) {
	bytes[1] = (unsigned char) ((2 << 6) \| (cp & 0x3F));
	bytes[0] = (unsigned char) ((6 << 5) \| (cp >> 6));
	return 2;
	}

	if (cp <= 0xFFFFul) {
	bytes[2] = (unsigned char) ((2 << 6) \| (cp & 0x3F));
	bytes[1] = (unsigned char) ((2 << 6) \| ((cp >> 6) & 0x3F));
	bytes[0] = (unsigned char) ((14 << 4) \| (cp >> 12));
	return 3;
	}

	if (cp <= 0x10FFFFul) {
	bytes[3] = (unsigned char) ((2 << 6) \| (cp & 0x3F));
	bytes[2] = (unsigned char) ((2 << 6) \| ((cp >> 6) & 0x3F));
	bytes[1] = (unsigned char) ((2 << 6) \| ((cp >> 12) & 0x3F));
	bytes[0] = (unsigned char) ((30 << 3) \| (cp >> 18));
	return 4;
	}

	return 0;
	}

	static bool parse_entity(
	const char current, char to, const char *from) {
	const char *end = strchr(current, ';');
	if (!end) return 0;

	if (current[1] == '#') {
	char *tail = NULL;
	int errno_save = errno;
	bool hex = current[2] == 'x' \|\| current[2] == 'X';

	errno = 0;
	unsigned long cp = strtoul(
	current + (hex ? 3 : 2), &tail, hex ? 16 : 10);

	bool fail = errno \|\| tail != end \|\| cp > UNICODE_MAX;
	errno = errno_save;
	if (fail) return 0;

	to += putc_utf8(cp, to);
	*from = end + 1;

	return 1;
	} else {
	const char *entity = get_named_entity(&current[1]);
	if (!entity) return 0;

	size_t len = strlen(entity);
	memcpy(*to, entity, len);

	*to += len;
	*from = end + 1;

	return 1;
	}
	}

	size_t decode_html_entities_utf8(char dest, const char src) {
	if (!src) src = dest;

	char *to = dest;
	const char *from = src;
	const char *current;

	for (current; (current = strchr(from, '&'));) {
	memmove(to, from, (size_t) (current - from));
	to += current - from;

	if (parse_entity(current, &to, &from))
	continue;

	from = current;
	to++ = from++;
	}

	size_t remaining = strlen(from);

	memmove(to, from, remaining);
	to += remaining;
	*to = 0;

	return (size_t) (to - dest);
	}

	char unhtml(UDF_INIT initid, UDF_ARGS args, char result, unsigned long length, char is_null, char *error);
	my_bool unhtml_init(UDF_INIT initid, UDF_ARGS args, char *message);
	void unhtml_deinit(UDF_INIT *initid);

	my_bool unhtml_init(UDF_INIT initid, UDF_ARGS args, char *message) {
	if (args->arg_count != 1) {
	strcpy(message, "unhtml() requires one argument");
	return 1;
	}

	args->arg_type[0] = STRING_RESULT;

	initid->maybe_null = 1; //can return null

	return 0;
	}

	char unhtml(UDF_INIT initid, UDF_ARGS args, char result, unsigned long length, char is_null, char *error) {
	if (args->args[0] == NULL) {
	*is_null = 1;
	return 0;
	}

	char tbuf, buf, p, tp, *rp, c, lc;
	int br, depth = 0, in_q = 0;
	uint8_t state = 0;
	size_t pos, i = 0;
	char *allow_free = NULL;
	const char *allow_actual;
	char is_xml = 0;
	char *rbuf = args->args[0];

	buf = malloc(sizeof (char) * (args->lengths[0] + 1));
	memcpy(buf, args->args[0], args->lengths[0]);
	buf[args->lengths[0]] = '\0';

	c = *buf;
	lc = '\0';
	p = buf;
	rp = rbuf;
	br = 0;
	tbuf = tp = NULL;

	while (i < args->lengths[0]) {
	switch (c) {
	case '\0':
	break;
	case '<':
	if (in_q) {
	break;
	}
	if (isspace(*(p + 1))) {
	goto reg_char;
	}
	if (state == 0) {
	lc = '<';
	state = 1;
	} else if (state == 1) {
	depth++;
	}
	break;

	case '(':
	if (state == 2) {
	if (lc != '"' && lc != '\'') {
	lc = '(';
	br++;
	}
	} else if (state == 0) {
	*(rp++) = c;
	}
	break;

	case ')':
	if (state == 2) {
	if (lc != '"' && lc != '\'') {
	lc = ')';
	br--;
	}
	} else if (state == 0) {
	*(rp++) = c;
	}
	break;

	case '>':
	if (depth) {
	depth--;
	break;
	}

	if (in_q) {
	break;
	}

	switch (state) {
	case 1: /* HTML/XML */
	lc = '>';
	if (is_xml && *(p - 1) == '-') {
	break;
	}
	in_q = state = is_xml = 0;
	break;

	case 2: /* PHP */
	if (!br && lc != '\"' && *(p - 1) == '?') {
	in_q = state = 0;
	tp = tbuf;
	}
	break;

	case 3:
	in_q = state = 0;
	tp = tbuf;
	break;

	case 4: /* JavaScript/CSS/etc... */
	if (p >= buf + 2 && (p - 1) == '-' && (p - 2) == '-') {
	in_q = state = 0;
	tp = tbuf;
	}
	break;

	default:
	*(rp++) = c;
	break;
	}
	break;

	case '"':
	case '\'':
	if (state == 4) {
	/* Inside <!-- comment --> */
	break;
	} else if (state == 2 && *(p - 1) != '\\') {
	if (lc == c) {
	lc = '\0';
	} else if (lc != '\\') {
	lc = c;
	}
	} else if (state == 0) {
	*(rp++) = c;
	}
	if (state && p != buf && (state == 1 \|\| (p - 1) != '\\') && (!in_q \|\| p == in_q)) {
	if (in_q) {
	in_q = 0;
	} else {
	in_q = *p;
	}
	}
	break;

	case '!':
	/* JavaScript & Other HTML scripting languages */
	if (state == 1 && *(p - 1) == '<') {
	state = 3;
	lc = c;
	} else {
	if (state == 0) {
	*(rp++) = c;
	}
	}
	break;

	case '-':
	if (state == 3 && p >= buf + 2 && (p - 1) == '-' && (p - 2) == '!') {
	state = 4;
	} else {
	goto reg_char;
	}
	break;

	case '?':

	if (state == 1 && *(p - 1) == '<') {
	br = 0;
	state = 2;
	break;
	}

	case 'E':
	case 'e':
	/* !DOCTYPE exception */
	if (state == 3 && p > buf + 6
	&& tolower(*(p - 1)) == 'p'
	&& tolower(*(p - 2)) == 'y'
	&& tolower(*(p - 3)) == 't'
	&& tolower(*(p - 4)) == 'c'
	&& tolower(*(p - 5)) == 'o'
	&& tolower(*(p - 6)) == 'd') {
	state = 1;
	break;
	}
	/* fall-through */

	case 'l':
	case 'L':

	/* swm: If we encounter '<?xml' then we shouldn't be in
	* state == 2 (PHP). Switch back to HTML.
	*/

	if (state == 2 && p > buf + 4 && strncasecmp(p - 4, "<?xm", 4) == 0) {
	state = 1;
	is_xml = 1;
	break;
	}

	/* fall-through */
	default:
	reg_char :
	if (state == 0) {
	*(rp++) = c;
	}
	break;
	}
	c = *(++p);
	i++;
	}
	if (rp < rbuf + args->lengths[0]) {
	*rp = '\0';
	}
	free(buf);

	size_t rbuf_length = decode_html_entities_utf8(rbuf, NULL);

	const char *start = rbuf;
	const char *end = start + strlen(rbuf);

	while (start != end) {
	unsigned char c = (unsigned char) *start;

	if (c <= ' ' &&
	(c == ' ' \|\| c == '\n' \|\| c == '\r' \|\| c == '\t' \|\| c == '\v' \|\| c == '\0')) {
	start++;
	} else {
	break;
	}
	}
	while (start != end) {
	unsigned char c = (unsigned char) *(end - 1);

	if (c <= ' ' &&
	(c == ' ' \|\| c == '\n' \|\| c == '\r' \|\| c == '\t' \|\| c == '\v' \|\| c == '\0')) {
	end--;
	} else {
	break;
	}
	}

	if (rbuf_length == end - start) {
	*length = rbuf_length;
	return rbuf;
	} else if (end - start == 0) {
	*length = 0;
	return "\0";
	} else {
	buf = malloc(sizeof (char) * (end - start + 1));
	memcpy(buf, start, end - start);
	buf[end - start] = '\0';

	*length = end - start;
	//free(rbuf);
	return buf;
	}

	}

	#endif /* HAVE_DLOPEN */