Skip to content

Instantly share code, notes, and snippets.

@BrianLeishman
Created February 4, 2018 16:37
Show Gist options
  • Save BrianLeishman/da6edb36ea2989b2ded533087e9d1511 to your computer and use it in GitHub Desktop.
Save BrianLeishman/da6edb36ea2989b2ded533087e9d1511 to your computer and use it in GitHub Desktop.
MySQL `unhtml` UDF for converting HTML to plain text, removing tags and converting HTML entities
/*
* Copyright (c) 2018, brian
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* File: unhtml.c
* Author: brian
*
* Created on January 31, 2018, 1:02 PM
*/
/*
* USAGE INSTRUCTIONS:
*
* make sure libmysqlclient-dev is installed:
* apt-get install libmysqlclient-dev
*
* Replace "/usr/lib/mysql/plugin" with your MySQL plugins directory (can be found by running "select @@plugin_dir;")
* gcc -I/usr/include/mysql -o unhtml.so -shared unhtml.c -fPIC && cp unhtml.so /usr/lib/mysql/plugin/unhtml.so
*
* Then, on the server:
* create function`unhtml`returns string soname'unhtml.so';
*
* And use/test like:
* select `unhtml`('<b>I&apos;ll say, it works!</b>');
*
* This function is a mixture of the PHP 7 striptags function, an htmlentity decode function
* found at https://bitbucket.org/cggaertner/cstuff/src/master/entities.c (The PHP one looked far too complicated to port),
* and the PHP trim function with the default trim options, since there's a lot of potential whitespace
*
*
* Yeet!
*
*/
#ifdef STANDARD
/* STANDARD is defined, don't use any mysql functions */
#include <string.h>
#ifdef __WIN__
typedef unsigned __int64 ulonglong; /* Microsofts 64 bit types */
typedef __int64 longlong;
#else
typedef unsigned long long ulonglong;
typedef long long longlong;
#endif /*__WIN__*/
#else
#include <my_global.h>
#include <my_sys.h>
#if defined(MYSQL_SERVER)
#include <m_string.h>
#else
/* when compiled as standalone */
#include <string.h>
#endif
#endif
#include <stdint.h>
#include <mysql.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include <ctype.h>
#define UNICODE_MAX 0x10FFFFul
#ifdef HAVE_DLOPEN
//https://bitbucket.org/cggaertner/cstuff/src/master/entities.c
static const char *const NAMED_ENTITIES[][2] = {
{ "AElig;", "Æ"},
{ "Aacute;", "Á"},
{ "Acirc;", "Â"},
{ "Agrave;", "À"},
{ "Alpha;", "Α"},
{ "Aring;", "Å"},
{ "Atilde;", "Ã"},
{ "Auml;", "Ä"},
{ "Beta;", "Β"},
{ "Ccedil;", "Ç"},
{ "Chi;", "Χ"},
{ "Dagger;", "‡"},
{ "Delta;", "Δ"},
{ "ETH;", "Ð"},
{ "Eacute;", "É"},
{ "Ecirc;", "Ê"},
{ "Egrave;", "È"},
{ "Epsilon;", "Ε"},
{ "Eta;", "Η"},
{ "Euml;", "Ë"},
{ "Gamma;", "Γ"},
{ "Iacute;", "Í"},
{ "Icirc;", "Î"},
{ "Igrave;", "Ì"},
{ "Iota;", "Ι"},
{ "Iuml;", "Ï"},
{ "Kappa;", "Κ"},
{ "Lambda;", "Λ"},
{ "Mu;", "Μ"},
{ "Ntilde;", "Ñ"},
{ "Nu;", "Ν"},
{ "OElig;", "Œ"},
{ "Oacute;", "Ó"},
{ "Ocirc;", "Ô"},
{ "Ograve;", "Ò"},
{ "Omega;", "Ω"},
{ "Omicron;", "Ο"},
{ "Oslash;", "Ø"},
{ "Otilde;", "Õ"},
{ "Ouml;", "Ö"},
{ "Phi;", "Φ"},
{ "Pi;", "Π"},
{ "Prime;", "″"},
{ "Psi;", "Ψ"},
{ "Rho;", "Ρ"},
{ "Scaron;", "Š"},
{ "Sigma;", "Σ"},
{ "THORN;", "Þ"},
{ "Tau;", "Τ"},
{ "Theta;", "Θ"},
{ "Uacute;", "Ú"},
{ "Ucirc;", "Û"},
{ "Ugrave;", "Ù"},
{ "Upsilon;", "Υ"},
{ "Uuml;", "Ü"},
{ "Xi;", "Ξ"},
{ "Yacute;", "Ý"},
{ "Yuml;", "Ÿ"},
{ "Zeta;", "Ζ"},
{ "aacute;", "á"},
{ "acirc;", "â"},
{ "acute;", "´"},
{ "aelig;", "æ"},
{ "agrave;", "à"},
{ "alefsym;", "ℵ"},
{ "alpha;", "α"},
{ "amp;", "&"},
{ "and;", "∧"},
{ "ang;", "∠"},
{ "apos;", "'"},
{ "aring;", "å"},
{ "asymp;", "≈"},
{ "atilde;", "ã"},
{ "auml;", "ä"},
{ "bdquo;", "„"},
{ "beta;", "β"},
{ "brvbar;", "¦"},
{ "bull;", "•"},
{ "cap;", "∩"},
{ "ccedil;", "ç"},
{ "cedil;", "¸"},
{ "cent;", "¢"},
{ "chi;", "χ"},
{ "circ;", "ˆ"},
{ "clubs;", "♣"},
{ "cong;", "≅"},
{ "copy;", "©"},
{ "crarr;", "↵"},
{ "cup;", "∪"},
{ "curren;", "¤"},
{ "dArr;", "⇓"},
{ "dagger;", "†"},
{ "darr;", "↓"},
{ "deg;", "°"},
{ "delta;", "δ"},
{ "diams;", "♦"},
{ "divide;", "÷"},
{ "eacute;", "é"},
{ "ecirc;", "ê"},
{ "egrave;", "è"},
{ "empty;", "∅"},
{ "emsp;", "\xE2\x80\x83"},
{ "ensp;", "\xE2\x80\x82"},
{ "epsilon;", "ε"},
{ "equiv;", "≡"},
{ "eta;", "η"},
{ "eth;", "ð"},
{ "euml;", "ë"},
{ "euro;", "€"},
{ "exist;", "∃"},
{ "fnof;", "ƒ"},
{ "forall;", "∀"},
{ "frac12;", "½"},
{ "frac14;", "¼"},
{ "frac34;", "¾"},
{ "frasl;", "⁄"},
{ "gamma;", "γ"},
{ "ge;", "≥"},
{ "gt;", ">"},
{ "hArr;", "⇔"},
{ "harr;", "↔"},
{ "hearts;", "♥"},
{ "hellip;", "…"},
{ "iacute;", "í"},
{ "icirc;", "î"},
{ "iexcl;", "¡"},
{ "igrave;", "ì"},
{ "image;", "ℑ"},
{ "infin;", "∞"},
{ "int;", "∫"},
{ "iota;", "ι"},
{ "iquest;", "¿"},
{ "isin;", "∈"},
{ "iuml;", "ï"},
{ "kappa;", "κ"},
{ "lArr;", "⇐"},
{ "lambda;", "λ"},
{ "lang;", "〈"},
{ "laquo;", "«"},
{ "larr;", "←"},
{ "lceil;", "⌈"},
{ "ldquo;", "“"},
{ "le;", "≤"},
{ "lfloor;", "⌊"},
{ "lowast;", "∗"},
{ "loz;", "◊"},
{ "lrm;", "\xE2\x80\x8E"},
{ "lsaquo;", "‹"},
{ "lsquo;", "‘"},
{ "lt;", "<"},
{ "macr;", "¯"},
{ "mdash;", "—"},
{ "micro;", "µ"},
{ "middot;", "·"},
{ "minus;", "−"},
{ "mu;", "μ"},
{ "nabla;", "∇"},
{ "nbsp;", "\xC2\xA0"},
{ "ndash;", "–"},
{ "ne;", "≠"},
{ "ni;", "∋"},
{ "not;", "¬"},
{ "notin;", "∉"},
{ "nsub;", "⊄"},
{ "ntilde;", "ñ"},
{ "nu;", "ν"},
{ "oacute;", "ó"},
{ "ocirc;", "ô"},
{ "oelig;", "œ"},
{ "ograve;", "ò"},
{ "oline;", "‾"},
{ "omega;", "ω"},
{ "omicron;", "ο"},
{ "oplus;", "⊕"},
{ "or;", "∨"},
{ "ordf;", "ª"},
{ "ordm;", "º"},
{ "oslash;", "ø"},
{ "otilde;", "õ"},
{ "otimes;", "⊗"},
{ "ouml;", "ö"},
{ "para;", "¶"},
{ "part;", "∂"},
{ "permil;", "‰"},
{ "perp;", "⊥"},
{ "phi;", "φ"},
{ "pi;", "π"},
{ "piv;", "ϖ"},
{ "plusmn;", "±"},
{ "pound;", "£"},
{ "prime;", "′"},
{ "prod;", "∏"},
{ "prop;", "∝"},
{ "psi;", "ψ"},
{ "quot;", "\""},
{ "rArr;", "⇒"},
{ "radic;", "√"},
{ "rang;", "〉"},
{ "raquo;", "»"},
{ "rarr;", "→"},
{ "rceil;", "⌉"},
{ "rdquo;", "”"},
{ "real;", "ℜ"},
{ "reg;", "®"},
{ "rfloor;", "⌋"},
{ "rho;", "ρ"},
{ "rlm;", "\xE2\x80\x8F"},
{ "rsaquo;", "›"},
{ "rsquo;", "’"},
{ "sbquo;", "‚"},
{ "scaron;", "š"},
{ "sdot;", "⋅"},
{ "sect;", "§"},
{ "shy;", "\xC2\xAD"},
{ "sigma;", "σ"},
{ "sigmaf;", "ς"},
{ "sim;", "∼"},
{ "spades;", "♠"},
{ "sub;", "⊂"},
{ "sube;", "⊆"},
{ "sum;", "∑"},
{ "sup1;", "¹"},
{ "sup2;", "²"},
{ "sup3;", "³"},
{ "sup;", "⊃"},
{ "supe;", "⊇"},
{ "szlig;", "ß"},
{ "tau;", "τ"},
{ "there4;", "∴"},
{ "theta;", "θ"},
{ "thetasym;", "ϑ"},
{ "thinsp;", "\xE2\x80\x89"},
{ "thorn;", "þ"},
{ "tilde;", "˜"},
{ "times;", "×"},
{ "trade;", "™"},
{ "uArr;", "⇑"},
{ "uacute;", "ú"},
{ "uarr;", "↑"},
{ "ucirc;", "û"},
{ "ugrave;", "ù"},
{ "uml;", "¨"},
{ "upsih;", "ϒ"},
{ "upsilon;", "υ"},
{ "uuml;", "ü"},
{ "weierp;", "℘"},
{ "xi;", "ξ"},
{ "yacute;", "ý"},
{ "yen;", "¥"},
{ "yuml;", "ÿ"},
{ "zeta;", "ζ"},
{ "zwj;", "\xE2\x80\x8D"},
{ "zwnj;", "\xE2\x80\x8C"}
};
static int cmp(const void *key, const void *value) {
return strncmp((const char *) key, *(const char *const *) value,
strlen(*(const char *const *) value));
}
static const char *get_named_entity(const char *name) {
const char *const *entity = (const char *const *) bsearch(name,
NAMED_ENTITIES, sizeof NAMED_ENTITIES / sizeof *NAMED_ENTITIES,
sizeof *NAMED_ENTITIES, cmp);
return entity ? entity[1] : NULL;
}
static size_t putc_utf8(unsigned long cp, char *buffer) {
unsigned char *bytes = (unsigned char *) buffer;
if (cp <= 0x007Ful) {
bytes[0] = (unsigned char) cp;
return 1;
}
if (cp <= 0x07FFul) {
bytes[1] = (unsigned char) ((2 << 6) | (cp & 0x3F));
bytes[0] = (unsigned char) ((6 << 5) | (cp >> 6));
return 2;
}
if (cp <= 0xFFFFul) {
bytes[2] = (unsigned char) ((2 << 6) | (cp & 0x3F));
bytes[1] = (unsigned char) ((2 << 6) | ((cp >> 6) & 0x3F));
bytes[0] = (unsigned char) ((14 << 4) | (cp >> 12));
return 3;
}
if (cp <= 0x10FFFFul) {
bytes[3] = (unsigned char) ((2 << 6) | (cp & 0x3F));
bytes[2] = (unsigned char) ((2 << 6) | ((cp >> 6) & 0x3F));
bytes[1] = (unsigned char) ((2 << 6) | ((cp >> 12) & 0x3F));
bytes[0] = (unsigned char) ((30 << 3) | (cp >> 18));
return 4;
}
return 0;
}
static bool parse_entity(
const char *current, char **to, const char **from) {
const char *end = strchr(current, ';');
if (!end) return 0;
if (current[1] == '#') {
char *tail = NULL;
int errno_save = errno;
bool hex = current[2] == 'x' || current[2] == 'X';
errno = 0;
unsigned long cp = strtoul(
current + (hex ? 3 : 2), &tail, hex ? 16 : 10);
bool fail = errno || tail != end || cp > UNICODE_MAX;
errno = errno_save;
if (fail) return 0;
*to += putc_utf8(cp, *to);
*from = end + 1;
return 1;
} else {
const char *entity = get_named_entity(&current[1]);
if (!entity) return 0;
size_t len = strlen(entity);
memcpy(*to, entity, len);
*to += len;
*from = end + 1;
return 1;
}
}
size_t decode_html_entities_utf8(char *dest, const char *src) {
if (!src) src = dest;
char *to = dest;
const char *from = src;
const char *current;
for (current; (current = strchr(from, '&'));) {
memmove(to, from, (size_t) (current - from));
to += current - from;
if (parse_entity(current, &to, &from))
continue;
from = current;
*to++ = *from++;
}
size_t remaining = strlen(from);
memmove(to, from, remaining);
to += remaining;
*to = 0;
return (size_t) (to - dest);
}
char *unhtml(UDF_INIT *initid, UDF_ARGS *args, char *result, unsigned long *length, char *is_null, char *error);
my_bool unhtml_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
void unhtml_deinit(UDF_INIT *initid);
my_bool unhtml_init(UDF_INIT *initid, UDF_ARGS *args, char *message) {
if (args->arg_count != 1) {
strcpy(message, "unhtml() requires one argument");
return 1;
}
args->arg_type[0] = STRING_RESULT;
initid->maybe_null = 1; //can return null
return 0;
}
char *unhtml(UDF_INIT *initid, UDF_ARGS *args, char *result, unsigned long *length, char *is_null, char *error) {
if (args->args[0] == NULL) {
*is_null = 1;
return 0;
}
char *tbuf, *buf, *p, *tp, *rp, c, lc;
int br, depth = 0, in_q = 0;
uint8_t state = 0;
size_t pos, i = 0;
char *allow_free = NULL;
const char *allow_actual;
char is_xml = 0;
char *rbuf = args->args[0];
buf = malloc(sizeof (char) * (args->lengths[0] + 1));
memcpy(buf, args->args[0], args->lengths[0]);
buf[args->lengths[0]] = '\0';
c = *buf;
lc = '\0';
p = buf;
rp = rbuf;
br = 0;
tbuf = tp = NULL;
while (i < args->lengths[0]) {
switch (c) {
case '\0':
break;
case '<':
if (in_q) {
break;
}
if (isspace(*(p + 1))) {
goto reg_char;
}
if (state == 0) {
lc = '<';
state = 1;
} else if (state == 1) {
depth++;
}
break;
case '(':
if (state == 2) {
if (lc != '"' && lc != '\'') {
lc = '(';
br++;
}
} else if (state == 0) {
*(rp++) = c;
}
break;
case ')':
if (state == 2) {
if (lc != '"' && lc != '\'') {
lc = ')';
br--;
}
} else if (state == 0) {
*(rp++) = c;
}
break;
case '>':
if (depth) {
depth--;
break;
}
if (in_q) {
break;
}
switch (state) {
case 1: /* HTML/XML */
lc = '>';
if (is_xml && *(p - 1) == '-') {
break;
}
in_q = state = is_xml = 0;
break;
case 2: /* PHP */
if (!br && lc != '\"' && *(p - 1) == '?') {
in_q = state = 0;
tp = tbuf;
}
break;
case 3:
in_q = state = 0;
tp = tbuf;
break;
case 4: /* JavaScript/CSS/etc... */
if (p >= buf + 2 && *(p - 1) == '-' && *(p - 2) == '-') {
in_q = state = 0;
tp = tbuf;
}
break;
default:
*(rp++) = c;
break;
}
break;
case '"':
case '\'':
if (state == 4) {
/* Inside <!-- comment --> */
break;
} else if (state == 2 && *(p - 1) != '\\') {
if (lc == c) {
lc = '\0';
} else if (lc != '\\') {
lc = c;
}
} else if (state == 0) {
*(rp++) = c;
}
if (state && p != buf && (state == 1 || *(p - 1) != '\\') && (!in_q || *p == in_q)) {
if (in_q) {
in_q = 0;
} else {
in_q = *p;
}
}
break;
case '!':
/* JavaScript & Other HTML scripting languages */
if (state == 1 && *(p - 1) == '<') {
state = 3;
lc = c;
} else {
if (state == 0) {
*(rp++) = c;
}
}
break;
case '-':
if (state == 3 && p >= buf + 2 && *(p - 1) == '-' && *(p - 2) == '!') {
state = 4;
} else {
goto reg_char;
}
break;
case '?':
if (state == 1 && *(p - 1) == '<') {
br = 0;
state = 2;
break;
}
case 'E':
case 'e':
/* !DOCTYPE exception */
if (state == 3 && p > buf + 6
&& tolower(*(p - 1)) == 'p'
&& tolower(*(p - 2)) == 'y'
&& tolower(*(p - 3)) == 't'
&& tolower(*(p - 4)) == 'c'
&& tolower(*(p - 5)) == 'o'
&& tolower(*(p - 6)) == 'd') {
state = 1;
break;
}
/* fall-through */
case 'l':
case 'L':
/* swm: If we encounter '<?xml' then we shouldn't be in
* state == 2 (PHP). Switch back to HTML.
*/
if (state == 2 && p > buf + 4 && strncasecmp(p - 4, "<?xm", 4) == 0) {
state = 1;
is_xml = 1;
break;
}
/* fall-through */
default:
reg_char :
if (state == 0) {
*(rp++) = c;
}
break;
}
c = *(++p);
i++;
}
if (rp < rbuf + args->lengths[0]) {
*rp = '\0';
}
free(buf);
size_t rbuf_length = decode_html_entities_utf8(rbuf, NULL);
const char *start = rbuf;
const char *end = start + strlen(rbuf);
while (start != end) {
unsigned char c = (unsigned char) *start;
if (c <= ' ' &&
(c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v' || c == '\0')) {
start++;
} else {
break;
}
}
while (start != end) {
unsigned char c = (unsigned char) *(end - 1);
if (c <= ' ' &&
(c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v' || c == '\0')) {
end--;
} else {
break;
}
}
if (rbuf_length == end - start) {
*length = rbuf_length;
return rbuf;
} else if (end - start == 0) {
*length = 0;
return "\0";
} else {
buf = malloc(sizeof (char) * (end - start + 1));
memcpy(buf, start, end - start);
buf[end - start] = '\0';
*length = end - start;
//free(rbuf);
return buf;
}
}
#endif /* HAVE_DLOPEN */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment