Skip to content

Instantly share code, notes, and snippets.

@wam
Created November 8, 2010 18:20
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save wam/668025 to your computer and use it in GitHub Desktop.
Save wam/668025 to your computer and use it in GitHub Desktop.
PHP function that replaces HTML entities with ASCII near-equivalents
/**
 * Replace HTML character references with plain-ASCII near-equivalents.
 *
 * Recognised forms:
 *   - decimal references for printable ASCII, "&#32;" .. "&#126;", plus the
 *     one-leading-zero spelling of the two-digit codes ("&#032;" .. "&#099;");
 *   - decimal and named references for Latin-1 (160..255) and for a set of
 *     common typographic characters (dashes, curly quotes, ellipsis, ...);
 *   - "&amp;", "&lt;", "&gt;", "&quot;" and "&apos;".
 *
 * Hexadecimal references ("&#xA9;") and anything not in the table are left
 * untouched.
 *
 * The input is scanned exactly once, so text produced by one replacement is
 * never decoded again: "&amp;copy;" becomes "&copy;", not "(c)".
 *
 * The result may contain raw "&", "<", ">" and quotes, so it is not safe to
 * embed in HTML/XML without re-escaping. When embedding in a CDATA section,
 * strip any "]]>" sequences from the result first.
 *
 * @param string|array $text Text to convert. An array is converted
 *                           element by element, keys preserved.
 * @return string|array The converted text (array in, array out).
 */
function asciify($text)
{
    // The table is constant, so build it once and reuse it across calls.
    static $map = null;

    // str_replace() accepted an array subject; keep that working.
    if (is_array($text)) {
        return array_map(__FUNCTION__, $text);
    }

    if ($map === null) {
        $map = array();

        // Printable ASCII as decimal references. Two-digit codes are also
        // accepted with a single leading zero, e.g. "&#065;".
        for ($i = 32; $i < 127; $i++) {
            $map["&#$i;"] = chr($i);
            if ($i < 100) {
                $map["&#0$i;"] = chr($i);
            }
        }

        // Named references for characters in the ASCII range.
        $map['&amp;']  = '&';
        $map['&lt;']   = '<';
        $map['&gt;']   = '>';
        $map['&quot;'] = '"';
        $map['&apos;'] = '\'';

        // code point => array(entity name, ASCII stand-in)
        $table = array(
            // Latin-1 punctuation and symbols
            160 => array('nbsp',   ' '),     // non-breaking space
            161 => array('iexcl',  '!'),     // inverted exclamation mark
            162 => array('cent',   'cents'), // cent sign
            163 => array('pound',  'pounds'), // pound sign
            164 => array('curren', '$'),     // currency sign
            165 => array('yen',    'yen'),   // yen sign
            166 => array('brvbar', '|'),     // broken vertical bar
            167 => array('sect',   'Ss'),    // section sign
            168 => array('uml',    '``'),    // spacing diaeresis - umlaut
            169 => array('copy',   '(c)'),   // copyright sign
            170 => array('ordf',   'a'),     // feminine ordinal indicator
            171 => array('laquo',  '<<'),    // left double angle quotes
            172 => array('not',    '~'),     // not sign
            173 => array('shy',    '-'),     // soft hyphen
            174 => array('reg',    '(r)'),   // registered trade mark sign
            175 => array('macr',   '-'),     // spacing macron - overline
            176 => array('deg',    'deg'),   // degree sign
            177 => array('plusmn', '+/-'),   // plus-or-minus sign
            178 => array('sup2',   '^2'),    // superscript two - squared
            179 => array('sup3',   '^3'),    // superscript three - cubed
            180 => array('acute',  '\''),    // acute accent - spacing acute
            181 => array('micro',  'u'),     // micro sign
            182 => array('para',   'par'),   // pilcrow sign - paragraph sign
            183 => array('middot', '.'),     // middle dot - Georgian comma
            184 => array('cedil',  ','),     // spacing cedilla
            185 => array('sup1',   '^1'),    // superscript one
            186 => array('ordm',   '^o'),    // masculine ordinal indicator
            187 => array('raquo',  '>>'),    // right double angle quotes
            188 => array('frac14', '1/4'),   // fraction one quarter
            189 => array('frac12', '1/2'),   // fraction one half
            190 => array('frac34', '3/4'),   // fraction three quarters
            191 => array('iquest', '?'),     // inverted question mark

            // Latin-1 capital letters
            192 => array('Agrave', 'A'),
            193 => array('Aacute', 'A'),
            194 => array('Acirc',  'A'),
            195 => array('Atilde', 'A'),
            196 => array('Auml',   'A'),
            197 => array('Aring',  'A'),
            198 => array('AElig',  'AE'),
            199 => array('Ccedil', 'C'),
            200 => array('Egrave', 'E'),
            201 => array('Eacute', 'E'),
            202 => array('Ecirc',  'E'),
            203 => array('Euml',   'E'),
            204 => array('Igrave', 'I'),
            205 => array('Iacute', 'I'),
            206 => array('Icirc',  'I'),
            207 => array('Iuml',   'I'),
            208 => array('ETH',    'EDH'),
            209 => array('Ntilde', 'N'),
            210 => array('Ograve', 'O'),
            211 => array('Oacute', 'O'),
            212 => array('Ocirc',  'O'),
            213 => array('Otilde', 'O'),
            214 => array('Ouml',   'O'),
            215 => array('times',  'x'),     // multiplication sign
            216 => array('Oslash', 'O'),     // letter O, not the digit zero
            217 => array('Ugrave', 'U'),
            218 => array('Uacute', 'U'),
            219 => array('Ucirc',  'U'),
            220 => array('Uuml',   'U'),
            221 => array('Yacute', 'Y'),
            222 => array('THORN',  'TH'),    // capital thorn
            223 => array('szlig',  'ss'),    // sharp s - ess-zed

            // Latin-1 small letters
            224 => array('agrave', 'a'),
            225 => array('aacute', 'a'),
            226 => array('acirc',  'a'),
            227 => array('atilde', 'a'),
            228 => array('auml',   'a'),
            229 => array('aring',  'a'),
            230 => array('aelig',  'ae'),
            231 => array('ccedil', 'c'),
            232 => array('egrave', 'e'),
            233 => array('eacute', 'e'),
            234 => array('ecirc',  'e'),
            235 => array('euml',   'e'),
            236 => array('igrave', 'i'),
            237 => array('iacute', 'i'),
            238 => array('icirc',  'i'),
            239 => array('iuml',   'i'),
            240 => array('eth',    'edh'),
            241 => array('ntilde', 'n'),
            242 => array('ograve', 'o'),
            243 => array('oacute', 'o'),
            244 => array('ocirc',  'o'),
            245 => array('otilde', 'o'),
            246 => array('ouml',   'o'),
            247 => array('divide', '/'),     // division sign
            248 => array('oslash', 'o'),
            249 => array('ugrave', 'u'),
            250 => array('uacute', 'u'),
            251 => array('ucirc',  'u'),
            252 => array('uuml',   'u'),
            253 => array('yacute', 'y'),
            254 => array('thorn',  'th'),
            255 => array('yuml',   'y'),

            // Latin Extended
            338 => array('OElig',  'OE'),
            339 => array('oelig',  'oe'),
            352 => array('Scaron', 'S'),
            353 => array('scaron', 's'),
            376 => array('Yuml',   'Y'),
            402 => array('fnof',   'f'),     // small f with hook - function

            // Higher punctuation
            8194 => array('ensp',   ' '),    // en space
            8195 => array('emsp',   ' '),    // em space
            8201 => array('thinsp', ' '),    // thin space
            8204 => array('zwnj',   ''),     // zero width non-joiner
            8205 => array('zwj',    ''),     // zero width joiner
            8206 => array('lrm',    ''),     // left-to-right mark
            8207 => array('rlm',    ''),     // right-to-left mark
            8211 => array('ndash',  '-'),    // en dash
            8212 => array('mdash',  '--'),   // em dash
            8216 => array('lsquo',  '\''),   // left single quotation mark
            8217 => array('rsquo',  '\''),   // right single quotation mark
            8218 => array('sbquo',  ','),    // single low-9 quotation mark
            8220 => array('ldquo',  '"'),    // left double quotation mark
            8221 => array('rdquo',  '"'),    // right double quotation mark
            8222 => array('bdquo',  ',,'),   // double low-9 quotation mark
            8224 => array('dagger', '*'),    // dagger
            8225 => array('Dagger', '**'),   // double dagger
            8226 => array('bull',   '*'),    // bullet
            8230 => array('hellip', '...'),  // horizontal ellipsis
            8240 => array('permil', '0/00'), // per mille sign
            8249 => array('lsaquo', '<'),    // single left angle quote
            8250 => array('rsaquo', '>'),    // single right angle quote
            8364 => array('euro',   'euro'), // euro sign
            8482 => array('trade',  '(TM)'), // trade mark sign
        );

        // Register both spellings of every table row.
        foreach ($table as $code => $entry) {
            list($name, $plain) = $entry;
            $map["&#$code;"] = $plain;
            $map["&$name;"]  = $plain;
        }
    }

    // strtr() with an array walks the input once and never rescans what it
    // has already substituted, which is what prevents double decoding.
    return strtr((string) $text, $map);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment