Skip to content

Instantly share code, notes, and snippets.

@xus
Forked from zeroasterisk/cleanup_uft8_class.php
Created May 23, 2017 12:13
Show Gist options
  • Save xus/c796612553223ab9fa66a5420546637f to your computer and use it in GitHub Desktop.
Save xus/c796612553223ab9fa66a5420546637f to your computer and use it in GitHub Desktop.
A PHP class to clean up strings so they are UTF-8 safe
<?php
/**
 * Standardized data cleanup helper class.
 *
 * All methods are static. Despite the "UTF8" naming, makeUTF8() does not
 * produce general UTF-8 text: recognised characters are rewritten as HTML
 * entities and, when valid multi-byte sequences remain afterwards, the string
 * is transliterated to ISO-8859-1 and reduced to plain ASCII.
 */
class Cleanup {
    /**
     * Clean "funky" input characters out of a string (or every value of an array).
     *
     * Steps for string input:
     *  1. Known UTF-8 byte sequences and legacy single bytes are translated
     *     using the table from translationTable().
     *  2. Windows-style quote/ellipsis bytes become ASCII, and control bytes
     *     (except tab, LF and CR) plus the 0x7F-0x9F range are removed.
     *  3. If the result still contains a valid multi-byte UTF-8 sequence, it is
     *     transliterated to ISO-8859-1 and every byte outside tab, LF, CR and
     *     0x20-0x7F is dropped.
     *
     * NOTE(review): step 2 works on raw bytes, so it can also remove
     * continuation bytes of multi-byte sequences that step 1 did not translate.
     *
     * @param mixed $str String, array (cleaned recursively, keys preserved) or any other value
     * @return mixed Cleaned string/array; non-string, non-array values are returned unchanged
     */
    public static function makeUTF8($str) {
        if (is_array($str)) {
            $r = array();
            foreach ($str as $k => $v) {
                $r[$k] = self::makeUTF8($v);
            }
            return $r;
        }
        if (!is_string($str)) {
            return $str;
        }

        $str = strtr($str, self::translationTable());

        $str = preg_replace(array(
            '/[\x60\x82\x91\x92\xb4\xb8]/', // single quotes (incl. backtick)
            '/[\x84\x93\x94]/', // double quotes
            '/[\x85]/', // ellipsis ...
            // Control bytes and 0x7F-0x9F. Tab (0x09), LF (0x0A) and CR (0x0D)
            // are intentionally kept; the previous range \x00-\x0d removed them.
            '/[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]/',
        ), array(
            '\'',
            '"',
            '...',
            '',
        ), $str);

        // preg_replace() returns null on a PCRE error, hence the is_string() check.
        if (is_string($str) && self::detectUTF8($str)) {
            $str = str_replace("\xE2\x82\xAC", '&euro;', $str);
            // iconv() raises a notice and returns false on illegal input. Keep
            // the unconverted string in that case instead of letting the value
            // collapse to an empty string in the next step.
            $converted = @iconv('UTF-8', 'ISO-8859-1//TRANSLIT', $str);
            if ($converted !== false) {
                $str = $converted;
            }
            // Keep only tab, LF, CR and 0x20-0x7F.
            $str = preg_replace('/[^\x09\x0A\x0D\x20-\x7F]/', '', $str);
        }
        return $str;
    }

    /**
     * Byte-sequence translation table used by makeUTF8().
     *
     * Built once and cached, because makeUTF8() recurses over arrays and would
     * otherwise rebuild the table for every value. strtr() always prefers the
     * longest matching key, so the multi-byte keys win over the single-byte
     * "legacy" keys that share a byte with them.
     *
     * @return array Map of raw byte sequence => replacement text
     */
    private static function translationTable() {
        static $table = null;
        if ($table !== null) {
            return $table;
        }
        $table = array(
            // HTML entities
            chr(195).chr(129) => '&Aacute;',
            chr(195).chr(161) => '&aacute;',
            chr(195).chr(130) => '&Acirc;',
            chr(195).chr(162) => '&acirc;',
            chr(194).chr(180) => '&acute;',
            chr(195).chr(134) => '&AElig;',
            chr(195).chr(166) => '&aelig;',
            chr(195).chr(128) => '&Agrave;',
            chr(195).chr(160) => '&agrave;',
            chr(226).chr(132).chr(181) => '&alefsym;',
            chr(206).chr(145) => '&Alpha;',
            chr(206).chr(177) => '&alpha;',
            chr(226).chr(136).chr(167) => '&and;',
            chr(226).chr(136).chr(160) => '&ang;',
            chr(195).chr(133) => '&Aring;',
            chr(195).chr(165) => '&aring;',
            chr(226).chr(137).chr(136) => '&asymp;',
            chr(195).chr(131) => '&Atilde;',
            chr(195).chr(163) => '&atilde;',
            chr(195).chr(132) => '&Auml;',
            chr(195).chr(164) => '&auml;',
            chr(226).chr(128).chr(158) => '&bdquo;',
            chr(206).chr(146) => '&Beta;',
            chr(206).chr(178) => '&beta;',
            chr(194).chr(166) => '&brvbar;',
            chr(226).chr(128).chr(162) => '&bull;',
            chr(226).chr(136).chr(169) => '&cap;',
            chr(195).chr(135) => '&Ccedil;',
            chr(195).chr(167) => '&ccedil;',
            chr(194).chr(184) => '&cedil;',
            chr(194).chr(162) => '&cent;',
            chr(206).chr(167) => '&Chi;',
            chr(207).chr(135) => '&chi;',
            chr(203).chr(134) => '&circ;',
            chr(226).chr(153).chr(163) => '&clubs;',
            chr(226).chr(137).chr(133) => '&cong;',
            chr(194).chr(169) => '&copy;',
            chr(226).chr(134).chr(181) => '&crarr;',
            chr(226).chr(136).chr(170) => '&cup;',
            chr(194).chr(164) => '&curren;',
            chr(226).chr(128).chr(160) => '&dagger;',
            chr(226).chr(128).chr(161) => '&Dagger;',
            chr(226).chr(134).chr(147) => '&darr;',
            chr(226).chr(135).chr(147) => '&dArr;',
            chr(194).chr(176) => '&deg;',
            chr(206).chr(148) => '&Delta;',
            chr(206).chr(180) => '&delta;',
            chr(226).chr(153).chr(166) => '&diams;',
            chr(195).chr(183) => '&divide;',
            chr(195).chr(137) => '&Eacute;',
            chr(195).chr(169) => '&eacute;',
            chr(195).chr(138) => '&Ecirc;',
            chr(195).chr(170) => '&ecirc;',
            chr(195).chr(136) => '&Egrave;',
            chr(195).chr(168) => '&egrave;',
            chr(226).chr(136).chr(133) => '&empty;',
            chr(226).chr(128).chr(131) => '&emsp;',
            chr(226).chr(128).chr(130) => '&ensp;',
            chr(206).chr(149) => '&Epsilon;',
            chr(206).chr(181) => '&epsilon;',
            chr(226).chr(137).chr(161) => '&equiv;',
            chr(206).chr(151) => '&Eta;',
            chr(206).chr(183) => '&eta;',
            chr(195).chr(144) => '&ETH;',
            chr(195).chr(176) => '&eth;',
            chr(195).chr(139) => '&Euml;',
            chr(195).chr(171) => '&euml;',
            chr(226).chr(130).chr(172) => '&euro;',
            chr(226).chr(136).chr(131) => '&exist;',
            chr(198).chr(146) => '&fnof;',
            chr(226).chr(136).chr(128) => '&forall;',
            chr(194).chr(189) => '&frac12;',
            chr(194).chr(188) => '&frac14;',
            chr(194).chr(190) => '&frac34;',
            chr(226).chr(129).chr(132) => '&frasl;',
            chr(206).chr(147) => '&Gamma;',
            chr(206).chr(179) => '&gamma;',
            chr(226).chr(137).chr(165) => '&ge;',
            chr(226).chr(134).chr(148) => '&harr;',
            chr(226).chr(135).chr(148) => '&hArr;',
            chr(226).chr(153).chr(165) => '&hearts;',
            chr(226).chr(128).chr(166) => '&hellip;',
            chr(195).chr(141) => '&Iacute;',
            chr(195).chr(173) => '&iacute;',
            chr(195).chr(142) => '&Icirc;',
            chr(195).chr(174) => '&icirc;',
            chr(194).chr(161) => '&iexcl;',
            chr(195).chr(140) => '&Igrave;',
            chr(195).chr(172) => '&igrave;',
            chr(226).chr(132).chr(145) => '&image;',
            chr(226).chr(136).chr(158) => '&infin;',
            chr(226).chr(136).chr(171) => '&int;',
            chr(206).chr(153) => '&Iota;',
            chr(206).chr(185) => '&iota;',
            chr(194).chr(191) => '&iquest;',
            chr(226).chr(136).chr(136) => '&isin;',
            chr(195).chr(143) => '&Iuml;',
            chr(195).chr(175) => '&iuml;',
            chr(206).chr(154) => '&Kappa;',
            chr(206).chr(186) => '&kappa;',
            chr(206).chr(155) => '&Lambda;',
            chr(206).chr(187) => '&lambda;',
            chr(226).chr(140).chr(169) => '&lang;',
            chr(194).chr(171) => '&laquo;',
            chr(226).chr(134).chr(144) => '&larr;',
            chr(226).chr(135).chr(144) => '&lArr;',
            chr(226).chr(140).chr(136) => '&lceil;',
            chr(226).chr(128).chr(156) => '&ldquo;',
            chr(226).chr(137).chr(164) => '&le;',
            chr(226).chr(140).chr(138) => '&lfloor;',
            chr(226).chr(136).chr(151) => '&lowast;',
            chr(226).chr(151).chr(138) => '&loz;',
            chr(226).chr(128).chr(142) => '&lrm;',
            chr(226).chr(128).chr(185) => '&lsaquo;',
            chr(226).chr(128).chr(152) => '&lsquo;',
            chr(194).chr(175) => '&macr;',
            chr(226).chr(128).chr(148) => '&mdash;',
            chr(194).chr(181) => '&micro;',
            chr(194).chr(183) => '&middot;',
            chr(226).chr(136).chr(146) => '&minus;',
            chr(206).chr(156) => '&Mu;',
            chr(206).chr(188) => '&mu;',
            chr(226).chr(136).chr(135) => '&nabla;',
            chr(194).chr(160) => '&nbsp;',
            chr(226).chr(128).chr(147) => '&ndash;',
            chr(226).chr(137).chr(160) => '&ne;',
            chr(226).chr(136).chr(139) => '&ni;',
            chr(194).chr(172) => '&not;',
            chr(226).chr(136).chr(137) => '&notin;',
            chr(226).chr(138).chr(132) => '&nsub;',
            chr(195).chr(145) => '&Ntilde;',
            chr(195).chr(177) => '&ntilde;',
            chr(206).chr(157) => '&Nu;',
            chr(206).chr(189) => '&nu;',
            chr(195).chr(147) => '&Oacute;',
            chr(195).chr(179) => '&oacute;',
            chr(195).chr(148) => '&Ocirc;',
            chr(195).chr(180) => '&ocirc;',
            chr(197).chr(146) => '&OElig;',
            chr(197).chr(147) => '&oelig;',
            chr(195).chr(146) => '&Ograve;',
            chr(195).chr(178) => '&ograve;',
            chr(226).chr(128).chr(190) => '&oline;',
            chr(206).chr(169) => '&Omega;',
            chr(207).chr(137) => '&omega;',
            chr(206).chr(159) => '&Omicron;',
            chr(206).chr(191) => '&omicron;',
            chr(226).chr(138).chr(149) => '&oplus;',
            chr(226).chr(136).chr(168) => '&or;',
            chr(194).chr(170) => '&ordf;',
            chr(194).chr(186) => '&ordm;',
            chr(195).chr(152) => '&Oslash;',
            chr(195).chr(184) => '&oslash;',
            chr(195).chr(149) => '&Otilde;',
            chr(195).chr(181) => '&otilde;',
            chr(226).chr(138).chr(151) => '&otimes;',
            chr(195).chr(150) => '&Ouml;',
            chr(195).chr(182) => '&ouml;',
            chr(194).chr(182) => '&para;',
            chr(226).chr(136).chr(130) => '&part;',
            chr(226).chr(128).chr(176) => '&permil;',
            chr(226).chr(138).chr(165) => '&perp;',
            chr(206).chr(166) => '&Phi;',
            chr(207).chr(134) => '&phi;',
            chr(206).chr(160) => '&Pi;',
            chr(207).chr(128) => '&pi;',
            chr(207).chr(150) => '&piv;',
            chr(194).chr(177) => '&plusmn;',
            chr(194).chr(163) => '&pound;',
            chr(226).chr(128).chr(178) => '&prime;',
            chr(226).chr(128).chr(179) => '&Prime;',
            chr(226).chr(136).chr(143) => '&prod;',
            chr(226).chr(136).chr(157) => '&prop;',
            chr(206).chr(168) => '&Psi;',
            chr(207).chr(136) => '&psi;',
            chr(226).chr(136).chr(154) => '&radic;',
            chr(226).chr(140).chr(170) => '&rang;',
            chr(194).chr(187) => '&raquo;',
            chr(226).chr(134).chr(146) => '&rarr;',
            chr(226).chr(135).chr(146) => '&rArr;',
            chr(226).chr(140).chr(137) => '&rceil;',
            chr(226).chr(128).chr(157) => '&rdquo;',
            chr(226).chr(132).chr(156) => '&real;',
            chr(194).chr(174) => '&reg;',
            chr(226).chr(140).chr(139) => '&rfloor;',
            chr(206).chr(161) => '&Rho;',
            chr(207).chr(129) => '&rho;',
            chr(226).chr(128).chr(143) => '&rlm;',
            chr(226).chr(128).chr(186) => '&rsaquo;',
            chr(226).chr(128).chr(153) => '&rsquo;',
            chr(226).chr(128).chr(154) => '&sbquo;',
            chr(197).chr(160) => '&Scaron;',
            chr(197).chr(161) => '&scaron;',
            chr(226).chr(139).chr(133) => '&sdot;',
            chr(194).chr(167) => '&sect;',
            chr(194).chr(173) => '&shy;',
            chr(206).chr(163) => '&Sigma;',
            chr(207).chr(131) => '&sigma;',
            chr(207).chr(130) => '&sigmaf;',
            chr(226).chr(136).chr(188) => '&sim;',
            chr(226).chr(153).chr(160) => '&spades;',
            chr(226).chr(138).chr(130) => '&sub;',
            chr(226).chr(138).chr(134) => '&sube;',
            chr(226).chr(136).chr(145) => '&sum;',
            chr(194).chr(185) => '&sup1;',
            chr(194).chr(178) => '&sup2;',
            chr(194).chr(179) => '&sup3;',
            chr(226).chr(138).chr(131) => '&sup;',
            chr(226).chr(138).chr(135) => '&supe;',
            chr(195).chr(159) => '&szlig;',
            chr(206).chr(164) => '&Tau;',
            chr(207).chr(132) => '&tau;',
            chr(226).chr(136).chr(180) => '&there4;',
            chr(206).chr(152) => '&Theta;',
            chr(206).chr(184) => '&theta;',
            chr(207).chr(145) => '&thetasym;',
            chr(226).chr(128).chr(137) => '&thinsp;',
            chr(195).chr(158) => '&THORN;',
            chr(195).chr(190) => '&thorn;',
            chr(203).chr(156) => '&tilde;',
            chr(195).chr(151) => '&times;',
            chr(226).chr(132).chr(162) => '&trade;',
            chr(195).chr(154) => '&Uacute;',
            chr(195).chr(186) => '&uacute;',
            chr(226).chr(134).chr(145) => '&uarr;',
            chr(226).chr(135).chr(145) => '&uArr;',
            chr(195).chr(155) => '&Ucirc;',
            chr(195).chr(187) => '&ucirc;',
            chr(195).chr(153) => '&Ugrave;',
            chr(195).chr(185) => '&ugrave;',
            chr(194).chr(168) => '&uml;',
            chr(207).chr(146) => '&upsih;',
            chr(206).chr(165) => '&Upsilon;',
            chr(207).chr(133) => '&upsilon;',
            chr(195).chr(156) => '&Uuml;',
            chr(195).chr(188) => '&uuml;',
            chr(226).chr(132).chr(152) => '&weierp;',
            chr(206).chr(158) => '&Xi;',
            chr(206).chr(190) => '&xi;',
            chr(195).chr(157) => '&Yacute;',
            chr(195).chr(189) => '&yacute;',
            chr(194).chr(165) => '&yen;',
            chr(195).chr(191) => '&yuml;',
            chr(197).chr(184) => '&Yuml;',
            chr(206).chr(150) => '&Zeta;',
            chr(206).chr(182) => '&zeta;',
            chr(226).chr(128).chr(141) => '&zwj;',
            chr(226).chr(128).chr(140) => '&zwnj;',
            // standard translations (legacy)
            chr(225) => 'á', chr(228) => 'ä', chr(232) => 'č', chr(239) => 'ď',
            chr(233) => '&eacute;',
            chr(236) => '&ecirc;',
            // NOTE(review): chr(229) is listed twice below; the later entry wins,
            // so 'ĺ' is never produced. One of the two keys is presumably meant
            // to be a different byte - confirm against the intended code page.
            chr(237) => 'í', chr(229) => 'ĺ', chr(229) => 'ľ',
            chr(242) => 'ň', chr(244) => 'ô', chr(243) => 'ó', chr(154) => 'š', chr(248) => 'ř',
            chr(250) => 'ú', chr(249) => 'ů', chr(157) => 'ť', chr(253) => 'ý', chr(158) => 'ž',
            chr(193) => 'Á', chr(196) => 'Ä', chr(200) => 'Č', chr(207) => 'Ď', chr(201) => 'É',
            chr(204) => 'Ě', chr(205) => 'Í', chr(197) => 'Ĺ', chr(188) => 'Ľ', chr(210) => 'Ň',
            chr(212) => 'Ô', chr(211) => 'Ó', chr(138) => 'Š', chr(216) => 'Ř', chr(218) => 'Ú',
            chr(217) => 'Ů', chr(141) => 'Ť', chr(221) => 'Ý', chr(142) => 'Ž',
            // phonetic alphabet
            chr(240) => '&eth;',
            chr(230) => '&aelig;',
            // other funky translations
            chr(160) => ' ',
            chr(150) => '-',
        );
        return $table;
    }

    /**
     * Clean a value with makeUTF8(), then flatten common accented-letter
     * entities to their plain ASCII base letter (e.g. "&eacute;" becomes "e").
     *
     * Arrays are processed recursively with keys preserved.
     *
     * @link http://www.w3schools.com/tags/ref_entities.asp
     * @param mixed $str String or array of strings
     * @return mixed Flattened string, or array of flattened values
     */
    public static function makeUTF8plain($str) {
        if (is_array($str)) {
            // strtr() cannot take an array subject, so recurse per element.
            $plain = array();
            foreach ($str as $k => $v) {
                $plain[$k] = self::makeUTF8plain($v);
            }
            return $plain;
        }
        $str = self::makeUTF8($str);
        return strtr($str, array(
            '&Agrave;' => 'A',
            '&Aacute;' => 'A',
            '&Acirc;' => 'A',
            '&Atilde;' => 'A',
            '&Auml;' => 'A',
            '&Aring;' => 'A',
            '&AElig;' => 'A',
            '&agrave;' => 'a',
            '&aacute;' => 'a',
            '&acirc;' => 'a',
            '&atilde;' => 'a',
            '&auml;' => 'a',
            '&aring;' => 'a',
            '&aelig;' => 'a',
            '&Eacute;' => 'E',
            '&Egrave;' => 'E',
            '&Ecirc;' => 'E',
            '&Euml;' => 'E',
            '&eacute;' => 'e',
            '&egrave;' => 'e',
            '&ecirc;' => 'e',
            '&euml;' => 'e',
            // Uppercase I entries: previously the lowercase keys were listed twice.
            '&Iacute;' => 'I',
            '&Igrave;' => 'I',
            '&Icirc;' => 'I',
            '&Iuml;' => 'I',
            '&iacute;' => 'i',
            '&igrave;' => 'i',
            '&icirc;' => 'i',
            '&iuml;' => 'i',
            '&Ograve;' => 'O',
            '&Oacute;' => 'O',
            '&Ocirc;' => 'O',
            '&Otilde;' => 'O',
            '&Ouml;' => 'O',
            '&Oslash;' => 'O',
            '&ograve;' => 'o',
            '&oacute;' => 'o',
            '&ocirc;' => 'o',
            '&otilde;' => 'o',
            '&ouml;' => 'o',
            '&oslash;' => 'o',
            '&Ugrave;' => 'U',
            '&Uacute;' => 'U',
            '&Ucirc;' => 'U',
            '&Utilde;' => 'U',
            '&Uuml;' => 'U',
            '&ugrave;' => 'u',
            '&uacute;' => 'u',
            '&ucirc;' => 'u',
            '&utilde;' => 'u',
            '&uuml;' => 'u',
            '&Ntilde;' => 'N',
            '&ntilde;' => 'n',
        ));
    }

    /**
     * Report whether a string contains at least one well-formed multi-byte
     * UTF-8 sequence.
     *
     * The pattern is not anchored, so this is a "contains UTF-8" probe rather
     * than a validity check: pure ASCII yields false, and a string mixing
     * valid sequences with invalid bytes yields true. Use isUTF8() to validate.
     *
     * @param string $str
     * @return bool
     */
    public static function detectUTF8($str) {
        // preg_match() returns 1, 0 or false; normalise to a strict bool.
        return preg_match('%(?:'.
            '[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
            '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
            '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
            '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
            '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
            '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
            '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
            ')+%xs', $str) === 1;
    }

    /**
     * Check whether a string is valid UTF-8 in its entirety.
     *
     * @param string $str
     * @return bool True for valid UTF-8 (including pure ASCII and the empty
     *              string); false for invalid byte sequences or non-string input
     */
    public static function isUTF8($str) {
        // Direct validation instead of a UTF-8 -> UTF-32 -> UTF-8 round trip.
        return is_string($str) && mb_check_encoding($str, 'UTF-8');
    }

    /**
     * Debug helper: walk a string byte by byte and report every byte that is
     * not in a small "ordinary" set (ASCII letters, digits, whitespace and
     * common punctuation), so it can be identified for translation/conversion.
     *
     * Runs of ordinary bytes are collected into text nodes; each other byte
     * becomes a node of the form "[<byte>] => <ordinal>". CR and LF are skipped.
     * Nodes equal to '', ' ' or "\n" are removed afterwards without
     * re-indexing, so the returned keys may have gaps.
     *
     * @param string $str
     * @return array List of text runs and "[byte] => ordinal" markers
     */
    public static function dumpChr($str) {
        $str = (string) $str;
        if ($str === '') {
            // str_split('') yields array('') before PHP 8.2, which would
            // otherwise produce a bogus "[] => 0" node.
            return array();
        }
        $term = '';
        $nodes = array();
        foreach (str_split($str) as $s) {
            if ($s === "\r" || $s === "\n") {
                // ignore line breaks
                continue;
            }
            if ($s === '\\' || $s === '/' || preg_match('/[a-zA-Z0-9\s#&;<>.,:\'"\-_?!]/', $s)) {
                $term .= $s;
                continue;
            }
            // Unrecognised byte: flush the current run, then record the byte.
            $nodes[] = $term;
            $nodes[] = "[$s] => ".ord($s);
            $term = '';
        }
        $nodes[] = $term;
        return array_diff($nodes, array('', ' ', "\n"));
    }
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment