Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save salipro4ever/92dad7c5059cb79885ef to your computer and use it in GitHub Desktop.
Save salipro4ever/92dad7c5059cb79885ef to your computer and use it in GitHub Desktop.
PHP Clean String of UTF8 Chars – Convert to similar ASCII char
/**
 * Returns a printable-ASCII approximation of a UTF-8 string.
 * www.unexpectedit.com
 *
 * Processing order:
 *  1. accented Latin letters, typographic quotes, the en dash, the
 *     non-breaking space and the euro sign are replaced by ASCII look-alikes;
 *  2. named HTML entities are turned back into characters ("&euro;" becomes
 *     the word "euro");
 *  3. HTML/PHP tags are stripped;
 *  4. entities that are still present (numeric ones, for instance) are decoded;
 *  5. line breaks become a single space each and tabs are dropped;
 *  6. every byte outside the range \x20-\x7F is removed.
 *
 * @param string $text UTF-8 encoded text; may contain HTML markup and entities.
 * @return string ASCII-only text, or an empty string when $text is not valid UTF-8.
 */
function cleanString($text) {
    $text = (string) $text;

    // 1) Transliterate: á ô => a o, curly quotes => straight quotes.
    $classes = array(
        "/[áàâãªä]/u" => "a",
        "/[ÁÀÂÃÄ]/u"  => "A",
        "/[ÍÌÎÏ]/u"   => "I",
        "/[íìîï]/u"   => "i",
        "/[éèêë]/u"   => "e",
        "/[ÉÈÊË]/u"   => "E",
        "/[óòôõºö]/u" => "o",
        "/[ÓÒÔÕÖ]/u"  => "O",
        "/[úùûü]/u"   => "u",
        "/[ÚÙÛÜ]/u"   => "U",
        "/[’‘‹›‚]/u"  => "'",
        "/[“”«»„]/u"  => '"',
    );
    $text = preg_replace(array_keys($classes), array_values($classes), $text);
    if ($text === null) {
        // preg_replace() reports failure with null; with the /u modifier that
        // typically means the input was not valid UTF-8. Say so explicitly
        // instead of feeding null into the calls below.
        return '';
    }

    $singles = array(
        "–"            => "-",    // en dash
        "\xC2\xA0"     => " ",    // non-breaking space, written as bytes so it cannot be mistaken for a plain space
        "\xE2\x82\xAC" => "euro", // euro sign
        "ç"            => "c",
        "Ç"            => "C",
        "ñ"            => "n",
        "Ñ"            => "N",
    );
    $text = str_replace(array_keys($singles), array_values($singles), $text);

    // 2) Named entities => characters. The extra entries cover the
    //    Windows-1252 range 130-159; each entity is mapped to its single-byte
    //    code, and those bytes are removed again by the final filter.
    $trans = get_html_translation_table(HTML_ENTITIES);
    $cp1252 = array(
        130 => '&sbquo;',  // Single Low-9 Quotation Mark
        131 => '&fnof;',   // Latin Small Letter F With Hook
        132 => '&bdquo;',  // Double Low-9 Quotation Mark
        133 => '&hellip;', // Horizontal Ellipsis
        134 => '&dagger;', // Dagger
        135 => '&Dagger;', // Double Dagger
        136 => '&circ;',   // Modifier Letter Circumflex Accent
        137 => '&permil;', // Per Mille Sign
        138 => '&Scaron;', // Latin Capital Letter S With Caron
        139 => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
        140 => '&OElig;',  // Latin Capital Ligature OE
        145 => '&lsquo;',  // Left Single Quotation Mark
        146 => '&rsquo;',  // Right Single Quotation Mark
        147 => '&ldquo;',  // Left Double Quotation Mark
        148 => '&rdquo;',  // Right Double Quotation Mark
        149 => '&bull;',   // Bullet
        150 => '&ndash;',  // En Dash
        151 => '&mdash;',  // Em Dash
        152 => '&tilde;',  // Small Tilde
        153 => '&trade;',  // Trade Mark Sign
        154 => '&scaron;', // Latin Small Letter S With Caron
        155 => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
        156 => '&oelig;',  // Latin Small Ligature OE
        159 => '&Yuml;',   // Latin Capital Letter Y With Diaeresis
    );
    foreach ($cp1252 as $code => $entity) {
        $trans[chr($code)] = $entity;
    }
    $trans['euro'] = '&euro;'; // euro currency symbol, spelled out as a word

    // Sorting by key puts the ASCII keys (", &, <, > and "euro") ahead of the
    // non-ASCII ones, so "&amp;" is resolved first and "&euro;" becomes the
    // word "euro" before the table's own euro-sign entry can claim it.
    ksort($trans);
    foreach ($trans as $char => $entity) {
        $text = str_replace($entity, $char, $text);
    }

    // 3) remove <p>, <br/> ...
    $text = strip_tags($text);

    // 4) decode whatever entities are left (numeric ones, double-encoded ones)
    $text = html_entity_decode($text);

    // 5) Line breaks => space, tabs => nothing. This has to run before the
    //    ASCII filter (which deletes control characters), and the escapes need
    //    double quotes so they denote real control characters.
    $text = str_replace(array("\r\n", "\n", "\r", "\t"), array(" ", " ", " ", ""), $text);

    // 6) drop everything that is still outside \x20-\x7F (e.g. the trade mark sign)
    $text = preg_replace('/[^\x20-\x7F]+/', '', $text);

    //XML compatible
    /*
    $text = str_replace("&", "and", $text);
    $text = str_replace("<", ".", $text);
    $text = str_replace(">", ".", $text);
    $text = str_replace("\\", "-", $text);
    $text = str_replace("/", "-", $text);
    */
    return $text;
}
Usage:
$val = "Arômes ... óòôõº ... áéíóú ... Barça ... “Windows quotes” ... this is not a normal space ( ) ... this is not a normal dash (–) ... Esdrújula ... Wünderlist ... &#160; ... &amp; ... & ... &rsquo; ... &ndash; ... &pound; ... &euro; ... &nbsp; ... ...";
// Print the ASCII-only rendering of the sample string.
echo cleanString($val);
//result: Aromes ... ooooo ... aeiou ... Barca ... "Windows quotes" ... this is not a normal space ( ) ... this is not a normal dash (-) ... Esdrujula ... Wunderlist ... ... & ... & ... ... ... ... euro ... ... ...
// Note: if you get an empty string back, make sure the string you pass in is UTF-8.
// The call below converts the string to UTF-8 before cleaning it.
// NOTE(review): utf8_encode() treats its argument as ISO-8859-1 and is deprecated
// as of PHP 8.2; it presumably should only be used when $val is not already
// UTF-8 -- confirm the encoding of the input before copying this line.
echo cleanString(utf8_encode($val));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment