Skip to content

Instantly share code, notes, and snippets.

@ImpactSeo
Created June 22, 2016 12:12
Show Gist options
  • Save ImpactSeo/78561612047efa7591264b8276357a4b to your computer and use it in GitHub Desktop.
Save ImpactSeo/78561612047efa7591264b8276357a4b to your computer and use it in GitHub Desktop.
Clean UTF-8
<?php
/**
 * Main entry point: clean a text before sending it through an API.
 *
 * The encoding is guessed with mb_detect_encoding() using the candidate list
 * "UTF-8, ISO-8859-1". Text that is not detected as UTF-8 is first passed
 * through convert_to_utf8() and detected again. Whenever a detection reports
 * UTF-8, the text goes through fixcurly() and then fix_windows_encoding();
 * otherwise the converted text is returned without those fixes.
 *
 * @param string $text Raw input text.
 * @return string The cleaned text.
 */
function clean_utf8($text)
{
    $candidates = 'UTF-8, ISO-8859-1';

    if (mb_detect_encoding($text, $candidates) !== 'UTF-8') {
        // Conversion relies on an external library (see convert_to_utf8()).
        $text = convert_to_utf8($text);

        if (mb_detect_encoding($text, $candidates) !== 'UTF-8') {
            // Still not detected as UTF-8: return the converted text as is.
            return $text;
        }
    }

    return fix_windows_encoding(fixcurly($text));
}
/**
 * Convert a text to UTF-8 with the ForceUTF8 library, on a best-effort basis.
 *
 * The library must be installed and loadable:
 * https://github.com/neitanod/forceutf8
 *
 * Any failure is written to the error log and the input is returned
 * unchanged, so callers never have to deal with an exception from here.
 *
 * @param string $text Text in an unknown or mixed encoding.
 * @return string The converted text, or the original text when conversion failed.
 */
function convert_to_utf8($text)
{
    try {
        return \ForceUTF8\Encoding::toUTF8($text);
    } catch (\Throwable $e) {
        // PHP 7+: \Throwable also covers \Error, which is what is thrown when
        // the ForceUTF8 class is missing. Catching only \Exception let that
        // escape and bypass the log-and-return-input path.
        error_log('ERROR CONVERING UTF 8 : ' . $e);
    } catch (\Exception $e) {
        // PHP 5 has no \Throwable, so keep the original catch for it.
        error_log('ERROR CONVERING UTF 8 : ' . $e);
    }

    return $text;
}
/**
 * Replace UTF-8 typographic punctuation with plain ASCII equivalents.
 *
 * Curly double and single quotes become straight quotes, the en dash becomes
 * "-", the em dash becomes "--" and the horizontal ellipsis becomes "...".
 *
 * @param string $string Text to clean.
 * @return string Text with the punctuation above replaced.
 */
function fixcurly($string)
{
    // UTF-8 byte sequence => ASCII replacement.
    $asciiFor = [
        "\xe2\x80\x9c" => '"',   // left double quotation mark
        "\xe2\x80\x9d" => '"',   // right double quotation mark
        "\xe2\x80\x98" => "'",   // left single quotation mark
        "\xe2\x80\x99" => "'",   // right single quotation mark
        "\xe2\x80\x93" => '-',   // en dash
        "\xe2\x80\x94" => '--',  // em dash
        "\xe2\x80\xa6" => '...', // horizontal ellipsis
    ];

    return str_replace(array_keys($asciiFor), array_values($asciiFor), $string);
}
/**
 * Repair common "mojibake" sequences: UTF-8 text whose bytes were read as
 * Windows-1252 and then encoded as UTF-8 again (e.g. "Ã©" instead of "é").
 *
 * References:
 * http://www.i18nqa.com/debug/utf8-debug.html
 * http://stackoverflow.com/questions/9210473/how-to-convert-text-with-html-entites-and-invalid-characters-to-its-utf-8-equi
 * http://stackoverflow.com/questions/3565713/how-can-i-convert-html-character-references-x5e3-to-regular-utf-8/3566055#3566055
 *
 * The whole table is applied in a single pass with strtr(): at each position
 * the longest matching sequence wins, and text that has just been produced by
 * a replacement is never scanned again. Chained str_replace() calls re-scanned
 * their own output, so e.g. the repaired "Ã" was then rewritten to "à" by the
 * bare "Ã" rule and the repaired "Â" was deleted by the bare "Â" rule.
 *
 * Only one layer of mis-encoding is undone per call.
 *
 * @param string|array $text Text to repair (an array is processed entry by entry).
 * @return string|array The repaired text.
 */
function fix_windows_encoding($text)
{
    // Broken byte sequence => intended character.
    static $map = [
        // 8 bytes
        "\xc3\xa2\xe2\x82\xac\xe2\x80\x9c" => "–",
        "\xc3\xa2\xe2\x82\xac\xe2\x80\x9d" => "—",
        "\xc3\xa2\xe2\x82\xac\xe2\x84\xa2" => "’",

        // 7 bytes
        "\xc3\xa2\xe2\x80\x9a\xc2\xac" => "€",
        "\xc3\xa2\xe2\x82\xac\xc2\xa1" => "‡",
        "\xc3\xa2\xe2\x80\x9e\xc2\xa2" => "™",
        "\xc3\xa2\xe2\x82\xac\xc2\xa2" => "•",
        "\xc3\xa2\xe2\x82\xac\xc2\xb0" => "‰",
        // Ambiguous in the reference table: the same sequence is also listed for "Š".
        "\xc3\xa2\xe2\x82\xac\xc2\xb9" => "‹",
        "\xc3\xa2\xe2\x82\xac\xc2\xba" => "›",
        "\xc3\xa2\xe2\x82\xac\xcb\x9c" => "‘",
        "\xc3\xa2\xe2\x82\xac\xc5\x93" => "“",

        // 6 bytes (not in the official list)
        "\xc3\xa2\xc2\x82\xc2\xac" => "€",
        "\xc3\xa2\xc2\x80\xc2\xba" => "›",

        // 5 bytes, prefix \xc3\xa2\xe2\x82\xac
        // Ambiguous in the reference table: the same sequence is also listed for "†".
        "\xc3\xa2\xe2\x82\xac" => "”",

        // 5 bytes, prefix \xc3\x83\xe2
        "\xc3\x83\xe2\x80\x9a" => "Â",
        "\xc3\x83\xe2\x80\x9e" => "Ä",
        "\xc3\x83\xe2\x80\xa0" => "Æ",
        "\xc3\x83\xe2\x80\xa1" => "Ç",
        "\xc3\x83\xe2\x80\xa2" => "Õ",
        "\xc3\x83\xe2\x80\xa6" => "Å",
        "\xc3\x83\xe2\x80\x93" => "Ö",
        "\xc3\x83\xe2\x80\x94" => "×",
        "\xc3\x83\xe2\x80\x98" => "Ñ",
        "\xc3\x83\xe2\x80\x99" => "Ò",
        "\xc3\x83\xe2\x80\x9c" => "Ó",
        "\xc3\x83\xe2\x80\x9d" => "Ô",
        "\xc3\x83\xe2\x80\xb0" => "É",
        "\xc3\x83\xe2\x80\xb9" => "Ë",
        "\xc3\x83\xe2\x80\xba" => "Û",
        "\xc3\x83\xe2\x82\xac" => "À",
        "\xc3\x83\xe2\x84\xa2" => "Ù",

        // 5 bytes, prefix \xc3\x85\xe2\x80
        "\xc3\x85\xe2\x80\x99" => "Œ",
        "\xc3\x85\xe2\x80\x9c" => "œ",

        // 5 bytes, prefix \xc3\x8b
        "\xc3\x8b\xe2\x80\xa0" => "ˆ",

        // 4 bytes, prefix \xc3\x81\xc2
        "\xc3\x81\xc2\xa9" => "é",

        // 4 bytes, prefix \xc3\x82\xc2
        "\xc3\x82\xc2\xa1" => "¡",
        "\xc3\x82\xc2\xa2" => "¢",
        "\xc3\x82\xc2\xa3" => "£",
        "\xc3\x82\xc2\xa4" => "¤",
        "\xc3\x82\xc2\xa5" => "¥",
        "\xc3\x82\xc2\xa6" => "¦",
        "\xc3\x82\xc2\xa7" => "§",
        "\xc3\x82\xc2\xa8" => "¨",
        "\xc3\x82\xc2\xa9" => "©",
        "\xc3\x82\xc2\xaa" => "ª",
        "\xc3\x82\xc2\xab" => "«",
        "\xc3\x82\xc2\xac" => "¬",
        // Mis-encoded soft hyphen: removed entirely.
        "\xc3\x82\xc2\xad" => "",
        "\xc3\x82\xc2\xae" => "®",
        "\xc3\x82\xc2\xaf" => "¯",
        "\xc3\x82\xc2\xb0" => "°",
        "\xc3\x82\xc2\xb1" => "±",
        "\xc3\x82\xc2\xb2" => "²",
        "\xc3\x82\xc2\xb3" => "³",
        "\xc3\x82\xc2\xb4" => "´",
        "\xc3\x82\xc2\xb5" => "µ",
        "\xc3\x82\xc2\xb6" => "¶",
        "\xc3\x82\xc2\xb7" => "·",
        "\xc3\x82\xc2\xb8" => "¸",
        "\xc3\x82\xc2\xb9" => "¹",
        "\xc3\x82\xc2\xba" => "º",
        "\xc3\x82\xc2\xbb" => "»",
        "\xc3\x82\xc2\xbc" => "¼",
        "\xc3\x82\xc2\xbd" => "½",
        "\xc3\x82\xc2\xbe" => "¾",
        "\xc3\x82\xc2\xbf" => "¿",

        // 4 bytes, prefix \xc3\x83\xc2
        "\xc3\x83\xc2\xa1" => "á",
        "\xc3\x83\xc2\xa2" => "â",
        "\xc3\x83\xc2\xa3" => "ã",
        "\xc3\x83\xc2\xa4" => "ä",
        "\xc3\x83\xc2\xa5" => "å",
        "\xc3\x83\xc2\xa6" => "æ",
        "\xc3\x83\xc2\xa7" => "ç",
        "\xc3\x83\xc2\xa8" => "è",
        "\xc3\x83\xc2\xa9" => "é",
        "\xc3\x83\xc2\xaa" => "ê",
        "\xc3\x83\xc2\xab" => "ë",
        "\xc3\x83\xc2\xac" => "ì",
        "\xc3\x83\xc2\xae" => "î",
        "\xc3\x83\xc2\xaf" => "ï",
        "\xc3\x83\xc2\xb0" => "ð",
        "\xc3\x83\xc2\xb1" => "ñ",
        "\xc3\x83\xc2\xb2" => "ò",
        "\xc3\x83\xc2\xb3" => "ó",
        "\xc3\x83\xc2\xb4" => "ô",
        "\xc3\x83\xc2\xb5" => "õ",
        "\xc3\x83\xc2\xb6" => "ö",
        "\xc3\x83\xc2\xb7" => "÷",
        "\xc3\x83\xc2\xb8" => "ø",
        "\xc3\x83\xc2\xb9" => "ù",
        "\xc3\x83\xc2\xba" => "ú",
        "\xc3\x83\xc2\xbb" => "û",
        "\xc3\x83\xc2\xbc" => "ü",
        "\xc3\x83\xc2\xbd" => "ý",
        "\xc3\x83\xc2\xbe" => "þ",
        "\xc3\x83\xc2\xbf" => "ÿ",

        // 4 bytes, prefix \xc3\x83\xc5
        "\xc3\x83\xc5\x92" => "Ì",
        "\xc3\x83\xc5\x93" => "Ü",
        "\xc3\x83\xc5\xa0" => "Ê",
        "\xc3\x83\xc5\xa1" => "Ú",
        "\xc3\x83\xc5\xb8" => "ß",
        "\xc3\x83\xc5\xbd" => "Î",
        "\xc3\x83\xc5\xbe" => "Þ",

        // 4 bytes, prefix \xc3\x83\xc6
        "\xc3\x83\xc6\x92" => "Ã",

        // 4 bytes, prefix \xc3\x83\xcb
        "\xc3\x83\xcb\x86" => "È",
        "\xc3\x83\xcb\x9c" => "Ø",

        // 4 bytes, prefix \xc3\x85\xc2
        "\xc3\x85\xc2\xa1" => "š",
        "\xc3\x85\xc2\xb8" => "Ÿ",
        "\xc3\x85\xc2\xbd" => "Ž",
        "\xc3\x85\xc2\xbe" => "ž",

        // 4 bytes, prefix \xc3\x8b
        "\xc3\x8b\xc5\x93" => "˜",

        // 2 bytes: a lone "Ã". The reference table lists the same sequence for
        // Á, Í, Ï, Ð, Ý and í as well; "à" is the one that was chosen.
        // NOTE(review): this also rewrites every legitimate "Ã" in the input.
        "\xc3\x83" => "à",

        // 2 bytes: a lone "Â" is dropped.
        // NOTE(review): this also removes every legitimate "Â" in the input.
        "\xc3\x82" => "",
    ];

    // str_replace() accepted an array subject; keep that working.
    if (is_array($text)) {
        return array_map(__FUNCTION__, $text);
    }

    return strtr($text, $map);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment