Created
June 22, 2016 12:12
-
-
Save ImpactSeo/78561612047efa7591264b8276357a4b to your computer and use it in GitHub Desktop.
Clean UTF-8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Main function to call before sending your text through the API.
 *
 * Steps:
 *  1. If mb_detect_encoding() (candidates: UTF-8, ISO-8859-1) does not report
 *     the text as UTF-8, it is passed through convert_to_utf8() and checked
 *     again. If it is still not reported as UTF-8, the converted text is
 *     returned without further processing.
 *  2. Mojibake is repaired with fix_windows_encoding().
 *  3. Typographic punctuation is flattened to ASCII with fixcurly().
 *
 * Step 2 must run before step 3: many mojibake sequences contain a curly
 * quote, a dash or an ellipsis as one of their characters (for example the
 * garbled form of "Ñ" ends with a left single quotation mark). Flattening
 * first would destroy those sequences before the repair table can match them.
 *
 * @param string $text Text to clean.
 * @return string The cleaned text.
 */
function clean_utf8($text)
{
    $candidates = 'UTF-8, ISO-8859-1';

    if (mb_detect_encoding($text, $candidates) !== 'UTF-8') {
        // Not reported as UTF-8: convert with the external library, then re-check.
        $text = convert_to_utf8($text);

        if (mb_detect_encoding($text, $candidates) !== 'UTF-8') {
            return $text;
        }
    }

    // Repair first, flatten second (see the docblock for why the order matters).
    return fixcurly(fix_windows_encoding($text));
}
/**
 * Convert a text to UTF-8 with the ForceUTF8 library, on a best-effort basis.
 *
 * The library must be installed and loadable:
 * https://github.com/neitanod/forceutf8
 *
 * Any failure is written to the error log and the text is returned unchanged.
 * On PHP 7+ this includes engine errors (\Error), e.g. the library class not
 * being loadable; those are not instances of \Exception, so a dedicated
 * \Throwable catch is needed for them.
 *
 * @param string $text Text to convert.
 * @return string The value returned by \ForceUTF8\Encoding::toUTF8(), or the
 *                input unchanged when the conversion failed.
 */
function convert_to_utf8($text)
{
    try {
        $text = \ForceUTF8\Encoding::toUTF8($text);
    } catch (\Exception $e) {
        error_log('ERROR CONVERING UTF 8 : ' . $e);
    } catch (\Throwable $e) {
        // PHP 7+ only: engine errors. On PHP 5 this clause simply never matches.
        error_log('ERROR CONVERING UTF 8 : ' . $e);
    }

    return $text;
}
/**
 * Replace UTF-8 typographic punctuation with plain ASCII stand-ins.
 *
 * Curly double and single quotes become straight quotes, an en dash becomes
 * "-", an em dash becomes "--" and a horizontal ellipsis becomes "...".
 *
 * @param string $string Text to flatten.
 * @return string The text with the punctuation above replaced.
 */
function fixcurly($string)
{
    // UTF-8 byte sequence => ASCII replacement.
    $asciiFor = [
        "\xe2\x80\x9c" => '"',   // left double quotation mark
        "\xe2\x80\x9d" => '"',   // right double quotation mark
        "\xe2\x80\x98" => "'",   // left single quotation mark
        "\xe2\x80\x99" => "'",   // right single quotation mark
        "\xe2\x80\x93" => '-',   // en dash
        "\xe2\x80\x94" => '--',  // em dash
        "\xe2\x80\xa6" => '...', // horizontal ellipsis
    ];

    return str_replace(array_keys($asciiFor), array_values($asciiFor), $string);
}
/**
 * Repair mojibake: characters that were garbled because UTF-8 bytes were read
 * as Windows-1252 (or Latin-1) and then encoded to UTF-8 again.
 *
 * Reference tables:
 * http://www.i18nqa.com/debug/utf8-debug.html
 * http://stackoverflow.com/questions/9210473/how-to-convert-text-with-html-entites-and-invalid-characters-to-its-utf-8-equi
 * http://stackoverflow.com/questions/3565713/how-can-i-convert-html-character-references-x5e3-to-regular-utf-8/3566055#3566055
 *
 * The table below is applied by str_replace() entry by entry, in order, each
 * entry working on the output of the previous one. ORDER MATTERS: specific
 * (longer) sequences come before the generic fallbacks that share their
 * prefix, and the two catch-all fallbacks come last.
 *
 * Several characters look identical in the reference tables ("Ã" for
 * Á Í Ï Ð Ý à í, "â€" for ” and †) because their second garbled character is
 * invisible (a C1 control, a no-break space or a soft hyphen). Their byte
 * sequences are nevertheless distinct, so they are matched in full here. Only
 * a truly bare "Ã" / "â€" reaches the fallbacks ("à" / "”").
 *
 * Note that the fallbacks are lossy by design: any remaining "Ã" becomes "à"
 * and any remaining "Â" is removed.
 *
 * @param string $text UTF-8 text that may contain mojibake.
 * @return string The text with the known garbled sequences replaced.
 */
function fix_windows_encoding($text)
{
    // Garbled UTF-8 byte sequence => intended character.
    $repairs = [
        // "â€" followed by a 3-byte character
        "\xc3\xa2\xe2\x82\xac\xe2\x80\x9c" => "–",
        "\xc3\xa2\xe2\x82\xac\xe2\x80\x9d" => "—",
        "\xc3\xa2\xe2\x82\xac\xe2\x84\xa2" => "’",

        // "â" + 3-byte character + 2-byte character, and "â€" + 2-byte character
        "\xc3\xa2\xe2\x80\x9a\xc2\xac" => "€",
        "\xc3\xa2\xe2\x82\xac\xc2\xa1" => "‡",
        "\xc3\xa2\xe2\x80\x9e\xc2\xa2" => "™",
        "\xc3\xa2\xe2\x82\xac\xc2\xa2" => "•",
        "\xc3\xa2\xe2\x82\xac\xc2\xb0" => "‰",
        "\xc3\xa2\xe2\x82\xac\xc2\xb9" => "‹",
        "\xc3\xa2\xe2\x82\xac\xc2\xba" => "›",
        "\xc3\xa2\xe2\x82\xac\xcb\x9c" => "‘",
        "\xc3\xa2\xe2\x82\xac\xc5\x93" => "“",
        // Invisible third character: U+009D (”) and no-break space (†).
        "\xc3\xa2\xe2\x82\xac\xc2\x9d" => "”",
        "\xc3\xa2\xe2\x82\xac\xc2\xa0" => "†",

        // "â" + two 2-byte characters (not in the reference list)
        "\xc3\xa2\xc2\x82\xc2\xac" => "€",
        "\xc3\xa2\xc2\x80\xc2\xba" => "›",

        // Fallback for a bare "â€"; must stay after every longer "â€…" entry.
        "\xc3\xa2\xe2\x82\xac" => "”",

        // "Ã" followed by a 3-byte character
        "\xc3\x83\xe2\x80\x9a" => "Â",
        "\xc3\x83\xe2\x80\x9e" => "Ä",
        "\xc3\x83\xe2\x80\xa0" => "Æ",
        "\xc3\x83\xe2\x80\xa1" => "Ç",
        "\xc3\x83\xe2\x80\xa2" => "Õ",
        "\xc3\x83\xe2\x80\xa6" => "Å",
        "\xc3\x83\xe2\x80\x93" => "Ö",
        "\xc3\x83\xe2\x80\x94" => "×",
        "\xc3\x83\xe2\x80\x98" => "Ñ",
        "\xc3\x83\xe2\x80\x99" => "Ò",
        "\xc3\x83\xe2\x80\x9c" => "Ó",
        "\xc3\x83\xe2\x80\x9d" => "Ô",
        "\xc3\x83\xe2\x80\xb0" => "É",
        "\xc3\x83\xe2\x80\xb9" => "Ë",
        "\xc3\x83\xe2\x80\xba" => "Û",
        "\xc3\x83\xe2\x82\xac" => "À",
        "\xc3\x83\xe2\x84\xa2" => "Ù",

        // "Å" followed by a 3-byte character
        "\xc3\x85\xe2\x80\x99" => "Œ",
        "\xc3\x85\xe2\x80\x9c" => "œ",

        // "Ë" followed by a 3-byte character
        "\xc3\x8b\xe2\x80\xa0" => "ˆ",

        // "Á©": kept from the original table (does not follow the Windows-1252 pattern).
        "\xc3\x81\xc2\xa9" => "é",

        // "Â" followed by a 2-byte character
        "\xc3\x82\xc2\xa1" => "¡",
        "\xc3\x82\xc2\xa2" => "¢",
        "\xc3\x82\xc2\xa3" => "£",
        "\xc3\x82\xc2\xa4" => "¤",
        "\xc3\x82\xc2\xa5" => "¥",
        "\xc3\x82\xc2\xa6" => "¦",
        "\xc3\x82\xc2\xa7" => "§",
        "\xc3\x82\xc2\xa8" => "¨",
        "\xc3\x82\xc2\xa9" => "©",
        "\xc3\x82\xc2\xaa" => "ª",
        "\xc3\x82\xc2\xab" => "«",
        "\xc3\x82\xc2\xac" => "¬",
        "\xc3\x82\xc2\xad" => "", // "Â" + soft hyphen: removed entirely
        "\xc3\x82\xc2\xae" => "®",
        "\xc3\x82\xc2\xaf" => "¯",
        "\xc3\x82\xc2\xb0" => "°",
        "\xc3\x82\xc2\xb1" => "±",
        "\xc3\x82\xc2\xb2" => "²",
        "\xc3\x82\xc2\xb3" => "³",
        "\xc3\x82\xc2\xb4" => "´",
        "\xc3\x82\xc2\xb5" => "µ",
        "\xc3\x82\xc2\xb6" => "¶",
        "\xc3\x82\xc2\xb7" => "·",
        "\xc3\x82\xc2\xb8" => "¸",
        "\xc3\x82\xc2\xb9" => "¹",
        "\xc3\x82\xc2\xba" => "º",
        "\xc3\x82\xc2\xbb" => "»",
        "\xc3\x82\xc2\xbc" => "¼",
        "\xc3\x82\xc2\xbd" => "½",
        "\xc3\x82\xc2\xbe" => "¾",
        "\xc3\x82\xc2\xbf" => "¿",

        // "Ã" followed by a 2-byte character
        "\xc3\x83\xc2\xa1" => "á",
        "\xc3\x83\xc2\xa2" => "â",
        "\xc3\x83\xc2\xa3" => "ã",
        "\xc3\x83\xc2\xa4" => "ä",
        "\xc3\x83\xc2\xa5" => "å",
        "\xc3\x83\xc2\xa6" => "æ",
        "\xc3\x83\xc2\xa7" => "ç",
        "\xc3\x83\xc2\xa8" => "è",
        "\xc3\x83\xc2\xa9" => "é",
        "\xc3\x83\xc2\xaa" => "ê",
        "\xc3\x83\xc2\xab" => "ë",
        "\xc3\x83\xc2\xac" => "ì",
        "\xc3\x83\xc2\xae" => "î",
        "\xc3\x83\xc2\xaf" => "ï",
        "\xc3\x83\xc2\xb0" => "ð",
        "\xc3\x83\xc2\xb1" => "ñ",
        "\xc3\x83\xc2\xb2" => "ò",
        "\xc3\x83\xc2\xb3" => "ó",
        "\xc3\x83\xc2\xb4" => "ô",
        "\xc3\x83\xc2\xb5" => "õ",
        "\xc3\x83\xc2\xb6" => "ö",
        "\xc3\x83\xc2\xb7" => "÷",
        "\xc3\x83\xc2\xb8" => "ø",
        "\xc3\x83\xc2\xb9" => "ù",
        "\xc3\x83\xc2\xba" => "ú",
        "\xc3\x83\xc2\xbb" => "û",
        "\xc3\x83\xc2\xbc" => "ü",
        "\xc3\x83\xc2\xbd" => "ý",
        "\xc3\x83\xc2\xbe" => "þ",
        "\xc3\x83\xc2\xbf" => "ÿ",
        // Invisible second character: C1 controls, no-break space, soft hyphen.
        "\xc3\x83\xc2\x81" => "Á",
        "\xc3\x83\xc2\x8d" => "Í",
        "\xc3\x83\xc2\x8f" => "Ï",
        "\xc3\x83\xc2\x90" => "Ð",
        "\xc3\x83\xc2\x9d" => "Ý",
        "\xc3\x83\xc2\xa0" => "à",
        "\xc3\x83\xc2\xad" => "í",

        // "Ã" followed by Œ œ Š š Ÿ Ž ž
        "\xc3\x83\xc5\x92" => "Ì",
        "\xc3\x83\xc5\x93" => "Ü",
        "\xc3\x83\xc5\xa0" => "Ê",
        "\xc3\x83\xc5\xa1" => "Ú",
        "\xc3\x83\xc5\xb8" => "ß",
        "\xc3\x83\xc5\xbd" => "Î",
        "\xc3\x83\xc5\xbe" => "Þ",

        // "Ãƒ"
        "\xc3\x83\xc6\x92" => "Ã",

        // "Ã" followed by ˆ or ˜
        "\xc3\x83\xcb\x86" => "È",
        "\xc3\x83\xcb\x9c" => "Ø",

        // "Å" followed by a 2-byte character
        "\xc3\x85\xc2\xa1" => "š",
        "\xc3\x85\xc2\xb8" => "Ÿ",
        "\xc3\x85\xc2\xbd" => "Ž",
        "\xc3\x85\xc2\xbe" => "ž",

        // "Ëœ"
        "\xc3\x8b\xc5\x93" => "˜",

        // Catch-all fallbacks (2 bytes each); must stay last.
        "\xc3\x83" => "à", // any remaining "Ã"
        "\xc3\x82" => "",  // any remaining "Â" is dropped
    ];

    return str_replace(array_keys($repairs), array_values($repairs), $text);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment