Function to fix UTF-8 special characters displayed as two or three garbled characters (UTF-8 bytes interpreted as ISO-8859-1 or Windows-1252)
<?php header('Content-Type: text/html; charset=utf-8'); ?>
<html>
<head>
<title>Fix wrong encoded UTF8 characters</title>
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
</head>
<body>
<pre>
<?php
/* Problem description:
A common problem is for characters encoded as UTF-8 to have their individual bytes interpreted as ISO-8859-1 or Windows-1252.
Instead of an expected character, a sequence of Latin characters is shown, typically starting with Ã or Â. For example, instead of "è" these characters occur: "Ã¨".
A typical cause: a Web page is encoded as UTF-8, but the Web server mistakenly declares the charset to be ISO-8859-1 in the HTTP header that delivers the page to the browser.
The browser will then display each of the UTF-8 bytes in the Web page as a Latin-1 character.
source: http://www.i18nqa.com/debug/bug-utf-8-latin1.html
code source: https://github.com/devgeniem/wp-sanitize-accented-uploads/blob/master/plugin.php#L152
table source: http://www.i18nqa.com/debug/utf8-debug.html
https://www.kasperkamperman.com/ 2018-07-03
*/
// Sample input that is deliberately garbled (UTF-8 bytes shown as Windows-1252),
// so that the "original" and "fixed" lines below actually differ.
$str = 'BLÃ˜F - ZOUTELANDE, MÃ˜ - FINAL SONG, FÃ©dÃ©ration Camerounaise de Football, Itâ€™s Getting the Best of Me';
echo "original string: ".$str.'<br/>';
echo "fixed string: ".fixWrongUTF8Encoding($str).'<br/>';
// displays: BLØF - ZOUTELANDE, MØ - FINAL SONG, Fédération Camerounaise de Football, It’s Getting the Best of Me
/**
 * Repairs text whose UTF-8 bytes were decoded as Windows-1252 and then stored
 * as UTF-8 again (e.g. "Ã©" instead of "é", "â€™" instead of "’").
 *
 * Instead of a hand-maintained table of garbled literals - which silently
 * breaks as soon as an editor or tool normalises the file, and which cannot
 * hold the entries whose second character is invisible - the lookup map is
 * generated: for every character in the upper half of Windows-1252
 * (U+00A0-U+00FF plus the 27 printable characters at bytes 0x80-0x9F) the
 * garbled form is computed by re-reading each of its UTF-8 bytes as a
 * Windows-1252 character. This also covers the no-break space and the soft
 * hyphen.
 *
 * The replacement is done with strtr(), which scans the input once and always
 * prefers the longest match. Unlike str_replace() with arrays, text produced
 * by one replacement is never replaced again, so the order of the map entries
 * does not matter.
 *
 * @param string|array $inputString Garbled text. An array is processed
 *                                  element by element with keys preserved;
 *                                  nested arrays and objects are passed
 *                                  through unchanged.
 * @return string|array The repaired text, in the same shape as the input.
 */
function fixWrongUTF8Encoding($inputString) {
    // Built on the first call and reused afterwards.
    static $fixMap = null;

    if ($fixMap === null) {
        // Windows-1252 bytes 0x80-0x9F => Unicode code points. The five bytes
        // that Windows-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D)
        // are mapped to the C1 control with the same value, which is how
        // browsers decode them.
        $cp1252High = array(
            0x80 => 0x20AC, 0x81 => 0x0081, 0x82 => 0x201A, 0x83 => 0x0192,
            0x84 => 0x201E, 0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021,
            0x88 => 0x02C6, 0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039,
            0x8C => 0x0152, 0x8D => 0x008D, 0x8E => 0x017D, 0x8F => 0x008F,
            0x90 => 0x0090, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
            0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
            0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
            0x9C => 0x0153, 0x9D => 0x009D, 0x9E => 0x017E, 0x9F => 0x0178,
        );

        // Encodes one code point below U+10000 as UTF-8 without needing the
        // mbstring or iconv extensions.
        $toUtf8 = function ($codePoint) {
            if ($codePoint < 0x80) {
                return chr($codePoint);
            }
            if ($codePoint < 0x800) {
                return chr(0xC0 | ($codePoint >> 6))
                    . chr(0x80 | ($codePoint & 0x3F));
            }
            return chr(0xE0 | ($codePoint >> 12))
                . chr(0x80 | (($codePoint >> 6) & 0x3F))
                . chr(0x80 | ($codePoint & 0x3F));
        };

        $fixMap = array();
        for ($byte = 0x80; $byte <= 0xFF; $byte++) {
            $codePoint = isset($cp1252High[$byte]) ? $cp1252High[$byte] : $byte;
            if ($codePoint < 0xA0) {
                // Undefined Windows-1252 byte: there is no real character to restore.
                continue;
            }

            $realChar = $toUtf8($codePoint);

            // Every byte of a multi-byte UTF-8 sequence is >= 0x80, so each
            // one is either in the table above or maps to its own value.
            $garbled = '';
            $length = strlen($realChar);
            for ($i = 0; $i < $length; $i++) {
                $rawByte = ord($realChar[$i]);
                $garbled .= $toUtf8(isset($cp1252High[$rawByte]) ? $cp1252High[$rawByte] : $rawByte);
            }

            $fixMap[$garbled] = $realChar;
        }
    }

    // str_replace() accepted an array subject; keep that working for callers.
    if (is_array($inputString)) {
        $fixed = array();
        foreach ($inputString as $key => $value) {
            $fixed[$key] = (is_array($value) || is_object($value))
                ? $value
                : strtr((string) $value, $fixMap);
        }
        return $fixed;
    }

    return strtr((string) $inputString, $fixMap);
}
?>
</pre>
</body>
</html>
This comment has been minimized.
This comment has been minimized.
@olcirem Could you elaborate on your comment? |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
'
$fix_list = array(
// 3 char errors first
'‚' => '‚', '„' => '„', '…' => '…', '‡' => '‡',
'‰' => '‰', '‹' => '‹', '‘' => '‘', '’' => '’',
'“' => '“', '•' => '•', '–' => '–', '—' => '—',
'â„¢' => '™', '›' => '›', '€' => '€',
// 2 char errors
'Â' => 'Â', 'Æ’' => 'ƒ', 'Ã' => 'Ã', 'Ä' => 'Ä',
'Ã…' => 'Å', 'â€' => '†', 'Æ' => 'Æ', 'Ç' => 'Ç',
'ˆ' => 'ˆ', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê',
'Ë' => 'Ë', 'Å’' => 'Œ', 'ÃŒ' => 'Ì', 'Ž' => 'Ž',
'ÃŽ' => 'Î', 'Ñ' => 'Ñ', 'Ã’' => 'Ò', 'Ó' => 'Ó',
'â€' => '”', 'Ô' => 'Ô', 'Õ' => 'Õ', 'Ö' => 'Ö',
'×' => '×', 'Ëœ' => '˜', 'Ø' => 'Ø', 'Ù' => 'Ù',
'Å¡' => 'š', 'Ú' => 'Ú', 'Û' => 'Û', 'Å“' => 'œ',
'Ü' => 'Ü', 'ž' => 'ž', 'Þ' => 'Þ', 'Ÿ' => 'Ÿ',
'ß' => 'ß', '¡' => '¡', 'á' => 'á', '¢' => '¢',
'â' => 'â', '£' => '£', 'ã' => 'ã', '¤' => '¤',
'ä' => 'ä', 'Â¥' => '¥', 'Ã¥' => 'å', '¦' => '¦',
'æ' => 'æ', '§' => '§', 'ç' => 'ç', '¨' => '¨',
'è' => 'è', '©' => '©', 'é' => 'é', 'ª' => 'ª',
'ê' => 'ê', '«' => '«', 'ë' => 'ë', '¬' => '¬',
'ì' => 'ì', '®' => '®', 'î' => 'î', '¯' => '¯',
'ï' => 'ï', '°' => '°', 'ð' => 'ð', '±' => '±',
'ñ' => 'ñ', '²' => '²', 'ò' => 'ò', '³' => '³',
'ó' => 'ó', '´' => '´', 'ô' => 'ô', 'µ' => 'µ',
'õ' => 'õ', '¶' => '¶', 'ö' => 'ö', '·' => '·',
'÷' => '÷', '¸' => '¸', 'ø' => 'ø', '¹' => '¹',
'ù' => 'ù', 'º' => 'º', 'ú' => 'ú', '»' => '»',
'û' => 'û', '¼' => '¼', 'ü' => 'ü', '½' => '½',
'ý' => 'ý', '¾' => '¾', 'þ' => 'þ', '¿' => '¿',
'ÿ' => 'ÿ', 'À' => 'À',
// 1 char errors last
'Ã' => 'Á', 'Å' => 'Š', 'Ã' => 'Í', 'Ã' => 'Ï',
'Ã' => 'Ð', 'Ã' => 'Ý', 'Ã' => 'à', 'Ã' => 'í'
);