Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Function to fix UTF-8 special characters displayed as 2 characters (UTF-8 interpreted as ISO-8859-1 or Windows-1252)
<?php /* Declare the response as UTF-8 at the HTTP level, matching the meta charset declared in the head section. */ header('Content-Type: text/html; charset=utf-8'); ?>
<html>
<head>
<title>Fix wrong encoded UTF8 characters</title>
<meta http-equiv="Content-type" content="text/html; charset=utf-8" />
</head>
<body>
<pre>
<?php
/* Problem description:
A common problem is for characters encoded as UTF-8 to have their individual bytes interpreted as ISO-8859-1 or Windows-1252.
Instead of an expected character, a sequence of Latin characters is shown, typically starting with Ã or Â. For example, instead of "è" these characters occur: "Ã¨".
A Web page is encoded as UTF-8 characters. The Web server mistakenly declares the charset to be ISO-8859-1 in the HTTP protocol that delivers the page to the browser.
The browser will then display each of the UTF-8 bytes in the Web page as Latin-1 characters.
source: http://www.i18nqa.com/debug/bug-utf-8-latin1.html
code source: https://github.com/devgeniem/wp-sanitize-accented-uploads/blob/master/plugin.php#L152
table source: http://www.i18nqa.com/debug/utf8-debug.html
https://www.kasperkamperman.com/ 2018-07-03
*/
// Sample input: UTF-8 text whose bytes were decoded as Windows-1252 and stored again as UTF-8,
// so every non-ASCII character shows up as two or three Latin characters.
$str = 'BLÃ˜F - ZOUTELANDE, MÃ˜ - FINAL SONG, FÃ©dÃ©ration Camerounaise de Football, Itâ€™s Getting the Best of Me';
echo "original string: ".$str.'<br/>';
echo "fixed string: ".fixWrongUTF8Encoding($str).'<br/>';
// displays: BLØF - ZOUTELANDE, MØ - FINAL SONG, Fédération Camerounaise de Football, It’s Getting the Best of Me
/**
 * Repairs text in which UTF-8 bytes were decoded as Windows-1252 / ISO-8859-1 and then
 * stored as UTF-8 again (e.g. "Ã©" shown instead of "é").
 *
 * Every character that Windows-1252 can display for a byte >= 0x80 is restored: U+00A0-U+00FF
 * plus the 27 printable characters Windows-1252 places in 0x80-0x9F (euro sign, curly quotes,
 * dashes, trade mark sign, ...). All other text, including correctly encoded accented
 * characters, is returned unchanged.
 *
 * Exactly one layer of wrong decoding is undone per call. Sequences whose second character
 * was dropped altogether (a lone "Ã") are left alone, because they are ambiguous.
 *
 * code source: https://github.com/devgeniem/wp-sanitize-accented-uploads/blob/master/plugin.php#L152
 * table source: http://www.i18nqa.com/debug/utf8-debug.html
 *
 * @param string $inputString UTF-8 text that may contain wrongly decoded sequences.
 * @return string The text with those sequences replaced by the intended characters.
 */
function fixWrongUTF8Encoding($inputString) {
    // The replacement table never changes, so build it once and reuse it on later calls.
    static $fix_list = null;

    if ($fix_list === null) {
        // Bytes 0x80-0x9F that Windows-1252 assigns to printable characters, given as the
        // UTF-8 encoding of each character. Byte escapes keep this table intact even if the
        // source file itself is ever re-encoded.
        $cp1252_specials = array(
            0x80 => "\xE2\x82\xAC", // euro sign
            0x82 => "\xE2\x80\x9A", // single low-9 quotation mark
            0x83 => "\xC6\x92",     // latin small f with hook
            0x84 => "\xE2\x80\x9E", // double low-9 quotation mark
            0x85 => "\xE2\x80\xA6", // horizontal ellipsis
            0x86 => "\xE2\x80\xA0", // dagger
            0x87 => "\xE2\x80\xA1", // double dagger
            0x88 => "\xCB\x86",     // modifier circumflex accent
            0x89 => "\xE2\x80\xB0", // per mille sign
            0x8A => "\xC5\xA0",     // S with caron
            0x8B => "\xE2\x80\xB9", // single left angle quotation mark
            0x8C => "\xC5\x92",     // ligature OE
            0x8E => "\xC5\xBD",     // Z with caron
            0x91 => "\xE2\x80\x98", // left single quotation mark
            0x92 => "\xE2\x80\x99", // right single quotation mark
            0x93 => "\xE2\x80\x9C", // left double quotation mark
            0x94 => "\xE2\x80\x9D", // right double quotation mark
            0x95 => "\xE2\x80\xA2", // bullet
            0x96 => "\xE2\x80\x93", // en dash
            0x97 => "\xE2\x80\x94", // em dash
            0x98 => "\xCB\x9C",     // small tilde
            0x99 => "\xE2\x84\xA2", // trade mark sign
            0x9A => "\xC5\xA1",     // s with caron
            0x9B => "\xE2\x80\xBA", // single right angle quotation mark
            0x9C => "\xC5\x93",     // ligature oe
            0x9E => "\xC5\xBE",     // z with caron
            0x9F => "\xC5\xB8",     // Y with diaeresis
        );

        // $shown[$byte] is the UTF-8 text a Windows-1252 decoder produces for one byte >= 0x80.
        $shown = array();
        for ($byte = 0x80; $byte <= 0xFF; $byte++) {
            if (isset($cp1252_specials[$byte])) {
                $shown[$byte] = $cp1252_specials[$byte];
            } elseif ($byte < 0xC0) {
                // 0xA0-0xBF map straight to U+00A0-U+00BF. The five bytes Windows-1252 leaves
                // undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) pass through as C1 control characters.
                $shown[$byte] = "\xC2" . chr($byte);
            } else {
                // 0xC0-0xFF map straight to U+00C0-U+00FF.
                $shown[$byte] = "\xC3" . chr($byte - 0x40);
            }
        }

        // For every printable character, compute what it looks like after the wrong decoding:
        // each byte of its UTF-8 encoding is replaced by what Windows-1252 shows for that byte.
        $fix_list = array();
        foreach ($shown as $byte => $real_char) {
            if ($byte < 0xA0 && !isset($cp1252_specials[$byte])) {
                continue; // undefined in Windows-1252: not a character anyone meant to write
            }

            $error_chars = '';
            $length = strlen($real_char);
            for ($i = 0; $i < $length; $i++) {
                $error_chars .= $shown[ord($real_char[$i])];
            }

            $fix_list[$error_chars] = $real_char;
        }
    }

    // strtr() tries the longest keys first and never rescans text it has already replaced, so
    // the outcome does not depend on the order of the table and replacements cannot cascade.
    return strtr($inputString, $fix_list);
}
?>
</pre>
</body>
</html>
@olcirem

This comment has been minimized.

Copy link

@olcirem olcirem commented Feb 14, 2020

'
$fix_list = array(
// 3 char errors first
'‚' => '‚', '„' => '„', '…' => '…', '‡' => '‡',
'‰' => '‰', '‹' => '‹', '‘' => '‘', '’' => '’',
'“' => '“', '•' => '•', '–' => '–', '—' => '—',
'â„¢' => '™', '›' => '›', '€' => '€',
// 2 char errors
'Â' => 'Â', 'Æ’' => 'ƒ', 'Ã' => 'Ã', 'Ä' => 'Ä',
'Ã…' => 'Å', 'â€' => '†', 'Æ' => 'Æ', 'Ç' => 'Ç',
'ˆ' => 'ˆ', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê',
'Ë' => 'Ë', 'Å’' => 'Œ', 'ÃŒ' => 'Ì', 'Ž' => 'Ž',
'ÃŽ' => 'Î', 'Ñ' => 'Ñ', 'Ã’' => 'Ò', 'Ó' => 'Ó',
'â€' => '”', 'Ô' => 'Ô', 'Õ' => 'Õ', 'Ö' => 'Ö',
'×' => '×', 'Ëœ' => '˜', 'Ø' => 'Ø', 'Ù' => 'Ù',
'Å¡' => 'š', 'Ú' => 'Ú', 'Û' => 'Û', 'Å“' => 'œ',
'Ãœ' => 'Ü', 'ž' => 'ž', 'Þ' => 'Þ', 'Ÿ' => 'Ÿ',
'ß' => 'ß', '¡' => '¡', 'á' => 'á', '¢' => '¢',
'â' => 'â', '£' => '£', 'ã' => 'ã', '¤' => '¤',
'ä' => 'ä', 'Â¥' => '¥', 'Ã¥' => 'å', '¦' => '¦',
'æ' => 'æ', '§' => '§', 'ç' => 'ç', '¨' => '¨',
'è' => 'è', '©' => '©', 'é' => 'é', 'ª' => 'ª',
'ê' => 'ê', '«' => '«', 'ë' => 'ë', '¬' => '¬',
'ì' => 'ì', '®' => '®', 'î' => 'î', '¯' => '¯',
'ï' => 'ï', '°' => '°', 'ð' => 'ð', '±' => '±',
'ñ' => 'ñ', '²' => '²', 'ò' => 'ò', '³' => '³',
'ó' => 'ó', '´' => '´', 'ô' => 'ô', 'µ' => 'µ',
'õ' => 'õ', '¶' => '¶', 'ö' => 'ö', '·' => '·',
'÷' => '÷', '¸' => '¸', 'ø' => 'ø', '¹' => '¹',
'ù' => 'ù', 'º' => 'º', 'ú' => 'ú', '»' => '»',
'û' => 'û', '¼' => '¼', 'ü' => 'ü', '½' => '½',
'ý' => 'ý', '¾' => '¾', 'þ' => 'þ', '¿' => '¿',
'ÿ' => 'ÿ', 'À' => 'À',
// 1 char errors last
'Ã' => 'Á', 'Å' => 'Š', 'Ã' => 'Í', 'Ã' => 'Ï',
'Ã' => 'Ð', 'Ã' => 'Ý', 'Ã' => 'à', 'í' => 'í'
);

    $error_chars = array_keys($fix_list);
@kasperkamperman

This comment has been minimized.

Copy link
Owner Author

@kasperkamperman kasperkamperman commented Feb 14, 2020

@olcirem Could you elaborate on your comment?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment