Skip to content

Instantly share code, notes, and snippets.

@stormsweeper
Created April 24, 2014 03:21
Show Gist options
  • Save stormsweeper/11240363 to your computer and use it in GitHub Desktop.
Save stormsweeper/11240363 to your computer and use it in GitHub Desktop.
<?php
/**
 * A utility class to clean up common problems with UTF-8 strings.
 *
 * All methods are static and operate on byte strings. Apart from convertSpaces(), which uses a
 * Unicode-aware regular expression, the conversions are plain byte-sequence substitutions done
 * with strtr().
 */
class UnicodeUtils
{
    /**
     * Maps double-encoded UTF-8 byte sequences back to single encoded UTF-8. The keys are byte sequences where a
     * valid UTF-8 character has been interpreted as multiple characters in CP1252, and then re-converted
     * to UTF-8 characters. The values are the UTF-8 character byte sequence that was double encoded.
     *
     * The table covers U+00A0 through U+00FF plus the characters CP1252 places in the 0x80-0x9F range
     * (euro sign, curly quotes, dashes, etc.). Bytes that CP1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90,
     * 0x9D) appear in the keys as the corresponding C1 control characters ("\xC2\x81" and so on).
     *
     * Every key starts with the byte 0xC3 followed by one of 0x82, 0x83, 0x85, 0x86, 0x8B or 0xA2;
     * dedupUTF8() relies on this for its fast-path check, so keep the two in sync when adding entries.
     */
    private static $duped_utf8_mapping = [
        "\xC3\x82\xC2\xA0" => "\xC2\xA0",
        "\xC3\x82\xC2\xA1" => "\xC2\xA1",
        "\xC3\x82\xC2\xA2" => "\xC2\xA2",
        "\xC3\x82\xC2\xA3" => "\xC2\xA3",
        "\xC3\x82\xC2\xA4" => "\xC2\xA4",
        "\xC3\x82\xC2\xA5" => "\xC2\xA5",
        "\xC3\x82\xC2\xA6" => "\xC2\xA6",
        "\xC3\x82\xC2\xA7" => "\xC2\xA7",
        "\xC3\x82\xC2\xA8" => "\xC2\xA8",
        "\xC3\x82\xC2\xA9" => "\xC2\xA9",
        "\xC3\x82\xC2\xAA" => "\xC2\xAA",
        "\xC3\x82\xC2\xAB" => "\xC2\xAB",
        "\xC3\x82\xC2\xAC" => "\xC2\xAC",
        "\xC3\x82\xC2\xAD" => "\xC2\xAD",
        "\xC3\x82\xC2\xAE" => "\xC2\xAE",
        "\xC3\x82\xC2\xAF" => "\xC2\xAF",
        "\xC3\x82\xC2\xB0" => "\xC2\xB0",
        "\xC3\x82\xC2\xB1" => "\xC2\xB1",
        "\xC3\x82\xC2\xB2" => "\xC2\xB2",
        "\xC3\x82\xC2\xB3" => "\xC2\xB3",
        "\xC3\x82\xC2\xB4" => "\xC2\xB4",
        "\xC3\x82\xC2\xB5" => "\xC2\xB5",
        "\xC3\x82\xC2\xB6" => "\xC2\xB6",
        "\xC3\x82\xC2\xB7" => "\xC2\xB7",
        "\xC3\x82\xC2\xB8" => "\xC2\xB8",
        "\xC3\x82\xC2\xB9" => "\xC2\xB9",
        "\xC3\x82\xC2\xBA" => "\xC2\xBA",
        "\xC3\x82\xC2\xBB" => "\xC2\xBB",
        "\xC3\x82\xC2\xBC" => "\xC2\xBC",
        "\xC3\x82\xC2\xBD" => "\xC2\xBD",
        "\xC3\x82\xC2\xBE" => "\xC2\xBE",
        "\xC3\x82\xC2\xBF" => "\xC2\xBF",
        "\xC3\x83\xC2\x81" => "\xC3\x81",
        "\xC3\x83\xC2\x8D" => "\xC3\x8D",
        "\xC3\x83\xC2\x8F" => "\xC3\x8F",
        "\xC3\x83\xC2\x90" => "\xC3\x90",
        "\xC3\x83\xC2\x9D" => "\xC3\x9D",
        "\xC3\x83\xC2\xA0" => "\xC3\xA0",
        "\xC3\x83\xC2\xA1" => "\xC3\xA1",
        "\xC3\x83\xC2\xA2" => "\xC3\xA2",
        "\xC3\x83\xC2\xA3" => "\xC3\xA3",
        "\xC3\x83\xC2\xA4" => "\xC3\xA4",
        "\xC3\x83\xC2\xA5" => "\xC3\xA5",
        "\xC3\x83\xC2\xA6" => "\xC3\xA6",
        "\xC3\x83\xC2\xA7" => "\xC3\xA7",
        "\xC3\x83\xC2\xA8" => "\xC3\xA8",
        "\xC3\x83\xC2\xA9" => "\xC3\xA9",
        "\xC3\x83\xC2\xAA" => "\xC3\xAA",
        "\xC3\x83\xC2\xAB" => "\xC3\xAB",
        "\xC3\x83\xC2\xAC" => "\xC3\xAC",
        "\xC3\x83\xC2\xAD" => "\xC3\xAD",
        "\xC3\x83\xC2\xAE" => "\xC3\xAE",
        "\xC3\x83\xC2\xAF" => "\xC3\xAF",
        "\xC3\x83\xC2\xB0" => "\xC3\xB0",
        "\xC3\x83\xC2\xB1" => "\xC3\xB1",
        "\xC3\x83\xC2\xB2" => "\xC3\xB2",
        "\xC3\x83\xC2\xB3" => "\xC3\xB3",
        "\xC3\x83\xC2\xB4" => "\xC3\xB4",
        "\xC3\x83\xC2\xB5" => "\xC3\xB5",
        "\xC3\x83\xC2\xB6" => "\xC3\xB6",
        "\xC3\x83\xC2\xB7" => "\xC3\xB7",
        "\xC3\x83\xC2\xB8" => "\xC3\xB8",
        "\xC3\x83\xC2\xB9" => "\xC3\xB9",
        "\xC3\x83\xC2\xBA" => "\xC3\xBA",
        "\xC3\x83\xC2\xBB" => "\xC3\xBB",
        "\xC3\x83\xC2\xBC" => "\xC3\xBC",
        "\xC3\x83\xC2\xBD" => "\xC3\xBD",
        "\xC3\x83\xC2\xBE" => "\xC3\xBE",
        "\xC3\x83\xC2\xBF" => "\xC3\xBF",
        "\xC3\x83\xC5\x92" => "\xC3\x8C",
        "\xC3\x83\xC5\x93" => "\xC3\x9C",
        "\xC3\x83\xC5\xA0" => "\xC3\x8A",
        "\xC3\x83\xC5\xA1" => "\xC3\x9A",
        "\xC3\x83\xC5\xB8" => "\xC3\x9F",
        "\xC3\x83\xC5\xBD" => "\xC3\x8E",
        "\xC3\x83\xC5\xBE" => "\xC3\x9E",
        "\xC3\x83\xC6\x92" => "\xC3\x83",
        "\xC3\x83\xCB\x86" => "\xC3\x88",
        "\xC3\x83\xCB\x9C" => "\xC3\x98",
        "\xC3\x83\xE2\x80\x93" => "\xC3\x96",
        "\xC3\x83\xE2\x80\x94" => "\xC3\x97",
        "\xC3\x83\xE2\x80\x98" => "\xC3\x91",
        "\xC3\x83\xE2\x80\x99" => "\xC3\x92",
        "\xC3\x83\xE2\x80\x9A" => "\xC3\x82",
        "\xC3\x83\xE2\x80\x9C" => "\xC3\x93",
        "\xC3\x83\xE2\x80\x9D" => "\xC3\x94",
        "\xC3\x83\xE2\x80\x9E" => "\xC3\x84",
        "\xC3\x83\xE2\x80\xA0" => "\xC3\x86",
        "\xC3\x83\xE2\x80\xA1" => "\xC3\x87",
        "\xC3\x83\xE2\x80\xA2" => "\xC3\x95",
        "\xC3\x83\xE2\x80\xA6" => "\xC3\x85",
        "\xC3\x83\xE2\x80\xB0" => "\xC3\x89",
        "\xC3\x83\xE2\x80\xB9" => "\xC3\x8B",
        "\xC3\x83\xE2\x80\xBA" => "\xC3\x9B",
        "\xC3\x83\xE2\x82\xAC" => "\xC3\x80",
        "\xC3\x83\xE2\x84\xA2" => "\xC3\x99",
        "\xC3\x85\xC2\xA0" => "\xC5\xA0",
        "\xC3\x85\xC2\xA1" => "\xC5\xA1",
        "\xC3\x85\xC2\xB8" => "\xC5\xB8",
        "\xC3\x85\xC2\xBD" => "\xC5\xBD",
        "\xC3\x85\xC2\xBE" => "\xC5\xBE",
        "\xC3\x85\xE2\x80\x99" => "\xC5\x92",
        "\xC3\x85\xE2\x80\x9C" => "\xC5\x93",
        "\xC3\x86\xE2\x80\x99" => "\xC6\x92",
        "\xC3\x8B\xC5\x93" => "\xCB\x9C",
        "\xC3\x8B\xE2\x80\xA0" => "\xCB\x86",
        "\xC3\xA2\xE2\x80\x9A\xC2\xAC" => "\xE2\x82\xAC",
        "\xC3\xA2\xE2\x80\x9E\xC2\xA2" => "\xE2\x84\xA2",
        "\xC3\xA2\xE2\x82\xAC\xC2\x9D" => "\xE2\x80\x9D",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA0" => "\xE2\x80\xA0",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA1" => "\xE2\x80\xA1",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA2" => "\xE2\x80\xA2",
        "\xC3\xA2\xE2\x82\xAC\xC2\xA6" => "\xE2\x80\xA6",
        "\xC3\xA2\xE2\x82\xAC\xC2\xB0" => "\xE2\x80\xB0",
        "\xC3\xA2\xE2\x82\xAC\xC2\xB9" => "\xE2\x80\xB9",
        "\xC3\xA2\xE2\x82\xAC\xC2\xBA" => "\xE2\x80\xBA",
        "\xC3\xA2\xE2\x82\xAC\xC5\x93" => "\xE2\x80\x9C",
        "\xC3\xA2\xE2\x82\xAC\xC5\xA1" => "\xE2\x80\x9A",
        "\xC3\xA2\xE2\x82\xAC\xC5\xBE" => "\xE2\x80\x9E",
        "\xC3\xA2\xE2\x82\xAC\xCB\x9C" => "\xE2\x80\x98",
        "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9C" => "\xE2\x80\x93",
        "\xC3\xA2\xE2\x82\xAC\xE2\x80\x9D" => "\xE2\x80\x94",
        "\xC3\xA2\xE2\x82\xAC\xE2\x84\xA2" => "\xE2\x80\x99",
    ];

    /**
     * Cleans up instances where a UTF-8 string has been incorrectly re-encoded to UTF-8 from ISO-8859-1/CP1252.
     *
     * Only one layer of double encoding is removed per call: strtr() does not rescan text it has
     * already replaced.
     *
     * @see http://www.i18nqa.com/debug/utf8-debug.html#dbg
     * @param string $str
     * @return string the input with every sequence listed in the mapping replaced by its original character;
     *                returned untouched when it cannot contain any such sequence
     */
    public static function dedupUTF8($str)
    {
        // Every mapping key begins with 0xC3, so a string without that byte cannot contain any of them.
        if (strpos($str, "\xC3") === false) {
            return $str;
        }

        // The six two-byte prefixes the mapping keys start with; the (much larger) table
        // is only consulted when at least one of them is present.
        $prefixes = ["\xC3\x83", "\xC3\x82", "\xC3\xA2", "\xC3\x85", "\xC3\x8B", "\xC3\x86"];
        foreach ($prefixes as $prefix) {
            if (strpos($str, $prefix) !== false) {
                return strtr($str, self::$duped_utf8_mapping);
            }
        }

        return $str;
    }

    /**
     * Converts various "extended" Unicode characters to ASCII equivalents.
     *
     * Applies convertSpaces(), then convertSmartChars(), then convertCurlyQuotes().
     *
     * @see self::convertCurlyQuotes
     * @see self::convertSmartChars
     * @see self::convertSpaces
     * @param string $str
     * @return string
     */
    public static function convertToSimpleChars($str)
    {
        $spaced = self::convertSpaces($str);
        $plain = self::convertSmartChars($spaced);

        return self::convertCurlyQuotes($plain);
    }

    /**
     * Converts curly (aka "smart" or "typographic") quotes to straight quotes as found in ASCII.
     *
     * The acute accent (U+00B4) is also turned into an apostrophe.
     *
     * @param string $str
     * @return string
     */
    public static function convertCurlyQuotes($str)
    {
        $replacements = [
            "\xC2\xB4" => '\'', // acute accent, common on European keyboards
            "\xE2\x80\x98" => '\'', // left single quote
            "\xE2\x80\x99" => '\'', // right single quote
            "\xE2\x80\x9C" => '"', // left double quote
            "\xE2\x80\x9D" => '"', // right double quote
        ];

        return strtr($str, $replacements);
    }

    /**
     * Converts various characters from "smart" versions in Unicode back to ASCII lookalikes.
     *
     * Note that some replacements are longer than the character they replace (ellipsis becomes three
     * dots, em dash and horizontal bar become two hyphens).
     *
     * @param string $str
     * @return string
     */
    public static function convertSmartChars($str)
    {
        $replacements = [
            "\xE2\x80\xA6" => '...', // horizontal ellipsis
            "\xE2\x80\x90" => '-', // hyphen
            "\xE2\x80\x91" => '-', // non-breaking hyphen
            "\xE2\x80\x92" => '-', // figure dash
            "\xE2\x80\x93" => '-', // en dash
            "\xE2\x80\x94" => '--', // em dash
            "\xE2\x80\x95" => '--', // horizontal bar
            "\xC2\xB7" => '*', // mid dot
            "\xE2\x80\xA2" => '*', // bullet
            "\xC2\xB0" => 'o', // degree symbol
        ];

        return strtr($str, $replacements);
    }

    /**
     * Converts all "space" characters in Unicode to a standard space.
     *
     * @see http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Zs:]
     * @param string $str
     * @return string the converted string; if $str is not valid UTF-8 the pattern cannot be applied
     *                and $str is returned unchanged
     */
    public static function convertSpaces($str)
    {
        $converted = preg_replace('/\p{Zs}/u', ' ', $str);

        // With the /u modifier PCRE refuses subjects that are not valid UTF-8 and preg_replace()
        // returns null. Hand back the input instead of null so callers (including
        // convertToSimpleChars()) never lose the whole string to a single bad byte.
        if ($converted === null) {
            return $str;
        }

        return $converted;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment