Last active
March 13, 2020 15:59
-
-
Save duzun/cc4ac3fa768bacf7ced0 to your computer and use it in GitHub Desktop.
Check whether a string can be interpreted as a valid UTF-8 or if it has any non-ASCII UTF-8 bytes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Check whether $str is well-formed UTF-8 (as defined by RFC 3629).
 *
 * The following are all treated as invalid: continuation bytes without a
 * preceding lead byte, truncated sequences, overlong encodings, UTF-16
 * surrogates (U+D800..U+DFFF), code points above U+10FFFF and the obsolete
 * 5- and 6-byte forms.
 *
 * @param string $str              The byte string to inspect (cast to string).
 * @param bool   $hasNonASCII_UTF8 If true, report how many multi-byte
 *                                 (non-ASCII) sequences $str contains instead
 *                                 of a plain validity flag.
 *
 * @return bool|int false if $str is not valid UTF-8;
 *                  if $str is valid UTF-8:
 *                    when $hasNonASCII_UTF8 is false -> true,
 *                    when $hasNonASCII_UTF8 is true  -> the number of
 *                    multi-byte sequences (one per non-ASCII character),
 *                    or false when $str is pure ASCII (or empty).
 *
 * @author Dumitru Uzun
 */
function isUTF8($str, $hasNonASCII_UTF8 = false) {
    $str = (string) $str;

    // With the "u" modifier PCRE validates the whole subject as UTF-8 before
    // matching. The empty pattern matches any valid subject (result 1);
    // a malformed subject makes preg_match() return false.
    if (preg_match('//u', $str) !== 1) {
        return false;
    }

    if (!$hasNonASCII_UTF8) {
        return true;
    }

    // In well-formed UTF-8 every multi-byte sequence starts with exactly one
    // lead byte in 0xC2..0xF4, and continuation bytes (0x80..0xBF) never fall
    // in that range, so counting lead bytes counts the sequences.
    // Deliberately no "u" modifier here: the class must match raw bytes.
    $sequences = preg_match_all('/[\xC2-\xF4]/', $str);

    // Pure-ASCII input has "no non-ASCII UTF-8", which is reported as false.
    return $sequences > 0 ? $sequences : false;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here is the JS implementation as utf8bytes