Skip to content

Instantly share code, notes, and snippets.

@JburkeRSAC
Created May 5, 2016 18:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JburkeRSAC/c7921d57090796d5da3c1600e5eab8d7 to your computer and use it in GitHub Desktop.
Save JburkeRSAC/c7921d57090796d5da3c1600e5eab8d7 to your computer and use it in GitHub Desktop.
Detect whether 60% or more of a string's characters are Arabic
/**
 * Return the Unicode code point of the first character of a UTF-8 string.
 *
 * The string is transcoded to UTF-32BE so that every code point occupies
 * exactly four bytes. The earlier UCS-2 based version could only represent
 * the Basic Multilingual Plane, so characters above U+FFFF (emoji, historic
 * scripts, ...) came back as the substitution character instead of their
 * real value.
 *
 * @param string $u UTF-8 encoded text; only its first character is examined.
 * @return int The code point of the first character, or 0 for an empty string.
 */
function uniord($u) {
    $utf32 = mb_convert_encoding($u, 'UTF-32BE', 'UTF-8');

    // An empty input yields no bytes at all; keep the historical result of 0.
    if (strlen($utf32) < 4) {
        return 0;
    }

    // 'N' = unsigned 32-bit big-endian; unpack() indexes its result from 1.
    $decoded = unpack('N', substr($utf32, 0, 4));

    return $decoded[1];
}
/**
 * Tell whether 60% or more of the characters in a string are Arabic.
 *
 * "Arabic" here means a code point in the range U+0600..U+06FF
 * (decimal 1536..1791). Every character counts towards the total,
 * including whitespace, digits, punctuation and newlines.
 *
 * @param string $str Text to inspect. It is expected to be UTF-8; if it is not
 *                    valid UTF-8, a conversion from the encoding reported by
 *                    mb_detect_encoding() is attempted first.
 * @return bool True when at least 60% of the characters are Arabic. False
 *              otherwise, including for an empty string or for input whose
 *              encoding cannot be determined.
 */
function is_arabic($str){
    if (!mb_check_encoding($str, 'UTF-8')) {
        // Strict detection, so that we never "detect" an encoding the bytes
        // are not actually valid in.
        $detected = mb_detect_encoding($str, mb_detect_order(), true);
        if ($detected === false) {
            return false;
        }
        // Argument order is (string, to_encoding, from_encoding).
        $str = mb_convert_encoding($str, 'UTF-8', $detected);
    }

    // Count characters with PCRE in UTF-8 mode: str_split() works on bytes,
    // not characters. The "s" flag lets "." match newlines as well.
    $total_count = preg_match_all('/./su', $str);
    if ($total_count === false || $total_count === 0) {
        // Nothing to measure (empty string) or the subject could not be
        // matched as UTF-8; also avoids dividing by zero.
        return false;
    }

    $arabic_count = preg_match_all('/[\x{0600}-\x{06FF}]/u', $str);
    if ($arabic_count === false) {
        return false;
    }

    // Integer form of ($arabic_count / $total_count) >= 0.6, which keeps the
    // exact-60% boundary free of floating-point rounding.
    return $arabic_count * 10 >= $total_count * 6;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment