Created
September 22, 2017 13:48
-
-
Save debendraoli/6a490f688709f8c80ce18c4173f35fd2 to your computer and use it in GitHub Desktop.
Detects whether a string is written mostly in Arabic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Return the Unicode code point of the first character of a UTF-8 string.
 *
 * Multibyte-aware counterpart of ord(). The conversion goes through
 * UCS-4 (32 bits per character) rather than UCS-2, so characters outside
 * the Basic Multilingual Plane (code points above U+FFFF) are reported
 * correctly instead of being mangled.
 *
 * Adapted from various online resources by Debendra Oli
 * (https://github.com/debendraoli).
 *
 * @param string $u UTF-8 encoded text; only its first character is inspected.
 * @return int The code point, or 0 when the input is empty or cannot be converted.
 */
function uniord($u) {
    $u = (string) $u;
    if ($u === '') {
        return 0;
    }

    // Isolate the first character so any trailing text cannot influence the result.
    $first = mb_substr($u, 0, 1, 'UTF-8');

    // UCS-4BE always yields exactly four big-endian bytes per character.
    $bytes = mb_convert_encoding($first, 'UCS-4BE', 'UTF-8');
    if (!is_string($bytes) || strlen($bytes) < 4) {
        return 0;
    }

    // 'N' = unsigned 32-bit big-endian integer; unpack() results are 1-indexed.
    $decoded = unpack('N', substr($bytes, 0, 4));
    return $decoded[1];
}
/**
 * Decide whether a string is written predominantly in Arabic script.
 *
 * Counts the characters that fall inside the Unicode "Arabic" block
 * (U+0600 - U+06FF, i.e. 1536 - 1791) and compares their share of ALL
 * characters in the string (spaces and punctuation included) against
 * a threshold.
 *
 * The function produces no output; it only returns the verdict.
 *
 * @param string $str       Text to inspect. Expected to be UTF-8; other encodings
 *                          are converted when mbstring can detect them.
 * @param float  $threshold Share of Arabic characters that must be exceeded
 *                          (strictly greater than). Defaults to 0.6, i.e. 60%.
 * @return bool True when the Arabic share exceeds the threshold; false otherwise,
 *              including for empty input or text whose encoding cannot be determined.
 */
function is_arabic($str, $threshold = 0.6) {
    $str = (string) $str;

    if (!mb_check_encoding($str, 'UTF-8')) {
        // Strict detection avoids guessing an encoding the bytes are not valid in.
        $detected = mb_detect_encoding($str, mb_detect_order(), true);
        if ($detected === false) {
            return false;
        }
        // Argument order is (string, TO, FROM): convert INTO UTF-8 from the detected encoding.
        $str = mb_convert_encoding($str, 'UTF-8', $detected);
    }

    // str_split() and ord() work on bytes, not characters, so every count
    // below relies on multibyte / Unicode-aware functions instead.
    $total_count = mb_strlen($str, 'UTF-8');
    if ($total_count === 0) {
        // Nothing to measure; also guards the division below.
        return false;
    }

    // With the /u modifier the class matches whole code points in the Arabic block.
    $arabic_count = preg_match_all('/[\x{0600}-\x{06FF}]/u', $str);
    if ($arabic_count === false) {
        return false;
    }

    return ($arabic_count / $total_count) > $threshold;
}
// Demo: run the detector on a sample Arabic sentence and dump the boolean verdict.
var_dump(
    is_arabic('عربية إخبارية تعمل على مدار اليوم. يمكنك مشاهدة بث القناة من خلال الموقع')
);
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment