/**
 * Detect the language of a piece of UTF-8 text.
 *
 * The text is first cleaned (truncated, symbols/URLs/emails/punctuation
 * stripped). A handful of scripts are then recognised locally by their Unicode
 * ranges; anything else that is long enough is sent to the Elasticsearch
 * `_langdetect` endpoint.
 *
 * @param string $text Text to analyse; expected to be valid UTF-8.
 * @return string|false Language code (e.g. 'km', 'hy', or whatever the
 *                      langdetect endpoint reports) or false when the language
 *                      cannot be determined with enough confidence.
 */
function es_api_detect_lang( $text ) {
    $lang = false;

    //if we can't tell the lang with 5000 characters we probably can't tell the language
    $text = mb_substr( (string) $text, 0, 5000, 'UTF-8' );

    // Patterns are applied in order; each match is replaced by a single space.
    $cleanup_patterns = array(
        //replace non-breaking spaces so they don't match the \p{L} char class
        '/[\x{00A0}\x{2007}\x{202F}]/u',
        //replace unicode symbols: see: http://www.utf8-chartable.de/unicode-utf8-table.pl
        '/[\x{2000}-\x{2BFF}]/u', //symbols
        '/[\x{1f000}-\x{1ffff}]/u', //symbols, emoticons
        //try and get rid of URLs that the lang detect api will strip out anyways
        // avoid getting ES exceptions for empty posts
        // this regex is simplistic, but should get 99% of cases
        '#(http|https)\:\/\/[a-z0-9\-_.]+\.[a-z]{2,}(/\S*)?#i',
        //and remove email addresses too
        '#([.0-9a-z_+-]+)@(([0-9a-z-_]+\.)+[0-9a-z]{2,})#i',
    );
    $text = preg_replace( $cleanup_patterns, ' ', $text );
    if ( null === $text ) {
        // preg_replace() yields null on error (e.g. malformed UTF-8 with the
        // `u` modifier); nothing sensible can be detected from such input.
        return false;
    }

    // The `u` modifier makes \p{L} test whole code points instead of single bytes.
    if ( ! preg_match( '/\p{L}/u', $text ) ) {
        return false; //no utf-8 letters, we can't detect anything
    }

    //remove punctuation - leading punctuation seems to cause lang detect to fail
    // so just strip it all
    // (`u` is required here: without it, single bytes inside multi-byte
    // characters can be overwritten, corrupting the UTF-8 sequence)
    $text = preg_replace( '/\p{P}/u', ' ', $text );
    if ( null === $text ) {
        return false;
    }

    //do our own detection of some languages that the langdetect plugin doesn't handle
    // Checked in order; each entry is language code => ( script regex, minimum letter ratio ).
    $script_langs = array(
        // Khmer
        'km' => array( '/[\x{1780}-\x{17FF}]/u', 0.25 ),
        //Capital O with a tilde occurs a lot in armenian text, so include that in the charset
        'hy' => array( '/[\x{0530}-\x{058F}\x{00D5}]/u', 0.15 ),
        //Amharic - assuming this is the best choice, there are multiple dialects
        // this appears to be the largest
        'am' => array( '/[\x{1200}-\x{137F}]/u', 0.25 ),
        // Maldivian
        'dv' => array( '/[\x{0780}-\x{07BF}]/u', 0.25 ),
        // Myanmar
        'my' => array( '/[\x{1000}-\x{109F}]/u', 0.25 ),
    );
    foreach ( $script_langs as $lang_code => $rule ) {
        if ( es_langdetect_by_chars( $text, $rule[0], $rule[1] ) ) {
            return $lang_code;
        }
    }

    //see if we have enough characters to do language detection.
    // Short text is hard to detect so we shouldn't try to get it right
    if ( strlen( $text ) < 300 ) {
        //We use strlen to count number of bytes rather than number of UTF-8 chars.
        // this is a hack to (mostly) adjust for the fact that a Chinese/Japanese word takes fewer characters
        // but actually takes a similar number of bytes.
        // English is average 5 chars/word (== 5 bytes), Chinese is 1.5 chars/word (==3*1.5==4.5 bytes)
        // So this cutoff is about 60 words in both English and Chinese
        // this should probably be made smarter at some point
        return false;
    }

    //else run es lang detect
    $es_client = new \Elastica\Client();
    $es_req = new \Elastica\Request( '_langdetect', 'POST', $text, array(), $es_client->getConnection() );
    $es_resp = $es_req->send();
    if ( $es_resp->isOk() ) {
        $data = $es_resp->getData();
        // Only trust the top candidate, and only when it is present and
        // reported with a probability above 50%.
        if (
            isset( $data['languages'][0]['probability'], $data['languages'][0]['language'] )
            && $data['languages'][0]['probability'] > 0.5
        ) {
            $lang = $data['languages'][0]['language'];
        }
    }

    // Hand the result back to the caller (false when ES was unsure or failed).
    return $lang;
}
/**
 * Detect a language for text entirely based on a regex, presumably one that
 * matches the Unicode range(s) of a single script.
 *
 * @param string $text          Text to inspect; expected to be valid UTF-8.
 * @param string $unicode_regex PCRE pattern matching one character of the
 *                              target script (callers in this file pass
 *                              patterns carrying the `u` modifier).
 * @param float  $percentage    Ratio of matching characters to letters that
 *                              must be exceeded, e.g. 0.25.
 * @return bool True when the share of matching characters among all letters
 *              is greater than $percentage, false otherwise.
 */
function es_langdetect_by_chars( $text, $unicode_regex, $percentage ) {
    $script_char_cnt = preg_match_all( $unicode_regex, $text );
    if ( ! $script_char_cnt ) {
        // Zero matches, or false because the regex engine reported an error.
        return false;
    }

    // Count letters as code points (`u`), not as individual bytes, so the
    // ratio below compares like with like.
    $letter_cnt = preg_match_all( '/\p{L}/u', $text );
    if ( ! $letter_cnt ) {
        // The script regex matched non-letters only (or the count failed);
        // avoid dividing by zero.
        return false;
    }

    //if at least X% of letters are of this language symbols, assume that is the language
    // Choose X% below 50% because there are also spaces, dates, numbers and other extraneous symbols
    return ( $script_char_cnt / $letter_cnt ) > $percentage;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
This is a hacked-up version of our code that calls the ES langdetect plugin. I haven't actually run it; I just pasted it together as an example.