Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
/**
 * Detect the language of a piece of text.
 *
 * The text is truncated to 5000 characters and cleaned: non-breaking spaces,
 * symbols, emoticons, URLs, email addresses and punctuation are replaced by
 * spaces. A handful of scripts are then recognised locally by their Unicode
 * ranges; any other text of at least 300 bytes is sent to the Elasticsearch
 * `_langdetect` endpoint.
 *
 * @param string $text Text to analyse; the regexes below assume UTF-8.
 * @return string|false 'km', 'hy', 'am', 'dv' or 'my' for locally detected
 *                      scripts, the top language reported by Elasticsearch
 *                      when its probability is above 0.5, or false when no
 *                      language could be determined.
 */
function es_api_detect_lang( $text ) {
	$lang = false;

	//if we can't tell the lang with 5000 characters we probably can't tell the language
	// (encoding is pinned to UTF-8 because every regex below assumes it)
	$text = mb_substr( (string) $text, 0, 5000, 'UTF-8' );

	//patterns are applied in order, each match being replaced by a single space
	$noise_patterns = array(
		//normalise non-breaking spaces to plain spaces
		'/[\x{00A0}\x{2007}\x{202F}]/u',
		//unicode symbols: see: http://www.utf8-chartable.de/unicode-utf8-table.pl
		'/[\x{2000}-\x{2BFF}]/u',
		//symbols, emoticons
		'/[\x{1f000}-\x{1ffff}]/u',
		//try and get rid of URLs that the lang detect api will strip out anyways
		// avoid getting ES exceptions for empty posts
		// this regex is simplistic, but should get 99% of cases
		'#(http|https)\:\/\/[a-z0-9\-_.]+\.[a-z]{2,}(/\S*)?#i',
		//and remove email addresses too
		'#([.0-9a-z_+-]+)@(([0-9a-z-_]+\.)+[0-9a-z]{2,})#i',
	);
	$text = preg_replace( $noise_patterns, ' ', $text );
	if ( null === $text ) {
		//preg_replace yields null on error (e.g. input that is not valid UTF-8)
		return false;
	}

	//the /u modifier is required: without it \p{L} is tested against single bytes
	// instead of UTF-8 characters
	if ( ! preg_match( '/\p{L}/u', $text ) ) {
		return false; //no utf-8 letters, we can't detect anything
	}

	//remove punctuation - leading punctuation seems to cause lang detect to fail
	// so just strip it all
	// (again /u is required, otherwise bytes inside multi-byte characters can be
	// mistaken for punctuation and the text gets corrupted)
	$text = preg_replace( '/\p{P}/u', ' ', $text );
	if ( null === $text ) {
		return false;
	}

	//do our own detection of some languages that the langdetect plugin doesn't handle
	// each entry: language code, regex for the script's characters, minimum ratio
	$script_langs = array(
		array( 'km', '/[\x{1780}-\x{17FF}]/u', 0.25 ),
		//Capital O with a tilde occurs a lot in armenian text, so include that in the charset
		array( 'hy', '/[\x{0530}-\x{058F}\x{00D5}]/u', 0.15 ),
		//Amharic - assuming this is the best choice, there are multiple dialects
		// this appears to be the largest
		array( 'am', '/[\x{1200}-\x{137F}]/u', 0.25 ),
		array( 'dv', '/[\x{0780}-\x{07BF}]/u', 0.25 ),
		array( 'my', '/[\x{1000}-\x{109F}]/u', 0.25 ),
	);
	foreach ( $script_langs as $script ) {
		list( $code, $regex, $threshold ) = $script;
		if ( es_langdetect_by_chars( $text, $regex, $threshold ) ) {
			return $code;
		}
	}

	//see if we have enough characters to do language detection.
	// Short text is hard to detect so we shouldn't try to get it right
	if ( strlen( $text ) < 300 ) {
		//We use strlen to count number of bytes rather than number of UTF-8 chars.
		// this is a hack to (mostly) adjust for the fact that a Chinese/Japanese word takes fewer characters
		// but actually takes a similar number of bytes.
		// English is average 5 chars/word (== 5 bytes), Chinese is 1.5 chars/word (==3*1.5==4.5 bytes)
		// So this cutoff is about 60 words in both English and Chinese
		// this should probably be made smarter at some point
		return false;
	}

	//else run es lang detect
	$es_client = new \Elastica\Client();
	$es_req    = new \Elastica\Request( '_langdetect', 'POST', $text, array(), $es_client->getConnection() );
	$es_resp   = $es_req->send();
	if ( $es_resp->isOk() ) {
		$data = $es_resp->getData();
		//only trust the top candidate, and only when the response has the expected shape
		if ( isset( $data['languages'][0]['probability'], $data['languages'][0]['language'] )
			&& $data['languages'][0]['probability'] > 0.5
		) {
			$lang = $data['languages'][0]['language'];
		}
	}

	//false unless Elasticsearch produced a confident answer
	return $lang;
}
/**
 * Detect language for text entirely based on a regex
 * (presumably a regex that matches unicode ranges).
 *
 * Counts the characters matched by $unicode_regex and compares that count with
 * the number of Unicode letters in the text.
 *
 * @param string $text          Text to inspect; must be valid UTF-8 for the letter count.
 * @param string $unicode_regex Regex matching a single character of the target script.
 * @param float  $percentage    Ratio (0..1) of script characters to letters that must be exceeded.
 * @return bool True when the ratio exceeds $percentage, false otherwise
 *              (including when either count is zero or a regex fails).
 */
function es_langdetect_by_chars( $text, $unicode_regex, $percentage ) {
	$script_char_cnt = preg_match_all( $unicode_regex, $text );
	if ( ! $script_char_cnt ) {
		//no characters of this script (or the regex failed)
		return false;
	}

	//the /u modifier makes \p{L} count UTF-8 letters rather than individual bytes
	$letter_cnt = preg_match_all( '/\p{L}/u', $text );
	if ( ! $letter_cnt ) {
		//no letters at all: nothing to compare against, and avoids dividing by zero
		return false;
	}

	//if at least X% of letters are of this language symbols, assume that is the language
	// Choose X% below 50% because there are also spaces, dates, numbers and other extraneous symbols
	return ( $script_char_cnt / $letter_cnt ) > $percentage;
}
@gibrown

This comment has been minimized.

Copy link
Owner Author

gibrown commented Jan 27, 2014

This is a hacked up version of our code calling the ES langdetect plugin. I haven't actually run it, just pasted it together as an example.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.