Created
October 23, 2013 16:38
-
-
Save gibrown/7122061 to your computer and use it in GitHub Desktop.
Text-cleaning code that prepares post content before sending it to the Elasticsearch language-detection API.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Prepare $text for the Elasticsearch language-detection API.
// Strips whitespace oddities, symbols, URLs, emails and punctuation, then
// bails out early (return false) when detection is hopeless, and
// short-circuits Khmer (return 'km'), whose symbols confuse lang-detect.
// NOTE(review): this is the middle of a larger function — $text is assumed
// to be a UTF-8 string defined by the enclosing scope.

//if we can't tell the lang with 5000 characters we probably can't tell the language
$text = mb_substr( $text, 0, 5000 );

//replace non-breaking spaces so they don't match the \p{L} char class
$text = preg_replace( '/[\x{00A0}\x{2007}\x{202F}]/u', ' ', $text );

//replace unicode symbols: see: http://www.utf8-chartable.de/unicode-utf8-table.pl
$text = preg_replace( '/[\x{2000}-\x{2BFF}]/u', ' ', $text ); //symbols
$text = preg_replace( '/[\x{1f000}-\x{1ffff}]/u', ' ', $text ); //symbols, emoticons

//try and get rid of URLs that the lang detect api will strip out anyways
// avoid getting ES exceptions for empty posts
// this regex is simplistic, but should get 99% of cases
$text = preg_replace( '#(http|https)\:\/\/[a-z0-9\-_.]+\.[a-z]{2,}(/\S*)?#i', ' ', $text );

//and remove email addresses too
$text = preg_replace( '#([.0-9a-z_+-]+)@(([0-9a-z-_]+\.)+[0-9a-z]{2,})#i', ' ', $text );

// FIX: added the /u modifier — without it PCRE matches \p{L} against single
// bytes, so multi-byte UTF-8 letters (CJK, Khmer, accented Latin, ...) are
// not recognized as letters and letter-only non-ASCII posts were rejected.
if ( ! preg_match( '/\p{L}/u', $text ) )
	return false; //no utf-8 letters, we can't detect anything

//remove punctuation - leading punctuation seems to cause lang detect to fail
// so just strip it all (FIX: /u added so \p{P} sees code points, not bytes)
$text = preg_replace( '/\p{P}/u', ' ', $text );

//do our own detection of Khmer, because those Unicode symbols cause lang-detect to fail
$khmer_char_cnt = preg_match_all( '/[\x{1780}-\x{17FF}]/u', $text );
if ( $khmer_char_cnt ) {
	// FIX: /u added for the same byte-vs-codepoint reason as above; without it
	// the ratio below compared Khmer code points against a count of bytes.
	$char_cnt = preg_match_all( '/\p{L}/u', $text );

	//if at least 50% of letters are Khmer language symbols, assume the language is Khmer (km)
	// guard $char_cnt to avoid a division-by-zero warning in pathological cases
	if ( $char_cnt && ( $khmer_char_cnt / $char_cnt > 0.5 ) )
		return 'km';
}

//see if we have enough characters to do language detection.
// Short text is hard to detect so we shouldn't try to get it right
if ( strlen( $text ) < 300 ) {
	//We use strlen to count number of bytes rather than number of UTF-8 chars.
	// this is a hack to (mostly) adjust for the fact that a Chinese/Japanese word takes fewer characters
	// but actually takes a similar number of bytes.
	// English is average 5 chars/word (== 5 bytes), Chinese is 1.5 chars/word (==3*1.5==4.5 bytes)
	// So this cutoff is about 60 words in both English and Chinese
	// this should probably be made smarter at some point
	return false;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment