Skip to content

Instantly share code, notes, and snippets.

@ppKrauss
Last active August 29, 2015 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ppKrauss/11200364 to your computer and use it in GitHub Desktop.
Save ppKrauss/11200364 to your computer and use it in GitHub Desktop.
JavaScript language detector from http://www.corpuswiki.org, translated to PHP and adapted to add a "restrict universe" option and an object-oriented interface.
<?php
/**
 * Detect the language of a text by trigram sample analysis.
 * Adapted by ppkrauss from http://www.corpuswiki.org/langdetect.html
 *
 * Usage: construct with the path of a JSON model file, call detect(), then
 * read the public result properties ($lang, $lang_alpha2, $score, ...).
 */
class LangDetect {
    /** Distance charged for a sample trigram that a language model does not know. */
    const UNKNOWN_TRIGRAM_PENALTY = 300;

    /** @var array Decoded models; each entry is read as array('iso' => code, 'trigrams' => array(trigram => number)). */
    private $models;
    /** @var array|false Default list of language codes to restrict detection to, or false for "all models". */
    private $universeConfig;
    /** @var array Language codes actually scored by the last detection, index-aligned with the score list. */
    private $universeOnDetection;
    /** @var array Map from 3-letter to 2-letter language codes. */
    private $isoTr3to2;

    /** @var string|null Code of the best-scoring language of the last detect() call; null when none scored above 0. */
    public $lang;
    /** @var string|null 2-letter equivalent of $lang, or null when $lang is null or has no entry in the map. */
    public $lang_alpha2;
    /** @var int|float Score of the winning language (0 when nothing was detected). */
    public $score;
    /** @var int|float|null Best score held just before the winner replaced it during the scan (0 if the winner was the first positive score). */
    public $score_before;
    /** @var float|null Rounded percentage by which $score exceeds $score_before, relative to $score. */
    public $score_befDiffPerc;
    /** @var int Number of language models scored by the last detection. */
    public $num_langs;

    /**
     * Load the trigram models and remember the default language universe.
     *
     * @param string      $jsonFile Path of the JSON file holding the language models.
     * @param array|false $universe Non-empty array of language codes to restrict detection to;
     *                              anything else means "use every model".
     * @throws RuntimeException When the file cannot be read or does not decode to an array.
     */
    public function __construct($jsonFile='js/langtrigrams.json', $universe=false) {
        $json = file_get_contents($jsonFile);
        if ($json === false) {
            throw new RuntimeException("LangDetect: cannot read model file '{$jsonFile}'");
        }
        $models = json_decode($json, true);
        if (!is_array($models)) {
            throw new RuntimeException("LangDetect: model file '{$jsonFile}' does not contain a JSON array");
        }
        $this->models = $models;
        $this->universeConfig = ( is_array($universe) && count($universe) )? $universe: false;
        $this->isoTr3to2 = array( // translate ISO 639 language codes (from alpha-3 to alpha-2)
            'eng'=>'en', 'por'=>'pt', 'spa'=>'es', 'ita'=>'it', 'fra'=>'fr', 'afr'=>'af',
        );
    }

    /**
     * Detect the language of $content and fill the public result properties.
     *
     * @param string            $content  Text to analyse.
     * @param array|false|string $universe Language codes to restrict this call to, e.g.
     *                                    array('eng','por','spa','ita','fra','afr'). The default ''
     *                                    means "use the universe given to the constructor".
     * @return string|null Code of the best-scoring language, or null when no language scored above 0
     *                     (for instance when $content yields no trigram).
     */
    public function detect( $content, $universe='' ) {
        if ($universe === '') {
            $universe = $this->universeConfig;
        }

        // Start from a clean slate so a failed detection never reports the previous call's result.
        $this->lang = null;
        $this->lang_alpha2 = null;
        $this->score_before = null;
        $this->score_befDiffPerc = null;
        $this->score = 0;

        $listScore = $this->scoreLangs($content, $universe); // also refreshes $universeOnDetection
        foreach ($listScore as $i => $candidate) {
            if ($this->score < $candidate) {
                $this->lang = $this->universeOnDetection[$i];
                $this->lang_alpha2 = isset($this->isoTr3to2[$this->lang]) ? $this->isoTr3to2[$this->lang] : null;
                $this->score_before = $this->score;
                $this->score = $candidate;
                // $this->score is > 0 here, so the division is safe.
                $this->score_befDiffPerc = round( 100.0*($this->score - $this->score_before)/$this->score );
            }
        }
        return $this->lang;
    }

    /**
     * Score $content against every model allowed by $universe.
     *
     * Side effects: rebuilds $universeOnDetection and $num_langs.
     *
     * @param string      $content  Text to analyse.
     * @param array|false $universe Non-empty array of codes to restrict to; anything else means all models.
     * @return array List of scores, index-aligned with $universeOnDetection.
     */
    private function scoreLangs($content,$universe='') {
        $scores = array();
        $this->universeOnDetection = array();
        $this->num_langs = 0;
        // Same normalisation as the constructor: only a non-empty array restricts the universe.
        $restrictTo = ( is_array($universe) && count($universe) )? $universe: false;
        $contentModel = $this->createOrderedModel($content);
        foreach ($this->models as $model) {
            // Skip entries that do not have the shape this class reads.
            if (!is_array($model) || !isset($model['iso'], $model['trigrams']) || !is_array($model['trigrams'])) {
                continue;
            }
            if ($restrictTo && !in_array($model['iso'], $restrictTo, true)) {
                continue;
            }
            $scores[] = $this->compareLang($contentModel, $model['trigrams']);
            $this->universeOnDetection[] = $model['iso'];
            $this->num_langs++;
        }
        return $scores;
    }

    /**
     * Build the list of trigrams found in $content, sorted by descending frequency.
     *
     * @param string $content Text to analyse (expected to be UTF-8).
     * @return array List of array(trigram, count) pairs; empty when the text has fewer than
     *               three characters after filtering or is not valid UTF-8.
     */
    private function createOrderedModel($content) {
        // Keep only letters and apostrophes; every other run of characters becomes one space.
        $content = preg_replace('/[^\p{L}\']+/us', ' ', (string) $content);
        if ($content === null) {
            return array(); // PCRE failure, typically invalid UTF-8 input
        }
        // Lowercase per character, not per byte, so non-ASCII capitals are folded too.
        $content = function_exists('mb_strtolower') ? mb_strtolower($content, 'UTF-8') : strtolower($content);
        $chars = preg_split('//us', $content, -1, PREG_SPLIT_NO_EMPTY); // split into UTF-8 characters
        if (!is_array($chars)) {
            return array();
        }

        $trigrams = array(); // trigram => occurrence count
        for ($i = 0, $l = count($chars) - 2; $i < $l; $i++) {
            $trigramKey = $chars[$i] . $chars[$i + 1] . $chars[$i + 2];
            $trigrams[$trigramKey] = isset($trigrams[$trigramKey]) ? $trigrams[$trigramKey] + 1 : 1;
        }

        // Convert the map to a list of pairs so it can be ordered by count.
        $sortedTrigrams = array();
        foreach ($trigrams as $k => $v) {
            $sortedTrigrams[] = array($k, $v);
        }
        // usort compares the pairs themselves (uksort would hand the comparator the integer keys).
        usort($sortedTrigrams, function($pairA, $pairB) {
            return $pairB[1] - $pairA[1]; // sort high-to-low
        });
        return $sortedTrigrams;
    }

    /**
     * Score a sample model against a known language model.
     *
     * Each sample trigram known to the model (with a non-zero value) adds the absolute difference
     * between the sample's count and the model's value; any other trigram adds
     * UNKNOWN_TRIGRAM_PENALTY. The total is normalised so that 1 means zero distance.
     *
     * NOTE(review): the sample side of the comparison is the trigram's occurrence count; whether
     * the model values are counts or ranks is not visible from this file - confirm against the
     * model JSON.
     *
     * @param array $model       List of array(trigram, count) pairs from createOrderedModel().
     * @param array $known_model Map trigram => number for one language.
     * @return int|float Score of at most 1; 0 when $model is empty.
     */
    private function compareLang($model, $known_model) {
        if (!is_array($model) || !count($model)) {
            return 0; // nothing to compare: no evidence for any language (and avoids dividing by zero)
        }
        $dist = 0;
        foreach ($model as $pair) {
            $trigram = $pair[0];
            $count = $pair[1];
            if (isset($known_model[$trigram]) && $known_model[$trigram]) {
                $dist += abs($count - $known_model[$trigram]);
            } else {
                $dist += self::UNKNOWN_TRIGRAM_PENALTY;
            }
        }
        return 1 - ( $dist / (self::UNKNOWN_TRIGRAM_PENALTY * count($model)) );
    }
} // class
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment