Suomenkielinen Porter stemmer
<?php
// Hook the Finnish stemmer callback onto the 'relevanssi_stemmer' filter.
add_filter( 'relevanssi_stemmer', 'suomi_stemmer' );

/**
 * Filter callback: runs the given string through a fresh FinnishStemmer.
 *
 * @param mixed $merkkijono Value to stem; handed unchanged to FinnishStemmer::process().
 * @return mixed The result of FinnishStemmer::process().
 */
function suomi_stemmer( $merkkijono ) {
    return ( new FinnishStemmer() )->process( $merkkijono );
}
/**
 * Suffix-stripping stemmer for Finnish words.
 *
 * A word is split into two trailing regions, R1 and R2, and six steps then
 * strip particles, possessives, case endings, comparative/superlative endings
 * and plural markers, followed by a final tidy-up. Suffixes are only removed
 * when they lie inside the region required by the step.
 *
 * Input is matched as-is: no lower-casing or other normalisation is performed.
 */
class FinnishStemmer {
    /** @var string[] Letters treated as vowels. */
    private static $vowels = array("a", "e", "i", "o", "u", "y", "å", "ä", "ö");

    /** @var string[] Doubled (long) vowels. */
    private static $long_vowels = array("aa", "ee", "ii", "oo", "uu", "yy", "åå", "ää", "öö");

    /** @var string[] Letters treated as consonants. */
    private static $consonants = array("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z");

    /** @var string The word being stemmed; shortened in place by the steps. */
    protected $word = '';

    /** @var string Region R1: the tail of $word in which most suffixes must lie. */
    protected $r1 = '';

    /** @var string Region R2: the tail of R1, computed the same way R1 is computed from the word. */
    protected $r2 = '';

    /** @var bool Whether step 3 removed an ending; selects the branch taken in step 5. */
    protected $removed_in_step_3 = false;

    /**
     * Locates the region that follows the first non-vowel after a vowel.
     *
     * @param string $in String to scan.
     * @return array Two elements: the character offset where the region starts
     *               and the region text ("" when no such region exists).
     */
    private function find_r($in) {
        $in = (string) $in;
        $length = mb_strlen($in);
        for ($i = 0; $i < $length; $i++) {
            if (!in_array(mb_substr($in, $i, 1), self::$vowels, true)) {
                continue;
            }
            // Past the end mb_substr() yields "", which is not a vowel, so a
            // word-final vowel also ends the search (with an empty region).
            if (!in_array(mb_substr($in, $i + 1, 1), self::$vowels, true)) {
                return array($i + 2, (string) mb_substr($in, $i + 2));
            }
        }
        return array($length, "");
    }

    /**
     * Tells whether $in ends with the non-empty string $suffix.
     *
     * @param string $in     Text to inspect.
     * @param string $suffix Ending to look for.
     * @return bool
     */
    private function ends_with($in, $suffix) {
        $in = (string) $in;
        $length = mb_strlen($suffix);
        if ($length === 0 || mb_strlen($in) < $length) {
            return false;
        }
        return mb_substr($in, -$length) === $suffix;
    }

    /**
     * Tells whether the part of the current word in front of $suffix ends with
     * any of the given strings.
     *
     * @param string   $suffix     Suffix assumed to terminate the current word.
     * @param string[] $candidates Contexts to look for directly before the suffix.
     * @return bool
     */
    private function preceded_by_any($suffix, array $candidates) {
        $stem_length = max(0, mb_strlen($this->word) - mb_strlen($suffix));
        $stem = (string) mb_substr($this->word, 0, $stem_length);
        foreach ($candidates as $candidate) {
            if ($this->ends_with($stem, $candidate)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether $suffix terminates the region $in, optionally constrained
     * by what precedes it in the word.
     *
     * The context is read from the whole word, so it does not itself have to
     * lie inside the region.
     *
     * @param string       $suffix             Suffix to test.
     * @param string       $in                 Region (R1 or R2) that must end with the suffix.
     * @param array|string $preceeded          When an array: the suffix must be directly
     *                                         preceded by one of these strings.
     * @param array|string $negative_preceeded When an array (and $preceeded is not): the suffix
     *                                         must not be directly preceded by any of these.
     * @return bool
     */
    private function is_suffix($suffix, $in, $preceeded = "", $negative_preceeded = "") {
        if (!$this->ends_with($in, $suffix)) {
            return false;
        }
        if (is_array($preceeded)) {
            return $this->preceded_by_any($suffix, $preceeded);
        }
        if (is_array($negative_preceeded)) {
            // Every excluded context has to be absent, not merely one of them.
            return !$this->preceded_by_any($suffix, $negative_preceeded);
        }
        return true;
    }

    /**
     * Tests whether $suffix terminates $in and is directly preceded, in the
     * word, by a consonant followed by a vowel.
     *
     * @param string $suffix Suffix to test.
     * @param string $in     Region that must end with the suffix.
     * @return bool
     */
    private function is_suffix_cv($suffix, $in) {
        if (!$this->ends_with($in, $suffix)) {
            return false;
        }
        $stem_length = mb_strlen($this->word) - mb_strlen($suffix);
        if ($stem_length < 2) {
            return false;
        }
        // Look at the two letters in front of the suffix, not at the suffix itself.
        $vowel = mb_substr($this->word, $stem_length - 1, 1);
        $consonant = mb_substr($this->word, $stem_length - 2, 1);
        return in_array($vowel, self::$vowels, true)
            && in_array($consonant, self::$consonants, true);
    }

    /**
     * Drops the last $count characters from the word and from both regions,
     * keeping the regions aligned with the end of the word.
     *
     * @param int $count Number of trailing characters to drop.
     * @return void
     */
    private function drop_last($count) {
        if ($count <= 0) {
            return;
        }
        $this->word = (string) mb_substr($this->word, 0, -$count);
        $this->r1 = (string) mb_substr($this->r1, 0, -$count);
        $this->r2 = (string) mb_substr($this->r2, 0, -$count);
    }

    /**
     * Removes as many trailing characters as $suffix has from the word, R1 and R2.
     *
     * @param string $suffix Suffix whose length determines how much is removed.
     * @return void
     */
    private function remove_suffix($suffix) {
        $this->drop_last(mb_strlen($suffix));
    }

    /**
     * Step 1: particles.
     *
     * Removes kin/kaan/kään/han/hän/ko/kö/pa/pä from R1 when preceded by a
     * vowel, n or t; otherwise removes sti when it lies in R2.
     *
     * @return bool True when a suffix was removed.
     */
    public function step_1() {
        $particle_end = array_merge(self::$vowels, array('n', 't'));
        $particles = array("kin", "kaan", "kään", "han", "hän", "ko", "kö", "pa", "pä");
        foreach ($particles as $suffix) {
            if ($this->is_suffix($suffix, $this->r1, $particle_end)) {
                $this->remove_suffix($suffix);
                return true;
            }
        }
        if ($this->is_suffix("sti", $this->r2)) {
            $this->remove_suffix("sti");
            return true;
        }
        return false;
    }

    /**
     * Step 2: possessives.
     *
     * Removes si from R1 unless preceded by k, or one of ni/nsa/nsä/mme/nne
     * from R1. No other possessive endings are handled here.
     *
     * @return bool True when a suffix was removed.
     */
    public function step_2() {
        if ($this->is_suffix("si", $this->r1, false, array('k'))) {
            $this->remove_suffix("si");
            return true;
        }
        foreach (array("ni", "nsa", "nsä", "mme", "nne") as $suffix) {
            if ($this->is_suffix($suffix, $this->r1)) {
                $this->remove_suffix($suffix);
                return true;
            }
        }
        return false;
    }

    /**
     * Step 3: case endings, all restricted to R1.
     *
     * Rules are tried in table order and the first one that applies wins, so
     * longer endings are listed before the shorter endings they contain.
     *
     * @return bool True when an ending was removed.
     */
    private function step_3() {
        $vowel_i = array('ai', 'ei', 'ii', 'oi', 'ui', 'yi', 'äi', 'öi');
        // Each rule: suffix, then the required preceding context ("" = none).
        $rules = array(
            array("han", array('a')),
            array("hen", array('e')),
            array("hin", array('i')),
            array("hon", array('o')),
            array("hyn", array('y')),
            array("hän", array('ä')),
            array("hön", array('ö')),
            array("siin", $vowel_i),
            array("den", $vowel_i),
            array("tten", $vowel_i),
            array("seen", self::$long_vowels),
            array("tta", array('e')),
            array("ttä", array('e')),
            array("sta", ""),
            array("ssa", ""),
            array("stä", ""),
            array("ssä", ""),
            array("lta", ""),
            array("ltä", ""),
            array("lla", ""),
            array("llä", ""),
            array("lle", ""),
            array("ksi", ""),
            array("ine", ""),
            array("ta", ""),
            array("tä", ""),
            array("na", ""),
            array("nä", ""),
        );
        foreach ($rules as $rule) {
            if ($this->is_suffix($rule[0], $this->r1, $rule[1])) {
                $this->remove_suffix($rule[0]);
                return true;
            }
        }

        if ($this->is_suffix("n", $this->r1)) {
            $this->remove_suffix("n");
            // After a long vowel or "ie" the last vowel goes as well.
            $shortened = array_merge(self::$long_vowels, array('ie'));
            if (in_array(mb_substr($this->word, -2), $shortened, true)) {
                $this->drop_last(1);
            }
            return true;
        }

        foreach (array("a", "ä") as $suffix) {
            if ($this->is_suffix_cv($suffix, $this->r1)) {
                $this->remove_suffix($suffix);
                return true;
            }
        }
        return false;
    }

    /**
     * Step 4: comparative/superlative and other endings, restricted to R2.
     *
     * The impi/impa/... forms are tested before mpi/mpa/... because every
     * "imp"/"imm" ending also ends in the shorter form; testing the short form
     * first would strip only part of the ending.
     *
     * @return bool True when an ending was removed.
     */
    private function step_4() {
        $unconditional = array("impi", "impa", "impä", "immi", "imma", "immä", "eja", "ejä");
        foreach ($unconditional as $suffix) {
            if ($this->is_suffix($suffix, $this->r2)) {
                $this->remove_suffix($suffix);
                return true;
            }
        }
        foreach (array("mpi", "mpa", "mpä", "mmi", "mma", "mmä") as $suffix) {
            if ($this->is_suffix($suffix, $this->r2, false, array('po'))) {
                $this->remove_suffix($suffix);
                return true;
            }
        }
        return false;
    }

    /**
     * Step 5: plurals.
     *
     * If step 3 removed an ending, drops a final i or j lying in R1. Otherwise
     * drops a final t in R1 that follows a vowel and, once that t is gone, a
     * final imma in R2 and/or a final mma in R2 not preceded by po.
     *
     * @return bool True when something was removed.
     */
    private function step_5() {
        if ($this->removed_in_step_3) {
            foreach (array("i", "j") as $suffix) {
                if ($this->is_suffix($suffix, $this->r1)) {
                    $this->remove_suffix($suffix);
                    return true;
                }
            }
            return false;
        }

        if (!$this->is_suffix("t", $this->r1, self::$vowels)) {
            return false;
        }
        $this->remove_suffix("t");
        // Read R2 from the object each time: it has just been shortened.
        if ($this->is_suffix("imma", $this->r2)) {
            $this->remove_suffix("imma");
        }
        if ($this->is_suffix("mma", $this->r2, false, array('po'))) {
            $this->remove_suffix("mma");
        }
        return true;
    }

    /**
     * Step 6: tidying up.
     *
     * In turn, judged on the end of R1: shortens a final long vowel; drops a
     * final a/e/i/ä that follows a consonant; drops the j of a final oj/uj;
     * drops the o of a final jo. Finally, on the whole word, if the last
     * consonant is doubled one copy is removed.
     *
     * @return void
     */
    private function step_6() {
        if (in_array(mb_substr($this->r1, -2), self::$long_vowels, true)) {
            $this->drop_last(1);
        }

        if (mb_strlen($this->r1) >= 2
            && in_array(mb_substr($this->r1, -1), array('a', 'e', 'i', 'ä'), true)
            && in_array(mb_substr($this->r1, -2, 1), self::$consonants, true)
        ) {
            $this->drop_last(1);
        }

        if (in_array(mb_substr($this->r1, -2), array('oj', 'uj'), true)) {
            $this->drop_last(1);
        }

        if (mb_substr($this->r1, -2) === 'jo') {
            $this->drop_last(1);
        }

        // Walk back over trailing non-consonants to the last consonant and
        // collapse it when it is doubled. The regions are not needed afterwards,
        // so only the word is updated.
        for ($i = mb_strlen($this->word) - 1; $i > 0; $i--) {
            $letter = mb_substr($this->word, $i, 1);
            if (!in_array($letter, self::$consonants, true)) {
                continue;
            }
            if (mb_substr($this->word, $i - 1, 1) === $letter) {
                $this->word = mb_substr($this->word, 0, $i - 1) . mb_substr($this->word, $i);
            }
            break;
        }
    }

    /**
     * Computes R1 from the word and R2 from R1.
     *
     * Both regions are always assigned, so nothing is carried over from a
     * previously processed word.
     *
     * @return void
     */
    private function r1r2() {
        $region = $this->find_r($this->word);
        $this->r1 = $region[1];
        $this->r2 = "";
        if ($this->r1 !== "") {
            $region = $this->find_r($this->r1);
            $this->r2 = $region[1];
        }
    }

    /**
     * Stems a single word.
     *
     * @param string $word Word to stem.
     * @return string The stemmed word.
     */
    public function process($word) {
        $this->word = (string) $word;
        $this->removed_in_step_3 = false;
        $this->r1r2();
        $this->step_1();
        $this->step_2();
        $this->removed_in_step_3 = $this->step_3();
        $this->step_4();
        $this->step_5();
        $this->step_6();
        return $this->word;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment