Skip to content

Instantly share code, notes, and snippets.

@msaari

msaari/suomi_stemmer.php

Last active Jan 29, 2019
Embed
What would you like to do?
Suomenkielinen Porter stemmer
<?php
// Hook the Finnish stemmer into the 'relevanssi_stemmer' filter.
add_filter( 'relevanssi_stemmer', 'suomi_stemmer' );

/**
 * Filter callback: stems a single term with FinnishStemmer.
 *
 * A fresh stemmer instance is created for every call, so no state is
 * carried over from one term to the next.
 *
 * @param string $merkkijono The term to stem.
 * @return string The value returned by FinnishStemmer::process().
 */
function suomi_stemmer( $merkkijono ) {
    $finnish_stemmer = new FinnishStemmer();
    $stem = $finnish_stemmer->process( $merkkijono );

    return $stem;
}
/**
 * Suffix-stripping stemmer for Finnish words.
 *
 * The word is split into the regions R1 and R2 (see find_r()) and then run
 * through six steps that remove particles, possessive suffixes, case
 * endings and comparative/superlative endings, followed by a tidy-up step.
 * The structure appears to follow the Snowball (Porter) Finnish stemmer.
 *
 * All string handling uses the mb_* functions, so the input is expected in
 * the current mb internal encoding (UTF-8 by default). Input is not
 * lower-cased here; only lower-case letters are recognised.
 */
class FinnishStemmer {
    /** Letters treated as vowels. */
    private static $vowels = array("a", "e", "i", "o", "u", "y", "å", "ä", "ö");

    /** Doubled ("long") vowels. */
    private static $long_vowels = array("aa", "ee", "ii", "oo", "uu", "yy", "åå", "ää", "öö");

    /** Letters treated as consonants. */
    private static $consonants = array("b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z");

    /** Current (partially stemmed) word. */
    protected $word;

    /** Region R1 of the current word (always a suffix of $word). */
    protected $r1;

    /** Region R2 of the current word (the R1 region of R1). */
    protected $r2;

    /** Whether step 3 removed a case ending; selects the branch in step 5. */
    protected $removed_in_step_3;

    /**
     * Finds the region after the first non-vowel that follows a vowel.
     *
     * @param string $in String to scan.
     * @return array array(start index of the region, region text). When no
     *               vowel is present the region is empty and the index is
     *               the string length.
     */
    private function find_r($in) {
        $length = mb_strlen($in);
        for ($i = 0; $i < $length; $i++) {
            if (!in_array(mb_substr($in, $i, 1), self::$vowels, true)) {
                continue;
            }
            // A vowel at the very end is followed by "", which also counts
            // as a non-vowel and yields an empty region.
            $next = mb_substr($in, $i + 1, 1);
            if (!in_array($next, self::$vowels, true)) {
                return array($i + 2, mb_substr($in, $i + 2));
            }
        }
        return array($length, "");
    }

    /**
     * Returns $count letters of the word that sit directly before a suffix
     * of $suffix_length letters.
     */
    private function letters_before_suffix($suffix_length, $count) {
        return mb_substr($this->word, -($suffix_length + $count), $count);
    }

    /**
     * Tests whether $in ends with $suffix, optionally constrained by the
     * letters that precede the suffix in the whole word.
     *
     * @param string      $suffix             Suffix to look for.
     * @param string      $in                 Region (R1 or R2) that must end with the suffix.
     * @param array|mixed $preceeded          If an array: the suffix must be preceded by one of these strings.
     * @param array|mixed $negative_preceeded If an array (and $preceeded is not): the suffix must not be
     *                                        preceded by any of these strings.
     * @return bool
     */
    private function is_suffix($suffix, $in, $preceeded = "", $negative_preceeded = "") {
        $suffix_length = mb_strlen($suffix);
        if (mb_substr($in, -$suffix_length) !== $suffix) {
            return false;
        }
        if (is_array($preceeded)) {
            foreach ($preceeded as $candidate) {
                if ($this->letters_before_suffix($suffix_length, mb_strlen($candidate)) === $candidate) {
                    return true;
                }
            }
            return false;
        }
        if (is_array($negative_preceeded)) {
            // The suffix only qualifies when NONE of the forbidden strings
            // precedes it (previously a single mismatch was enough).
            foreach ($negative_preceeded as $candidate) {
                if ($this->letters_before_suffix($suffix_length, mb_strlen($candidate)) === $candidate) {
                    return false;
                }
            }
            return true;
        }
        return true;
    }

    /**
     * Tests whether $in ends with $suffix and the suffix is preceded in the
     * word by a consonant followed by a vowel (e.g. "kal|a|a" -> "la" + "a").
     *
     * @param string $suffix Suffix to look for.
     * @param string $in     Region that must end with the suffix.
     * @return bool
     */
    private function is_suffix_cv($suffix, $in) {
        $suffix_length = mb_strlen($suffix);
        if (mb_substr($in, -$suffix_length) !== $suffix) {
            return false;
        }
        // Inspect the two letters BEFORE the suffix, not the suffix itself.
        $vowel = $this->letters_before_suffix($suffix_length, 1);
        $consonant = mb_substr($this->word, -($suffix_length + 2), 1);
        return in_array($vowel, self::$vowels, true)
            && in_array($consonant, self::$consonants, true);
    }

    /**
     * Drops the last $count letters from the word and from both regions.
     * $count must be at least 1.
     */
    private function drop_last($count) {
        $this->word = mb_substr($this->word, 0, -$count);
        $this->r1 = mb_substr($this->r1, 0, -$count);
        $this->r2 = mb_substr($this->r2, 0, -$count);
    }

    /**
     * Removes as many trailing letters as $suffix is long. The caller is
     * responsible for having checked that the word ends with $suffix.
     */
    private function remove_suffix($suffix) {
        $this->drop_last(mb_strlen($suffix));
    }

    /**
     * Step 1: particles. Removes kin/kaan/kään/han/hän/ko/kö/pa/pä in R1
     * when preceded by a vowel, "n" or "t", or "sti" in R2.
     *
     * @return bool True when something was removed.
     */
    public function step_1() {
        $r1 = $this->r1;
        $preceeded = array_merge(self::$vowels, array('n', 't'));
        $particles = array("kin", "kaan", "kään", "han", "hän", "ko", "kö", "pa", "pä");
        foreach ($particles as $particle) {
            if ($this->is_suffix($particle, $r1, $preceeded)) {
                $this->remove_suffix($particle);
                return true;
            }
        }
        if ($this->is_suffix("sti", $this->r2)) {
            $this->remove_suffix("sti");
            return true;
        }
        return false;
    }

    /**
     * Step 2: possessive suffixes in R1. "si" is only removed when it is not
     * preceded by "k"; ni/nsa/nsä/mme/nne are removed unconditionally.
     *
     * @return bool True when something was removed.
     */
    public function step_2() {
        $r1 = $this->r1;
        if ($this->is_suffix("si", $r1, false, array('k'))) {
            $this->remove_suffix("si");
            return true;
        }
        foreach (array("ni", "nsa", "nsä", "mme", "nne") as $possessive) {
            if ($this->is_suffix($possessive, $r1)) {
                $this->remove_suffix($possessive);
                return true;
            }
        }
        return false;
    }

    /**
     * Step 3: case endings in R1. The tables are tried in order and the
     * first match wins.
     *
     * @return bool True when an ending was removed.
     */
    private function step_3() {
        $r1 = $this->r1;
        $vowel_i = array('ai', 'ei', 'ii', 'oi', 'ui', 'yi', 'äi', 'öi');

        // Endings that require specific preceding letters: array(suffix, allowed predecessors).
        $conditional = array(
            array("han", array('a')),
            array("hen", array('e')),
            array("hin", array('i')),
            array("hon", array('o')),
            array("hyn", array('y')),
            array("hän", array('ä')),
            array("hön", array('ö')),
            array("siin", $vowel_i),
            array("den", $vowel_i),
            array("tten", $vowel_i),
            array("seen", self::$long_vowels),
            array("tta", array('e')),
            array("ttä", array('e')),
        );
        foreach ($conditional as $rule) {
            if ($this->is_suffix($rule[0], $r1, $rule[1])) {
                $this->remove_suffix($rule[0]);
                return true;
            }
        }

        // Unconditional endings. "ssä" is the front-vowel twin of "ssa".
        $plain = array(
            "sta", "ssa", "ssä", "stä", "lta", "ltä", "lla", "llä", "lle",
            "ksi", "ine", "ta", "tä", "na", "nä",
        );
        foreach ($plain as $ending) {
            if ($this->is_suffix($ending, $r1)) {
                $this->remove_suffix($ending);
                return true;
            }
        }

        if ($this->is_suffix("n", $r1)) {
            $this->remove_suffix("n");
            // After a long vowel or "ie" one more letter goes as well
            // (e.g. "taloon" -> "taloo" -> "talo").
            $preceeding = array_merge(self::$long_vowels, array('ie'));
            if (in_array(mb_substr($this->word, -2), $preceeding, true)) {
                $this->drop_last(1);
            }
            return true;
        }

        // Partitive a/ä preceded by consonant + vowel.
        foreach (array("a", "ä") as $partitive) {
            if ($this->is_suffix_cv($partitive, $r1)) {
                $this->remove_suffix($partitive);
                return true;
            }
        }
        return false;
    }

    /**
     * Step 4: comparative/superlative and related endings in R2.
     *
     * The longer "i..." forms are tested first; otherwise the shorter forms
     * (which are suffixes of them) would always win and leave the "i".
     *
     * @return bool True when something was removed.
     */
    private function step_4() {
        $r2 = $this->r2;
        $unconditional = array("impi", "impa", "impä", "immi", "imma", "immä", "eja", "ejä");
        foreach ($unconditional as $ending) {
            if ($this->is_suffix($ending, $r2)) {
                $this->remove_suffix($ending);
                return true;
            }
        }
        // These are kept when preceded by "po".
        $negative_preceeded = array('po');
        foreach (array("mpi", "mpa", "mpä", "mmi", "mma", "mmä") as $ending) {
            if ($this->is_suffix($ending, $r2, false, $negative_preceeded)) {
                $this->remove_suffix($ending);
                return true;
            }
        }
        return false;
    }

    /**
     * Step 5: plurals. If step 3 removed an ending, strips a trailing "i" or
     * "j" in R1. Otherwise strips "t" in R1 when preceded by a vowel and
     * then "imma", or "mma" not preceded by "po", in R2.
     *
     * @return bool True when something was removed.
     */
    private function step_5() {
        if ($this->removed_in_step_3) {
            foreach (array("i", "j") as $plural) {
                if ($this->is_suffix($plural, $this->r1)) {
                    $this->remove_suffix($plural);
                    return true;
                }
            }
            return false;
        }

        if (!$this->is_suffix("t", $this->r1, self::$vowels)) {
            return false;
        }
        $this->remove_suffix("t");
        // Re-read $this->r2 here: it was shortened together with the word,
        // so a copy taken before removing "t" would never match.
        if ($this->is_suffix("imma", $this->r2)) {
            $this->remove_suffix("imma");
        } elseif ($this->is_suffix("mma", $this->r2, false, array('po'))) {
            $this->remove_suffix("mma");
        }
        return true;
    }

    /**
     * Step 6: tidying up. Within R1: shorten a trailing long vowel, drop a
     * trailing a/e/i/ä after a consonant, drop "j" from a trailing oj/uj and
     * "o" from a trailing "jo". Finally, on the whole word, reduce a double
     * consonant that is followed only by non-consonants to a single one.
     */
    private function step_6() {
        if (in_array(mb_substr($this->r1, -2), self::$long_vowels, true)) {
            $this->drop_last(1);
        }

        if (in_array(mb_substr($this->r1, -1), array('a', 'e', 'i', 'ä'), true)) {
            // The consonant has to be inside R1 too; for a one-letter R1
            // mb_substr() clamps to that letter, which is not a consonant.
            $second_to_last = mb_substr($this->r1, -2, 1);
            if (in_array($second_to_last, self::$consonants, true)) {
                $this->drop_last(1);
            }
        }

        if (in_array(mb_substr($this->r1, -2), array('oj', 'uj'), true)) {
            $this->drop_last(1);
        }

        if (mb_substr($this->r1, -2) === 'jo') {
            $this->drop_last(1);
        }

        // Walk back from the end to the last consonant; if it is doubled,
        // remove one copy. Index 0 has no predecessor, so stop at 1.
        for ($i = mb_strlen($this->word) - 1; $i > 0; $i--) {
            $letter = mb_substr($this->word, $i, 1);
            if (!in_array($letter, self::$consonants, true)) {
                continue;
            }
            if (mb_substr($this->word, $i - 1, 1) === $letter) {
                $this->word = mb_substr($this->word, 0, $i - 1) . mb_substr($this->word, $i);
            }
            break;
        }
    }

    /**
     * Computes R1 and R2 for the current word. R2 is always set to a string
     * (empty when R1 is empty) so later mb_* calls never receive null.
     */
    private function r1r2() {
        $this->r2 = "";
        $region = $this->find_r($this->word);
        $this->r1 = $region[1];
        if ($this->r1 !== "") {
            $region = $this->find_r($this->r1);
            $this->r2 = $region[1];
        }
    }

    /**
     * Stems a single word.
     *
     * @param string $word Word to stem.
     * @return string The stem. Empty or non-string input is returned unchanged.
     */
    public function process($word) {
        if (!is_string($word) || $word === "") {
            return $word;
        }
        $this->word = $word;
        $this->r1r2();
        $this->step_1();
        $this->step_2();
        $this->removed_in_step_3 = $this->step_3();
        $this->step_4();
        $this->step_5();
        $this->step_6();
        return $this->word;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.