Created
December 9, 2015 21:30
-
-
Save petrovitch/2782c20bb2fbb2936c73 to your computer and use it in GitHub Desktop.
Porter Stemmer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * porter_stemmer.php
 *
 * In linguistic morphology and information retrieval, stemming is the process
 * of reducing inflected (or sometimes derived) words to their stem, base or
 * root form—generally a written word form.
 *
 * Porter Stemmer Steps:
 * - Step 1: Gets rid of plurals and -ed or -ing suffixes
 * - Step 2: Turns terminal y to i when there is another vowel in the stem
 * - Step 3: Maps double suffixes to single ones: -ization, -ational, etc.
 * - Step 4: Deals with suffixes, -ful, -ness etc.
 * - Step 5: Takes off -ant, -ence, etc.
 * - Step 6: Removes a final -e
 *
 * Porter Mishaps & Shortcomings:
 * - Severing vs. several => sever
 * - University vs. universe => univers
 * - Iron vs. ironic => iron
 * - Animal vs. animated => anim
 * - Stemmers are rudimentary
 * - No word sense disambiguation (“bats” vs “batting”)
 * - No POS disambiguation (“Batting” could be noun or verb, but “hitting” could only be verb)
 * - Cannot handle irregular conjugation/inflection (“to be”, etc.)
 *
 * @example
 * $input = "stemming";
 * $ps = new PorterStemmer($input);
 * $stems = $ps->get_stems(); // array with one stem per input word
 * echo $stems[0];
 */
/**
 * Porter stemmer for whitespace-separated words.
 *
 * The constructor splits the input on whitespace and stems every word; the
 * results are read back with get_stems(). All suffix tests and the
 * consonant/vowel patterns use lower-case ASCII letters only, so callers
 * should lower-case their text before passing it in.
 */
class PorterStemmer
{
    /** Regex fragment matching one consonant (a "y" counts when it follows a vowel or starts the string). */
    private $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

    /** Regex fragment matching one vowel (a "y" counts when it does not follow a vowel). */
    private $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';

    /** Stems produced by the last stems() run, in input order. */
    private $stems = array();

    /** Words extracted from the constructor input. */
    private $words = array();

    /** Word currently being reduced by the step methods. */
    private $stem;

    /**
     * Splits $input into words and stems each of them.
     *
     * Runs of whitespace (including leading/trailing whitespace) act as a
     * single separator, so no empty words are produced.
     *
     * @param string $input One or more whitespace-separated words
     */
    public function __construct($input)
    {
        $tokens = preg_split('/\s+/', (string) $input, -1, PREG_SPLIT_NO_EMPTY);
        $this->words = ($tokens === false) ? array() : $tokens;
        $this->stems();
    }

    /**
     * Debug helper: echoes a value followed by "<br>".
     *
     * Arrays are dumped with print_r() inside a <pre> element; anything else
     * is echoed as-is (no HTML escaping is applied).
     *
     * @param mixed $str Value to print
     */
    public function writeln($str)
    {
        if (is_array($str)) {
            echo "<pre>";
            print_r($str);
            echo "</pre>";
        } else {
            echo $str;
        }
        echo "<br>";
    }

    /**
     * Stems every word collected by the constructor and stores the results.
     *
     * The result list is rebuilt from scratch on every call, so calling this
     * method again does not duplicate entries.
     */
    public function stems()
    {
        $this->stems = array();

        foreach ($this->words as $word) {
            $this->stem = $word;

            // Porter's algorithm leaves one- and two-letter words untouched;
            // without this guard "is" would be reduced to "i".
            if (strlen($word) > 2) {
                $this->step1();
                $this->step2();
                $this->step3();
                $this->step4();
                $this->step5();
                $this->step6();
            }

            $this->stems[] = $this->stem;
        }
    }

    /**
     * Step 1: Gets rid of plurals and -ed or -ing suffixes
     */
    public function step1()
    {
        // Part a: plural endings. The first matching ending wins.
        if (substr($this->stem, -1) === 's') {
            $this->replaceFirst(array(
                'sses' => 'ss',
                'ies'  => 'i',
                'ss'   => 'ss',
                's'    => '',
            ));
        }

        // Part b: a word ending in -eed is handled by that rule alone
        // (whether or not the measure allowed the rewrite).
        if (substr($this->stem, -2, 1) === 'e' && $this->replace($this->stem, 'eed', 'ee', 0)) {
            return;
        }

        // -ing / -ed are only removed when the remaining stem contains a vowel.
        $hasVowel = '#' . $this->regex_vowel . '+#';
        $stripped =
            (preg_match($hasVowel, (string) substr($this->stem, 0, -3)) && $this->replace($this->stem, 'ing', ''))
            || (preg_match($hasVowel, (string) substr($this->stem, 0, -2)) && $this->replace($this->stem, 'ed', ''));

        if (!$stripped) {
            return;
        }

        // Tidy up the stem left behind by the removal.
        if ($this->replaceFirst(array('at' => 'ate', 'bl' => 'ble', 'iz' => 'ize'))) {
            return;
        }

        $keepDouble = in_array(substr($this->stem, -2), array('ll', 'ss', 'zz'), true);
        if ($this->doubleConsonant($this->stem) && !$keepDouble) {
            // Double consonant ending: drop one of the pair.
            $this->stem = substr($this->stem, 0, -1);
        } elseif ($this->m($this->stem) === 1 && $this->cvc($this->stem)) {
            // Short consonant-vowel-consonant stem: restore the final "e".
            $this->stem .= 'e';
        }
    }

    /**
     * Step 1c: Turns terminal y to i when there is another vowel in the stem
     */
    public function step2()
    {
        $hasVowel = '#' . $this->regex_vowel . '+#';

        if (substr($this->stem, -1) === 'y' && preg_match($hasVowel, (string) substr($this->stem, 0, -1))) {
            $this->replace($this->stem, 'y', 'i');
        }
    }

    /**
     * Step 2: Maps double suffixes to single ones
     */
    public function step3()
    {
        // Rules are grouped by the penultimate letter of the word; every
        // rewrite requires the remaining stem to have a measure above 0.
        $this->applyByPenultimate(array(
            'a' => array('ational' => 'ate', 'tional' => 'tion'),
            'c' => array('enci' => 'ence', 'anci' => 'ance'),
            'e' => array('izer' => 'ize'),
            'g' => array('logi' => 'log'),
            'l' => array('entli' => 'ent', 'ousli' => 'ous', 'alli' => 'al', 'bli' => 'ble', 'eli' => 'e'),
            'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
            's' => array('iveness' => 'ive', 'fulness' => 'ful', 'ousness' => 'ous', 'alism' => 'al'),
            't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive'),
        ), 0);
    }

    /**
     * Step 3: Deals with suffixes, -ful, -ness etc.
     */
    public function step4()
    {
        $this->applyByPenultimate(array(
            'a' => array('ical' => 'ic'),
            's' => array('ness' => ''),
            't' => array('icate' => 'ic', 'iciti' => 'ic'),
            'u' => array('ful' => ''),
            'v' => array('ative' => ''),
            'z' => array('alize' => 'al'),
        ), 0);
    }

    /**
     * Step 4: Takes off -ant, -ence, etc.
     */
    public function step5()
    {
        // Penultimate "o": -ion is only removed after "t" or "s"; any other
        // word in this group is tested for -ou instead.
        if (substr($this->stem, -2, 1) === 'o') {
            $tail = substr($this->stem, -4);
            $suffix = ($tail === 'tion' || $tail === 'sion') ? 'ion' : 'ou';
            $this->replace($this->stem, $suffix, '', 1);
            return;
        }

        // Every rewrite here requires the remaining stem to have a measure above 1.
        $this->applyByPenultimate(array(
            'a' => array('al' => ''),
            'c' => array('ance' => '', 'ence' => ''),
            'e' => array('er' => ''),
            'i' => array('ic' => ''),
            'l' => array('able' => '', 'ible' => ''),
            'n' => array('ant' => '', 'ement' => '', 'ment' => '', 'ent' => ''),
            's' => array('ism' => ''),
            't' => array('ate' => '', 'iti' => ''),
            'u' => array('ous' => ''),
            'v' => array('ive' => ''),
            'z' => array('ize' => ''),
        ), 1);
    }

    /**
     * Step 5: Removes a final -e
     */
    public function step6()
    {
        // Part a: drop a final "e" when the rest is long enough, or when it
        // has measure 1 and does not end consonant-vowel-consonant.
        if (substr($this->stem, -1) === 'e') {
            $base = (string) substr($this->stem, 0, -1);
            $measure = $this->m($base);

            if ($measure > 1 || ($measure === 1 && !$this->cvc($base))) {
                $this->stem = $base;
            }
        }

        // Part b: reduce a final "ll" to "l" on longer words.
        if ($this->m($this->stem) > 1
            && $this->doubleConsonant($this->stem)
            && substr($this->stem, -1) === 'l'
        ) {
            $this->stem = substr($this->stem, 0, -1);
        }
    }

    /**
     * Replaces the first string with the second, at the end of the string. If third
     * arg is given, then the preceding string must match that m count at least.
     *
     * @param  string   $str   String to check (modified in place)
     * @param  string   $check Ending to check for
     * @param  string   $repl  Replacement string
     * @param  int|null $m     Optional: the part before $check must have m() greater than this
     * @return bool            Whether the $check string was at the end
     *                         of the $str string. True does not necessarily mean
     *                         that it was replaced.
     */
    public function replace(&$str, $check, $repl, $m = null)
    {
        $len = 0 - strlen($check);

        if (substr($str, $len) !== $check) {
            return false;
        }

        $substr = (string) substr($str, 0, $len);
        if ($m === null || $this->m($substr) > $m) {
            $str = $substr . $repl;
        }

        return true;
    }

    /**
     * m() measures the number of consonant sequences in $str. if c is
     * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
     * presence,
     *
     * <c><v>       gives 0
     * <c>vc<v>     gives 1
     * <c>vcvc<v>   gives 2
     * <c>vcvcvc<v> gives 3
     *
     * @param  string $str String to measure
     * @return int         Number of vowel-run/consonant-run pairs
     */
    public function m($str)
    {
        $c = $this->regex_consonant;
        $v = $this->regex_vowel;

        // Strip the optional leading consonants and trailing vowels, then
        // count the remaining vowel-run + consonant-run pairs.
        $str = preg_replace('#^' . $c . '+#', '', (string) $str);
        $str = preg_replace('#' . $v . '+$#', '', (string) $str);

        return (int) preg_match_all('#(' . $v . '+' . $c . '+)#', (string) $str, $matches);
    }

    /**
     * Returns true/false as to whether the given string contains two
     * of the same consonant next to each other at the end of the string.
     *
     * @param  string $str String to inspect
     * @return bool
     */
    public function doubleConsonant($str)
    {
        if (!preg_match('#' . $this->regex_consonant . '{2}$#', (string) $str, $matches)) {
            return false;
        }

        // Square-bracket offsets: the curly-brace form was removed in PHP 8.
        return $matches[0][0] === $matches[0][1];
    }

    /**
     * Checks for ending CVC sequence where second C is not W, X or Y
     *
     * @param  string $str String to inspect
     * @return bool
     */
    public function cvc($str)
    {
        $c = $this->regex_consonant;
        $v = $this->regex_vowel;

        if (!preg_match('#(' . $c . $v . $c . ')$#', (string) $str, $matches)) {
            return false;
        }

        $sequence = $matches[1];

        return strlen($sequence) === 3
            && !in_array($sequence[2], array('w', 'x', 'y'), true);
    }

    /**
     * Returns the stems computed for the input words, in input order.
     *
     * @return array List of stem strings
     */
    public function get_stems()
    {
        return $this->stems;
    }

    /**
     * Tries each suffix => replacement rule on the current stem, in order,
     * and stops at the first suffix that is present (even if the measure
     * condition prevented the rewrite).
     *
     * @param  array    $rules Map of suffix => replacement
     * @param  int|null $m     Measure threshold forwarded to replace()
     * @return bool            Whether any suffix was present
     */
    private function replaceFirst(array $rules, $m = null)
    {
        foreach ($rules as $suffix => $replacement) {
            if ($this->replace($this->stem, $suffix, $replacement, $m)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Picks the rule group keyed by the stem's penultimate letter and applies it.
     *
     * @param array $table Map of penultimate letter => (suffix => replacement) rules
     * @param int   $m     Measure threshold forwarded to replace()
     */
    private function applyByPenultimate(array $table, $m)
    {
        $key = substr($this->stem, -2, 1);

        if (is_string($key) && isset($table[$key])) {
            $this->replaceFirst($table[$key], $m);
        }
    }
} // Class
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment