Skip to content

Instantly share code, notes, and snippets.

@siahr
Created February 18, 2012 03:25
Show Gist options
  • Save siahr/1857194 to your computer and use it in GitHub Desktop.
Save siahr/1857194 to your computer and use it in GitHub Desktop.
English Stemmer
<?php
/**
* English stemming algorithm, based on the publication from
* Porter (1980), "An algorithm for suffix stripping".
*
* This PHP code is based on the product of BaseX Team.
* https://github.com/BaseXdb/basex/blob/master/src/main/java/org/basex/util/ft/EnglishStemmer.java
*
* @package Sugina
* @author BaseX Team 2005-12, BSD License
* @author Christian Gruen
* @author Toshio HIRAI (Porting to PHP)
* @license http://opensource.org/licenses/BSD-3-Clause The BSD License
*/
class EnglishStemmer {
    /** @var array Token being stemmed, as an array of integer character codes. */
    private $tok = array();
    /** @var int Current logical length of the token; entries at or past it are discarded. */
    private $te;
    /** @var int Length of the stem left of the suffix matched by the last successful endsWith(). */
    private $tt;

    /** Multi-character suffix tokens that the individual steps test directly. */
    private $AT, $BL, $ED, $EED, $IES, $ING, $ION, $IZ, $LL, $SION, $SSES, $TION;
    /** Single character codes for 's', 'y', 'e' and 'l'. */
    private $S, $Y, $E, $L;
    /** @var array Step 2 rules, each array(suffix token, replacement token). */
    private $ST2;
    /** @var array Step 3 rules, each array(suffix token, replacement token). */
    private $ST3;
    /** @var array Step 4 suffix tokens (removed without replacement). */
    private $ST4;

    /**
     * Constructor. Builds the suffix tables once per instance.
     *
     * The rule loops stop at the first suffix that matches, so whenever one
     * suffix is itself the tail of another (e.g. "ation" / "ization",
     * "ent" / "ment" / "ement") the longer one must be listed first.
     */
    public function __construct() {
        /* Stemming characters. */
        $this->AT = self::token("at");
        $this->BL = self::token("bl");
        $this->ED = self::token("ed");
        $this->EED = self::token("eed");
        $this->IES = self::token("ies");
        $this->ING = self::token("ing");
        $this->ION = self::token("ion");
        $this->IZ = self::token("iz");
        $this->LL = self::token("ll");
        $this->SION = self::token("sion");
        $this->SSES = self::token("sses");
        $this->TION = self::token("tion");
        $this->S = 115;
        $this->Y = 121;
        $this->E = 101;
        $this->L = 108;

        /* Step 2: "ization" precedes "ation", "ational" precedes "tional". */
        $this->ST2 = array(
            self::tokens(array("abli", "able")),
            self::tokens(array("alism", "al")),
            self::tokens(array("aliti", "al")),
            self::tokens(array("alli", "al")),
            self::tokens(array("anci", "ance")),
            self::tokens(array("ization", "ize")),
            self::tokens(array("ation", "ate")),
            self::tokens(array("ational", "ate")),
            self::tokens(array("ator", "ate")),
            self::tokens(array("biliti", "ble")),
            self::tokens(array("eli", "e")),
            self::tokens(array("enci", "ence")),
            self::tokens(array("entli", "ent")),
            self::tokens(array("fulness", "ful")),
            self::tokens(array("iveness", "ive")),
            self::tokens(array("iviti", "ive")),
            self::tokens(array("izer", "ize")),
            self::tokens(array("ousli", "ous")),
            self::tokens(array("ousness", "ous")),
            self::tokens(array("tional", "tion")),
        );

        /* Step 3. */
        $this->ST3 = array(
            self::tokens(array("alize", "al")),
            self::tokens(array("ative", "")),
            self::tokens(array("ful", "")),
            self::tokens(array("ical", "ic")),
            self::tokens(array("icate", "ic")),
            self::tokens(array("iciti", "ic")),
            self::tokens(array("ness", "")),
        );

        /*
         * Step 4: "ement" precedes "ment" precedes "ent". The "(s|t)ion"
         * case is handled separately in step4() and therefore not listed.
         */
        $this->ST4 = self::tokens(
            array(
                "able", "al", "ance", "ant", "ate", "ement", "ment", "ent", "ence", "er",
                "ible", "ic", "ism", "iti", "ive", "ize", "ou", "ous",
            )
        );
    }

    /**
     * Stems a word.
     *
     * Suffix comparisons are done on the lower-cased characters (via
     * TokenUtil::lc), but the characters that are kept are returned as given.
     *
     * @param array $str input word to stem, as a list of integer character
     *                   codes (e.g. the result of TokenUtil::toChars)
     * @return array the stem of the word, as a list of character codes
     */
    public function stem(array $str) {
        // Re-index so that positions 0..n-1 are guaranteed to exist.
        $this->tok = array_values($str);
        $this->te = count($this->tok);
        $this->tt = 0;
        $this->stemCurrent();
        return array_slice($this->tok, 0, $this->te);
    }

    /**
     * Runs all stemming steps on the current token, shrinking $te and
     * rewriting $tok in place. Words shorter than three characters are
     * left untouched.
     */
    private function stemCurrent() {
        if ($this->te < 3) {
            return;
        }
        $this->step1a();
        $this->step1b();
        $this->step1c();
        $this->replaceFirstMatch($this->ST2);
        $this->replaceFirstMatch($this->ST3);
        $this->step4();
        $this->step5();
    }

    /**
     * Step 1a: plural endings ("sses" -> "ss", "ies" -> "i", "s" -> "",
     * but a trailing "ss" is kept).
     */
    private function step1a() {
        if (!$this->endsWith($this->S)) {
            return;
        }
        if ($this->endsWith($this->SSES) || $this->endsWith($this->IES)) {
            $this->te -= 2;
        } elseif ($this->lowerAt($this->te - 2) != 115) {
            --$this->te;
        }
    }

    /**
     * Step 1b: "eed", "ed" and "ing" endings, including the clean-up that
     * follows the removal of "ed" / "ing".
     */
    private function step1b() {
        if ($this->endsWith($this->EED)) {
            if ($this->measure() > 0) {
                --$this->te;
            }
            return;
        }
        $removable = ($this->endsWith($this->ED) || $this->endsWith($this->ING)) && $this->hasVowel();
        if (!$removable) {
            return;
        }
        // Drop the "ed" / "ing" suffix.
        $this->te = $this->tt;

        if ($this->endsWith($this->AT) || $this->endsWith($this->BL) || $this->endsWith($this->IZ)) {
            // "at" -> "ate", "bl" -> "ble", "iz" -> "ize": append after the suffix, not instead of it.
            $this->tt = $this->te;
            $this->appendChar(101);
        } elseif ($this->te > 1) {
            // No endsWith() has succeeded since the suffix removal, so $tt == $te here
            // and measure() / appendChar() operate on the whole remaining stem.
            $last = $this->lowerAt($this->te - 1);
            // The repeated letter must be a consonant: "hopp" -> "hop", but "see" stays "see".
            $doubleConsonant = $last == $this->lowerAt($this->te - 2) && !$this->isVowelAt($this->te - 1);
            if ($doubleConsonant && $last != 108 && $last != 115 && $last != 122) {
                --$this->te;
            } elseif ($this->measure() == 1 && $this->endsWithCvc($this->te)) {
                $this->appendChar(101);
            }
        }
    }

    /**
     * Step 1c: a trailing "y" becomes "i" when the stem contains a vowel.
     */
    private function step1c() {
        if ($this->endsWith($this->Y) && $this->hasVowel()) {
            $this->appendChar(105);
        }
    }

    /**
     * Steps 2 and 3: applies the first rule whose suffix matches. The
     * replacement is only made when the remaining stem has measure > 0;
     * either way no further rule is tried once a suffix has matched.
     *
     * @param array $rules list of array(suffix token, replacement token)
     */
    private function replaceFirstMatch(array $rules) {
        foreach ($rules as $rule) {
            if ($this->endsWith($rule[0])) {
                if ($this->measure() > 0) {
                    $this->appendToken($rule[1]);
                }
                return;
            }
        }
    }

    /**
     * Step 4: removes "ion" after "s"/"t", or the first matching suffix of
     * the step 4 table, when the remaining stem has measure > 1.
     */
    private function step4() {
        $sOrTion = $this->endsWith($this->TION) || $this->endsWith($this->SION);
        if ($sOrTion && $this->endsWith($this->ION) && $this->measure() > 1) {
            $this->te -= 3;
            return;
        }
        foreach ($this->ST4 as $suffix) {
            if ($this->endsWith($suffix)) {
                if ($this->measure() > 1) {
                    $this->te = $this->tt;
                }
                return;
            }
        }
    }

    /**
     * Step 5: removes a trailing "e" and reduces a trailing "ll" to "l"
     * where the measure conditions allow it.
     */
    private function step5() {
        if ($this->endsWith($this->E)) {
            $m = $this->measure();
            if ($m > 1 || ($m == 1 && !$this->endsWithCvc($this->te - 1))) {
                --$this->te;
            }
        }
        if ($this->endsWith($this->LL) && $this->endsWith($this->L) && $this->measure() > 1) {
            --$this->te;
        }
    }

    /**
     * Checks for the consonant-vowel-consonant pattern in the three
     * characters before the given position, where the final consonant is
     * not 'w', 'x' or 'y'.
     *
     * @param integer $l position just past the pattern
     * @return boolean result of check
     */
    private function endsWithCvc($l) {
        if ($l < 3) {
            return false;
        }
        $c = $this->lowerAt($l - 1);
        return $c != 119 && $c != 120 && $c != 121 &&
            !$this->isVowelAt($l - 1) && $this->isVowelAt($l - 2) && !$this->isVowelAt($l - 3);
    }

    /**
     * Suffix test for the current token. On success, $tt is set to the
     * length of the stem left of the suffix; on failure $tt is unchanged.
     *
     * @param mixed $s suffix: a token (array of character codes) or a
     *                 single character code
     * @return boolean result of check
     */
    private function endsWith($s) {
        if (is_array($s)) {
            $suffixLength = count($s);
            $start = $this->te - $suffixLength;
            if ($start < 0) {
                return false;
            }
            for ($i = 0; $i < $suffixLength; ++$i) {
                if ($this->lowerAt($start + $i) != $s[$i]) {
                    return false;
                }
            }
            $this->tt = $start;
            return true;
        }
        $start = $this->te - 1;
        if ($start < 0 || $this->lowerAt($start) != $s) {
            return false;
        }
        $this->tt = $start;
        return true;
    }

    /**
     * Returns the word measure of the first $tt characters, i.e. the
     * number of vowel-to-consonant transitions.
     *
     * @return integer measure
     */
    private function measure() {
        $count = 0;
        $inVowel = false;
        for ($i = 0; $i < $this->tt; ++$i) {
            $isVowel = $this->isVowelAt($i);
            if ($inVowel && !$isVowel) {
                ++$count;
            }
            $inVowel = $isVowel;
        }
        return $count;
    }

    /**
     * Tests whether the first $tt characters contain a vowel.
     *
     * @return boolean result of check
     */
    private function hasVowel() {
        for ($i = 0; $i < $this->tt; ++$i) {
            if ($this->isVowelAt($i)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Vowel test: a, e, i, o, u, or a 'y' that is not the first character
     * and follows a non-vowel.
     *
     * @param integer $p position
     * @return boolean result of check
     */
    private function isVowelAt($p) {
        $c = $this->lowerAt($p);
        return $c == 97 || $c == 101 || $c == 105 || $c == 111 || $c == 117 ||
            ($c == 121 && $p != 0 && !$this->isVowelAt($p - 1));
    }

    /**
     * Returns the lower-cased character at the specified position, as
     * produced by TokenUtil::lc.
     *
     * @param integer $p position
     * @return integer character code
     */
    private function lowerAt($p) {
        return TokenUtil::lc($this->tok[$p]);
    }

    /**
     * Truncates the token to the last matched stem ($tt) and appends a
     * character.
     *
     * @param integer $c character code
     */
    private function appendChar($c) {
        $this->te = $this->tt;
        $this->tok[$this->te++] = $c;
    }

    /**
     * Truncates the token to the last matched stem ($tt) and appends a
     * token; an empty token simply removes the matched suffix.
     *
     * @param array $t token
     */
    private function appendToken($t) {
        $this->te = $this->tt;
        foreach ($t as $c) {
            $this->tok[$this->te++] = $c;
        }
    }

    /**
     * Converts a string to a chars array.
     * All strings should be converted by this function to guarantee
     * a consistent character conversion.
     *
     * @param string $str string to be converted
     * @return array chars array
     */
    private static function token($str) {
        return TokenUtil::toChars($str, "UTF-8");
    }

    /**
     * Converts the specified strings to tokens.
     *
     * @param array $strs strings
     * @return array tokens array
     */
    private static function tokens(array $strs) {
        $tokens = array();
        foreach ($strs as $str) {
            $tokens[] = self::token($str);
        }
        return $tokens;
    }
}
?>
@siahr
Copy link
Author

siahr commented Feb 18, 2012

Usage:

<?php
// Load the character helper and the stemmer itself.
require_once '../Sugina/TokenUtil.php';
require_once '../Sugina/EnglishStemmer.php';

// Convert the word to a chars array, stem it, and print it back as a string.
$chars = TokenUtil::toChars("tokens");
$stemmer = new EnglishStemmer();
$stemmed = $stemmer->stem($chars);
echo TokenUtil::toString($stemmed);
?>

This code returns "token".

TokenUtil.php can be found here:
https://gist.github.com/1857186

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment