siahr/EnglishStemmer.php

## EnglishStemmer.php
<?php
/**
* English stemming algorithm, based on the publication from
* Porter (1980), "An algorithm for suffix stripping".
*
* This PHP code is based on the product of BaseX Team.
* https://github.com/BaseXdb/basex/blob/master/src/main/java/org/basex/util/ft/EnglishStemmer.java
*
* @package Sugina
* @author BaseX Team 2005-12, BSD License
* @author Christian Gruen
* @author Toshio HIRAI (Porting to PHP)
* @license http://opensource.org/licenses/BSD-3-Clause The BSD License
*/

/**
 * Porter-style suffix stripper for English words.
 *
 * A word is handed to stem() as an array of characters; the private helpers
 * compare those characters (after TokenUtil::lc()) against integer codes such
 * as 115 ('s'), and write integer codes back into the buffer.
 *
 * NOTE(review): TokenUtil is not defined in this file. The code below assumes
 * TokenUtil::toChars() yields an array of integer character codes and
 * TokenUtil::lc() lower-cases a single code - confirm against TokenUtil.
 */
class EnglishStemmer {
	/** Token to be stemmed. */
	private $tok = array();
	/** Token length. */
	private $te;
	/** Stemming length (start offset of the suffix matched last by e()). */
	private $tt;

	/*
	 * Suffix tokens and lookup tables. They are filled in by the constructor
	 * and declared here explicitly, because creating undeclared (dynamic)
	 * properties is deprecated as of PHP 8.2.
	 */
	/** Multi-character suffix tokens. */
	private $AT, $BL, $ED, $EED, $IES, $ING, $ION, $IZ, $LL, $SION, $SSES, $TION;
	/** Single character codes: 's', 'y', 'e', 'l'. */
	private $S, $Y, $E, $L;
	/** Step 2 table: list of array(suffix token, replacement token). */
	private $ST2;
	/** Step 3 table: list of array(suffix token, replacement token). */
	private $ST3;
	/** Step 4 table: list of suffix tokens to be removed. */
	private $ST4;

	/**
	* Constructor. Builds the suffix tokens and the tables for steps 2 to 4.
	*/
	public function __construct() {
		/* Stemming character. */
		$this->AT = self::token("at");
		$this->BL = self::token("bl");
		$this->ED = self::token("ed");
		$this->EED = self::token("eed");
		$this->IES = self::token("ies");
		$this->ING = self::token("ing");
		$this->ION = self::token("ion");
		$this->IZ = self::token("iz");
		$this->LL = self::token("ll");
		$this->SION = self::token("sion");
		$this->SSES = self::token("sses");
		$this->TION = self::token("tion");

		$this->S = 115;
		$this->Y = 121;
		$this->E = 101;
		$this->L = 108;

		/*
		 * Step 2.
		 * s() stops at the FIRST entry whose suffix matches, so a suffix that
		 * ends with another listed suffix has to come before it:
		 * "ization" before "ation", "ational" before "tional".
		 */
		$this->ST2 = array(
			self::tokens(array("abli", "able")), self::tokens(array("alism", "al")), self::tokens(array("aliti", "al")),
			self::tokens(array("alli", "al")), self::tokens(array("anci", "ance")), self::tokens(array("ization", "ize")),
			self::tokens(array("ational", "ate")), self::tokens(array("ation", "ate")), self::tokens(array("ator", "ate")),
			self::tokens(array("biliti", "ble")), self::tokens(array("eli", "e")), self::tokens(array("enci", "ence")),
			self::tokens(array("entli", "ent")), self::tokens(array("fulness", "ful")),
			self::tokens(array("iveness", "ive")), self::tokens(array("iviti", "ive")),
			self::tokens(array("izer", "ize")), self::tokens(array("ousli", "ous")),
			self::tokens(array("ousness", "ous")), self::tokens(array("tional", "tion")),
		);

		/* Step 3. */
		$this->ST3 = array(
			self::tokens(array("alize", "al")), self::tokens(array("ative", "")),
			self::tokens(array("ful", "")), self::tokens(array("ical", "ic")), self::tokens(array("icate", "ic")),
			self::tokens(array("iciti", "ic")), self::tokens(array("ness", ""))
		);

		/*
		 * Step 4.
		 * First match wins here as well: "ement" and "ment" must precede "ent",
		 * otherwise "ment" can never be reached.
		 */
		$this->ST4 = self::tokens(
			array(
				"able", "al", "ance", "ant", "ate", "ement", "ence", "ment", "ent", "er", "ible",
				"ic", "ism", "iti", "ive", "ize", "ou", "ous", "sion", "tion"
			)
		);
	}

	/**
	 * Stems a word.
	 * @param array $str input word to stem
	 * @return array the stem of the word; the working buffer as a whole if its
	 *   length did not change, otherwise its first $te entries
	 */
	public function stem(array $str) {
		$this->te = count($str);
		$this->tok = $str;
		return !$this->s() ? $this->tok : array_slice($this->tok, 0, $this->te);
	}

	/**
	* Stems the current word.
	* Works on $tok in place and moves $te to the new end of the word.
	* @return boolean true if the resulting length differs from the buffer length
	*/
	private function s() {
		// words with fewer than three characters are left alone
		if($this->te < 3) return false;

		// step 1: plural endings
		if($this->e($this->S)) {
			// "sses" and "ies" lose their last two characters
			if($this->e($this->SSES) || $this->e($this->IES)) $this->te -= 2;
			// a single trailing 's' is dropped unless preceded by another 's' (115)
			else if($this->l($this->te - 2) != 115) --$this->te;
		}

		if($this->e($this->EED)) {
			// "eed" loses one character if the part before it has measure > 0
			if($this->m() > 0) --$this->te;
		} else if(($this->e($this->ED) || $this->e($this->ING)) && $this->v()) {
			// cut "ed"/"ing"; e() left the suffix start in $tt
			$this->te = $this->tt;

			if($this->e($this->AT) || $this->e($this->BL) || $this->e($this->IZ)) {
				// "at", "bl", "iz" get an 'e' (101) appended after the whole word
				$this->tt = $this->te;
				$this->ac(101);
			} else if($this->te > 1) {
				$c = $this->l($this->te - 1);
				// Undouble a double CONSONANT other than 'l' (108), 's' (115),
				// 'z' (122). The vowel test keeps "seeing" -> "see" instead of "se".
				if($c == $this->l($this->te - 2) && !$this->vt($this->te - 1)
					&& $c != 108 && $c != 115 && $c != 122) {
					--$this->te;
				} else if($this->m() == 1) {
					// here $tt == $te, so m() measures the whole remaining word
					if($this->c($this->te)) $this->ac(101);
				}
			}
		}
		// trailing 'y' becomes 'i' (105) if the part before it contains a vowel
		if($this->e($this->Y) && $this->v()) $this->ac(105);

		// step 2: first matching suffix is replaced if the stem has measure > 0
		foreach($this->ST2 as $s) {
			if($this->e($s[0])) {
				if($this->m() > 0) $this->at($s[1]);
				break;
			}
		}

		// step 3: same scheme as step 2, with its own table
		foreach($this->ST3 as $s) {
			if($this->e($s[0])) {
				if($this->m() > 0) $this->at($s[1]);
				break;
			}
		}

		// step 4
		if(($this->e($this->TION) || $this->e($this->SION)) && $this->e($this->ION) && $this->m() > 1) {
			// "tion"/"sion": only "ion" is removed
			$this->te -= 3;
		} else {
			// first matching suffix is removed if the stem has measure > 1
			foreach($this->ST4 as $s) {
				if($this->e($s)) {
					if($this->m() > 1) $this->te = $this->tt;
					break;
				}
			}
		}

		// step 5: trailing 'e'
		if($this->e($this->E)) {
			$m = $this->m();
			if($m > 1 || $m == 1 && !$this->c($this->te - 1)) --$this->te;
		}
		// step 5: "ll" loses one 'l' if the measure is > 1
		if($this->e($this->LL) && $this->e($this->L) && $this->m() > 1) --$this->te;

		return $this->te != count($this->tok);
	}

	/**
	* Checks for the cvc pattern.
	* True if the three characters before position $l are non-vowel, vowel,
	* non-vowel and the last of them is not 'w' (119), 'x' (120) or 'y' (121).
	* @param $l position (exclusive end of the three characters)
	* @return boolean result of check
	*/
	private function c($l) {
		if($l < 3) return false;
		$c = $this->l($l - 1);
		return $c != 119 && $c != 120 && $c != 121 &&
			!$this->vt($l - 1) && $this->vt($l - 2) && !$this->vt($l - 3);
	}

	/**
	 * Suffix test for a token.
	 * On success the start offset of the suffix is stored in $tt;
	 * on failure $tt is left untouched.
	 * @param mixed $s suffix: array of characters, or a single character
	 * @return boolean result of check
	 */
	private function e($s) {
		if(is_array($s)) {
			$sl = count($s);
			$l = $this->te - $sl;
			// suffix is longer than the current word
			if($l < 0) return false;
			for($i = 0; $i < $sl; ++$i) {
				if($this->l($l + $i) != $s[$i]) return false;
			}
			$this->tt = $l;
			return true;
		}
		// single character: compare with the last character of the word
		$l = $this->te - 1;
		if($l < 0 || $this->l($l) != $s) return false;
		$this->tt = $l;
		return true;
	}

	/**
	 * Returns word measure.
	 * Counts, within the first $tt characters, how often a vowel is directly
	 * followed by a non-vowel.
	 * @return integer measure
	 */
	private function m() {
		$c = 0;
		// true while the previously visited character was a vowel
		$v = false;
		for($i = 0; $i < $this->tt; ++$i) {
			$cur = $this->vt($i);
			if($v && !$cur) ++$c;
			$v = $cur;
		}
		return $c;
	}

	/**
	 * Vowel test.
	 * @return boolean true if one of the first $tt characters is a vowel
	 */
	private function v() {
		for($i = 0; $i < $this->tt; ++$i) {
			if($this->vt($i)) return true;
		}
		return false;
	}

	/**
	 * Vowel test.
	 * 'a', 'e', 'i', 'o', 'u' are vowels; 'y' (121) counts as a vowel if it is
	 * not the first character and the character before it is not a vowel.
	 * @param integer $p position
	 * @return boolean result of check
	 */
	private function vt($p) {
		$c = $this->l($p);
		return $c == 97 || $c == 101 || $c == 105 || $c == 111 || $c == 117 ||
			($c == 121 && $p != 0 && !$this->vt($p - 1));
	}

	/**
	 * Returns the lower character at the specified position.
	 * Delegates to TokenUtil::lc(), which is defined outside this file.
	 * @param integer $p position
	 * @return integer result of TokenUtil::lc() for the character at $p
	 */
	private function l($p) {
		return TokenUtil::lc($this->tok[$p]);
	}

	/**
	 * Adds a character.
	 * The word is cut at $tt first, then the character is written there.
	 * @param integer $c character
	 */
	private function ac($c) {
		$this->te = $this->tt;
		$this->tok[$this->te++] = $c;
	}

	/**
	 * Adds a token.
	 * The word is cut at $tt first, then the characters of the token are
	 * written from there on.
	 * @param array $t token
	 */
	private function at($t) {
		$this->te = $this->tt;
		foreach($t as $c) {
			$this->tok[$this->te++] = $c;
		}
	}

	/**
	* Converts a string to chars array.
	* All strings should be converted by this function to guarantee
	* a consistent character conversion.
	* @param string $str string to be converted
	* @return array chars array
	*/
	private static function token($str) {
		return TokenUtil::toChars($str, "UTF-8");
	}

	/**
	 * Converts the specified strings to tokens.
	 * @param array $strs strings
	 * @return array tokens array, in the order of $strs
	 */
	private static function tokens(array $strs) {
		$tokens = array();
		foreach($strs as $str) {
			$tokens[] = self::token($str);
		}
		return $tokens;
	}
}
?>
	<?php
	/**
	* English stemming algorithm, based on the publication from
	* Porter (1980), "An algorithm for suffix stripping".
	*
	* This PHP code is based on the product of BaseX Team.
	* https://github.com/BaseXdb/basex/blob/master/src/main/java/org/basex/util/ft/EnglishStemmer.java
	*
	* @package Sugina
	* @author BaseX Team 2005-12, BSD License
	* @author Christian Gruen
	* @author Toshio HIRAI (Porting to PHP)
	* @license http://opensource.org/licenses/BSD-3-Clause The BSD License
	*/

	// This file already declares EnglishStemmer further up. A second,
	// unconditional declaration of the same class is a fatal error, so this
	// copy is only declared when the class does not exist yet.
	if (!class_exists('EnglishStemmer', false)) {
		/**
		 * Porter-style suffix stripper for English words.
		 *
		 * A word is handed to stem() as an array of characters; the private
		 * helpers compare those characters (after TokenUtil::lc()) against
		 * integer codes such as 115 ('s'), and write integer codes back into
		 * the buffer.
		 *
		 * NOTE(review): TokenUtil is not defined in this file. The code below
		 * assumes TokenUtil::toChars() yields an array of integer character
		 * codes and TokenUtil::lc() lower-cases a single code - confirm
		 * against TokenUtil.
		 */
		class EnglishStemmer {
			/** Token to be stemmed. */
			private $tok = array();
			/** Token length. */
			private $te;
			/** Stemming length (start offset of the suffix matched last by e()). */
			private $tt;

			/*
			 * Suffix tokens and lookup tables. They are filled in by the
			 * constructor and declared here explicitly, because creating
			 * undeclared (dynamic) properties is deprecated as of PHP 8.2.
			 */
			/** Multi-character suffix tokens. */
			private $AT, $BL, $ED, $EED, $IES, $ING, $ION, $IZ, $LL, $SION, $SSES, $TION;
			/** Single character codes: 's', 'y', 'e', 'l'. */
			private $S, $Y, $E, $L;
			/** Step 2 table: list of array(suffix token, replacement token). */
			private $ST2;
			/** Step 3 table: list of array(suffix token, replacement token). */
			private $ST3;
			/** Step 4 table: list of suffix tokens to be removed. */
			private $ST4;

			/**
			* Constructor. Builds the suffix tokens and the tables for steps 2 to 4.
			*/
			public function __construct() {
				/* Stemming character. */
				$this->AT = self::token("at");
				$this->BL = self::token("bl");
				$this->ED = self::token("ed");
				$this->EED = self::token("eed");
				$this->IES = self::token("ies");
				$this->ING = self::token("ing");
				$this->ION = self::token("ion");
				$this->IZ = self::token("iz");
				$this->LL = self::token("ll");
				$this->SION = self::token("sion");
				$this->SSES = self::token("sses");
				$this->TION = self::token("tion");

				$this->S = 115;
				$this->Y = 121;
				$this->E = 101;
				$this->L = 108;

				/*
				 * Step 2.
				 * s() stops at the FIRST entry whose suffix matches, so a
				 * suffix that ends with another listed suffix has to come
				 * before it: "ization" before "ation", "ational" before
				 * "tional".
				 */
				$this->ST2 = array(
					self::tokens(array("abli", "able")), self::tokens(array("alism", "al")),
					self::tokens(array("aliti", "al")), self::tokens(array("alli", "al")),
					self::tokens(array("anci", "ance")), self::tokens(array("ization", "ize")),
					self::tokens(array("ational", "ate")), self::tokens(array("ation", "ate")),
					self::tokens(array("ator", "ate")), self::tokens(array("biliti", "ble")),
					self::tokens(array("eli", "e")), self::tokens(array("enci", "ence")),
					self::tokens(array("entli", "ent")), self::tokens(array("fulness", "ful")),
					self::tokens(array("iveness", "ive")), self::tokens(array("iviti", "ive")),
					self::tokens(array("izer", "ize")), self::tokens(array("ousli", "ous")),
					self::tokens(array("ousness", "ous")), self::tokens(array("tional", "tion")),
				);

				/* Step 3. */
				$this->ST3 = array(
					self::tokens(array("alize", "al")), self::tokens(array("ative", "")),
					self::tokens(array("ful", "")), self::tokens(array("ical", "ic")),
					self::tokens(array("icate", "ic")), self::tokens(array("iciti", "ic")),
					self::tokens(array("ness", ""))
				);

				/*
				 * Step 4.
				 * First match wins here as well: "ement" and "ment" must
				 * precede "ent", otherwise "ment" can never be reached.
				 */
				$this->ST4 = self::tokens(
					array(
						"able", "al", "ance", "ant", "ate", "ement", "ence", "ment", "ent", "er", "ible",
						"ic", "ism", "iti", "ive", "ize", "ou", "ous", "sion", "tion"
					)
				);
			}

			/**
			* Stems a word.
			* @param array $str input word to stem
			* @return array the stem of the word; the working buffer as a whole
			*   if its length did not change, otherwise its first $te entries
			*/
			public function stem(array $str) {
				$this->te = count($str);
				$this->tok = $str;
				return !$this->s() ? $this->tok : array_slice($this->tok, 0, $this->te);
			}

			/**
			* Stems the current word.
			* Works on $tok in place and moves $te to the new end of the word.
			* @return boolean true if the resulting length differs from the buffer length
			*/
			private function s() {
				// words with fewer than three characters are left alone
				if($this->te < 3) return false;

				// step 1: plural endings
				if($this->e($this->S)) {
					// "sses" and "ies" lose their last two characters
					if($this->e($this->SSES) || $this->e($this->IES)) $this->te -= 2;
					// a single trailing 's' is dropped unless preceded by another 's' (115)
					else if($this->l($this->te - 2) != 115) --$this->te;
				}

				if($this->e($this->EED)) {
					// "eed" loses one character if the part before it has measure > 0
					if($this->m() > 0) --$this->te;
				} else if(($this->e($this->ED) || $this->e($this->ING)) && $this->v()) {
					// cut "ed"/"ing"; e() left the suffix start in $tt
					$this->te = $this->tt;

					if($this->e($this->AT) || $this->e($this->BL) || $this->e($this->IZ)) {
						// "at", "bl", "iz" get an 'e' (101) appended after the whole word
						$this->tt = $this->te;
						$this->ac(101);
					} else if($this->te > 1) {
						$c = $this->l($this->te - 1);
						// Undouble a double CONSONANT other than 'l' (108),
						// 's' (115), 'z' (122). The vowel test keeps
						// "seeing" -> "see" instead of "se".
						if($c == $this->l($this->te - 2) && !$this->vt($this->te - 1)
							&& $c != 108 && $c != 115 && $c != 122) {
							--$this->te;
						} else if($this->m() == 1) {
							// here $tt == $te, so m() measures the whole remaining word
							if($this->c($this->te)) $this->ac(101);
						}
					}
				}
				// trailing 'y' becomes 'i' (105) if the part before it contains a vowel
				if($this->e($this->Y) && $this->v()) $this->ac(105);

				// step 2: first matching suffix is replaced if the stem has measure > 0
				foreach($this->ST2 as $s) {
					if($this->e($s[0])) {
						if($this->m() > 0) $this->at($s[1]);
						break;
					}
				}

				// step 3: same scheme as step 2, with its own table
				foreach($this->ST3 as $s) {
					if($this->e($s[0])) {
						if($this->m() > 0) $this->at($s[1]);
						break;
					}
				}

				// step 4
				if(($this->e($this->TION) || $this->e($this->SION)) && $this->e($this->ION) && $this->m() > 1) {
					// "tion"/"sion": only "ion" is removed
					$this->te -= 3;
				} else {
					// first matching suffix is removed if the stem has measure > 1
					foreach($this->ST4 as $s) {
						if($this->e($s)) {
							if($this->m() > 1) $this->te = $this->tt;
							break;
						}
					}
				}

				// step 5: trailing 'e'
				if($this->e($this->E)) {
					$m = $this->m();
					if($m > 1 || $m == 1 && !$this->c($this->te - 1)) --$this->te;
				}
				// step 5: "ll" loses one 'l' if the measure is > 1
				if($this->e($this->LL) && $this->e($this->L) && $this->m() > 1) --$this->te;

				return $this->te != count($this->tok);
			}

			/**
			* Checks for the cvc pattern.
			* True if the three characters before position $l are non-vowel,
			* vowel, non-vowel and the last of them is not 'w' (119), 'x' (120)
			* or 'y' (121).
			* @param $l position (exclusive end of the three characters)
			* @return boolean result of check
			*/
			private function c($l) {
				if($l < 3) return false;
				$c = $this->l($l - 1);
				return $c != 119 && $c != 120 && $c != 121 &&
					!$this->vt($l - 1) && $this->vt($l - 2) && !$this->vt($l - 3);
			}

			/**
			* Suffix test for a token.
			* On success the start offset of the suffix is stored in $tt;
			* on failure $tt is left untouched.
			* @param mixed $s suffix: array of characters, or a single character
			* @return boolean result of check
			*/
			private function e($s) {
				if(is_array($s)) {
					$sl = count($s);
					$l = $this->te - $sl;
					// suffix is longer than the current word
					if($l < 0) return false;
					for($i = 0; $i < $sl; ++$i) {
						if($this->l($l + $i) != $s[$i]) return false;
					}
					$this->tt = $l;
					return true;
				}
				// single character: compare with the last character of the word
				$l = $this->te - 1;
				if($l < 0 || $this->l($l) != $s) return false;
				$this->tt = $l;
				return true;
			}

			/**
			* Returns word measure.
			* Counts, within the first $tt characters, how often a vowel is
			* directly followed by a non-vowel.
			* @return integer measure
			*/
			private function m() {
				$c = 0;
				// true while the previously visited character was a vowel
				$v = false;
				for($i = 0; $i < $this->tt; ++$i) {
					$cur = $this->vt($i);
					if($v && !$cur) ++$c;
					$v = $cur;
				}
				return $c;
			}

			/**
			* Vowel test.
			* @return boolean true if one of the first $tt characters is a vowel
			*/
			private function v() {
				for($i = 0; $i < $this->tt; ++$i) {
					if($this->vt($i)) return true;
				}
				return false;
			}

			/**
			* Vowel test.
			* 'a', 'e', 'i', 'o', 'u' are vowels; 'y' (121) counts as a vowel
			* if it is not the first character and the character before it is
			* not a vowel.
			* @param integer $p position
			* @return boolean result of check
			*/
			private function vt($p) {
				$c = $this->l($p);
				return $c == 97 || $c == 101 || $c == 105 || $c == 111 || $c == 117 ||
					($c == 121 && $p != 0 && !$this->vt($p - 1));
			}

			/**
			* Returns the lower character at the specified position.
			* Delegates to TokenUtil::lc(), which is defined outside this file.
			* @param integer $p position
			* @return integer result of TokenUtil::lc() for the character at $p
			*/
			private function l($p) {
				return TokenUtil::lc($this->tok[$p]);
			}

			/**
			* Adds a character.
			* The word is cut at $tt first, then the character is written there.
			* @param integer $c character
			*/
			private function ac($c) {
				$this->te = $this->tt;
				$this->tok[$this->te++] = $c;
			}

			/**
			* Adds a token.
			* The word is cut at $tt first, then the characters of the token
			* are written from there on.
			* @param array $t token
			*/
			private function at($t) {
				$this->te = $this->tt;
				foreach($t as $c) {
					$this->tok[$this->te++] = $c;
				}
			}

			/**
			* Converts a string to chars array.
			* All strings should be converted by this function to guarantee
			* a consistent character conversion.
			* @param string $str string to be converted
			* @return array chars array
			*/
			private static function token($str) {
				return TokenUtil::toChars($str, "UTF-8");
			}

			/**
			* Converts the specified strings to tokens.
			* @param array $strs strings
			* @return array tokens array, in the order of $strs
			*/
			private static function tokens(array $strs) {
				$tokens = array();
				foreach($strs as $str) {
					$tokens[] = self::token($str);
				}
				return $tokens;
			}
		}
	}
	?>