Skip to content

Instantly share code, notes, and snippets.

@pschultz
Created February 26, 2013 17:46
Show Gist options
  • Save pschultz/5040460 to your computer and use it in GitHub Desktop.
Save pschultz/5040460 to your computer and use it in GitHub Desktop.
N-gram tokenizer in PHP
<?php
/**
 * Splits a string into overlapping character n-grams.
 *
 * Characters are UTF-8 code points when the input is valid UTF-8; otherwise
 * the input is treated as a sequence of single bytes. For pure ASCII input
 * both interpretations coincide.
 */
class NgramTokenizer
{
    /**
     * Returns every contiguous run of $n characters in $word, in order.
     *
     * Example: tokenize("abcd", 2) yields array('ab', 'bc', 'cd').
     *
     * @param string $word Text to tokenize.
     * @param int    $n    Number of characters per gram; must be at least 1.
     *
     * @return string[] The n-grams, or an empty array when $word holds fewer
     *                  than $n characters (including the empty string).
     *
     * @throws \InvalidArgumentException When $n is smaller than 1.
     */
    public static function tokenize($word, $n)
    {
        $n = (int) $n;
        if ($n < 1) {
            throw new \InvalidArgumentException('N-gram size must be a positive integer.');
        }

        $chars = self::splitCharacters((string) $word);
        if ($n === 1) {
            // Unigrams are exactly the individual characters.
            return $chars;
        }

        // Count once up front instead of re-measuring the input on every iteration.
        $count = count($chars);
        $grams = array();
        for ($i = 0; $i + $n <= $count; ++$i) {
            $grams[] = implode('', array_slice($chars, $i, $n));
        }

        return $grams;
    }

    /**
     * Breaks a string into a list of single characters.
     *
     * @param string $word Text to split.
     *
     * @return string[] One entry per UTF-8 code point, or one entry per byte
     *                  when $word is not valid UTF-8.
     */
    private static function splitCharacters($word)
    {
        if ($word === '') {
            return array();
        }

        $chars = preg_split('//u', $word, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false) {
            // The /u modifier rejects malformed UTF-8; fall back to byte-wise splitting.
            return str_split($word);
        }

        return $chars;
    }
}
<?php
/**
 * Checks NgramTokenizer::tokenize() for gram sizes one, two and three.
 */
class NgramTokenizerTest extends \PHPUnit_Framework_TestCase
{
    /**
     * A gram size of 1 on "abcd" is expected to yield its four letters.
     */
    public function testUnigram()
    {
        $expected = array('a', 'b', 'c', 'd');

        $this->assertEquals($expected, NgramTokenizer::tokenize("abcd", 1));
    }

    /**
     * A gram size of 2 on "abcd" is expected to yield the three overlapping pairs.
     */
    public function testBigram()
    {
        $expected = array('ab', 'bc', 'cd');

        $this->assertEquals($expected, NgramTokenizer::tokenize("abcd", 2));
    }

    /**
     * A gram size of 3 on "abcdef" is expected to yield the four overlapping triples.
     */
    public function testTrigram()
    {
        $expected = array('abc', 'bcd', 'cde', 'def');

        $this->assertEquals($expected, NgramTokenizer::tokenize("abcdef", 3));
    }
}
@xeoncross
Copy link

This n-gram tokenizer only handles ASCII strings: it has no support for Unicode (multi-byte) text, and it cannot break the input up on particular characters such as spaces.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment