Skip to content

Instantly share code, notes, and snippets.

@devig
Last active April 25, 2022 13:57
Show Gist options
  • Save devig/9c2374e4a3ae43f6ed613b294a580f97 to your computer and use it in GitHub Desktop.
Save devig/9c2374e4a3ae43f6ed613b294a580f97 to your computer and use it in GitHub Desktop.
PHP ngram
<?php
/**
 * Splits a string into its overlapping character n-grams.
 *
 * Lengths and offsets are measured with the mb_* functions, so multibyte
 * text (for example Cyrillic) is split per character rather than per byte.
 */
class NGram
{
    /**
     * The length of the n-gram.
     *
     * @var int
     */
    protected $n;

    /**
     * The text the n-grams are generated from.
     *
     * @var string
     */
    protected $string;

    /**
     * NGram constructor.
     *
     * @param int    $n      Length of each n-gram; must be at least 1.
     * @param string $string Text to split.
     *
     * @throws \InvalidArgumentException When $n is smaller than 1.
     */
    public function __construct(int $n, string $string)
    {
        $this->setN($n);
        $this->setString($string);
    }

    /**
     * Static wrapper for the n-gram generator.
     *
     * @param string $text Text to split.
     * @param int    $n    Length of each n-gram; must be at least 1.
     *
     * @throws \InvalidArgumentException When $n is smaller than 1.
     *
     * @return array List of n-gram strings, in order of appearance.
     */
    public static function for(string $text, int $n = 3) : array
    {
        return (new static($n, $text))->get();
    }

    /**
     * Static wrapper to generate the bigrams (n = 2) of a text.
     *
     * @param string $text
     *
     * @return array
     */
    public static function bigram(string $text) : array
    {
        return self::for($text, 2);
    }

    /**
     * Static wrapper to generate the trigrams (n = 3) of a text.
     *
     * @param string $text
     *
     * @return array
     */
    public static function trigram(string $text) : array
    {
        return self::for($text, 3);
    }

    /**
     * Generate the n-grams for the configured string.
     *
     * Returns an empty array when the string is shorter than n characters.
     *
     * @return array List of n-gram strings, in order of appearance.
     */
    public function get() : array
    {
        $text = $this->getString();
        $n = $this->getN();

        // Offset of the last position at which a full n-gram still fits.
        $lastStart = mb_strlen($text) - $n;

        $nGrams = [];
        for ($i = 0; $i <= $lastStart; $i++) {
            $nGrams[] = mb_substr($text, $i, $n);
        }

        return $nGrams;
    }

    /**
     * Get the length of the n-gram.
     *
     * @return int
     */
    public function getN() : int
    {
        return $this->n;
    }

    /**
     * Set the length of the n-gram.
     *
     * @param int $n Must be at least 1.
     *
     * @throws \InvalidArgumentException When $n is smaller than 1.
     *
     * @return NGram The same instance, for chaining.
     */
    public function setN(int $n) : NGram
    {
        if ($n < 1) {
            // A non-positive length would make get() emit a list of empty strings.
            throw new \InvalidArgumentException('Provided number cannot be smaller than 1');
        }

        $this->n = $n;

        return $this;
    }

    /**
     * Set the string to create the n-grams for.
     *
     * @param string $string
     *
     * @return NGram The same instance, for chaining.
     */
    public function setString(string $string) : NGram
    {
        $this->string = $string;

        return $this;
    }

    /**
     * Get the string used for the n-grams.
     *
     * @return string
     */
    public function getString() : string
    {
        return $this->string;
    }
}
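/*
Usage example (trigram() is static, so no instance is needed):
print_r(NGram::trigram('электростанция'));
Equivalent instance form:
print_r((new NGram(3, 'электростанция'))->get());
*/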
DELIMITER ;;
DROP FUNCTION IF EXISTS `ngram`;;
-- ngram(inStr, n): returns the overlapping character n-grams of inStr,
-- joined with '-' (e.g. ngram('abcd', 3) -> 'abc-bcd').
-- Returns '' when inStr or n is NULL, when n < 1, or when inStr is shorter
-- than n characters.
--
-- DETERMINISTIC / NO SQL: the result depends only on the arguments and the
-- body touches no tables. Declaring this lets the function be created with
-- binary logging enabled without setting log_bin_trust_function_creators.
--
-- RETURNS text: the joined result is longer than the input (a 255-character
-- input with n = 3 already yields 1011 characters), so varchar(255) would
-- truncate or raise "Data too long".
CREATE FUNCTION `ngram`(`inStr` varchar(255), `n` tinyint(1)) RETURNS text CHARSET utf8
    DETERMINISTIC
    NO SQL
BEGIN
    -- Local variables instead of session-scoped @variables, so a call does
    -- not overwrite user variables of the same name in the caller's session.
    DECLARE v_pos INT DEFAULT 1;
    DECLARE v_len INT DEFAULT 0;
    DECLARE v_result TEXT CHARACTER SET utf8 DEFAULT '';

    IF inStr IS NULL OR n IS NULL OR n < 1 THEN
        RETURN '';
    END IF;

    SET v_len = CHAR_LENGTH(inStr);

    -- v_pos is the 1-based start of the current n-gram; stop once a full
    -- n-gram no longer fits.
    WHILE v_pos + n - 1 <= v_len DO
        -- Decide on the separator by position rather than by comparing the
        -- accumulator with '': an n-gram made only of spaces can compare
        -- equal to '' under PAD SPACE collations.
        IF v_pos > 1 THEN
            SET v_result = CONCAT(v_result, '-');
        END IF;
        SET v_result = CONCAT(v_result, SUBSTRING(inStr, v_pos, n));
        SET v_pos = v_pos + 1;
    END WHILE;

    RETURN v_result;
END;;
DELIMITER ;
<?php
/**
 * Split a string into its overlapping character n-grams.
 *
 * Characters are counted with the mb_* functions, so multibyte text is split
 * per character rather than per byte.
 *
 * @param int    $n     Length of each n-gram.
 * @param string $token Text to split.
 *
 * @return array List of n-gram strings in order of appearance; empty when
 *               $n is smaller than 1 or $token has fewer than $n characters.
 */
function createNGram($n, $token)
{
    $n = (int) $n;
    $token = (string) $token;

    // No n-gram of non-positive length exists; return an empty list rather
    // than a list of empty strings.
    if ($n < 1) {
        return [];
    }

    // Measured once: the token does not change while it is being sliced.
    $lastStart = mb_strlen($token) - $n;

    $nGrams = [];
    for ($start = 0; $start <= $lastStart; $start++) {
        $nGrams[] = mb_substr($token, $start, $n);
    }

    return $nGrams;
}
//print_r(createNGram(3,"электростанция"));
@devig
Copy link
Author

devig commented Feb 2, 2020

SELECT keyword
FROM terms
WHERE MATCH (trigrams) AGAINST ('сто-ток-ока-кан' );

@devig
Copy link
Author

devig commented Apr 25, 2022

SET GLOBAL log_bin_trust_function_creators = 1;

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment