Last active
April 25, 2022 13:57
-
-
Save devig/9c2374e4a3ae43f6ed613b294a580f97 to your computer and use it in GitHub Desktop.
PHP ngram
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Splits a string into overlapping character n-grams.
 *
 * Lengths and offsets are measured with the mb_* functions, so multibyte
 * text (e.g. Cyrillic) is cut on character boundaries rather than bytes.
 */
class NGram
{
    /**
     * The length of the n-gram.
     *
     * @var int
     */
    protected $n;

    /**
     * The string the n-grams are generated from.
     *
     * @var string
     */
    protected $string;

    /**
     * NGram constructor.
     *
     * @param int    $n      Length of each n-gram; must be at least 1.
     * @param string $string Text to split.
     *
     * @throws \InvalidArgumentException When $n is smaller than 1.
     */
    public function __construct(int $n, string $string)
    {
        $this->setN($n);
        $this->setString($string);
    }

    /**
     * Static wrapper for n-gram generator.
     *
     * @param string $text
     * @param int    $n
     *
     * @throws \InvalidArgumentException When $n is smaller than 1.
     *
     * @return array
     */
    public static function for(string $text, int $n = 3)
    {
        return (new static($n, $text))->get();
    }

    /**
     * Static wrapper to generate a bigram.
     *
     * @param string $text
     *
     * @return array
     */
    public static function bigram(string $text) : array
    {
        return self::for($text, 2);
    }

    /**
     * Static wrapper to generate a trigram.
     *
     * @param string $text
     *
     * @return array
     */
    public static function trigram(string $text) : array
    {
        return self::for($text, 3);
    }

    /**
     * Generate the n-grams for the configured string.
     *
     * Returns every window of N consecutive characters, in order of their
     * starting position. A string shorter than N yields an empty array.
     *
     * @return array
     */
    public function get() : array
    {
        $text = $this->getString();
        $n = $this->getN();

        // Last character offset at which a full window of $n characters still fits.
        $lastStart = mb_strlen($text) - $n;

        $nGrams = [];
        for ($i = 0; $i <= $lastStart; $i++) {
            $nGrams[] = mb_substr($text, $i, $n);
        }

        return $nGrams;
    }

    /**
     * @return int
     */
    public function getN() : int
    {
        return $this->n;
    }

    /**
     * Set the length of the n-gram.
     *
     * @param int $n
     *
     * @throws \InvalidArgumentException When $n is smaller than 1.
     *
     * @return NGram
     */
    public function setN(int $n) : NGram
    {
        // A window of zero or negative length has no meaningful n-grams.
        if ($n < 1) {
            throw new \InvalidArgumentException('Provided number cannot be smaller than 1');
        }

        $this->n = $n;

        return $this;
    }

    /**
     * Set the string to create the n-gram for.
     *
     * @param string $string
     *
     * @return NGram
     */
    public function setString(string $string) : NGram
    {
        $this->string = $string;

        return $this;
    }

    /**
     * Get the string used for the n-gram.
     *
     * @return string
     */
    public function getString() : string
    {
        return $this->string;
    }
}
/* | |
$n=new NGram(3,'электростанция'); | |
print_r($n->trigram('электростанция'));*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DELIMITER ;;
DROP FUNCTION IF EXISTS `ngram`;;
-- Returns the character n-grams of `inStr`, joined with '-'.
--   inStr : text to split (measured in characters, not bytes)
--   n     : length of each n-gram
-- Yields '' when inStr or n is NULL, when n < 1, or when inStr is shorter
-- than n characters.
-- Uses DECLAREd locals rather than @session variables so that calling the
-- function does not overwrite variables in the caller's session.
-- NOTE(review): the return type is still varchar(255); a long input can produce
-- a joined result that exceeds it — widen the return type if that matters.
CREATE FUNCTION `ngram`(`inStr` varchar(255), `n` tinyint(1)) RETURNS varchar(255) CHARSET utf8
DETERMINISTIC
NO SQL
BEGIN
    DECLARE v_pos INT DEFAULT 1;        -- 1-based start of the current window
    DECLARE v_last INT;                 -- last start position where a full window fits
    DECLARE v_result TEXT CHARSET utf8; -- accumulated '-'-joined n-grams

    IF inStr IS NULL OR n IS NULL OR n < 1 THEN
        RETURN '';
    END IF;

    SET v_result = '';
    SET v_last = CHAR_LENGTH(inStr) - n + 1;

    WHILE v_pos <= v_last DO
        -- Decide on the separator by position, not by comparing the result
        -- with '': PAD SPACE collations treat a whitespace-only first gram as
        -- equal to '', which would drop the separator.
        IF v_pos > 1 THEN
            SET v_result = CONCAT(v_result, '-');
        END IF;
        SET v_result = CONCAT(v_result, SUBSTRING(inStr, v_pos, n));
        SET v_pos = v_pos + 1;
    END WHILE;

    RETURN v_result;
END;;
DELIMITER ;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Build the list of overlapping character n-grams of a token.
 *
 * Offsets and lengths are measured with the mb_* functions, so multibyte
 * text is split on character boundaries.
 *
 * @param int    $n     Length of each n-gram.
 * @param string $token Text to split.
 *
 * @return array Sequential list of n-grams in order of their start position;
 *               empty when $n is smaller than 1 or $token is shorter than $n.
 */
function createNGram($n, $token)
{
    $n = (int) $n;

    // A window of zero or negative length has no n-grams.
    if ($n < 1) {
        return [];
    }

    // Last character offset at which a full window of $n characters still fits.
    $lastStart = mb_strlen($token) - $n;

    $nGrams = [];
    for ($start = 0; $start <= $lastStart; $start++) {
        $nGrams[] = mb_substr($token, $start, $n);
    }

    return $nGrams;
}
//print_r(createNGram(3,"электростанция")); |
-- Server-wide setting; it relaxes MySQL's requirement that stored functions be
-- declared DETERMINISTIC, NO SQL or READS SQL DATA when binary logging is enabled.
-- NOTE(review): presumably needed only to create a function lacking those
-- characteristics — confirm before enabling on a shared or replicated server.
SET GLOBAL log_bin_trust_function_creators = 1;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
-- Full-text lookup of keywords in `terms` by a '-'-joined list of trigrams.
-- NOTE(review): presumably `trigrams` stores hyphen-joined trigrams of `keyword`
-- and has a FULLTEXT index; the minimum indexed token length must allow
-- 3-character tokens — confirm against the table definition and server settings.
SELECT keyword
FROM terms
WHERE MATCH (trigrams) AGAINST ('сто-ток-ока-кан' );