Skip to content

Instantly share code, notes, and snippets.

@vuvanly
Last active May 18, 2018 04:29
Show Gist options
  • Save vuvanly/5019370ffe25799ea2b5245749321a99 to your computer and use it in GitHub Desktop.
Save vuvanly/5019370ffe25799ea2b5245749321a99 to your computer and use it in GitHub Desktop.
MySQL日本語と全文検索 ref: http://qiita.com/vuvanly/items/ed6e14ac7e4aa34c4165
CREATE TABLE `article_test` (
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL COMMENT 'タイトル',
`content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci COMMENT '本文',
PRIMARY KEY (`id`),
) ENGINE=InnoDB DEFAULT CHARSET=utf8
mysql> select count(*) from article_test;
+----------+
| count(*) |
+----------+
| 300 |
+----------+
1 row in set (0.00 sec)
mysql> UPDATE article_test SET bigram = CONCAT(NGRAM(title, 2), ' ', NGRAM(content, 2));
ERROR 1366 (HY000): Incorrect string value: '\xE3\x81\xA7\xE3\x81\x99' for column 'tResult' at row 1
DELIMITER //
CREATE TRIGGER `trg_article_test_insert` BEFORE INSERT ON `article_test` FOR EACH ROW BEGIN
SET NEW.bigram = CONCAT(NGRAM(NEW.title, 2), ' ', NGRAM(SUBSTR(NEW.content, 1, 9000), 2));
END
//
DELIMITER ;
DELIMITER //
CREATE TRIGGER `trg_article_test_update` BEFORE UPDATE ON `article_test` FOR EACH ROW BEGIN
IF (NEW.title <> OLD.title OR NEW.content <> OLD.content) THEN
SET NEW.bigram = CONCAT(NGRAM(NEW.title, 2), ' ', NGRAM(SUBSTR(content, 1, 9000), 2));
END IF;
END
//
mysql> show variables like '%ft%';
+---------------------------------+----------------+
| Variable_name | Value |
+---------------------------------+----------------+
| ft_boolean_syntax | + -><()~*:""&| |
| ft_max_word_len | 84 |
| ft_min_word_len | 4 |
| ft_query_expansion_limit | 20 |
| ft_stopword_file | (built-in) |
| innodb_ft_aux_table | |
| innodb_ft_cache_size | 8000000 |
| innodb_ft_enable_diag_print | OFF |
| innodb_ft_enable_stopword | ON |
| innodb_ft_max_token_size | 84 |
| innodb_ft_min_token_size | 3 |
| innodb_ft_num_word_optimize | 2000 |
| innodb_ft_result_cache_limit | 2000000000 |
| innodb_ft_server_stopword_table | |
| innodb_ft_sort_pll_degree | 2 |
| innodb_ft_total_cache_size | 640000000 |
| innodb_ft_user_stopword_table | |
+---------------------------------+----------------+
# vim /etc/my.cnf
ft_min_word_len=1 #MyISAM
innodb_ft_min_token_size=1 #InnoDB
]# /etc/init.d/mysqld restart
mysql> SELECT count(*) FROM article_test WHERE (MATCH (bigram) AGAINST (CONCAT('+', '"', NGRAM('講談社', 2), '"') IN BOOLEAN MODE));
+----------+
| count(*) |
+----------+
| 0 |
+----------+
1 row in set (0.03 sec)
mysql> select count(*) from article_test where title like '%講談社%' or content like '%講談社%';
+----------+
| count(*) |
+----------+
| 1 |
+----------+
1 row in set (0.02 sec)
mysql> select count(*) from article_test;
+----------+
| count(*) |
+----------+
| 50000 |
+----------+
1 row in set (0.25 sec)
mysql> select count(*) from article_test where title like '%講談社%' or content like '%講談社%';
+----------+
| count(*) |
+----------+
| 1170 |
+----------+
1 row in set (5.86 sec)
DELIMITER //
CREATE FUNCTION `NGRAM`(`tText` TEXT, `n` INT)
RETURNS text
DETERMINISTIC
BEGIN
DECLARE tResult TEXT;
DECLARE nLength INT;
DECLARE nPosition INT;
DECLARE tPart VARCHAR(16);
IF tText IS NULL THEN
RETURN NULL;
END IF;
SET tResult = '';
SET tText = TRIM(REPLACE(tText, ' ', ''));
SET nLength = CHAR_LENGTH(tText);
SET nPosition = 1;
WHILE nPosition <= nLength DO
SET tPart = TRIM(SUBSTR(tText, nPosition, n));
IF CHAR_LENGTH(tPart) > 0 THEN
SET tResult = CONCAT(tResult, ' ', tPart);
END IF;
SET nPosition = nPosition + 1;会員王六
END WHILE;
RETURN TRIM(tResult);
END//
DELIMITER;
mysql> select NGRAM('系全国', 2);
+-----------------------+
| NGRAM('系全国', 2) |
+-----------------------+
| 系全 全国 国 |
+-----------------------+
ALTER TABLE article_test ADD COLUMN `bigram` TEXT NULL CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci COMMENT 'bigram分かち書き';
ALTER TABLE article_test ADD FULLTEXT(bigram);
UPDATE article_test SET bigram = CONCAT(NGRAM(title, 2), ' ', NGRAM(SUBSTR(content, 1, 9000), 2));
mysql> select length('系全国分かち書き');
+------------------------------------+
| length('系全国分かち書き') |
+------------------------------------+
| 24 |
+------------------------------------+
mysql> select length(NGRAM('系全国分かち書き', 2));
+----------------------------------------------+
| length(NGRAM('系全国分かち書き', 2)) |
+----------------------------------------------+
| 52 |
+----------------------------------------------+
mysql> select char_length(NGRAM('系全国分かち書き', 2));
+---------------------------------------------------+
| char_length(NGRAM('系全国分かち書き', 2)) |
+---------------------------------------------------+
| 22 |
+---------------------------------------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment