Last active
May 18, 2018 04:29
-
-
Save vuvanly/5019370ffe25799ea2b5245749321a99 to your computer and use it in GitHub Desktop.
MySQL日本語と全文検索 ref: http://qiita.com/vuvanly/items/ed6e14ac7e4aa34c4165
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE `article_test` ( | |
`id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, | |
`title` varchar(255) DEFAULT NULL COMMENT 'タイトル', | |
`content` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci COMMENT '本文', | |
PRIMARY KEY (`id`), | |
) ENGINE=InnoDB DEFAULT CHARSET=utf8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> select count(*) from article_test; | |
+----------+ | |
| count(*) | | |
+----------+ | |
| 300 | | |
+----------+ | |
1 row in set (0.00 sec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> UPDATE article_test SET bigram = CONCAT(NGRAM(title, 2), ' ', NGRAM(content, 2)); | |
ERROR 1366 (HY000): Incorrect string value: '\xE3\x81\xA7\xE3\x81\x99' for column 'tResult' at row 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DELIMITER // | |
CREATE TRIGGER `trg_article_test_insert` BEFORE INSERT ON `article_test` FOR EACH ROW BEGIN | |
SET NEW.bigram = CONCAT(NGRAM(NEW.title, 2), ' ', NGRAM(SUBSTR(NEW.content, 1, 9000), 2)); | |
END | |
// | |
DELIMITER ; | |
DELIMITER // | |
CREATE TRIGGER `trg_article_test_update` BEFORE UPDATE ON `article_test` FOR EACH ROW BEGIN | |
IF (NEW.title <> OLD.title OR NEW.content <> OLD.content) THEN | |
SET NEW.bigram = CONCAT(NGRAM(NEW.title, 2), ' ', NGRAM(SUBSTR(content, 1, 9000), 2)); | |
END IF; | |
END | |
// |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> show variables like '%ft%'; | |
+---------------------------------+----------------+ | |
| Variable_name | Value | | |
+---------------------------------+----------------+ | |
| ft_boolean_syntax | + -><()~*:""&| | | |
| ft_max_word_len | 84 | | |
| ft_min_word_len | 4 | | |
| ft_query_expansion_limit | 20 | | |
| ft_stopword_file | (built-in) | | |
| innodb_ft_aux_table | | | |
| innodb_ft_cache_size | 8000000 | | |
| innodb_ft_enable_diag_print | OFF | | |
| innodb_ft_enable_stopword | ON | | |
| innodb_ft_max_token_size | 84 | | |
| innodb_ft_min_token_size | 3 | | |
| innodb_ft_num_word_optimize | 2000 | | |
| innodb_ft_result_cache_limit | 2000000000 | | |
| innodb_ft_server_stopword_table | | | |
| innodb_ft_sort_pll_degree | 2 | | |
| innodb_ft_total_cache_size | 640000000 | | |
| innodb_ft_user_stopword_table | | | |
+---------------------------------+----------------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# vim /etc/my.cnf | |
ft_min_word_len=1 #MyISAM | |
innodb_ft_min_token_size=1 #InnoDB |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
]# /etc/init.d/mysqld restart |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> SELECT count(*) FROM article_test WHERE (MATCH (bigram) AGAINST (CONCAT('+', '"', NGRAM('講談社', 2), '"') IN BOOLEAN MODE)); | |
+----------+ | |
| count(*) | | |
+----------+ | |
| 0 | | |
+----------+ | |
1 row in set (0.03 sec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> select count(*) from article_test where title like '%講談社%' or content like '%講談社%'; | |
+----------+ | |
| count(*) | | |
+----------+ | |
| 1 | | |
+----------+ | |
1 row in set (0.02 sec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> select count(*) from article_test; | |
+----------+ | |
| count(*) | | |
+----------+ | |
| 50000 | | |
+----------+ | |
1 row in set (0.25 sec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> select count(*) from article_test where title like '%講談社%' or content like '%講談社%'; | |
+----------+ | |
| count(*) | | |
+----------+ | |
| 1170 | | |
+----------+ | |
1 row in set (5.86 sec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DELIMITER // | |
CREATE FUNCTION `NGRAM`(`tText` TEXT, `n` INT) | |
RETURNS text | |
DETERMINISTIC | |
BEGIN | |
DECLARE tResult TEXT; | |
DECLARE nLength INT; | |
DECLARE nPosition INT; | |
DECLARE tPart VARCHAR(16); | |
IF tText IS NULL THEN | |
RETURN NULL; | |
END IF; | |
SET tResult = ''; | |
SET tText = TRIM(REPLACE(tText, ' ', '')); | |
SET nLength = CHAR_LENGTH(tText); | |
SET nPosition = 1; | |
WHILE nPosition <= nLength DO | |
SET tPart = TRIM(SUBSTR(tText, nPosition, n)); | |
IF CHAR_LENGTH(tPart) > 0 THEN | |
SET tResult = CONCAT(tResult, ' ', tPart); | |
END IF; | |
SET nPosition = nPosition + 1;会員王六 | |
END WHILE; | |
RETURN TRIM(tResult); | |
END// | |
DELIMITER; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> select NGRAM('系全国', 2); | |
+-----------------------+ | |
| NGRAM('系全国', 2) | | |
+-----------------------+ | |
| 系全 全国 国 | | |
+-----------------------+ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ALTER TABLE article_test ADD COLUMN `bigram` TEXT NULL CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci COMMENT 'bigram分かち書き'; | |
ALTER TABLE article_test ADD FULLTEXT(bigram); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
UPDATE article_test SET bigram = CONCAT(NGRAM(title, 2), ' ', NGRAM(SUBSTR(content, 1, 9000), 2)); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mysql> select length('系全国分かち書き'); | |
+------------------------------------+ | |
| length('系全国分かち書き') | | |
+------------------------------------+ | |
| 24 | | |
+------------------------------------+ | |
mysql> select length(NGRAM('系全国分かち書き', 2)); | |
+----------------------------------------------+ | |
| length(NGRAM('系全国分かち書き', 2)) | | |
+----------------------------------------------+ | |
| 52 | | |
+----------------------------------------------+ | |
mysql> select char_length(NGRAM('系全国分かち書き', 2)); | |
+---------------------------------------------------+ | |
| char_length(NGRAM('系全国分かち書き', 2)) | | |
+---------------------------------------------------+ | |
| 22 | | |
+---------------------------------------------------+ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment