Skip to content

Instantly share code, notes, and snippets.

@frsyuki
Last active June 23, 2021 22:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save frsyuki/3ace7d2188699b058570939535cca964 to your computer and use it in GitHub Desktop.
Save frsyuki/3ace7d2188699b058570939535cca964 to your computer and use it in GitHub Desktop.
PostgreSQL bi-gram full text search without application code
-- ref: https://www.clear-code.com/blog/2020/12/22.html
create type fts_bigram_token as (token text, position bigint);
create or replace function fts_bigram_tokenize(value text)
returns setof fts_bigram_token as $$
select trim(coalesce(lag(c) over (), '') || c), row_number() over ()
from (
select unnest(
regexp_matches(regexp_replace(value, '[[:punct:]]', ' ', 'g') || ' ', '(?!\s\s).', 'g')
) as c
) chars
$$ language sql immutable;
create or replace function fts_bigram_to_tsvector(cols text[])
returns tsvector as $$
select string_agg(token || ':' || position, ' ')::tsvector
from (
select token, position from fts_bigram_tokenize(array_to_string(cols, ' '))
) tokenizer
$$ language sql immutable;
create aggregate tsquery_phrase_agg(tsquery) (
SFUNC = tsquery_phrase,
STYPE = tsquery
);
create or replace function fts_bigram_to_tsquery(word text)
returns tsquery as $$
select
coalesce(q, tsquery(trim(word)))
from (
select tsquery_phrase_agg(tsquery(token)) as q
from (
select token, position from fts_bigram_tokenize(word)
) tokenizer
where length(token) > 1
) phrase
$$ language sql immutable;
-- Setup table
drop table if exists fts_test cascade;
create table fts_test (
id bigserial primary key,
name text,
descr text
);
create index fts_test_gist on fts_test using gist (fts_bigram_to_tsvector(array[name, descr]));
create view fts_test_search
as select *, fts_bigram_to_tsvector(array[name, descr]) as fts_text
from fts_test;
-- Insert example data
insert into fts_test (name, descr)
values ('楽しい', '日本 京都府庁'),
('検索結果は', '東京都庁'),
('English', 'search abc ttt');
select * from fts_test_search;
-- id | name | descr | fts_text
-- ----+------------+----------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------
-- 1 | 楽しい | 日本 京都府庁 | 'い':4 'しい':3 '京':8 '京都':9 '庁':12 '府庁':11 '日':5 '日本':6 '本':7 '楽':1 '楽し':2 '都府':10
-- 2 | 検索結果は | 東京都庁 | 'は':6 '京都':9 '庁':11 '東':7 '東京':8 '果は':5 '検':1 '検索':2 '索結':3 '結果':4 '都庁':10
-- 3 | English | search abc ttt | 'E':1 'En':2 'a':16 'ab':17 'ar':12 'bc':18 'c':19 'ch':14 'ea':11 'gl':4 'h':8,15 'is':6 'li':5 'ng':3 'rc':13 's':9 'se':10 'sh':7 't':20,23 'tt':21,22
-- Query
select fts_bigram_to_tsquery('東京都');
-- fts_bigram_to_tsquery
-- -----------------------
-- '東京' <-> '京都'
select id, name, descr, ts_rank(fts_text, fts_bigram_to_tsquery('東京都')) as ts_rank
from fts_test_search
where fts_text @@ fts_bigram_to_tsquery('東京都')
order by ts_rank desc;
-- id | name | descr | ts_rank
-- ----+------------+----------+-----------
-- 2 | 検索結果は | 東京都庁 | 0.0991032
explain analyze
select id, name, descr
from fts_test_search
where fts_text @@ fts_bigram_to_tsquery('東京都');
-- QUERY PLAN
-- -------------------------------------------------------------------------------------------------------------------------
-- Index Scan using fts_test_gist on fts_test (cost=0.14..8.16 rows=1 width=72) (actual time=0.185..0.185 rows=1 loops=1)
-- Index Cond: (fts_bigram_to_tsvector(ARRAY[name, descr]) @@ '''東京'' <-> ''京都'''::tsquery)
-- Planning Time: 0.306 ms
-- Execution Time: 0.204 ms
explain analyze
select id, name, descr, ts_rank(fts_text, fts_bigram_to_tsquery('東京都')) as ts_rank
from fts_test_search
where fts_text @@ fts_bigram_to_tsquery('東京都')
order by ts_rank desc;
-- QUERY PLAN
-- -------------------------------------------------------------------------------------------------------------------------------
-- Sort (cost=8.42..8.43 rows=1 width=76) (actual time=0.459..0.459 rows=1 loops=1)
-- Sort Key: (ts_rank(fts_bigram_to_tsvector(ARRAY[fts_test.name, fts_test.descr]), '''東京'' <-> ''京都'''::tsquery)) DESC
-- Sort Method: quicksort Memory: 25kB
-- -> Index Scan using fts_test_gist on fts_test (cost=0.14..8.41 rows=1 width=76) (actual time=0.455..0.456 rows=1 loops=1)
-- Index Cond: (fts_bigram_to_tsvector(ARRAY[name, descr]) @@ '''東京'' <-> ''京都'''::tsquery)
-- Planning Time: 0.466 ms
-- Execution Time: 0.481 ms
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment