Note: Any bugs or findings of interested are documented inline with
Run the examples from the pg_trgm
docs. Ensure that the
results match the expected results in the posts. This serves as a rough check
for compatibility with Postgres in basic cases.
SELECT word_similarity('word', 'two words');
SELECT strict_word_similarity('word', 'two words'), similarity('word', 'words');
CREATE TABLE test_trgm (t text);
INSERT INTO test_trgm VALUES
('foo'),
('bar'),
('baz'),
('hoard'),
('wordy'),
('world'),
('weird'),
('weird'),
('xxxfooyyybar'),
('wofooord');
INSERT INTO test_trgm SELECT 'empty' FROM generate_series(1, 10000);
SELECT t, similarity(t, 'word') AS sml
FROM test_trgm
WHERE t % 'word'
ORDER BY sml DESC, t;
SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
CREATE INDEX trgm_idx ON test_trgm USING GIST (t gist_trgm_ops);
-- This should be faster with the index.
SELECT t, similarity(t, 'word') AS sml
FROM test_trgm
WHERE t % 'word'
ORDER BY sml DESC, t;
-- This should be faster with the index.
SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
-- This should not be faster with the index - we don't accelerate this yet.
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
SELECT
above without any indexes takes ~6ms in PG and ~26ms in
CRDB, running in a single-node cockroach demo
. Changing the %
comparison to
=
speeds up the query to ~1.2ms in PG and 8ms in CRDB, which are both about as
fast as a simple full table scan over test_trgm
like SELECT * FROM test_trgm WHERE t = 'word';
. This suggests that there may be optimization opportunities
in our implementation of %
.
Created #86610 to
track this.
(Same as above, but with GIN
instead of GIST
, which are the same in CRDB.)
CREATE TABLE test_trgm (t text);
INSERT INTO test_trgm VALUES
('foo'),
('bar'),
('baz'),
('hoard'),
('wordy'),
('world'),
('weird'),
('weird'),
('xxxfooyyybar'),
('wofooord');
INSERT INTO test_trgm SELECT 'empty' FROM generate_series(1, 10000);
SELECT t, similarity(t, 'word') AS sml
FROM test_trgm
WHERE t % 'word'
ORDER BY sml DESC, t;
SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
CREATE INDEX trgm_idx ON test_trgm USING GIN (t gin_trgm_ops);
-- This should be faster with the index.
SELECT t, similarity(t, 'word') AS sml
FROM test_trgm
WHERE t % 'word'
ORDER BY sml DESC, t;
-- This should be faster with the index.
SELECT * FROM test_trgm WHERE t LIKE '%foo%bar';
-- This should not be faster with the index - we don't accelerate this yet.
SELECT * FROM test_trgm WHERE t ~ '(foo|bar)';
gin_trgm_ops
and gist_trgm_ops
with the opposite
type of index. There's no reason these need to fail in CRDB - GIST and GIN are
aliases for our general-purpose inverted indexes. However, we may want to be
consistent with Postgres here. For example, both of these succeed in CRDB but
fail in PG:
Created #86611 to
track this.
CREATE INDEX trgm_idx ON test_trgm USING GIST (t gin_trgm_ops);
CREATE INDEX trgm_idx ON test_trgm USING GIN (t gist_trgm_ops);
Test that CREATE EXTENSION pg_trgm
works. It should be a no-op.
Test the following built-ins manually and compare the results to Postgres.
SELECT show_trgm('sql queries team');
SELECT show_trgm('sql queries team'::TEXT);
SELECT show_trgm('sql queries team'::VARCHAR);
SELECT show_trgm('sql queries team'::CHAR);
SELECT show_trgm('');
SELECT show_trgm(' ');
SELECT show_trgm(' ');
SELECT show_trgm(' ');
SELECT show_trgm('a');
SELECT show_trgm('ab');
SELECT show_trgm('foo%bar');
SELECT show_trgm('foo%.***%%$#@bar');
SELECT similarity('sql queries team', 'sql queries team');
SELECT similarity('sql queries team', 'sql team queries');
SELECT similarity('sql queries team'::TEXT, 'sql team queries'::VARCHAR);
SELECT similarity('foo bar baz', 'foo baz');
SELECT similarity('foo bar baz', 'baz bar');
SELECT similarity('foobarbaz', 'ooaraz');
SELECT 'foo bar baz' % 'foo baz';
SELECT 'foo bar baz' % 'baz bar';
SELECT 'foobarbaz' % 'ooaraz';
Test changing the session setting pg_trgm.similarity_threshold
.
SET pg_trgm.similarity_threshold = 0.05;
SELECT 'foo bar baz' % 'foo baz';
SELECT 'foo bar baz' % 'baz bar';
SELECT 'foobarbaz' % 'ooaraz';
These should succeed:
CREATE TABLE test_trgm (s STRING, INVERTED INDEX (s gist_trgm_ops));
SHOW CREATE TABLE test_trgm;
DROP TABLE test_trgm;
CREATE TABLE test_trgm (s STRING, INVERTED INDEX (s gin_trgm_ops));
SHOW CREATE TABLE test_trgm;
DROP TABLE test_trgm;
CREATE TABLE test_trgm (s STRING);
CREATE INVERTED INDEX ON test_trgm (s gist_trgm_ops);
SHOW CREATE TABLE test_trgm;
DROP TABLE test_trgm;
CREATE TABLE test_trgm (s STRING);
CREATE INDEX ON test_trgm USING GIN (s gin_trgm_ops);
SHOW CREATE TABLE test_trgm;
DROP TABLE test_trgm;
CREATE TABLE test_trgm (s STRING);
CREATE INDEX ON test_trgm USING GIST (s gist_trgm_ops);
SHOW CREATE TABLE test_trgm;
DROP TABLE test_trgm;
gin_trgm_ops
in SHOW CREATE TABLE
regardless of the op class used
when the table was created. However, I'm not sure whether this is considered a
bug given that the behave the same in CRDB.
These should fail:
CREATE TABLE test_trgm (s STRING, INVERTED INDEX (s));
CREATE TABLE test_trgm (s STRING, INVERTED INDEX (s));
CREATE TABLE test_trgm (s STRING, INDEX (s gist_trgm_ops));
CREATE TABLE test_trgm (s STRING, INDEX (s gin_trgm_ops));
gist_trgm_ops
and gin_trgm_ops
for non-inverted indexes. This is
a known bug: cockroachdb/cockroach#84512
Re-create this table to test with before each section.
DROP TABLE IF EXISTS test_trgm;
CREATE TABLE test_trgm (
k INT PRIMARY KEY,
a INT,
b INT,
x TEXT
);
INSERT INTO test_trgm
SELECT i, i % 29, i * 100,
chr(ascii('A')+(i%3)) || chr(ascii('A')+(i%5)) ||
chr(ascii('A')+(i%7)) || chr(ascii('A')+(i%11)) ||
chr(ascii('A')+(i%13)) || chr(ascii('A')+(i%17)) ||
chr(ascii('A')+(i%19)) || chr(ascii('A')+(i%23)) ||
chr(ascii('A')+(i%29)) || chr(ascii('A')+(i%31)) ||
chr(ascii('A')+(i%37)) || chr(ascii('A')+(i%41)) ||
chr(ascii('A')+(i%43)) || chr(ascii('A')+(i%47))
FROM generate_series (1, 10000) g(i);
ANALYZE test_trgm;
CREATE INDEX ON test_trgm USING GIST (x gist_trgm_ops) WHERE a > 10;
SELECT k FROM test_trgm WHERE a > 10 AND x LIKE '%FOO%';
SELECT k, similarity(x, 'FOOBAR') FROM test_trgm WHERE a > 10 AND x LIKE '%FOO%' ORDER BY 2;
SELECT k, a, similarity(x, 'FOOBAR') FROM test_trgm WHERE a > 20 AND x LIKE '%FOO%' ORDER BY 3;
UPDATE test_trgm SET a = a + 5 WHERE true;
SELECT k, similarity(x, 'FOOBAR') FROM test_trgm WHERE a > 10 AND x LIKE '%FOO%' ORDER BY 2;
SELECT k, a, similarity(x, 'FOOBAR') FROM test_trgm WHERE a > 15 AND x LIKE '%FOO%' ORDER BY 3;
UPDATE test_trgm SET a = a + 1, x = x || 'world' WHERE true;
SELECT k, a, x, similarity(x, 'BAD world') FROM test_trgm WHERE a > 10 AND x LIKE '%FO%world%' ORDER BY 4, 2, 1 LIMIT 20;
CREATE INDEX ON test_trgm USING GIST (a, x gist_trgm_ops);
SELECT k, a, x FROM test_trgm WHERE a = 11 AND x LIKE '%FO%';
LIKE
expression. This leads to a confusing
error "ERROR: index "i" is inverted and cannot be used for this query".
Created #86612 to
track this.
SHOW CREATE TABLE
incorrectly shows an opclass for non-inverted prefix
columns: INVERTED INDEX test_trgm_a_x_idx (a gin_trgm_ops ASC, x gin_trgm_ops)
.
Created #86614 to
track this.
CREATE INDEX ON test_trgm USING GIST ((lower(x)) gist_trgm_ops);
SELECT k, a, x FROM test_trgm WHERE a = 14 AND lower(x) LIKE '%foo%';
DROP TABLE IF EXISTS test_trgm;
CREATE TABLE test_trgm (s TEXT COLLATE "de_DE");
INSERT INTO test_trgm VALUES
('foo'),
('Fussball'),
('Fußball'),
('bar'),
('ball');
SELECT show_trgm(s) FROM test_trgm;
SELECT show_trgm(s::text) FROM test_trgm;
SELECT s % 'fussball', similarity(s, 'fussball')
FROM test_trgm;
SELECT s::TEXT % 'fussball', similarity(s::TEXT, 'fussball')
FROM test_trgm;
CREATE INDEX i ON test_trgm USING GIST (s gist_trgm_ops);
ERROR: column s of type string is not allowed as the last column in an inverted index
SQLSTATE: 0A000
HINT: see the documentation for more information about inverted indexes: https://www.cockroachlabs.com/docs/v22.2/inverted-indexes.html
Created #86617 to track this.
I believe the actual problem is that trigrams on collated strings are not supported. The error message should be more clear about that.