Skip to content

Instantly share code, notes, and snippets.

@searls
Created December 29, 2020 13:32
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save searls/c0b7ffedbcf7261b654c5cb492250a3a to your computer and use it in GitHub Desktop.
Save searls/c0b7ffedbcf7261b654c5cb492250a3a to your computer and use it in GitHub Desktop.
KameSame's December 2020 search overhaul
-- This all starts with some functions and a *materialized* postgres view that unnests several
-- arrays of strings of definitions into flattened rows that are easier to search. Fun fact:
-- you can even create indexes on materialized views' columns! They'll refresh whenever the view
-- is refreshed (which in my case is every time that we pull new dictionary data from WaniKani or JMDICT)
-- This function will take an array of strings and convert all the double-width alphanumeric characters
-- and normalize them as half-width. That way a search query can be massaged from "OK" to "ok" easily
-- Normalizes an array of strings by translating double-width (full-width)
-- alphanumeric characters into half-width lowercase ASCII, so a query typed
-- as "OK" can match "ok". Plain ASCII characters pass through untouched.
--
-- Input:  varchar[] (NULL in -> NULL out via RETURNS NULL ON NULL INPUT;
--         NULL *elements* come back NULL from translate()).
-- Output: varchar[] of the same length, element order preserved.
CREATE OR REPLACE FUNCTION array_hankakufy_romaji(character varying[])
RETURNS character varying[]
AS
$$
DECLARE
arrTexts ALIAS FOR $1;
retVal character varying[];
BEGIN
-- Bug fix: an empty array has NULL bounds, and a plpgsql FOR loop with a
-- NULL lower bound raises "lower bound of FOR loop cannot be null".
-- Short-circuit and hand the empty array straight back.
IF array_length(arrTexts, 1) IS NULL THEN
RETURN arrTexts;
END IF;
FOR I IN array_lower(arrTexts, 1)..array_upper(arrTexts, 1) LOOP
retVal[I] := translate(arrTexts[I],
'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
'0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz'
);
END LOOP;
RETURN retVal;
END;
$$
LANGUAGE plpgsql
-- translate() with constant from/to strings depends only on its input, so the
-- correct volatility category is IMMUTABLE (was STABLE); this also permits
-- using the function in expression indexes.
IMMUTABLE
RETURNS NULL ON NULL INPUT;
-- Similarly, this function normalizes english definitions by lowercasing them and removing extra spaces between words
-- Normalizes an English definition for matching: lowercases it, strips every
-- character that is not [a-z0-9] or whitespace, then collapses whitespace runs
-- to a single space. e.g. 'Hello,  WORLD!' -> 'hello world'.
-- NULL input returns NULL via RETURNS NULL ON NULL INPUT.
-- NOTE(review): leading/trailing spaces survive normalization (e.g. '- foo'
-- -> ' foo'); presumably the Ruby side massages query tokens the same way —
-- confirm before adding a trim here, since the materialized view depends on
-- this exact output.
CREATE OR REPLACE FUNCTION massage_english(character varying)
RETURNS character varying
AS
$$
BEGIN
return regexp_replace(
regexp_replace(lower($1), '[^a-z0-9\s]+', '','g'),
'\s+', ' ', 'g'
);
END;
$$
LANGUAGE plpgsql
-- lower() and regexp_replace() are immutable, so the correct volatility
-- category is IMMUTABLE (was STABLE); this lets the planner fold calls on
-- constants and allows expression indexes over massage_english(...).
IMMUTABLE
RETURNS NULL ON NULL INPUT;
-- And this function just runs an array through the above function
-- Applies massage_english() to each element of an array of English
-- definitions, preserving element order.
-- NULL input returns NULL via RETURNS NULL ON NULL INPUT.
CREATE OR REPLACE FUNCTION array_massage_english(character varying[])
RETURNS character varying[]
AS
$$
DECLARE
arrTexts ALIAS FOR $1;
retVal character varying[];
BEGIN
-- Bug fix: an empty array has NULL bounds, and a plpgsql FOR loop with a
-- NULL lower bound raises "lower bound of FOR loop cannot be null".
-- Short-circuit and hand the empty array straight back.
IF array_length(arrTexts, 1) IS NULL THEN
RETURN arrTexts;
END IF;
FOR I IN array_lower(arrTexts, 1)..array_upper(arrTexts, 1) LOOP
retVal[I] := massage_english(arrTexts[I]);
END LOOP;
RETURN retVal;
END;
$$
LANGUAGE plpgsql
-- massage_english() is immutable-in-fact (lower/regexp_replace only), so this
-- wrapper is IMMUTABLE too (was STABLE).
IMMUTABLE
RETURNS NULL ON NULL INPUT;
-- Here's the real view! It will unnest all Japanese spellings and readings + all English definitions
-- into a mega table where either/both are searchable and can load the entire row into an `Item` model
-- One row per (item, normalized Japanese spelling/reading, normalized English
-- definition) pair. Both text columns are pre-massaged by the functions above
-- so exact/LIKE/trigram searches hit indexed, normalized values.
CREATE MATERIALIZED VIEW searchable_items AS
SELECT
items.*,
j_text,
e_text
FROM items
CROSS JOIN LATERAL unnest(
array_hankakufy_romaji(
array_prepend(items.text, array_cat(items.alt_texts, items.reading_texts))
)
) AS j_text
CROSS JOIN LATERAL unnest(array_massage_english(items.meaning_texts)) AS e_text
WHERE items.type <> 'radical';
-- Here's me throwing some spaghetti at the index wall. The trigram ops do NOT work
-- on the Japanese text column, but the similarity() function *does* work on them when they are cast to
-- byte strings, so I do both b-tree and gin trigram ops on both english & Japanese here
CREATE INDEX searchable_items_e_text_idx ON searchable_items USING GIN (e_text gin_trgm_ops);
CREATE INDEX searchable_items_e_text_btree_idx ON searchable_items(e_text);
CREATE INDEX searchable_items_j_text_idx ON searchable_items USING GIN (j_text gin_trgm_ops);
CREATE INDEX searchable_items_j_text_btree_idx ON searchable_items(j_text);
-- Gotta initially refresh (fresh?) that view with some data!
-- NOTE(review): CREATE MATERIALIZED VIEW populates the view at creation time
-- unless WITH NO DATA was specified, so this initial refresh is likely
-- redundant — harmless, but confirm before relying on it.
refresh materialized view searchable_items;
-- Here's the main search query itself, pulled out of a Ruby file that executes it as SQL with
-- every `:`-led token being a bind parameter of queries or tokens or tokens wrapped in '%' for like
-- operators
-- Outer layer: attach a per-user learnings count and order the deduplicated
-- candidates by their best similarity rank.
select *,
(select count(*) from learnings where user_id = :user_id and item_id = o2.id) learnings_count
from (
-- NOTE(review): DISTINCT ON without a matching ORDER BY picks an arbitrary
-- surviving row when an item matched more than one branch below, so the
-- chosen j_text/e_text (and therefore rank) is nondeterministic — confirm
-- that is acceptable for scoring.
select distinct on (o1.id) o1.*,
-- Rank = the better of the English and Japanese trigram similarities; the
-- bytea::text round-trip makes similarity() usable on Japanese strings even
-- though trigram *indexes* don't help there (see comment at the index DDL).
greatest(
(case when :e_tokens_present then similarity(o1.e_text, :e_query) else 0 end),
(case when :j_tokens_present then similarity(o1.j_text::bytea::text, :j_query::bytea::text) else 0 end)
) rank
-- Candidate pool: six UNION branches, each independently LIMITed. UNION (not
-- UNION ALL) dedups identical full rows across branches before DISTINCT ON
-- collapses the rest.
from ((
-- Exact Matches on whole query
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where (
(:j_query_present and j_text = :j_query)
or
(:e_query_present and e_text = :e_query)
)
limit :exact_limit
) union (
-- LIKE matches on any one space-delimited token in the query
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where (
(:j_tokens_present and j_text like any (array[:j_likes]))
or
(:e_tokens_present and e_text like any (array[:e_likes]))
)
limit :like_limit
) union (
-- Similarity for English queries based on pg_trgm similarity & index
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where :e_query_present and e_text % :e_query
limit :e_similar_limit
) union (
-- Strip conjugation from verbs & adjectives
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where (
-- NOTE(review): the stray spaces inside this array literal (after 'godan
-- verb,' etc.) should be harmless — whitespace around unquoted array
-- elements is ignored by the array-literal parser — but verify the parsed
-- elements match the parts_of_speech values exactly.
parts_of_speech && '{する verb,godan verb, ichidan verb,い adjective}'::varchar[]
and
j_text like any (array[:j_stems])
)
limit :stems_limit
) union (
-- Alternate english definitions (user defined + official)
-- These last two branches spell out every searchable_items column so the
-- matched definitions/spellings text can be substituted for e_text/j_text
-- while keeping the column list UNION-compatible with the branches above.
select distinct on (searchable_items.id)
searchable_items.id,
searchable_items.type,
searchable_items.wanikani_api_url,
searchable_items.wanikani_app_url,
searchable_items.wanikani_level,
searchable_items.wanikani_slug,
searchable_items.text,
searchable_items.parts_of_speech,
searchable_items.wanikani_component_item_ids,
searchable_items.wanikani_created_at,
searchable_items.wanikani_updated_at,
searchable_items.created_at,
searchable_items.updated_at,
searchable_items.wanikani_id,
searchable_items.meaning_texts,
searchable_items.reading_texts,
searchable_items.source,
searchable_items.jmdict_id,
searchable_items.jmdict_updated_on,
searchable_items.priority,
searchable_items.alt_texts,
searchable_items.usually_written_using_kana_alone,
searchable_items.common_word,
searchable_items.conjugation_code,
searchable_items.j_text,
massage_english(definitions.text) e_text --- And then there's Maude!
from searchable_items
join definitions on searchable_items.id = definitions.item_id and (definitions.official or definitions.user_id = :user_id)
where (
:e_tokens_present
and
(
definitions.text = :query
or
massage_english(definitions.text) = any (array[:e_tokens])
)
)
limit :english_definition_limit
) union (
-- Alternate Japanese spellings (user defined + official)
select distinct on (searchable_items.id)
searchable_items.id,
searchable_items.type,
searchable_items.wanikani_api_url,
searchable_items.wanikani_app_url,
searchable_items.wanikani_level,
searchable_items.wanikani_slug,
searchable_items.text,
searchable_items.parts_of_speech,
searchable_items.wanikani_component_item_ids,
searchable_items.wanikani_created_at,
searchable_items.wanikani_updated_at,
searchable_items.created_at,
searchable_items.updated_at,
searchable_items.wanikani_id,
searchable_items.meaning_texts,
searchable_items.reading_texts,
searchable_items.source,
searchable_items.jmdict_id,
searchable_items.jmdict_updated_on,
searchable_items.priority,
searchable_items.alt_texts,
searchable_items.usually_written_using_kana_alone,
searchable_items.common_word,
searchable_items.conjugation_code,
spellings.text j_text, --- And then there's Maude!
searchable_items.e_text
from searchable_items
join spellings on searchable_items.id = spellings.item_id and (spellings.official or spellings.user_id = :user_id)
where (
:j_tokens_present
and
(
spellings.text = :query
or
spellings.text = any (array[:j_tokens])
)
)
limit :japanese_spelling_limit
)) o1
) o2
order by o2.rank desc
limit :results_limit
-- There are edge cases that are expensive to query but still important
-- to cover, which I'm lumping in this query and only executing it when
-- the first query's results are garbage (in Ruby, I score every query result
-- based on a dozen factors in order to sort them more accurately and if they're bad
-- and there's time remaining on the self-imposed search SLA, I will run this query too
--
-- Note I pulled a couple out of here and into the main query after speeding them up significantly
-- so right now there is only a Japanese similarity query (again, b/c trigram ops are slow on byte
-- strings and Japanese isn't strictly supported)
-- Fallback query for near-miss Japanese spellings: pg_trgm can't index
-- Japanese text directly, so candidates are found with a bounded Levenshtein
-- distance (at most one edit: levenshtein_less_equal caps at max_d + 1, so
-- "< 2" means distance <= 1) and then ranked by similarity() over the
-- bytea::text casts, same as the main query. Requires the fuzzystrmatch
-- extension for levenshtein_less_equal.
SELECT
*,
(SELECT count(*)
FROM learnings
WHERE user_id = :user_id AND item_id = misspellings.id) learnings_count
FROM (
SELECT DISTINCT ON (searchable_items.id)
searchable_items.*,
(CASE
WHEN :j_tokens_present
THEN similarity(j_text::bytea::text, :j_query::bytea::text)
ELSE 0
END) rank
FROM searchable_items
WHERE :j_query_present
AND levenshtein_less_equal(j_text, :j_query, 1) < 2
LIMIT :j_similar_limit
) misspellings
ORDER BY misspellings.rank DESC
LIMIT :results_limit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment