Skip to content

Instantly share code, notes, and snippets.

@searls
Created December 29, 2020 13:32
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save searls/c0b7ffedbcf7261b654c5cb492250a3a to your computer and use it in GitHub Desktop.
Save searls/c0b7ffedbcf7261b654c5cb492250a3a to your computer and use it in GitHub Desktop.
KameSame's December 2020 search overhaul
-- This all starts with some functions and a *materialized* postgres view that unnests several
-- arrays of strings of definitions into flattened rows that are easier to search. Fun fact:
-- you can even create indexes on materialized views' columns! They'll refresh whenever the view
-- is refreshed (which in my case is every time that we pull new dictionary data from WaniKani or JMDICT)
-- This function will take an array of strings and convert all the double-width alphanumeric characters
-- and normalize them as half-width. That way a search query can be massaged from "OK" to "ok" easily
-- Normalizes an array of strings by translating double-width (full-width)
-- alphanumeric characters into half-width lowercase ASCII, so a query typed
-- as "OK" can match "ok". Plain ASCII characters pass through untouched.
--
-- Input:  varchar[] (NULL in -> NULL out via RETURNS NULL ON NULL INPUT;
--         NULL *elements* come back NULL from translate()).
-- Output: varchar[] of the same length, element order preserved.
CREATE OR REPLACE FUNCTION array_hankakufy_romaji(character varying[])
RETURNS character varying[]
AS
$$
DECLARE
arrTexts ALIAS FOR $1;
retVal character varying[];
BEGIN
-- Bug fix: an empty array has NULL bounds, and a plpgsql FOR loop with a
-- NULL lower bound raises "lower bound of FOR loop cannot be null".
-- Short-circuit and hand the empty array straight back.
IF array_length(arrTexts, 1) IS NULL THEN
RETURN arrTexts;
END IF;
FOR I IN array_lower(arrTexts, 1)..array_upper(arrTexts, 1) LOOP
retVal[I] := translate(arrTexts[I],
'0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
'0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz'
);
END LOOP;
RETURN retVal;
END;
$$
LANGUAGE plpgsql
-- translate() with constant from/to strings depends only on its input, so the
-- correct volatility category is IMMUTABLE (was STABLE); this also permits
-- using the function in expression indexes.
IMMUTABLE
RETURNS NULL ON NULL INPUT;
-- Similarly, this function normalizes english definitions by lowercasing them and removing extra spaces between words
-- Normalizes an English definition for matching: lowercases it, strips every
-- character that is not [a-z0-9] or whitespace, then collapses whitespace runs
-- to a single space. e.g. 'Hello,  WORLD!' -> 'hello world'.
-- NULL input returns NULL via RETURNS NULL ON NULL INPUT.
-- NOTE(review): leading/trailing spaces survive normalization (e.g. '- foo'
-- -> ' foo'); presumably the Ruby side massages query tokens the same way —
-- confirm before adding a trim here, since the materialized view depends on
-- this exact output.
CREATE OR REPLACE FUNCTION massage_english(character varying)
RETURNS character varying
AS
$$
BEGIN
return regexp_replace(
regexp_replace(lower($1), '[^a-z0-9\s]+', '','g'),
'\s+', ' ', 'g'
);
END;
$$
LANGUAGE plpgsql
-- lower() and regexp_replace() are immutable, so the correct volatility
-- category is IMMUTABLE (was STABLE); this lets the planner fold calls on
-- constants and allows expression indexes over massage_english(...).
IMMUTABLE
RETURNS NULL ON NULL INPUT;
-- And this function just runs an array through the above function
-- Applies massage_english() to each element of an array of English
-- definitions, preserving element order.
-- NULL input returns NULL via RETURNS NULL ON NULL INPUT.
CREATE OR REPLACE FUNCTION array_massage_english(character varying[])
RETURNS character varying[]
AS
$$
DECLARE
arrTexts ALIAS FOR $1;
retVal character varying[];
BEGIN
-- Bug fix: an empty array has NULL bounds, and a plpgsql FOR loop with a
-- NULL lower bound raises "lower bound of FOR loop cannot be null".
-- Short-circuit and hand the empty array straight back.
IF array_length(arrTexts, 1) IS NULL THEN
RETURN arrTexts;
END IF;
FOR I IN array_lower(arrTexts, 1)..array_upper(arrTexts, 1) LOOP
retVal[I] := massage_english(arrTexts[I]);
END LOOP;
RETURN retVal;
END;
$$
LANGUAGE plpgsql
-- massage_english() is immutable-in-fact (lower/regexp_replace only), so this
-- wrapper is IMMUTABLE too (was STABLE).
IMMUTABLE
RETURNS NULL ON NULL INPUT;
-- Here's the real view! It will unnest all Japanese spellings and readings + all English definitions
-- into a mega table where either/both are searchable and can load the entire row into an `Item` model
-- One row per (item, normalized Japanese spelling/reading, normalized English
-- definition) pair. Both text columns are pre-massaged by the functions above
-- so exact/LIKE/trigram searches hit indexed, normalized values.
CREATE MATERIALIZED VIEW searchable_items AS
SELECT
items.*,
j_text,
e_text
FROM items
CROSS JOIN LATERAL unnest(
array_hankakufy_romaji(
array_prepend(items.text, array_cat(items.alt_texts, items.reading_texts))
)
) AS j_text
CROSS JOIN LATERAL unnest(array_massage_english(items.meaning_texts)) AS e_text
WHERE items.type <> 'radical';
-- Here's me throwing some spaghetti at the index wall. The trigram ops do NOT work
-- on the Japanese text column, but the similarity() function *does* work on them when they are cast to
-- byte strings, so I do both b-tree and gin trigram ops on both english & Japanese here
CREATE INDEX searchable_items_e_text_idx ON searchable_items USING GIN (e_text gin_trgm_ops);
CREATE INDEX searchable_items_e_text_btree_idx ON searchable_items(e_text);
CREATE INDEX searchable_items_j_text_idx ON searchable_items USING GIN (j_text gin_trgm_ops);
CREATE INDEX searchable_items_j_text_btree_idx ON searchable_items(j_text);
-- Gotta initially refresh (fresh?) that view with some data!
-- NOTE(review): CREATE MATERIALIZED VIEW populates the view at creation time
-- unless WITH NO DATA was specified, so this initial refresh is likely
-- redundant — harmless, but confirm before relying on it.
refresh materialized view searchable_items;
-- Here's the main search query itself, pulled out of a Ruby file that executes it as SQL with
-- every `:`-led token being a bind parameter of queries or tokens or tokens wrapped in '%' for like
-- operators
-- Outer layer: attach a per-user learnings count and order the deduplicated
-- candidates by their best similarity rank.
select *,
(select count(*) from learnings where user_id = :user_id and item_id = o2.id) learnings_count
from (
-- NOTE(review): DISTINCT ON without a matching ORDER BY picks an arbitrary
-- surviving row when an item matched more than one branch below, so the
-- chosen j_text/e_text (and therefore rank) is nondeterministic — confirm
-- that is acceptable for scoring.
select distinct on (o1.id) o1.*,
-- Rank = the better of the English and Japanese trigram similarities; the
-- bytea::text round-trip makes similarity() usable on Japanese strings even
-- though trigram *indexes* don't help there (see comment at the index DDL).
greatest(
(case when :e_tokens_present then similarity(o1.e_text, :e_query) else 0 end),
(case when :j_tokens_present then similarity(o1.j_text::bytea::text, :j_query::bytea::text) else 0 end)
) rank
-- Candidate pool: six UNION branches, each independently LIMITed. UNION (not
-- UNION ALL) dedups identical full rows across branches before DISTINCT ON
-- collapses the rest.
from ((
-- Exact Matches on whole query
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where (
(:j_query_present and j_text = :j_query)
or
(:e_query_present and e_text = :e_query)
)
limit :exact_limit
) union (
-- LIKE matches on any one space-delimited token in the query
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where (
(:j_tokens_present and j_text like any (array[:j_likes]))
or
(:e_tokens_present and e_text like any (array[:e_likes]))
)
limit :like_limit
) union (
-- Similarity for English queries based on pg_trgm similarity & index
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where :e_query_present and e_text % :e_query
limit :e_similar_limit
) union (
-- Strip conjugation from verbs & adjectives
select distinct on (searchable_items.id) searchable_items.*
from searchable_items
where (
-- NOTE(review): the stray spaces inside this array literal (after 'godan
-- verb,' etc.) should be harmless — whitespace around unquoted array
-- elements is ignored by the array-literal parser — but verify the parsed
-- elements match the parts_of_speech values exactly.
parts_of_speech && '{する verb,godan verb, ichidan verb,い adjective}'::varchar[]
and
j_text like any (array[:j_stems])
)
limit :stems_limit
) union (
-- Alternate english definitions (user defined + official)
-- These last two branches spell out every searchable_items column so the
-- matched definitions/spellings text can be substituted for e_text/j_text
-- while keeping the column list UNION-compatible with the branches above.
select distinct on (searchable_items.id)
searchable_items.id,
searchable_items.type,
searchable_items.wanikani_api_url,
searchable_items.wanikani_app_url,
searchable_items.wanikani_level,
searchable_items.wanikani_slug,
searchable_items.text,
searchable_items.parts_of_speech,
searchable_items.wanikani_component_item_ids,
searchable_items.wanikani_created_at,
searchable_items.wanikani_updated_at,
searchable_items.created_at,
searchable_items.updated_at,
searchable_items.wanikani_id,
searchable_items.meaning_texts,
searchable_items.reading_texts,
searchable_items.source,
searchable_items.jmdict_id,
searchable_items.jmdict_updated_on,
searchable_items.priority,
searchable_items.alt_texts,
searchable_items.usually_written_using_kana_alone,
searchable_items.common_word,
searchable_items.conjugation_code,
searchable_items.j_text,
massage_english(definitions.text) e_text --- And then there's Maude!
from searchable_items
join definitions on searchable_items.id = definitions.item_id and (definitions.official or definitions.user_id = :user_id)
where (
:e_tokens_present
and
(
definitions.text = :query
or
massage_english(definitions.text) = any (array[:e_tokens])
)
)
limit :english_definition_limit
) union (
-- Alternate Japanese spellings (user defined + official)
select distinct on (searchable_items.id)
searchable_items.id,
searchable_items.type,
searchable_items.wanikani_api_url,
searchable_items.wanikani_app_url,
searchable_items.wanikani_level,
searchable_items.wanikani_slug,
searchable_items.text,
searchable_items.parts_of_speech,
searchable_items.wanikani_component_item_ids,
searchable_items.wanikani_created_at,
searchable_items.wanikani_updated_at,
searchable_items.created_at,
searchable_items.updated_at,
searchable_items.wanikani_id,
searchable_items.meaning_texts,
searchable_items.reading_texts,
searchable_items.source,
searchable_items.jmdict_id,
searchable_items.jmdict_updated_on,
searchable_items.priority,
searchable_items.alt_texts,
searchable_items.usually_written_using_kana_alone,
searchable_items.common_word,
searchable_items.conjugation_code,
spellings.text j_text, --- And then there's Maude!
searchable_items.e_text
from searchable_items
join spellings on searchable_items.id = spellings.item_id and (spellings.official or spellings.user_id = :user_id)
where (
:j_tokens_present
and
(
spellings.text = :query
or
spellings.text = any (array[:j_tokens])
)
)
limit :japanese_spelling_limit
)) o1
) o2
order by o2.rank desc
limit :results_limit
-- There are edge cases that are expensive to query but still important
-- to cover, which I'm lumping in this query and only executing it when
-- the first query's results are garbage (in Ruby, I score every query result
-- based on a dozen factors in order to sort them more accurately and if they're bad
-- and there's time remaining on the self-imposed search SLA, I will run this query too
--
-- Note I pulled a couple out of here and into the main query after speeding them up significantly
-- so right now there is only a Japanese similarity query (again, b/c trigram ops are slow on byte
-- strings and Japanese isn't strictly supported)
-- Fallback query for near-miss Japanese spellings: pg_trgm can't index
-- Japanese text directly, so candidates are found with a bounded Levenshtein
-- distance (at most one edit: levenshtein_less_equal caps at max_d + 1, so
-- "< 2" means distance <= 1) and then ranked by similarity() over the
-- bytea::text casts, same as the main query. Requires the fuzzystrmatch
-- extension for levenshtein_less_equal.
SELECT
*,
(SELECT count(*)
FROM learnings
WHERE user_id = :user_id AND item_id = misspellings.id) learnings_count
FROM (
SELECT DISTINCT ON (searchable_items.id)
searchable_items.*,
(CASE
WHEN :j_tokens_present
THEN similarity(j_text::bytea::text, :j_query::bytea::text)
ELSE 0
END) rank
FROM searchable_items
WHERE :j_query_present
AND levenshtein_less_equal(j_text, :j_query, 1) < 2
LIMIT :j_similar_limit
) misspellings
ORDER BY misspellings.rank DESC
LIMIT :results_limit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment