Skip to content

Instantly share code, notes, and snippets.

@shikanime
Last active August 31, 2023 16:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shikanime/4434d0e72f64c110a7ce0eebc35e96bc to your computer and use it in GitHub Desktop.
Save shikanime/4434d0e72f64c110a7ce0eebc35e96bc to your computer and use it in GitHub Desktop.
Exploring similarity search based on text embedding. The model, seamlessly integrated into the execution engine, offers user-friendly functionality and a performance advantage over any remote solutions.
CREATE OR REPLACE MODEL
search.universal_sentence_encoder_large OPTIONS(model_type='tensorflow',
model_path='gs://shikanime-studio-labs/universal-sentence-encoder-multilingual-large/*')
CREATE OR REPLACE TABLE
search.text_embeddings AS (
WITH
samples AS (
SELECT
SKU AS id,
TRIM(LOWER(name)) AS inputs
FROM
`data-to-insights.ecommerce.products` )
SELECT
id,
inputs AS text,
outputs AS embeddings,
FROM
ML.PREDICT(MODEL search.universal_sentence_encoder_large,
TABLE samples ))
WITH
sample AS (
SELECT
*
FROM
`search.text_tops`
WHERE
id = "GGOEWALJ082713" )
SELECT
b.id,
b.text
FROM
sample,
UNNEST(candidate_ids) AS a
JOIN
`search.text_embeddings` b
ON
a.candidate_id = b.id
CREATE OR REPLACE TABLE
search.text_similarities AS (
WITH
pairs AS (
SELECT
a.id AS id,
a.embeddings AS embeddings,
b.id AS candidate_id,
b.embeddings AS candidate_embeddings,
FROM
`search.text_embeddings` a
INNER JOIN
`search.text_embeddings` b
ON
a.id < b.id )
SELECT
id,
candidate_id,
ML.DISTANCE( embeddings,
candidate_embeddings,
"COSINE" ) AS cosine_similarity
FROM
pairs)
CREATE OR REPLACE TABLE
search.text_tops AS (
SELECT
id,
ARRAY_AGG(STRUCT(candidate_id,
cosine_similarity)
ORDER BY
cosine_similarity ASC
LIMIT
10) AS candidate_ids
FROM
`search.text_similarities`
GROUP BY
1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment