Skip to content

Instantly share code, notes, and snippets.

@shikanime
Last active August 31, 2023 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shikanime/68ad4e5e9c08812e7db6b41b3a54a1ff to your computer and use it in GitHub Desktop.
Save shikanime/68ad4e5e9c08812e7db6b41b3a54a1ff to your computer and use it in GitHub Desktop.
Conducting experimentation involving similarity search through text embedding using Vertex AI. To sum up, encountering a familiar issue seen in several data products that incorporate Vertex AI—experiencing excessively high network overhead.
CREATE OR REPLACE MODEL
search.vertex_ai_text_embeddings REMOTE
WITH CONNECTION `us.vertex_ai` OPTIONS(REMOTE_SERVICE_TYPE="CLOUD_AI_TEXT_EMBEDDING_MODEL_V1")
CREATE OR REPLACE TABLE
search.text_embeddings AS (
WITH
samples AS (
SELECT
SKU AS id,
TRIM(LOWER(name)) AS content
FROM
`data-to-insights.ecommerce.products`)
SELECT
*
FROM
ML.GENERATE_TEXT_EMBEDDING( MODEL `search.vertex_ai_text_embeddings`,
TABLE samples,
STRUCT(TRUE AS flatten_json_output)))
WITH
sample AS (
SELECT
*
FROM
`search.text_tops`
WHERE
id = "GGOEWALJ082713" )
SELECT
b.id,
b.content
FROM
sample,
UNNEST(candidate_ids) AS a
JOIN
`search.text_embeddings` b
ON
a.candidate_id = b.id
CREATE OR REPLACE TABLE
search.text_similarities AS (
WITH
pairs AS (
SELECT
a.id AS id,
a.text_embedding AS embedding,
b.id AS candidate_id,
b.text_embedding AS candidate_embedding,
FROM
`search.text_embeddings` a
INNER JOIN
`search.text_embeddings` b
ON
a.id < b.id )
SELECT
id,
candidate_id,
ML.DISTANCE( embedding,
candidate_embedding,
"COSINE" ) AS cosine_similarity
FROM
pairs)
CREATE OR REPLACE TABLE
search.text_tops AS (
SELECT
id,
ARRAY_AGG(STRUCT(candidate_id,
cosine_similarity)
ORDER BY
cosine_similarity ASC
LIMIT
10) AS candidate_ids
FROM
`search.text_similarities`
GROUP BY
1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment