Skip to content

Instantly share code, notes, and snippets.

@lgavish
Created May 7, 2021 18:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lgavish/4208c1c72f6ebd453d5498e9b86ca226 to your computer and use it in GitHub Desktop.
Save lgavish/4208c1c72f6ebd453d5498e9b86ca226 to your computer and use it in GitHub Desktop.
Extracting field health metrics from Snowflake
SELECT
DATE_TRUNC('HOUR', created_on) as bucket_start,
DATEADD(hr, 1, DATE_TRUNC('HOUR', created_on)) as bucket_end,
COUNT(*) as row_count,
-- string field
COUNT(account_id) / CAST(COUNT(*) AS NUMERIC) as account_id___completeness,
COUNT(DISTINCT account_id) as account_id___approx_distinct_count,
COUNT(DISTINCT account_id) / CAST(COUNT(*) AS NUMERIC) as account_id___approx_distinctness,
AVG(LENGTH(account_id)) as account_id___mean_length,
MAX(LENGTH(account_id)) as account_id___max_length,
MIN(LENGTH(account_id)) as account_id___min_length,
STDDEV(CAST(LENGTH(account_id) as double)) as account_id___std_length,
SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([-+]?[0-9]+)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_int_rate,
SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_number_rate,
SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_uuid_rate,
SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^(\\s+)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_all_spaces_rate,
SUM(IFF(UPPER(account_id) IN ('NULL', 'NONE', 'NIL', 'NOTHING'), 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_null_keyword_rate,
-- numeric field
COUNT(num_of_users) / CAST(COUNT(*) AS NUMERIC) as num_of_users___completeness,
SUM(IFF(num_of_users = 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as num_of_users___zero_rate,
SUM(IFF(num_of_users < 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as num_of_users___negative_rate,
COUNT(DISTINCT num_of_users) / CAST(COUNT(*) AS NUMERIC) as num_of_users___approx_distinctness,
AVG(num_of_users) as num_of_users___numeric_mean,
MIN(num_of_users) as num_of_users___numeric_min,
MAX(num_of_users) as num_of_users___numeric_max,
STDDEV(CAST(num_of_users as double)) as num_of_users___numeric_std,
ARRAY_CONSTRUCT(APPROX_PERCENTILE(num_of_users, 0.00), APPROX_PERCENTILE(num_of_users, 0.20), APPROX_PERCENTILE(num_of_users, 0.40), APPROX_PERCENTILE(num_of_users, 0.60), APPROX_PERCENTILE(num_of_users, 0.80), APPROX_PERCENTILE(num_of_users, 1.00)) as num_of_users___approx_quantiles
FROM analytics.prod.client_hub
WHERE
DATE_TRUNC('HOUR', measurement_timestamp) >= DATEADD(day, -1, CURRENT_TIMESTAMP())
GROUP BY bucket_start, bucket_end
ORDER BY bucket_start ASC;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment