lgavish/snowflake_field_health.sql Secret

## snowflake_field_health.sql
SELECT
    DATE_TRUNC('HOUR', created_on) as bucket_start,
    DATEADD(hr, 1, DATE_TRUNC('HOUR', created_on)) as bucket_end,

    COUNT(*) as row_count,

    -- string field
    COUNT(account_id) / CAST(COUNT(*) AS NUMERIC) as account_id___completeness,
    COUNT(DISTINCT account_id) as account_id___approx_distinct_count,
    COUNT(DISTINCT account_id) / CAST(COUNT(*) AS NUMERIC) as account_id___approx_distinctness,
    AVG(LENGTH(account_id)) as account_id___mean_length,
    MAX(LENGTH(account_id)) as account_id___max_length,
    MIN(LENGTH(account_id)) as account_id___min_length,
    STDDEV(CAST(LENGTH(account_id) as double)) as account_id___std_length,
    SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([-+]?[0-9]+)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_int_rate,
    SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_number_rate,
    SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_uuid_rate,
    SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^(\\s+)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_all_spaces_rate,
    SUM(IFF(UPPER(account_id) IN ('NULL', 'NONE', 'NIL', 'NOTHING'), 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_null_keyword_rate,

    -- numeric field
    COUNT(num_of_users) / CAST(COUNT(*) AS NUMERIC) as num_of_users___completeness,
    SUM(IFF(num_of_users = 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as num_of_users___zero_rate,
    SUM(IFF(num_of_users < 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as num_of_users___negative_rate,
    COUNT(DISTINCT num_of_users) / CAST(COUNT(*) AS NUMERIC) as num_of_users___approx_distinctness,
    AVG(num_of_users) as num_of_users___numeric_mean,
    MIN(num_of_users) as num_of_users___numeric_min,
    MAX(num_of_users) as num_of_users___numeric_max,
    STDDEV(CAST(num_of_users as double)) as num_of_users___numeric_std,
    ARRAY_CONSTRUCT(APPROX_PERCENTILE(num_of_users, 0.00), APPROX_PERCENTILE(num_of_users, 0.20), APPROX_PERCENTILE(num_of_users, 0.40), APPROX_PERCENTILE(num_of_users, 0.60), APPROX_PERCENTILE(num_of_users, 0.80), APPROX_PERCENTILE(num_of_users, 1.00)) as num_of_users___approx_quantiles

FROM analytics.prod.client_hub
WHERE
    DATE_TRUNC('HOUR', measurement_timestamp) >= DATEADD(day, -1, CURRENT_TIMESTAMP())
GROUP BY bucket_start, bucket_end
ORDER BY bucket_start ASC;
	SELECT
	DATE_TRUNC('HOUR', created_on) as bucket_start,
	DATEADD(hr, 1, DATE_TRUNC('HOUR', created_on)) as bucket_end,

	COUNT(*) as row_count,

	-- string field
	COUNT(account_id) / CAST(COUNT(*) AS NUMERIC) as account_id___completeness,
	COUNT(DISTINCT account_id) as account_id___approx_distinct_count,
	COUNT(DISTINCT account_id) / CAST(COUNT(*) AS NUMERIC) as account_id___approx_distinctness,
	AVG(LENGTH(account_id)) as account_id___mean_length,
	MAX(LENGTH(account_id)) as account_id___max_length,
	MIN(LENGTH(account_id)) as account_id___min_length,
	STDDEV(CAST(LENGTH(account_id) as double)) as account_id___std_length,
	SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([-+]?[0-9]+)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_int_rate,
	SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([-+]?[0-9][.]?[0-9]+([eE][-+]?[0-9]+)?)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT() AS NUMERIC) as account_id___text_number_rate,
	SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_uuid_rate,
	SUM(IFF(REGEXP_COUNT(TO_VARCHAR(account_id), '^(\\s+)$', 1, 'i') != 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_all_spaces_rate,
	SUM(IFF(UPPER(account_id) IN ('NULL', 'NONE', 'NIL', 'NOTHING'), 1, 0)) / CAST(COUNT(*) AS NUMERIC) as account_id___text_null_keyword_rate,

	-- numeric field
	COUNT(num_of_users) / CAST(COUNT(*) AS NUMERIC) as num_of_users___completeness,
	SUM(IFF(num_of_users = 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as num_of_users___zero_rate,
	SUM(IFF(num_of_users < 0, 1, 0)) / CAST(COUNT(*) AS NUMERIC) as num_of_users___negative_rate,
	COUNT(DISTINCT num_of_users) / CAST(COUNT(*) AS NUMERIC) as num_of_users___approx_distinctness,
	AVG(num_of_users) as num_of_users___numeric_mean,
	MIN(num_of_users) as num_of_users___numeric_min,
	MAX(num_of_users) as num_of_users___numeric_max,
	STDDEV(CAST(num_of_users as double)) as num_of_users___numeric_std,
	ARRAY_CONSTRUCT(APPROX_PERCENTILE(num_of_users, 0.00), APPROX_PERCENTILE(num_of_users, 0.20), APPROX_PERCENTILE(num_of_users, 0.40), APPROX_PERCENTILE(num_of_users, 0.60), APPROX_PERCENTILE(num_of_users, 0.80), APPROX_PERCENTILE(num_of_users, 1.00)) as num_of_users___approx_quantiles

	FROM analytics.prod.client_hub
	WHERE
	DATE_TRUNC('HOUR', measurement_timestamp) >= DATEADD(day, -1, CURRENT_TIMESTAMP())
	GROUP BY bucket_start, bucket_end
	ORDER BY bucket_start ASC;