Skip to content

Instantly share code, notes, and snippets.

@jthandy
Created February 21, 2019 18:29
Show Gist options
  • Save jthandy/7fd9d91166d5aaea2a389cbf253d3db8 to your computer and use it in GitHub Desktop.
Save jthandy/7fd9d91166d5aaea2a389cbf253d3db8 to your computer and use it in GitHub Desktop.
{% macro source_select_sql(relation, column_name) %}
select {{column_name}} as f from {{relation}}
{% endmacro %}
{% macro single_pass_stats(relation, column_name) %}
{%- call statement('single_pass_column_stats', fetch_result=True) -%}
with source as ( {{ fishtown_internal_analytics.source_select_sql(relation, column_name) }} ),
single_pass_stats as (
select
sum(case when f is null then 1 end)::varchar as null_values,
count(f)::varchar as not_null_values,
sum(case when f = '' then 1 end)::varchar as empty_strings,
count(distinct f)::varchar as distinct_values,
min(f ignore nulls)::varchar as max_value,
max(f ignore nulls)::varchar as min_value
from source
)
select * from single_pass_stats
{%- endcall -%}
{% endmacro %}
{% macro multi_pass_stats(relation, column_name) %}
{%- call statement('multi_pass_column_stats', fetch_result=True) -%}
with source as ( {{ fishtown_internal_analytics.source_select_sql(relation, column_name) }} ),
is_unique as (
select
'is unique' as statistic,
1 as order,
case when count(*) > 1 then 'no' else 'yes' end as value
from source
where f is not null
group by 1, 2, f
order by count(*) desc
limit 1
),
longest_value as (
select 'longest value', 2, f
from source
where f is not null
order by length(f) desc
limit 1
),
shortest_value as (
select 'shortest value', 3, f
from source
where f is not null
order by length(f)
limit 1
),
stats as (
select * from is_unique
union all
select * from longest_value
union all
select * from shortest_value
)
select statistic, value from stats order by "order"
{%- endcall -%}
{% endmacro %}
{% macro top_values(relation, column_name) %}
{%- call statement('top_column_values', fetch_result=True) -%}
with source as ( {{ fishtown_internal_analytics.source_select_sql(relation, column_name) }} ),
top_five_by_frequency as (
select
f as value,
count(*)::varchar as frequency
from source
where f is not null
group by 1
order by count(*) desc
limit 5
)
select * from top_five_by_frequency
{%- endcall -%}
{% endmacro %}
{% macro describe_categorical_column(relation, column_name) %}
{% if not execute %}
{{return('')}}
{% endif %}
{{ fishtown_internal_analytics.single_pass_stats(relation, column_name) }}
{{ fishtown_internal_analytics.multi_pass_stats(relation, column_name) }}
{{ fishtown_internal_analytics.top_values(relation, column_name) }}
{{ log(
"Null values: " ~
load_result('single_pass_column_stats')['data'][0][0], true
)}}
{{ log(
"Not null values: " ~
load_result('single_pass_column_stats')['data'][0][1], true
)}}
{{ log(
"Is unique: " ~
load_result('multi_pass_column_stats')['data'][0][1], true
)}}
{{ log(
"Distinct values: " ~
load_result('single_pass_column_stats')['data'][0][3], true
)}}
{{ log(
"Empty strings: " ~
load_result('single_pass_column_stats')['data'][0][2], true
)}}
{{ log(
"Min value: " ~
load_result('single_pass_column_stats')['data'][0][5], true
)}}
{{ log(
"Max value: " ~
load_result('single_pass_column_stats')['data'][0][4], true
)}}
{{ log(
"Shortest value: " ~
load_result('multi_pass_column_stats')['data'][2][1], true
)}}
{{ log(
"Longest value: " ~
load_result('multi_pass_column_stats')['data'][1][1], true
)}}
{{ log("", true)}}
{{ log("=====================================================", true)}}
{{ log("", true)}}
{{ log("Top values:", true)}}
{% set _ = load_result('top_column_values')['table']
.print_table(output=info_log_stream)
%}
{% endmacro %}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment