Created
February 21, 2019 18:29
-
-
Save jthandy/7fd9d91166d5aaea2a389cbf253d3db8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{% macro source_select_sql(relation, column_name) %} | |
select {{column_name}} as f from {{relation}} | |
{% endmacro %} | |
{% macro single_pass_stats(relation, column_name) %} | |
{%- call statement('single_pass_column_stats', fetch_result=True) -%} | |
with source as ( {{ fishtown_internal_analytics.source_select_sql(relation, column_name) }} ), | |
single_pass_stats as ( | |
select | |
sum(case when f is null then 1 end)::varchar as null_values, | |
count(f)::varchar as not_null_values, | |
sum(case when f = '' then 1 end)::varchar as empty_strings, | |
count(distinct f)::varchar as distinct_values, | |
min(f ignore nulls)::varchar as max_value, | |
max(f ignore nulls)::varchar as min_value | |
from source | |
) | |
select * from single_pass_stats | |
{%- endcall -%} | |
{% endmacro %} | |
{% macro multi_pass_stats(relation, column_name) %} | |
{%- call statement('multi_pass_column_stats', fetch_result=True) -%} | |
with source as ( {{ fishtown_internal_analytics.source_select_sql(relation, column_name) }} ), | |
is_unique as ( | |
select | |
'is unique' as statistic, | |
1 as order, | |
case when count(*) > 1 then 'no' else 'yes' end as value | |
from source | |
where f is not null | |
group by 1, 2, f | |
order by count(*) desc | |
limit 1 | |
), | |
longest_value as ( | |
select 'longest value', 2, f | |
from source | |
where f is not null | |
order by length(f) desc | |
limit 1 | |
), | |
shortest_value as ( | |
select 'shortest value', 3, f | |
from source | |
where f is not null | |
order by length(f) | |
limit 1 | |
), | |
stats as ( | |
select * from is_unique | |
union all | |
select * from longest_value | |
union all | |
select * from shortest_value | |
) | |
select statistic, value from stats order by "order" | |
{%- endcall -%} | |
{% endmacro %} | |
{% macro top_values(relation, column_name) %} | |
{%- call statement('top_column_values', fetch_result=True) -%} | |
with source as ( {{ fishtown_internal_analytics.source_select_sql(relation, column_name) }} ), | |
top_five_by_frequency as ( | |
select | |
f as value, | |
count(*)::varchar as frequency | |
from source | |
where f is not null | |
group by 1 | |
order by count(*) desc | |
limit 5 | |
) | |
select * from top_five_by_frequency | |
{%- endcall -%} | |
{% endmacro %} | |
{% macro describe_categorical_column(relation, column_name) %} | |
{% if not execute %} | |
{{return('')}} | |
{% endif %} | |
{{ fishtown_internal_analytics.single_pass_stats(relation, column_name) }} | |
{{ fishtown_internal_analytics.multi_pass_stats(relation, column_name) }} | |
{{ fishtown_internal_analytics.top_values(relation, column_name) }} | |
{{ log( | |
"Null values: " ~ | |
load_result('single_pass_column_stats')['data'][0][0], true | |
)}} | |
{{ log( | |
"Not null values: " ~ | |
load_result('single_pass_column_stats')['data'][0][1], true | |
)}} | |
{{ log( | |
"Is unique: " ~ | |
load_result('multi_pass_column_stats')['data'][0][1], true | |
)}} | |
{{ log( | |
"Distinct values: " ~ | |
load_result('single_pass_column_stats')['data'][0][3], true | |
)}} | |
{{ log( | |
"Empty strings: " ~ | |
load_result('single_pass_column_stats')['data'][0][2], true | |
)}} | |
{{ log( | |
"Min value: " ~ | |
load_result('single_pass_column_stats')['data'][0][5], true | |
)}} | |
{{ log( | |
"Max value: " ~ | |
load_result('single_pass_column_stats')['data'][0][4], true | |
)}} | |
{{ log( | |
"Shortest value: " ~ | |
load_result('multi_pass_column_stats')['data'][2][1], true | |
)}} | |
{{ log( | |
"Longest value: " ~ | |
load_result('multi_pass_column_stats')['data'][1][1], true | |
)}} | |
{{ log("", true)}} | |
{{ log("=====================================================", true)}} | |
{{ log("", true)}} | |
{{ log("Top values:", true)}} | |
{% set _ = load_result('top_column_values')['table'] | |
.print_table(output=info_log_stream) | |
%} | |
{% endmacro %} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment