Skip to content

Instantly share code, notes, and snippets.

@brito
Created May 23, 2020 02:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brito/36b5e57bf8ceed7f8c1a7dc7e704c1d8 to your computer and use it in GitHub Desktop.
Save brito/36b5e57bf8ceed7f8c1a7dc7e704c1d8 to your computer and use it in GitHub Desktop.
Kitchen sink analysis of relations in schemas
/* __ _ _ _ __
__ _ / _| |_ ___ __ _ _ __ __ _ ___| |_ ___ ___| |__ __ _ ___ _ __ ___ /_/ ___
/ _` | |_| __/ _ \ / _` | '_ \ / _` / __| __/ _ \ / __| '_ \ / _` / __| '_ ` _ \ / _ \/ __|
| (_| | _| || (_) | (_| | | | | (_| \__ \ || (_) | (__| | | | (_| \__ \ | | | | | (_) \__ \
\__,_|_| \__\___/ \__,_|_| |_|\__,_|___/\__\___/ \___|_| |_|\__,_|___/_| |_| |_|\___/|___/
αυτοαναστοχασμός ergo legit */
select
_column, _type,
_nulls,
_slider(
(_nulls::float/max(_nulls) over () * 30)::int,
(_distinct::float/max(_distinct) over () * 30)::int)
as "nulls vs distinct",
_distinct,
_range[1]::text,
_bar(_pct_nulls::int) as "% nulls",
_tally.*,
_iqt.*,
_histogram.*,
concat_ws(' ± ',
_avg_length[1]::text::numeric,
ceil((_max_length[1]::text::int - _min_length[1]::text::int)/2))
as _width,
_schema,
_relname,
_kind
from _meta (
'{app_public}',
'{data_report_session,data_raw_session,data_report}'),
-- tally columns
_tally(_schema,_relname,_column) as _tally,
-- interquartile
_iqt(_schema,_relname,_column,_type) as _iqt,
-- histogram buckets
_histogram(_schema,_relname,_column,_type) as _histogram,
-- count distinct values
round((xpath('/row/count/text()', query_to_xml(format($$
select count(*) from (select distinct %3$I from %1$I.%2$I) d
$$,_schema,_relname,_column), false, true, '')))[1]::text::int) as _distinct,
-- count nulls
round((xpath('/row/count/text()', query_to_xml(format($$
select count(*) from %I.%I where %I is null
$$,_schema,_relname,_column), false, true, '')))[1]::text::int) as _nulls,
-- percent null
round((xpath('/row/p/text()', query_to_xml(format($$
select round(100*count(*) filter (where %3$I is null) / count(*)) as p from %1$I.%2$I
$$,_schema,_relname,_column), false, true, '')))[1]::text::int) as _pct_nulls,
-- count zeroes
-- FIXME not very useful either zeroes or the scale at which "near zero" is calculated
-- round((xpath('/row/count/text()', query_to_xml(format($$
-- select count(*) from %1$I.%2$I
-- where '%4$I' ~ 'int|float' and %3$I::text ~ '^(-|00:00:)0'
-- $$,_schema,_relname,_column,_type), false, true, '')))[1]::text::int) as _near_zeroes,
-- range
xpath('/row/r/text()', query_to_xml(format(case
-- transforming a uuid into a number just to get a range is a bit unnecessary
-- when typname = '_uuid' then $$
-- select numrange(
-- min(('x'||translate(left(%2$I::text,18),'-',''))::bit(64)::bigint),
-- max(('x'||translate(left(%2$I::text,18),'-',''))::bit(64)::bigint),'[]')
-- as r from %3$I.%4$I $$
when _type ~ 'int\d|float|numeric' then $$
select numrange(min(%3$I::numeric),max(%3$I::numeric),'[]')
as r from %1$I.%2$I $$
when _type = '_timestamptz' then $$
select tstzrange(min(%3$I),max(%3$I))
as r from %1$I.%2$I $$
when _type = '_tstzrange' then $$
select tstzrange(min(lower(%3$I)),max(upper(%3$I)))
as r from %1$I.%2$I $$
when _type = '_interval' then $$
select concat_ws(',',min(%3$I),max(%3$I))
as r from %1$I.%2$I $$
when _type = '_bool' then $$
select to_json(array[
count(*) filter (where %3$I = true),
count(*) filter (where %3$I = false)])
as r from %1$I.%2$I $$
when _type = '_text' and '%2$I' ~ '^\d\.?\d*' then $$
select numrange(min(%3$I::numeric),max(%3$I::numeric))
as r from %1$I.%2$I $$
when _type = '_text' and not '%2$I' ~ '^\d\.?\d*' then $$
select numrange(min(length(%3$I)),max(length(%3$I)))
as r from %1$I.%2$I $$
end,_schema,_relname,_column,_type), false, true, '')) as _range,
-- min length as text
xpath('/row/min/text()', query_to_xml(format($$
select min(length(%3$I::text)) from %1$I.%2$I
$$,_schema,_relname,_column), false, true, '')) as _min_length,
-- avg length as text
xpath('/row/round/text()', query_to_xml(format($$
select round(avg(length(%3$I::text))) from %1$I.%2$I
$$,_schema,_relname,_column), false, true, '')) as _avg_length,
-- max length as text
xpath('/row/max/text()', query_to_xml(format($$
select max(length(%3$I::text)) from %1$I.%2$I
$$,_schema,_relname,_column), false, true, '')) as _max_length
;
@brito
Copy link
Author

brito commented May 23, 2020

Definitely room to improve with regards to syntax... I'm definitely misusing xml here because I was unnecessarily avoiding pgplsql.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment