Skip to content

Instantly share code, notes, and snippets.

@cimmanon
Last active December 18, 2016 15:20
Show Gist options
  • Save cimmanon/04c6ead97fae2257dd32a5977898de76 to your computer and use it in GitHub Desktop.
Save cimmanon/04c6ead97fae2257dd32a5977898de76 to your computer and use it in GitHub Desktop.
PostgreSQL function for doing diagnostics on unfamiliar tables
begin;
/*----------------------------------------------------------------------------------------------------*\
| Functions for generating statistics
\*----------------------------------------------------------------------------------------------------*/
create or replace function rel_column_count(schema_name text, table_name text, column_name text, column_type text) returns table
( not_null_count bigint
, distinct_count bigint
, min_length int
, max_length int
) as $$
DECLARE
s ALIAS FOR schema_name;
t ALIAS FOR table_name;
c ALIAS FOR column_name;
q text;
BEGIN
q := case when column_type = 'text'
then format('select count(*), count(distinct %I), min(length(%I)), max(length(%I)) from %I.%I where nullif(%I, '''') is not null', c, c, c, s, t, c)
else format('select count(*), count(distinct %I), null :: int, null :: int from %I.%I where %I is not null', c, s, t, c) END;
return query execute q;
END;
$$ language plpgsql stable;
-------------------------------------
create or replace function rel_statistics(schema_name text, table_name text) returns table
( column_name name
, column_type text
, maybe_fk bool
, maybe_bool bool
, not_null_count bigint
, not_null_percent numeric(5,2)
, distinct_count bigint
, min_length int
, max_length int
) as $$
declare
s ALIAS FOR schema_name;
t ALIAS FOR table_name;
c bigint;
begin
execute format('select count(*) from %I.%I', s, t) into c;
return query
select
x.column_name
, x.column_type
, x.maybe_fk
, x.maybe_bool
, x.not_null_count
, x.not_null_percent
, x.distinct_count
, x.min_length
, x.max_length
from (
select
r.column_name
, r.column_type
, r.column_name ilike '%_id' as maybe_fk
, r.column_type = 'text'
and cc.distinct_count <= 2
-- ^ if there's 2 values, its a good sign that its a fake bool
and (cc.max_length = 1 or (cc.min_length >= 2 and cc.max_length <= 3)) as maybe_bool
-- ^ 1 letter = Y/N or 1/0
-- ^ 2 or 3 letters = yes or no
, cc.not_null_count
, (round(((cc.not_null_count) :: numeric / c :: numeric) * 10000) / 100) :: numeric(5,2) as not_null_percent
, cc.distinct_count
, cc.min_length
, cc.max_length
from
(
select
-- ns.nspname as schema_name,
-- cls.relname as table_name,
attr.attname as column_name,
trim(leading '_' from tp.typname) as column_type
from
pg_catalog.pg_attribute as attr
join pg_catalog.pg_class as cls on cls.oid = attr.attrelid
join pg_catalog.pg_namespace as ns on ns.oid = cls.relnamespace
join pg_catalog.pg_type as tp on tp.typelem = attr.atttypid
where
ns.nspname = s
and cls.relname = t
and not attr.attisdropped
and tp.typanalyze :: text = 'array_typanalyze'
and attr.attnum > 0
order by
attr.attnum
) as r
, rel_column_count(s, t, r.column_name, r.column_type) as cc
) as x
order by
x.not_null_count = 0
, x.column_type
, x.maybe_fk desc
, x.maybe_bool desc
, x.not_null_count desc
, x.distinct_count
, x.column_name
;
end;
$$ language plpgsql;
--select * from rel_statistics('public', 'tax_items');
--select * from rel_statistics('public', 'sales_reps');
/*----------------------------------------------------------------------------------------------------*\
| Generate the reports in CSV format
\*----------------------------------------------------------------------------------------------------*/
DO $$
DECLARE
s TEXT := 'public';
r RECORD;
BEGIN
FOR r IN
SELECT * FROM system.netsuite_view_schedule ORDER BY name
/*
SELECT matviewname, schemaname
FROM pg_matviews JOIN system.netsuite_view_schedule ON matviewname = netsuite_view_schedule.name
ORDER BY matviewname
-- LIMIT 5
*/
LOOP
RAISE NOTICE 'Generating statistics for %...', r.name;
EXECUTE format('COPY (SELECT * FROM rel_statistics(''%s'', ''%s'')) to ''/home/deploy/notes/%s.csv'' csv header', s, r.name, r.name);
END LOOP;
END
$$ LANGUAGE plpgsql;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment