Skip to content

Instantly share code, notes, and snippets.

@tomasgreif
Created October 9, 2013 17:17
Show Gist options
  • Save tomasgreif/6904830 to your computer and use it in GitHub Desktop.
Save tomasgreif/6904830 to your computer and use it in GitHub Desktop.
Guess table definition. The function will create sql create table statement from sample data.
create or replace function gtd(sample text)
returns varchar as $$
declare outsql varchar;
begin
select tbl_definition into outsql from (
with
source as (select $1::varchar instr)
,elems as (
select elem[en], rn, en
from (
select elem, rn, generate_subscripts(elem,1) en
from(
select string_to_array(arr[rn],';') as elem, rn
from (
select *, generate_subscripts(arr, 1) as rn
from (
select regexp_split_to_array(instr, E'\\n') as arr from source
) w
) x
) y
) z
)
, type_tests as (
select
*,
case
when elem ~ '^$' then 'empty'
when elem ~ '^[-+]?[0-9]+$' then 'integer'
when elem ~ '^[-+]?[0-9]*\.?[0-9]*([eE][-+]?[0-9]+)?$' then 'numeric'
when elem ~ '^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}(\.[0-9]*)?$' or
elem ~ '^[0-9]{4}/[0-9]{1,2}/[0-9]{1,2} [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}(\.[0-9]*)?$' then 'timestamp'
when elem ~ '^[0-9]{1,2}:[0-9]{1,2}(:[0-9]{1,2}(\.[0-9]+)?)?$' then 'time'
when elem ~ '^[0-9]{4}/[0-9]{1,2}/[0-9]{1,2}$' or elem ~ '^[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}$' then 'date'
else 'varchar' end as dtype
from
elems
)
, results_agg as (
select
tt2.en,
tt2.elem,
count(*),
array_agg(tt.dtype order by tt.rn) as dtypes,
array_agg(tt.elem order by tt.rn) tmp
from
type_tests tt
join type_tests tt2 on (tt.en = tt2.en and tt2.rn=1 and tt.rn <> 1)
group by
tt2.en,
tt2.elem
order by
tt2.en
)
, type_determination as (
select
*,
case
when dtypes @> array['integer'] and not(dtypes && array['numeric','date','time','timestamp','varchar']) then 'integer'
when dtypes @> array['timestamp'] and not(dtypes && array['numeric','time','varchar']) then 'timestamp'
when dtypes @> array['date'] and not(dtypes && array['numeric','time','varchar']) then 'date'
when dtypes @> array['time'] and not(dtypes && array['numeric','date','varchar','timestamp']) then 'time'
when dtypes @> array['numeric'] and not(dtypes && array['date','time','timestamp','varchar']) then 'numeric'
else 'varchar' end as result_type
from
results_agg
)
select
E'create table (\n' || array_to_string(array_agg(E'\t' || replace(lower(elem),' ','_') ||
' ' || result_type order by en),E',\n') || E'\n);' as tbl_definition
from
type_determination
) a;
return outsql;
end;
$$ LANGUAGE plpgsql
;
select gtd('a a;b;c c c;d;e;f;g;h;i i;j;k
2012-01-01 13:04:04.321;test;2012-01-01;2012/01/01;abcd;321;-324;.1;-5.5;0;14:54
2012/01/01 13:04:04.321;test;2012-01-01;2012/01/01;abcd;321;-324;.1;-5;0;14:54
2012-01-01 13:04:04.321;test;2012-01-01;2012/01/01;.1;321;-324;.1;-5.5;0;14:54
2012-01-01 13:04:04.321;test;2012-01-01;2012/01/01;abcd;321;-324;.1;-5.5;0;14:54
2012-01-01 13:04:04.321;;2012-01-01;2012/01/01;1;321;-324;.1;-5.5;0;14:54');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment