MarkPryceMaherMSFT/Profile_table.sql

## Profile_table.sql

--exec profile_table  'dbo','Copy_into_example_c'

/*
Proc to profile the data in a table.
This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint

It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar

It will also give you the number of distinct values, which is useful for selecting a distribution column.

It will work over normal and external tables.

WARNING: This proc was built as a proof of concept - it is very very very very very very very slow.
So just be careful! Test out on small tables (i.e. in both rows and columns)

Note:  The results are pushed to a table, *not a temp table* - this is so you can open another session and query the table to see how far through it is.

*/
alter proc profile_table
@schema_name varchar(50), @table_name varchar(500)
as
begin


DECLARE @sSQL varchar(8000);

if exists (
select 1 from sys.schemas s inner join sys.tables t
on s.schema_id = t.schema_id
where s.name =   'dbo' and t.name =  'tmp_cols' )
BEGIN
    PRINT 'DELETE TABLE'
    set @sSQL = 'DROP TABLE [dbo].[tmp_cols];'
    EXEC (@sSQL);
END
ELSE
BEGIN
    PRINT 'TABLE DOES NOT EXISTS'
END

create table dbo.tmp_cols
(
	seq_id bigint,
	column_name varchar(255),
	datatype varchar(255),
	col_max_length bigint,
	col_precision bigint,
	col_scale bigint,
	is_nullable bigint,
	min_length bigint,
	max_length bigint,
	min_value varchar(max),
	max_value varchar(max),
	distinct_values bigint,
	total_rows bigint,
	empty_values bigint,
	null_values bigint
)
with
( distribution=round_robin, heap)


insert into dbo.tmp_cols
select column_id,
c.name as 'column_name',  st.name as 'datatype',
c.max_length, c.precision, c.scale, c.is_nullable , null,null,null,null ,null,null,null,null
from sys.columns c inner join  sys.tables t on
t.object_id = c.object_id
inner join sys.schemas s on s.schema_id = t.schema_id
inner join sys.types st on c.system_type_id = st.system_type_id
where t.name = @table_name and s.name = @schema_name

declare @sql2 varchar(4000);
declare @sql3 varchar(4000);

set @sql3 = '
declare @total_rows bigint;

select @total_rows = count(*) from ' + @schema_name + '.'+ @table_name + '

update dbo.tmp_cols set total_rows = @total_rows
'
exec(@sql3)

DECLARE
	@i INT = 1
	, @t INT = (SELECT COUNT(*) FROM dbo.tmp_cols)
	,@min_length bigint
	,@max_length bigint
	,@min_value varchar(max)
	,@max_value varchar(max)
	,@column_name varchar(255)
	,@datatype varchar(255)
WHILE @i <= @t
  BEGIN

select @column_name = column_name , @datatype=datatype  from dbo.tmp_cols
where @i = seq_id


	set @sql2 = 'declare @min_length bigint;
declare @max_length bigint;
declare @min_col varchar(max);
declare @max_col varchar(max);
declare @distinct_values bigint;
declare @empty_values bigint;
declare @null_values bigint;
select
@distinct_values  = count(distinct [' + @column_name +']),
@min_length = min(len([' + @column_name +'])) ,
@max_length = max(len([' + @column_name +'])) ,
@min_col = min([' + @column_name +']) ,
@max_col= max([' + @column_name +']) ,
@empty_values = count(case when [' + @column_name +'] = '''' then 1 end ) ,
@null_values = count(case when  [' + @column_name +'] is null then 1 end )
from ' + @schema_name + '.'+ @table_name + '

update  dbo.tmp_cols
set
	min_length = @min_length,
	max_length = @max_length,
	min_value = @min_col,
	max_value = @max_col,
	distinct_values = @distinct_values,
	empty_values = @empty_values,
	null_values = @null_values
where seq_id = ' + convert(varchar(400),@i)

print @sql2;

 exec (@sql2)


	SET @i+=1;
END

select * from dbo.tmp_cols ;

end

	--exec profile_table 'dbo','Copy_into_example_c'

	/*
	Proc to profile the data in a table.
	This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint

	It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar

	It will also give you the number of distinct values, which is useful for selecting a distribution column.

	It will work over normal and external tables.

	WARNING: This proc was built as a proof of concept - it is very very very very very very very slow.
	So just be careful! Test out on small tables (i.e. in both rows and columns)

	Note: The results are pushed to a table, not a temp table - this is so you can open another session and query the table to see how far through it is.

	*/
	alter proc profile_table
	@schema_name varchar(50), @table_name varchar(500)
	as
	begin


	DECLARE @sSQL varchar(8000);

	if exists (
	select 1 from sys.schemas s inner join sys.tables t
	on s.schema_id = t.schema_id
	where s.name = 'dbo' and t.name = 'tmp_cols' )
	BEGIN
	PRINT 'DELETE TABLE'
	set @sSQL = 'DROP TABLE [dbo].[tmp_cols];'
	EXEC (@sSQL);
	END
	ELSE
	BEGIN
	PRINT 'TABLE DOES NOT EXISTS'
	END

	create table dbo.tmp_cols
	(
	seq_id bigint,
	column_name varchar(255),
	datatype varchar(255),
	col_max_length bigint,
	col_precision bigint,
	col_scale bigint,
	is_nullable bigint,
	min_length bigint,
	max_length bigint,
	min_value varchar(max),
	max_value varchar(max),
	distinct_values bigint,
	total_rows bigint,
	empty_values bigint,
	null_values bigint
	)
	with
	( distribution=round_robin, heap)


	insert into dbo.tmp_cols
	select column_id,
	c.name as 'column_name', st.name as 'datatype',
	c.max_length, c.precision, c.scale, c.is_nullable , null,null,null,null ,null,null,null,null
	from sys.columns c inner join sys.tables t on
	t.object_id = c.object_id
	inner join sys.schemas s on s.schema_id = t.schema_id
	inner join sys.types st on c.system_type_id = st.system_type_id
	where t.name = @table_name and s.name = @schema_name

	declare @sql2 varchar(4000);
	declare @sql3 varchar(4000);

	set @sql3 = '
	declare @total_rows bigint;

	select @total_rows = count(*) from ' + @schema_name + '.'+ @table_name + '

	update dbo.tmp_cols set total_rows = @total_rows
	'
	exec(@sql3)

	DECLARE
	@i INT = 1
	, @t INT = (SELECT COUNT(*) FROM dbo.tmp_cols)
	,@min_length bigint
	,@max_length bigint
	,@min_value varchar(max)
	,@max_value varchar(max)
	,@column_name varchar(255)
	,@datatype varchar(255)
	WHILE @i <= @t
	BEGIN

	select @column_name = column_name , @datatype=datatype from dbo.tmp_cols
	where @i = seq_id


	set @sql2 = 'declare @min_length bigint;
	declare @max_length bigint;
	declare @min_col varchar(max);
	declare @max_col varchar(max);
	declare @distinct_values bigint;
	declare @empty_values bigint;
	declare @null_values bigint;
	select
	@distinct_values = count(distinct [' + @column_name +']),
	@min_length = min(len([' + @column_name +'])) ,
	@max_length = max(len([' + @column_name +'])) ,
	@min_col = min([' + @column_name +']) ,
	@max_col= max([' + @column_name +']) ,
	@empty_values = count(case when [' + @column_name +'] = '''' then 1 end ) ,
	@null_values = count(case when [' + @column_name +'] is null then 1 end )
	from ' + @schema_name + '.'+ @table_name + '

	update dbo.tmp_cols
	set
	min_length = @min_length,
	max_length = @max_length,
	min_value = @min_col,
	max_value = @max_col,
	distinct_values = @distinct_values,
	empty_values = @empty_values,
	null_values = @null_values
	where seq_id = ' + convert(varchar(400),@i)

	print @sql2;

	exec (@sql2)


	SET @i+=1;
	END

	select * from dbo.tmp_cols ;

	end