Skip to content

Instantly share code, notes, and snippets.

@MarkPryceMaherMSFT
Last active November 8, 2022 15:30
Show Gist options
  • Save MarkPryceMaherMSFT/135771ae2cee0dc88b7f693da9c89d8a to your computer and use it in GitHub Desktop.
Save MarkPryceMaherMSFT/135771ae2cee0dc88b7f693da9c89d8a to your computer and use it in GitHub Desktop.
Proc to profile the data in a table. This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar It will also give you the number o…
--exec profile_table 'dbo','Copy_into_example_c'
/*
Proc to profile the data in a table.
This proc will give to the min and max values in every column, with is useful when selecting the right numberic data type - bit/int/bigint
It will give you the length of the of the value, which is useful when picking the right variable length datatype - char/varchar/nvarchar
It will also give you the number of distinct values, which is useful for selecting a distribution column.
It will work over normal and external tables.
WARNING: This proc was built as a proof of concept - it is very very very very very very very slow.
So just be careful! Test out on small tables (i.e. in both rows and columns)
Note: The results are pushed to a table, *not a temp table* - this is so you can open another session and query the table to see how far through it is.
*/
alter proc profile_table
@schema_name varchar(50), @table_name varchar(500)
as
begin
DECLARE @sSQL varchar(8000);
if exists (
select 1 from sys.schemas s inner join sys.tables t
on s.schema_id = t.schema_id
where s.name = 'dbo' and t.name = 'tmp_cols' )
BEGIN
PRINT 'DELETE TABLE'
set @sSQL = 'DROP TABLE [dbo].[tmp_cols];'
EXEC (@sSQL);
END
ELSE
BEGIN
PRINT 'TABLE DOES NOT EXISTS'
END
create table dbo.tmp_cols
(
seq_id bigint,
column_name varchar(255),
datatype varchar(255),
col_max_length bigint,
col_precision bigint,
col_scale bigint,
is_nullable bigint,
min_length bigint,
max_length bigint,
min_value varchar(max),
max_value varchar(max),
distinct_values bigint,
total_rows bigint,
empty_values bigint,
null_values bigint
)
with
( distribution=round_robin, heap)
insert into dbo.tmp_cols
select column_id,
c.name as 'column_name', st.name as 'datatype',
c.max_length, c.precision, c.scale, c.is_nullable , null,null,null,null ,null,null,null,null
from sys.columns c inner join sys.tables t on
t.object_id = c.object_id
inner join sys.schemas s on s.schema_id = t.schema_id
inner join sys.types st on c.system_type_id = st.system_type_id
where t.name = @table_name and s.name = @schema_name
declare @sql2 varchar(4000);
declare @sql3 varchar(4000);
set @sql3 = '
declare @total_rows bigint;
select @total_rows = count(*) from ' + @schema_name + '.'+ @table_name + '
update dbo.tmp_cols set total_rows = @total_rows
'
exec(@sql3)
DECLARE
@i INT = 1
, @t INT = (SELECT COUNT(*) FROM dbo.tmp_cols)
,@min_length bigint
,@max_length bigint
,@min_value varchar(max)
,@max_value varchar(max)
,@column_name varchar(255)
,@datatype varchar(255)
WHILE @i <= @t
BEGIN
select @column_name = column_name , @datatype=datatype from dbo.tmp_cols
where @i = seq_id
set @sql2 = 'declare @min_length bigint;
declare @max_length bigint;
declare @min_col varchar(max);
declare @max_col varchar(max);
declare @distinct_values bigint;
declare @empty_values bigint;
declare @null_values bigint;
select
@distinct_values = count(distinct [' + @column_name +']),
@min_length = min(len([' + @column_name +'])) ,
@max_length = max(len([' + @column_name +'])) ,
@min_col = min([' + @column_name +']) ,
@max_col= max([' + @column_name +']) ,
@empty_values = count(case when [' + @column_name +'] = '''' then 1 end ) ,
@null_values = count(case when [' + @column_name +'] is null then 1 end )
from ' + @schema_name + '.'+ @table_name + '
update dbo.tmp_cols
set
min_length = @min_length,
max_length = @max_length,
min_value = @min_col,
max_value = @max_col,
distinct_values = @distinct_values,
empty_values = @empty_values,
null_values = @null_values
where seq_id = ' + convert(varchar(400),@i)
print @sql2;
exec (@sql2)
SET @i+=1;
END
select * from dbo.tmp_cols ;
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment