Skip to content

Instantly share code, notes, and snippets.

@johnsmclay
Last active February 4, 2016 18:13
Show Gist options
  • Save johnsmclay/c5899cd3552c7f55d470 to your computer and use it in GitHub Desktop.
Save johnsmclay/c5899cd3552c7f55d470 to your computer and use it in GitHub Desktop.
Get basic statistical and variation info on every column in a table
DROP TABLE IF EXISTS `clay_sandbox`.`column_stats`;
CREATE TABLE `clay_sandbox`.`column_stats` (
`database` varchar(64) NOT NULL,
`table` varchar(64) NOT NULL,
`column` varchar(64) NOT NULL,
`position_in_table` bigint(20) SIGNED NOT NULL,
`nonnull_vals` bigint(20) DEFAULT NULL,
`nonmnull_pctg` decimal(7,5) DEFAULT NULL,
`sum_val` decimal(32,16) DEFAULT NULL,
`min_val` decimal(32,16) DEFAULT NULL,
`max_val` decimal(32,16) DEFAULT NULL,
`avg_val` decimal(32,16) DEFAULT NULL,
`count_distinct` bigint(20) DEFAULT NULL,
`distinct_values` varchar(2048) DEFAULT NULL,
PRIMARY KEY (`customer`, `database_group`, `table`, `column`)
) ENGINE=`InnoDB` CHARACTER SET utf8;
DROP PROCEDURE IF EXISTS `clay_sandbox`.`get_column_stats`;
DELIMITER //
CREATE PROCEDURE `clay_sandbox`.`get_column_stats`(in database_name_req varchar(64), in table_name_req varchar(64))
BEGIN
-- EXPLANATION: These hold the values used for the current iteration
declare current_database varchar(64);
declare current_table varchar(64);
declare current_column varchar(64);
declare current_position int(11);
declare current_column_type varchar(64);
declare current_nonnulls bigint(20);
declare current_count_distinct bigint(20);
declare current_distinct_values varchar(2048);
declare current_row_count bigint(20);
declare current_sum_val decimal(32,16);
declare current_min_val decimal(32,16);
declare current_max_val decimal(32,16);
declare current_avg_val decimal(32,16);
declare done boolean;
declare the_cursor cursor for
SELECT
TABLE_SCHEMA AS `database_name`
,TABLE_NAME AS `table`
,COLUMN_NAME AS `column`
,ORDINAL_POSITION AS position_in_table
,DATA_TYPE AS column_type
FROM `information_schema`.`COLUMNS`
WHERE 1=1
AND TABLE_SCHEMA = database_name_req
AND TABLE_NAME = table_name_req
order by
ORDINAL_POSITION ASC
;
-- EXPLANATION: when doing the "fetch" later, if there is not another line set the "done" flag
declare continue handler for not found set done=TRUE;
SET
done = FALSE
,current_nonnulls := NULL
,current_count_distinct := NULL
,current_distinct_values := NULL
,current_sum_val := NULL
,current_min_val := NULL
,current_max_val := NULL
,current_avg_val := NULL
;
SET @sql = CONCAT('SELECT COUNT(*) FROM ',database_name_req,'.',table_name_req,' INTO @current_row_count;');
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
SET current_row_count := @current_row_count;
open the_cursor;
the_loop: loop
-- EXPLANATION: sets the values for the current iteration into the variables declared above
fetch the_cursor into
current_database
,current_table
,current_column
,current_position
,current_column_type
;
-- EXPLANATION: check the "done" flag to make sure we haven't processed all the segments/lines
if done = TRUE then leave the_loop; end if;
-- === DO ALL YOU THINGS FOR EACH SEGMENT BELOW HERE =======
SET @sql = CONCAT(
'SELECT COUNT(DISTINCT ',current_column,
'),SUM(',current_column,'),MIN(',current_column,
'),MAX(',current_column,'),AVG(',current_column,
'),LEFT(GROUP_CONCAT(DISTINCT ',current_column,'),2048),SUM(IF(',current_column,' IS NOT NULL,1,0)) FROM ',current_database,'.',current_table,
' INTO @current_count_distinct,@current_sum_val,@current_min_val,@current_max_val,@current_avg_val, @current_distinct_values,@current_nonnulls;');
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
SET
current_count_distinct := @current_count_distinct
,current_sum_val := @current_sum_val
,current_min_val := @current_min_val
,current_max_val := @current_max_val
,current_avg_val := @current_avg_val
,current_distinct_values := @current_distinct_values
,current_nonnulls := @current_nonnulls
;
DELETE FROM `clay_sandbox`.`column_stats`
WHERE TRUE
AND `database` = current_database
AND `table` = current_table
AND `column` = current_column
;
INSERT INTO `clay_sandbox`.`column_stats` (
,`database`
,`table`
,`column`
,position_in_table
,nonnull_vals
,nonmnull_pctg
,sum_val
,min_val
,max_val
,avg_val
,count_distinct
,distinct_values
)
VALUES (
current_database
,current_table
,current_column
,current_position
,current_nonnulls
,(current_nonnulls/current_row_count)
,IF(current_column_type IN (
'decimal'
,'int'
,'bigint'
,'tinyint'
,'double'
,'float'
,'mediumint'
,'smallint'
),current_sum_val,NULL)
,IF(current_column_type IN (
'decimal'
,'int'
,'bigint'
,'tinyint'
,'double'
,'float'
,'mediumint'
,'smallint'
,'timestamp'
,'date'
,'datetime'
),current_min_val,NULL)
,IF(current_column_type IN (
'decimal'
,'int'
,'bigint'
,'tinyint'
,'double'
,'float'
,'mediumint'
,'smallint'
,'timestamp'
,'date'
,'datetime'
),current_max_val,NULL)
,IF(current_column_type IN (
'decimal'
,'int'
,'bigint'
,'tinyint'
,'double'
,'float'
,'mediumint'
,'smallint'
),current_avg_val,NULL)
,current_count_distinct
,current_distinct_values
);
-- === DO ALL YOU THINGS FOR EACH SEGMENT ABOVE HERE =======
end loop;
close the_cursor;
SELECT 'Complete' AS status;
END //
DELIMITER ;
CALL `clay_sandbox`.`get_column_stats`('funtimesdb','new_landing_table');
SELECT * FROM `clay_sandbox`.`column_stats`;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment