Skip to content

Instantly share code, notes, and snippets.

@dhmacher
Last active July 1, 2023 11:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dhmacher/065ab2faaa89db29f4c1bf849a8834f6 to your computer and use it in GitHub Desktop.
Save dhmacher/065ab2faaa89db29f4c1bf849a8834f6 to your computer and use it in GitHub Desktop.
-- Source: https://sqlsunday.com/2017/02/21/finding-primary-key-candidates/
IF (OBJECT_ID('dbo.FindPrimaryKey') IS NULL)
EXEC('CREATE PROCEDURE dbo.FindPrimaryKey AS --');
GO
/*
This stored procedure is used to identify primary key candidates.
Copyright Daniel Hutmacher under Creative Commons 4.0 license with attribution.
http://creativecommons.org/licenses/by/4.0/
Source: http://sqlsunday.com/downloads/
Version: 2017-10-13
DISCLAIMER: This script does not make any modifications to the server, except
for installing a stored procedure. However, the script may not be
suitable to run in a production environment. I cannot assume any
responsibility regarding the accuracy of the output information,
performance impacts on your server, or any other consequence. If
your juristiction does not allow for this kind of waiver/disclaimer,
or if you do not accept these terms, you are NOT allowed to store,
distribute or use this code in any way.
*/
ALTER PROCEDURE dbo.FindPrimaryKey
@table sysname,
@column_wildcard sysname=N'%', --- Optional: limit your search to just specific columns (for instance '%id')
@where nvarchar(max)=NULL, --- Optional: specify a WHERE clause on the table to limit rows.
@initial_sample bigint=1000, --- Optional: initial sample size in rows.
@max_column_count tinyint=16 --- Optional: maximum number of columns to try.
AS
--- Anything to declare?
DECLARE @sql nvarchar(max), --- The source table we're analyzing.
@qualified_table sysname=@table, --- Database-qualified table name (if we're using tempdb)
@count bigint, --- Total row count of the source table.
@id int;
SET NOCOUNT ON;
--- If we're talking temp tables, say so.
IF (@qualified_table LIKE N'#%') SET @qualified_table=N'tempdb.dbo.'+@qualified_table;
--- This table keeps track of how many unique
--- members there are in each column of the table.
CREATE TABLE #counts (
col sysname NOT NULL,
_dist bigint NOT NULL,
_count bigint NOT NULL,
PRIMARY KEY CLUSTERED (col)
);
--- These are all the candidates we're going to test:
CREATE TABLE #candidates (
id int NOT NULL,
index_name sysname NULL,
cols xml NOT NULL,
_dist bigint NOT NULL,
is_unique bit NULL,
PRIMARY KEY CLUSTERED (id)
);
CREATE TABLE #columns (
[name] sysname NOT NULL,
PRIMARY KEY CLUSTERED ([name])
);
INSERT INTO #columns ([name])
SELECT [name] FROM sys.columns
WHERE [object_id]=OBJECT_ID(@qualified_table) AND
[name] LIKE @column_wildcard AND
scale=0 AND
system_type_id NOT IN (SELECT system_type_id
FROM sys.types
WHERE [name] IN ('text', 'ntext', 'xml', 'image', 'sql_variant', 'bit'));
IF (@@ROWCOUNT=0)
INSERT INTO #columns ([name])
SELECT [name]
FROM tempdb.sys.columns
WHERE [object_id]=OBJECT_ID(@qualified_table) AND
[name] LIKE @column_wildcard AND
scale=0 AND
system_type_id NOT IN (SELECT system_type_id
FROM tempdb.sys.types
WHERE [name] IN ('text', 'ntext', 'xml', 'image', 'sql_variant', 'bit'));
--- Here be dynamic SQL.
SET @sql=N'
INSERT INTO #counts (col, _dist, _count)
SELECT x.col, x._dist, src._count
FROM (
SELECT COUNT(*) AS _count'+CAST((SELECT N', COUNT(DISTINCT '+QUOTENAME([name])+N') AS '+QUOTENAME(N'_dist_'+[name])
FROM #columns AS cols
FOR XML PATH(''), TYPE) AS varchar(max))+'
FROM '+@table+ISNULL(N'
WHERE '+@where, N'')+N') AS src
CROSS APPLY (
VALUES '+SUBSTRING(CAST((SELECT N', (src.'+QUOTENAME('_dist_'+[name])+N', '''+[name]+N''')'
FROM #columns AS cols
FOR XML PATH(''), TYPE) AS varchar(max)), 3, 10000)+N'
) AS x(_dist, col);';
EXECUTE sys.sp_executesql @sql;
--- We've stored the row count of the table in one of the columns.
SELECT TOP (1) @count=_count FROM #counts;
IF (EXISTS (SELECT NULL FROM #counts WHERE _dist=_count)) BEGIN;
SELECT col AS [Columns], 'UNIQUE' AS Uniqueness
FROM #counts
WHERE _dist=_count;
RETURN;
END;
--- These are all the candidates we're going to test:
WITH cte AS (
SELECT 1 AS colcount, col, CAST('<col>'+col+'</col>' AS varchar(max)) AS cols, _dist, 1 AS _colcount
FROM #counts
WHERE _dist<_count
UNION ALL
SELECT cte.colcount+1, c.col, CAST(cte.cols+'<col>'+c.col+'</col>' AS varchar(max)), cte._dist*c._dist, _colcount+1
FROM cte
INNER JOIN #counts AS c ON cte.col<c.col AND cte._dist<@count
WHERE _colcount<@max_column_count),
ix AS (
SELECT i.[name] AS index_name,
CAST((SELECT c.[name] AS col
FROM sys.index_columns AS ic
INNER JOIN sys.columns AS c ON ic.[object_id]=c.[object_id] AND ic.column_id=c.column_id
WHERE i.[object_id]=ic.[object_id] AND i.index_id=ic.index_id --AND
-- c.[name] LIKE @column_wildcard AND
-- c.system_type_id NOT IN (SELECT system_type_id
-- FROM sys.types
-- WHERE [name] IN ('text', 'ntext', 'xml', 'image', 'sql_variant', 'bit'))
ORDER BY c.[name]
FOR XML PATH(''), TYPE) AS varchar(max)) AS cols
FROM sys.indexes AS i
WHERE i.[object_id]=OBJECT_ID(@qualified_table) AND i.is_unique=0 AND i.index_id>0
UNION
SELECT i.[name] AS index_name,
CAST((SELECT c.[name] AS col
FROM tempdb.sys.index_columns AS ic
INNER JOIN tempdb.sys.columns AS c ON ic.[object_id]=c.[object_id] AND ic.column_id=c.column_id
WHERE i.[object_id]=ic.[object_id] AND i.index_id=ic.index_id --AND
-- c.[name] LIKE @column_wildcard AND
-- c.system_type_id NOT IN (SELECT system_type_id
-- FROM tempdb.sys.types
-- WHERE [name] IN ('text', 'ntext', 'xml', 'image', 'sql_variant', 'bit'))
ORDER BY c.[name]
FOR XML PATH(''), TYPE) AS varchar(max)) AS cols
FROM tempdb.sys.indexes AS i
WHERE i.[object_id]=OBJECT_ID(@qualified_table) AND i.is_unique=0 AND i.index_id>0)
INSERT INTO #candidates (id, index_name, cols, _dist, is_unique)
SELECT ROW_NUMBER() OVER (ORDER BY cte.colcount, cte._dist), ix.index_name,
CAST('<cols>'+cte.cols+'</cols>' AS xml) AS cols, cte._dist, NULL
FROM cte
LEFT JOIN (SELECT *, ROW_NUMBER() OVER (PARTITION BY cols ORDER BY index_name) AS _duplicate FROM ix) AS ix ON cte.cols=ix.cols AND ix._duplicate=1
WHERE _dist>=@count;
--- Loop through them, iterate through increasing sample sizes:
WHILE (EXISTS (SELECT NULL FROM #candidates WHERE is_unique IS NULL)) BEGIN;
SELECT TOP (1) @id=c.id, @sql=N'
DECLARE @sample bigint='+CAST(@initial_sample AS nvarchar(10))+N'/5,
@duplicates bit=0,
@rowcount bigint='+CAST(@count AS nvarchar(10))+';
WHILE (@duplicates=0 AND @sample<@rowcount) BEGIN;
SET @sample=@sample*5;
IF (@sample='+CAST(@initial_sample AS varchar(10))+')
PRINT ''Testing ('+ REPLACE(REPLACE(REPLACE(CAST(cols AS nvarchar(max)), '</col><col>', ', '), '<cols><col>', ''), '</col></cols>', '') +'), sample size=''+STR(@sample, 12, 0)+'' rows.'';
IF (@sample>'+CAST(@initial_sample AS varchar(10))+')
PRINT '' '+REPLICATE(' ', LEN(REPLACE(REPLACE(REPLACE(CAST(cols AS nvarchar(max)), '</col><col>', ', '), '<cols><col>', ''), '</col></cols>', '')))+' sample size=''+STR(@sample, 12, 0)+'' rows.'';
SELECT TOP (1) @duplicates=1
FROM (
SELECT TOP (@sample) *
FROM '+@table+ISNULL(N'
WHERE '+@where, N'')+N') AS t
GROUP BY '+REPLACE(REPLACE(REPLACE(CAST(cols AS nvarchar(max)), '</col><col>', '], ['), '<cols><col>', '['), '</col></cols>', ']')+'
HAVING COUNT(*)>1;
END;
UPDATE #candidates
SET is_unique=1-@duplicates
WHERE id='+CAST(id AS nvarchar(10))+N';'
FROM #candidates AS c
WHERE is_unique IS NULL
ORDER BY id;
EXECUTE sys.sp_executesql @sql;
--- Delete candidates that are implicitly unique as a consequence of the unique keys
--- we just discovered. For instance, if (a, c, e) is unique, (a, b, c, d, e) is implicitly
--- also unique and can be removed from #candidates.
DELETE sub
FROM #candidates AS uq
INNER JOIN #candidates AS sub ON
uq.is_unique=1 AND
uq.id=@id AND
sub.id!=@id
CROSS APPLY (
VALUES (
REPLACE(REPLACE(REPLACE(CAST( uq.cols AS nvarchar(max)), '</col><col>', ']%,%['), '<cols><col>', '%['), '</col></cols>', ']%'),
REPLACE(REPLACE(REPLACE(CAST(sub.cols AS nvarchar(max)), '</col><col>', '],['), '<cols><col>', '['), '</col></cols>', ']')
)) AS x(uq_cols, sub_cols)
WHERE x.sub_cols LIKE x.uq_cols AND
--- ... except if the implicit candidate is indexed, in which case we'll just
--- keep it for the sake of information:
sub.index_name IS NULL;
END;
--- Output the results.
SELECT ISNULL(index_name, '') AS [Index name],
REPLACE(REPLACE(REPLACE(CAST(cols AS nvarchar(max)), '</col><col>', ', '), '<cols><col>', ''), '</col></cols>', '') AS [Columns],
(CASE WHEN is_unique=1 THEN 'UNIQUE' WHEN is_unique=0 THEN 'Not unique' END) AS Uniqueness
FROM #candidates
WHERE is_unique=1 OR index_name IS NOT NULL
ORDER BY is_unique DESC, 1;
--- Clean-up.
DROP TABLE #candidates;
DROP TABLE #counts;
GO
--EXECUTE dbo.FindPrimaryKey '#table'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment