Instantly share code, notes, and snippets.
Created
February 2, 2024 12:48
-
Save samot1/5269da8ce06b47fe7f57f77a48f70247 to your computer and use it in GitHub Desktop.
Normalize StackOverFlow's Posts table (Tags)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* https://www.brentozar.com/archive/2024/01/finding-tagged-questions-faster-answers-discussion/#comments | |
Task: "Fix" not existing normalization of the Tags column in the Posts table without breaking old code | |
*/ | |
/* in a real world example where you really replace the original table Posts instead of working with [Post] you would rename the old table and change the INSERT SELECT FROM to Posts_old | |
EXEC sys.sp_rename @objname = 'dbo.Posts' | |
, @newname = 'Posts_old' | |
, @objtype = 'object' | |
*/ | |
GO | |
CREATE SEQUENCE dbo.s_PostId AS INTEGER; | |
DECLARE @PostId INTEGER = (SELECT TOP 1 p.Id FROM dbo.Posts AS p ORDER BY p.Id DESC) /* TOP 1 is faster than a MAX(id), since it reads just 3 pages (index backward scan) instead of the whole table */ | |
, @cmd VARCHAR(500) | |
SET @cmd = CONCAT('ALTER SEQUENCE dbo.s_PostId RESTART WITH ', @PostId) | |
EXEC (@cmd) | |
; | |
GO | |
/* =====================================================================================================================*/ | |
GO | |
CREATE TABLE dbo.PostsData | |
(Id INT NOT NULL CONSTRAINT DF_PostsData_Id DEFAULT NEXT VALUE FOR dbo.s_PostId | |
, AcceptedAnswerId INT | |
, AnswerCount INT | |
, Body NVARCHAR(MAX) NOT NULL | |
, ClosedDate DATETIME | |
, CommentCount INT | |
, CommunityOwnedDate DATETIME | |
, CreationDate DATETIME NOT NULL | |
, FavoriteCount INT | |
, LastActivityDate DATETIME NOT NULL | |
, LastEditDate DATETIME | |
, LastEditorDisplayName NVARCHAR(40) | |
, LastEditorUserId INT | |
, OwnerUserId INT | |
, ParentId INT | |
, PostTypeId INT NOT NULL | |
, Score INT NOT NULL | |
, Title NVARCHAR(250) | |
, ViewCount INT NOT NULL) | |
; | |
GO | |
ALTER TABLE dbo.PostsData ADD CONSTRAINT PK_PostsData_Id PRIMARY KEY CLUSTERED (Id) WITH (DATA_COMPRESSION = PAGE); /* no need to waste space on disk and in RAM - except you are so CPU bound that can't spare the 1-2% extra CPU */ | |
GO | |
/* all columns except Tags*/ | |
INSERT INTO dbo.PostsData (Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount) | |
SELECT p.Id, p.AcceptedAnswerId, p.AnswerCount, p.Body, p.ClosedDate, p.CommentCount, p.CommunityOwnedDate, p.CreationDate, p.FavoriteCount, p.LastActivityDate, p.LastEditDate, p.LastEditorDisplayName, p.LastEditorUserId, p.OwnerUserId, p.ParentId, p.PostTypeId, p.Score, p.Title, p.ViewCount | |
FROM dbo.Posts AS p; | |
GO | |
/* if you have other indexes on dbo.Posts that not includes [Tags] add them here too*/ | |
GO | |
GO | |
CREATE TABLE dbo.PostsTags ( Id BIGINT IDENTITY NOT NULL /* not INT, since there may be up to 5 times as many Tags as Posts and PostId is already an INT*/ | |
CONSTRAINT pk_PostTags PRIMARY KEY NONCLUSTERED /* not really needed, but we will follow the usual scheme in the StackOverflow database */ | |
, PostId INT NOT NULL | |
, Tag NVARCHAR(150) NOT NULL /*theoretical someone could have created a single Tag with 148 chars on a post*/ | |
) | |
go | |
CREATE UNIQUE CLUSTERED INDEX iuc_PostsTags__PostId__Tag ON dbo.PostsTags (PostId, Tag) WITH (DATA_COMPRESSION = PAGE) | |
go | |
ALTER TABLE dbo.PostsTags ADD CONSTRAINT FK_PostsTags__PostsData | |
FOREIGN KEY (PostID) | |
REFERENCES dbo.PostsData (Id) | |
ON UPDATE CASCADE ON DELETE CASCADE; /* delete the Tags automatical, if the Post was deleted (and update the PostId if an Update happen, which is very unlikely, but you'll never know */ | |
GO | |
INSERT INTO dbo.PostsTags (PostId, Tag) | |
SELECT p.Id AS Post_id | |
, SUBSTRING(ss.value, 2, 150) AS Tag -- remove the leading '<' | |
FROM dbo.Posts AS p | |
CROSS APPLY STRING_SPLIT(p.Tags, '>') AS ss | |
WHERE p.Tags IS NOT NULL /* don't bother to try to split empty Tags*/ | |
AND ss.value <> '' /* STRING_SPLIT() does not return NULL but empty strings after the last Tag */ | |
GO | |
/* creating the index AFTER we filled the table will be faster then before the INSERT */ | |
CREATE NONCLUSTERED INDEX iuc_PostTags__Tag ON dbo.PostsTags (Tag) INCLUDE (PostId) WITH (DATA_COMPRESSION = PAGE) | |
; | |
GO | |
/* =====================================================================================================================*/ | |
/* =====================================================================================================================*/ | |
CREATE OR ALTER VIEW dbo.Post /* used dbo.Post instead of dbo.Posts because I don't want to mess up my StackOverflow database */ | |
AS /* NEVER use SELECT * in a view - it can cause the content to be listed in the wrong columns when you drop a column from a source table */ | |
SELECT pd.Id | |
, pd.AcceptedAnswerId | |
, pd.AnswerCount | |
, pd.Body | |
, pd.ClosedDate | |
, pd.CommentCount | |
, pd.CommunityOwnedDate | |
, pd.CreationDate | |
, pd.FavoriteCount | |
, pd.LastActivityDate | |
, pd.LastEditDate | |
, pd.LastEditorDisplayName | |
, pd.LastEditorUserId | |
, pd.OwnerUserId | |
, pd.ParentId | |
, pd.PostTypeId | |
, pd.Score | |
, pd.Title | |
, pd.ViewCount | |
, pt.Tags | |
FROM dbo.PostsData AS pd | |
LEFT JOIN (SELECT pt.PostId | |
, '<' + STRING_AGG(pt.Tag, '><') + '>' AS Tags /* add WITHIN GROUP (ORDER BY pt.Id) if you really need the original order (will reduce the performance) OR change the Clustered Index to (PostId + Id)*/ | |
FROM dbo.PostsTags AS pt | |
GROUP BY pt.PostId | |
) AS pt | |
ON pt.PostId = pd.Id | |
GO | |
/* =====================================================================================================================*/ | |
GO | |
CREATE OR ALTER TRIGGER tr_Post_before_insert ON dbo.Post | |
INSTEAD OF INSERT | |
AS | |
BEGIN | |
IF @@ROWCOUNT = 0 RETURN; | |
/* has to use a #temp table, since I found no way to let OUTPUT return Inserted.Id and i.Tags when inserting into the dbo.PostsData*/ | |
CREATE TABLE #tags | |
(Id INT NOT NULL PRIMARY KEY /* using DEFAULT NEXT VALUE FOR dbo.s_PostId will not work for #temp tables (Object not found), since it is in the [tempdb], but it is no good idea to refer hardcoded to the database + schema + sequence*/ | |
, AcceptedAnswerId INT | |
, AnswerCount INT | |
, Body NVARCHAR(MAX) NOT NULL | |
, ClosedDate DATETIME | |
, CommentCount INT | |
, CommunityOwnedDate DATETIME | |
, CreationDate DATETIME NOT NULL | |
, FavoriteCount INT | |
, LastActivityDate DATETIME NOT NULL | |
, LastEditDate DATETIME | |
, LastEditorDisplayName NVARCHAR(40) | |
, LastEditorUserId INT | |
, OwnerUserId INT | |
, ParentId INT | |
, PostTypeId INT NOT NULL | |
, Score INT NOT NULL | |
, Title NVARCHAR(250) | |
, ViewCount INT NOT NULL | |
, Tags NVARCHAR(150)); | |
INSERT INTO #tags (Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount, Tags) | |
SELECT NEXT VALUE FOR dbo.s_PostId AS Id | |
, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount, Tags | |
FROM Inserted AS i | |
; | |
/* all columns except Tags */ | |
INSERT INTO dbo.PostsData | |
( Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount) | |
SELECT t.Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount | |
FROM #tags AS t | |
; | |
INSERT INTO dbo.PostsTags (PostId, Tag) | |
SELECT t.Id, SUBSTRING(ss.value, 2, 150) AS Tag | |
FROM #tags AS t | |
CROSS APPLY STRING_SPLIT(t.Tags, '>', 1) AS ss | |
WHERE t.Tags IS NOT NULL | |
AND ss.value <> '' | |
ORDER BY t.Id, ss.ordinal | |
; | |
END | |
GO | |
/* =====================================================================================================================*/ | |
GO | |
CREATE OR ALTER TRIGGER tr_Post_before_update ON dbo.Post | |
INSTEAD OF UPDATE | |
AS | |
BEGIN | |
IF @@ROWCOUNT = 0 RETURN; | |
UPDATE pd | |
SET pd.AcceptedAnswerId = i.AcceptedAnswerId | |
, pd.AnswerCount = i.AnswerCount | |
, pd.Body = i.Body | |
, pd.ClosedDate = i.ClosedDate | |
, pd.CommentCount = i.CommentCount | |
, pd.CommunityOwnedDate = i.CommunityOwnedDate | |
, pd.CreationDate = i.CreationDate | |
, pd.FavoriteCount = i.FavoriteCount | |
, pd.LastActivityDate = i.LastActivityDate | |
, pd.LastEditDate = i.LastEditDate | |
, pd.LastEditorDisplayName = i.LastEditorDisplayName | |
, pd.LastEditorUserId = i.LastEditorUserId | |
, pd.OwnerUserId = i.OwnerUserId | |
, pd.ParentId = i.ParentId | |
, pd.PostTypeId = i.PostTypeId | |
, pd.Score = i.Score | |
, pd.Title = i.Title | |
, pd.ViewCount = i.ViewCount | |
FROM dbo.PostsData AS pd | |
INNER JOIN Inserted AS i | |
ON i.Id = pd.Id | |
/* before SQL 2022 use UPDATE(<column>) instead of pd.<column> IS DISTINCT FROM i.<column>*/ | |
WHERE pd.AcceptedAnswerId IS DISTINCT FROM i.AcceptedAnswerId | |
OR pd.AnswerCount IS DISTINCT FROM i.AnswerCount | |
OR pd.Body IS DISTINCT FROM i.Body | |
OR pd.ClosedDate IS DISTINCT FROM i.ClosedDate | |
OR pd.CommentCount IS DISTINCT FROM i.CommentCount | |
OR pd.CommunityOwnedDate IS DISTINCT FROM i.CommunityOwnedDate | |
OR pd.CreationDate IS DISTINCT FROM i.CreationDate | |
OR pd.FavoriteCount IS DISTINCT FROM i.FavoriteCount | |
OR pd.LastActivityDate IS DISTINCT FROM i.LastActivityDate | |
OR pd.LastEditDate IS DISTINCT FROM i.LastEditDate | |
OR pd.LastEditorDisplayName IS DISTINCT FROM i.LastEditorDisplayName | |
OR pd.LastEditorUserId IS DISTINCT FROM i.LastEditorUserId | |
OR pd.OwnerUserId IS DISTINCT FROM i.OwnerUserId | |
OR pd.ParentId IS DISTINCT FROM i.ParentId | |
OR pd.PostTypeId IS DISTINCT FROM i.PostTypeId | |
OR pd.Score IS DISTINCT FROM i.Score | |
OR pd.Title IS DISTINCT FROM i.Title | |
OR pd.ViewCount IS DISTINCT FROM i.ViewCount | |
; | |
IF UPDATE(Tags) /* only when Tags was in the SET list of the UPDATE statement)*/ | |
BEGIN | |
SELECT i.Id AS PostId, SUBSTRING(ss.value, 2, 150) AS Tag | |
INTO #tags | |
FROM Inserted AS i | |
CROSS APPLY STRING_SPLIT(i.Tags, '>') AS ss | |
WHERE i.Tags IS NOT NULL | |
AND ss.value <> '' | |
; | |
INSERT INTO dbo.PostsTags (PostId, Tag) | |
SELECT * | |
FROM #tags AS t | |
WHERE NOT EXISTS (SELECT * FROM dbo.PostsTags AS pt WHERE pt.PostId = t.PostId AND pt.Tag = t.Tag) | |
; | |
DELETE pt | |
FROM dbo.PostsTags AS pt | |
INNER JOIN Inserted AS i /* without this it would delete all Tags from Posts that were not just updated */ | |
ON i.Id = pt.PostId | |
WHERE NOT EXISTS (SELECT * FROM #tags AS t WHERE pt.PostId = t.PostId AND pt.Tag = t.Tag) | |
; | |
END; | |
END | |
GO | |
/* =====================================================================================================================*/ | |
GO | |
CREATE OR ALTER TRIGGER tr_Post_before_delete ON dbo.Post | |
INSTEAD OF DELETE | |
AS | |
BEGIN | |
IF @@ROWCOUNT IS NULL RETURN; | |
DELETE pd | |
FROM dbo.PostsData AS pd | |
INNER JOIN Deleted d | |
ON d.Id = pd.Id | |
; | |
/* no need to delete from PostTags since there is a cascading Foreign Key which handles the DELETE */ | |
END | |
; | |
GO |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment