Skip to content

Instantly share code, notes, and snippets.

@samot1
Created February 2, 2024 12:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samot1/5269da8ce06b47fe7f57f77a48f70247 to your computer and use it in GitHub Desktop.
Save samot1/5269da8ce06b47fe7f57f77a48f70247 to your computer and use it in GitHub Desktop.
Normalize StackOverFlow's Posts table (Tags)
/* https://www.brentozar.com/archive/2024/01/finding-tagged-questions-faster-answers-discussion/#comments
Task: "Fix" not existing normalization of the Tags column in the Posts table without breaking old code
*/
/* in a real world example where you really replace the original table Posts instead of working with [Post] you would rename the old table and change the INSERT SELECT FROM to Posts_old
EXEC sys.sp_rename @objname = 'dbo.Posts'
, @newname = 'Posts_old'
, @objtype = 'object'
*/
GO
CREATE SEQUENCE dbo.s_PostId AS INTEGER;
DECLARE @PostId INTEGER = (SELECT TOP 1 p.Id FROM dbo.Posts AS p ORDER BY p.Id DESC) /* TOP 1 is faster than a MAX(id), since it reads just 3 pages (index backward scan) instead of the whole table */
, @cmd VARCHAR(500)
SET @cmd = CONCAT('ALTER SEQUENCE dbo.s_PostId RESTART WITH ', @PostId)
EXEC (@cmd)
;
GO
/* =====================================================================================================================*/
GO
CREATE TABLE dbo.PostsData
(Id INT NOT NULL CONSTRAINT DF_PostsData_Id DEFAULT NEXT VALUE FOR dbo.s_PostId
, AcceptedAnswerId INT
, AnswerCount INT
, Body NVARCHAR(MAX) NOT NULL
, ClosedDate DATETIME
, CommentCount INT
, CommunityOwnedDate DATETIME
, CreationDate DATETIME NOT NULL
, FavoriteCount INT
, LastActivityDate DATETIME NOT NULL
, LastEditDate DATETIME
, LastEditorDisplayName NVARCHAR(40)
, LastEditorUserId INT
, OwnerUserId INT
, ParentId INT
, PostTypeId INT NOT NULL
, Score INT NOT NULL
, Title NVARCHAR(250)
, ViewCount INT NOT NULL)
;
GO
ALTER TABLE dbo.PostsData ADD CONSTRAINT PK_PostsData_Id PRIMARY KEY CLUSTERED (Id) WITH (DATA_COMPRESSION = PAGE); /* no need to waste space on disk and in RAM - except you are so CPU bound that can't spare the 1-2% extra CPU */
GO
/* all columns except Tags*/
INSERT INTO dbo.PostsData (Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount)
SELECT p.Id, p.AcceptedAnswerId, p.AnswerCount, p.Body, p.ClosedDate, p.CommentCount, p.CommunityOwnedDate, p.CreationDate, p.FavoriteCount, p.LastActivityDate, p.LastEditDate, p.LastEditorDisplayName, p.LastEditorUserId, p.OwnerUserId, p.ParentId, p.PostTypeId, p.Score, p.Title, p.ViewCount
FROM dbo.Posts AS p;
GO
/* if you have other indexes on dbo.Posts that not includes [Tags] add them here too*/
GO
GO
CREATE TABLE dbo.PostsTags ( Id BIGINT IDENTITY NOT NULL /* not INT, since there may be up to 5 times as many Tags as Posts and PostId is already an INT*/
CONSTRAINT pk_PostTags PRIMARY KEY NONCLUSTERED /* not really needed, but we will follow the usual scheme in the StackOverflow database */
, PostId INT NOT NULL
, Tag NVARCHAR(150) NOT NULL /*theoretical someone could have created a single Tag with 148 chars on a post*/
)
go
CREATE UNIQUE CLUSTERED INDEX iuc_PostsTags__PostId__Tag ON dbo.PostsTags (PostId, Tag) WITH (DATA_COMPRESSION = PAGE)
go
ALTER TABLE dbo.PostsTags ADD CONSTRAINT FK_PostsTags__PostsData
FOREIGN KEY (PostID)
REFERENCES dbo.PostsData (Id)
ON UPDATE CASCADE ON DELETE CASCADE; /* delete the Tags automatical, if the Post was deleted (and update the PostId if an Update happen, which is very unlikely, but you'll never know */
GO
INSERT INTO dbo.PostsTags (PostId, Tag)
SELECT p.Id AS Post_id
, SUBSTRING(ss.value, 2, 150) AS Tag -- remove the leading '<'
FROM dbo.Posts AS p
CROSS APPLY STRING_SPLIT(p.Tags, '>') AS ss
WHERE p.Tags IS NOT NULL /* don't bother to try to split empty Tags*/
AND ss.value <> '' /* STRING_SPLIT() does not return NULL but empty strings after the last Tag */
GO
/* creating the index AFTER we filled the table will be faster then before the INSERT */
CREATE NONCLUSTERED INDEX iuc_PostTags__Tag ON dbo.PostsTags (Tag) INCLUDE (PostId) WITH (DATA_COMPRESSION = PAGE)
;
GO
/* =====================================================================================================================*/
/* =====================================================================================================================*/
CREATE OR ALTER VIEW dbo.Post /* used dbo.Post instead of dbo.Posts because I don't want to mess up my StackOverflow database */
AS /* NEVER use SELECT * in a view - it can cause the content to be listed in the wrong columns when you drop a column from a source table */
SELECT pd.Id
, pd.AcceptedAnswerId
, pd.AnswerCount
, pd.Body
, pd.ClosedDate
, pd.CommentCount
, pd.CommunityOwnedDate
, pd.CreationDate
, pd.FavoriteCount
, pd.LastActivityDate
, pd.LastEditDate
, pd.LastEditorDisplayName
, pd.LastEditorUserId
, pd.OwnerUserId
, pd.ParentId
, pd.PostTypeId
, pd.Score
, pd.Title
, pd.ViewCount
, pt.Tags
FROM dbo.PostsData AS pd
LEFT JOIN (SELECT pt.PostId
, '<' + STRING_AGG(pt.Tag, '><') + '>' AS Tags /* add WITHIN GROUP (ORDER BY pt.Id) if you really need the original order (will reduce the performance) OR change the Clustered Index to (PostId + Id)*/
FROM dbo.PostsTags AS pt
GROUP BY pt.PostId
) AS pt
ON pt.PostId = pd.Id
GO
/* =====================================================================================================================*/
GO
CREATE OR ALTER TRIGGER tr_Post_before_insert ON dbo.Post
INSTEAD OF INSERT
AS
BEGIN
IF @@ROWCOUNT = 0 RETURN;
/* has to use a #temp table, since I found no way to let OUTPUT return Inserted.Id and i.Tags when inserting into the dbo.PostsData*/
CREATE TABLE #tags
(Id INT NOT NULL PRIMARY KEY /* using DEFAULT NEXT VALUE FOR dbo.s_PostId will not work for #temp tables (Object not found), since it is in the [tempdb], but it is no good idea to refer hardcoded to the database + schema + sequence*/
, AcceptedAnswerId INT
, AnswerCount INT
, Body NVARCHAR(MAX) NOT NULL
, ClosedDate DATETIME
, CommentCount INT
, CommunityOwnedDate DATETIME
, CreationDate DATETIME NOT NULL
, FavoriteCount INT
, LastActivityDate DATETIME NOT NULL
, LastEditDate DATETIME
, LastEditorDisplayName NVARCHAR(40)
, LastEditorUserId INT
, OwnerUserId INT
, ParentId INT
, PostTypeId INT NOT NULL
, Score INT NOT NULL
, Title NVARCHAR(250)
, ViewCount INT NOT NULL
, Tags NVARCHAR(150));
INSERT INTO #tags (Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount, Tags)
SELECT NEXT VALUE FOR dbo.s_PostId AS Id
, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount, Tags
FROM Inserted AS i
;
/* all columns except Tags */
INSERT INTO dbo.PostsData
( Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount)
SELECT t.Id, AcceptedAnswerId, AnswerCount, Body, ClosedDate, CommentCount, CommunityOwnedDate, CreationDate, FavoriteCount, LastActivityDate, LastEditDate, LastEditorDisplayName, LastEditorUserId, OwnerUserId, ParentId, PostTypeId, Score, Title, ViewCount
FROM #tags AS t
;
INSERT INTO dbo.PostsTags (PostId, Tag)
SELECT t.Id, SUBSTRING(ss.value, 2, 150) AS Tag
FROM #tags AS t
CROSS APPLY STRING_SPLIT(t.Tags, '>', 1) AS ss
WHERE t.Tags IS NOT NULL
AND ss.value <> ''
ORDER BY t.Id, ss.ordinal
;
END
GO
/* =====================================================================================================================*/
GO
CREATE OR ALTER TRIGGER tr_Post_before_update ON dbo.Post
INSTEAD OF UPDATE
AS
BEGIN
IF @@ROWCOUNT = 0 RETURN;
UPDATE pd
SET pd.AcceptedAnswerId = i.AcceptedAnswerId
, pd.AnswerCount = i.AnswerCount
, pd.Body = i.Body
, pd.ClosedDate = i.ClosedDate
, pd.CommentCount = i.CommentCount
, pd.CommunityOwnedDate = i.CommunityOwnedDate
, pd.CreationDate = i.CreationDate
, pd.FavoriteCount = i.FavoriteCount
, pd.LastActivityDate = i.LastActivityDate
, pd.LastEditDate = i.LastEditDate
, pd.LastEditorDisplayName = i.LastEditorDisplayName
, pd.LastEditorUserId = i.LastEditorUserId
, pd.OwnerUserId = i.OwnerUserId
, pd.ParentId = i.ParentId
, pd.PostTypeId = i.PostTypeId
, pd.Score = i.Score
, pd.Title = i.Title
, pd.ViewCount = i.ViewCount
FROM dbo.PostsData AS pd
INNER JOIN Inserted AS i
ON i.Id = pd.Id
/* before SQL 2022 use UPDATE(<column>) instead of pd.<column> IS DISTINCT FROM i.<column>*/
WHERE pd.AcceptedAnswerId IS DISTINCT FROM i.AcceptedAnswerId
OR pd.AnswerCount IS DISTINCT FROM i.AnswerCount
OR pd.Body IS DISTINCT FROM i.Body
OR pd.ClosedDate IS DISTINCT FROM i.ClosedDate
OR pd.CommentCount IS DISTINCT FROM i.CommentCount
OR pd.CommunityOwnedDate IS DISTINCT FROM i.CommunityOwnedDate
OR pd.CreationDate IS DISTINCT FROM i.CreationDate
OR pd.FavoriteCount IS DISTINCT FROM i.FavoriteCount
OR pd.LastActivityDate IS DISTINCT FROM i.LastActivityDate
OR pd.LastEditDate IS DISTINCT FROM i.LastEditDate
OR pd.LastEditorDisplayName IS DISTINCT FROM i.LastEditorDisplayName
OR pd.LastEditorUserId IS DISTINCT FROM i.LastEditorUserId
OR pd.OwnerUserId IS DISTINCT FROM i.OwnerUserId
OR pd.ParentId IS DISTINCT FROM i.ParentId
OR pd.PostTypeId IS DISTINCT FROM i.PostTypeId
OR pd.Score IS DISTINCT FROM i.Score
OR pd.Title IS DISTINCT FROM i.Title
OR pd.ViewCount IS DISTINCT FROM i.ViewCount
;
IF UPDATE(Tags) /* only when Tags was in the SET list of the UPDATE statement)*/
BEGIN
SELECT i.Id AS PostId, SUBSTRING(ss.value, 2, 150) AS Tag
INTO #tags
FROM Inserted AS i
CROSS APPLY STRING_SPLIT(i.Tags, '>') AS ss
WHERE i.Tags IS NOT NULL
AND ss.value <> ''
;
INSERT INTO dbo.PostsTags (PostId, Tag)
SELECT *
FROM #tags AS t
WHERE NOT EXISTS (SELECT * FROM dbo.PostsTags AS pt WHERE pt.PostId = t.PostId AND pt.Tag = t.Tag)
;
DELETE pt
FROM dbo.PostsTags AS pt
INNER JOIN Inserted AS i /* without this it would delete all Tags from Posts that were not just updated */
ON i.Id = pt.PostId
WHERE NOT EXISTS (SELECT * FROM #tags AS t WHERE pt.PostId = t.PostId AND pt.Tag = t.Tag)
;
END;
END
GO
/* =====================================================================================================================*/
GO
CREATE OR ALTER TRIGGER tr_Post_before_delete ON dbo.Post
INSTEAD OF DELETE
AS
BEGIN
IF @@ROWCOUNT IS NULL RETURN;
DELETE pd
FROM dbo.PostsData AS pd
INNER JOIN Deleted d
ON d.Id = pd.Id
;
/* no need to delete from PostTags since there is a cascading Foreign Key which handles the DELETE */
END
;
GO
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment