Created
January 31, 2024 10:03
-
-
Save Montro1981/ab10d261ac5cee1096583539d1605f4c to your computer and use it in GitHub Desktop.
Brent Ozar Query Exercise: Find Tagged Questions Faster with Trigrams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
After reading the site Vedran Galic linked (and it was in Brent's mail of yesterday) I gave the trigram solution a try. | |
Summary: | |
It makes finding often used tags slower from the overhead from generating and searching the trigrams table (about 10X in miliseconds). | |
However, finding the outliers is way faster though (110X faster). | |
LIKE %sql-server%: 103 ms | |
LIKE %progress-reports%: 10161 ms | |
Trigram search %sql-server%: 1013 ms | |
Trigram search %progress-reports%: 89 ms | |
*/ | |
CREATE OR ALTER FUNCTION dbo.GenerateTagTrigrams | |
( | |
@tags nvarchar(150) | |
) | |
RETURNS TABLE WITH SCHEMABINDING | |
AS | |
RETURN | |
( | |
WITH Ten AS | |
( | |
SELECT Ten = Digit.Ten, Weigth = COUNT(Digit.Ten) OVER () | |
FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) AS Digit (Ten) | |
), Hundred AS | |
( | |
SELECT Hundred = Ten.Ten + (Ten.Weigth * Hundred.Ten), Weigth = Ten.Weigth * Ten.Weigth | |
FROM Ten AS Ten CROSS APPLY Ten AS Hundred | |
), ThreeHundred AS | |
( | |
SELECT ThreeHundred = (Digit.Three * ThreeHundred.Weigth) + ThreeHundred.Hundred + 1 | |
FROM (VALUES (0), (1), (2)) AS Digit (Three) | |
CROSS APPLY Hundred AS ThreeHundred | |
), TagTrigrams AS | |
( | |
SELECT TOP (CASE WHEN LEN(@tags) > 2 THEN LEN(@tags) - 2 ELSE 0 END) | |
Trigram = SUBSTRING(@tags, threeHund.ThreeHundred, 3) | |
FROM ThreeHundred AS threeHund | |
ORDER BY threeHund.ThreeHundred | |
) | |
SELECT DISTINCT t.trigram | |
FROM TagTrigrams AS t | |
WHERE t.trigram COLLATE Latin1_General_BIN2 NOT LIKE '%[^A-Z0-9a-z]%' | |
); | |
GO | |
DROP TABLE IF EXISTS dbo.TagsTrigrams; | |
CREATE TABLE dbo.TagsTrigrams | |
( | |
PostId int NOT NULL | |
,Trigram nchar(3) NOT NULL | |
); | |
INSERT INTO dbo.TagsTrigrams WITH (TABLOCKX) | |
( | |
PostId | |
,Trigram | |
) | |
SELECT | |
p.Id | |
,t.Trigram | |
FROM dbo.Posts AS p | |
CROSS APPLY dbo.GenerateTagTrigrams (tags) AS t; | |
GO | |
ALTER TABLE dbo.TagsTrigrams ADD CONSTRAINT pk_TagsTrigrams PRIMARY KEY CLUSTERED (Trigram, PostId); | |
GO | |
CREATE OR ALTER VIEW dbo.TagsTrigramCounts | |
WITH SCHEMABINDING AS | |
SELECT tt.Trigram, NumberOfOccurrences = COUNT_BIG(*) | |
FROM dbo.TagsTrigrams AS tt | |
GROUP BY tt.Trigram; | |
GO | |
CREATE UNIQUE CLUSTERED INDEX idx_TagsTrigramCounts ON dbo.TagsTrigramCounts (Trigram); | |
GO | |
CREATE OR ALTER FUNCTION dbo.PostTag_GetBestTrigrams | |
( | |
@searchTag nvarchar(50) | |
) | |
RETURNS TABLE WITH SCHEMABINDING | |
AS | |
RETURN | |
( | |
SELECT | |
TagTrigram1 = MAX(CASE WHEN bestTrigrams.TrigramOrder = 1 THEN bestTrigrams.Trigram END) | |
,TagTrigram2 = MAX(CASE WHEN bestTrigrams.TrigramOrder = 2 THEN bestTrigrams.Trigram END) | |
,TagTrigram3 = MAX(CASE WHEN bestTrigrams.TrigramOrder = 3 THEN bestTrigrams.Trigram END) | |
FROM | |
( | |
SELECT TOP 3 | |
TrigramOrder = ROW_NUMBER() OVER (ORDER BY ttc.NumberOfOccurrences ASC), st.Trigram | |
FROM dbo.GenerateTagTrigrams(@searchTag) AS st | |
INNER JOIN dbo.TagsTrigramCounts AS ttc WITH (NOEXPAND) ON ttc.Trigram = st.Trigram | |
ORDER BY ttc.NumberOfOccurrences ASC | |
) AS bestTrigrams | |
); | |
GO | |
CREATE OR ALTER FUNCTION dbo.GetTagTrigramPostIDs | |
( | |
@TagTrigram1 nchar(3) | |
,@TagTrigram2 nchar(3) | |
,@TagTrigram3 nchar(3) | |
) | |
RETURNS @IDs TABLE | |
( | |
PostId int PRIMARY KEY | |
) WITH SCHEMABINDING AS | |
BEGIN /* Begin function body */ | |
IF @TagTrigram1 IS NOT NULL | |
BEGIN | |
IF @TagTrigram2 IS NOT NULL | |
BEGIN | |
IF @TagTrigram3 IS NOT NULL | |
BEGIN | |
INSERT @IDs (PostId) | |
SELECT tt1.PostId | |
FROM dbo.TagsTrigrams AS tt1 | |
WHERE tt1.Trigram = @TagTrigram1 | |
INTERSECT | |
SELECT tt2.PostId | |
FROM dbo.TagsTrigrams AS tt2 | |
WHERE tt2.Trigram = @TagTrigram2 | |
INTERSECT | |
SELECT tt3.PostId | |
FROM dbo.TagsTrigrams AS tt3 | |
WHERE tt3.Trigram = @TagTrigram3 | |
OPTION (MERGE JOIN); | |
END; | |
ELSE | |
BEGIN | |
INSERT @IDs (PostId) | |
SELECT tt1.PostId | |
FROM dbo.TagsTrigrams AS tt1 | |
WHERE tt1.Trigram = @TagTrigram1 | |
INTERSECT | |
SELECT tt2.PostId | |
FROM dbo.TagsTrigrams AS tt2 | |
WHERE tt2.Trigram = @TagTrigram2 | |
OPTION (MERGE JOIN); | |
END; | |
END; | |
ELSE | |
BEGIN | |
INSERT @IDs (PostId) | |
SELECT tt1.PostId | |
FROM dbo.TagsTrigrams AS tt1 | |
WHERE tt1.Trigram = @TagTrigram1 | |
END; | |
END; | |
RETURN; | |
END; /* End function body */ | |
GO | |
CREATE OR ALTER FUNCTION dbo.PostTags_TrigramSearch | |
( | |
@Tag nvarchar(150) | |
) | |
RETURNS TABLE WITH SCHEMABINDING | |
AS | |
RETURN | |
SELECT Posts.Id, Posts.AcceptedAnswerId, Posts.AnswerCount, Posts.Body, Posts.ClosedDate, Posts.CommentCount, Posts.CommunityOwnedDate, Posts.CreationDate, Posts.FavoriteCount, Posts.LastActivityDate, Posts.LastEditDate, Posts.LastEditorDisplayName, Posts.LastEditorUserId, Posts.OwnerUserId, Posts.ParentId, Posts.PostTypeId, Posts.Score, Posts.Tags, Posts.Title, Posts.ViewCount | |
FROM dbo.PostTag_GetBestTrigrams(@Tag) AS ptt | |
CROSS APPLY | |
( | |
SELECT p.Id, p.AcceptedAnswerId, p.AnswerCount, p.Body, p.ClosedDate, p.CommentCount, p.CommunityOwnedDate, p.CreationDate, p.FavoriteCount, p.LastActivityDate, p.LastEditDate, p.LastEditorDisplayName, p.LastEditorUserId, p.OwnerUserId, p.ParentId, p.PostTypeId, p.Score, p.Tags, p.Title, p.ViewCount | |
FROM dbo.GetTagTrigramPostIDs (ptt.TagTrigram1, ptt.TagTrigram2, ptt.TagTrigram3) AS trigrams | |
INNER JOIN dbo.Posts AS p ON p.id = trigrams.PostId | |
WHERE ptt.TagTrigram1 IS NOT NULL AND p.Tags LIKE @Tag AND p.Tags IS NOT NULL | |
UNION ALL | |
SELECT p.Id, p.AcceptedAnswerId, p.AnswerCount, p.Body, p.ClosedDate, p.CommentCount, p.CommunityOwnedDate, p.CreationDate, p.FavoriteCount, p.LastActivityDate, p.LastEditDate, p.LastEditorDisplayName, p.LastEditorUserId, p.OwnerUserId, p.ParentId, p.PostTypeId, p.Score, p.Tags, p.Title, p.ViewCount | |
FROM dbo.Posts AS p | |
WHERE ptt.TagTrigram1 IS NULL AND p.Tags LIKE @Tag AND p.Tags IS NOT NULL | |
) AS Posts; | |
GO | |
SET NOCOUNT ON; | |
DECLARE | |
@StartTime datetime2 | |
,@ElapsedTime int; | |
SET @StartTime = SYSUTCDATETIME(); | |
SELECT TOP 100 * | |
FROM dbo.Posts | |
WHERE Tags LIKE '%sql-server%' | |
ORDER BY Score DESC; | |
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME()); | |
PRINT 'LIKE %sql-server%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms'; | |
SET @StartTime = SYSUTCDATETIME(); | |
SELECT TOP 100 * | |
FROM dbo.Posts | |
WHERE Tags LIKE '%progress-reports%' | |
ORDER BY Score DESC; | |
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME()); | |
PRINT 'LIKE %progress-reports%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms'; | |
SET @StartTime = SYSUTCDATETIME(); | |
SELECT TOP 100 Score, Id | |
FROM dbo.PostTags_TrigramSearch(N'%sql-server%') | |
ORDER BY Score DESC; | |
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME()); | |
PRINT 'Trigram search %sql-server%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms'; | |
SET @StartTime = SYSUTCDATETIME(); | |
SELECT TOP 100 * | |
FROM dbo.PostTags_TrigramSearch(N'%progress-reports%') | |
ORDER BY Score DESC; | |
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME()); | |
PRINT 'Trigram search %progress-reports%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms'; | |
SET NOCOUNT OFF; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment