Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Montro1981/ab10d261ac5cee1096583539d1605f4c to your computer and use it in GitHub Desktop.
Save Montro1981/ab10d261ac5cee1096583539d1605f4c to your computer and use it in GitHub Desktop.
Brent Ozar Query Exercise: Find Tagged Questions Faster with Trigrams
/*
After reading the site Vedran Galic linked (and it was in Brent's mail of yesterday) I gave the trigram solution a try.
Summary:
It makes finding often used tags slower from the overhead from generating and searching the trigrams table (about 10X in miliseconds).
However, finding the outliers is way faster though (110X faster).
LIKE %sql-server%: 103 ms
LIKE %progress-reports%: 10161 ms
Trigram search %sql-server%: 1013 ms
Trigram search %progress-reports%: 89 ms
*/
CREATE OR ALTER FUNCTION dbo.GenerateTagTrigrams
(
@tags nvarchar(150)
)
RETURNS TABLE WITH SCHEMABINDING
AS
RETURN
(
WITH Ten AS
(
SELECT Ten = Digit.Ten, Weigth = COUNT(Digit.Ten) OVER ()
FROM (VALUES (0), (1), (2), (3), (4), (5), (6), (7), (8), (9)) AS Digit (Ten)
), Hundred AS
(
SELECT Hundred = Ten.Ten + (Ten.Weigth * Hundred.Ten), Weigth = Ten.Weigth * Ten.Weigth
FROM Ten AS Ten CROSS APPLY Ten AS Hundred
), ThreeHundred AS
(
SELECT ThreeHundred = (Digit.Three * ThreeHundred.Weigth) + ThreeHundred.Hundred + 1
FROM (VALUES (0), (1), (2)) AS Digit (Three)
CROSS APPLY Hundred AS ThreeHundred
), TagTrigrams AS
(
SELECT TOP (CASE WHEN LEN(@tags) > 2 THEN LEN(@tags) - 2 ELSE 0 END)
Trigram = SUBSTRING(@tags, threeHund.ThreeHundred, 3)
FROM ThreeHundred AS threeHund
ORDER BY threeHund.ThreeHundred
)
SELECT DISTINCT t.trigram
FROM TagTrigrams AS t
WHERE t.trigram COLLATE Latin1_General_BIN2 NOT LIKE '%[^A-Z0-9a-z]%'
);
GO
DROP TABLE IF EXISTS dbo.TagsTrigrams;
CREATE TABLE dbo.TagsTrigrams
(
PostId int NOT NULL
,Trigram nchar(3) NOT NULL
);
INSERT INTO dbo.TagsTrigrams WITH (TABLOCKX)
(
PostId
,Trigram
)
SELECT
p.Id
,t.Trigram
FROM dbo.Posts AS p
CROSS APPLY dbo.GenerateTagTrigrams (tags) AS t;
GO
ALTER TABLE dbo.TagsTrigrams ADD CONSTRAINT pk_TagsTrigrams PRIMARY KEY CLUSTERED (Trigram, PostId);
GO
CREATE OR ALTER VIEW dbo.TagsTrigramCounts
WITH SCHEMABINDING AS
SELECT tt.Trigram, NumberOfOccurrences = COUNT_BIG(*)
FROM dbo.TagsTrigrams AS tt
GROUP BY tt.Trigram;
GO
CREATE UNIQUE CLUSTERED INDEX idx_TagsTrigramCounts ON dbo.TagsTrigramCounts (Trigram);
GO
CREATE OR ALTER FUNCTION dbo.PostTag_GetBestTrigrams
(
@searchTag nvarchar(50)
)
RETURNS TABLE WITH SCHEMABINDING
AS
RETURN
(
SELECT
TagTrigram1 = MAX(CASE WHEN bestTrigrams.TrigramOrder = 1 THEN bestTrigrams.Trigram END)
,TagTrigram2 = MAX(CASE WHEN bestTrigrams.TrigramOrder = 2 THEN bestTrigrams.Trigram END)
,TagTrigram3 = MAX(CASE WHEN bestTrigrams.TrigramOrder = 3 THEN bestTrigrams.Trigram END)
FROM
(
SELECT TOP 3
TrigramOrder = ROW_NUMBER() OVER (ORDER BY ttc.NumberOfOccurrences ASC), st.Trigram
FROM dbo.GenerateTagTrigrams(@searchTag) AS st
INNER JOIN dbo.TagsTrigramCounts AS ttc WITH (NOEXPAND) ON ttc.Trigram = st.Trigram
ORDER BY ttc.NumberOfOccurrences ASC
) AS bestTrigrams
);
GO
CREATE OR ALTER FUNCTION dbo.GetTagTrigramPostIDs
(
@TagTrigram1 nchar(3)
,@TagTrigram2 nchar(3)
,@TagTrigram3 nchar(3)
)
RETURNS @IDs TABLE
(
PostId int PRIMARY KEY
) WITH SCHEMABINDING AS
BEGIN /* Begin function body */
IF @TagTrigram1 IS NOT NULL
BEGIN
IF @TagTrigram2 IS NOT NULL
BEGIN
IF @TagTrigram3 IS NOT NULL
BEGIN
INSERT @IDs (PostId)
SELECT tt1.PostId
FROM dbo.TagsTrigrams AS tt1
WHERE tt1.Trigram = @TagTrigram1
INTERSECT
SELECT tt2.PostId
FROM dbo.TagsTrigrams AS tt2
WHERE tt2.Trigram = @TagTrigram2
INTERSECT
SELECT tt3.PostId
FROM dbo.TagsTrigrams AS tt3
WHERE tt3.Trigram = @TagTrigram3
OPTION (MERGE JOIN);
END;
ELSE
BEGIN
INSERT @IDs (PostId)
SELECT tt1.PostId
FROM dbo.TagsTrigrams AS tt1
WHERE tt1.Trigram = @TagTrigram1
INTERSECT
SELECT tt2.PostId
FROM dbo.TagsTrigrams AS tt2
WHERE tt2.Trigram = @TagTrigram2
OPTION (MERGE JOIN);
END;
END;
ELSE
BEGIN
INSERT @IDs (PostId)
SELECT tt1.PostId
FROM dbo.TagsTrigrams AS tt1
WHERE tt1.Trigram = @TagTrigram1
END;
END;
RETURN;
END; /* End function body */
GO
CREATE OR ALTER FUNCTION dbo.PostTags_TrigramSearch
(
@Tag nvarchar(150)
)
RETURNS TABLE WITH SCHEMABINDING
AS
RETURN
SELECT Posts.Id, Posts.AcceptedAnswerId, Posts.AnswerCount, Posts.Body, Posts.ClosedDate, Posts.CommentCount, Posts.CommunityOwnedDate, Posts.CreationDate, Posts.FavoriteCount, Posts.LastActivityDate, Posts.LastEditDate, Posts.LastEditorDisplayName, Posts.LastEditorUserId, Posts.OwnerUserId, Posts.ParentId, Posts.PostTypeId, Posts.Score, Posts.Tags, Posts.Title, Posts.ViewCount
FROM dbo.PostTag_GetBestTrigrams(@Tag) AS ptt
CROSS APPLY
(
SELECT p.Id, p.AcceptedAnswerId, p.AnswerCount, p.Body, p.ClosedDate, p.CommentCount, p.CommunityOwnedDate, p.CreationDate, p.FavoriteCount, p.LastActivityDate, p.LastEditDate, p.LastEditorDisplayName, p.LastEditorUserId, p.OwnerUserId, p.ParentId, p.PostTypeId, p.Score, p.Tags, p.Title, p.ViewCount
FROM dbo.GetTagTrigramPostIDs (ptt.TagTrigram1, ptt.TagTrigram2, ptt.TagTrigram3) AS trigrams
INNER JOIN dbo.Posts AS p ON p.id = trigrams.PostId
WHERE ptt.TagTrigram1 IS NOT NULL AND p.Tags LIKE @Tag AND p.Tags IS NOT NULL
UNION ALL
SELECT p.Id, p.AcceptedAnswerId, p.AnswerCount, p.Body, p.ClosedDate, p.CommentCount, p.CommunityOwnedDate, p.CreationDate, p.FavoriteCount, p.LastActivityDate, p.LastEditDate, p.LastEditorDisplayName, p.LastEditorUserId, p.OwnerUserId, p.ParentId, p.PostTypeId, p.Score, p.Tags, p.Title, p.ViewCount
FROM dbo.Posts AS p
WHERE ptt.TagTrigram1 IS NULL AND p.Tags LIKE @Tag AND p.Tags IS NOT NULL
) AS Posts;
GO
SET NOCOUNT ON;
DECLARE
@StartTime datetime2
,@ElapsedTime int;
SET @StartTime = SYSUTCDATETIME();
SELECT TOP 100 *
FROM dbo.Posts
WHERE Tags LIKE '%sql-server%'
ORDER BY Score DESC;
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME());
PRINT 'LIKE %sql-server%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms';
SET @StartTime = SYSUTCDATETIME();
SELECT TOP 100 *
FROM dbo.Posts
WHERE Tags LIKE '%progress-reports%'
ORDER BY Score DESC;
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME());
PRINT 'LIKE %progress-reports%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms';
SET @StartTime = SYSUTCDATETIME();
SELECT TOP 100 Score, Id
FROM dbo.PostTags_TrigramSearch(N'%sql-server%')
ORDER BY Score DESC;
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME());
PRINT 'Trigram search %sql-server%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms';
SET @StartTime = SYSUTCDATETIME();
SELECT TOP 100 *
FROM dbo.PostTags_TrigramSearch(N'%progress-reports%')
ORDER BY Score DESC;
SET @ElapsedTime = DATEDIFF(MILLISECOND, @StartTime, SYSUTCDATETIME());
PRINT 'Trigram search %progress-reports%: ' + CAST(@ElapsedTime AS nvarchar(4000)) + N' ms';
SET NOCOUNT OFF;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment