Instantly share code, notes, and snippets.
Created
February 2, 2024 12:54
-
Save samot1/ba98a0838eddcbb225ef5bce13e9ec1c to your computer and use it in GitHub Desktop.
Brent Ozar's weekly task - Splitting not normalized StackOverflow.dbo.Posts.Tags for fast search using indexed view
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* https://www.brentozar.com/archive/2024/01/query-exercise-find-tagged-questions-faster | |
Goal: | |
The dbo.Posts table has a column [Tags] which contains 1 to 5 tags, e.g. '<a_tag><b_tag>' or just '<a_tag>' or '<a_tag><b_tag><c_tag><d_tag><e_tag>' (or NULL) | |
You have to finde the TOP (x) highest scores for a given tag but must not change the structure of the table (besides indexes), so normalization of the table is no option. | |
SELECT TOP 100 * | |
FROM dbo.Posts | |
WHERE Tags LIKE '%<sql-server>%' | |
ORDER BY Score DESC; | |
*/ | |
USE StackOverflow2013 | |
GO | |
CREATE OR ALTER VIEW dbo.v_post_tags | |
WITH SCHEMABINDING | |
/* | |
manually splits the up to 5 tags into separate columns | |
View is very ugly, since it is an indexed view and you can't neither use CROSS APLLY (for STRING_SPLIT()) nor UNION ALL nor CTE's nor subselects nor PIVOT in an indexed view, so it repeats it often self | |
After changing the view, you have to recreate the indexes (will be automatically dropped): | |
CREATE UNIQUE CLUSTERED INDEX idx_v_post_tags_1 ON dbo.v_post_tags (tag_1, Score, Id) | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_2 ON dbo.v_post_tags (tag_2, Score, Id) | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_3 ON dbo.v_post_tags (tag_3, Score, Id) | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_4 ON dbo.v_post_tags (tag_4, Score, Id) | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_5 ON dbo.v_post_tags (tag_5, Score, Id) | |
*/ | |
AS | |
SELECT id, p.Tags, p.Score | |
, SUBSTRING(p.Tags, 2, CHARINDEX('>', p.Tags) - 2) AS tag_1 | |
, CASE WHEN p.Tags LIKE '<%<%<%' | |
THEN SUBSTRING(p.Tags | |
, CHARINDEX('>', p.Tags) + 2 | |
, CHARINDEX('>', SUBSTRING(p.Tags, CHARINDEX('>', p.Tags) + 2, 150)) - 1 | |
) | |
ELSE NULL -- no 2nd tag | |
END AS tag_2 | |
, CASE WHEN p.Tags LIKE '<%<%<%' | |
THEN SUBSTRING(p.Tags | |
, LEN(CONCAT_WS('><' | |
, SUBSTRING(p.Tags, 2, CHARINDEX('>', p.Tags) - 2) -- Tag_1 | |
, SUBSTRING(p.Tags | |
, CHARINDEX('>', p.Tags) + 2 | |
, CHARINDEX('>', SUBSTRING(p.Tags, CHARINDEX('>', p.Tags) + 2, 150)) - 1 | |
) -- tag_2 | |
) ) + 4 | |
, CHARINDEX('>', | |
SUBSTRING(p.Tags | |
, LEN(CONCAT_WS('><' | |
, SUBSTRING(p.Tags, 2, CHARINDEX('>', p.Tags) - 2) -- Tag_1 | |
, SUBSTRING(p.Tags | |
, CHARINDEX('>', p.Tags) + 2 | |
, CHARINDEX('>', SUBSTRING(p.Tags, CHARINDEX('>', p.Tags) + 2, 150)) - 1 | |
) -- tag_2 | |
) ) + 4 | |
, 150) | |
) -1 | |
) | |
ELSE NULL -- no 3rd tag | |
END AS tag_3 | |
, CASE WHEN p.Tags LIKE '<%<%<%<%<%' -- 5 tags | |
THEN REVERSE(SUBSTRING(REVERSE(p.Tags) | |
, CHARINDEX('<', REVERSE(p.Tags)) + 2 | |
, CHARINDEX('<', SUBSTRING(REVERSE(p.Tags), CHARINDEX('<', REVERSE(p.Tags)) + 2, 150)) - 1 | |
) ) -- then reversed code from tag_2 | |
WHEN p.Tags LIKE '<%<%<%<%' -- 4 tags | |
THEN REVERSE(SUBSTRING(REVERSE(p.Tags), 2, CHARINDEX('<', REVERSE(p.Tags)) - 2)) -- then reversed code from tag_1 | |
ELSE NULL | |
END AS tag_4 | |
, CASE WHEN p.Tags LIKE '<%<%<%<%<%' -- 5 tags | |
THEN REVERSE(SUBSTRING(REVERSE(p.Tags), 2, CHARINDEX('<', REVERSE(p.Tags)) - 2)) -- then reversed code from tag_1 | |
ELSE NULL | |
END AS tag_5 | |
FROM dbo.Posts AS p | |
WHERE p.Tags IS NOT NULL AND p.Tags LIKE '<%>' | |
GO | |
-- create the indexes on the view -- duration ~ 45 sec for the StackOverFlow2013 database on my laptop | |
-- you can't filter indexes on views for e.g. tag_2 IS NOT NULL | |
CREATE UNIQUE CLUSTERED INDEX idx_v_post_tags_1 ON dbo.v_post_tags (tag_1, Score, Id); -- there MUST be an clustered index on the view | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_2 ON dbo.v_post_tags (tag_2, Score, Id) | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_3 ON dbo.v_post_tags (tag_3, Score, Id) | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_4 ON dbo.v_post_tags (tag_4, Score, Id) | |
CREATE NONCLUSTERED INDEX idx_v_post_tags_5 ON dbo.v_post_tags (tag_5, Score, Id) | |
GO | |
CREATE OR ALTER VIEW dbo.v_post_query_tags AS | |
-- view that UNIONs ALL 5 tags to be able to easily query it without having to use '<tag>' in (tag_1, tag_2, tag_3, tag_4, tag_5) | |
SELECT vpt.id, vpt.Score, vpt.tag_1 AS tag | |
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND) -- without the NOEXPAND it wouldn't use the index | |
UNION ALL | |
SELECT vpt.id, vpt.Score, vpt.tag_2 AS tag | |
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND) | |
WHERE vpt.tag_2 IS NOT NULL | |
UNION ALL | |
SELECT vpt.id, vpt.Score, vpt.tag_3 AS tag | |
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND) | |
WHERE vpt.tag_3 IS NOT NULL | |
UNION ALL | |
SELECT vpt.id, vpt.Score, vpt.tag_4 AS tag | |
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND) | |
WHERE vpt.tag_4 IS NOT NULL | |
UNION ALL | |
SELECT vpt.id, vpt.Score, vpt.tag_5 AS tag | |
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND) | |
WHERE vpt.tag_5 IS NOT NULL | |
; | |
GO | |
SELECT TOP 10 * FROM dbo.v_post_query_tags AS vpq WHERE vpq.tag = 'ora-31011' ORDER BY vpq.Score DESC -- one occurence - 22 reads | |
SELECT TOP 10 * FROM dbo.v_post_query_tags AS vpq WHERE vpq.tag = 'iphone-sdk-4.0.1' ORDER BY vpq.Score DESC -- 10 occurences - 26 reads | |
SELECT TOP 10 * FROM dbo.v_post_query_tags AS vpq WHERE vpq.tag = 'sql-server' ORDER BY vpq.Score DESC -- 75k occurences - 946 reads | |
-- no noticable delay; 53 additional reads in v_post_tags | |
UPDATE dbo.Posts SET Tags = Tags + '<ora-31011>' WHERE Id = 3571036 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment