Skip to content

Instantly share code, notes, and snippets.

@samot1
Created February 2, 2024 12:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samot1/ba98a0838eddcbb225ef5bce13e9ec1c to your computer and use it in GitHub Desktop.
Save samot1/ba98a0838eddcbb225ef5bce13e9ec1c to your computer and use it in GitHub Desktop.
Brent Ozar's weekly task - Splitting not normalized StackOverflow.dbo.Posts.Tags for fast search using indexed view
/* https://www.brentozar.com/archive/2024/01/query-exercise-find-tagged-questions-faster
Goal:
The dbo.Posts table has a column [Tags] which contains 1 to 5 tags, e.g. '<a_tag><b_tag>' or just '<a_tag>' or '<a_tag><b_tag><c_tag><d_tag><e_tag>' (or NULL)
You have to finde the TOP (x) highest scores for a given tag but must not change the structure of the table (besides indexes), so normalization of the table is no option.
SELECT TOP 100 *
FROM dbo.Posts
WHERE Tags LIKE '%<sql-server>%'
ORDER BY Score DESC;
*/
USE StackOverflow2013
GO
CREATE OR ALTER VIEW dbo.v_post_tags
WITH SCHEMABINDING
/*
manually splits the up to 5 tags into separate columns
View is very ugly, since it is an indexed view and you can't neither use CROSS APLLY (for STRING_SPLIT()) nor UNION ALL nor CTE's nor subselects nor PIVOT in an indexed view, so it repeats it often self
After changing the view, you have to recreate the indexes (will be automatically dropped):
CREATE UNIQUE CLUSTERED INDEX idx_v_post_tags_1 ON dbo.v_post_tags (tag_1, Score, Id)
CREATE NONCLUSTERED INDEX idx_v_post_tags_2 ON dbo.v_post_tags (tag_2, Score, Id)
CREATE NONCLUSTERED INDEX idx_v_post_tags_3 ON dbo.v_post_tags (tag_3, Score, Id)
CREATE NONCLUSTERED INDEX idx_v_post_tags_4 ON dbo.v_post_tags (tag_4, Score, Id)
CREATE NONCLUSTERED INDEX idx_v_post_tags_5 ON dbo.v_post_tags (tag_5, Score, Id)
*/
AS
SELECT id, p.Tags, p.Score
, SUBSTRING(p.Tags, 2, CHARINDEX('>', p.Tags) - 2) AS tag_1
, CASE WHEN p.Tags LIKE '<%<%<%'
THEN SUBSTRING(p.Tags
, CHARINDEX('>', p.Tags) + 2
, CHARINDEX('>', SUBSTRING(p.Tags, CHARINDEX('>', p.Tags) + 2, 150)) - 1
)
ELSE NULL -- no 2nd tag
END AS tag_2
, CASE WHEN p.Tags LIKE '<%<%<%'
THEN SUBSTRING(p.Tags
, LEN(CONCAT_WS('><'
, SUBSTRING(p.Tags, 2, CHARINDEX('>', p.Tags) - 2) -- Tag_1
, SUBSTRING(p.Tags
, CHARINDEX('>', p.Tags) + 2
, CHARINDEX('>', SUBSTRING(p.Tags, CHARINDEX('>', p.Tags) + 2, 150)) - 1
) -- tag_2
) ) + 4
, CHARINDEX('>',
SUBSTRING(p.Tags
, LEN(CONCAT_WS('><'
, SUBSTRING(p.Tags, 2, CHARINDEX('>', p.Tags) - 2) -- Tag_1
, SUBSTRING(p.Tags
, CHARINDEX('>', p.Tags) + 2
, CHARINDEX('>', SUBSTRING(p.Tags, CHARINDEX('>', p.Tags) + 2, 150)) - 1
) -- tag_2
) ) + 4
, 150)
) -1
)
ELSE NULL -- no 3rd tag
END AS tag_3
, CASE WHEN p.Tags LIKE '<%<%<%<%<%' -- 5 tags
THEN REVERSE(SUBSTRING(REVERSE(p.Tags)
, CHARINDEX('<', REVERSE(p.Tags)) + 2
, CHARINDEX('<', SUBSTRING(REVERSE(p.Tags), CHARINDEX('<', REVERSE(p.Tags)) + 2, 150)) - 1
) ) -- then reversed code from tag_2
WHEN p.Tags LIKE '<%<%<%<%' -- 4 tags
THEN REVERSE(SUBSTRING(REVERSE(p.Tags), 2, CHARINDEX('<', REVERSE(p.Tags)) - 2)) -- then reversed code from tag_1
ELSE NULL
END AS tag_4
, CASE WHEN p.Tags LIKE '<%<%<%<%<%' -- 5 tags
THEN REVERSE(SUBSTRING(REVERSE(p.Tags), 2, CHARINDEX('<', REVERSE(p.Tags)) - 2)) -- then reversed code from tag_1
ELSE NULL
END AS tag_5
FROM dbo.Posts AS p
WHERE p.Tags IS NOT NULL AND p.Tags LIKE '<%>'
GO
-- create the indexes on the view -- duration ~ 45 sec for the StackOverFlow2013 database on my laptop
-- you can't filter indexes on views for e.g. tag_2 IS NOT NULL
CREATE UNIQUE CLUSTERED INDEX idx_v_post_tags_1 ON dbo.v_post_tags (tag_1, Score, Id); -- there MUST be an clustered index on the view
CREATE NONCLUSTERED INDEX idx_v_post_tags_2 ON dbo.v_post_tags (tag_2, Score, Id)
CREATE NONCLUSTERED INDEX idx_v_post_tags_3 ON dbo.v_post_tags (tag_3, Score, Id)
CREATE NONCLUSTERED INDEX idx_v_post_tags_4 ON dbo.v_post_tags (tag_4, Score, Id)
CREATE NONCLUSTERED INDEX idx_v_post_tags_5 ON dbo.v_post_tags (tag_5, Score, Id)
GO
CREATE OR ALTER VIEW dbo.v_post_query_tags AS
-- view that UNIONs ALL 5 tags to be able to easily query it without having to use '<tag>' in (tag_1, tag_2, tag_3, tag_4, tag_5)
SELECT vpt.id, vpt.Score, vpt.tag_1 AS tag
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND) -- without the NOEXPAND it wouldn't use the index
UNION ALL
SELECT vpt.id, vpt.Score, vpt.tag_2 AS tag
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND)
WHERE vpt.tag_2 IS NOT NULL
UNION ALL
SELECT vpt.id, vpt.Score, vpt.tag_3 AS tag
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND)
WHERE vpt.tag_3 IS NOT NULL
UNION ALL
SELECT vpt.id, vpt.Score, vpt.tag_4 AS tag
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND)
WHERE vpt.tag_4 IS NOT NULL
UNION ALL
SELECT vpt.id, vpt.Score, vpt.tag_5 AS tag
FROM dbo.v_post_tags AS vpt WITH (NOEXPAND)
WHERE vpt.tag_5 IS NOT NULL
;
GO
SELECT TOP 10 * FROM dbo.v_post_query_tags AS vpq WHERE vpq.tag = 'ora-31011' ORDER BY vpq.Score DESC -- one occurence - 22 reads
SELECT TOP 10 * FROM dbo.v_post_query_tags AS vpq WHERE vpq.tag = 'iphone-sdk-4.0.1' ORDER BY vpq.Score DESC -- 10 occurences - 26 reads
SELECT TOP 10 * FROM dbo.v_post_query_tags AS vpq WHERE vpq.tag = 'sql-server' ORDER BY vpq.Score DESC -- 75k occurences - 946 reads
-- no noticable delay; 53 additional reads in v_post_tags
UPDATE dbo.Posts SET Tags = Tags + '<ora-31011>' WHERE Id = 3571036
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment