Skip to content

Instantly share code, notes, and snippets.

@SQLAdrian
Last active April 17, 2024 03:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SQLAdrian/a2fe74af4abcc00a4e9147dfcb68288f to your computer and use it in GitHub Desktop.
Save SQLAdrian/a2fe74af4abcc00a4e9147dfcb68288f to your computer and use it in GitHub Desktop.
IN versus JOIN performance SQL2022
--Tested on 24 core @ 2.5GHz, SELECT cpu_count FROM sys.dm_os_sys_info;
--Microsoft SQL Server 2022 Developer Edition (RTM-GDR) (KB5035432) - 16.0.1115.1 (X64)
--I will be using the Estimate Subtree cost to compare each select statement.
--Test carried out on E4 with 10,000 rows. It would be stated where more rows were used.
--Lookup table is set to 80% of the size of the source table.
--tl;dr.
--It really depends.
--Generally the IN() and JOIN perform similar. Tested to 1m rows.
--IN() benefits greatly from appropriate indexes on 0 to 250k rows. More so than a JOIN. Surprisingly.
--JOIN beneftis greatly from appropriate indexes on 250k+ rows
--NOT(IN()) on charater fields is slower than a JOIN irrespective of indexes and row count
--NOT(IN()) on charater fields degrades substantially on higher row counts, 1m+
IF OBJECT_ID('tempdb..#maintable') IS NOT NULL
DROP TABLE #maintable;
CREATE TABLE #maintable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20))
IF OBJECT_ID('tempdb..#jointable') IS NOT NULL
DROP TABLE #jointable;
CREATE TABLE #jointable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20))
/*It's best to crack open the query plan for this and walk through each one*/
/*Thanks to Jeff Moden for sponsoring the CTE. https://www.sqlservercentral.com/articles/tally-oh-an-improved-sql-8k-%e2%80%9ccsv-splitter%e2%80%9d-function*/
;WITH E1(N) AS (
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
), --10E+1 or 10 rows
E2(N) AS (SELECT 1 FROM E1 a, E1 b), --10E+2 or 100 rows
E4(N) AS (SELECT 1 FROM E2 a, E2 b), --10E+4 or 10,000 rows max
/*All yours*/
Emore(N) AS (SELECT 1 FROM E4 a, E2 b) --and then some, but stop at 1 million
INSERT #maintable([N], [Other], Invoice)
SELECT CASE WHEN RIGHT(RAND(ROW_NUMBER() OVER (Order by N)),1) <5 THEN 0 ELSE 1 END
, ROW_NUMBER() OVER (Order by N) [Other]
, 'INV' + CONVERT(VARCHAR(15),ROW_NUMBER() OVER (Order by N)) Invoice
/*As it seems RAND uses the system timestamp it's almost sequention on the CTE, so use the tail of the RAND as input*/
FROM Emore;
INSERT #jointable([N], [Other], Invoice)
SELECT TOP 80 PERCENT CASE WHEN RIGHT(RAND(ID),1) <5 THEN 0 ELSE 1 END
, [Other]
, [Invoice]
FROM #maintable
/*You can break the section below out to test as individual sections, just drop the #temp tables afterwards*/
/*1. Tie. Similar performance on NOT(IN) and LEFT OUTER JOIN.
49%/51% split. Lower is better.
NOT(IN) just edging out on a cost of 0.246, compared to the JOIN at 0.2578*/
/*Let's try joining on TINYINT field using NOT(IN))*/
SELECT COUNT(T1.ID) FROM #maintable T1
WHERE NOT(ID IN ( SELECT ID FROM #jointable))
/*Let's try joining on numbers using LEFT OUTER and looking for unjoined items*/
SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.ID = T2.ID
WHERE T2.ID IS NULL
/*2. Faster JOIN. Very different profile when excluding on a CHAR field.
85%/15% split. Lower is better. At 200k rows this changes to 91%/9%
NOT(IN) has a higher cost at 2.324, compared to the JOIN at 0.421
TAKES LOOONG TO RUN on Emore CTE. 96%/4% split, so NOT(IN()) degrades with higher row count*/
/*How about NOT(IN)) just joining on a CHAR field*/
SELECT COUNT(T1.ID) FROM #maintable T1
WHERE NOT(Other IN ( SELECT Other FROM #jointable))
SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Other = T2.Other
WHERE T2.ID IS NULL
/*3. Tie. Similar performance for TINYINT IN() compared to INNER JOIN.
50%/50% split. Lower is better. Stays consistent on higher row counts
0.255 cost for both*/
/*Let's try joining on tinyint using (IN))*/
SELECT COUNT(T1.ID )
FROM #maintable T1
WHERE (ID IN ( SELECT ID FROM #jointable))
SELECT COUNT(T1.ID )
FROM #maintable T1
INNER JOIN #jointable T2 ON T1.ID = T2.ID
/*4. Tie. Similar performance for CHAR field using IN().
50%/50% split. Lower is better.
0.396 cost for both*/
/*Same as above, just joining on a CHAR field*/
SELECT COUNT(T1.ID )
FROM #maintable T1
WHERE (Other IN ( SELECT Other FROM #jointable))
SELECT COUNT(T1.ID )
FROM #maintable T1
INNER JOIN #jointable T2 ON T1.Other = T2.Other
/*Even when adding an index on the OTHER field, it remains a 50/50*/
/*5. Tie. Similar performance for the VARCHAR field using IN().
50%/50% split. Lower is better.
0.353 cost for both*/
SELECT COUNT(T1.ID )
FROM #maintable T1
WHERE (Invoice IN ( SELECT Invoice FROM #jointable))
SELECT COUNT(T1.ID )
FROM #maintable T1
INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice
/*6. Faster IN(). Vastly faster IN() when indexed on VARCHAR. Just to spicy things up, we'll add some indexing.
28%/72% split. Lower is better. At 200k rows this changes to 43%/57%
0.123 for the IN() and 0.317 for the JOIN
Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 65%/35% split.*/
CREATE NONCLUSTERED INDEX IX_TEST ON #maintable(Invoice)
CREATE NONCLUSTERED INDEX IX_TEST ON #jointable(Invoice)
SELECT COUNT(T1.ID )
FROM #maintable T1
WHERE (Invoice IN ( SELECT Invoice FROM #jointable))
SELECT COUNT(T1.ID )
FROM #maintable T1
INNER JOIN #jointable T2 ON T1.Invoice = T2.Invoice
/*7. Faster JOIN. Very different profile when excluding on a VARCHAR field.
85%/15% split. Lower is better. At 200k rows this changes to 92%/8%
NOT(IN) has a higher cost at 2.010, compared to the JOIN at 0.358*/
/*How about NOT(IN)) just joining on a CHAR field*/
SELECT COUNT(T1.ID) FROM #maintable T1
WHERE NOT(Invoice IN ( SELECT Invoice FROM #jointable))
SELECT COUNT(T1.ID) FROM #maintable T1 LEFT OUTER JOIN #jointable T2 ON T1.Invoice = T2.Invoice
WHERE T2.ID IS NULL
/*8. Faster IN(). Seeing as indexing benefitted the IN(), let's try again on the ID field
35%/65% split. Lower is better. At 200k rows this changes to 46%/54%. At 500k rows this changes to 73%/27%
IN() benefits from the index.
Fater JOIN on more rows. On 1m+ rows and going parallel this changes to benefit the JOIN at 64%/36% split.*/
CREATE NONCLUSTERED INDEX IX_TEST2 ON #maintable(ID)
CREATE NONCLUSTERED INDEX IX_TEST2 ON #jointable(ID)
SELECT COUNT(T1.ID )
FROM #maintable T1
WHERE (ID IN ( SELECT ID FROM #jointable))
SELECT COUNT(T1.ID )
FROM #maintable T1
INNER JOIN #jointable T2 ON T1.ID = T2.ID
/*Clean up*/
IF OBJECT_ID('tempdb..#maintable') IS NOT NULL
DROP TABLE #maintable;
IF OBJECT_ID('tempdb..#jointable') IS NOT NULL
DROP TABLE #jointable;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment