SQLAdrian/IN versus JOIN performance SQL2022.sql

## IN versus JOIN performance SQL2022.sql
--Tested on 24 core @ 2.5GHz, SELECT cpu_count FROM sys.dm_os_sys_info;
--Microsoft SQL Server 2022 Developer Edition (RTM-GDR) (KB5035432) - 16.0.1115.1 (X64)
--The Estimated Subtree Cost is used to compare each SELECT statement.
--Tests were carried out on E4 with 10,000 rows; it is stated where more rows were used.
--Lookup table is set to 80% of the size of the source table.

--tl;dr.
--It really depends.
--Generally IN() and JOIN perform similarly. Tested up to 1m rows.
--IN() benefits greatly from appropriate indexes on 0 to 250k rows, more so than a JOIN, surprisingly.
--JOIN benefits greatly from appropriate indexes on 250k+ rows.
--NOT(IN()) on character fields is slower than a JOIN irrespective of indexes and row count.
--NOT(IN()) on character fields degrades substantially on higher row counts, 1m+.

/* Scratch tables for the benchmark: #maintable is the source table that every
   test query counts from, #jointable is the lookup table it is matched against.
   DROP TABLE IF EXISTS (SQL Server 2016+) replaces the OBJECT_ID() guard so the
   script can be re-run in the same session.
   NOTE(review): N, Other and Invoice are declared without NULL / NOT NULL, so
   their nullability follows the session defaults; NOT IN has to honour NULL
   semantics on nullable columns, which may contribute to the NOT(IN()) costs
   reported further down - confirm before drawing conclusions. */
DROP TABLE IF EXISTS #maintable;
CREATE TABLE #maintable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20));
DROP TABLE IF EXISTS #jointable;
CREATE TABLE #jointable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20));
/* Tip: open the execution plan and walk through each statement in turn. */
/* Row generator adapted from Jeff Moden's tally CTE:
   https://www.sqlservercentral.com/articles/tally-oh-an-improved-sql-8k-%e2%80%9ccsv-splitter%e2%80%9d-function */
;WITH E1(N) AS (
	SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
	SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
),                                                       -- 10 rows
E2(N) AS (SELECT 1 FROM E1 AS a CROSS JOIN E1 AS b),     -- 10 x 10 = 100 rows
E4(N) AS (SELECT 1 FROM E2 AS a CROSS JOIN E2 AS b),     -- 100 x 100 = 10,000 rows
/* Point Numbered at a different CTE to change the data volume. */
Emore(N) AS (SELECT 1 FROM E4 AS a CROSS JOIN E2 AS b),  -- 10,000 x 100 = 1,000,000 rows
/* One running number per generated row; it feeds all three inserted columns. */
Numbered(RowNo) AS (SELECT ROW_NUMBER() OVER (ORDER BY N) FROM Emore)
INSERT #maintable ([N], [Other], Invoice)
SELECT
	/* The author observed that RAND() output is close to sequential across the
	   generated rows, so only the last character of its string form is kept and
	   split at 5 to produce the 0/1 flag N. */
	CASE WHEN RIGHT(RAND(RowNo), 1) < 5 THEN 0 ELSE 1 END,
	RowNo,                                  -- stored in the CHAR(22) column Other
	'INV' + CONVERT(VARCHAR(15), RowNo)     -- e.g. INV1, INV2, ...
FROM Numbered;

/* Copy 80% of #maintable into the lookup table #jointable.
   ORDER BY ID pins which rows TOP ... PERCENT keeps; without an ORDER BY the
   choice of rows is arbitrary and may differ between runs.
   N is re-derived from the last digit of RAND(ID) rather than copied. */
INSERT #jointable ([N], [Other], Invoice)
SELECT TOP (80) PERCENT
	CASE WHEN RIGHT(RAND(ID), 1) < 5 THEN 0 ELSE 1 END,
	[Other],
	[Invoice]
FROM #maintable
ORDER BY ID;
/* The sections below can be run individually; just drop the #temp tables afterwards. */

/* 1. Tie. NOT(IN) and LEFT OUTER JOIN perform similarly on ID.
49%/51% split (lower is better).
NOT(IN) just edges it at a cost of 0.246, compared to the JOIN at 0.2578. */
	/* Exclusion via NOT IN on ID (declared INT IDENTITY) */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.ID NOT IN (SELECT lkp.ID FROM #jointable AS lkp);
	/* LEFT JOIN on ID, counting the #maintable rows that found no match */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	LEFT JOIN #jointable AS lkp
		ON src.ID = lkp.ID
	WHERE lkp.ID IS NULL;

/* 2. JOIN is faster. Excluding on the CHAR column Other has a very different profile.
85%/15% split (lower is better); at 200k rows this becomes 91%/9%.
NOT(IN) costs 2.324, compared to the JOIN at 0.421.
Takes a long time on the Emore CTE: 96%/4% split, so NOT(IN()) degrades as the row count grows. */
	/* Exclusion via NOT IN on the CHAR column Other */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Other NOT IN (SELECT lkp.Other FROM #jointable AS lkp);
	/* LEFT JOIN on Other, counting the #maintable rows that found no match */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	LEFT JOIN #jointable AS lkp
		ON src.Other = lkp.Other
	WHERE lkp.ID IS NULL;

/* 3. Tie. IN() and INNER JOIN perform similarly on ID
(the queries use the INT IDENTITY column ID, not the TINYINT column N).
50%/50% split (lower is better); stays consistent on higher row counts.
0.255 cost for both. */
	/* Membership test via IN on ID */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.ID IN (SELECT lkp.ID FROM #jointable AS lkp);
	/* INNER JOIN on ID */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.ID = lkp.ID;

/* 4. Tie. IN() and INNER JOIN perform similarly on the CHAR column Other.
50%/50% split (lower is better).
0.396 cost for both. */
	/* Membership test via IN on Other */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Other IN (SELECT lkp.Other FROM #jointable AS lkp);
	/* INNER JOIN on Other */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.Other = lkp.Other;
	/* Even with an index added on the Other column, the split remains 50/50. */

/* 5. Tie. IN() and INNER JOIN perform similarly on the VARCHAR column Invoice.
50%/50% split (lower is better).
0.353 cost for both. */
	/* Membership test via IN on Invoice */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Invoice IN (SELECT lkp.Invoice FROM #jointable AS lkp);
	/* INNER JOIN on Invoice */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.Invoice = lkp.Invoice;

/* 6. IN() is faster. With an index on the VARCHAR column, IN() is vastly faster.
28%/72% split (lower is better); at 200k rows this becomes 43%/57%.
0.123 for the IN() and 0.317 for the JOIN.
The JOIN is faster on more rows: on 1m+ rows, going parallel, the split becomes 65%/35% in favour of the JOIN. */
/* To spice things up, index Invoice on both tables before repeating test 5. */
CREATE NONCLUSTERED INDEX IX_TEST ON #maintable (Invoice);
CREATE NONCLUSTERED INDEX IX_TEST ON #jointable (Invoice);
	/* Membership test via IN on the indexed Invoice column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Invoice IN (SELECT lkp.Invoice FROM #jointable AS lkp);
	/* INNER JOIN on the indexed Invoice column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.Invoice = lkp.Invoice;

/* 7. JOIN is faster. Excluding on the VARCHAR column Invoice has a very different profile.
85%/15% split (lower is better); at 200k rows this becomes 92%/8%.
NOT(IN) costs 2.010, compared to the JOIN at 0.358. */
	/* Exclusion via NOT IN on Invoice (declared VARCHAR(20)) */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Invoice NOT IN (SELECT lkp.Invoice FROM #jointable AS lkp);
	/* LEFT JOIN on Invoice, counting the #maintable rows that found no match */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	LEFT JOIN #jointable AS lkp
		ON src.Invoice = lkp.Invoice
	WHERE lkp.ID IS NULL;

/* 8. IN() is faster. Since indexing benefitted the IN(), repeat the test on the ID column.
35%/65% split (lower is better); at 200k rows this becomes 46%/54%, at 500k rows 73%/27%.
IN() benefits from the index.
The JOIN is faster on more rows: on 1m+ rows, going parallel, the split becomes 64%/36% in favour of the JOIN. */
CREATE NONCLUSTERED INDEX IX_TEST2 ON #maintable (ID);
CREATE NONCLUSTERED INDEX IX_TEST2 ON #jointable (ID);
	/* Membership test via IN on the indexed ID column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.ID IN (SELECT lkp.ID FROM #jointable AS lkp);
	/* INNER JOIN on the indexed ID column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.ID = lkp.ID;


/* Clean up: remove both scratch tables.
   DROP TABLE IF EXISTS (SQL Server 2016+) is a no-op when a table is already gone. */
DROP TABLE IF EXISTS #maintable;
DROP TABLE IF EXISTS #jointable;
	--Tested on 24 core @ 2.5GHz, SELECT cpu_count FROM sys.dm_os_sys_info;
	--Microsoft SQL Server 2022 Developer Edition (RTM-GDR) (KB5035432) - 16.0.1115.1 (X64)
	--The Estimated Subtree Cost is used to compare each SELECT statement.
	--Tests were carried out on E4 with 10,000 rows; it is stated where more rows were used.
	--Lookup table is set to 80% of the size of the source table.

	--tl;dr.
	--It really depends.
	--Generally IN() and JOIN perform similarly. Tested up to 1m rows.
	--IN() benefits greatly from appropriate indexes on 0 to 250k rows, more so than a JOIN, surprisingly.
	--JOIN benefits greatly from appropriate indexes on 250k+ rows.
	--NOT(IN()) on character fields is slower than a JOIN irrespective of indexes and row count.
	--NOT(IN()) on character fields degrades substantially on higher row counts, 1m+.

	/* Scratch tables for the benchmark: #maintable is the source table that every
	   test query counts from, #jointable is the lookup table it is matched against.
	   DROP TABLE IF EXISTS (SQL Server 2016+) replaces the OBJECT_ID() guard.
	   NOTE(review): these CREATE TABLE statements repeat the ones earlier in this
	   file; SQL Server may reject a single batch that creates the same #temp table
	   name twice, so this copy probably has to run as its own batch - confirm.
	   NOTE(review): N, Other and Invoice are declared without NULL / NOT NULL, so
	   their nullability follows the session defaults - confirm this is intended. */
	DROP TABLE IF EXISTS #maintable;
	CREATE TABLE #maintable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20));
	DROP TABLE IF EXISTS #jointable;
	CREATE TABLE #jointable (ID INT IDENTITY(1,1), N TINYINT, Other CHAR(22), Invoice VARCHAR(20));
	/* Tip: open the execution plan and walk through each statement in turn. */
	/* Row generator adapted from Jeff Moden's tally CTE:
	   https://www.sqlservercentral.com/articles/tally-oh-an-improved-sql-8k-%e2%80%9ccsv-splitter%e2%80%9d-function */
	;WITH E1(N) AS (
		SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
		SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
	),                                                       -- 10 rows
	E2(N) AS (SELECT 1 FROM E1 AS a CROSS JOIN E1 AS b),     -- 10 x 10 = 100 rows
	E4(N) AS (SELECT 1 FROM E2 AS a CROSS JOIN E2 AS b),     -- 100 x 100 = 10,000 rows
	/* Point Numbered at a different CTE to change the data volume. */
	Emore(N) AS (SELECT 1 FROM E4 AS a CROSS JOIN E2 AS b),  -- 10,000 x 100 = 1,000,000 rows
	/* One running number per generated row; it feeds all three inserted columns. */
	Numbered(RowNo) AS (SELECT ROW_NUMBER() OVER (ORDER BY N) FROM Emore)
	INSERT #maintable ([N], [Other], Invoice)
	SELECT
		/* The author observed that RAND() output is close to sequential across the
		   generated rows, so only the last character of its string form is kept and
		   split at 5 to produce the 0/1 flag N. */
		CASE WHEN RIGHT(RAND(RowNo), 1) < 5 THEN 0 ELSE 1 END,
		RowNo,                                  -- stored in the CHAR(22) column Other
		'INV' + CONVERT(VARCHAR(15), RowNo)     -- e.g. INV1, INV2, ...
	FROM Numbered;

	/* Copy 80% of #maintable into the lookup table #jointable.
	   ORDER BY ID pins which rows TOP ... PERCENT keeps; without an ORDER BY the
	   choice of rows is arbitrary and may differ between runs.
	   N is re-derived from the last digit of RAND(ID) rather than copied. */
	INSERT #jointable ([N], [Other], Invoice)
	SELECT TOP (80) PERCENT
		CASE WHEN RIGHT(RAND(ID), 1) < 5 THEN 0 ELSE 1 END,
		[Other],
		[Invoice]
	FROM #maintable
	ORDER BY ID;
	/* The sections below can be run individually; just drop the #temp tables afterwards. */

	/* 1. Tie. NOT(IN) and LEFT OUTER JOIN perform similarly on ID.
	49%/51% split (lower is better).
	NOT(IN) just edges it at a cost of 0.246, compared to the JOIN at 0.2578. */
	/* Exclusion via NOT IN on ID (declared INT IDENTITY) */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.ID NOT IN (SELECT lkp.ID FROM #jointable AS lkp);
	/* LEFT JOIN on ID, counting the #maintable rows that found no match */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	LEFT JOIN #jointable AS lkp
		ON src.ID = lkp.ID
	WHERE lkp.ID IS NULL;

	/* 2. JOIN is faster. Excluding on the CHAR column Other has a very different profile.
	85%/15% split (lower is better); at 200k rows this becomes 91%/9%.
	NOT(IN) costs 2.324, compared to the JOIN at 0.421.
	Takes a long time on the Emore CTE: 96%/4% split, so NOT(IN()) degrades as the row count grows. */
	/* Exclusion via NOT IN on the CHAR column Other */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Other NOT IN (SELECT lkp.Other FROM #jointable AS lkp);
	/* LEFT JOIN on Other, counting the #maintable rows that found no match */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	LEFT JOIN #jointable AS lkp
		ON src.Other = lkp.Other
	WHERE lkp.ID IS NULL;

	/* 3. Tie. IN() and INNER JOIN perform similarly on ID
	(the queries use the INT IDENTITY column ID, not the TINYINT column N).
	50%/50% split (lower is better); stays consistent on higher row counts.
	0.255 cost for both. */
	/* Membership test via IN on ID */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.ID IN (SELECT lkp.ID FROM #jointable AS lkp);
	/* INNER JOIN on ID */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.ID = lkp.ID;

	/* 4. Tie. IN() and INNER JOIN perform similarly on the CHAR column Other.
	50%/50% split (lower is better).
	0.396 cost for both. */
	/* Membership test via IN on Other */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Other IN (SELECT lkp.Other FROM #jointable AS lkp);
	/* INNER JOIN on Other */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.Other = lkp.Other;
	/* Even with an index added on the Other column, the split remains 50/50. */

	/* 5. Tie. IN() and INNER JOIN perform similarly on the VARCHAR column Invoice.
	50%/50% split (lower is better).
	0.353 cost for both. */
	/* Membership test via IN on Invoice */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Invoice IN (SELECT lkp.Invoice FROM #jointable AS lkp);
	/* INNER JOIN on Invoice */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.Invoice = lkp.Invoice;

	/* 6. IN() is faster. With an index on the VARCHAR column, IN() is vastly faster.
	28%/72% split (lower is better); at 200k rows this becomes 43%/57%.
	0.123 for the IN() and 0.317 for the JOIN.
	The JOIN is faster on more rows: on 1m+ rows, going parallel, the split becomes 65%/35% in favour of the JOIN. */
	/* To spice things up, index Invoice on both tables before repeating test 5. */
	CREATE NONCLUSTERED INDEX IX_TEST ON #maintable (Invoice);
	CREATE NONCLUSTERED INDEX IX_TEST ON #jointable (Invoice);
	/* Membership test via IN on the indexed Invoice column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Invoice IN (SELECT lkp.Invoice FROM #jointable AS lkp);
	/* INNER JOIN on the indexed Invoice column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.Invoice = lkp.Invoice;

	/* 7. JOIN is faster. Excluding on the VARCHAR column Invoice has a very different profile.
	85%/15% split (lower is better); at 200k rows this becomes 92%/8%.
	NOT(IN) costs 2.010, compared to the JOIN at 0.358. */
	/* Exclusion via NOT IN on Invoice (declared VARCHAR(20)) */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.Invoice NOT IN (SELECT lkp.Invoice FROM #jointable AS lkp);
	/* LEFT JOIN on Invoice, counting the #maintable rows that found no match */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	LEFT JOIN #jointable AS lkp
		ON src.Invoice = lkp.Invoice
	WHERE lkp.ID IS NULL;

	/* 8. IN() is faster. Since indexing benefitted the IN(), repeat the test on the ID column.
	35%/65% split (lower is better); at 200k rows this becomes 46%/54%, at 500k rows 73%/27%.
	IN() benefits from the index.
	The JOIN is faster on more rows: on 1m+ rows, going parallel, the split becomes 64%/36% in favour of the JOIN. */
	CREATE NONCLUSTERED INDEX IX_TEST2 ON #maintable (ID);
	CREATE NONCLUSTERED INDEX IX_TEST2 ON #jointable (ID);
	/* Membership test via IN on the indexed ID column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	WHERE src.ID IN (SELECT lkp.ID FROM #jointable AS lkp);
	/* INNER JOIN on the indexed ID column */
	SELECT COUNT(src.ID)
	FROM #maintable AS src
	INNER JOIN #jointable AS lkp
		ON src.ID = lkp.ID;


	/* Clean up: remove both scratch tables.
	   DROP TABLE IF EXISTS (SQL Server 2016+) is a no-op when a table is already gone. */
	DROP TABLE IF EXISTS #maintable;
	DROP TABLE IF EXISTS #jointable;