-
-
Save mburbea/e8e54925cc08c035544eef28b35148e3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
drop function if exists dbo.ListAllUnicodeCodePoints | |
drop function if exists dbo.getnums_sqlkiwi | |
drop table if exists dbo.cs | |
select n1 = 0 | |
,n2 = isnull(convert(int,row_number() over (order by 1/0)),0) | |
into dbo.cs | |
from string_split(space(255),' '),string_split(space(255),' ') b | |
-- Single compressed rowgroup of 65,536 rows | |
CREATE CLUSTERED COLUMNSTORE INDEX CCI ON dbo.CS | |
GO | |
-- The function | |
create or alter function dbo.GetNums_SQLkiwi(@low bigint = 1,@high bigint) | |
returns table | |
with schemabinding AS | |
return | |
select n.rn, | |
n = @low - 1 + n.rn, | |
op = @high + 1 - n.rn | |
from | |
( | |
select rn = row_number() over (order by 1/0) | |
from dbo.cs n1 | |
join dbo.CS n2 | |
-- Batch mode hash cross join | |
-- Integer not null data type avoid hash probe residual | |
-- This is always 0 = 0 | |
on n2.n1 = n1.n1 | |
and @high >= @low | |
-- Try to avoid SQRT on negative numbers and enable simplification | |
-- to single constant scan if @low > @high (with literals) | |
-- No start-up filters in batch mode | |
-- Coarse filter: | |
-- Limit each side of the cross join to SQRT(target number of rows) | |
-- IIF avoids SQRT on negative numbers with parameters | |
and n1.n2 <= convert(integer, ceiling(sqrt(convert(float, iif(@high >= @low, @high - @low + 1, 0))))) | |
and n2.n2 <= convert(integer, ceiling(sqrt(convert(float, iif(@high >= @low, @high - @low + 1, 0))))) | |
) n | |
-- Precise filter: | |
-- Batch mode filter the limited cross join to the exact number of rows needed | |
-- Avoids the optimizer introducing a row-mode Top with following row mode compute scalar | |
where @low - 2 + N.rn < @high; | |
GO | |
CREATE OR ALTER FUNCTION dbo.ListAllUnicodeCodePoints(@EncodeSurrogateCodePointsInUTF8 BIT = 0) | |
RETURNS TABLE | |
with schemabinding AS RETURN | |
WITH nums AS | |
( | |
SELECT num = n from dbo.GetNums_SQLkiwi(0,1114111) | |
), chars AS | |
( | |
SELECT CONVERT(INT, n.[num]) AS [num], -- pass-through | |
RIGHT(CONVERT(CHAR(6), CONVERT(BINARY(3), n.[num]), 2), | |
CASE WHEN n.[num] > 65535 THEN 5 ELSE 4 END) AS [CodePointHex], | |
CONVERT(INT, CASE WHEN n.[num] > 65535 THEN 55232 + (n.[num] / 1024) END) AS [HighSurrogateINT], | |
CONVERT(INT, CASE WHEN n.[num] > 65535 THEN 56320 + (n.[num] % 1024) END) AS [LowSurrogateINT] | |
FROM nums n | |
WHERE n.[num] BETWEEN 0x000000 AND 0x014700 -- filter out 925,455 | |
OR n.[num] BETWEEN 0x016800 AND 0x030000 -- unmapped code | |
OR n.[num] BETWEEN 0x0E0001 AND 0x0E01EF -- points | |
) | |
SELECT | |
'U+' + c.[CodePointHex] AS [CodePoint], | |
c.[num] AS [CdPntINT], | |
'0x' + c.[CodePointHex] AS [CdPntBIN], | |
CASE | |
WHEN c.[num] > 65535 THEN NCHAR(c.[HighSurrogateINT]) + NCHAR(c.[LowSurrogateINT]) | |
ELSE NCHAR(c.[num]) | |
END AS [Char], | |
CASE | |
WHEN c.[num] > 65535 THEN CONVERT(CHAR(10), CONVERT(BINARY(4), | |
NCHAR(c.[HighSurrogateINT]) + NCHAR(c.[LowSurrogateINT])), 1) | |
ELSE CONVERT(CHAR(6), CONVERT(BINARY(2), NCHAR(c.[num])), 1) | |
END AS [UTF-16LE ], | |
'0x' + CASE -- https://rosettacode.org/wiki/UTF-8_encode_and_decode#VBA | |
WHEN c.[num] < 128 | |
THEN CONVERT(CHAR(4), CONVERT(BINARY(1), c.[num]), 2) | |
WHEN c.[num] BETWEEN 128 AND 2047 | |
THEN CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] / 64) + 192)), 2) | |
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2) | |
WHEN (@EncodeSurrogateCodePointsInUTF8 = 0) AND (c.[num] BETWEEN 55296 AND 57343) | |
THEN 'EFBFBD' -- Replacement (U+FFFD) Surrogate Code Points are invalid in UTF-8 | |
WHEN c.[num] BETWEEN 2048 AND 65535 | |
THEN CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) / 64) + 224)), 2) | |
+ CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) % 64) + 128)), 2) | |
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2) | |
WHEN c.[num] BETWEEN 65536 AND 1114111 | |
THEN CONVERT(CHAR(2), CONVERT(BINARY(1), ((((c.[num] / 64) / 64) / 64) + 240)), 2) | |
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((((c.[num] / 64) / 64) % 64) + 128)), 2) | |
+ CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) % 64) + 128)), 2) | |
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2) | |
ELSE CONVERT(VARCHAR(15), NULL) | |
END AS [UTF-8 ], | |
c.[HighSurrogateINT] AS [HighSrgtINT], | |
c.[LowSurrogateINT] AS [LowSrgtINT], | |
CONVERT(BINARY(2), c.[HighSurrogateINT]) AS [HighSrgtBIN], | |
CONVERT(BINARY(2), c.[LowSurrogateINT]) AS [LowSrgtBIN], | |
'NCHAR(' + CASE | |
WHEN c.[num] > 65535 THEN CONVERT(CHAR(6), CONVERT(BINARY(2), c.[HighSurrogateINT]), 1) | |
+ ') + NCHAR(' + CONVERT(CHAR(6), CONVERT(BINARY(2), c.[LowSurrogateINT]), 1) | |
ELSE CONVERT(CHAR(6), CONVERT(BINARY(2), c.[num]), 1) | |
END + ')' AS [T-SQL ], | |
'&#x' + c.[CodePointHex] + ';' AS [HTML/XML ], | |
CASE | |
WHEN c.[num] < 65536 THEN '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[num]), 2) | |
ELSE CONVERT(VARCHAR(10), NULL) | |
END AS [C#/F#/C++/Java[Script]]/Julia/?], | |
CASE | |
WHEN c.[num] > 65535 THEN '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[HighSurrogateINT]), 2) | |
+ '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[LowSurrogateINT]), 2) | |
ELSE CONVERT(VARCHAR(15), NULL) | |
END AS [C#/F#/Java[Script]]/?], | |
'\U' + CONVERT(CHAR(8), CONVERT(BINARY(4), c.[num]), 2) AS [C#/F#/C/C++/Julia/?] | |
FROM chars c; | |
go | |
select * from ListAllUnicodeCodePoints(0) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment