Skip to content

Instantly share code, notes, and snippets.

@mburbea

mburbea/SqlKiwiUnicodeFunc.sql Secret

Created Apr 16, 2021
Embed
What would you like to do?
drop function if exists dbo.ListAllUnicodeCodePoints
drop function if exists dbo.getnums_sqlkiwi
drop table if exists dbo.cs
select n1 = 0
,n2 = isnull(convert(int,row_number() over (order by 1/0)),0)
into dbo.cs
from string_split(space(255),' '),string_split(space(255),' ') b
-- Single compressed rowgroup of 65,536 rows
CREATE CLUSTERED COLUMNSTORE INDEX CCI ON dbo.CS
GO
-- The function
create or alter function dbo.GetNums_SQLkiwi(@low bigint = 1,@high bigint)
returns table
with schemabinding AS
return
select n.rn,
n = @low - 1 + n.rn,
op = @high + 1 - n.rn
from
(
select rn = row_number() over (order by 1/0)
from dbo.cs n1
join dbo.CS n2
-- Batch mode hash cross join
-- Integer not null data type avoid hash probe residual
-- This is always 0 = 0
on n2.n1 = n1.n1
and @high >= @low
-- Try to avoid SQRT on negative numbers and enable simplification
-- to single constant scan if @low > @high (with literals)
-- No start-up filters in batch mode
-- Coarse filter:
-- Limit each side of the cross join to SQRT(target number of rows)
-- IIF avoids SQRT on negative numbers with parameters
and n1.n2 <= convert(integer, ceiling(sqrt(convert(float, iif(@high >= @low, @high - @low + 1, 0)))))
and n2.n2 <= convert(integer, ceiling(sqrt(convert(float, iif(@high >= @low, @high - @low + 1, 0)))))
) n
-- Precise filter:
-- Batch mode filter the limited cross join to the exact number of rows needed
-- Avoids the optimizer introducing a row-mode Top with following row mode compute scalar
where @low - 2 + N.rn < @high;
GO
CREATE OR ALTER FUNCTION dbo.ListAllUnicodeCodePoints(@EncodeSurrogateCodePointsInUTF8 BIT = 0)
RETURNS TABLE
with schemabinding AS RETURN
WITH nums AS
(
SELECT num = n from dbo.GetNums_SQLkiwi(0,1114111)
), chars AS
(
SELECT CONVERT(INT, n.[num]) AS [num], -- pass-through
RIGHT(CONVERT(CHAR(6), CONVERT(BINARY(3), n.[num]), 2),
CASE WHEN n.[num] > 65535 THEN 5 ELSE 4 END) AS [CodePointHex],
CONVERT(INT, CASE WHEN n.[num] > 65535 THEN 55232 + (n.[num] / 1024) END) AS [HighSurrogateINT],
CONVERT(INT, CASE WHEN n.[num] > 65535 THEN 56320 + (n.[num] % 1024) END) AS [LowSurrogateINT]
FROM nums n
WHERE n.[num] BETWEEN 0x000000 AND 0x014700 -- filter out 925,455
OR n.[num] BETWEEN 0x016800 AND 0x030000 -- unmapped code
OR n.[num] BETWEEN 0x0E0001 AND 0x0E01EF -- points
)
SELECT
'U+' + c.[CodePointHex] AS [CodePoint],
c.[num] AS [CdPntINT],
'0x' + c.[CodePointHex] AS [CdPntBIN],
CASE
WHEN c.[num] > 65535 THEN NCHAR(c.[HighSurrogateINT]) + NCHAR(c.[LowSurrogateINT])
ELSE NCHAR(c.[num])
END AS [Char],
CASE
WHEN c.[num] > 65535 THEN CONVERT(CHAR(10), CONVERT(BINARY(4),
NCHAR(c.[HighSurrogateINT]) + NCHAR(c.[LowSurrogateINT])), 1)
ELSE CONVERT(CHAR(6), CONVERT(BINARY(2), NCHAR(c.[num])), 1)
END AS [UTF-16LE ],
'0x' + CASE -- https://rosettacode.org/wiki/UTF-8_encode_and_decode#VBA
WHEN c.[num] < 128
THEN CONVERT(CHAR(4), CONVERT(BINARY(1), c.[num]), 2)
WHEN c.[num] BETWEEN 128 AND 2047
THEN CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] / 64) + 192)), 2)
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2)
WHEN (@EncodeSurrogateCodePointsInUTF8 = 0) AND (c.[num] BETWEEN 55296 AND 57343)
THEN 'EFBFBD' -- Replacement (U+FFFD) Surrogate Code Points are invalid in UTF-8
WHEN c.[num] BETWEEN 2048 AND 65535
THEN CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) / 64) + 224)), 2)
+ CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) % 64) + 128)), 2)
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2)
WHEN c.[num] BETWEEN 65536 AND 1114111
THEN CONVERT(CHAR(2), CONVERT(BINARY(1), ((((c.[num] / 64) / 64) / 64) + 240)), 2)
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((((c.[num] / 64) / 64) % 64) + 128)), 2)
+ CONVERT(CHAR(2), CONVERT(BINARY(1), (((c.[num] / 64) % 64) + 128)), 2)
+ CONVERT(CHAR(2), CONVERT(BINARY(1), ((c.[num] % 64) + 128)), 2)
ELSE CONVERT(VARCHAR(15), NULL)
END AS [UTF-8 ],
c.[HighSurrogateINT] AS [HighSrgtINT],
c.[LowSurrogateINT] AS [LowSrgtINT],
CONVERT(BINARY(2), c.[HighSurrogateINT]) AS [HighSrgtBIN],
CONVERT(BINARY(2), c.[LowSurrogateINT]) AS [LowSrgtBIN],
'NCHAR(' + CASE
WHEN c.[num] > 65535 THEN CONVERT(CHAR(6), CONVERT(BINARY(2), c.[HighSurrogateINT]), 1)
+ ') + NCHAR(' + CONVERT(CHAR(6), CONVERT(BINARY(2), c.[LowSurrogateINT]), 1)
ELSE CONVERT(CHAR(6), CONVERT(BINARY(2), c.[num]), 1)
END + ')' AS [T-SQL ],
'&#x' + c.[CodePointHex] + ';' AS [HTML/XML ],
CASE
WHEN c.[num] < 65536 THEN '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[num]), 2)
ELSE CONVERT(VARCHAR(10), NULL)
END AS [C#/F#/C++/Java[Script]]/Julia/?],
CASE
WHEN c.[num] > 65535 THEN '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[HighSurrogateINT]), 2)
+ '\u' + CONVERT(CHAR(4), CONVERT(BINARY(2), c.[LowSurrogateINT]), 2)
ELSE CONVERT(VARCHAR(15), NULL)
END AS [C#/F#/Java[Script]]/?],
'\U' + CONVERT(CHAR(8), CONVERT(BINARY(4), c.[num]), 2) AS [C#/F#/C/C++/Julia/?]
FROM chars c;
go
select * from ListAllUnicodeCodePoints(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment