Skip to content

Instantly share code, notes, and snippets.

@hoehrmann hoehrmann/utf8.sql
Created May 7, 2019

Embed
What would you like to do?
WITH
bytes AS (
SELECT 0x00 AS byte
UNION ALL
SELECT byte+1 FROM bytes WHERE byte < 0xFF
),
base AS (
SELECT 0x0000 AS cp
UNION ALL
SELECT cp+1 FROM base WHERE cp < 0x10FFFF
),
surrogates AS (
SELECT 0xd800 AS cp
UNION ALL
SELECT cp+1 FROM surrogates WHERE cp < 0xDFFF
),
valid AS (
SELECT * FROM base
EXCEPT
SELECT * FROM surrogates
ORDER BY cp
),
cuts as (
SELECT 0x00000 AS cp UNION SELECT 0x00007f -- 1b (ascii)
UNION SELECT 0x00080 UNION SELECT 0x0007ff -- 2b
UNION SELECT 0x00800 UNION SELECT 0x00d7ff -- 3b (1st)
UNION SELECT 0x0d800 UNION SELECT 0x00dfff -- 3b (surrogates)
UNION SELECT 0x0e000 UNION SELECT 0x00ffff -- 3b (2nd)
UNION SELECT 0x10000 UNION SELECT 0x10ffff -- 4b
),
u8 AS (
SELECT
cp,
CASE
WHEN cp <= 0x0007f THEN cp
ELSE (
(
((cp << 0) & 0x0000003f) |
((cp << 2) & 0x00003f00) |
((cp << 4) & 0x003f0000) |
((cp << 6) & 0x3f000000)
)
|
CASE
WHEN cp <= 0x0007FF THEN 0x0000c080
WHEN cp <= 0x00FFFF THEN 0x00e08080
WHEN cp <= 0x10FFFF THEN 0xf0808080
END
)
END AS u8
FROM
cuts -- ought to be `valid` but that is slow
),
stuff as (
SELECT
NULL AS lp,
NULL AS rp,
NULL AS ch,
1234 AS id,
u8_1.u8 AS l8,
u8_2.u8 AS r8,
CASE
WHEN u8_1.u8 > 0x00ffffff then 4
WHEN u8_1.u8 > 0x0000ffff then 3
WHEN u8_1.u8 > 0x000000ff then 2
ELSE 1
END AS len
FROM
(
-- TODO: this is actually the properly segmented input data
SELECT 0x00000 AS lu, 0x00007f AS ru UNION -- 1b
SELECT 0x00080 AS lu, 0x0007ff AS ru UNION -- 2b
SELECT 0x00800 AS lu, 0x00d7ff AS ru UNION -- 3b (1st)
-- SELECT 0x0d800 AS lu, 0x00dfff AS ru UNION -- surrogates
SELECT 0x0e000 AS lu, 0x00ffff AS ru UNION -- 3b (2nd)
SELECT 0x10000 AS lu, 0x10ffff AS ru -- 4b
) x
INNER JOIN u8 u8_1 ON (u8_1.cp = x.lu)
INNER JOIN u8 u8_2 ON (u8_2.cp = x.ru)
),
rec AS (
SELECT * FROM stuff
UNION
SELECT
rec.l8 AS lp,
rec.r8 AS rp,
bytes.byte AS ch,
rec.id AS id,
CASE
WHEN bytes.byte = rec.l8 >> 8*(rec.len - 1)
THEN rec.l8 & (0xFFFFFFFF >> 8*(4 - rec.len + 1))
ELSE 0x80808080 & (0xFFFFFFFF >> 8*(4 - rec.len + 1))
END AS l8,
CASE
WHEN bytes.byte = rec.r8 >> 8*(rec.len - 1)
THEN rec.r8 & (0xFFFFFFFF >> 8*(4 - rec.len + 1))
ELSE 0xBFBFBFBF & (0xFFFFFFFF >> 8*(4 - rec.len + 1))
END AS r8,
rec.len - 1 AS len
FROM
rec
INNER JOIN bytes
WHERE
len > 0
AND
bytes.byte
BETWEEN
rec.l8 >> 8*(rec.len - 1)
AND
rec.r8 >> 8*(rec.len - 1)
),
states AS (
SELECT
l8,
r8,
JSON_GROUP_ARRAY(distinct id) AS ids,
row_number() OVER (ORDER BY l8 <> 0, r8 <> 0, l8, r8) AS state
FROM
rec
WHERE
lp is not null
GROUP BY
l8,
r8
)
select
COALESCE(lhs.state, 1) AS lhs_state,
SUBSTR(JSON_GROUP_ARRAY(rec.ch), 0, 30) AS bytes,
JSON_GROUP_ARRAY(distinct rec.id) AS ids,
rhs.state AS rhs_state
from
rec
left join states rhs
on rhs.l8 = rec.l8 and rhs.r8 = rec.r8
left join states lhs
on lhs.l8 = rec.lp and lhs.r8 = rec.rp
GROUP BY
rec.lp,rec.rp,rec.l8,rec.r8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.