Last active
October 1, 2024 08:54
-
-
Save PhilippSalvisberg/520cb8269462f1011c319cbf05b9b12b to your computer and use it in GitHub Desktop.
PL/SQL Package to convert a UTF-8 codepoint to bytes and vice versa
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create or replace package utf8 is | |
function codepoint_to_bytes(in_codepoint in varchar2) return varchar2 deterministic; | |
function bytes_to_codepoint(in_bytes in varchar2) return varchar2 deterministic; | |
function codepoint_to_unistr(in_codepoint in varchar2) return varchar2 deterministic; | |
function unistr_to_codepoint(in_unistr in varchar2) return varchar2 deterministic; | |
end utf8; | |
/ | |
create or replace package body utf8 is | |
function hex_to_num(in_hex in varchar2) return integer deterministic; | |
function int_to_binary(in_int in integer, in_len in integer default 21) return varchar2 deterministic; | |
function build_byte(in_prefix in varchar2, in_suffix in varchar2) return varchar2 deterministic; | |
function bits_from_byte(in_byte in varchar2, in_start_pos in integer default 1) return varchar2 deterministic; | |
function hex_to_num(in_hex in varchar2) return integer deterministic is | |
co_hex constant varchar2(8 char) := in_hex; | |
begin | |
return to_number(co_hex default 0 on conversion error, rpad('X', length(co_hex), 'X'), q'[nls_numeric_characters='.,']'); | |
end hex_to_num; | |
function int_to_binary(in_int in integer, in_len in integer default 21) return varchar2 deterministic is | |
l_result varchar2(21 char); | |
begin | |
-- query based on https://sqlpatterns.wordpress.com/2017/05/25/integer-to-binary-conversion-in-oracle-sql/ | |
select listagg(sign(bitand(in_int, power(2, level - 1))), null) within group(order by level desc) | |
into l_result | |
from dual | |
connect by power(2, level - 1) <= in_int; | |
return lpad(l_result, in_len, '0'); | |
end int_to_binary; | |
function build_byte(in_prefix in varchar2, in_suffix in varchar2) return varchar2 deterministic is | |
co_prefix constant varchar2(7 char) := in_prefix; | |
co_suffix constant varchar2(8 char) := in_suffix; | |
l_binary varchar2(8 char); | |
l_result varchar2(4 char); | |
begin | |
l_binary := co_prefix || co_suffix; | |
select to_char( | |
bin_to_num( | |
to_number(substr(l_binary, 1, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'), | |
to_number(substr(l_binary, 2, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'), | |
to_number(substr(l_binary, 3, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'), | |
to_number(substr(l_binary, 4, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'), | |
to_number(substr(l_binary, 5, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'), | |
to_number(substr(l_binary, 6, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'), | |
to_number(substr(l_binary, 7, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'), | |
to_number(substr(l_binary, 8, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']') | |
), | |
'FMXX' | |
) | |
into l_result | |
from dual; | |
return lpad(l_result, 2, '0'); | |
end build_byte; | |
function bits_from_byte(in_byte in varchar2, in_start_pos in integer default 1) return varchar2 deterministic is | |
co_byte constant varchar2(2 char) := in_byte; | |
l_int integer; | |
l_binary varchar2(8 char); | |
begin | |
l_int := hex_to_num(co_byte); | |
l_binary := int_to_binary(l_int, 8); | |
return substr(l_binary, in_start_pos); | |
end bits_from_byte; | |
function codepoint_to_bytes(in_codepoint in varchar2) return varchar2 deterministic is | |
co_codepoint constant varchar2(8 char) := in_codepoint; | |
l_cp integer; | |
l_binary varchar2(21 char); | |
l_return varchar2(8 char); | |
begin | |
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-8#Description | |
if upper(co_codepoint) like 'U+%' then | |
l_cp := hex_to_num(substr(co_codepoint, 3)); | |
else | |
l_cp := hex_to_num(co_codepoint); | |
end if; | |
if l_cp <= hex_to_num('007F') then -- one byte | |
l_binary := int_to_binary(l_cp, 7); | |
l_return := build_byte('0', l_binary); | |
elsif l_cp <= hex_to_num('07FF') then -- two bytes | |
l_binary := int_to_binary(l_cp, 11); | |
l_return := build_byte('110', substr(l_binary, 1, 5)) | |
|| build_byte('10', substr(l_binary, 6)); | |
elsif l_cp <= hex_to_num('FFFF') then -- three bytes | |
l_binary := int_to_binary(l_cp, 16); | |
l_return := build_byte('1110', substr(l_binary, 1, 4)) | |
|| build_byte('10', substr(l_binary, 5, 6)) | |
|| build_byte('10', substr(l_binary, 11)); | |
elsif l_cp <= hex_to_num('10FFFF') then -- four bytes | |
l_binary := int_to_binary(l_cp, 21); | |
l_return := build_byte('11110', substr(l_binary, 1, 3)) | |
|| build_byte('10', substr(l_binary, 4, 6)) | |
|| build_byte('10', substr(l_binary, 10, 6)) | |
|| build_byte('10', substr(l_binary, 16)); | |
end if; | |
return l_return; | |
end codepoint_to_bytes; | |
function bytes_to_codepoint(in_bytes in varchar2) return varchar2 deterministic is | |
co_bytes constant varchar2(8 char) := in_bytes; | |
l_len integer; | |
l_bits varchar2(21 char); | |
l_return varchar2(8 char); | |
begin | |
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-8#Description | |
l_len := length(co_bytes); | |
case l_len | |
when 2 then -- one byte | |
l_return := 'U+00' || build_byte('0', bits_from_byte(co_bytes, 2)); | |
when 4 then -- two bytes | |
l_bits := bits_from_byte(substr(co_bytes, 1, 2), 4) | |
|| bits_from_byte(substr(co_bytes, 3, 2), 3); | |
l_return := 'U+' || build_byte('00000', substr(l_bits, 1, 3)) | |
|| build_byte(null, substr(l_bits, 4)); | |
when 6 then -- three bytes | |
l_bits := bits_from_byte(substr(co_bytes, 1, 2), 5) | |
|| bits_from_byte(substr(co_bytes, 3, 2), 3) | |
|| bits_from_byte(substr(co_bytes, 5, 2), 3); | |
l_return := 'U+' || build_byte(null, substr(l_bits, 1, 8)) | |
|| build_byte(null, substr(l_bits, 9)); | |
when 8 then -- four bytes | |
l_bits := bits_from_byte(substr(co_bytes, 1, 2), 6) | |
|| bits_from_byte(substr(co_bytes, 3, 2), 3) | |
|| bits_from_byte(substr(co_bytes, 5, 2), 3) | |
|| bits_from_byte(substr(co_bytes, 7, 2), 3); | |
l_return := 'U+' || build_byte('000', substr(l_bits, 1, 5)) | |
|| build_byte(null, substr(l_bits, 6, 8)) | |
|| build_byte(null, substr(l_bits, 14)); | |
end case; | |
return l_return; | |
end bytes_to_codepoint; | |
function codepoint_to_unistr(in_codepoint in varchar2) return varchar2 deterministic is | |
co_codepoint constant varchar2(8 char) := in_codepoint; | |
l_cp integer; | |
l_binary varchar2(20 char); | |
l_return varchar2(10 char); | |
begin | |
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-16 | |
if upper(co_codepoint) like 'U+%' then | |
l_cp := hex_to_num(substr(co_codepoint, 3)); | |
else | |
l_cp := hex_to_num(co_codepoint); | |
end if; | |
if l_cp <= hex_to_num('FFFF') then -- basic multilingual plane (BMP) | |
l_return := '\' || lpad(to_char(l_cp, 'FMXXXX'), 4, '0'); | |
elsif l_cp <= hex_to_num('10FFFF') then -- other plane -> high/low surrogates | |
l_binary := int_to_binary(l_cp - hex_to_num('10000'), 20); | |
l_return := '\' || build_byte('110110', substr(l_binary, 1, 2)) | |
|| build_byte(null, substr(l_binary, 3, 8)) | |
|| '\' || build_byte('110111', substr(l_binary, 11, 2)) | |
|| build_byte(null, substr(l_binary, 13, 8)); | |
end if; | |
return l_return; | |
end codepoint_to_unistr; | |
function unistr_to_codepoint(in_unistr in varchar2) return varchar2 deterministic is | |
co_unistr constant varchar2(10 char) := in_unistr; | |
l_len integer; | |
l_bits varchar2(20 char); | |
l_cp integer; | |
l_return varchar2(8 char); | |
begin | |
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-16 | |
l_len := length(co_unistr); | |
case l_len | |
when 5 then -- basic multilingual plane (BMP) | |
l_return := 'U+' || upper(substr(co_unistr, 2)); | |
when 10 then -- other plane -> high/low surrogates | |
l_bits := bits_from_byte(substr(co_unistr, 2, 2), 7) | |
|| bits_from_byte(substr(co_unistr, 4, 2), 1) | |
|| bits_from_byte(substr(co_unistr, 7, 2), 7) | |
|| bits_from_byte(substr(co_unistr, 9, 2), 1); | |
l_cp := hex_to_num('10000') + | |
hex_to_num( | |
build_byte('0000', substr(l_bits, 1, 4)) | |
|| build_byte(null, substr(l_bits, 5, 8)) | |
|| build_byte(null, substr(l_bits, 13, 8)) | |
); | |
l_return := 'U+' || trim(leading '0' from lpad(to_char(l_cp, 'FMXXXXXX'), 6, '0')); | |
end case; | |
return l_return; | |
end unistr_to_codepoint; | |
end utf8; | |
/ | |
-- demo utf8.codepoint_to_bytes | |
column result_41 format a9 | |
column result_c2a3 format a11 | |
column result_e282ac format a13 | |
column result_e280bc format a13 | |
column result_f09f9880 format a15 | |
select utf8.codepoint_to_bytes('U+0041') as result_41, -- A | |
utf8.codepoint_to_bytes('U+00A3') as result_c2a3, -- £ | |
utf8.codepoint_to_bytes('U+20AC') as result_e282ac, -- € | |
utf8.codepoint_to_bytes('U+203C') as result_e280bc, -- ‼ | |
utf8.codepoint_to_bytes('U+1F600') as result_f09f9880 -- 😀 | |
from dual; | |
-- demo utf8.bytes_to_codepoint | |
column result_0041 format a11 | |
column result_00A3 format a11 | |
column result_20ac format a11 | |
column result_203c format a11 | |
column result_1f600 format a12 | |
select utf8.bytes_to_codepoint('41') as result_0041, -- A | |
utf8.bytes_to_codepoint('c2a3') as result_00A3, -- £ | |
utf8.bytes_to_codepoint('e282ac') as result_20AC, -- € | |
utf8.bytes_to_codepoint('e280bc') as result_203C, -- ‼ | |
utf8.bytes_to_codepoint('f09f9880') as result_1F600 -- 😀 | |
from dual; | |
-- demo utf8.codepoint_to_unistr (result to be used in unistr function) | |
column result_0041 format a11 | |
column result_00A3 format a11 | |
column result_20ac format a11 | |
column result_203c format a11 | |
column result_d83d_de00 format a16 | |
select utf8.codepoint_to_unistr('U+0041') as result_0041, -- A | |
utf8.codepoint_to_unistr('U+00A3') as result_00A3, -- £ | |
utf8.codepoint_to_unistr('U+20AC') as result_20AC, -- € | |
utf8.codepoint_to_unistr('U+203C') as result_203C, -- ‼ | |
utf8.codepoint_to_unistr('U+1F600') as result_D83D_DE00 -- 😀 | |
from dual; | |
-- demo utf8.unistr_to_codepoint (input used in unistr function) | |
column result_0041 format a11 | |
column result_00A3 format a11 | |
column result_20ac format a11 | |
column result_203c format a11 | |
column result_1f600 format a12 | |
select utf8.unistr_to_codepoint('\0041') as result_0041, -- A | |
utf8.unistr_to_codepoint('\00A3') as result_00A3, -- £ | |
utf8.unistr_to_codepoint('\20AC') as result_20AC, -- € | |
utf8.unistr_to_codepoint('\203C') as result_203C, -- ‼ | |
utf8.unistr_to_codepoint('\D83D\DE00') as result_1F600 -- 😀 | |
from dual; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment